diff --git a/BPTCEncoder/src/BC7Compressor.cpp b/BPTCEncoder/src/BC7Compressor.cpp
index 453bfb3..10ad0c6 100755
--- a/BPTCEncoder/src/BC7Compressor.cpp
+++ b/BPTCEncoder/src/BC7Compressor.cpp
@@ -37,18 +37,18 @@
 
 static const uint32 kNumShapes2 = 64;
 static const uint16 kShapeMask2[kNumShapes2] = {
-	0xcccc, 0x8888, 0xeeee, 0xecc8, 0xc880, 0xfeec, 0xfec8, 0xec80,
-	0xc800, 0xffec, 0xfe80, 0xe800, 0xffe8, 0xff00, 0xfff0, 0xf000,
-	0xf710, 0x008e, 0x7100, 0x08ce, 0x008c, 0x7310, 0x3100, 0x8cce,
-	0x088c, 0x3110, 0x6666, 0x366c, 0x17e8, 0x0ff0, 0x718e, 0x399c,
-	0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a,
-	0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x0660,
-	0x0272, 0x04e4, 0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c,
-	0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0x0fcc, 0x7744, 0xee22
+  0xcccc, 0x8888, 0xeeee, 0xecc8, 0xc880, 0xfeec, 0xfec8, 0xec80,
+  0xc800, 0xffec, 0xfe80, 0xe800, 0xffe8, 0xff00, 0xfff0, 0xf000,
+  0xf710, 0x008e, 0x7100, 0x08ce, 0x008c, 0x7310, 0x3100, 0x8cce,
+  0x088c, 0x3110, 0x6666, 0x366c, 0x17e8, 0x0ff0, 0x718e, 0x399c,
+  0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a,
+  0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x0660,
+  0x0272, 0x04e4, 0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c,
+  0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0x0fcc, 0x7744, 0xee22
 };
 
 static const int kAnchorIdx2[kNumShapes2] = {
-	15,15,15,15,15,15,15,15,
+  15,15,15,15,15,15,15,15,
     15,15,15,15,15,15,15,15,
     15, 2, 8, 2, 2, 8, 8,15,
      2, 8, 2, 2, 8, 8, 2, 2,
@@ -60,14 +60,14 @@ static const int kAnchorIdx2[kNumShapes2] = {
 
 static const uint32 kNumShapes3 = 64;
 static const uint16 kShapeMask3[kNumShapes3][2] = {
-	{ 0xfecc, 0xf600 }, { 0xffc8, 0x7300 }, { 0xff90, 0x3310 }, { 0xecce, 0x00ce }, { 0xff00, 0xcc00 }, { 0xcccc, 0xcc00 }, { 0xffcc, 0x00cc }, { 0xffcc, 0x3300 },
-	{ 0xff00, 0xf000 }, { 0xfff0, 0xf000 }, { 0xfff0, 0xff00 }, { 0xcccc, 0x8888 }, { 0xeeee, 0x8888 }, { 0xeeee, 0xcccc }, { 0xffec, 0xec80 }, { 0x739c, 0x7310 },
-	{ 0xfec8, 0xc800 }, { 0x39ce, 0x3100 }, { 0xfff0, 0xccc0 }, { 0xfccc, 0x0ccc }, { 0xeeee, 0xee00 }, { 0xff88, 0x7700 }, { 0xeec0, 0xcc00 }, { 0x7730, 0x3300 },
-	{ 0x0cee, 0x00cc }, { 0xffcc, 0xfc88 }, { 0x6ff6, 0x0660 }, { 0xff60, 0x6600 }, { 0xcbbc, 0xc88c }, { 0xf966, 0xf900 }, { 0xceec, 0x0cc0 }, { 0xff10, 0x7310 },
-	{ 0xff80, 0xec80 }, { 0xccce, 0x08ce }, { 0xeccc, 0xec80 }, { 0x6666, 0x4444 }, { 0x0ff0, 0x0f00 }, { 0x6db6, 0x4924 }, { 0x6bd6, 0x4294 }, { 0xcf3c, 0x0c30 },
-	{ 0xc3fc, 0x03c0 }, { 0xffaa, 0xff00 }, { 0xff00, 0x5500 }, { 0xfcfc, 0xcccc }, { 0xcccc, 0x0c0c }, { 0xf6f6, 0x6666 }, { 0xaffa, 0x0ff0 }, { 0xfff0, 0x5550 },
-	{ 0xfaaa, 0xf000 }, { 0xeeee, 0x0e0e }, { 0xf8f8, 0x8888 }, { 0xfff0, 0x9990 }, { 0xeeee, 0xe00e }, { 0x8ff8, 0x8888 }, { 0xf666, 0xf000 }, { 0xff00, 0x9900 },
-	{ 0xff66, 0xff00 }, { 0xcccc, 0xc00c }, { 0xcffc, 0xcccc }, { 0xf000, 0x9000 }, { 0x8888, 0x0808 }, { 0xfefe, 0xeeee }, { 0xfffa, 0xfff0 }, { 0x7bde, 0x7310 }
+  { 0xfecc, 0xf600 }, { 0xffc8, 0x7300 }, { 0xff90, 0x3310 }, { 0xecce, 0x00ce }, { 0xff00, 0xcc00 }, { 0xcccc, 0xcc00 }, { 0xffcc, 0x00cc }, { 0xffcc, 0x3300 },
+  { 0xff00, 0xf000 }, { 0xfff0, 0xf000 }, { 0xfff0, 0xff00 }, { 0xcccc, 0x8888 }, { 0xeeee, 0x8888 }, { 0xeeee, 0xcccc }, { 0xffec, 0xec80 }, { 0x739c, 0x7310 },
+  { 0xfec8, 0xc800 }, { 0x39ce, 0x3100 }, { 0xfff0, 0xccc0 }, { 0xfccc, 0x0ccc }, { 0xeeee, 0xee00 }, { 0xff88, 0x7700 }, { 0xeec0, 0xcc00 }, { 0x7730, 0x3300 },
+  { 0x0cee, 0x00cc }, { 0xffcc, 0xfc88 }, { 0x6ff6, 0x0660 }, { 0xff60, 0x6600 }, { 0xcbbc, 0xc88c }, { 0xf966, 0xf900 }, { 0xceec, 0x0cc0 }, { 0xff10, 0x7310 },
+  { 0xff80, 0xec80 }, { 0xccce, 0x08ce }, { 0xeccc, 0xec80 }, { 0x6666, 0x4444 }, { 0x0ff0, 0x0f00 }, { 0x6db6, 0x4924 }, { 0x6bd6, 0x4294 }, { 0xcf3c, 0x0c30 },
+  { 0xc3fc, 0x03c0 }, { 0xffaa, 0xff00 }, { 0xff00, 0x5500 }, { 0xfcfc, 0xcccc }, { 0xcccc, 0x0c0c }, { 0xf6f6, 0x6666 }, { 0xaffa, 0x0ff0 }, { 0xfff0, 0x5550 },
+  { 0xfaaa, 0xf000 }, { 0xeeee, 0x0e0e }, { 0xf8f8, 0x8888 }, { 0xfff0, 0x9990 }, { 0xeeee, 0xe00e }, { 0x8ff8, 0x8888 }, { 0xf666, 0xf000 }, { 0xff00, 0x9900 },
+  { 0xff66, 0xff00 }, { 0xcccc, 0xc00c }, { 0xcffc, 0xcccc }, { 0xf000, 0x9000 }, { 0x8888, 0x0808 }, { 0xfefe, 0xeeee }, { 0xfffa, 0xfff0 }, { 0x7bde, 0x7310 }
 };
 
 static const uint32 kWMValues[] = { 0x32b92180, 0x32ba3080, 0x31103200, 0x28103c80, 0x32bb3080, 0x25903600, 0x3530b900, 0x3b32b180, 0x34b5b980 };
@@ -75,122 +75,127 @@ static const uint32 kNumWMVals = sizeof(kWMValues) / sizeof(kWMValues[0]);
 static uint32 gWMVal = -1;
 
 static const int kAnchorIdx3[2][kNumShapes3] = {
-	{ 3, 3,15,15, 8, 3,15,15,
+  { 3, 3,15,15, 8, 3,15,15,
      8, 8, 6, 6, 6, 5, 3, 3,
      3, 3, 8,15, 3, 3, 6,10,
      5, 8, 8, 6, 8, 5,15,15,
      8,15, 3, 5, 6,10, 8,15,
     15, 3,15, 5,15,15,15,15,
      3,15, 5, 5, 5, 8, 5,10,
-	 5,10, 8,13,15,12, 3, 3 },
+   5,10, 8,13,15,12, 3, 3 },
 
-	{ 15, 8, 8, 3,15,15, 3, 8,
+  { 15, 8, 8, 3,15,15, 3, 8,
     15,15,15,15,15,15,15, 8,
     15, 8,15, 3,15, 8,15, 8,
      3,15, 6,10,15,15,10, 8,
     15, 3,15,10,10, 8, 9,10,
      6,15, 8,15, 3, 6, 6, 8,
     15, 3,15,15,15,15,15,15,
-	15,15,15,15, 3,15,15, 8 }
+  15,15,15,15, 3,15,15, 8 }
 };
 
+template <typename T>
+static inline T sad(const T &a, const T &b) {
+  return (a > b)? a - b : b - a;
+}
+
 static int GetSubsetForIndex(int idx, const int shapeIdx, const int nSubsets) {
-	int subset = 0;
-	
-	switch(nSubsets) {
-		case 2:
-		{
-			subset = !!((1 << idx) & kShapeMask2[shapeIdx]);
-		}
-		break;
+  int subset = 0;
+  
+  switch(nSubsets) {
+    case 2:
+    {
+      subset = !!((1 << idx) & kShapeMask2[shapeIdx]);
+    }
+    break;
 
-		case 3:
-		{
-			if(1 << idx & kShapeMask3[shapeIdx][0])
-				subset = 1 + !!((1 << idx) & kShapeMask3[shapeIdx][1]);
-			else
-				subset = 0;
-		}
-		break;
+    case 3:
+    {
+      if(1 << idx & kShapeMask3[shapeIdx][0])
+        subset = 1 + !!((1 << idx) & kShapeMask3[shapeIdx][1]);
+      else
+        subset = 0;
+    }
+    break;
 
-		default:
-		break;
-	}
+    default:
+    break;
+  }
 
-	return subset;
+  return subset;
 }
 
 static int GetAnchorIndexForSubset(int subset, const int shapeIdx, const int nSubsets) {
-	
-	int anchorIdx = 0;
-	switch(subset) {
-		case 1:
-		{
-			if(nSubsets == 2) {
-				anchorIdx = kAnchorIdx2[shapeIdx];
-			}
-			else {
-				anchorIdx = kAnchorIdx3[0][shapeIdx];
-			}
-		}
-		break;
+  
+  int anchorIdx = 0;
+  switch(subset) {
+    case 1:
+    {
+      if(nSubsets == 2) {
+        anchorIdx = kAnchorIdx2[shapeIdx];
+      }
+      else {
+        anchorIdx = kAnchorIdx3[0][shapeIdx];
+      }
+    }
+    break;
 
-		case 2:
-		{
-			assert(nSubsets == 3);
-			anchorIdx = kAnchorIdx3[1][shapeIdx];
-		}
-		break;
+    case 2:
+    {
+      assert(nSubsets == 3);
+      anchorIdx = kAnchorIdx3[1][shapeIdx];
+    }
+    break;
 
-		default:
-		break;
-	}
+    default:
+    break;
+  }
 
-	return anchorIdx;
+  return anchorIdx;
 }
 
 static int GetPointMaskForSubset(int subset, const int shapeIdx, const int nSubsets) {
-	int mask = 0xFFFF;
+  int mask = 0xFFFF;  // Fallback when nSubsets is neither 2 nor 3: all 16 bits set.
 
-	assert(subset < nSubsets);
+  assert(subset < nSubsets);
 
-	switch(nSubsets) {
-		case 2:
-		{
-			mask = (subset)? kShapeMask2[shapeIdx] : ~(kShapeMask2[shapeIdx]);
-		}
-		break;
+  switch(nSubsets) {
+    case 2:
+    {
+      mask = (subset)? kShapeMask2[shapeIdx] : (~kShapeMask2[shapeIdx] & 0xFFFF);  // uint16 promotes to int before ~; drop bits 16 and up.
+    }
+    break;
 
-		case 3:
-		{
-			switch(subset) {
-				default:
-				case 0:
-				{
-					mask = ~(kShapeMask3[shapeIdx][0]);
-				}
-				break;
+    case 3:
+    {
+      switch(subset) {
+        default:
+        case 0:
+        {
+          mask = ~(kShapeMask3[shapeIdx][0]) & 0xFFFF;  // Subset 0: bits clear in mask [0]; keep only the low 16 bits.
+        }
+        break;
 
-				case 1:
-				{
-					mask = ~(~(kShapeMask3[shapeIdx][0]) | kShapeMask3[shapeIdx][1]);
-				}
-				break;
+        case 1:
+        {
+          mask = ~(~(kShapeMask3[shapeIdx][0]) | kShapeMask3[shapeIdx][1]);  // Subset 1: set in mask [0] and clear in mask [1].
+        }
+        break;
 
-				case 2:
-				{
-					mask = kShapeMask3[shapeIdx][1];
-				}
-				break;
-			}
-		}
-		break;
+        case 2:
+        {
+          mask = kShapeMask3[shapeIdx][1];  // Subset 2: mask [1] taken as-is.
+        }
+        break;
+      }
+    }
+    break;
 
-		default:
-		break;
-	}
+    default:
+    break;
+  }
 
-	return mask;
+  return mask;
 }
 
 #ifndef min
@@ -203,314 +208,314 @@ static int GetPointMaskForSubset(int subset, const int shapeIdx, const int nSubs
 
 template <typename T>
 static void insert(T* buf, int bufSz, T newVal, int idx = 0) {
-	int safeIdx = min(bufSz-1, max(idx, 0));
-	for(int i = bufSz - 1; i > safeIdx; i--) {
-		buf[i] = buf[i-1];
-	}
-	buf[safeIdx] = newVal;
+  int safeIdx = min(bufSz-1, max(idx, 0));
+  for(int i = bufSz - 1; i > safeIdx; i--) {
+    buf[i] = buf[i-1];
+  }
+  buf[safeIdx] = newVal;
 }
 
 template <typename T>
 static inline void swap(T &a, T &b) { T t = a; a = b; b = t; }
 
 const uint32 kBC7InterpolationValues[4][16][2] = {
-	{ {64, 0}, {33, 31}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-	{ {64, 0}, {43, 21}, {21, 43}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
-	{ {64, 0}, {55, 9}, {46, 18}, {37, 27}, {27, 37}, {18, 46}, {9, 55}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0 },
-	{ {64, 0}, {60, 4}, {55, 9}, {51, 13}, {47, 17}, {43, 21}, {38, 26}, {34, 30}, {30, 34}, {26, 38}, {21, 43}, {17, 47}, {13, 51}, {9, 55}, {4, 60}, {0, 64} }
+  { {64, 0}, {33, 31}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { {64, 0}, {43, 21}, {21, 43}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { {64, 0}, {55, 9}, {46, 18}, {37, 27}, {27, 37}, {18, 46}, {9, 55}, {0, 64}, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { {64, 0}, {60, 4}, {55, 9}, {51, 13}, {47, 17}, {43, 21}, {38, 26}, {34, 30}, {30, 34}, {26, 38}, {21, 43}, {17, 47}, {13, 51}, {9, 55}, {4, 60}, {0, 64} }
 };
 
 int BC7CompressionMode::MaxAnnealingIterations = 50; // This is a setting.
 int BC7CompressionMode::NumUses[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 
 BC7CompressionMode::Attributes BC7CompressionMode::kModeAttributes[kNumModes] = {
-	{ 0, 4, 3, 3, 0, 4, 0, false, false, BC7CompressionMode::ePBitType_NotShared },
-	{ 1, 6, 2, 3, 0, 6, 0, false, false, BC7CompressionMode::ePBitType_Shared },
-	{ 2, 6, 3, 2, 0, 5, 0, false, false, BC7CompressionMode::ePBitType_None },
-	{ 3, 6, 2, 2, 0, 7, 0, false, false, BC7CompressionMode::ePBitType_NotShared },
-	{ 4, 0, 1, 2, 3, 5, 6, true,  true,  BC7CompressionMode::ePBitType_None },
-	{ 5, 0, 1, 2, 2, 7, 8, true,  false, BC7CompressionMode::ePBitType_None },
-	{ 6, 0, 1, 4, 0, 7, 7, false, false, BC7CompressionMode::ePBitType_NotShared },
-	{ 7, 6, 2, 2, 0, 5, 5, false, false, BC7CompressionMode::ePBitType_NotShared },
+  { 0, 4, 3, 3, 0, 4, 0, false, false, BC7CompressionMode::ePBitType_NotShared },
+  { 1, 6, 2, 3, 0, 6, 0, false, false, BC7CompressionMode::ePBitType_Shared },
+  { 2, 6, 3, 2, 0, 5, 0, false, false, BC7CompressionMode::ePBitType_None },
+  { 3, 6, 2, 2, 0, 7, 0, false, false, BC7CompressionMode::ePBitType_NotShared },
+  { 4, 0, 1, 2, 3, 5, 6, true,  true,  BC7CompressionMode::ePBitType_None },
+  { 5, 0, 1, 2, 2, 7, 8, true,  false, BC7CompressionMode::ePBitType_None },
+  { 6, 0, 1, 4, 0, 7, 7, false, false, BC7CompressionMode::ePBitType_NotShared },
+  { 7, 6, 2, 2, 0, 5, 5, false, false, BC7CompressionMode::ePBitType_NotShared },
 };
 
 void BC7CompressionMode::ClampEndpointsToGrid(RGBAVector &p1, RGBAVector &p2, int &bestPBitCombo) const {
-	const int nPbitCombos = GetNumPbitCombos();
-	const bool hasPbits = nPbitCombos > 1;
-	const uint32 qmask = GetQuantizationMask();
+  const int nPbitCombos = GetNumPbitCombos();
+  const bool hasPbits = nPbitCombos > 1;
+  const uint32 qmask = GetQuantizationMask();
 
-	ClampEndpoints(p1, p2);
+  ClampEndpoints(p1, p2);
 
-	// !SPEED! This can be faster.
-	float minDist = FLT_MAX;
-	RGBAVector bp1, bp2;
-	for(int i = 0; i < nPbitCombos; i++) {
+  // !SPEED! This can be faster.
+  float minDist = FLT_MAX;
+  RGBAVector bp1, bp2;
+  for(int i = 0; i < nPbitCombos; i++) {
 
-		uint32 qp1, qp2;
-		if(hasPbits) {
-			qp1 = p1.ToPixel(qmask, GetPBitCombo(i)[0]);
-			qp2 = p2.ToPixel(qmask, GetPBitCombo(i)[1]);
-		}
-		else {
-			qp1 = p1.ToPixel(qmask);
-			qp2 = p2.ToPixel(qmask);
-		}
+    uint32 qp1, qp2;
+    if(hasPbits) {
+      qp1 = p1.ToPixel(qmask, GetPBitCombo(i)[0]);
+      qp2 = p2.ToPixel(qmask, GetPBitCombo(i)[1]);
+    }
+    else {
+      qp1 = p1.ToPixel(qmask);
+      qp2 = p2.ToPixel(qmask);
+    }
 
-		uint8 *pqp1 = (uint8 *)&qp1;
-		uint8 *pqp2 = (uint8 *)&qp2;
+    uint8 *pqp1 = (uint8 *)&qp1;
+    uint8 *pqp2 = (uint8 *)&qp2;
 
-		RGBAVector np1 = RGBAVector(float(pqp1[0]), float(pqp1[1]), float(pqp1[2]), float(pqp1[3]));
-		RGBAVector np2 = RGBAVector(float(pqp2[0]), float(pqp2[1]), float(pqp2[2]), float(pqp2[3]));
+    RGBAVector np1 = RGBAVector(float(pqp1[0]), float(pqp1[1]), float(pqp1[2]), float(pqp1[3]));
+    RGBAVector np2 = RGBAVector(float(pqp2[0]), float(pqp2[1]), float(pqp2[2]), float(pqp2[3]));
 
-		RGBAVector d1 = np1 - p1;
-		RGBAVector d2 = np2 - p2;
-		float dist = (d1 * d1) + (d2 * d2);
-		if(dist < minDist) {
-			minDist = dist;
-			bp1 = np1; bp2 = np2;
-			bestPBitCombo = i;
-		}
-	}
+    RGBAVector d1 = np1 - p1;
+    RGBAVector d2 = np2 - p2;
+    float dist = (d1 * d1) + (d2 * d2);
+    if(dist < minDist) {
+      minDist = dist;
+      bp1 = np1; bp2 = np2;
+      bestPBitCombo = i;
+    }
+  }
 
-	p1 = bp1;
-	p2 = bp2;
+  p1 = bp1;
+  p2 = bp2;
 }
 
 double BC7CompressionMode::CompressSingleColor(const RGBAVector &p, RGBAVector &p1, RGBAVector &p2, int &bestPbitCombo) const {
 
-	const uint32 pixel = p.ToPixel();
+  const uint32 pixel = p.ToPixel();
 
-	uint32 bestDist = 0xFF;
-	bestPbitCombo = -1;
+  uint32 bestDist = 0xFF;
+  bestPbitCombo = -1;
 
-	for(int pbi = 0; pbi < GetNumPbitCombos(); pbi++) {
+  for(int pbi = 0; pbi < GetNumPbitCombos(); pbi++) {
 
-		const int *pbitCombo = GetPBitCombo(pbi);
-		
-		uint32 dist = 0x0;
-		uint32 bestValI[kNumColorChannels] = { -1, -1, -1, -1 };
-		uint32 bestValJ[kNumColorChannels] = { -1, -1, -1, -1 };
+    const int *pbitCombo = GetPBitCombo(pbi);
+    
+    uint32 dist = 0x0;
+    uint32 bestValI[kNumColorChannels] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
+    uint32 bestValJ[kNumColorChannels] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
 
-		for(int ci = 0; ci < kNumColorChannels; ci++) {
+    for(int ci = 0; ci < kNumColorChannels; ci++) {
 
-			const uint8 val = (pixel >> (ci * 8)) & 0xFF;
-			int nBits = ci == 3? GetAlphaChannelPrecision() : m_Attributes->colorChannelPrecision;
+      const uint8 val = (pixel >> (ci * 8)) & 0xFF;
+      int nBits = ci == 3? GetAlphaChannelPrecision() : m_Attributes->colorChannelPrecision;
 
-			// If we don't handle this channel, then we don't need to
-			// worry about how well we interpolate.
-			if(nBits == 0) { bestValI[ci] = bestValJ[ci] = 0xFF; continue; }
+      // If we don't handle this channel, then we don't need to
+      // worry about how well we interpolate.
+      if(nBits == 0) { bestValI[ci] = bestValJ[ci] = 0xFF; continue; }
 
-			const int nPossVals = (1 << nBits);
-			int possValsH[256];
-			int possValsL[256];
+      const int nPossVals = (1 << nBits);
+      int possValsH[256];
+      int possValsL[256];
 
-			// Do we have a pbit?
-			const bool havepbit = GetPBitType() != ePBitType_None;
-			if(havepbit)
-				nBits++;
+      // Do we have a pbit?
+      const bool havepbit = GetPBitType() != ePBitType_None;
+      if(havepbit)
+        nBits++;
 
-			for(int i = 0; i < nPossVals; i++) {
+      for(int i = 0; i < nPossVals; i++) {
 
-				int vh = i, vl = i;
-				if(havepbit) {
-					vh <<= 1;
-					vl <<= 1;
+        int vh = i, vl = i;
+        if(havepbit) {
+          vh <<= 1;
+          vl <<= 1;
 
-					vh |= pbitCombo[1];
-					vl |= pbitCombo[0];
-				}
+          vh |= pbitCombo[1];
+          vl |= pbitCombo[0];
+        }
 
-				possValsH[i] = (vh << (8 - nBits));
-				possValsH[i] |= (possValsH[i] >> nBits);
+        possValsH[i] = (vh << (8 - nBits));
+        possValsH[i] |= (possValsH[i] >> nBits);
 
-				possValsL[i] = (vl << (8 - nBits));
-				possValsL[i] |= (possValsL[i] >> nBits);
-			}
+        possValsL[i] = (vl << (8 - nBits));
+        possValsL[i] |= (possValsL[i] >> nBits);
+      }
 
-			const uint32 interpVal0 = kBC7InterpolationValues[GetNumberOfBitsPerIndex() - 1][1][0];
-			const uint32 interpVal1 = kBC7InterpolationValues[GetNumberOfBitsPerIndex() - 1][1][1];
+      const uint32 interpVal0 = kBC7InterpolationValues[GetNumberOfBitsPerIndex() - 1][1][0];
+      const uint32 interpVal1 = kBC7InterpolationValues[GetNumberOfBitsPerIndex() - 1][1][1];
 
-			// Find the closest interpolated val that to the given val...
-			uint32 bestChannelDist = 0xFF;
-			for(int i = 0; bestChannelDist > 0 && i < nPossVals; i++)
-			for(int j = 0; bestChannelDist > 0 && j < nPossVals; j++) {
+      // Find the interpolated val that is closest to the given val...
+      uint32 bestChannelDist = 0xFF;
+      for(int i = 0; bestChannelDist > 0 && i < nPossVals; i++)
+      for(int j = 0; bestChannelDist > 0 && j < nPossVals; j++) {
 
-				const uint32 v1 = possValsL[i];
-				const uint32 v2 = possValsH[j];
+        const uint32 v1 = possValsL[i];
+        const uint32 v2 = possValsH[j];
 
-				const uint32 combo = (interpVal0*v1 + (interpVal1 * v2) + 32) >> 6;
-				const uint32 err = (combo > val)? combo - val : val - combo;
+        const uint32 combo = (interpVal0*v1 + (interpVal1 * v2) + 32) >> 6;
+        const uint32 err = (combo > val)? combo - val : val - combo;
 
-				if(err < bestChannelDist) {
-					bestChannelDist = err;
-					bestValI[ci] = v1;
-					bestValJ[ci] = v2;
-				}
-			}
+        if(err < bestChannelDist) {
+          bestChannelDist = err;
+          bestValI[ci] = v1;
+          bestValJ[ci] = v2;
+        }
+      }
 
-			dist = max(bestChannelDist, dist);
-		}
+      dist = max(bestChannelDist, dist);
+    }
 
-		if(dist < bestDist) {
-			bestDist = dist;
-			bestPbitCombo = pbi;
+    if(dist < bestDist) {
+      bestDist = dist;
+      bestPbitCombo = pbi;
 
-			for(int ci = 0; ci < kNumColorChannels; ci++) {
-				p1.c[ci] = float(bestValI[ci]);
-				p2.c[ci] = float(bestValJ[ci]);
-			}
-		}
-	}
+      for(int ci = 0; ci < kNumColorChannels; ci++) {
+        p1.c[ci] = float(bestValI[ci]);
+        p2.c[ci] = float(bestValJ[ci]);
+      }
+    }
+  }
 
-	return bestDist;
+  return bestDist;
 }
 
 // Fast random number generator. See more information at 
 // http://software.intel.com/en-us/articles/fast-random-number-generator-on-the-intel-pentiumr-4-processor/
 static uint32 g_seed = uint32(time(NULL));
 static inline uint32 fastrand() { 
-	g_seed = (214013 * g_seed + 2531011); 
-	return (g_seed>>16) & RAND_MAX; 
+  g_seed = (214013 * g_seed + 2531011); 
+  return (g_seed>>16) & RAND_MAX; 
 } 
 
 static const int kNumStepDirections = 8;
 static const RGBADir kStepDirections[kNumStepDirections] = {
 
-	// For pBit changes, we have 8 possible directions.
-	RGBADir(RGBAVector(1.0f, 1.0f, 1.0f, 0.0f)), 
-	RGBADir(RGBAVector(-1.0f, 1.0f, 1.0f, 0.0f)),
-	RGBADir(RGBAVector(1.0f, -1.0f, 1.0f, 0.0f)), 
-	RGBADir(RGBAVector(-1.0f, -1.0f, 1.0f, 0.0f)),
-	RGBADir(RGBAVector(1.0f, 1.0f, -1.0f, 0.0f)), 
-	RGBADir(RGBAVector(-1.0f, 1.0f, -1.0f, 0.0f)),
-	RGBADir(RGBAVector(1.0f, -1.0f, -1.0f, 0.0f)), 
-	RGBADir(RGBAVector(-1.0f, -1.0f, -1.0f, 0.0f))
+  // For pBit changes, we have 8 possible directions.
+  RGBADir(RGBAVector(1.0f, 1.0f, 1.0f, 0.0f)), 
+  RGBADir(RGBAVector(-1.0f, 1.0f, 1.0f, 0.0f)),
+  RGBADir(RGBAVector(1.0f, -1.0f, 1.0f, 0.0f)), 
+  RGBADir(RGBAVector(-1.0f, -1.0f, 1.0f, 0.0f)),
+  RGBADir(RGBAVector(1.0f, 1.0f, -1.0f, 0.0f)), 
+  RGBADir(RGBAVector(-1.0f, 1.0f, -1.0f, 0.0f)),
+  RGBADir(RGBAVector(1.0f, -1.0f, -1.0f, 0.0f)), 
+  RGBADir(RGBAVector(-1.0f, -1.0f, -1.0f, 0.0f))
 };
 
 static void ChangePointForDirWithoutPbitChange(RGBAVector &v, int dir, const float step[kNumColorChannels]) {
-	if(dir % 2) {
-		v.x -= step[0];
-	}
-	else {
-		v.x += step[0];
-	}
+  if(dir % 2) {
+    v.x -= step[0];
+  }
+  else {
+    v.x += step[0];
+  }
 
-	if(((dir / 2) % 2)) {
-		v.y -= step[1];
-	}
-	else  {
-		v.y += step[1];
-	}
+  if(((dir / 2) % 2)) {
+    v.y -= step[1];
+  }
+  else  {
+    v.y += step[1];
+  }
 
-	if(((dir / 4) % 2)) {
-		v.z -= step[2];
-	}
-	else {
-		v.z += step[2];
-	}
+  if(((dir / 4) % 2)) {
+    v.z -= step[2];
+  }
+  else {
+    v.z += step[2];
+  }
 
-	if(((dir / 8) % 2)) {
-		v.a -= step[3];
-	}
-	else {
-		v.a += step[3];
-	}
+  if(((dir / 8) % 2)) {
+    v.a -= step[3];
+  }
+  else {
+    v.a += step[3];
+  }
 }
 
 static void ChangePointForDirWithPbitChange(RGBAVector &v, int dir, int oldPbit, const float step[kNumColorChannels]) {
-	if(dir % 2 && oldPbit == 0) {
-		v.x -= step[0];
-	}
-	else if(!(dir % 2) && oldPbit == 1) {
-		v.x += step[0];
-	}
+  if(dir % 2 && oldPbit == 0) {
+    v.x -= step[0];
+  }
+  else if(!(dir % 2) && oldPbit == 1) {
+    v.x += step[0];
+  }
 
-	if(((dir / 2) % 2) && oldPbit == 0) {
-		v.y -= step[1];
-	}
-	else if(!((dir / 2) % 2) && oldPbit == 1) {
-		v.y += step[1];
-	}
+  if(((dir / 2) % 2) && oldPbit == 0) {
+    v.y -= step[1];
+  }
+  else if(!((dir / 2) % 2) && oldPbit == 1) {
+    v.y += step[1];
+  }
 
-	if(((dir / 4) % 2) && oldPbit == 0) {
-		v.z -= step[2];
-	}
-	else if(!((dir / 4) % 2) && oldPbit == 1) {
-		v.z += step[2];
-	}
+  if(((dir / 4) % 2) && oldPbit == 0) {
+    v.z -= step[2];
+  }
+  else if(!((dir / 4) % 2) && oldPbit == 1) {
+    v.z += step[2];
+  }
 
-	if(((dir / 8) % 2) && oldPbit == 0) {
-		v.a -= step[3];
-	}
-	else if(!((dir / 8) % 2) && oldPbit == 1) {
-		v.a += step[3];
-	}
+  if(((dir / 8) % 2) && oldPbit == 0) {
+    v.a -= step[3];
+  }
+  else if(!((dir / 8) % 2) && oldPbit == 1) {
+    v.a += step[3];
+  }
 }
 
 void BC7CompressionMode::PickBestNeighboringEndpoints(const RGBACluster &cluster, const RGBAVector &p1, const RGBAVector &p2, const int curPbitCombo, RGBAVector &np1, RGBAVector &np2, int &nPbitCombo, const VisitedState *visitedStates, int nVisited, float stepSz) const {
-	
-	// !SPEED! There might be a way to make this faster since we're working
-	// with floating point values that are powers of two. We should be able
-	// to just set the proper bits in the exponent and leave the mantissa to 0.
-	float step[kNumColorChannels] = {
-		stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)),
-		stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)),
-		stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)),
-		stepSz * float(1 << (8 - GetAlphaChannelPrecision()))
-	};
+  
+  // !SPEED! There might be a way to make this faster since we're working
+  // with floating point values that are powers of two. We should be able
+  // to just set the proper bits in the exponent and leave the mantissa to 0.
+  float step[kNumColorChannels] = {
+    stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)),
+    stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)),
+    stepSz * float(1 << (8 - m_Attributes->colorChannelPrecision)),
+    stepSz * float(1 << (8 - GetAlphaChannelPrecision()))
+  };
 
-	if(m_IsOpaque) {
-		step[(GetRotationMode() + 3) % kNumColorChannels] = 0.0f;
-	}
+  if(m_IsOpaque) {
+    step[(GetRotationMode() + 3) % kNumColorChannels] = 0.0f;  // Zero step: this channel is never moved below.
+  }
 
-	// First, let's figure out the new pbit combo... if there's no pbit then we don't need
-	// to worry about it.
-	const bool hasPbits = GetPBitType() != ePBitType_None;
-	if(hasPbits) {
+  // First, let's figure out the new pbit combo... if there's no pbit then we don't need
+  // to worry about it.
+  const bool hasPbits = GetPBitType() != ePBitType_None;
+  if(hasPbits) {
 
-		// If there is a pbit, then we must change it, because those will provide the closest values
-		// to the current point.
-		if(GetPBitType() == ePBitType_Shared)
-			nPbitCombo = (curPbitCombo + 1) % 2;
-		else {
-			// Not shared... p1 needs to change and p2 needs to change... which means that 
-			// combo 0 gets rotated to combo 3, combo 1 gets rotated to combo 2 and vice
-			// versa...
-			nPbitCombo = 3 - curPbitCombo;
-		}
+    // If there is a pbit, then we must change it, because those will provide the closest values
+    // to the current point.
+    if(GetPBitType() == ePBitType_Shared)
+      nPbitCombo = (curPbitCombo + 1) % 2;
+    else {
+      // Not shared... p1 needs to change and p2 needs to change... which means that 
+      // combo 0 gets rotated to combo 3, combo 1 gets rotated to combo 2 and vice
+      // versa...
+      nPbitCombo = 3 - curPbitCombo;
+    }
 
-		assert(GetPBitCombo(curPbitCombo)[0] + GetPBitCombo(nPbitCombo)[0] == 1);
-		assert(GetPBitCombo(curPbitCombo)[1] + GetPBitCombo(nPbitCombo)[1] == 1);
-	}
+    assert(GetPBitCombo(curPbitCombo)[0] + GetPBitCombo(nPbitCombo)[0] == 1);
+    assert(GetPBitCombo(curPbitCombo)[1] + GetPBitCombo(nPbitCombo)[1] == 1);
+  }
 
-	bool visited = true;
-	int infLoopPrevent = -1;
-	while(visited && ++infLoopPrevent < 16) {
-		for(int pt = 0; pt < 2; pt++) {
+  bool visited = true;
+  int infLoopPrevent = -1;
+  while(visited && ++infLoopPrevent < 16) {  // Give up after 16 draws even if every candidate was already visited.
+    for(int pt = 0; pt < 2; pt++) {
 
-			const RGBAVector &p = (pt)? p1 : p2;
-			RGBAVector &np = (pt)? np1 : np2;
+      const RGBAVector &p = (pt)? p2 : p1;  // pt also indexes GetPBitCombo(): [0] is p1's pbit, [1] is p2's.
+      RGBAVector &np = (pt)? np2 : np1;
 
-			np = p;
-			if(hasPbits) 
-				ChangePointForDirWithPbitChange(np, fastrand() % 16, GetPBitCombo(curPbitCombo)[pt], step);
-			else
-				ChangePointForDirWithoutPbitChange(np, fastrand() % 16, step);
+      np = p;
+      if(hasPbits) 
+        ChangePointForDirWithPbitChange(np, fastrand() % 16, GetPBitCombo(curPbitCombo)[pt], step);
+      else
+        ChangePointForDirWithoutPbitChange(np, fastrand() % 16, step);
 
-			for(int i = 0; i < kNumColorChannels; i++) {
-				np.c[i] = min(max(np.c[i], 0.0f), 255.0f);
-			}
-		}
+      for(int i = 0; i < kNumColorChannels; i++) {
+        np.c[i] = min(max(np.c[i], 0.0f), 255.0f);
+      }
+    }
 
-		visited = false;
-		for(int i = 0; i < nVisited; i++) {
-			visited = visited || (
-				visitedStates[i].p1 == np1 &&
-				visitedStates[i].p2 == np2 &&
-				visitedStates[i].pBitCombo == nPbitCombo
-			);
-		}
-	}
+    visited = false;
+    for(int i = 0; i < nVisited; i++) {
+      visited = visited || (
+        visitedStates[i].p1 == np1 &&
+        visitedStates[i].p2 == np2 &&
+        visitedStates[i].pBitCombo == nPbitCombo
+      );
+    }
+  }
 }
 
 // Fast generation of floats between 0 and 1. It generates a float
@@ -523,1420 +528,1440 @@ void BC7CompressionMode::PickBestNeighboringEndpoints(const RGBACluster &cluster
 COMPILE_ASSERT(RAND_MAX == 0x7FFF)
 
 static inline float frand() { 
-	const uint16 r = fastrand();
-	
-	// RAND_MAX is 0x7FFF, which offers 15 bits
-	// of precision. Therefore, we move the bits
-	// into the top of the 23 bit mantissa, and 
-	// repeat the most significant bits of r in 
-	// the least significant of the mantissa
-	const uint32 m = (r << 8) | (r >> 7);
-	const uint32 flt = (127 << 23) | m;
-	return *(reinterpret_cast<const float *>(&flt)) - 1.0f;
+  const uint16 r = fastrand();
+  
+  // RAND_MAX is 0x7FFF, i.e. 15 bits of precision: place them at the top of
+  // the 23 bit mantissa and repeat r's most significant bits at the bottom.
+  const uint32 m = (r << 8) | (r >> 7);
+  const uint32 flt = (127 << 23) | m;
+  // Copy the bits out; reading flt through a float pointer breaks aliasing rules.
+  float f;
+  memcpy(&f, &flt, sizeof(f));
+  return f - 1.0f;
 }
 
 bool BC7CompressionMode::AcceptNewEndpointError(double newError, double oldError, float temp) const {
 
-	// Always accept better endpoints.
-	if(newError < oldError)
-		return true;
+  // Better endpoints always win; at temp <= 0 reject the rest (no x/0 below).
+  if(newError < oldError) return true;
+  if(temp <= 0.0f) return false;
 
-	const double p = exp((0.1f * (oldError - newError)) / temp);
-	const double r = frand();
+  const double p = exp((0.1f * (oldError - newError)) / temp);
+  const double r = frand();
 
-	return r < p;
+  return r < p;
 }
 
 double BC7CompressionMode::OptimizeEndpointsForCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int &bestPbitCombo) const {
-	
-	const int nBuckets = (1 << GetNumberOfBitsPerIndex());
-	const int nPbitCombos = GetNumPbitCombos();
-	const uint32 qmask = GetQuantizationMask();
+  
+  const int nBuckets = (1 << GetNumberOfBitsPerIndex());
+  const int nPbitCombos = GetNumPbitCombos();
+  const uint32 qmask = GetQuantizationMask();
 
-	// Here we use simulated annealing to traverse the space of clusters to find the best possible endpoints.
-	double curError = cluster.QuantizedError(p1, p2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(bestPbitCombo), bestIndices);
-	int curPbitCombo = bestPbitCombo;
-	double bestError = curError;
+  // Here we use simulated annealing to traverse the space of clusters to find the best possible endpoints.
+  double curError = cluster.QuantizedError(p1, p2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(bestPbitCombo), bestIndices);
+  int curPbitCombo = bestPbitCombo;
+  double bestError = curError;
 
-	// Clamp endpoints to the grid...
-	uint32 qp1, qp2;
-	if(GetPBitType() != ePBitType_None) {
-		qp1 = p1.ToPixel(qmask, GetPBitCombo(bestPbitCombo)[0]);
-		qp2 = p2.ToPixel(qmask, GetPBitCombo(bestPbitCombo)[1]);
-	}
-	else {
-		qp1 = p1.ToPixel(qmask);
-		qp2 = p2.ToPixel(qmask);
-	}
+  // Clamp endpoints to the grid...
+  uint32 qp1, qp2;
+  if(GetPBitType() != ePBitType_None) {
+    qp1 = p1.ToPixel(qmask, GetPBitCombo(bestPbitCombo)[0]);
+    qp2 = p2.ToPixel(qmask, GetPBitCombo(bestPbitCombo)[1]);
+  }
+  else {
+    qp1 = p1.ToPixel(qmask);
+    qp2 = p2.ToPixel(qmask);
+  }
 
-	uint8 *pqp1 = (uint8 *)&qp1;
-	uint8 *pqp2 = (uint8 *)&qp2;
+  uint8 *pqp1 = (uint8 *)&qp1;
+  uint8 *pqp2 = (uint8 *)&qp2;
 
-	p1 = RGBAVector(float(pqp1[0]), float(pqp1[1]), float(pqp1[2]), float(pqp1[3]));
-	p2 = RGBAVector(float(pqp2[0]), float(pqp2[1]), float(pqp2[2]), float(pqp2[3]));
+  p1 = RGBAVector(float(pqp1[0]), float(pqp1[1]), float(pqp1[2]), float(pqp1[3]));
+  p2 = RGBAVector(float(pqp2[0]), float(pqp2[1]), float(pqp2[2]), float(pqp2[3]));
 
-	RGBAVector bp1 = p1, bp2 = p2;
+  RGBAVector bp1 = p1, bp2 = p2;
 
-	assert(curError == cluster.QuantizedError(p1, p2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(bestPbitCombo)));
-	
-	int lastVisitedState = 0;
-	VisitedState visitedStates[kMaxAnnealingIterations];
+  assert(curError == cluster.QuantizedError(p1, p2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(bestPbitCombo)));
+  
+  int lastVisitedState = 0;
+  VisitedState visitedStates[kMaxAnnealingIterations];
 
-	visitedStates[lastVisitedState].p1 = p1;
-	visitedStates[lastVisitedState].p2 = p2; 
-	visitedStates[lastVisitedState].pBitCombo = curPbitCombo;
-	lastVisitedState++;
+  visitedStates[lastVisitedState].p1 = p1;
+  visitedStates[lastVisitedState].p2 = p2; 
+  visitedStates[lastVisitedState].pBitCombo = curPbitCombo;
+  lastVisitedState++;
 
-	const int maxEnergy = MaxAnnealingIterations;
+  const int maxEnergy = MaxAnnealingIterations;
 
-	for(int energy = 0; bestError > 0 && energy < maxEnergy; energy++) {
+  for(int energy = 0; bestError > 0 && energy < maxEnergy; energy++) {
 
-		float temp = float(energy) / float(maxEnergy-1);
+    float temp = float(energy) / float(maxEnergy-1);
 
-		int indices[kMaxNumDataPoints];
-		RGBAVector np1, np2;
-		int nPbitCombo;
+    int indices[kMaxNumDataPoints];
+    RGBAVector np1, np2;
+    int nPbitCombo;
 
-		PickBestNeighboringEndpoints(cluster, p1, p2, curPbitCombo, np1, np2, nPbitCombo, visitedStates, lastVisitedState);
+    PickBestNeighboringEndpoints(cluster, p1, p2, curPbitCombo, np1, np2, nPbitCombo, visitedStates, lastVisitedState);
 
-		double error = cluster.QuantizedError(np1, np2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(nPbitCombo), indices);
-		if(AcceptNewEndpointError(error, curError, temp)) {
-			curError = error;
-			p1 = np1;
-			p2 = np2;
-			curPbitCombo = nPbitCombo;
-		}
+    double error = cluster.QuantizedError(np1, np2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(nPbitCombo), indices);
+    if(AcceptNewEndpointError(error, curError, temp)) {
+      curError = error;
+      p1 = np1;
+      p2 = np2;
+      curPbitCombo = nPbitCombo;
+    }
 
-		if(error < bestError) {
-			memcpy(bestIndices, indices, sizeof(indices));
-			bp1 = np1;
-			bp2 = np2;
-			bestPbitCombo = nPbitCombo;
-			bestError = error;
+    if(error < bestError) {
+      memcpy(bestIndices, indices, sizeof(indices));
+      bp1 = np1;
+      bp2 = np2;
+      bestPbitCombo = nPbitCombo;
+      bestError = error;
 
-			visitedStates[lastVisitedState].p1 = np1;
-			visitedStates[lastVisitedState].p2 = np2; 
-			visitedStates[lastVisitedState].pBitCombo = nPbitCombo;
-			lastVisitedState++;
+      if(lastVisitedState < kMaxAnnealingIterations) { // restarts may outnumber slots
+        VisitedState &slot = visitedStates[lastVisitedState++];
+        slot.p1 = np1; slot.p2 = np2; slot.pBitCombo = nPbitCombo;
+      }
 
-			// Restart...
-			energy = 0;
-		}
-	}
+      // Restart...
+      energy = 0;
+    }
+  }
 
-	p1 = bp1;
-	p2 = bp2;
+  p1 = bp1;
+  p2 = bp2;
 
-	return bestError;
+  return bestError;
 }
 
 double BC7CompressionMode::CompressCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int *alphaIndices) const {
-		
-	assert(GetModeNumber() == 4 || GetModeNumber() == 5);
-	assert(GetNumberOfSubsets() == 1);
-	assert(cluster.GetNumPoints() == kMaxNumDataPoints);
-	assert(m_Attributes->alphaChannelPrecision > 0);
+    
+  assert(GetModeNumber() == 4 || GetModeNumber() == 5);
+  assert(GetNumberOfSubsets() == 1);
+  assert(cluster.GetNumPoints() == kMaxNumDataPoints);
+  assert(m_Attributes->alphaChannelPrecision > 0);
 
-	// If all the points are the same in the cluster, then we need to figure out what the best
-	// approximation to this point is....
-	if(cluster.AllSamePoint()) {
+  // If all the points are the same in the cluster, then we need to figure out what the best
+  // approximation to this point is....
+  if(cluster.AllSamePoint()) {
 
-		assert(!"We should only be using this function in modes 4 & 5 that have a single subset, in which case single colors should have been detected much earlier.");
+    assert(!"We should only be using this function in modes 4 & 5 that have a single subset, in which case single colors should have been detected much earlier.");
 
-		const RGBAVector &p = cluster.GetPoint(0);
-		int dummyPbit = 0;
-		double bestErr = CompressSingleColor(p, p1, p2, dummyPbit);
+    const RGBAVector &p = cluster.GetPoint(0);
+    int dummyPbit = 0;
+    double bestErr = CompressSingleColor(p, p1, p2, dummyPbit);
 
-		// We're assuming all indices will be index 1...
-		for(int i = 0; i < cluster.GetNumPoints(); i++) {
-			bestIndices[i] = 1;
-			alphaIndices[i] = 1;
-		}
-		
-		return bestErr;
-	}
+    // We're assuming all indices will be index 1...
+    for(int i = 0; i < cluster.GetNumPoints(); i++) {
+      bestIndices[i] = 1;
+      alphaIndices[i] = 1;
+    }
+    
+    return bestErr;
+  }
 
-	RGBACluster rgbCluster;
-	float alphaVals[kMaxNumDataPoints];
+  RGBACluster rgbCluster;
+  float alphaVals[kMaxNumDataPoints];
 
-	float alphaMin = FLT_MAX, alphaMax = -FLT_MAX;
-	for(int i = 0; i < cluster.GetNumPoints(); i++) {
+  float alphaMin = FLT_MAX, alphaMax = -FLT_MAX;
+  for(int i = 0; i < cluster.GetNumPoints(); i++) {
 
-		RGBAVector v = cluster.GetPoint(i);
-		switch(GetRotationMode()) {
-			default:
-			case 0:
-				// Do nothing
-			break;
+    RGBAVector v = cluster.GetPoint(i);
+    switch(GetRotationMode()) {
+      default:
+      case 0:
+        // Do nothing
+      break;
 
-			case 1:
-				swap(v.r, v.a);
-				break;
+      case 1:
+        swap(v.r, v.a);
+        break;
 
-			case 2:
-				swap(v.g, v.a);
-				break;
+      case 2:
+        swap(v.g, v.a);
+        break;
 
-			case 3:
-				swap(v.b, v.a);
-				break;
-		}
+      case 3:
+        swap(v.b, v.a);
+        break;
+    }
 
-		alphaVals[i] = v.a;
-		v.a = 255.0f;
+    alphaVals[i] = v.a;
+    v.a = 255.0f;
 
-		alphaMin = min(alphaVals[i], alphaMin);
-		alphaMax = max(alphaVals[i], alphaMax);
+    alphaMin = min(alphaVals[i], alphaMin);
+    alphaMax = max(alphaVals[i], alphaMax);
 
-		rgbCluster.AddPoint(v);
-	}
+    rgbCluster.AddPoint(v);
+  }
 
-	int dummyPbit = 0;
-	RGBAVector rgbp1, rgbp2;
-	double rgbError = CompressCluster(rgbCluster, rgbp1, rgbp2, bestIndices, dummyPbit);
+  int dummyPbit = 0;
+  RGBAVector rgbp1, rgbp2;
+  double rgbError = CompressCluster(rgbCluster, rgbp1, rgbp2, bestIndices, dummyPbit);
 
-	float a1 = alphaMin, a2 = alphaMax;
-	double alphaError = DBL_MAX;
+  float a1 = alphaMin, a2 = alphaMax;
+  double alphaError = 0.0; // summed with += below; DBL_MAX would swamp the sum
 
-	typedef uint32 tInterpPair[2];
-	typedef tInterpPair tInterpLevel[16];
-	const tInterpLevel *interpVals = kBC7InterpolationValues + (GetNumberOfBitsPerAlpha() - 1);
-	const float weight = GetErrorMetric().a;
+  typedef uint32 tInterpPair[2];
+  typedef tInterpPair tInterpLevel[16];
+  const tInterpLevel *interpVals = kBC7InterpolationValues + (GetNumberOfBitsPerAlpha() - 1);
+  const float weight = GetErrorMetric().a;
 
-	const int nBuckets = (1 << GetNumberOfBitsPerAlpha());
+  const int nBuckets = (1 << GetNumberOfBitsPerAlpha());
 
-	// If they're the same, then we can get them exactly.
-	if(a1 == a2) 
-	{	
-		const uint8 step = 1 << (8-GetAlphaChannelPrecision());
-		const uint8 a1be = uint8(a1);
-		const uint8 a2be = uint8(a2);
-		const uint8 a1b = ::QuantizeChannel(a1be, (((char)0x80) >> (GetAlphaChannelPrecision() - 1)));
-		const uint8 a2b = ::QuantizeChannel(a2be, (((char)0x80) >> (GetAlphaChannelPrecision() - 1)));
+  // If they're the same, then we can get them exactly.
+  if(a1 == a2) 
+  {  
+    const uint8 step = 1 << (8-GetAlphaChannelPrecision());
+    const uint8 a1be = uint8(a1);
+    const uint8 a2be = uint8(a2);
+    const uint8 a1b = ::QuantizeChannel(a1be, (((char)0x80) >> (GetAlphaChannelPrecision() - 1)));
+    const uint8 a2b = ::QuantizeChannel(a2be, (((char)0x80) >> (GetAlphaChannelPrecision() - 1)));
 
-		// Mode 5 has 8 bits of precision for alpha.
-		if(GetModeNumber() == 5) {
+    // Mode 5 has 8 bits of precision for alpha.
+    if(GetModeNumber() == 5) {
 
-			assert(a1 == float(a1b));
-			assert(a2 == float(a2b));
+      assert(a1 == float(a1b));
+      assert(a2 == float(a2b));
 
-			for(int i = 0; i < kMaxNumDataPoints; i++)
-				alphaIndices[i] = 0;
+      for(int i = 0; i < kMaxNumDataPoints; i++)
+        alphaIndices[i] = 0;
 
-			alphaError = 0.0;
-		}
-		else {
-			assert(GetModeNumber() == 4);
-			
-			// Mode 4 can be treated like the 6 channel of DXT1 compression.
-			if(Optimal6CompressDXT1[a1be][0][0]) {
-				a1 = float((Optimal6CompressDXT1[a1be][1][1] << 2) | (Optimal6CompressDXT1[a1be][0][1] >> 4));
-				a2 = float((Optimal6CompressDXT1[a2be][1][2] << 2) | (Optimal6CompressDXT1[a2be][0][1] >> 4));
-			}
-			else {
-				a1 = float((Optimal6CompressDXT1[a1be][0][1] << 2) | (Optimal6CompressDXT1[a1be][0][1] >> 4));
-				a2 = float((Optimal6CompressDXT1[a2be][0][2] << 2) | (Optimal6CompressDXT1[a2be][0][1] >> 4));
-			}
+      alphaError = 0.0;
+    }
+    else {
+      assert(GetModeNumber() == 4);
+      
+      // Mode 4 can be treated like the 6 channel of DXT1 compression.
+      if(Optimal6CompressDXT1[a1be][0][0]) {
+        a1 = float((Optimal6CompressDXT1[a1be][1][1] << 2) | (Optimal6CompressDXT1[a1be][0][1] >> 4));
+        a2 = float((Optimal6CompressDXT1[a2be][1][2] << 2) | (Optimal6CompressDXT1[a2be][0][1] >> 4));
+      }
+      else {
+        a1 = float((Optimal6CompressDXT1[a1be][0][1] << 2) | (Optimal6CompressDXT1[a1be][0][1] >> 4));
+        a2 = float((Optimal6CompressDXT1[a2be][0][2] << 2) | (Optimal6CompressDXT1[a2be][0][1] >> 4));
+      }
 
-			if(m_IndexMode == 1) {
-				for(int i = 0; i < kMaxNumDataPoints; i++)
-					alphaIndices[i] = 1;
-			}
-			else {
-				for(int i = 0; i < kMaxNumDataPoints; i++)
-					alphaIndices[i] = 2;
-			}
+      if(m_IndexMode == 1) {
+        for(int i = 0; i < kMaxNumDataPoints; i++)
+          alphaIndices[i] = 1;
+      }
+      else {
+        for(int i = 0; i < kMaxNumDataPoints; i++)
+          alphaIndices[i] = 2;
+      }
 
-			uint32 interp0 = (*interpVals)[alphaIndices[0] & 0xFF][0];
-			uint32 interp1 = (*interpVals)[alphaIndices[0] & 0xFF][1];
+      uint32 interp0 = (*interpVals)[alphaIndices[0] & 0xFF][0];
+      uint32 interp1 = (*interpVals)[alphaIndices[0] & 0xFF][1];
 
-			const uint8 ip = (((uint32(a1) * interp0) + (uint32(a2) * interp1) + 32) >> 6) & 0xFF;
-			float pxError = weight * float((a1be > ip)? a1be - ip : ip - a1be);
-			pxError *= pxError;
-			alphaError = 16 * pxError;
-		}
-	}
-	else {
+      const uint8 ip = (((uint32(a1) * interp0) + (uint32(a2) * interp1) + 32) >> 6) & 0xFF;
+      float pxError = weight * float((a1be > ip)? a1be - ip : ip - a1be);
+      pxError *= pxError;
+      alphaError = 16 * pxError;
+    }
+  }
+  else {
 
-		float vals[1<<3];
-		memset(vals, 0, sizeof(vals));
+    float vals[1<<3];
+    memset(vals, 0, sizeof(vals));
 
-		int buckets[kMaxNumDataPoints];
+    int buckets[kMaxNumDataPoints];
 
-		// Figure out initial positioning.
-		for(int i = 0; i < nBuckets; i++) {
-			vals[i] = alphaMin + (float(i)/float(nBuckets-1)) * (alphaMax - alphaMin);
-		}
+    // Figure out initial positioning.
+    for(int i = 0; i < nBuckets; i++) {
+      vals[i] = alphaMin + (float(i)/float(nBuckets-1)) * (alphaMax - alphaMin);
+    }
 
-		// Assign each value to a bucket
-		for(int i = 0; i < kMaxNumDataPoints; i++) {
+    // Assign each value to a bucket
+    for(int i = 0; i < kMaxNumDataPoints; i++) {
 
-			float minDist = 255.0f;
-			for(int j = 0; j < nBuckets; j++) {
-				float dist = fabs(alphaVals[i] - vals[j]);
-				if(dist < minDist) {
-					minDist = dist;
-					buckets[i] = j;
-				}
-			}
-		}
-			
-		float npts[1 << 3];
+      float minDist = 255.0f;
+      for(int j = 0; j < nBuckets; j++) {
+        float dist = fabs(alphaVals[i] - vals[j]);
+        if(dist < minDist) {
+          minDist = dist;
+          buckets[i] = j;
+        }
+      }
+    }
+      
+    float npts[1 << 3];
 
-		// Do k-means
-		bool fixed = false;
-		while(!fixed) {
+    // Do k-means
+    bool fixed = false;
+    while(!fixed) {
 
-			memset(npts, 0, sizeof(npts));
+      memset(npts, 0, sizeof(npts));
 
-			float avg[1 << 3];
-			memset(avg, 0, sizeof(avg));
+      float avg[1 << 3];
+      memset(avg, 0, sizeof(avg));
 
-			// Calculate average of each cluster
-			for(int i = 0; i < nBuckets; i++) {
-				for(int j = 0; j < kMaxNumDataPoints; j++) {
+      // Calculate average of each cluster
+      for(int i = 0; i < nBuckets; i++) {
+        for(int j = 0; j < kMaxNumDataPoints; j++) {
 
-					if(buckets[j] == i) {
-						avg[i] += alphaVals[j];
-						npts[i] += 1.0f;
-					}
-				}
+          if(buckets[j] == i) {
+            avg[i] += alphaVals[j];
+            npts[i] += 1.0f;
+          }
+        }
 
-				if(npts[i] > 0.0f) 
-					avg[i] /= npts[i];
-			}
+        if(npts[i] > 0.0f) 
+          avg[i] /= npts[i];
+      }
 
-			// Did we change anything?
-			fixed = true;
-			for(int i = 0; i < nBuckets; i++) {
-				fixed = fixed && (avg[i] == vals[i]);
-			}
+      // Did we change anything?
+      fixed = true;
+      for(int i = 0; i < nBuckets; i++) {
+        fixed = fixed && (avg[i] == vals[i]);
+      }
 
-			// Reassign indices...
-			memcpy(vals, avg, sizeof(vals));
+      // Reassign indices...
+      memcpy(vals, avg, sizeof(vals));
 
-			// Reassign each value to a bucket
-			for(int i = 0; i < kMaxNumDataPoints; i++) {
+      // Reassign each value to a bucket
+      for(int i = 0; i < kMaxNumDataPoints; i++) {
 
-				float minDist = 255.0f;
-				for(int j = 0; j < nBuckets; j++) {
-					float dist = fabs(alphaVals[i] - vals[j]);
-					if(dist < minDist) {
-						minDist = dist;
-						buckets[i] = j;
-					}
-				}
-			}
-		}
+        float minDist = 255.0f;
+        for(int j = 0; j < nBuckets; j++) {
+          float dist = fabs(alphaVals[i] - vals[j]);
+          if(dist < minDist) {
+            minDist = dist;
+            buckets[i] = j;
+          }
+        }
+      }
+    }
 
-		// Do least squares fit of vals.
-		float asq = 0.0, bsq = 0.0, ab = 0.0;
-		float ax(0.0), bx(0.0);
-		for(int i = 0; i < nBuckets; i++) {
-			float a = float(nBuckets - 1 - i) / float(nBuckets - 1);
-			float b = float(i) / float(nBuckets - 1);
+    // Do least squares fit of vals.
+    float asq = 0.0, bsq = 0.0, ab = 0.0;
+    float ax(0.0), bx(0.0);
+    for(int i = 0; i < nBuckets; i++) {
+      float a = float(nBuckets - 1 - i) / float(nBuckets - 1);
+      float b = float(i) / float(nBuckets - 1);
 
-			float n = npts[i];
-			float x = vals[i];
+      float n = npts[i];
+      float x = vals[i];
 
-			asq += n * a * a;
-			bsq += n * b * b;
-			ab += n * a * b;
+      asq += n * a * a;
+      bsq += n * b * b;
+      ab += n * a * b;
 
-			ax += x * a * n;
-			bx += x * b * n;
-		}
+      ax += x * a * n;
+      bx += x * b * n;
+    }
 
-		float f = 1.0f / (asq * bsq - ab * ab);
-		a1 = f * (ax * bsq - bx * ab);
-		a2 = f * (bx * asq - ax * ab);
+    float f = 1.0f / (asq * bsq - ab * ab);
+    a1 = f * (ax * bsq - bx * ab);
+    a2 = f * (bx * asq - ax * ab);
 
-		// Clamp
-		a1 = min(255.0f, max(0.0f, a1));
-		a2 = min(255.0f, max(0.0f, a2));
+    // Clamp
+    a1 = min(255.0f, max(0.0f, a1));
+    a2 = min(255.0f, max(0.0f, a2));
 
-		// Quantize
-		const uint8 a1b = ::QuantizeChannel(uint8(a1), (((char)0x80) >> (GetAlphaChannelPrecision() - 1)));
-		const uint8 a2b = ::QuantizeChannel(uint8(a2), (((char)0x80) >> (GetAlphaChannelPrecision() - 1)));
+    // Quantize
+    const uint8 a1b = ::QuantizeChannel(uint8(a1), (((char)0x80) >> (GetAlphaChannelPrecision() - 1)));
+    const uint8 a2b = ::QuantizeChannel(uint8(a2), (((char)0x80) >> (GetAlphaChannelPrecision() - 1)));
 
-		// Compute error
-		for(int i = 0; i < kMaxNumDataPoints; i++) {
+    // Compute error
+    for(int i = 0; i < kMaxNumDataPoints; i++) {
 
-			uint8 val = uint8(alphaVals[i]);
+      uint8 val = uint8(alphaVals[i]);
 
-			float minError = FLT_MAX;
-			int bestBucket = -1;
+      float minError = FLT_MAX;
+      int bestBucket = -1;
 
-			for(int j = 0; j < nBuckets; j++) {
-				uint32 interp0 = (*interpVals)[j][0];
-				uint32 interp1 = (*interpVals)[j][1];
+      for(int j = 0; j < nBuckets; j++) {
+        uint32 interp0 = (*interpVals)[j][0];
+        uint32 interp1 = (*interpVals)[j][1];
 
-				const uint8 ip = (((uint32(a1b) * interp0) + (uint32(a2b) * interp1) + 32) >> 6) & 0xFF;
-				float pxError = weight * float((val > ip)? val - ip : ip - val);
-				pxError *= pxError;
+        const uint8 ip = (((uint32(a1b) * interp0) + (uint32(a2b) * interp1) + 32) >> 6) & 0xFF;
+        float pxError = weight * float((val > ip)? val - ip : ip - val);
+        pxError *= pxError;
 
-				if(pxError < minError) {
-					minError = pxError;
-					bestBucket = j;
-				}
-			}
+        if(pxError < minError) {
+          minError = pxError;
+          bestBucket = j;
+        }
+      }
 
-			alphaError += minError;
-			alphaIndices[i] = bestBucket;
-		}
-	}
+      alphaError += minError;
+      alphaIndices[i] = bestBucket;
+    }
+  }
 
-	for(int i = 0; i < kNumColorChannels; i++) {
-		p1.c[i] = (i == (kNumColorChannels-1))? a1 : rgbp1.c[i];
-		p2.c[i] = (i == (kNumColorChannels-1))? a2 : rgbp2.c[i];
-	}
+  for(int i = 0; i < kNumColorChannels; i++) {
+    p1.c[i] = (i == (kNumColorChannels-1))? a1 : rgbp1.c[i];
+    p2.c[i] = (i == (kNumColorChannels-1))? a2 : rgbp2.c[i];
+  }
 
-	return rgbError + alphaError;
+  return rgbError + alphaError;
 }
 
 double BC7CompressionMode::CompressCluster(const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, int *bestIndices, int &bestPbitCombo) const {
-		
-	// If all the points are the same in the cluster, then we need to figure out what the best
-	// approximation to this point is....
-	if(cluster.AllSamePoint()) {
-		const RGBAVector &p = cluster.GetPoint(0);
-		double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo);
+    
+  // If all the points are the same in the cluster, then we need to figure out what the best
+  // approximation to this point is....
+  if(cluster.AllSamePoint()) {
+    const RGBAVector &p = cluster.GetPoint(0);
+    double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo);
 
-		// We're assuming all indices will be index 1...
-		for(int i = 0; i < cluster.GetNumPoints(); i++) {
-			bestIndices[i] = 1;
-		}
-		
-		return bestErr;
-	}
-	
-	const int nBuckets = (1 << GetNumberOfBitsPerIndex());
-	const int nPbitCombos = GetNumPbitCombos();
-	const uint32 qmask = GetQuantizationMask();
+    // We're assuming all indices will be index 1...
+    for(int i = 0; i < cluster.GetNumPoints(); i++) {
+      bestIndices[i] = 1;
+    }
+    
+    return bestErr;
+  }
+  
+  const int nBuckets = (1 << GetNumberOfBitsPerIndex());
+  const int nPbitCombos = GetNumPbitCombos();
+  const uint32 qmask = GetQuantizationMask();
 
 #if 1
-	RGBAVector avg = cluster.GetTotal() / float(cluster.GetNumPoints());
-	RGBADir axis;
-	::GetPrincipalAxis(cluster.GetNumPoints(), cluster.GetPoints(), axis);
+  RGBAVector avg = cluster.GetTotal() / float(cluster.GetNumPoints());
+  RGBADir axis;
+  ::GetPrincipalAxis(cluster.GetNumPoints(), cluster.GetPoints(), axis);
 
-	float mindp = FLT_MAX, maxdp = -FLT_MAX;
-	for(int i = 0 ; i < cluster.GetNumPoints(); i++) {
-		float dp = (cluster.GetPoint(i) - avg) * axis;
-		if(dp < mindp) mindp = dp;
-		if(dp > maxdp) maxdp = dp;
-	}
-	
-	p1 = avg + mindp * axis;
-	p2 = avg + maxdp * axis;
+  float mindp = FLT_MAX, maxdp = -FLT_MAX;
+  for(int i = 0 ; i < cluster.GetNumPoints(); i++) {
+    float dp = (cluster.GetPoint(i) - avg) * axis;
+    if(dp < mindp) mindp = dp;
+    if(dp > maxdp) maxdp = dp;
+  }
+  
+  p1 = avg + mindp * axis;
+  p2 = avg + maxdp * axis;
 #else
-	cluster.GetBoundingBox(p1, p2);
+  cluster.GetBoundingBox(p1, p2);
 #endif
 
-	ClampEndpoints(p1, p2);
+  ClampEndpoints(p1, p2);
 
-	RGBAVector pts[1 << 4]; // At most 4 bits per index.
-	int numPts[1<<4];
-	assert(nBuckets <= 1 << 4);
+  RGBAVector pts[1 << 4]; // At most 4 bits per index.
+  int numPts[1<<4];
+  assert(nBuckets <= 1 << 4);
 
-	for(int i = 0; i < nBuckets; i++) {
-		float s = (float(i) / float(nBuckets - 1));
-		pts[i] = (1.0f - s) * p1 + s * p2;
-	}
+  for(int i = 0; i < nBuckets; i++) {
+    float s = (float(i) / float(nBuckets - 1));
+    pts[i] = (1.0f - s) * p1 + s * p2;
+  }
 
-	assert(pts[0] == p1);
-	assert(pts[nBuckets - 1] == p2);
+  assert(pts[0] == p1);
+  assert(pts[nBuckets - 1] == p2);
 
-	// Do k-means clustering...
-	int bucketIdx[kMaxNumDataPoints];
+  // Do k-means clustering...
+  int bucketIdx[kMaxNumDataPoints];
 
-	bool fixed = false;
-	while(!fixed) {
-		
-		RGBAVector newPts[1 << 4];
+  bool fixed = false;
+  while(!fixed) {
+    
+    RGBAVector newPts[1 << 4];
 
-		// Assign each of the existing points to one of the buckets...
-		for(int i = 0; i < cluster.GetNumPoints(); i++) {
+    // Assign each of the existing points to one of the buckets...
+    for(int i = 0; i < cluster.GetNumPoints(); i++) {
 
-			int minBucket = -1;
-			float minDist = FLT_MAX;
-			for(int j = 0; j < nBuckets; j++) {
-				RGBAVector v = cluster.GetPoint(i) - pts[j];
-				float distSq = v * v;
-				if(distSq < minDist)
-				{
-					minDist = distSq;
-					minBucket = j;
-				}
-			}
+      int minBucket = -1;
+      float minDist = FLT_MAX;
+      for(int j = 0; j < nBuckets; j++) {
+        RGBAVector v = cluster.GetPoint(i) - pts[j];
+        float distSq = v * v;
+        if(distSq < minDist)
+        {
+          minDist = distSq;
+          minBucket = j;
+        }
+      }
 
-			assert(minBucket >= 0);
-			bucketIdx[i] = minBucket;
-		}
+      assert(minBucket >= 0);
+      bucketIdx[i] = minBucket;
+    }
 
-		// Calculate new buckets based on centroids of clusters...
-		for(int i = 0; i < nBuckets; i++) {
-			
-			numPts[i] = 0;
-			newPts[i] = RGBAVector(0.0f);
-			for(int j = 0; j < cluster.GetNumPoints(); j++) {
-				if(bucketIdx[j] == i) {
-					numPts[i]++;
-					newPts[i] += cluster.GetPoint(j);
-				}
-			}
+    // Calculate new buckets based on centroids of clusters...
+    for(int i = 0; i < nBuckets; i++) {
+      
+      numPts[i] = 0;
+      newPts[i] = RGBAVector(0.0f);
+      for(int j = 0; j < cluster.GetNumPoints(); j++) {
+        if(bucketIdx[j] == i) {
+          numPts[i]++;
+          newPts[i] += cluster.GetPoint(j);
+        }
+      }
 
-			// If there are no points in this cluster, then it should
-			// remain the same as last time and avoid a divide by zero.
-			if(0 != numPts[i])
-				newPts[i] /= float(numPts[i]);
-		}
+      // A bucket with no points keeps its previous centroid (and avoids a
+      // divide by zero) instead of collapsing to the zero vector.
+      if(0 != numPts[i]) newPts[i] /= float(numPts[i]);
+      else newPts[i] = pts[i];
+    }
 
-		// If we haven't changed, then we're done.
-		fixed = true;
-		for(int i = 0; i < nBuckets; i++) {
-			if(pts[i] != newPts[i])
-				fixed = false;
-		}
+    // If we haven't changed, then we're done.
+    fixed = true;
+    for(int i = 0; i < nBuckets; i++) {
+      if(pts[i] != newPts[i])
+        fixed = false;
+    }
 
-		// Assign the new points to be the old points.
-		for(int i = 0; i < nBuckets; i++) {
-			pts[i] = newPts[i];
-		}
-	}
+    // Assign the new points to be the old points.
+    for(int i = 0; i < nBuckets; i++) {
+      pts[i] = newPts[i];
+    }
+  }
 
-	// If there's only one bucket filled, then just compress for that single color...
-	int numBucketsFilled = 0, lastFilledBucket = -1;
-	for(int i = 0; i < nBuckets; i++) {
-		if(numPts[i] > 0) {
-			numBucketsFilled++;
-			lastFilledBucket = i;
-		}
-	}
+  // If there's only one bucket filled, then just compress for that single color...
+  int numBucketsFilled = 0, lastFilledBucket = -1;
+  for(int i = 0; i < nBuckets; i++) {
+    if(numPts[i] > 0) {
+      numBucketsFilled++;
+      lastFilledBucket = i;
+    }
+  }
 
-	assert(numBucketsFilled > 0);
-	if(1 == numBucketsFilled) {
-		const RGBAVector &p = pts[lastFilledBucket];
-		double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo);
+  assert(numBucketsFilled > 0);
+  if(1 == numBucketsFilled) {
+    const RGBAVector &p = pts[lastFilledBucket];
+    double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo);
 
-		// We're assuming all indices will be index 1...
-		for(int i = 0; i < cluster.GetNumPoints(); i++) {
-			bestIndices[i] = 1;
-		}
-		  
-		return bestErr;
-	}
+    // We're assuming all indices will be index 1...
+    for(int i = 0; i < cluster.GetNumPoints(); i++) {
+      bestIndices[i] = 1;
+    }
+      
+    return bestErr;
+  }
 
-	// Now that we know the index of each pixel, we can assign the endpoints based on a least squares fit
-	// of the clusters. For more information, take a look at this article by NVidia:
-	// http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/dxtc/doc/cuda_dxtc.pdf
-	float asq = 0.0, bsq = 0.0, ab = 0.0;
-	RGBAVector ax(0.0), bx(0.0);
-	for(int i = 0; i < nBuckets; i++) {
-		float a = float(nBuckets - 1 - i) / float(nBuckets - 1);
-		float b = float(i) / float(nBuckets - 1);
+  // Now that we know the index of each pixel, we can assign the endpoints based on a least squares fit
+  // of the clusters. For more information, take a look at this article by NVidia:
+  // http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/dxtc/doc/cuda_dxtc.pdf
+  float asq = 0.0, bsq = 0.0, ab = 0.0;
+  RGBAVector ax(0.0), bx(0.0);
+  for(int i = 0; i < nBuckets; i++) {
+    float a = float(nBuckets - 1 - i) / float(nBuckets - 1);
+    float b = float(i) / float(nBuckets - 1);
 
-		int n = numPts[i];
-		RGBAVector x = pts[i];
+    int n = numPts[i];
+    RGBAVector x = pts[i];
 
-		asq += float(n) * a * a;
-		bsq += float(n) * b * b;
-		ab += float(n) * a * b;
+    asq += float(n) * a * a;
+    bsq += float(n) * b * b;
+    ab += float(n) * a * b;
 
-		ax += x * a * float(n);
-		bx += x * b * float(n);
-	}
+    ax += x * a * float(n);
+    bx += x * b * float(n);
+  }
 
-	float f = 1.0f / (asq * bsq - ab * ab);
-	p1 = f * (ax * bsq - bx * ab);
-	p2 = f * (bx * asq - ax * ab);
+  float f = 1.0f / (asq * bsq - ab * ab);
+  p1 = f * (ax * bsq - bx * ab);
+  p2 = f * (bx * asq - ax * ab);
 
-	ClampEndpointsToGrid(p1, p2, bestPbitCombo);
+  ClampEndpointsToGrid(p1, p2, bestPbitCombo);
 
-	#ifdef _DEBUG
-		int pBitCombo = bestPbitCombo;
-		RGBAVector tp1 = p1, tp2 = p2;
-		ClampEndpointsToGrid(tp1, tp2, pBitCombo);
+  #ifdef _DEBUG
+    int pBitCombo = bestPbitCombo;
+    RGBAVector tp1 = p1, tp2 = p2;
+    ClampEndpointsToGrid(tp1, tp2, pBitCombo);
 
-		assert(p1 == tp1);
-		assert(p2 == tp2);
-		assert(pBitCombo == bestPbitCombo);
-	#endif
+    assert(p1 == tp1);
+    assert(p2 == tp2);
+    assert(pBitCombo == bestPbitCombo);
+  #endif
 
-	assert(bestPbitCombo >= 0);
+  assert(bestPbitCombo >= 0);
 
-	return OptimizeEndpointsForCluster(cluster, p1, p2, bestIndices, bestPbitCombo);
+  return OptimizeEndpointsForCluster(cluster, p1, p2, bestIndices, bestPbitCombo);
 }
 
 double BC7CompressionMode::Compress(BitStream &stream, const int shapeIdx, const RGBACluster *clusters) {
 
-	const int kModeNumber = GetModeNumber();
-	const int nPartitionBits = GetNumberOfPartitionBits();
-	const int nSubsets = GetNumberOfSubsets();
+  const int kModeNumber = GetModeNumber();
+  const int nPartitionBits = GetNumberOfPartitionBits();
+  const int nSubsets = GetNumberOfSubsets();
 
-	// Mode #
-	stream.WriteBits(1 << kModeNumber, kModeNumber + 1);
+  // Mode #
+  stream.WriteBits(1 << kModeNumber, kModeNumber + 1);
 
-	// Partition #
-	assert((((1 << nPartitionBits) - 1) & shapeIdx) == shapeIdx);
-	stream.WriteBits(shapeIdx, nPartitionBits);
-		
-	RGBAVector p1[kMaxNumSubsets], p2[kMaxNumSubsets];
-	int bestIndices[kMaxNumSubsets][kMaxNumDataPoints] = {
-		{ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-		{ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-		{ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }
-	};
-	int bestAlphaIndices[kMaxNumDataPoints] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
-	int bestPbitCombo[kMaxNumSubsets] = { -1, -1, -1 };
-	int bestRotationMode = -1, bestIndexMode = -1;
+  // Partition #
+  assert((((1 << nPartitionBits) - 1) & shapeIdx) == shapeIdx);
+  stream.WriteBits(shapeIdx, nPartitionBits);
+    
+  RGBAVector p1[kMaxNumSubsets], p2[kMaxNumSubsets];
+  int bestIndices[kMaxNumSubsets][kMaxNumDataPoints] = {
+    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
+    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
+    { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }
+  };
+  int bestAlphaIndices[kMaxNumDataPoints] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+  int bestPbitCombo[kMaxNumSubsets] = { -1, -1, -1 };
+  int bestRotationMode = -1, bestIndexMode = -1;
 
-	double totalErr = 0.0;
-	for(int cidx = 0; cidx < nSubsets; cidx++) {
-		int indices[kMaxNumDataPoints];
+  double totalErr = 0.0;
+  for(int cidx = 0; cidx < nSubsets; cidx++) {
+    int indices[kMaxNumDataPoints];
 
-		if(m_Attributes->hasRotation) {
+    if(m_Attributes->hasRotation) {
 
-			assert(nSubsets == 1);
+      assert(nSubsets == 1);
 
-			int alphaIndices[kMaxNumDataPoints];
+      int alphaIndices[kMaxNumDataPoints];
 
-			double bestError = DBL_MAX;
-			for(int rotMode = 0; rotMode < 4; rotMode++) {
+      double bestError = DBL_MAX;
+      for(int rotMode = 0; rotMode < 4; rotMode++) {
 
-				SetRotationMode(rotMode);
-				const int nIdxModes = kModeNumber == 4? 2 : 1;
+        SetRotationMode(rotMode);
+        const int nIdxModes = kModeNumber == 4? 2 : 1;
 
-				for(int idxMode = 0; idxMode < nIdxModes; idxMode++) {
+        for(int idxMode = 0; idxMode < nIdxModes; idxMode++) {
 
-					SetIndexMode(idxMode);
+          SetIndexMode(idxMode);
 
-					RGBAVector v1, v2;
-					double error = CompressCluster(clusters[cidx], v1, v2, indices, alphaIndices);
-					if(error < bestError) {
-						bestError = error;
+          RGBAVector v1, v2;
+          double error = CompressCluster(clusters[cidx], v1, v2, indices, alphaIndices);
+          if(error < bestError) {
+            bestError = error;
 
-						memcpy(bestIndices[cidx], indices, sizeof(indices));
-						memcpy(bestAlphaIndices, alphaIndices, sizeof(alphaIndices));
+            memcpy(bestIndices[cidx], indices, sizeof(indices));
+            memcpy(bestAlphaIndices, alphaIndices, sizeof(alphaIndices));
 
-						bestRotationMode = rotMode;
-						bestIndexMode = idxMode;
+            bestRotationMode = rotMode;
+            bestIndexMode = idxMode;
 
-						p1[cidx] = v1;
-						p2[cidx] = v2;
-					}
-				}
-			}
+            p1[cidx] = v1;
+            p2[cidx] = v2;
+          }
+        }
+      }
 
-			totalErr += bestError;
-		}
-		else {
-			// Compress this cluster
-			totalErr += CompressCluster(clusters[cidx], p1[cidx], p2[cidx], indices, bestPbitCombo[cidx]);
+      totalErr += bestError;
+    }
+    else {
+      // Compress this cluster
+      totalErr += CompressCluster(clusters[cidx], p1[cidx], p2[cidx], indices, bestPbitCombo[cidx]);
 
-			// Map the indices to their proper position.
-			int idx = 0;
-			for(int i = 0; i < 16; i++) {
-				int subs = GetSubsetForIndex(i, shapeIdx, GetNumberOfSubsets());
-				if(subs == cidx) {
-					bestIndices[cidx][i] = indices[idx++];
-				}
-			}
-		}
-	}
+      // Map the indices to their proper position.
+      int idx = 0;
+      for(int i = 0; i < 16; i++) {
+        int subs = GetSubsetForIndex(i, shapeIdx, GetNumberOfSubsets());
+        if(subs == cidx) {
+          bestIndices[cidx][i] = indices[idx++];
+        }
+      }
+    }
+  }
 
-	stream.WriteBits(bestRotationMode, m_Attributes->hasRotation? 2 : 0);
-	stream.WriteBits(bestIndexMode, m_Attributes->hasIdxMode? 1 : 0);
+  stream.WriteBits(bestRotationMode, m_Attributes->hasRotation? 2 : 0);
+  stream.WriteBits(bestIndexMode, m_Attributes->hasIdxMode? 1 : 0);
 
 #ifdef _DEBUG
-	for(int i = 0; i < kMaxNumDataPoints; i++) {
+  for(int i = 0; i < kMaxNumDataPoints; i++) {
 
-		int nSet = 0;
-		for(int j = 0; j < nSubsets; j++) {
-			if(bestIndices[j][i] >= 0)
-				nSet++;
-		}
+    int nSet = 0;
+    for(int j = 0; j < nSubsets; j++) {
+      if(bestIndices[j][i] >= 0)
+        nSet++;
+    }
 
-		assert(nSet == 1);
-	}
+    assert(nSet == 1);
+  }
 #endif
 
-	// Get the quantization mask
-	const uint32 qmask = GetQuantizationMask();
+  // Get the quantization mask
+  const uint32 qmask = GetQuantizationMask();
 
-	//Quantize the points...
-	uint32 pixel1[kMaxNumSubsets], pixel2[kMaxNumSubsets];
-	for(int i = 0; i < nSubsets; i++) {
-		switch(GetPBitType()) {
-			default:
-			case ePBitType_None: 
-				pixel1[i] = p1[i].ToPixel(qmask); 
-				pixel2[i] = p2[i].ToPixel(qmask); 
-			break;
+  // Quantize the points...
+  uint32 pixel1[kMaxNumSubsets], pixel2[kMaxNumSubsets];
+  for(int i = 0; i < nSubsets; i++) {
+    switch(GetPBitType()) {
+      default:
+      case ePBitType_None: 
+        pixel1[i] = p1[i].ToPixel(qmask); 
+        pixel2[i] = p2[i].ToPixel(qmask); 
+      break;
 
-			case ePBitType_Shared: 
-			case ePBitType_NotShared: 
-				pixel1[i] = p1[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[0]); 
-				pixel2[i] = p2[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[1]); 
-			break;
-		}
-	}
+      case ePBitType_Shared: 
+      case ePBitType_NotShared: 
+        pixel1[i] = p1[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[0]); 
+        pixel2[i] = p2[i].ToPixel(qmask, GetPBitCombo(bestPbitCombo[i])[1]); 
+      break;
+    }
+  }
 
-	// If the anchor index does not have 0 in the leading bit, then 
-	// we need to swap EVERYTHING.
-	for(int sidx = 0; sidx < nSubsets; sidx++) {
+  // If the anchor index does not have 0 in the leading bit, then 
+  // we need to swap EVERYTHING.
+  for(int sidx = 0; sidx < nSubsets; sidx++) {
 
-		int anchorIdx = GetAnchorIndexForSubset(sidx, shapeIdx, nSubsets);
-		assert(bestIndices[sidx][anchorIdx] != -1);
+    int anchorIdx = GetAnchorIndexForSubset(sidx, shapeIdx, nSubsets);
+    assert(bestIndices[sidx][anchorIdx] != -1);
 
-		const int nAlphaIndexBits = GetNumberOfBitsPerAlpha(bestIndexMode);
-		const int nIndexBits = GetNumberOfBitsPerIndex(bestIndexMode);
-		if(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1)) {
-			uint32 t = pixel1[sidx]; pixel1[sidx] = pixel2[sidx]; pixel2[sidx] = t;
+    const int nAlphaIndexBits = GetNumberOfBitsPerAlpha(bestIndexMode);
+    const int nIndexBits = GetNumberOfBitsPerIndex(bestIndexMode);
+    if(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1)) {
+      uint32 t = pixel1[sidx]; pixel1[sidx] = pixel2[sidx]; pixel2[sidx] = t;
 
-			int nIndexVals = 1 << nIndexBits;
-			for(int i = 0; i < 16; i++) {
-				bestIndices[sidx][i] = (nIndexVals - 1) - bestIndices[sidx][i];
-			}
+      int nIndexVals = 1 << nIndexBits;
+      for(int i = 0; i < 16; i++) {
+        bestIndices[sidx][i] = (nIndexVals - 1) - bestIndices[sidx][i];
+      }
 
-			int nAlphaIndexVals = 1 << nAlphaIndexBits;
-			if(m_Attributes->hasRotation) {
-				for(int i = 0; i < 16; i++) {
-					bestAlphaIndices[i] = (nAlphaIndexVals - 1) - bestAlphaIndices[i];
-				}
-			}
-		}
+      int nAlphaIndexVals = 1 << nAlphaIndexBits;
+      if(m_Attributes->hasRotation) {
+        for(int i = 0; i < 16; i++) {
+          bestAlphaIndices[i] = (nAlphaIndexVals - 1) - bestAlphaIndices[i];
+        }
+      }
+    }
 
-		if(m_Attributes->hasRotation && bestAlphaIndices[anchorIdx] >> (nAlphaIndexBits - 1)) {
-			uint8 * bp1 = (uint8 *)(&pixel1[sidx]); 
-			uint8 * bp2 = (uint8 *)(&pixel2[sidx]); 
-			uint8 t = bp1[3]; bp1[3] = bp2[3]; bp2[3] = t;
+    if(m_Attributes->hasRotation && bestAlphaIndices[anchorIdx] >> (nAlphaIndexBits - 1)) {
+      uint8 * bp1 = (uint8 *)(&pixel1[sidx]); 
+      uint8 * bp2 = (uint8 *)(&pixel2[sidx]); 
+      uint8 t = bp1[3]; bp1[3] = bp2[3]; bp2[3] = t;
 
-			int nAlphaIndexVals = 1 << nAlphaIndexBits;
-			for(int i = 0; i < 16; i++) {
-				bestAlphaIndices[i] = (nAlphaIndexVals - 1) - bestAlphaIndices[i];
-			}
-		}
+      int nAlphaIndexVals = 1 << nAlphaIndexBits;
+      for(int i = 0; i < 16; i++) {
+        bestAlphaIndices[i] = (nAlphaIndexVals - 1) - bestAlphaIndices[i];
+      }
+    }
 
-		assert(!(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1)));
-		assert(!m_Attributes->hasRotation || !(bestAlphaIndices[anchorIdx] >> (nAlphaIndexBits - 1)));
-	}
+    assert(!(bestIndices[sidx][anchorIdx] >> (nIndexBits - 1)));
+    assert(!m_Attributes->hasRotation || !(bestAlphaIndices[anchorIdx] >> (nAlphaIndexBits - 1)));
+  }
 
-	// Get the quantized values...
-	uint8 r1[kMaxNumSubsets], g1[kMaxNumSubsets], b1[kMaxNumSubsets], a1[kMaxNumSubsets];
-	uint8 r2[kMaxNumSubsets], g2[kMaxNumSubsets], b2[kMaxNumSubsets], a2[kMaxNumSubsets];
-	for(int i = 0; i < nSubsets; i++) {
-		r1[i] = pixel1[i] & 0xFF;
-		r2[i] = pixel2[i] & 0xFF;
+  // Get the quantized values...
+  uint8 r1[kMaxNumSubsets], g1[kMaxNumSubsets], b1[kMaxNumSubsets], a1[kMaxNumSubsets];
+  uint8 r2[kMaxNumSubsets], g2[kMaxNumSubsets], b2[kMaxNumSubsets], a2[kMaxNumSubsets];
+  for(int i = 0; i < nSubsets; i++) {
+    r1[i] = pixel1[i] & 0xFF;
+    r2[i] = pixel2[i] & 0xFF;
 
-		g1[i] = (pixel1[i] >> 8) & 0xFF;
-		g2[i] = (pixel2[i] >> 8) & 0xFF;
+    g1[i] = (pixel1[i] >> 8) & 0xFF;
+    g2[i] = (pixel2[i] >> 8) & 0xFF;
 
-		b1[i] = (pixel1[i] >> 16) & 0xFF;
-		b2[i] = (pixel2[i] >> 16) & 0xFF;
+    b1[i] = (pixel1[i] >> 16) & 0xFF;
+    b2[i] = (pixel2[i] >> 16) & 0xFF;
 
-		a1[i] = (pixel1[i] >> 24) & 0xFF;
-		a2[i] = (pixel2[i] >> 24) & 0xFF;
-	}
+    a1[i] = (pixel1[i] >> 24) & 0xFF;
+    a2[i] = (pixel2[i] >> 24) & 0xFF;
+  }
 
-	// Write them out...
-	const int nRedBits = m_Attributes->colorChannelPrecision;
-	for(int i = 0; i < nSubsets; i++) {
-		stream.WriteBits(r1[i] >> (8 - nRedBits), nRedBits);
-		stream.WriteBits(r2[i] >> (8 - nRedBits), nRedBits);
-	}
+  // Write them out...
+  const int nRedBits = m_Attributes->colorChannelPrecision;
+  for(int i = 0; i < nSubsets; i++) {
+    stream.WriteBits(r1[i] >> (8 - nRedBits), nRedBits);
+    stream.WriteBits(r2[i] >> (8 - nRedBits), nRedBits);
+  }
 
-	const int nGreenBits = m_Attributes->colorChannelPrecision;
-	for(int i = 0; i < nSubsets; i++) {
-		stream.WriteBits(g1[i] >> (8 - nGreenBits), nGreenBits);
-		stream.WriteBits(g2[i] >> (8 - nGreenBits), nGreenBits);
-	}
+  const int nGreenBits = m_Attributes->colorChannelPrecision;
+  for(int i = 0; i < nSubsets; i++) {
+    stream.WriteBits(g1[i] >> (8 - nGreenBits), nGreenBits);
+    stream.WriteBits(g2[i] >> (8 - nGreenBits), nGreenBits);
+  }
 
-	const int nBlueBits = m_Attributes->colorChannelPrecision;
-	for(int i = 0; i < nSubsets; i++) {
-		stream.WriteBits(b1[i] >> (8 - nBlueBits), nBlueBits);
-		stream.WriteBits(b2[i] >> (8 - nBlueBits), nBlueBits);
-	}
+  const int nBlueBits = m_Attributes->colorChannelPrecision;
+  for(int i = 0; i < nSubsets; i++) {
+    stream.WriteBits(b1[i] >> (8 - nBlueBits), nBlueBits);
+    stream.WriteBits(b2[i] >> (8 - nBlueBits), nBlueBits);
+  }
 
-	const int nAlphaBits = m_Attributes->alphaChannelPrecision;
-	for(int i = 0; i < nSubsets; i++) {
-		stream.WriteBits(a1[i] >> (8 - nAlphaBits), nAlphaBits);
-		stream.WriteBits(a2[i] >> (8 - nAlphaBits), nAlphaBits);
-	}
+  const int nAlphaBits = m_Attributes->alphaChannelPrecision;
+  for(int i = 0; i < nSubsets; i++) {
+    stream.WriteBits(a1[i] >> (8 - nAlphaBits), nAlphaBits);
+    stream.WriteBits(a2[i] >> (8 - nAlphaBits), nAlphaBits);
+  }
 
-	// Write out the best pbits..
-	if(GetPBitType() != ePBitType_None) {
-		for(int s = 0; s < nSubsets; s++) {
-			const int *pbits = GetPBitCombo(bestPbitCombo[s]);
-			stream.WriteBits(pbits[0], 1);
-			if(GetPBitType() != ePBitType_Shared)
-				stream.WriteBits(pbits[1], 1);
-		}
-	}
+  // Write out the best pbits.
+  if(GetPBitType() != ePBitType_None) {
+    for(int s = 0; s < nSubsets; s++) {
+      const int *pbits = GetPBitCombo(bestPbitCombo[s]);
+      stream.WriteBits(pbits[0], 1);
+      if(GetPBitType() != ePBitType_Shared)
+        stream.WriteBits(pbits[1], 1);
+    }
+  }
 
-	// If our index mode has changed, then we need to write the alpha indices first.
-	if(m_Attributes->hasIdxMode && bestIndexMode == 1) {
+  // If our index mode has changed, then we need to write the alpha indices first.
+  if(m_Attributes->hasIdxMode && bestIndexMode == 1) {
 
-		assert(m_Attributes->hasRotation);
+    assert(m_Attributes->hasRotation);
 
-		for(int i = 0; i < 16; i++) {
-			const int idx = bestAlphaIndices[i];
-			assert(GetAnchorIndexForSubset(0, shapeIdx, nSubsets) == 0);
-			assert(GetNumberOfBitsPerAlpha(bestIndexMode) == 2);
-			assert(idx >= 0 && idx < (1 << 2));
-			assert(i != 0 || !(idx >> 1) || !"Leading bit of anchor index is not zero!");
-			stream.WriteBits(idx, (i == 0)? 1 : 2);
-		}
+    for(int i = 0; i < 16; i++) {
+      const int idx = bestAlphaIndices[i];
+      assert(GetAnchorIndexForSubset(0, shapeIdx, nSubsets) == 0);
+      assert(GetNumberOfBitsPerAlpha(bestIndexMode) == 2);
+      assert(idx >= 0 && idx < (1 << 2));
+      assert(i != 0 || !(idx >> 1) || !"Leading bit of anchor index is not zero!");
+      stream.WriteBits(idx, (i == 0)? 1 : 2);
+    }
 
-		for(int i = 0; i < 16; i++) {
-			const int idx = bestIndices[0][i];
-			assert(GetSubsetForIndex(i, shapeIdx, nSubsets) == 0);
-			assert(GetAnchorIndexForSubset(0, shapeIdx, nSubsets) == 0);
-			assert(GetNumberOfBitsPerIndex(bestIndexMode) == 3);
-			assert(idx >= 0 && idx < (1 << 3));
-			assert(i != 0 || !(idx >> 2) || !"Leading bit of anchor index is not zero!");
-			stream.WriteBits(idx, (i == 0)? 2 : 3);
-		}
-	}
-	else {
-		for(int i = 0; i < 16; i++) {
-			const int subs = GetSubsetForIndex(i, shapeIdx, nSubsets);
-			const int idx = bestIndices[subs][i];
-			const int anchorIdx = GetAnchorIndexForSubset(subs, shapeIdx, nSubsets);
-			const int nBitsForIdx = GetNumberOfBitsPerIndex(bestIndexMode);
-			assert(idx >= 0 && idx < (1 << nBitsForIdx));
-			assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!");
-			stream.WriteBits(idx, (i == anchorIdx)? nBitsForIdx - 1 : nBitsForIdx);
-		}
+    for(int i = 0; i < 16; i++) {
+      const int idx = bestIndices[0][i];
+      assert(GetSubsetForIndex(i, shapeIdx, nSubsets) == 0);
+      assert(GetAnchorIndexForSubset(0, shapeIdx, nSubsets) == 0);
+      assert(GetNumberOfBitsPerIndex(bestIndexMode) == 3);
+      assert(idx >= 0 && idx < (1 << 3));
+      assert(i != 0 || !(idx >> 2) || !"Leading bit of anchor index is not zero!");
+      stream.WriteBits(idx, (i == 0)? 2 : 3);
+    }
+  }
+  else {
+    for(int i = 0; i < 16; i++) {
+      const int subs = GetSubsetForIndex(i, shapeIdx, nSubsets);
+      const int idx = bestIndices[subs][i];
+      const int anchorIdx = GetAnchorIndexForSubset(subs, shapeIdx, nSubsets);
+      const int nBitsForIdx = GetNumberOfBitsPerIndex(bestIndexMode);
+      assert(idx >= 0 && idx < (1 << nBitsForIdx));
+      assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!");
+      stream.WriteBits(idx, (i == anchorIdx)? nBitsForIdx - 1 : nBitsForIdx);
+    }
 
-		if(m_Attributes->hasRotation) {
-			for(int i = 0; i < 16; i++) {
-				const int idx = bestAlphaIndices[i];
-				const int anchorIdx = 0;
-				const int nBitsForIdx = GetNumberOfBitsPerAlpha(bestIndexMode);
-				assert(idx >= 0 && idx < (1 << nBitsForIdx));
-				assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!");
-				stream.WriteBits(idx, (i == anchorIdx)? nBitsForIdx - 1 : nBitsForIdx);
-			}
-		}
-	}
-	assert(stream.GetBitsWritten() == 128);
-	return totalErr;
+    if(m_Attributes->hasRotation) {
+      for(int i = 0; i < 16; i++) {
+        const int idx = bestAlphaIndices[i];
+        const int anchorIdx = 0;
+        const int nBitsForIdx = GetNumberOfBitsPerAlpha(bestIndexMode);
+        assert(idx >= 0 && idx < (1 << nBitsForIdx));
+        assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!");
+        stream.WriteBits(idx, (i == anchorIdx)? nBitsForIdx - 1 : nBitsForIdx);
+      }
+    }
+  }
+  assert(stream.GetBitsWritten() == 128);
+  return totalErr;
 }
 
 namespace BC7C
 {
-	static ErrorMetric gErrorMetric = eErrorMetric_Uniform;
-	void SetErrorMetric(ErrorMetric e) { gErrorMetric = e; }
-
-	ALIGN_SSE const float kErrorMetrics[kNumErrorMetrics][kNumColorChannels] = {
-		{ 1.0f, 1.0f, 1.0f, 1.0f },
-		{ sqrtf(0.3f), sqrtf(0.56f), sqrtf(0.11f), 1.0f }
-	};
-
-	const float *GetErrorMetric() { return kErrorMetrics[GetErrorMetricEnum()]; }
-	ErrorMetric GetErrorMetricEnum() { return gErrorMetric; }
-
-	// Function prototypes
-	static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock);
-	static void CompressBC7Block(const uint32 *block, uint8 *outBuf);
-
-	static int gQualityLevel = 50;
-	void SetQualityLevel(int q) {
-		gQualityLevel = max(0, q);
-	}
-	int GetQualityLevel() { return gQualityLevel; }
-
-	// Returns true if the entire block is a single color.
-	static bool AllOneColor(const uint32 block[16]) {
-		const uint32 pixel = block[0];
-		for(int i = 1; i < 16; i++) {
-			if( block[i] != pixel )
-				return false;
-		}
-
-		return true;
-	}
-
-	// Write out a transparent block.
-	static void WriteTransparentBlock(BitStream &stream) {
-		// Use mode 6
-		stream.WriteBits(1 << 6, 7);
-		stream.WriteBits(0, 128-7);
-		assert(stream.GetBitsWritten() == 128);
-	}
-
-	// Compresses a single color optimally and outputs the result.
-	static void CompressOptimalColorBC7(uint32 pixel, BitStream &stream) {
-
-		stream.WriteBits(1 << 5, 6); // Mode 5
-		stream.WriteBits(0, 2); // No rotation bits.
-
-		uint8 r = pixel & 0xFF;
-		uint8 g = (pixel >> 8) & 0xFF;
-		uint8 b = (pixel >> 16) & 0xFF;
-		uint8 a = (pixel >> 24) & 0xFF;
-
-		// Red endpoints
-		stream.WriteBits(Optimal7CompressBC7Mode5[r][0], 7);
-		stream.WriteBits(Optimal7CompressBC7Mode5[r][1], 7);
-
-		// Green endpoints
-		stream.WriteBits(Optimal7CompressBC7Mode5[g][0], 7);
-		stream.WriteBits(Optimal7CompressBC7Mode5[g][1], 7);
-
-		// Blue endpoints
-		stream.WriteBits(Optimal7CompressBC7Mode5[b][0], 7);
-		stream.WriteBits(Optimal7CompressBC7Mode5[b][1], 7);
-
-		// Alpha endpoints... are just the same.
-		stream.WriteBits(a, 8);
-		stream.WriteBits(a, 8);
-		
-		// Color indices are 1 for each pixel...
-		// Anchor index is 0, so 1 bit for the first pixel, then
-		// 01 for each following pixel giving the sequence of 31 bits:
-		// ...010101011
-		stream.WriteBits(0xaaaaaaab, 31);
-
-		// Alpha indices...
-		stream.WriteBits(kWMValues[gWMVal = (gWMVal+1) % kNumWMVals], 31); 
-	}
-
-	static int gModeChosen = -1;
-	static int gBestMode = -1;
-
-	// Compress an image using BC7 compression. Use the inBuf parameter to point to an image in
-	// 4-byte RGBA format. The width and height parameters specify the size of the image in pixels.
-	// The buffer pointed to by outBuf should be large enough to store the compressed image. This
-	// implementation has an 4:1 compression ratio.
-	void CompressImageBC7(const unsigned char *inBuf, unsigned char *outBuf, unsigned int width, unsigned int height)
-	{
-		uint32 block[16];
-		BC7CompressionMode::ResetNumUses();
-		BC7CompressionMode::MaxAnnealingIterations = min(BC7CompressionMode::kMaxAnnealingIterations, GetQualityLevel());
-
-		for(int j = 0; j < height; j += 4)
-		{
-			for(int i = 0; i < width; i += 4)
-			{
-			  // ExtractBlock(inBuf + i * 4, width, block);
-			  CompressBC7Block((const uint32 *)inBuf, outBuf);
-			  BC7CompressionMode::NumUses[gBestMode]++;
-
-			  outBuf += 16;
-			  inBuf += 64;
-			}
-		}
-	}
-
-	// Extract a 4 by 4 block of pixels from inPtr and store it in colorBlock. The width parameter
-	// specifies the size of the image in pixels.
-	static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock)
-	{
-		for(int j = 0; j < 4; j++)
-		{
-			memcpy(&colorBlock[j * 4], inPtr, 4 * 4);
-			inPtr += width * 4;
-		}
-	}
-
-	static double CompressTwoClusters(int shapeIdx, const RGBACluster *clusters, uint8 *outBuf, bool opaque) {
-
-		uint8 tempBuf1[16];
-		BitStream tmpStream1(tempBuf1, 128, 0);
-		BC7CompressionMode compressor1(1, opaque);
-			
-		double bestError = compressor1.Compress(tmpStream1, shapeIdx, clusters);
-		memcpy(outBuf, tempBuf1, 16);
-		gModeChosen = 1;
-		if(bestError == 0.0) {
-			return 0.0;
-		}
-
-		uint8 tempBuf3[16];
-		BitStream tmpStream3(tempBuf3, 128, 0);
-		BC7CompressionMode compressor3(3, opaque);
-
-		double error;
-		if((error = compressor3.Compress(tmpStream3, shapeIdx, clusters)) < bestError) {
-			gModeChosen = 3;
-			bestError = error;
-			memcpy(outBuf, tempBuf3, 16);
-			if(bestError == 0.0) {
-				return 0.0;
-			}
-		}
-		
-		// Mode 3 offers more precision for RGB data. Mode 7 is really only if we have alpha.
-		if(!opaque) 
-		{
-			uint8 tempBuf7[16];
-			BitStream tmpStream7(tempBuf7, 128, 0);
-			BC7CompressionMode compressor7(7, opaque);		
-			if((error = compressor7.Compress(tmpStream7, shapeIdx, clusters)) < bestError) {
-				gModeChosen = 7;
-				memcpy(outBuf, tempBuf7, 16);
-				return error;
-			}
-		}
-
-		return bestError;
-	}
-
-	static double CompressThreeClusters(int shapeIdx, const RGBACluster *clusters, uint8 *outBuf, bool opaque) {
-
-		uint8 tempBuf0[16];
-		BitStream tmpStream0(tempBuf0, 128, 0);
-
-		uint8 tempBuf2[16];
-		BitStream tmpStream2(tempBuf2, 128, 0);
-
-		BC7CompressionMode compressor0(0, opaque);
-		BC7CompressionMode compressor2(2, opaque);
-			
-		double error, bestError = (shapeIdx < 16)? compressor0.Compress(tmpStream0, shapeIdx, clusters) : DBL_MAX;
-		gModeChosen = 0;
-		memcpy(outBuf, tempBuf0, 16);
-		if(bestError == 0.0) {
-			return 0.0;
-		}
-
-		if((error = compressor2.Compress(tmpStream2, shapeIdx, clusters)) < bestError) {
-			gModeChosen = 2;
-			memcpy(outBuf, tempBuf2, 16);
-			return error;
-		}
-
-		return bestError;
-	}
-
-	static void PopulateTwoClustersForShape(const RGBACluster &points, int shapeIdx, RGBACluster *clusters) {
-		const uint16 shape = kShapeMask2[shapeIdx]; 
-		for(int pt = 0; pt < kMaxNumDataPoints; pt++) {
-
-			const RGBAVector &p = points.GetPoint(pt);
-
-			if((1 << pt) & shape)
-				clusters[1].AddPoint(p);
-			else
-				clusters[0].AddPoint(p);
-		}
-
-		assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString()));
-		assert((clusters[0].GetPointBitString() ^ clusters[1].GetPointBitString()) == 0xFFFF);
-		assert((shape & clusters[1].GetPointBitString()) == shape);
-	}
-
-	static void PopulateThreeClustersForShape(const RGBACluster &points, int shapeIdx, RGBACluster *clusters) {
-		for(int pt = 0; pt < kMaxNumDataPoints; pt++) {
-
-			const RGBAVector &p = points.GetPoint(pt);
-
-			if((1 << pt) & kShapeMask3[shapeIdx][0]) {
-				if((1 << pt) & kShapeMask3[shapeIdx][1])
-					clusters[2].AddPoint(p);
-				else
-					clusters[1].AddPoint(p);
-			}
-			else
-				clusters[0].AddPoint(p);
-		}
-
-		assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString()));
-		assert(!(clusters[2].GetPointBitString() & clusters[1].GetPointBitString()));
-		assert(!(clusters[0].GetPointBitString() & clusters[2].GetPointBitString()));
-	}
-
-	static double EstimateTwoClusterError(RGBACluster &c) {
-		RGBAVector Min, Max, v;
-		c.GetBoundingBox(Min, Max);
-		v = Max - Min;
-		if(v * v == 0) {
-			return 0.0;
-		}
-
-		const float *w = BC7C::GetErrorMetric();
-		return 0.0001 + c.QuantizedError(Min, Max, 8, 0xFFFFFFFF, RGBAVector(w[0], w[1], w[2], w[3]));
-	}
-
-	static double EstimateThreeClusterError(RGBACluster &c) {
-		RGBAVector Min, Max, v;
-		c.GetBoundingBox(Min, Max);
-		v = Max - Min;
-		if(v * v == 0) {
-			return 0.0;
-		}
-
-		const float *w = BC7C::GetErrorMetric();
-		return 0.0001 + c.QuantizedError(Min, Max, 4, 0xFFFFFFFF, RGBAVector(w[0], w[1], w[2], w[3]));
-	}
-
-	// Compress a single block.
-	static void CompressBC7Block(const uint32 *block, uint8 *outBuf) {
-
-		// All a single color?
-		if(AllOneColor(block)) {
-			BitStream bStrm(outBuf, 128, 0);
-			CompressOptimalColorBC7(*block, bStrm);
-			gBestMode = 5;
-			return;
-		}
-
-		RGBACluster blockCluster;
-		bool opaque = true;
-		bool transparent = true;
-
-		for(int i = 0; i < kMaxNumDataPoints; i++) {
-			RGBAVector p = RGBAVector(i, block[i]);
-			blockCluster.AddPoint(p);
-			if(fabs(p.a - 255.0f) > 1e-10)
-				opaque = false;
-
-			if(p.a > 0.0f)
-				transparent = false;
-		}
-
-		// The whole block is transparent?
-		if(transparent) {
-			BitStream bStrm(outBuf, 128, 0);
-			WriteTransparentBlock(bStrm);
-			gBestMode = 6;
-			return;
-		}
-
-		// First we must figure out which shape to use. To do this, simply
-		// see which shape has the smallest sum of minimum bounding spheres.
-		double bestError[2] = { DBL_MAX, DBL_MAX };
-		int bestShapeIdx[2] = { -1, -1 };
-		RGBACluster bestClusters[2][3];
-
-		for(int i = 0; i < kNumShapes2; i++) 
-		{
-			RGBACluster clusters[2];
-			PopulateTwoClustersForShape(blockCluster, i, clusters);
-
-			double err = 0.0;
-			for(int ci = 0; ci < 2; ci++) {
-				err += EstimateTwoClusterError(clusters[ci]);
-			}
-
-			// If it's small, we'll take it!
-			if(err < 1e-9) {
-				CompressTwoClusters(i, clusters, outBuf, opaque);
-				gBestMode = gModeChosen;
-				return;
-			}
-			
-			if(err < bestError[0]) {
-				bestError[0] = err;
-				bestShapeIdx[0] = i;
-				bestClusters[0][0] = clusters[0];
-				bestClusters[0][1] = clusters[1];
-			}
-		}
-
-		// There are not 3 subset blocks that support alpha, so only check these
-		// if the entire block is opaque.
-		if(opaque) {
-			for(int i = 0; i < kNumShapes3; i++) {
-
-				RGBACluster clusters[3];
-				PopulateThreeClustersForShape(blockCluster, i, clusters);
-
-				double err = 0.0;
-				for(int ci = 0; ci < 3; ci++) {
-					err += EstimateThreeClusterError(clusters[ci]);
-				}
-
-				// If it's small, we'll take it!
-				if(err < 1e-9) {
-					CompressThreeClusters(i, clusters, outBuf, opaque);
-					gBestMode = gModeChosen;
-					return;
-				}
-
-				if(err < bestError[1]) {
-					bestError[1] = err;
-					bestShapeIdx[1] = i;
-					bestClusters[1][0] = clusters[0];
-					bestClusters[1][1] = clusters[1];
-					bestClusters[1][2] = clusters[2];
-				}
-			}
-		}
-
-		uint8 tempBuf1[16], tempBuf2[16];
-
-		BitStream tempStream1 (tempBuf1, 128, 0);
-		BC7CompressionMode compressor(6, opaque);
-		double best = compressor.Compress(tempStream1, 0, &blockCluster);
-		gBestMode = 6;
-		if(best == 0.0f) {
-			memcpy(outBuf, tempBuf1, 16);
-			return;
-		}
-
-		// Check modes 4 and 5 if the block isn't opaque...
-		if(!opaque) {
-			for(int mode = 4; mode <= 5; mode++) {
-
-				BitStream tempStream2(tempBuf2, 128, 0);
-				BC7CompressionMode compressorTry(mode, opaque);
-
-				double error = compressorTry.Compress(tempStream2, 0, &blockCluster);
-				if(error < best) {
-
-					gBestMode = mode;
-					best = error;
-
-					if(best == 0.0f) {
-						memcpy(outBuf, tempBuf2, 16);
-						return;
-					}
-					else {
-						memcpy(tempBuf1, tempBuf2, 16);
-					}
-				}
-			}
-		}
-
-		double error = CompressTwoClusters(bestShapeIdx[0], bestClusters[0], tempBuf2, opaque);
-		if(error < best) {
-
-			gBestMode = gModeChosen;
-			best = error;
-			
-			if(error == 0.0f) {
-				memcpy(outBuf, tempBuf2, 16);
-				return;
-			}
-			else {
-				memcpy(tempBuf1, tempBuf2, 16);
-			}
-		}
-
-		if(opaque) {
-			if(CompressThreeClusters(bestShapeIdx[1], bestClusters[1], tempBuf2, opaque) < best) {
-
-				gBestMode = gModeChosen;
-				memcpy(outBuf, tempBuf2, 16);
-
-				return;
-			}
-		}
-
-		memcpy(outBuf, tempBuf1, 16);
-	}
-
-	static void DecompressBC7Block(const uint8 block[16], uint32 outBuf[16]) {
-
-		BitStreamReadOnly strm(block);
-
-		uint32 mode = 0;
-		while(!strm.ReadBit()) {
-			mode++;
-		}
-
-		const BC7CompressionMode::Attributes *attrs = BC7CompressionMode::GetAttributesForMode(mode);
-		const uint32 nSubsets = attrs->numSubsets;
-
-		uint32 idxMode = 0;
-		uint32 rotMode = 0;
-		uint32 shapeIdx = 0;
-		if ( nSubsets > 1 ) {
-			shapeIdx = strm.ReadBits(mode == 0? 4 : 6);
-		}
-		else if( attrs->hasRotation ) {
-			rotMode = strm.ReadBits(2);
-			if( attrs->hasIdxMode )
-				idxMode = strm.ReadBit();
-		}
-
-		assert(idxMode < 2);
-		assert(rotMode < 4);
-		assert(shapeIdx < ((mode == 0)? 16 : 64));
-
-		uint32 cp = attrs->colorChannelPrecision;
-		const uint32 shift = 8 - cp;
-
-		uint8 eps[3][2][4];
-		for(uint32 ch = 0; ch < 3; ch++)
-		for(uint32 i = 0; i < nSubsets; i++)
-		for(uint32 ep = 0; ep < 2; ep++) 
-			eps[i][ep][ch] = strm.ReadBits(cp) << shift;
-
-		uint32 ap = attrs->alphaChannelPrecision;
-		const uint32 ash = 8 - ap;
-
-		for(uint32 i = 0; i < nSubsets; i++)
-		for(uint32 ep = 0; ep < 2; ep++) 
-			eps[i][ep][3] = strm.ReadBits(ap) << ash;
-
-		// Handle pbits
-		switch(attrs->pbitType) {
-			case BC7CompressionMode::ePBitType_None:
-				// Do nothing.
-			break;
-
-			case BC7CompressionMode::ePBitType_Shared:
-
-				cp += 1;
-				ap += 1;
-
-				for(uint32 i = 0; i < nSubsets; i++) {
-
-					uint32 pbit = strm.ReadBit();
-
-					for(uint32 j = 0; j < 2; j++)
-					for(uint32 ch = 0; ch < kNumColorChannels; ch++) {
-						const uint32 prec = ch == 3? ap : cp;
-						eps[i][j][ch] |= pbit << (8-prec);
-					}
-				}
-			break;
-
-			case BC7CompressionMode::ePBitType_NotShared:
-				
-				cp += 1;
-				ap += 1;
-			
-				for(uint32 i = 0; i < nSubsets; i++)
-				for(uint32 j = 0; j < 2; j++) {
-
-					uint32 pbit = strm.ReadBit();
-
-					for(uint32 ch = 0; ch < kNumColorChannels; ch++) {
-						const uint32 prec = ch == 3? ap : cp;
-						eps[i][j][ch] |= pbit << (8-prec);
-					}
-				}
-			break;
-		}
-
-		// Quantize endpoints...
-		for(uint32 i = 0; i < nSubsets; i++)
-		for(uint32 j = 0; j < 2; j++)
-		for(uint32 ch = 0; ch < kNumColorChannels; ch++) {
-			const uint32 prec = ch == 3? ap : cp;
-			eps[i][j][ch] |= eps[i][j][ch] >> prec;		
-		}
-
-		// Figure out indices...
-		uint32 alphaIndices[kMaxNumDataPoints];
-		uint32 colorIndices[kMaxNumDataPoints];
-
-		int nBitsPerAlpha = attrs->numBitsPerAlpha;
-		int nBitsPerColor = attrs->numBitsPerIndex;
-
-		uint32 idxPrec = attrs->numBitsPerIndex;
-		for(int i = 0; i < kMaxNumDataPoints; i++) {
-			uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets);
-
-			int idx = 0;
-			if(GetAnchorIndexForSubset(subset, shapeIdx, nSubsets) == i) {
-				idx = strm.ReadBits(idxPrec - 1);
-			}
-			else {
-				idx = strm.ReadBits(idxPrec);
-			}
-			colorIndices[i] = idx;
-		}
-
-		idxPrec = attrs->numBitsPerAlpha;
-		if(idxPrec == 0) {
-			memcpy(alphaIndices, colorIndices, sizeof(alphaIndices));
-		}
-		else {
-			for(int i = 0; i < kMaxNumDataPoints; i++) {
-				uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets);
-
-				int idx = 0;
-				if(GetAnchorIndexForSubset(subset, shapeIdx, nSubsets) == i) {
-					idx = strm.ReadBits(idxPrec - 1);
-				}
-				else {
-					idx = strm.ReadBits(idxPrec);
-				}
-				alphaIndices[i] = idx;
-			}
-
-			if(idxMode) {
-				for(int i = 0; i < kMaxNumDataPoints; i++) {
-					swap(alphaIndices[i], colorIndices[i]);
-				}
-
-				swap(nBitsPerAlpha, nBitsPerColor);
-			}
-		}
-
-		assert(strm.GetBitsRead() == 128);
-
-		// Get final colors by interpolating...
-		for(int i = 0; i < kMaxNumDataPoints; i++) {
-
-			const uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets);
-			uint32 &pixel = outBuf[i];
-			
-			pixel = 0;
-			for(int ch = 0; ch < 3; ch++) {
-				uint32 i0 = kBC7InterpolationValues[nBitsPerColor - 1][colorIndices[i]][0];
-				uint32 i1 = kBC7InterpolationValues[nBitsPerColor - 1][colorIndices[i]][1];
-
-				const uint8 ip = (((uint32(eps[subset][0][ch]) * i0) + (uint32(eps[subset][1][ch]) * i1) + 32) >> 6) & 0xFF;
-				pixel |= ip << (8*ch);
-			}
-
-			if(attrs->alphaChannelPrecision > 0) {
-				uint32 i0 = kBC7InterpolationValues[nBitsPerAlpha - 1][alphaIndices[i]][0];
-				uint32 i1 = kBC7InterpolationValues[nBitsPerAlpha - 1][alphaIndices[i]][1];
-
-				const uint8 ip = (((uint32(eps[subset][0][3]) * i0) + (uint32(eps[subset][1][3]) * i1) + 32) >> 6) & 0xFF;
-				pixel |= ip << 24;
-			}
-			else {
-				pixel |= 0xFF000000;
-			}
-			
-			// Swap colors if necessary...
-			uint8 *pb = (uint8 *)&pixel;
-			switch(rotMode) {
-				default:
-				case 0:
-					// Do nothing
-					break;
-
-				case 1:
-					swap(pb[0], pb[3]);
-					break;
-
-				case 2:
-					swap(pb[1], pb[3]);
-					break;
-
-				case 3:
-					swap(pb[2], pb[3]);
-					break;
-			}
-		}
-	}
-
-	// Convert the image from a BC7 buffer to a RGBA8 buffer
-	void DecompressImageBC7(const uint8 *inBuf, uint8* outBuf, int width, int height) {
-
-		int blockIdx = 0;
-		for(int j = 0; j < height; j += 4, outBuf += width * 3 * 4)
-		{
-			for(int i = 0; i < width; i += 4)
-			{
-				uint32 pixels[16];
-				DecompressBC7Block(inBuf + (16*(blockIdx++)), pixels);
-
-				memcpy(outBuf, pixels, 4 * sizeof(uint32));
-				memcpy(outBuf + (width * 4), pixels + 4, 4 * sizeof(uint32));
-				memcpy(outBuf + 2*(width * 4), pixels + 8, 4 * sizeof(uint32));
-				memcpy(outBuf + 3*(width * 4), pixels + 12, 4 * sizeof(uint32));
-				outBuf += 16;
-			}
-		}
-	}
+  static ErrorMetric gErrorMetric = eErrorMetric_Uniform; // Metric currently in effect.
+  void SetErrorMetric(ErrorMetric e) { gErrorMetric = e; }
+  // Per-channel weights, one row per error metric; GetErrorMetric() indexes it with the active metric.
+  ALIGN_SSE const float kErrorMetrics[kNumErrorMetrics][kNumColorChannels] = {
+    { 1.0f, 1.0f, 1.0f, 1.0f }, // Equal weight for every channel.
+    { sqrtf(0.3f), sqrtf(0.56f), sqrtf(0.11f), 1.0f } // NOTE(review): presumably a luminance-style weighting of R, G, B -- confirm.
+  };
+  // Accessors for the weight row of the active metric and for the active metric itself.
+  const float *GetErrorMetric() { return kErrorMetrics[GetErrorMetricEnum()]; }
+  ErrorMetric GetErrorMetricEnum() { return gErrorMetric; }
+
+  // Function prototypes
+  static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock);
+  static void CompressBC7Block(const uint32 *block, uint8 *outBuf);
+
+  static int gQualityLevel = 50; // Never negative; SetQualityLevel clamps it.
+  void SetQualityLevel(int q) {
+    gQualityLevel = (q < 0)? 0 : q; // Negative requests are stored as zero.
+  }
+  int GetQualityLevel() { return gQualityLevel; }
+
+  // True when all sixteen pixels of the block hold the same packed 32-bit value.
+  static bool AllOneColor(const uint32 block[16]) {
+    const uint32 *cur = block + 1;
+    const uint32 *const end = block + 16;
+    // Compare every later pixel against the first one; stop at the first mismatch.
+    while(cur != end && *cur == block[0]) {
+      ++cur;
+    }
+    return cur == end;
+  }
+
+  // Writes a complete 128-bit block: a 7-bit mode 6 header followed by 121 zero bits.
+  static void WriteTransparentBlock(BitStream &stream) {
+    // Mode 6 header. Assuming WriteBits emits the least significant bit first (TODO confirm), this is six zero bits then a one, which the decoder's leading-zero count reads as mode 6.
+    stream.WriteBits(1 << 6, 7);
+    stream.WriteBits(0, 128-7);
+    assert(stream.GetBitsWritten() == 128);
+  }
+
+  // Encodes a block whose pixels all equal 'pixel' as mode 5, taking the RGB endpoint pairs from Optimal7CompressBC7Mode5.
+  static void CompressOptimalColorBC7(uint32 pixel, BitStream &stream) {
+    // Header: 6 mode bits followed by 2 rotation bits.
+    stream.WriteBits(1 << 5, 6); // Mode 5
+    stream.WriteBits(0, 2); // No rotation bits.
+
+    uint8 r = pixel & 0xFF;
+    uint8 g = (pixel >> 8) & 0xFF;
+    uint8 b = (pixel >> 16) & 0xFF;
+    uint8 a = (pixel >> 24) & 0xFF;
+
+    // Red endpoints
+    stream.WriteBits(Optimal7CompressBC7Mode5[r][0], 7);
+    stream.WriteBits(Optimal7CompressBC7Mode5[r][1], 7);
+
+    // Green endpoints
+    stream.WriteBits(Optimal7CompressBC7Mode5[g][0], 7);
+    stream.WriteBits(Optimal7CompressBC7Mode5[g][1], 7);
+
+    // Blue endpoints
+    stream.WriteBits(Optimal7CompressBC7Mode5[b][0], 7);
+    stream.WriteBits(Optimal7CompressBC7Mode5[b][1], 7);
+
+    // Alpha endpoints are written at 8 bits each and are both the pixel's alpha.
+    stream.WriteBits(a, 8);
+    stream.WriteBits(a, 8);
+    
+    // Color indices are 1 for each pixel...
+    // Anchor index is 0, so 1 bit for the first pixel, then
+    // 01 for each following pixel giving the sequence of 31 bits:
+    // ...010101011
+    stream.WriteBits(0xaaaaaaab, 31);
+
+    // Alpha indices: the next kWMValues entry, with gWMVal advanced cyclically on every call. NOTE(review): with equal alpha endpoints these bits presumably do not change the decoded alpha -- confirm.
+    stream.WriteBits(kWMValues[gWMVal = (gWMVal+1) % kNumWMVals], 31); 
+  }
+
+  static int gModeChosen = -1; // Mode picked by the last CompressTwoClusters / CompressThreeClusters call.
+  static int gBestMode = -1; // Mode of the block most recently written by CompressBC7Block.
+
+  static void DecompressBC7Block(const uint8 block[16], uint32 outBuf[16]);
+
+  // Compress an image using BC7 compression. inBuf holds 4-byte RGBA pixels that are already
+  // arranged block by block: every 4x4 block is read as 64 consecutive bytes (no ExtractBlock step).
+  // The width and height parameters specify the size of the image in pixels. outBuf receives
+  // 16 bytes per block, so it must hold a quarter of the input size (a 4:1 compression ratio).
+  void CompressImageBC7(const unsigned char *inBuf, unsigned char *outBuf, unsigned int width, unsigned int height)
+  {
+    // Reset the per-mode statistics and derive the annealing budget from the quality level.
+    BC7CompressionMode::ResetNumUses();
+    BC7CompressionMode::MaxAnnealingIterations = min(BC7CompressionMode::kMaxAnnealingIterations, GetQualityLevel());
+
+    for(unsigned int j = 0; j < height; j += 4)
+    {
+      for(unsigned int i = 0; i < width; i += 4)
+      {
+        // The counters only count blocks; inBuf itself already points at this block's 16 pixels.
+        CompressBC7Block((const uint32 *)inBuf, outBuf);
+        BC7CompressionMode::NumUses[gBestMode]++;
+
+#ifndef NDEBUG
+        // Sanity check: decode the block just written and compare it byte by byte with the source.
+        uint32 unComp[16];
+        DecompressBC7Block((const uint8 *)outBuf, unComp);
+        const uint8 *unCompData = (const uint8 *)unComp;
+
+        int diffSum = 0;
+        for(int k = 0; k < 64; k++) {
+          diffSum += sad(unCompData[k], inBuf[k]);
+        }
+        double blockError = double(diffSum) / 64.0;
+        if(blockError > 50.0) {
+          fprintf(stderr, "WARNING: Block error very high (%.2f)\n", blockError);
+        }
+#endif
+
+        outBuf += 16;
+        inBuf += 64;
+      }
+    }
+  }
+
+  // Gathers a 4x4 pixel block into colorBlock: four rows of four 4-byte pixels, where consecutive
+  // rows start width * 4 bytes apart in the source image beginning at inPtr.
+  static void ExtractBlock(const uint8* inPtr, int width, uint32* colorBlock)
+  {
+    const int rowStride = width * 4;
+    const uint8 *srcRow = inPtr;
+    for(uint32 *dstRow = colorBlock; dstRow != colorBlock + 16; dstRow += 4, srcRow += rowStride) {
+      memcpy(dstRow, srcRow, 4 * 4);
+    }
+  }
+
+  static double CompressTwoClusters(int shapeIdx, const RGBACluster *clusters, uint8 *outBuf, bool opaque) {
+    // Tries modes 1 and 3 (and 7 when the block is not opaque), copies the lowest-error block to outBuf, stores that mode in gModeChosen and returns its error.
+    uint8 tempBuf1[16];
+    BitStream tmpStream1(tempBuf1, 128, 0);
+    BC7CompressionMode compressor1(1, opaque);
+      
+    double bestError = compressor1.Compress(tmpStream1, shapeIdx, clusters);
+    memcpy(outBuf, tempBuf1, 16);
+    gModeChosen = 1;
+    if(bestError == 0.0) {
+      return 0.0;
+    }
+
+    uint8 tempBuf3[16];
+    BitStream tmpStream3(tempBuf3, 128, 0);
+    BC7CompressionMode compressor3(3, opaque);
+
+    double error;
+    if((error = compressor3.Compress(tmpStream3, shapeIdx, clusters)) < bestError) {
+      gModeChosen = 3;
+      bestError = error;
+      memcpy(outBuf, tempBuf3, 16);
+      if(bestError == 0.0) {
+        return 0.0;
+      }
+    }
+    
+    // Mode 3 offers more precision for RGB data. Mode 7 is really only if we have alpha.
+    if(!opaque) 
+    {
+      uint8 tempBuf7[16];
+      BitStream tmpStream7(tempBuf7, 128, 0);
+      BC7CompressionMode compressor7(7, opaque);    
+      if((error = compressor7.Compress(tmpStream7, shapeIdx, clusters)) < bestError) {
+        gModeChosen = 7;
+        memcpy(outBuf, tempBuf7, 16);
+        return error;
+      }
+    }
+
+    return bestError;
+  }
+
+  static double CompressThreeClusters(int shapeIdx, const RGBACluster *clusters, uint8 *outBuf, bool opaque) {
+    // Tries the three-subset modes 0 and 2, copies the better block to outBuf, stores its mode in gModeChosen and returns its error.
+    uint8 tempBuf0[16];
+    BitStream tmpStream0(tempBuf0, 128, 0);
+
+    uint8 tempBuf2[16];
+    BitStream tmpStream2(tempBuf2, 128, 0);
+    BC7CompressionMode compressor0(0, opaque);
+    BC7CompressionMode compressor2(2, opaque);
+    // Mode 0 stores only 4 partition bits, so it is a candidate for shapes 0..15 only.
+    double bestError = DBL_MAX;
+    if(shapeIdx < 16) {
+      bestError = compressor0.Compress(tmpStream0, shapeIdx, clusters);
+      gModeChosen = 0;
+      memcpy(outBuf, tempBuf0, 16);
+      if(bestError == 0.0) { return 0.0; }
+    }
+    // Take mode 2 when it wins, and always when mode 0 was skipped (tempBuf0 was never written then).
+    const double error = compressor2.Compress(tmpStream2, shapeIdx, clusters);
+    if(shapeIdx >= 16 || error < bestError) {
+      gModeChosen = 2;
+      memcpy(outBuf, tempBuf2, 16);
+      return error;
+    }
+    return bestError;
+  }
+
+  static void PopulateTwoClustersForShape(const RGBACluster &points, int shapeIdx, RGBACluster *clusters) {
+    // Bit k of the two-subset shape mask selects the cluster (0 or 1) that
+    // receives point k of the block.
+    const uint16 shape = kShapeMask2[shapeIdx];
+    for(int k = 0; k < kMaxNumDataPoints; k++) {
+      const int dst = ((1 << k) & shape)? 1 : 0;
+      clusters[dst].AddPoint(points.GetPoint(k));
+    }
+
+    // Sanity checks: the clusters share no point, together they set all 16
+    // bits, and cluster 1 holds every point flagged in the mask.
+
+    assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString()));
+    assert((clusters[0].GetPointBitString() ^ clusters[1].GetPointBitString()) == 0xFFFF);
+    assert((shape & clusters[1].GetPointBitString()) == shape);
+  }
+
+  static void PopulateThreeClustersForShape(const RGBACluster &points, int shapeIdx, RGBACluster *clusters) {
+    // A point goes to cluster 0 unless its bit is set in the first mask; in
+    // that case the second mask decides between cluster 2 (set) and 1 (clear).
+    const uint16 firstMask = kShapeMask3[shapeIdx][0];
+    const uint16 secondMask = kShapeMask3[shapeIdx][1];
+    for(int k = 0; k < kMaxNumDataPoints; k++) {
+      const int bit = 1 << k;
+
+      int dst = 0;
+      if(bit & firstMask) {
+        dst = (bit & secondMask)? 2 : 1;
+      }
+      clusters[dst].AddPoint(points.GetPoint(k));
+    }
+
+    assert(!(clusters[0].GetPointBitString() & clusters[1].GetPointBitString()));
+    assert(!(clusters[2].GetPointBitString() & clusters[1].GetPointBitString()));
+    assert(!(clusters[0].GetPointBitString() & clusters[2].GetPointBitString()));
+  }
+
+  static double EstimateTwoClusterError(RGBACluster &c) {
+    // A cluster whose bounding box has zero extent costs nothing. Otherwise the estimate is a small
+    // constant plus QuantizedError between the box corners (argument 8), weighted by the active metric.
+    RGBAVector lo, hi;
+    c.GetBoundingBox(lo, hi);
+    const RGBAVector extent = hi - lo;
+    if(extent * extent == 0) { return 0.0; }
+    const float *weights = BC7C::GetErrorMetric();
+    const RGBAVector metric(weights[0], weights[1], weights[2], weights[3]);
+    return 0.0001 + c.QuantizedError(lo, hi, 8, 0xFFFFFFFF, metric);
+  }
+
+  static double EstimateThreeClusterError(RGBACluster &c) {
+    // A cluster whose bounding box has zero extent costs nothing. Otherwise the estimate is a small
+    // constant plus QuantizedError between the box corners (argument 4), weighted by the active metric.
+    RGBAVector lo, hi;
+    c.GetBoundingBox(lo, hi);
+    const RGBAVector extent = hi - lo;
+    if(extent * extent == 0) { return 0.0; }
+    const float *weights = BC7C::GetErrorMetric();
+    const RGBAVector metric(weights[0], weights[1], weights[2], weights[3]);
+    return 0.0001 + c.QuantizedError(lo, hi, 4, 0xFFFFFFFF, metric);
+  }
+
+  // Compresses one block of 16 packed pixels into the 16 bytes at outBuf and records the mode used in gBestMode.
+  static void CompressBC7Block(const uint32 *block, uint8 *outBuf) {
+
+    // All a single color?
+    if(AllOneColor(block)) {
+      BitStream bStrm(outBuf, 128, 0);
+      CompressOptimalColorBC7(*block, bStrm);
+      gBestMode = 5;
+      return;
+    }
+
+    RGBACluster blockCluster;
+    bool opaque = true;
+    bool transparent = true;
+
+    for(int i = 0; i < kMaxNumDataPoints; i++) {
+      RGBAVector p = RGBAVector(i, block[i]);
+      blockCluster.AddPoint(p);
+      if(fabs(p.a - 255.0f) > 1e-10)
+        opaque = false;
+
+      if(p.a > 0.0f)
+        transparent = false;
+    }
+
+    // The whole block is transparent?
+    if(transparent) {
+      BitStream bStrm(outBuf, 128, 0);
+      WriteTransparentBlock(bStrm);
+      gBestMode = 6;
+      return;
+    }
+
+    // First we must figure out which shape to use. To do this, keep, for two and for three
+    // subsets, the shape whose clusters have the smallest summed error estimate.
+    double bestError[2] = { DBL_MAX, DBL_MAX };
+    int bestShapeIdx[2] = { -1, -1 };
+    RGBACluster bestClusters[2][3];
+
+    for(int i = 0; i < kNumShapes2; i++) 
+    {
+      RGBACluster clusters[2];
+      PopulateTwoClustersForShape(blockCluster, i, clusters);
+
+      double err = 0.0;
+      for(int ci = 0; ci < 2; ci++) {
+        err += EstimateTwoClusterError(clusters[ci]);
+      }
+
+      // If it's small, we'll take it!
+      if(err < 1e-9) {
+        CompressTwoClusters(i, clusters, outBuf, opaque);
+        gBestMode = gModeChosen;
+        return;
+      }
+      
+      if(err < bestError[0]) {
+        bestError[0] = err;
+        bestShapeIdx[0] = i;
+        bestClusters[0][0] = clusters[0];
+        bestClusters[0][1] = clusters[1];
+      }
+    }
+
+    // The three-subset modes tried here are only used for opaque blocks, so only
+    // check these shapes if the entire block is opaque.
+    if(opaque) {
+      for(int i = 0; i < kNumShapes3; i++) {
+
+        RGBACluster clusters[3];
+        PopulateThreeClustersForShape(blockCluster, i, clusters);
+
+        double err = 0.0;
+        for(int ci = 0; ci < 3; ci++) {
+          err += EstimateThreeClusterError(clusters[ci]);
+        }
+
+        // If it's small, we'll take it!
+        if(err < 1e-9) {
+          CompressThreeClusters(i, clusters, outBuf, opaque);
+          gBestMode = gModeChosen;
+          return;
+        }
+
+        if(err < bestError[1]) {
+          bestError[1] = err;
+          bestShapeIdx[1] = i;
+          bestClusters[1][0] = clusters[0];
+          bestClusters[1][1] = clusters[1];
+          bestClusters[1][2] = clusters[2];
+        }
+      }
+    }
+                
+    uint8 tempBuf1[16], tempBuf2[16];
+
+    BitStream tempStream1 (tempBuf1, 128, 0);
+    BC7CompressionMode compressor(6, opaque);
+    double best = compressor.Compress(tempStream1, 0, &blockCluster);
+    gBestMode = 6;
+    if(best == 0.0f) {
+      memcpy(outBuf, tempBuf1, 16);
+      return;
+    }
+
+    // Check modes 4 and 5 if the block isn't opaque...
+    if(!opaque) {
+      for(int mode = 4; mode <= 5; mode++) {
+
+        BitStream tempStream2(tempBuf2, 128, 0);
+        BC7CompressionMode compressorTry(mode, opaque);
+
+        double error = compressorTry.Compress(tempStream2, 0, &blockCluster);
+        if(error < best) {
+
+          gBestMode = mode;
+          best = error;
+
+          if(best == 0.0f) {
+            memcpy(outBuf, tempBuf2, 16);
+            return;
+          }
+          else {
+            memcpy(tempBuf1, tempBuf2, 16);
+          }
+        }
+      }
+    }
+
+    double error = CompressTwoClusters(bestShapeIdx[0], bestClusters[0], tempBuf2, opaque);
+    if(error < best) {
+
+      gBestMode = gModeChosen;
+      best = error;
+      
+      if(error == 0.0f) {
+        memcpy(outBuf, tempBuf2, 16);
+        return;
+      }
+      else {
+        memcpy(tempBuf1, tempBuf2, 16);
+      }
+    }
+
+    if(opaque) {
+      if(CompressThreeClusters(bestShapeIdx[1], bestClusters[1], tempBuf2, opaque) < best) {
+
+        gBestMode = gModeChosen;
+        memcpy(outBuf, tempBuf2, 16);
+
+        return;
+      }
+    }
+
+    memcpy(outBuf, tempBuf1, 16);
+  }
+
+  static void DecompressBC7Block(const uint8 block[16], uint32 outBuf[16]) {
+    // Decodes one 128-bit block into 16 packed pixels (channel 0 in the low byte, alpha in the high byte, before rotation).
+    BitStreamReadOnly strm(block);
+
+    uint32 mode = 0;
+    while(!strm.ReadBit()) {
+      mode++;
+    }
+
+    const BC7CompressionMode::Attributes *attrs = BC7CompressionMode::GetAttributesForMode(mode);
+    const uint32 nSubsets = attrs->numSubsets;
+
+    uint32 idxMode = 0;
+    uint32 rotMode = 0;
+    uint32 shapeIdx = 0;
+    if ( nSubsets > 1 ) {
+      shapeIdx = strm.ReadBits(mode == 0? 4 : 6);
+    }
+    else if( attrs->hasRotation ) {
+      rotMode = strm.ReadBits(2);
+      if( attrs->hasIdxMode )
+        idxMode = strm.ReadBit();
+    }
+
+    assert(idxMode < 2);
+    assert(rotMode < 4);
+    assert(shapeIdx < ((mode == 0)? 16 : 64));
+
+    uint32 cp = attrs->colorChannelPrecision;
+    const uint32 shift = 8 - cp;
+
+    uint8 eps[3][2][4];
+    for(uint32 ch = 0; ch < 3; ch++)
+    for(uint32 i = 0; i < nSubsets; i++)
+    for(uint32 ep = 0; ep < 2; ep++) 
+      eps[i][ep][ch] = strm.ReadBits(cp) << shift;
+
+    uint32 ap = attrs->alphaChannelPrecision;
+    const uint32 ash = 8 - ap;
+
+    for(uint32 i = 0; i < nSubsets; i++)
+    for(uint32 ep = 0; ep < 2; ep++) 
+      eps[i][ep][3] = strm.ReadBits(ap) << ash;
+
+    // Handle pbits
+    switch(attrs->pbitType) {
+      case BC7CompressionMode::ePBitType_None:
+        // Do nothing.
+      break;
+
+      case BC7CompressionMode::ePBitType_Shared:
+
+        cp += 1;
+        ap += 1;
+
+        for(uint32 i = 0; i < nSubsets; i++) {
+
+          uint32 pbit = strm.ReadBit();
+
+          for(uint32 j = 0; j < 2; j++)
+          for(uint32 ch = 0; ch < kNumColorChannels; ch++) {
+            const uint32 prec = ch == 3? ap : cp;
+            eps[i][j][ch] |= pbit << (8-prec);
+          }
+        }
+      break;
+
+      case BC7CompressionMode::ePBitType_NotShared:
+        
+        cp += 1;
+        ap += 1;
+      
+        for(uint32 i = 0; i < nSubsets; i++)
+        for(uint32 j = 0; j < 2; j++) {
+
+          uint32 pbit = strm.ReadBit();
+
+          for(uint32 ch = 0; ch < kNumColorChannels; ch++) {
+            const uint32 prec = ch == 3? ap : cp;
+            eps[i][j][ch] |= pbit << (8-prec);
+          }
+        }
+      break;
+    }
+
+    // Quantize endpoints...
+    for(uint32 i = 0; i < nSubsets; i++)
+    for(uint32 j = 0; j < 2; j++)
+    for(uint32 ch = 0; ch < kNumColorChannels; ch++) {
+      const uint32 prec = ch == 3? ap : cp;
+      eps[i][j][ch] |= eps[i][j][ch] >> prec;    
+    }
+
+    // Figure out indices...
+    uint32 alphaIndices[kMaxNumDataPoints];
+    uint32 colorIndices[kMaxNumDataPoints];
+
+    int nBitsPerAlpha = attrs->numBitsPerAlpha;
+    int nBitsPerColor = attrs->numBitsPerIndex;
+
+    uint32 idxPrec = attrs->numBitsPerIndex;
+    for(int i = 0; i < kMaxNumDataPoints; i++) {
+      uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets);
+
+      int idx = 0;
+      if(GetAnchorIndexForSubset(subset, shapeIdx, nSubsets) == i) {
+        idx = strm.ReadBits(idxPrec - 1);
+      }
+      else {
+        idx = strm.ReadBits(idxPrec);
+      }
+      colorIndices[i] = idx;
+    }
+
+    idxPrec = attrs->numBitsPerAlpha;
+    if(idxPrec == 0) {
+      memcpy(alphaIndices, colorIndices, sizeof(alphaIndices));
+    }
+    else {
+      for(int i = 0; i < kMaxNumDataPoints; i++) {
+        uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets);
+
+        int idx = 0;
+        if(GetAnchorIndexForSubset(subset, shapeIdx, nSubsets) == i) {
+          idx = strm.ReadBits(idxPrec - 1);
+        }
+        else {
+          idx = strm.ReadBits(idxPrec);
+        }
+        alphaIndices[i] = idx;
+      }
+
+      if(idxMode) {
+        for(int i = 0; i < kMaxNumDataPoints; i++) {
+          swap(alphaIndices[i], colorIndices[i]);
+        }
+
+        swap(nBitsPerAlpha, nBitsPerColor);
+      }
+    }
+
+    assert(strm.GetBitsRead() == 128);
+
+    // Get final colors by interpolating...
+    for(int i = 0; i < kMaxNumDataPoints; i++) {
+
+      const uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets);
+      uint32 &pixel = outBuf[i];
+      
+      pixel = 0;
+      for(int ch = 0; ch < 3; ch++) {
+        uint32 i0 = kBC7InterpolationValues[nBitsPerColor - 1][colorIndices[i]][0];
+        uint32 i1 = kBC7InterpolationValues[nBitsPerColor - 1][colorIndices[i]][1];
+
+        const uint8 ip = (((uint32(eps[subset][0][ch]) * i0) + (uint32(eps[subset][1][ch]) * i1) + 32) >> 6) & 0xFF;
+        pixel |= uint32(ip) << (8*ch);
+      }
+      // Alpha is written exactly once: blocks without dedicated alpha indices reuse the color indices at the color index precision.
+      if(attrs->alphaChannelPrecision > 0) {
+        const int nAlphaBits = (nBitsPerAlpha > 0)? nBitsPerAlpha : nBitsPerColor;
+        uint32 i0 = kBC7InterpolationValues[nAlphaBits - 1][alphaIndices[i]][0];
+        uint32 i1 = kBC7InterpolationValues[nAlphaBits - 1][alphaIndices[i]][1];
+        const uint8 ip = (((uint32(eps[subset][0][3]) * i0) + (uint32(eps[subset][1][3]) * i1) + 32) >> 6) & 0xFF;
+        pixel |= uint32(ip) << 24;
+      }
+      else {
+        pixel |= 0xFF000000;
+      }
+      
+      // Swap colors if necessary...
+      uint8 *pb = (uint8 *)&pixel;
+      switch(rotMode) {
+        default:
+        case 0:
+          // Do nothing
+          break;
+
+        case 1:
+          swap(pb[0], pb[3]);
+          break;
+
+        case 2:
+          swap(pb[1], pb[3]);
+          break;
+
+        case 3:
+          swap(pb[2], pb[3]);
+          break;
+      }
+    }
+  }
+
+  // Decodes a BC7 image into RGBA8. Blocks are read 16 bytes at a time from inBuf, and each decoded
+  // 4x4 block is written as 64 consecutive bytes, so outBuf is filled in block order.
+  void DecompressImageBC7(const uint8 *inBuf, uint8* outBuf, unsigned int width, unsigned int height) {
+
+    unsigned int blocksDone = 0;
+    uint8 *dst = outBuf;
+
+    // One iteration per 4x4 block, so partial rows and columns still count as a whole block.
+    for(unsigned int y = 0; y < height; y += 4) {
+      for(unsigned int x = 0; x < width; x += 4) {
+
+        uint32 decoded[16];
+        DecompressBC7Block(inBuf + (16 * blocksDone), decoded);
+        ++blocksDone;
+
+        // All sixteen pixels of the block are stored back to back.
+        memcpy(dst, decoded, 16 * sizeof(uint32));
+        dst += 64;
+      }
+    }
+  }
 }
diff --git a/CLTool/src/clunix.cpp b/CLTool/src/clunix.cpp
index de081d1..0aa37b5 100644
--- a/CLTool/src/clunix.cpp
+++ b/CLTool/src/clunix.cpp
@@ -84,8 +84,17 @@ int main(int argc, char **argv) {
 
   CompressedImage *ci = CompressImage(file, settings);
 
+  double PSNR = ComputePSNR(*ci, file);
+  if(PSNR > 0.0) {
+    fprintf(stdout, "PSNR: %.3f\n", PSNR);
+  }
+  else {
+    fprintf(stderr, "Error computing PSNR\n");
+  }
+
   // Cleanup 
   if(NULL != ci)
     delete ci;
+
   return 0;
 }
diff --git a/Core/include/CompressedImage.h b/Core/include/CompressedImage.h
index bfce349..b0246ad 100644
--- a/Core/include/CompressedImage.h
+++ b/Core/include/CompressedImage.h
@@ -30,6 +30,8 @@ class CompressedImage {
 
   CompressedImage( const CompressedImage &other );
   ~CompressedImage();
+
+  bool DecompressImage(unsigned char *outBuf, unsigned int outBufSz) const;
 };
 
 #endif // _COMPRESSED_IMAGE_H_
diff --git a/Core/include/TexComp.h b/Core/include/TexComp.h
index 5b4598f..d8ab436 100644
--- a/Core/include/TexComp.h
+++ b/Core/include/TexComp.h
@@ -25,4 +25,6 @@ typedef void (* CompressionFunc)(
   unsigned int height
 );
 
+extern double ComputePSNR(const CompressedImage &, const ImageFile &);
+
 #endif //_TEX_COMP_H_
diff --git a/Core/src/CompressedImage.cpp b/Core/src/CompressedImage.cpp
index db616cc..3404aeb 100644
--- a/Core/src/CompressedImage.cpp
+++ b/Core/src/CompressedImage.cpp
@@ -2,12 +2,17 @@
 
 #include <string.h>
 #include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+#include "BC7Compressor.h"
 
 CompressedImage::CompressedImage()
   : m_Width(0)
   , m_Height(0)
   , m_Format(ECompressionFormat(-1))
   , m_Data(0)
+  , m_DataSz(0)
 { }
 
 CompressedImage::CompressedImage( const CompressedImage &other )
@@ -15,6 +20,7 @@ CompressedImage::CompressedImage( const CompressedImage &other )
   , m_Height(other.m_Height)
   , m_Format(other.m_Format)
   , m_Data(0)
+  , m_DataSz(0)
 {
   InitData(other.m_Data);
 }
@@ -29,23 +35,24 @@ CompressedImage::CompressedImage(
 , m_Height(height)
 , m_Format(format)
 , m_Data(0)
+, m_DataSz(0)
 {
   InitData(data);
 }
 
 void CompressedImage::InitData(const unsigned char *withData) {
-  unsigned int dataSz = 0;
+  m_DataSz = 0;
   int uncompDataSz = m_Width * m_Height * 4;
 
   switch(m_Format) {
-    case eCompressionFormat_DXT1: dataSz = uncompDataSz / 8; break;
-    case eCompressionFormat_DXT5: dataSz = uncompDataSz / 4; break;
-    case eCompressionFormat_BPTC: dataSz = uncompDataSz / 4; break;
+    case eCompressionFormat_DXT1: m_DataSz = uncompDataSz / 8; break;
+    case eCompressionFormat_DXT5: m_DataSz = uncompDataSz / 4; break;
+    case eCompressionFormat_BPTC: m_DataSz = uncompDataSz / 4; break;
   }
   
-  if(dataSz > 0) {
-    m_Data = new unsigned char[dataSz];
-    memcpy(m_Data, withData, dataSz);
+  if(m_DataSz > 0) {
+    m_Data = new unsigned char[m_DataSz];
+    memcpy(m_Data, withData, m_DataSz);
   }
 }
 
@@ -55,3 +62,34 @@ CompressedImage::~CompressedImage() {
     m_Data = NULL;
   }
 }
+
+bool CompressedImage::DecompressImage(unsigned char *outBuf, unsigned int outBufSz) const {
+  // Decodes the stored data into outBuf (outBufSz bytes). Returns false when outBuf is too small or the format has no decoder here.
+  // First make sure that the output buffer is large enough.
+  unsigned int dataSz = 0;
+  switch(m_Format) {
+    case eCompressionFormat_DXT1: dataSz = m_DataSz * 8; break;
+    case eCompressionFormat_DXT5: dataSz = m_DataSz * 4; break;
+    case eCompressionFormat_BPTC: dataSz = m_DataSz * 4; break;
+  }
+
+  if(dataSz > outBufSz) {
+    fprintf(stderr, "Not enough space to store entire decompressed image! "
+                    "Got %u bytes, but need %u!\n", outBufSz, dataSz);
+    return false;
+  }
+
+  switch(m_Format) {
+  case eCompressionFormat_BPTC: 
+    BC7C::DecompressImageBC7(m_Data, outBuf, m_Width, m_Height);
+    break;
+
+  default:
+    const char *errStr = "Have not implemented decompression method.";
+    fprintf(stderr, "%s\n", errStr);
+    assert(!errStr);
+    return false;
+  }
+
+  return true;
+}
diff --git a/Core/src/TexComp.cpp b/Core/src/TexComp.cpp
index f3476df..43f10d5 100644
--- a/Core/src/TexComp.cpp
+++ b/Core/src/TexComp.cpp
@@ -2,6 +2,7 @@
 #include "TexComp.h"
 #include "ThreadGroup.h"
 
+#include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
@@ -148,4 +149,57 @@ CompressedImage * CompressImage(
   delete [] cmpData;
 
   return outImg;
-} 
+}
+
+template <typename T> // Absolute difference: the smaller operand is subtracted from the larger.
+static inline T sad(const T &a, const T &b) {
+  if(a > b) { return a - b; } else { return b - a; }
+}
+
+double ComputePSNR(const CompressedImage &ci, const ImageFile &file) {
+  // Decompresses ci and returns the PSNR of its first three channels against file's pixels,
+  // or a negative value when decompression fails. The fourth byte of each pixel is ignored.
+  const unsigned int imageSz = 4 * file.GetWidth() * file.GetHeight();
+  unsigned char *unCompData = new unsigned char[imageSz];
+  if(!(ci.DecompressImage(unCompData, imageSz))) {
+    ReportError("Failed to decompress image.");
+    delete [] unCompData; // Do not leak the scratch buffer on the error path.
+    return -1.0f;
+  }
+
+  const unsigned char *rawData = file.GetPixels();
+
+  const double wr = 1.0;
+  const double wg = 1.0;
+  const double wb = 1.0;
+
+  double MSE = 0.0;
+  for(unsigned int i = 0; i < imageSz; i+=4) {
+    const unsigned char *pixelDataRaw = rawData + i;
+    const unsigned char *pixelDataUncomp = unCompData + i;
+
+    double dr = double(sad(pixelDataRaw[0], pixelDataUncomp[0])) * wr;
+    double dg = double(sad(pixelDataRaw[1], pixelDataUncomp[1])) * wg;
+    double db = double(sad(pixelDataRaw[2], pixelDataUncomp[2])) * wb;
+
+    const double pixelMSE =
+      (dr * dr) +
+      (dg * dg) +
+      (db * db);
+    //fprintf(stderr, "Pixel MSE: %f\n", pixelMSE);
+    MSE += pixelMSE;
+  }
+
+  MSE /= (double(file.GetWidth()) * double(file.GetHeight()));
+
+  double MAXI =
+    (255.0 * wr) * (255.0 * wr) +
+    (255.0 * wg) * (255.0 * wg) +
+    (255.0 * wb) * (255.0 * wb);
+
+  double PSNR = 10 * log10(MAXI/MSE);
+
+  // Cleanup: the buffer came from new[], so it must be released with delete [].
+  delete [] unCompData;
+  return PSNR;
+}
diff --git a/IO/src/ImageFile.cpp b/IO/src/ImageFile.cpp
index e1f3167..5fb75da 100644
--- a/IO/src/ImageFile.cpp
+++ b/IO/src/ImageFile.cpp
@@ -196,6 +196,11 @@ bool ImageFile::LoadImage(const unsigned char *rawImageData) {
   const unsigned int aw = ((m_Width + 3) >> 2) << 2;
   const unsigned int ah = ((m_Height + 3) >> 2) << 2;
 
+#ifndef NDEBUG
+  if(aw != m_Width || ah != m_Height)
+    fprintf(stderr, "Warning: Image dimension not multiple of four. Space will be filled with black.\n");
+#endif
+
   int byteIdx = 0;
   for(int i = 0; i < ah; i+=4) {
     for(int j = 0; j < aw; j+= 4) {
diff --git a/IO/src/ImageLoaderPNG.cpp b/IO/src/ImageLoaderPNG.cpp
index c267cfa..cac1850 100644
--- a/IO/src/ImageLoaderPNG.cpp
+++ b/IO/src/ImageLoaderPNG.cpp
@@ -112,7 +112,7 @@ bool ImageLoaderPNG::ReadData() {
 	
 	unsigned int byteIdx = 0;
 	for(int j = 0; j < m_Width; j++) {
-
+ 	  m_RedData[rowOffset + j] = rowData[byteIdx++];
 	}
 
 	assert(byteIdx == bpr);