From 0afa1281a8591aff5f09e57725859d63f338ecdf Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Fri, 13 Sep 2013 13:10:07 -0400
Subject: [PATCH 01/32] Change default wrap mode to wrap

---
 PVRTCEncoder/include/PVRTCCompressor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PVRTCEncoder/include/PVRTCCompressor.h b/PVRTCEncoder/include/PVRTCCompressor.h
index 5e485fb..62ea975 100644
--- a/PVRTCEncoder/include/PVRTCCompressor.h
+++ b/PVRTCEncoder/include/PVRTCCompressor.h
@@ -70,7 +70,7 @@ namespace PVRTCC {
   // decompress the data.
   void Decompress(const DecompressionJob &,
                   bool bTwoBitMode = false,
-                  const EWrapMode wrapMode = eWrapMode_Clamp,
+                  const EWrapMode wrapMode = eWrapMode_Wrap,
                   bool bDebugImages = false);
 
 }  // namespace PVRTCC

From 093576c9e73a399c27b6160e8da9e64a3ab02583 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Fri, 13 Sep 2013 13:10:22 -0400
Subject: [PATCH 02/32] Add initial code for our compressor.

---
 PVRTCEncoder/CMakeLists.txt            |  1 +
 PVRTCEncoder/include/PVRTCCompressor.h |  7 +++
 PVRTCEncoder/src/Compressor.cpp        | 76 ++++++++++++++++++++++++++
 3 files changed, 84 insertions(+)
 create mode 100644 PVRTCEncoder/src/Compressor.cpp

diff --git a/PVRTCEncoder/CMakeLists.txt b/PVRTCEncoder/CMakeLists.txt
index a36c3c9..d2c0bc3 100644
--- a/PVRTCEncoder/CMakeLists.txt
+++ b/PVRTCEncoder/CMakeLists.txt
@@ -64,6 +64,7 @@ SET( HEADERS
 )
 
 SET( SOURCES
+  src/Compressor.cpp
   src/Decompressor.cpp
   src/Pixel.cpp
   src/Block.cpp
diff --git a/PVRTCEncoder/include/PVRTCCompressor.h b/PVRTCEncoder/include/PVRTCCompressor.h
index 62ea975..c9001cd 100644
--- a/PVRTCEncoder/include/PVRTCCompressor.h
+++ b/PVRTCEncoder/include/PVRTCCompressor.h
@@ -73,6 +73,13 @@ namespace PVRTCC {
                   const EWrapMode wrapMode = eWrapMode_Wrap,
                   bool bDebugImages = false);
 
+  // Takes a stream of uncompressed RGBA8 data and compresses it into PVRTC
+  // version one. The width and height must be specified in order to properly
+  // decompress the data.
+  void Compress(const DecompressionJob &,
+                bool bTwoBitMode = false,
+                const EWrapMode wrapMode = eWrapMode_Wrap);
+
 }  // namespace PVRTCC
 
 #endif  // PVRTCENCODER_INCLUDE_PVRTCCOMPRESSOR_H_
diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp
new file mode 100644
index 0000000..b17484f
--- /dev/null
+++ b/PVRTCEncoder/src/Compressor.cpp
@@ -0,0 +1,76 @@
+/* FasTC
+ * Copyright (c) 2013 University of North Carolina at Chapel Hill.
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation for educational, research, and non-profit purposes, without
+ * fee, and without a written agreement is hereby granted, provided that the
+ * above copyright notice, this paragraph, and the following four paragraphs
+ * appear in all copies.
+ *
+ * Permission to incorporate this software into commercial products may be
+ * obtained by contacting the authors or the Office of Technology Development
+ * at the University of North Carolina at Chapel Hill <otd@unc.edu>.
+ *
+ * This software program and documentation are copyrighted by the University of
+ * North Carolina at Chapel Hill. The software program and documentation are
+ * supplied "as is," without any accompanying services from the University of
+ * North Carolina at Chapel Hill or the authors. The University of North
+ * Carolina at Chapel Hill and the authors do not warrant that the operation of
+ * the program will be uninterrupted or error-free. The end-user understands
+ * that the program was developed for research purposes and is advised not to
+ * rely exclusively on the program for any reason.
+ *
+ * IN NO EVENT SHALL THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL OR THE
+ * AUTHORS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF
+ * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF NORTH CAROLINA
+ * AT CHAPEL HILL OR THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL AND THE AUTHORS SPECIFICALLY
+ * DISCLAIM ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY 
+ * STATUTORY WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON
+ * AN "AS IS" BASIS, AND THE UNIVERSITY  OF NORTH CAROLINA AT CHAPEL HILL AND
+ * THE AUTHORS HAVE NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, 
+ * ENHANCEMENTS, OR MODIFICATIONS.
+ *
+ * Please send all BUG REPORTS to <pavel@cs.unc.edu>.
+ *
+ * The authors may be contacted via:
+ *
+ * Pavel Krajcevski
+ * Dept of Computer Science
+ * 201 S Columbia St
+ * Frederick P. Brooks, Jr. Computer Science Bldg
+ * Chapel Hill, NC 27599-3175
+ * USA
+ * 
+ * <http://gamma.cs.unc.edu/FasTC/>
+ */
+
+#include "PVRTCCompressor.h"
+
+#include "Pixel.h"
+#include "Image.h"
+
+namespace PVRTCC {
+
+  void Compress(const DecompressionJob &dcj,
+                bool bTwoBitMode,
+                const EWrapMode wrapMode) {
+    Image img(dcj.height, dcj.width);
+    for(uint32 j = 0; j < dcj.height; j++) {
+      for(uint32 i = 0; i < dcj.width; i++) {
+        const uint32 *pixels = reinterpret_cast<const uint32 *>(dcj.inBuf);
+        img(i, j).UnpackRGBA(pixels[j * dcj.width + i]);
+      }
+    }
+
+    // Downscale it using anisotropic diffusion based scheme in order to preserve
+    // image features, then reupscale and compute deltas. Use deltas to generate
+    // initial A & B images followed by modulation data.
+  }
+
+}  // namespace PVRTCC

From e0ec005ac8fa27795170d843a022d33c8581dbe1 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 18 Sep 2013 14:00:53 -0400
Subject: [PATCH 03/32] Fix link problems

---
 BPTCEncoder/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/BPTCEncoder/CMakeLists.txt b/BPTCEncoder/CMakeLists.txt
index 0bbac8e..9157010 100644
--- a/BPTCEncoder/CMakeLists.txt
+++ b/BPTCEncoder/CMakeLists.txt
@@ -241,3 +241,5 @@ ADD_LIBRARY( BPTCEncoder
 	${HEADERS}
 	${SOURCES}
 )
+
+TARGET_LINK_LIBRARIES( BPTCEncoder FasTCCore )
\ No newline at end of file

From 4135e38f2253a4c60efa9ef2ec4b8b55a1a3e764 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 18 Sep 2013 18:00:59 -0400
Subject: [PATCH 04/32] Set the default wrap behavior to wrap for image
 upscale, too

---
 PVRTCEncoder/src/Image.h        | 2 +-
 PVRTCEncoder/test/ImageTest.cpp | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/PVRTCEncoder/src/Image.h b/PVRTCEncoder/src/Image.h
index eabc045..8550d2a 100644
--- a/PVRTCEncoder/src/Image.h
+++ b/PVRTCEncoder/src/Image.h
@@ -69,7 +69,7 @@ class Image {
   ~Image();
 
   void BilinearUpscale(uint32 xtimes, uint32 ytimes,
-                       EWrapMode wrapMode = eWrapMode_Clamp);
+                       EWrapMode wrapMode = eWrapMode_Wrap);
   void ChangeBitDepth(const uint8 (&depths)[4]);
   void ExpandTo8888();
 
diff --git a/PVRTCEncoder/test/ImageTest.cpp b/PVRTCEncoder/test/ImageTest.cpp
index d48a195..e16d622 100644
--- a/PVRTCEncoder/test/ImageTest.cpp
+++ b/PVRTCEncoder/test/ImageTest.cpp
@@ -131,7 +131,7 @@ TEST(Image, BilinearUpscale) {
   }
 
   PVRTCC::Image img(4, 4, pxs);
-  img.BilinearUpscale(1, 1);
+  img.BilinearUpscale(1, 1, PVRTCC::eWrapMode_Clamp);
   EXPECT_EQ(img.GetWidth(), static_cast<uint32>(8));
   EXPECT_EQ(img.GetHeight(), static_cast<uint32>(8));
 
@@ -171,7 +171,7 @@ TEST(Image, BilinearUpscaleMaintainsPixels) {
   }
 
   PVRTCC::Image img(w, h, pxs);
-  img.BilinearUpscale(2, 2);
+  img.BilinearUpscale(2, 2, PVRTCC::eWrapMode_Clamp);
   EXPECT_EQ(img.GetWidth(), w << 2);
   EXPECT_EQ(img.GetHeight(), h << 2);
 
@@ -199,7 +199,7 @@ TEST(Image, NonuniformBilinearUpscale) {
   }
 
   PVRTCC::Image img(kHeight, kWidth, pxs);
-  img.BilinearUpscale(2, 1);
+  img.BilinearUpscale(2, 1, PVRTCC::eWrapMode_Clamp);
   EXPECT_EQ(img.GetWidth(), static_cast<uint32>(kWidth << 2));
   EXPECT_EQ(img.GetHeight(), static_cast<uint32>(kHeight << 1));
 

From 16cc7f4a93faaffe5f5715616f0dd7239e7cd059 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 18 Sep 2013 18:02:35 -0400
Subject: [PATCH 05/32] Use GetHeight/GetWidth() where appropriate instead of
 m_Width/m_Height

---
 PVRTCEncoder/src/Image.cpp | 81 ++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 39 deletions(-)

diff --git a/PVRTCEncoder/src/Image.cpp b/PVRTCEncoder/src/Image.cpp
index c90a57c..cbca9d3 100644
--- a/PVRTCEncoder/src/Image.cpp
+++ b/PVRTCEncoder/src/Image.cpp
@@ -83,27 +83,27 @@ Image::Image(uint32 height, uint32 width, const Pixel *pixels)
 }
 
 Image::Image(const Image &other)
-  : m_Width(other.m_Width)
-  , m_Height(other.m_Height)
-  , m_Pixels(new Pixel[other.m_Width * other.m_Height])
-  , m_FractionalPixels(new Pixel[other.m_Width * other.m_Height]) {
-  memcpy(m_Pixels, other.m_Pixels, m_Width * m_Height * sizeof(Pixel));
+  : m_Width(other.GetWidth())
+  , m_Height(other.GetHeight())
+  , m_Pixels(new Pixel[other.GetWidth() * other.GetHeight()])
+  , m_FractionalPixels(new Pixel[other.GetWidth() * other.GetHeight()]) {
+  memcpy(m_Pixels, other.m_Pixels, GetWidth() * GetHeight() * sizeof(Pixel));
 }
 
 Image &Image::operator=(const Image &other) {
-  m_Width = other.m_Width;
-  m_Height = other.m_Height;
+  m_Width = other.GetWidth();
+  m_Height = other.GetHeight();
 
   assert(m_Pixels);
   delete m_Pixels;
-  m_Pixels = new Pixel[other.m_Width * other.m_Height];
-  memcpy(m_Pixels, other.m_Pixels, m_Width * m_Height * sizeof(Pixel));
+  m_Pixels = new Pixel[other.GetWidth() * other.GetHeight()];
+  memcpy(m_Pixels, other.m_Pixels, GetWidth() * GetHeight() * sizeof(Pixel));
 
   assert(m_FractionalPixels);
   delete m_FractionalPixels;
-  m_FractionalPixels = new Pixel[other.m_Width * other.m_Height];
+  m_FractionalPixels = new Pixel[other.GetWidth() * other.GetHeight()];
   memcpy(m_FractionalPixels, other.m_FractionalPixels,
-         m_Width * m_Height * sizeof(Pixel));
+         GetWidth() * GetHeight() * sizeof(Pixel));
 
   return *this;
 }
@@ -129,8 +129,8 @@ static bool CompareBitDepths(const uint8 (&depth1)[4],
 
 void Image::BilinearUpscale(uint32 xtimes, uint32 ytimes,
                             EWrapMode wrapMode) {
-  const uint32 newWidth = m_Width << xtimes;
-  const uint32 newHeight = m_Height << ytimes;
+  const uint32 newWidth = GetWidth() << xtimes;
+  const uint32 newHeight = GetHeight() << ytimes;
 
   const uint32 xscale = 1 << xtimes;
   const uint32 xoffset = xscale >> 1;
@@ -214,9 +214,9 @@ void Image::BilinearUpscale(uint32 xtimes, uint32 ytimes,
 }
 
 void Image::ChangeBitDepth(const uint8 (&depths)[4]) {
-  for(uint32 j = 0; j < m_Height; j++) {
-    for(uint32 i = 0; i < m_Width; i++) {
-      uint32 pidx = j * m_Width + i;
+  for(uint32 j = 0; j < GetHeight(); j++) {
+    for(uint32 i = 0; i < GetWidth(); i++) {
+      uint32 pidx = j * GetWidth() + i;
       m_Pixels[pidx].ChangeBitDepth(depths);
     }
   }
@@ -229,10 +229,10 @@ void Image::ExpandTo8888() {
   uint8 fractionDepth[4];
   const uint8 fullDepth[4] = { 8, 8, 8, 8 };
 
-  for(uint32 j = 0; j < m_Height; j++) {
-    for(uint32 i = 0; i < m_Width; i++) {
+  for(uint32 j = 0; j < GetHeight(); j++) {
+    for(uint32 i = 0; i < GetWidth(); i++) {
 
-      uint32 pidx = j * m_Width + i;
+      uint32 pidx = j * GetWidth() + i;
       m_Pixels[pidx].ChangeBitDepth(fullDepth);
       m_FractionalPixels[pidx].GetBitDepth(fractionDepth);
 
@@ -257,15 +257,15 @@ const Pixel &Image::GetPixel(int32 i, int32 j, EWrapMode wrapMode) {
     if(wrapMode == eWrapMode_Clamp) {
       i = 0;
     } else {
-      i += m_Width;
+      i += GetWidth();
     }
   }
 
-  while(i >= static_cast<int32>(m_Width)) {
+  while(i >= static_cast<int32>(GetWidth())) {
     if(wrapMode == eWrapMode_Clamp) {
-      i = m_Width - 1;
+      i = GetWidth() - 1;
     } else {
-      i -= m_Width;
+      i -= GetWidth();
     }
   }
 
@@ -273,46 +273,49 @@ const Pixel &Image::GetPixel(int32 i, int32 j, EWrapMode wrapMode) {
     if(wrapMode == eWrapMode_Clamp) {
       j = 0;
     } else {
-      j += m_Height;
+      j += GetHeight();
     }
   }
 
-  while(j >= static_cast<int32>(m_Height)) {
+  while(j >= static_cast<int32>(GetHeight())) {
     if(wrapMode == eWrapMode_Clamp) {
-      j = m_Height - 1;
+      j = GetHeight() - 1;
     } else {
-      j -= m_Height;
+      j -= GetHeight();
     }
   }
 
-  return m_Pixels[j * m_Width + i];
+  int32 idx = j * GetWidth() + i;
+  assert(idx >= 0);
+  assert(idx < GetWidth() * GetHeight());
+  return idx;
 }
 
 Pixel & Image::operator()(uint32 i, uint32 j) {
-  assert(i < m_Width);
-  assert(j < m_Height);
-  return m_Pixels[j * m_Width + i];
+  assert(i < GetWidth());
+  assert(j < GetHeight());
+  return m_Pixels[j * GetWidth() + i];
 }
 
 const Pixel & Image::operator()(uint32 i, uint32 j) const {
-  assert(i < m_Width);
-  assert(j < m_Height);
-  return m_Pixels[j * m_Width + i];
+  assert(i < GetWidth());
+  assert(j < GetHeight());
+  return m_Pixels[j * GetWidth() + i];
 }
 
 void Image::DebugOutput(const char *filename) const {
-  uint32 *outPixels = new uint32[m_Width * m_Height];
+  uint32 *outPixels = new uint32[GetWidth() * GetHeight()];
   const uint8 fullDepth[4] = { 8, 8, 8, 8 };
-  for(int j = 0; j < m_Height; j++) {
-    for(int i = 0; i < m_Width; i++) {
-      uint32 idx = j * m_Width + i;
+  for(uint32 j = 0; j < GetHeight(); j++) {
+    for(uint32 i = 0; i < GetWidth(); i++) {
+      uint32 idx = j * GetWidth() + i;
       Pixel p = m_Pixels[idx];
       p.ChangeBitDepth(fullDepth);
       outPixels[idx] = p.PackRGBA();
     }
   }
 
-  ::Image img(m_Width, m_Height, outPixels);
+  ::Image img(GetWidth(), GetHeight(), outPixels);
 
   char debugFilename[256];
   snprintf(debugFilename, sizeof(debugFilename), "%s.png", filename);

From e609075d04b117c90194d09a757efe782c73c1a0 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 18 Sep 2013 18:03:08 -0400
Subject: [PATCH 06/32] Split apart the index calculation and pixel lookup
 functions

---
 PVRTCEncoder/src/Image.cpp | 6 +++++-
 PVRTCEncoder/src/Image.h   | 3 ++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/PVRTCEncoder/src/Image.cpp b/PVRTCEncoder/src/Image.cpp
index cbca9d3..1cc997f 100644
--- a/PVRTCEncoder/src/Image.cpp
+++ b/PVRTCEncoder/src/Image.cpp
@@ -252,7 +252,11 @@ void Image::ExpandTo8888() {
   }
 }
 
-const Pixel &Image::GetPixel(int32 i, int32 j, EWrapMode wrapMode) {
+const Pixel &Image::GetPixel(int32 i, int32 j, EWrapMode wrapMode) const {
+  return m_Pixels[GetPixelIndex(i, j, wrapMode)];
+}
+
+const uint32 Image::GetPixelIndex(int32 i, int32 j, EWrapMode wrapMode) const {
   while(i < 0) {
     if(wrapMode == eWrapMode_Clamp) {
       i = 0;
diff --git a/PVRTCEncoder/src/Image.h b/PVRTCEncoder/src/Image.h
index 8550d2a..e7b35e6 100644
--- a/PVRTCEncoder/src/Image.h
+++ b/PVRTCEncoder/src/Image.h
@@ -87,7 +87,8 @@ class Image {
   Pixel *m_Pixels;
   Pixel *m_FractionalPixels;
 
-  const Pixel &GetPixel(int32 i, int32 j, EWrapMode wrapMode = eWrapMode_Clamp);
+  const uint32 GetPixelIndex(int32 i, int32 j, EWrapMode wrapMode = eWrapMode_Clamp) const;
+  const Pixel &GetPixel(int32 i, int32 j, EWrapMode wrapMode = eWrapMode_Clamp) const;
 };
 
 }  // namespace PVRTCC

From 9f4fa671d9abb8d18e438e194759386765bdedcd Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 18 Sep 2013 18:03:44 -0400
Subject: [PATCH 07/32] Add a first pass at content aware downscaling.

---
 PVRTCEncoder/src/Image.cpp | 159 +++++++++++++++++++++++++++++++++++++
 PVRTCEncoder/src/Image.h   |  10 +++
 2 files changed, 169 insertions(+)

diff --git a/PVRTCEncoder/src/Image.cpp b/PVRTCEncoder/src/Image.cpp
index 1cc997f..fdb982b 100644
--- a/PVRTCEncoder/src/Image.cpp
+++ b/PVRTCEncoder/src/Image.cpp
@@ -55,12 +55,18 @@
 #include <cassert>
 #include <cstring>
 #include <cstdio>
+#include <cmath>
 
 #include "Pixel.h"
 
 #include "Core/include/Image.h"
 #include "IO/include/ImageFile.h"
 
+static float ConvertChannelToFloat(uint8 channel, uint8 bitDepth) {
+  float denominator = static_cast<float>((1 << bitDepth) - 1);
+  return static_cast<float>(channel) / denominator;
+}
+
 namespace PVRTCC {
 
 Image::Image(uint32 height, uint32 width)
@@ -213,6 +219,159 @@ void Image::BilinearUpscale(uint32 xtimes, uint32 ytimes,
   m_Height = newHeight;
 }
 
+void Image::ContentAwareDownscale(uint32 xtimes, uint32 ytimes,
+                                  EWrapMode wrapMode, bool bOffsetNewPixels) {
+  const uint32 w = GetWidth();
+  const uint32 h = GetHeight();
+
+  const uint32 newWidth = w >> xtimes;
+  const uint32 newHeight = h >> ytimes;
+
+  Pixel *downscaledPixels = new Pixel[newWidth * newHeight];
+  const uint32 numDownscaledPixels = newWidth * newHeight;
+
+  uint8 bitDepth[4];
+  m_Pixels[0].GetBitDepth(bitDepth);
+
+  for(uint32 i = 0; i < numDownscaledPixels; i++) {
+    downscaledPixels[i].ChangeBitDepth(bitDepth);
+  }
+
+  // Allocate memory
+  float *imgData = new float[19 * w * h];
+  float *I = imgData;
+  float *Ix[5] = {
+    imgData + (w * h),
+    imgData + (2 * w * h),
+    imgData + (3 * w * h),
+    imgData + (4 * w * h),
+    imgData + (18 * w * h),
+  };
+  float *Iy = imgData + (5 * w * h);
+  float *Ixx[4] = {
+    imgData + (6 * w * h),
+    imgData + (7 * w * h),
+    imgData + (8 * w * h),
+    imgData + (9 * w * h)
+  };
+  float *Iyy[4] = {
+    imgData + (10 * w * h),
+    imgData + (11 * w * h),
+    imgData + (12 * w * h),
+    imgData + (13 * w * h)
+  };
+  float *Ixy[4] = {
+    imgData + (14 * w * h),
+    imgData + (15 * w * h),
+    imgData + (16 * w * h),
+    imgData + (17 * w * h)
+  };
+
+  // Then, compute the intensity of the image
+  for(uint32 i = 0; i < w * h; i++) {
+    // First convert the pixel values to floats using
+    // premultiplied alpha...
+    float a = ConvertChannelToFloat(m_Pixels[i].A(), bitDepth[0]);
+    float r = a * ConvertChannelToFloat(m_Pixels[i].R(), bitDepth[1]);
+    float g = a * ConvertChannelToFloat(m_Pixels[i].G(), bitDepth[2]);
+    float b = a * ConvertChannelToFloat(m_Pixels[i].B(), bitDepth[3]);
+
+    I[i] = r * 0.21 + g * 0.71 + b * 0.07;
+  }
+
+  // Use central differences to calculate Ix, Iy, Ixx, Iyy...
+  for(uint32 j = 0; j < h; j++) {
+    for(uint32 i = 0; i < w; i++) {
+      uint32 hm2xidx = GetPixelIndex(i-2, j);
+      uint32 hm1xidx = GetPixelIndex(i-1, j);
+      uint32 hp1xidx = GetPixelIndex(i+1, j);
+      uint32 hp2xidx = GetPixelIndex(i+2, j);
+
+      uint32 hm2yidx = GetPixelIndex(i, j-2);
+      uint32 hm1yidx = GetPixelIndex(i, j-1);
+      uint32 hp1yidx = GetPixelIndex(i, j+1);
+      uint32 hp2yidx = GetPixelIndex(i, j+2);
+
+      uint32 idx = GetPixelIndex(i, j);
+      Ix[4][idx] = (I[hm2xidx] - 8*I[hm1xidx] + 8*I[hp1xidx] - I[hp2xidx]) / 12.0f;
+      Iy[idx] = (I[hm2yidx] - 8*I[hm1yidx] + 8*I[hp1yidx] - I[hp2yidx]) / 12.0f;
+
+      for(uint32 c = 0; c <= 3; c++) {
+        #define CPNT(dx) ConvertChannelToFloat(m_Pixels[dx].Component(c), bitDepth[c])
+        Ix[c][idx] = (CPNT(hm2xidx) - 8*CPNT(hm1xidx) + 8*CPNT(hp1xidx) - CPNT(hp2xidx)) / 12.0f;
+        Ixx[c][idx] = (-CPNT(hm2xidx) + 16*CPNT(hm1xidx) - 30*CPNT(idx) + 16*CPNT(hp1xidx) - CPNT(hp2xidx)) / 12.0f;
+        Iyy[c][idx] = (-CPNT(hm2yidx) + 16*CPNT(hm1yidx) - 30*CPNT(idx) + 16*CPNT(hp1yidx) - CPNT(hp2yidx)) / 12.0f;
+        #undef CPNT
+      }
+    }
+  }
+
+  // Finally, compute Ixy
+  for(uint32 j = 0; j < h; j++) {
+    for(uint32 i = 0; i < w; i++) {
+      uint32 hm2y = GetPixelIndex(i, j-2);
+      uint32 hm1y = GetPixelIndex(i, j-1);
+      uint32 hp1y = GetPixelIndex(i, j+1);
+      uint32 hp2y = GetPixelIndex(i, j+2);
+
+      uint32 idx = GetPixelIndex(i, j);
+      for(uint32 c = 0; c <= 3; c++) {
+        Ixy[c][idx] = (Ix[c][hm2y] - 8*Ix[c][hm1y] + 8*Ix[c][hp1y] - Ix[c][hp2y]) / 12.0f;
+      }
+    }
+  }
+
+  // Now, for each pixel that we take into consideration, use
+  // a smoothing step that is taken from the anisotropic diffusion
+  // equation:
+  // I_t = (I_x^2 I_yy - 2 I_xy I_x I_y + I_y^2 I_xx) / (I_x^2 + I_y^2)
+  for(uint32 j = 0; j < newHeight; j++) {
+    for(uint32 i = 0; i < newWidth; i++) {
+
+      // Map this new pixel back into the original space...
+      uint32 scalex = 1 << xtimes;
+      uint32 scaley = 1 << ytimes;
+
+      uint32 x = scalex * i;
+      uint32 y = scaley * j;
+
+      if(bOffsetNewPixels) {
+        x += scalex >> 1;
+        y += scaley >> 1;
+      }
+
+      uint32 idx = GetPixelIndex(x, y);
+      Pixel current = m_Pixels[idx];
+
+      Pixel result;
+      result.ChangeBitDepth(bitDepth);
+
+      float Ixsq = Ix[4][idx] * Ix[4][idx];
+      float Iysq = Iy[idx] * Iy[idx];
+      float denom = Ixsq + Iysq;
+
+      for(uint32 c = 0; c < 4; c++) {
+        float I0 = ConvertChannelToFloat(current.Component(c), bitDepth[c]);
+        float It = Ixx[c][idx] + Iyy[c][idx];
+        if(fabs(denom) > 1e-6) {
+          It -= (Ixsq*Ixx[c][idx] + 2*Ix[4][idx]*Iy[idx]*Ixy[c][idx] + Iysq*Iyy[c][idx]);
+        }
+        float pxScale = static_cast<float>((1 << bitDepth[c]) - 1);
+        result.Component(c) = static_cast<uint8>(((I0 + 0.25*It) + 0.5) * pxScale);
+      }
+
+      downscaledPixels[j * newHeight + i] = result;
+    }
+  }
+
+  delete m_Pixels;
+  m_Pixels = downscaledPixels;
+  m_Width = newWidth;
+  m_Height = newHeight;
+
+  delete [] imgData;
+}
+
 void Image::ChangeBitDepth(const uint8 (&depths)[4]) {
   for(uint32 j = 0; j < GetHeight(); j++) {
     for(uint32 i = 0; i < GetWidth(); i++) {
diff --git a/PVRTCEncoder/src/Image.h b/PVRTCEncoder/src/Image.h
index e7b35e6..a87f95f 100644
--- a/PVRTCEncoder/src/Image.h
+++ b/PVRTCEncoder/src/Image.h
@@ -70,6 +70,16 @@ class Image {
 
   void BilinearUpscale(uint32 xtimes, uint32 ytimes,
                        EWrapMode wrapMode = eWrapMode_Wrap);
+
+  // Downscales the image by taking an anisotropic diffusion approach
+  // with respect to the gradient of the intensity. In this way, we can
+  // preserve the most important image structures by not blurring across
+  // edge boundaries, which when upscaled will retain the structural
+  // image quality...
+  void ContentAwareDownscale(uint32 xtimes, uint32 ytimes,
+                             EWrapMode wrapMode = eWrapMode_Wrap,
+                             bool bOffsetNewPixels = false);
+
   void ChangeBitDepth(const uint8 (&depths)[4]);
   void ExpandTo8888();
 

From 1d58ea238597c81d7305f49546f5b875915021e0 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Thu, 19 Sep 2013 15:11:27 -0400
Subject: [PATCH 08/32] Add RGBA pixel constructor

---
 PVRTCEncoder/src/Pixel.h        | 11 ++++++++---
 PVRTCEncoder/test/PixelTest.cpp |  6 ++++++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/PVRTCEncoder/src/Pixel.h b/PVRTCEncoder/src/Pixel.h
index 1953e09..f87d79d 100644
--- a/PVRTCEncoder/src/Pixel.h
+++ b/PVRTCEncoder/src/Pixel.h
@@ -63,9 +63,14 @@ class Pixel {
     for(int i = 0; i < 4; i++) m_BitDepth[i] = 8;
   }
 
-  explicit Pixel(const uint8 *bits,
-                 const uint8 channelDepth[4] = static_cast<uint8 *>(0),
-                 uint8 bitOffset = 0) {
+  explicit Pixel(uint32 rgba) {
+    for(int i = 0; i < 4; i++) m_BitDepth[i] = 8;
+    UnpackRGBA(rgba);
+  }
+
+  Pixel(const uint8 *bits,
+        const uint8 channelDepth[4] = static_cast<uint8 *>(0),
+        uint8 bitOffset = 0) {
     FromBits(bits, channelDepth, bitOffset);
   }
 
diff --git a/PVRTCEncoder/test/PixelTest.cpp b/PVRTCEncoder/test/PixelTest.cpp
index b8b4d2a..5a2cc6e 100644
--- a/PVRTCEncoder/test/PixelTest.cpp
+++ b/PVRTCEncoder/test/PixelTest.cpp
@@ -206,6 +206,12 @@ TEST(Pixel, UnpackRGBA) {
   EXPECT_EQ(p.G(), 0xB3);
   EXPECT_EQ(p.R(), 0xFE);
 
+  p = PVRTCC::Pixel(rgba);
+  EXPECT_EQ(p.A(), 0x46);
+  EXPECT_EQ(p.B(), 0x19);
+  EXPECT_EQ(p.G(), 0xB3);
+  EXPECT_EQ(p.R(), 0xFE);
+
   uint8 newBitDepth[4] = { 3, 5, 2, 1 };  // A R G B
   p.ChangeBitDepth(newBitDepth);
 

From 995c237e5ec0e322f99054766dd9a50e0f9c0bfa Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Thu, 19 Sep 2013 17:47:36 -0400
Subject: [PATCH 09/32] Add downscale test and fix a few bugs and formatting.

---
 PVRTCEncoder/src/Image.cpp      | 53 +++++++++++++--------------------
 PVRTCEncoder/test/ImageTest.cpp | 35 +++++++++++++++++++---
 2 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/PVRTCEncoder/src/Image.cpp b/PVRTCEncoder/src/Image.cpp
index fdb982b..9ea301c 100644
--- a/PVRTCEncoder/src/Image.cpp
+++ b/PVRTCEncoder/src/Image.cpp
@@ -282,45 +282,32 @@ void Image::ContentAwareDownscale(uint32 xtimes, uint32 ytimes,
   // Use central differences to calculate Ix, Iy, Ixx, Iyy...
   for(uint32 j = 0; j < h; j++) {
     for(uint32 i = 0; i < w; i++) {
-      uint32 hm2xidx = GetPixelIndex(i-2, j);
-      uint32 hm1xidx = GetPixelIndex(i-1, j);
-      uint32 hp1xidx = GetPixelIndex(i+1, j);
-      uint32 hp2xidx = GetPixelIndex(i+2, j);
+      uint32 xmhidx = GetPixelIndex(i-1, j);
+      uint32 xphidx = GetPixelIndex(i+1, j);
 
-      uint32 hm2yidx = GetPixelIndex(i, j-2);
-      uint32 hm1yidx = GetPixelIndex(i, j-1);
-      uint32 hp1yidx = GetPixelIndex(i, j+1);
-      uint32 hp2yidx = GetPixelIndex(i, j+2);
+      uint32 ymhidx = GetPixelIndex(i, j-1);
+      uint32 yphidx = GetPixelIndex(i, j+1);
 
       uint32 idx = GetPixelIndex(i, j);
-      Ix[4][idx] = (I[hm2xidx] - 8*I[hm1xidx] + 8*I[hp1xidx] - I[hp2xidx]) / 12.0f;
-      Iy[idx] = (I[hm2yidx] - 8*I[hm1yidx] + 8*I[hp1yidx] - I[hp2yidx]) / 12.0f;
+
+      uint32 upidx = GetPixelIndex(i + 1, j + 1);
+      uint32 downidx = GetPixelIndex(i - 1, j - 1);
+
+      Ix[4][idx] = (I[xphidx] - I[xmhidx]) / 2.0f;
+      Iy[idx] = (I[yphidx] - I[ymhidx]) / 2.0f;
 
       for(uint32 c = 0; c <= 3; c++) {
         #define CPNT(dx) ConvertChannelToFloat(m_Pixels[dx].Component(c), bitDepth[c])
-        Ix[c][idx] = (CPNT(hm2xidx) - 8*CPNT(hm1xidx) + 8*CPNT(hp1xidx) - CPNT(hp2xidx)) / 12.0f;
-        Ixx[c][idx] = (-CPNT(hm2xidx) + 16*CPNT(hm1xidx) - 30*CPNT(idx) + 16*CPNT(hp1xidx) - CPNT(hp2xidx)) / 12.0f;
-        Iyy[c][idx] = (-CPNT(hm2yidx) + 16*CPNT(hm1yidx) - 30*CPNT(idx) + 16*CPNT(hp1yidx) - CPNT(hp2yidx)) / 12.0f;
+        Ix[c][idx] = (CPNT(xphidx) - CPNT(xmhidx)) / 2.0f;
+        Ixx[c][idx] = (CPNT(xphidx) - 2.0f*CPNT(idx) + CPNT(xmhidx)) / 2.0f;
+        Iyy[c][idx] = (CPNT(yphidx) - 2.0f*CPNT(idx) + CPNT(ymhidx)) / 2.0f;
+        Ixy[c][idx] = (CPNT(upidx) - CPNT(xphidx) - CPNT(yphidx) + 2.0f*CPNT(idx) -
+                       CPNT(xmhidx) - CPNT(ymhidx) + CPNT(downidx)) / 2.0f;
         #undef CPNT
       }
     }
   }
 
-  // Finally, compute Ixy
-  for(uint32 j = 0; j < h; j++) {
-    for(uint32 i = 0; i < w; i++) {
-      uint32 hm2y = GetPixelIndex(i, j-2);
-      uint32 hm1y = GetPixelIndex(i, j-1);
-      uint32 hp1y = GetPixelIndex(i, j+1);
-      uint32 hp2y = GetPixelIndex(i, j+2);
-
-      uint32 idx = GetPixelIndex(i, j);
-      for(uint32 c = 0; c <= 3; c++) {
-        Ixy[c][idx] = (Ix[c][hm2y] - 8*Ix[c][hm1y] + 8*Ix[c][hp1y] - Ix[c][hp2y]) / 12.0f;
-      }
-    }
-  }
-
   // Now, for each pixel that we take into consideration, use
   // a smoothing step that is taken from the anisotropic diffusion
   // equation:
@@ -354,10 +341,12 @@ void Image::ContentAwareDownscale(uint32 xtimes, uint32 ytimes,
         float I0 = ConvertChannelToFloat(current.Component(c), bitDepth[c]);
         float It = Ixx[c][idx] + Iyy[c][idx];
         if(fabs(denom) > 1e-6) {
-          It -= (Ixsq*Ixx[c][idx] + 2*Ix[4][idx]*Iy[idx]*Ixy[c][idx] + Iysq*Iyy[c][idx]);
+          It -= (Ixsq * Ixx[c][idx] +
+                 2 * Ix[4][idx] * Iy[idx] * Ixy[c][idx] +
+                 Iysq * Iyy[c][idx]) / denom;
         }
-        float pxScale = static_cast<float>((1 << bitDepth[c]) - 1);
-        result.Component(c) = static_cast<uint8>(((I0 + 0.25*It) + 0.5) * pxScale);
+        float scale = static_cast<float>((1 << bitDepth[c]) - 1);
+        result.Component(c) = static_cast<uint8>((I0 + 0.25*It) * scale + 0.5);
       }
 
       downscaledPixels[j * newHeight + i] = result;
@@ -448,7 +437,7 @@ const uint32 Image::GetPixelIndex(int32 i, int32 j, EWrapMode wrapMode) const {
     }
   }
 
-  int32 idx = j * GetWidth() + i;
+  uint32 idx = j * GetWidth() + i;
   assert(idx >= 0);
   assert(idx < GetWidth() * GetHeight());
   return idx;
diff --git a/PVRTCEncoder/test/ImageTest.cpp b/PVRTCEncoder/test/ImageTest.cpp
index e16d622..d6f8d0e 100644
--- a/PVRTCEncoder/test/ImageTest.cpp
+++ b/PVRTCEncoder/test/ImageTest.cpp
@@ -161,8 +161,8 @@ TEST(Image, BilinearUpscaleMaintainsPixels) {
   const uint32 h = 4;
 
   PVRTCC::Pixel pxs[16];
-  for(int i = 0; i < w; i++) {
-    for(int j = 0; j < h; j++) {
+  for(uint32 i = 0; i < w; i++) {
+    for(uint32 j = 0; j < h; j++) {
       pxs[j*w + i].R() = rand() % 256;
       pxs[j*w + i].G() = rand() % 256;
       pxs[j*w + i].B() = rand() % 256;
@@ -191,8 +191,8 @@ TEST(Image, NonuniformBilinearUpscale) {
   const uint32 kHeight = 8;
 
   PVRTCC::Pixel pxs[kWidth * kHeight];
-  for(int i = 0; i < kWidth; i++) {
-    for(int j = 0; j < kHeight; j++) {
+  for(uint32 i = 0; i < kWidth; i++) {
+    for(uint32 j = 0; j < kHeight; j++) {
       pxs[j*kWidth + i].R() = i*4;
       pxs[j*kWidth + i].G() = j*2;
     }
@@ -279,6 +279,33 @@ TEST(Image, BilinearUpscaleWrapped) {
   }
 }
 
+TEST(Image, ContentAwareDownscale) {
+  PVRTCC::Image img(8, 8);
+  for(uint32 j = 0; j < img.GetHeight(); j++) {
+    for(uint32 i = 0; i < img.GetWidth(); i++) {
+      if(j < 4) {
+        img(i, j) = PVRTCC::Pixel( 0xFF000000 );
+      } else {
+        img(i, j) = PVRTCC::Pixel( 0xFF0000FF );
+      }
+    }
+  }
+
+  img.ContentAwareDownscale(1, 1);
+  EXPECT_EQ(img.GetWidth(), static_cast<uint32>(4));
+  EXPECT_EQ(img.GetHeight(), static_cast<uint32>(4));
+
+  for(uint32 j = 0; j < img.GetHeight(); j++) {
+    for(uint32 i = 0; i < img.GetWidth(); i++) {
+      if(j < 2) {
+        EXPECT_EQ(img(i, j).R(), 0);
+      } else {
+        EXPECT_EQ(img(i, j).R(), 255);
+      }
+    }
+  }
+}
+
 TEST(Image, ChangeBitDepth) {
   PVRTCC::Image img(4, 4);
 

From 1093447055d6f05fa4fb60db0c5dcb2042d155f6 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Fri, 20 Sep 2013 12:43:59 -0400
Subject: [PATCH 10/32] Plug in the infrastructure to start passing images to
 be compressed.

---
 CLTool/src/clunix.cpp                  | 22 ++++++++-
 Core/CMakeLists.txt                    |  4 ++
 Core/include/CompressedImage.h         | 31 +++++++-----
 Core/include/TexComp.h                 |  3 +-
 Core/src/CompressedImage.cpp           | 51 +++++++++++++-------
 Core/src/Image.cpp                     | 10 +---
 Core/src/TexComp.cpp                   | 66 ++++++++++++++++++--------
 PVRTCEncoder/include/PVRTCCompressor.h |  2 +-
 PVRTCEncoder/src/Compressor.cpp        | 44 ++++++++++++++---
 9 files changed, 166 insertions(+), 67 deletions(-)

diff --git a/CLTool/src/clunix.cpp b/CLTool/src/clunix.cpp
index aa39d52..a064c1f 100644
--- a/CLTool/src/clunix.cpp
+++ b/CLTool/src/clunix.cpp
@@ -53,6 +53,7 @@
 void PrintUsage() {
   fprintf(stderr, "Usage: tc [OPTIONS] imagefile\n");
   fprintf(stderr, "\n");
+  fprintf(stderr, "\t-f\t\tFormat to use. Either \"BPTC\" or \"PVRTC\". Default: BPTC\n");
   fprintf(stderr, "\t-l\t\tSave an output log.\n");
   fprintf(stderr, "\t-q <quality>\tSet compression quality level. Default: 50\n");
   fprintf(stderr, "\t-n <num>\tCompress the image num times and give the average time and PSNR. Default: 1\n");
@@ -92,7 +93,8 @@ int main(int argc, char **argv) {
   bool bUseSIMD = false;
   bool bSaveLog = false;
   bool bUseAtomics = false;
-  
+  ECompressionFormat format = eCompressionFormat_BPTC;
+
   bool knowArg = false;
   do {
     knowArg = false;
@@ -110,6 +112,23 @@ int main(int argc, char **argv) {
       continue;
     }
 
+    if(strcmp(argv[fileArg], "-f") == 0) {
+      fileArg++;
+
+      if(fileArg == argc) {
+        PrintUsage();
+        exit(1);
+      } else {
+        if(!strcmp(argv[fileArg], "PVRTC")) {
+          format = eCompressionFormat_PVRTC;
+        }
+      }
+
+      fileArg++;
+      knowArg = true;
+      continue;
+    }
+
     if(strcmp(argv[fileArg], "-l") == 0) {
       fileArg++;
       bSaveLog = true;
@@ -195,6 +214,7 @@ int main(int argc, char **argv) {
   }
   
   SCompressionSettings settings;
+  settings.format = format;
   settings.bUseSIMD = bUseSIMD;
   settings.bUseAtomics = bUseAtomics;
   settings.iNumThreads = numThreads;
diff --git a/Core/CMakeLists.txt b/Core/CMakeLists.txt
index 0e33516..21d1f2e 100644
--- a/Core/CMakeLists.txt
+++ b/Core/CMakeLists.txt
@@ -71,6 +71,9 @@ ELSE()
 	SET( LINK_FLAGS -lrt ${LINK_FLAGS} )
 ENDIF()
 
+INCLUDE_DIRECTORIES( ${FasTC_SOURCE_DIR} )
+INCLUDE_DIRECTORIES( ${FasTC_SOURCE_DIR}/PVRTCEncoder/include )
+
 INCLUDE_DIRECTORIES( ${FasTC_SOURCE_DIR}/BPTCEncoder/include )
 INCLUDE_DIRECTORIES( ${FasTC_BINARY_DIR}/BPTCEncoder/include )
 
@@ -149,6 +152,7 @@ ADD_LIBRARY( FasTCCore
 
 TARGET_LINK_LIBRARIES( FasTCCore FasTCIO )
 TARGET_LINK_LIBRARIES( FasTCCore BPTCEncoder )
+TARGET_LINK_LIBRARIES( FasTCCore PVRTCEncoder )
 
 IF( THREAD_API MATCHES "Boost" )
 	TARGET_LINK_LIBRARIES( FasTCCore ${Boost_LIBRARIES} )
diff --git a/Core/include/CompressedImage.h b/Core/include/CompressedImage.h
index 831a50c..8799d11 100644
--- a/Core/include/CompressedImage.h
+++ b/Core/include/CompressedImage.h
@@ -44,10 +44,13 @@
 #ifndef _COMPRESSED_IMAGE_H_
 #define _COMPRESSED_IMAGE_H_
 
+#include "TexCompTypes.h"
+
 enum ECompressionFormat {
   eCompressionFormat_DXT1,
   eCompressionFormat_DXT5,
   eCompressionFormat_BPTC,
+  eCompressionFormat_PVRTC,
 
   kNumCompressionFormats
 };
@@ -55,15 +58,15 @@ enum ECompressionFormat {
 class CompressedImage {
 
  private:
-  unsigned int m_Width;
-  unsigned int m_Height;
+  uint32 m_Width;
+  uint32 m_Height;
 
   ECompressionFormat m_Format;
 
-  unsigned char *m_Data;
-  unsigned int m_DataSz;
+  uint8 *m_Data;
+  uint32 m_DataSz;
 
-  void InitData(const unsigned char *withData);
+  void InitData(const uint8 *withData);
  public:
   CompressedImage();
 
@@ -71,23 +74,29 @@ class CompressedImage {
   // the passed format. The size of the data is expected to conform
   // to the width, height, and format specified.
   CompressedImage(
-    const unsigned int width, 
-    const unsigned int height, 
+    const uint32 width, 
+    const uint32 height, 
     const ECompressionFormat format, 
-    const unsigned char *data
+    const uint8 *data
   );
 
-  unsigned int GetHeight() const { return m_Height; }
-  unsigned int GetWidth() const { return m_Width; }
+  uint32 GetHeight() const { return m_Height; }
+  uint32 GetWidth() const { return m_Width; }
 
   CompressedImage( const CompressedImage &other );
   ~CompressedImage();
 
+  static uint32 GetCompressedSize(uint32 uncompressedSize, ECompressionFormat format);
+  static uint32 GetUncompressedSize(uint32 compressedSize, ECompressionFormat format) {
+    const uint32 cmp = GetCompressedSize(compressedSize, format);  // compressedSize / ratio
+    return (cmp == 0)? 0 : compressedSize * (compressedSize / cmp);  // cmp == 0 for empty data
+  }
+
   // Decompress the compressed image data into outBuf. outBufSz is expected
   // to be the proper size determined by the width, height, and format.
   // !FIXME! We should have a function to explicitly return the in/out buf
   // size for a given compressed image.
-  bool DecompressImage(unsigned char *outBuf, unsigned int outBufSz) const;
+  bool DecompressImage(uint8 *outBuf, uint32 outBufSz) const;
 };
 
 #endif // _COMPRESSED_IMAGE_H_
diff --git a/Core/include/TexComp.h b/Core/include/TexComp.h
index 1ad8c84..873a228 100644
--- a/Core/include/TexComp.h
+++ b/Core/include/TexComp.h
@@ -93,7 +93,8 @@ struct SCompressionSettings {
 
 extern bool CompressImageData(
   const unsigned char *data,
-  const unsigned int dataSz,
+  const unsigned int width,
+  const unsigned int height,
   unsigned char *cmpData,
   const unsigned int cmpDataSz,
   const SCompressionSettings &settings
diff --git a/Core/src/CompressedImage.cpp b/Core/src/CompressedImage.cpp
index 0f4f393..a30d9ba 100644
--- a/Core/src/CompressedImage.cpp
+++ b/Core/src/CompressedImage.cpp
@@ -50,6 +50,7 @@
 
 #include "TexCompTypes.h"
 #include "BC7Compressor.h"
+#include "PVRTCCompressor.h"
 
 CompressedImage::CompressedImage()
   : m_Width(0)
@@ -85,16 +86,7 @@ CompressedImage::CompressedImage(
 }
 
 void CompressedImage::InitData(const unsigned char *withData) {
-  m_DataSz = 0;
-  int uncompDataSz = m_Width * m_Height * 4;
-
-  switch(m_Format) {
-    default: assert(!"Not implemented!"); // Fall through V
-    case eCompressionFormat_DXT1: m_DataSz = uncompDataSz / 8; break;
-    case eCompressionFormat_DXT5: m_DataSz = uncompDataSz / 4; break;
-    case eCompressionFormat_BPTC: m_DataSz = uncompDataSz / 4; break;
-  }
-
+  m_DataSz = GetCompressedSize(m_Width * m_Height * 4, m_Format);
   if(m_DataSz > 0) {
     m_Data = new unsigned char[m_DataSz];
     memcpy(m_Data, withData, m_DataSz);
@@ -111,14 +103,7 @@ CompressedImage::~CompressedImage() {
 bool CompressedImage::DecompressImage(unsigned char *outBuf, unsigned int outBufSz) const {
 
   // First make sure that we have enough data
-  uint32 dataSz = 0;
-  switch(m_Format) {
-    default: assert(!"Not implemented!"); // Fall through V
-    case eCompressionFormat_DXT1: dataSz = m_DataSz * 8; break;
-    case eCompressionFormat_DXT5: dataSz = m_DataSz * 4; break;
-    case eCompressionFormat_BPTC: dataSz = m_DataSz * 4; break;
-  }
-
+  uint32 dataSz = GetUncompressedSize(m_DataSz, m_Format);
   if(dataSz > outBufSz) {
     fprintf(stderr, "Not enough space to store entire decompressed image! "
                     "Got %d bytes, but need %d!\n", outBufSz, dataSz);
@@ -126,6 +111,13 @@ bool CompressedImage::DecompressImage(unsigned char *outBuf, unsigned int outBuf
   }
 
   switch(m_Format) {
+  case eCompressionFormat_PVRTC:
+    {
+      DecompressionJob dj (m_Data, outBuf, m_Width, m_Height);
+      PVRTCC::Decompress(dj);
+    }
+    break;
+
   case eCompressionFormat_BPTC: 
     { 
       DecompressionJob dj (m_Data, outBuf, m_Width, m_Height);
@@ -142,3 +134,26 @@ bool CompressedImage::DecompressImage(unsigned char *outBuf, unsigned int outBuf
 
   return true;
 }
+
+uint32 CompressedImage::GetCompressedSize(uint32 uncompressedSize, ECompressionFormat format) {
+  assert(uncompressedSize % 8 == 0);
+
+  uint32 cmpDataSzNeeded = 0;
+  switch(format) {
+  default:
+    assert(!"Not implemented!");
+    // Fall through V
+  case eCompressionFormat_DXT1:
+  case eCompressionFormat_PVRTC:
+    cmpDataSzNeeded = uncompressedSize / 8;
+    break;
+
+  case eCompressionFormat_DXT5:
+  case eCompressionFormat_BPTC:
+    cmpDataSzNeeded = uncompressedSize / 4;
+    break;
+  }
+
+  return cmpDataSzNeeded;
+}
+
diff --git a/Core/src/Image.cpp b/Core/src/Image.cpp
index 663469b..a918706 100644
--- a/Core/src/Image.cpp
+++ b/Core/src/Image.cpp
@@ -149,16 +149,10 @@ CompressedImage *Image::Compress(const SCompressionSettings &settings) const {
   assert(dataSz > 0);
 
   // Allocate data based on the compression method
-  int cmpDataSz = 0;
-  switch(settings.format) {
-    default: assert(!"Not implemented!"); // Fall Through V
-    case eCompressionFormat_DXT1: cmpDataSz = dataSz / 8; break;
-    case eCompressionFormat_DXT5: cmpDataSz = dataSz / 4; break;
-    case eCompressionFormat_BPTC: cmpDataSz = dataSz / 4; break;
-  }
+  int cmpDataSz = CompressedImage::GetCompressedSize(dataSz, settings.format);
 
   unsigned char *cmpData = new unsigned char[cmpDataSz];
-  CompressImageData(m_PixelData, dataSz, cmpData, cmpDataSz, settings);
+  CompressImageData(m_PixelData, GetWidth(), GetHeight(), cmpData, cmpDataSz, settings);
 
   outImg = new CompressedImage(GetWidth(), GetHeight(), settings.format, cmpData);
   
diff --git a/Core/src/TexComp.cpp b/Core/src/TexComp.cpp
index 3cf0bf5..3cc25ac 100644
--- a/Core/src/TexComp.cpp
+++ b/Core/src/TexComp.cpp
@@ -49,6 +49,7 @@
 #include <cstdio>
 #include <cassert>
 
+#include "PVRTCCompressor.h"
 #include "BC7Compressor.h"
 #include "Thread.h"
 #include "WorkerQueue.h"
@@ -68,6 +69,10 @@ static inline T sad(const T &a, const T &b) {
   return (a > b)? a - b : b - a;
 }
 
+static void CompressPVRTC(const CompressionJob &cj) {
+  PVRTCC::Compress(cj);
+}
+
 SCompressionSettings:: SCompressionSettings()
   : format(eCompressionFormat_BPTC)
   , bUseSIMD(false)
@@ -86,6 +91,12 @@ static  CompressionFuncWithStats ChooseFuncFromSettingsWithStats(const SCompress
     }
     break;
 
+    case eCompressionFormat_PVRTC:
+    {
+      // !FIXME! actually implement one of these methods...
+      return NULL;
+    }
+
     default:
     {
       assert(!"Not implemented!");
@@ -109,6 +120,11 @@ static CompressionFunc ChooseFuncFromSettings(const SCompressionSettings &s) {
     }
     break;
 
+    case eCompressionFormat_PVRTC:
+    {
+      return CompressPVRTC;
+    }
+
     default:
     {
       assert(!"Not implemented!");
@@ -123,8 +139,9 @@ static void ReportError(const char *msg) {
 }
 
 static double CompressImageInSerial(
-  const unsigned char *imgData,
-  const unsigned int imgDataSz,
+  const uint8 *imgData,
+  const uint32 imgWidth,
+  const uint32 imgHeight,
   const SCompressionSettings &settings,
   unsigned char *outBuf
 ) {
@@ -133,14 +150,14 @@ static double CompressImageInSerial(
 
   double cmpTimeTotal = 0.0;
 
+  StopWatch stopWatch = StopWatch();
   for(int i = 0; i < settings.iNumCompressions; i++) {
 
-    StopWatch stopWatch = StopWatch();
     stopWatch.Reset();
     stopWatch.Start();
 
     // !FIXME! We're assuming that we have 4x4 blocks here...
-    CompressionJob cj (imgData, outBuf, imgDataSz / 16, 4);
+    CompressionJob cj (imgData, outBuf, imgWidth, imgHeight);
     if(fStats && settings.pStatManager) {
       (*fStats)(cj, *(settings.pStatManager));
     }
@@ -332,12 +349,15 @@ static double CompressImageWithWorkerQueue(
 
 bool CompressImageData(
   const unsigned char *data, 
-  const unsigned int dataSz,
+  const unsigned int width,
+  const unsigned int height,
   unsigned char *cmpData,
   const unsigned int cmpDataSz,
   const SCompressionSettings &settings
 ) { 
 
+  uint32 dataSz = width * height * 4;
+
   // Make sure that platform supports SSE if they chose this
   // option...
   #ifndef HAS_SSE_41
@@ -352,15 +372,23 @@ bool CompressImageData(
     return false;
   }
 
-  // Allocate data based on the compression method
-  uint32 cmpDataSzNeeded = 0;
-  switch(settings.format) {
-    default: assert(!"Not implemented!"); // Fall through V
-    case eCompressionFormat_DXT1: cmpDataSzNeeded = dataSz / 8; break;
-    case eCompressionFormat_DXT5: cmpDataSzNeeded = dataSz / 4; break;
-    case eCompressionFormat_BPTC: cmpDataSzNeeded = dataSz / 4; break;
+  uint32 numThreads = settings.iNumThreads;
+  if(settings.format == eCompressionFormat_PVRTC &&
+     (settings.iNumThreads > 1 || settings.pStatManager)) {
+    if(settings.iNumThreads > 1) {
+      ReportError("WARNING - PVRTC compressor does not support multithreading.");
+      numThreads = 1;
+    }
+
+    if(settings.pStatManager) {
+      ReportError("WARNING - PVRTC compressor does not support stat collection.");
+    }
   }
 
+  // Allocate data based on the compression method
+  uint32 cmpDataSzNeeded =
+    CompressedImage::GetCompressedSize(dataSz, settings.format);
+
   if(cmpDataSzNeeded == 0) {
     ReportError("Unknown compression format");
     return false;
@@ -375,21 +403,17 @@ bool CompressImageData(
 
     double cmpMSTime = 0.0;
 
-    if(settings.iNumThreads > 1) {
+    if(numThreads > 1) {
       if(settings.bUseAtomics) {
-        //!KLUDGE!
-        unsigned int height = 4;
-        unsigned int width = dataSz / 16;
-
         cmpMSTime = CompressImageWithAtomics(data, width, height, settings, cmpData);
-      }
-      else if(settings.iJobSize > 0)
+      } else if(settings.iJobSize > 0) {
         cmpMSTime = CompressImageWithWorkerQueue(data, dataSz, settings, cmpData);
-      else
+      } else {
         cmpMSTime = CompressImageWithThreads(data, dataSz, settings, cmpData);
+      }
     }
     else {
-      cmpMSTime = CompressImageInSerial(data, dataSz, settings, cmpData);
+      cmpMSTime = CompressImageInSerial(data, width, height, settings, cmpData);
     }
 
     // Report compression time
diff --git a/PVRTCEncoder/include/PVRTCCompressor.h b/PVRTCEncoder/include/PVRTCCompressor.h
index c9001cd..657c71a 100644
--- a/PVRTCEncoder/include/PVRTCCompressor.h
+++ b/PVRTCEncoder/include/PVRTCCompressor.h
@@ -76,7 +76,7 @@ namespace PVRTCC {
   // Takes a stream of uncompressed RGBA8 data and compresses it into PVRTC
   // version one. The width and height must be specified in order to properly
   // decompress the data.
-  void Compress(const DecompressionJob &,
+  void Compress(const CompressionJob &,
                 bool bTwoBitMode = false,
                 const EWrapMode wrapMode = eWrapMode_Wrap);
 
diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp
index b17484f..b26c51e 100644
--- a/PVRTCEncoder/src/Compressor.cpp
+++ b/PVRTCEncoder/src/Compressor.cpp
@@ -57,20 +57,52 @@
 
 namespace PVRTCC {
 
-  void Compress(const DecompressionJob &dcj,
+  void Compress(const CompressionJob &dcj,
                 bool bTwoBitMode,
                 const EWrapMode wrapMode) {
     Image img(dcj.height, dcj.width);
-    for(uint32 j = 0; j < dcj.height; j++) {
-      for(uint32 i = 0; i < dcj.width; i++) {
-        const uint32 *pixels = reinterpret_cast<const uint32 *>(dcj.inBuf);
-        img(i, j).UnpackRGBA(pixels[j * dcj.width + i]);
-      }
+    uint32 nPixels = dcj.height * dcj.width;
+    for(uint32 i = 0; i < nPixels; i++) {
+      // Input is read in block stream order: 16 texels per 4x4 block, blocks row-major.
+      uint32 blockIdx = i / 16;
+      uint32 blockWidth = dcj.width / 4;
+      uint32 blockX = blockIdx % blockWidth;
+      uint32 blockY = blockIdx / blockWidth;
+
+      uint32 x = blockX * 4 + (i % 4);
+      uint32 y = blockY * 4 + (i % 16) / 4;
+
+      const uint32 *pixels = reinterpret_cast<const uint32 *>(dcj.inBuf);
+      img(x, y).UnpackRGBA(pixels[i]);
     }
 
+    Image original = img;
+    img.DebugOutput("Original");
+
     // Downscale it using anisotropic diffusion based scheme in order to preserve
     // image features, then reupscale and compute deltas. Use deltas to generate
     // initial A & B images followed by modulation data.
+    img.ContentAwareDownscale(1, 1, eWrapMode_Wrap, true);
+    img.DebugOutput("DownscaledOnce");
+    img.ContentAwareDownscale(1, 1, eWrapMode_Wrap, false);
+    img.DebugOutput("DownscaledTwice");
+
+    Image downscaled = img;
+
+    // Upscale it again
+    img.BilinearUpscale(2, 2, eWrapMode_Wrap);
+
+    img.DebugOutput("Reconstruction");
+
+    // Compute difference between the original and its reconstruction...
+    Image difference = original;
+    for(uint32 j = 0; j < dcj.height; j++) {
+      for(uint32 i = 0; i < dcj.width; i++) {
+        for(uint32 c = 0; c < 4; c++) {
+          difference(i, j).Component(c) -= img(i, j).Component(c);
+        }
+      }
+    }
   }
 
 }  // namespace PVRTCC

From e9a9988a56762f320c712e7c13ca71efed770596 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Fri, 20 Sep 2013 12:48:56 -0400
Subject: [PATCH 11/32] Fix small bug in downsampling that causes artifacts

---
 PVRTCEncoder/src/Image.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/PVRTCEncoder/src/Image.cpp b/PVRTCEncoder/src/Image.cpp
index 9ea301c..ced773e 100644
--- a/PVRTCEncoder/src/Image.cpp
+++ b/PVRTCEncoder/src/Image.cpp
@@ -52,6 +52,7 @@
 
 #include "Image.h"
 
+#include <algorithm>
 #include <cassert>
 #include <cstring>
 #include <cstdio>
@@ -62,6 +63,11 @@
 #include "Core/include/Image.h"
 #include "IO/include/ImageFile.h"
 
+template <typename T>
+inline T Clamp(const T &v, const T &a, const T &b) {
+  return ::std::min(::std::max(a, v), b);
+}
+
 static float ConvertChannelToFloat(uint8 channel, uint8 bitDepth) {
   float denominator = static_cast<float>((1 << bitDepth) - 1);
   return static_cast<float>(channel) / denominator;
@@ -346,7 +352,7 @@ void Image::ContentAwareDownscale(uint32 xtimes, uint32 ytimes,
                  Iysq * Iyy[c][idx]) / denom;
         }
         float scale = static_cast<float>((1 << bitDepth[c]) - 1);
-        result.Component(c) = static_cast<uint8>((I0 + 0.25*It) * scale + 0.5);
+        result.Component(c) = static_cast<uint8>(Clamp(I0 + 0.25f*It, 0.0f, 1.0f) * scale + 0.5);
       }
 
       downscaledPixels[j * newHeight + i] = result;

From ea68f3a138a1400ebe9a7becb2147e0938e008c6 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Tue, 24 Sep 2013 14:00:14 -0400
Subject: [PATCH 12/32] PVRTC image quality of life improvements.

---
 PVRTCEncoder/src/Image.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/PVRTCEncoder/src/Image.cpp b/PVRTCEncoder/src/Image.cpp
index ced773e..7a7581a 100644
--- a/PVRTCEncoder/src/Image.cpp
+++ b/PVRTCEncoder/src/Image.cpp
@@ -352,7 +352,7 @@ void Image::ContentAwareDownscale(uint32 xtimes, uint32 ytimes,
                  Iysq * Iyy[c][idx]) / denom;
         }
         float scale = static_cast<float>((1 << bitDepth[c]) - 1);
-        result.Component(c) = static_cast<uint8>(Clamp(I0 + 0.25f*It, 0.0f, 1.0f) * scale + 0.5);
+        result.Component(c) = static_cast<uint8>(Clamp(I0 + 0.25f*It, 0.0f, 1.0f) * scale + 0.5f);
       }
 
       downscaledPixels[j * newHeight + i] = result;
@@ -469,6 +469,8 @@ void Image::DebugOutput(const char *filename) const {
       uint32 idx = j * GetWidth() + i;
       Pixel p = m_Pixels[idx];
       p.ChangeBitDepth(fullDepth);
+      p.A() = 255;
+
       outPixels[idx] = p.PackRGBA();
     }
   }

From 0cfca89da2e984530e293a9632c069aaba4ef96a Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Tue, 24 Sep 2013 17:44:11 -0400
Subject: [PATCH 13/32] Add default for PVRTexLib on linux

---
 CMakeModules/FindPVRTexLib.cmake | 41 ++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/CMakeModules/FindPVRTexLib.cmake b/CMakeModules/FindPVRTexLib.cmake
index b837860..f2da742 100644
--- a/CMakeModules/FindPVRTexLib.cmake
+++ b/CMakeModules/FindPVRTexLib.cmake
@@ -56,30 +56,51 @@
 #  PVRTEXLIB_LIBRARIES - The libraries needed to use PVRTexLib
 
 IF (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+  SET( PVRTEXLIB_ROOT "/Applications/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library" )
   find_path(
     PVRTEXLIB_INCLUDE_DIR PVRTexture.h
-    PATHS "/Applications/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library/Include"
+    PATHS ${PVRTEXLIB_ROOT}/Include
   )
 
   find_library(PVRTEXLIB_LIB PVRTexLib
-    PATHS "/Applications/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library/OSX_x86/Static"
-          "/Applications/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library/OSX_x86/Dynamic"
+    PATHS ${PVRTEXLIB_ROOT}/OSX_x86/Static
+          ${PVRTEXLIB_ROOT}/OSX_x86/Dynamic
   )
-ELSEIF(MSVC)
+ELSEIF (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+  SET( PVRTEXLIB_ROOT "/opt/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library" )
   find_path(
     PVRTEXLIB_INCLUDE_DIR PVRTexture.h
-    PATHS "C:/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library/Include"
+    PATHS ${PVRTEXLIB_ROOT}/Include
+  )
+
+  IF(${CMAKE_SIZEOF_VOID_P} EQUAL 8)
+    find_library(PVRTEXLIB_LIB PVRTexLib
+      PATHS ${PVRTEXLIB_ROOT}/Linux_x86_64/Static
+            ${PVRTEXLIB_ROOT}/Linux_x86_64/Dynamic
+    )
+  ELSE()
+    find_library(PVRTEXLIB_LIB PVRTexLib
+      PATHS ${PVRTEXLIB_ROOT}/Linux_x86_32/Static
+            ${PVRTEXLIB_ROOT}/Linux_x86_32/Dynamic
+    )
+  ENDIF()
+
+ELSEIF(MSVC)
+  SET( PVRTEXLIB_ROOT "C:/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library" )
+  find_path(
+    PVRTEXLIB_INCLUDE_DIR PVRTexture.h
+    PATHS ${PVRTEXLIB_ROOT}/Include
   )
 
   IF(${CMAKE_GENERATOR} MATCHES Win64)
     find_library(PVRTEXLIB_LIB PVRTexLib
-      PATHS "C:/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library/Windows_x86_64/Static"
-            "C:/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library/Windows_x86_64/Dynamic"
+      PATHS ${PVRTEXLIB_ROOT}/Windows_x86_64/Static
+            ${PVRTEXLIB_ROOT}/Windows_x86_64/Dynamic
     )
   ELSE()
     find_library(PVRTEXLIB_LIB PVRTexLib
-      PATHS "C:/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library/Windows_x86_32/Static"
-            "C:/Imagination/PowerVR/GraphicsSDK/PVRTexTool/Library/Windows_x86_32/Dynamic"
+      PATHS ${PVRTEXLIB_ROOT}/Windows_x86_32/Static
+            ${PVRTEXLIB_ROOT}/Windows_x86_32/Dynamic
     )
   ENDIF()
 ENDIF()
@@ -93,4 +114,4 @@ include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(PVRTexLib  DEFAULT_MSG
                                   PVRTEXLIB_LIB PVRTEXLIB_INCLUDE_DIR)
 
-mark_as_advanced(PVRTEXLIB_INCLUDE_DIR PVRTEXLIB_LIB )
+mark_as_advanced( PVRTEXLIB_ROOT PVRTEXLIB_INCLUDE_DIR PVRTEXLIB_LIB )

From 163623271774cee920bcf5c8ec3b1a8a707cab5d Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Tue, 24 Sep 2013 17:45:19 -0400
Subject: [PATCH 14/32] When changing from a higher to a lower bit depth
 (i.e. performing a division), round to the nearest value

---
 PVRTCEncoder/src/Pixel.cpp      | 5 ++++-
 PVRTCEncoder/test/PixelTest.cpp | 8 +++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/PVRTCEncoder/src/Pixel.cpp b/PVRTCEncoder/src/Pixel.cpp
index aa3d3a1..9c85c32 100644
--- a/PVRTCEncoder/src/Pixel.cpp
+++ b/PVRTCEncoder/src/Pixel.cpp
@@ -134,7 +134,10 @@ namespace PVRTCC {
         return 0xFF;
       } else {
         uint8 bitsWasted = oldDepth - newDepth;
-        return val >> bitsWasted;
+        uint16 v = static_cast<uint16>(val);
+        v = (v + (1 << (bitsWasted - 1))) >> bitsWasted;
+        v = ::std::min<uint16>(::std::max<uint16>(0, v), (1 << newDepth) - 1);
+        return v;
       }
     }
 
diff --git a/PVRTCEncoder/test/PixelTest.cpp b/PVRTCEncoder/test/PixelTest.cpp
index 5a2cc6e..99a9c67 100644
--- a/PVRTCEncoder/test/PixelTest.cpp
+++ b/PVRTCEncoder/test/PixelTest.cpp
@@ -139,7 +139,7 @@ TEST(Pixel, ChangeChannelBitDepth) {
 
   EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 8), 0x87);
   EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 7), 0x43);
-  EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 6), 0x21);
+  EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 6), 0x22);
   EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 2), 0x2);
   EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 0), 0xFF);
 
@@ -149,7 +149,7 @@ TEST(Pixel, ChangeChannelBitDepth) {
   EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 8), 0x6D);
   EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 6), 0x1B);
   EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 3), 0x03);
-  EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 2), 0x01);
+  EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 2), 0x02);
   EXPECT_EQ(PVRTCC::Pixel::ChangeBitDepth(val, depth, 0), 0xFF);
 }
 
@@ -212,11 +212,13 @@ TEST(Pixel, UnpackRGBA) {
   EXPECT_EQ(p.G(), 0xB3);
   EXPECT_EQ(p.R(), 0xFE);
 
+  p = PVRTCC::Pixel();
   uint8 newBitDepth[4] = { 3, 5, 2, 1 };  // A R G B
   p.ChangeBitDepth(newBitDepth);
+  p.UnpackRGBA(rgba);
 
   EXPECT_EQ(p.A(), 0x2);
   EXPECT_EQ(p.B(), 0x0);
-  EXPECT_EQ(p.G(), 0x2);
+  EXPECT_EQ(p.G(), 0x3);
   EXPECT_EQ(p.R(), 0x1f);
 }

From 75e570ed160f5c85852e9f2ea20ba14c822abe91 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Tue, 24 Sep 2013 17:46:09 -0400
Subject: [PATCH 15/32] Fix some compiler snafus

---
 PVRTCEncoder/src/Decompressor.cpp   | 4 +---
 PVRTCEncoder/test/DecompTestPVR.cpp | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/PVRTCEncoder/src/Decompressor.cpp b/PVRTCEncoder/src/Decompressor.cpp
index 18f7c61..3e906ec 100644
--- a/PVRTCEncoder/src/Decompressor.cpp
+++ b/PVRTCEncoder/src/Decompressor.cpp
@@ -275,9 +275,7 @@ namespace PVRTCC {
 
     if(bDebugImages) {
       Image dbgMod(h, w);
-      uint8 modDepth[4] = { 8, 4, 4, 4 };
-
-      for(int i = 0; i < h*w; i++) {
+      for(uint32 i = 0; i < h*w; i++) {
         float fb = static_cast<float>(modValues[i]);
         uint8 val = static_cast<uint8>((fb / 8.0f) * 15.0f);
 
diff --git a/PVRTCEncoder/test/DecompTestPVR.cpp b/PVRTCEncoder/test/DecompTestPVR.cpp
index e5a64f2..592afb5 100644
--- a/PVRTCEncoder/test/DecompTestPVR.cpp
+++ b/PVRTCEncoder/test/DecompTestPVR.cpp
@@ -73,7 +73,7 @@ class ImageTester {
     pvrtexture::CPVRTexture pvrTex(filename);
 
     const uint8 *data = static_cast<const uint8 *>(pvrTex.getDataPtr());
-    ASSERT_TRUE(data);
+    assert(data);
 
     const pvrtexture::CPVRTextureHeader &hdr = pvrTex.getHeader();
     const uint32 w = hdr.getWidth();

From 340f4f314118e79112416a4d1b47c28650b58d20 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Tue, 24 Sep 2013 19:29:03 -0400
Subject: [PATCH 16/32] Add pixel packing routines

---
 PVRTCEncoder/src/Pixel.cpp      | 38 +++++++++++++++++++++++++++++++++
 PVRTCEncoder/src/Pixel.h        |  7 ++++++
 PVRTCEncoder/test/PixelTest.cpp | 26 ++++++++++++++++++++++
 3 files changed, 71 insertions(+)

diff --git a/PVRTCEncoder/src/Pixel.cpp b/PVRTCEncoder/src/Pixel.cpp
index 9c85c32..c4bfe86 100644
--- a/PVRTCEncoder/src/Pixel.cpp
+++ b/PVRTCEncoder/src/Pixel.cpp
@@ -108,6 +108,44 @@ namespace PVRTCC {
     }
   }
 
+  void Pixel::ToBits(uint8 *bits, uint32 numBytes, uint32 bitOffset) const {
+#ifndef NDEBUG
+    uint32 bitDepthSum = bitOffset;
+    for(int i = 0; i < 4; i++) {
+      bitDepthSum += m_BitDepth[i];
+    }
+    assert(((bitDepthSum + 7) / 8) <= numBytes);  // bytes touched, rounded up, must fit
+#endif
+
+    uint8 byteIdx = 0;
+    while(bitOffset >= 8) {  // skip whole bytes so the remaining offset is in [0, 7]
+      byteIdx++;
+      bitOffset -= 8;
+    }
+
+    uint8 bitIdx = bitOffset;
+    for(int i = 3; i >= 0; i--) {  // component 3 is written first, into the lowest bits
+      uint8 val = Component(i);
+      uint8 depth = m_BitDepth[i];
+
+      if(depth + bitIdx > 8) {  // value straddles a byte boundary: split it in two
+        uint8 nextBitIdx = depth - (8 - bitIdx);
+        uint16 v = static_cast<uint16>(val);
+        bits[byteIdx++] |= (v << bitIdx) & 0xFF;
+        bitIdx = nextBitIdx;
+        bits[byteIdx] = (v >> (depth - bitIdx)) & 0xFF;
+      } else {
+        bits[byteIdx] |= (val << bitIdx) & 0xFF;
+        bitIdx += depth;
+      }
+
+      if(bitIdx == 8) {
+        bitIdx = 0;
+        byteIdx++;
+      }
+    }
+  }
+
   uint8 Pixel::ChangeBitDepth(uint8 val, uint8 oldDepth, uint8 newDepth) {
     assert(newDepth <= 8);
     assert(oldDepth <= 8);
diff --git a/PVRTCEncoder/src/Pixel.h b/PVRTCEncoder/src/Pixel.h
index f87d79d..3fe7e2b 100644
--- a/PVRTCEncoder/src/Pixel.h
+++ b/PVRTCEncoder/src/Pixel.h
@@ -82,6 +82,13 @@ class Pixel {
                 const uint8 channelDepth[4] = static_cast<uint8 *>(0),
                 uint8 bitOffset = 0);
 
+  // This function is the converse of FromBits. It will pack a pixel
+  // into a specified buffer based on the bit depth of the pixel. The
+  // bitOffset determines at which bit to start from. The bits are written
+  // starting from the LSB of bits[0]. numBytes is a sanity check and isn't
+  // used in release mode.
+  void ToBits(uint8 *bits, uint32 numBytes, uint32 bitOffset = 0) const;
+
   // Changes the depth of each pixel. This scales the values to
   // the appropriate bit depth by either truncating the least
   // significant bits when going from larger to smaller bit depth
diff --git a/PVRTCEncoder/test/PixelTest.cpp b/PVRTCEncoder/test/PixelTest.cpp
index 99a9c67..decab5b 100644
--- a/PVRTCEncoder/test/PixelTest.cpp
+++ b/PVRTCEncoder/test/PixelTest.cpp
@@ -133,6 +133,32 @@ TEST(Pixel, FromBitsAndAssociatedConstructor) {
   }
 }
 
+TEST(Pixel, ToBits) {
+  PVRTCC::Pixel p;
+
+  uint8 bitDepth[4] = { 2, 8, 1, 7 };
+  p.ChangeBitDepth(bitDepth);
+
+  p.A() = 0x2;
+  p.R() = 0x56;
+  p.G() = 0;
+  p.B() = 0x4F;
+
+  uint8 bits[3];
+  memset(bits, 0, sizeof(bits));
+  p.ToBits(bits, sizeof(bits));
+
+  EXPECT_EQ(bits[0], 0x4F);
+  EXPECT_EQ(bits[1], 0x56);
+  EXPECT_EQ(bits[2], 0x2);
+
+  memset(bits, 0, sizeof(bits));
+  p.ToBits(bits, 3, 2);
+  EXPECT_EQ(bits[0], 0x3C);
+  EXPECT_EQ(bits[1], 0x59);
+  EXPECT_EQ(bits[2], 0x09);
+}
+
 TEST(Pixel, ChangeChannelBitDepth) {
   uint8 val = 0x43;
   uint8 depth = 7;

From 3de9abc908685219efac262488dfdbb4d155b6e2 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Tue, 24 Sep 2013 19:30:31 -0400
Subject: [PATCH 17/32] Make PVRTC block size a constant accessible from the
 global header

---
 PVRTCEncoder/include/PVRTCCompressor.h | 2 ++
 PVRTCEncoder/src/Decompressor.cpp      | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/PVRTCEncoder/include/PVRTCCompressor.h b/PVRTCEncoder/include/PVRTCCompressor.h
index 657c71a..c0d9255 100644
--- a/PVRTCEncoder/include/PVRTCCompressor.h
+++ b/PVRTCEncoder/include/PVRTCCompressor.h
@@ -80,6 +80,8 @@ namespace PVRTCC {
                 bool bTwoBitMode = false,
                 const EWrapMode wrapMode = eWrapMode_Wrap);
 
+  static const uint32 kBlockSize = sizeof(uint64);
+
 }  // namespace PVRTCC
 
 #endif  // PVRTCENCODER_INCLUDE_PVRTCCOMPRESSOR_H_
diff --git a/PVRTCEncoder/src/Decompressor.cpp b/PVRTCEncoder/src/Decompressor.cpp
index 3e906ec..48d2ecb 100644
--- a/PVRTCEncoder/src/Decompressor.cpp
+++ b/PVRTCEncoder/src/Decompressor.cpp
@@ -308,7 +308,6 @@ namespace PVRTCC {
 
     const uint32 blocksW = bTwoBitMode? (w / 8) : (w / 4);
     const uint32 blocksH = h / 4;
-    const uint32 blockSz = 8;
 
     for(uint32 j = 0; j < blocksH; j++) {
       for(uint32 i = 0; i < blocksW; i++) {
@@ -317,7 +316,7 @@ namespace PVRTCC {
         // linearize them...
         uint32 idx = Interleave(j, i);
 
-        uint32 offset = idx * blockSz;
+        uint32 offset = idx * kBlockSize;
         blocks.push_back( Block(dcj.inBuf + offset) );
       }
     }

From 8f4dcca4d794d44aa536c442ad32ba0c621dd1b3 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Tue, 24 Sep 2013 20:33:48 -0400
Subject: [PATCH 18/32] Add some utility functions for manipulating the block
 data, including packing colors back into the 64-bit word.

---
 PVRTCEncoder/src/Block.cpp      | 108 ++++++++++++++++++++++++++++++++
 PVRTCEncoder/src/Block.h        |  30 +++++++++
 PVRTCEncoder/test/BlockTest.cpp |  91 +++++++++++++++++++++++++++
 3 files changed, 229 insertions(+)

diff --git a/PVRTCEncoder/src/Block.cpp b/PVRTCEncoder/src/Block.cpp
index 8197953..602aab5 100644
--- a/PVRTCEncoder/src/Block.cpp
+++ b/PVRTCEncoder/src/Block.cpp
@@ -83,6 +83,42 @@ namespace PVRTCC {
     return m_ColorA;
   }
 
+  Pixel Block::SetColor(const Pixel &c, bool transparent,
+                        const uint8 (&tbd)[4], const uint8 (&obd)[4]) {  // tbd/obd: transparent/opaque bit depths
+    uint8 cDepth[4];
+    c.GetBitDepth(cDepth);
+
+    Pixel final = c;
+    if(transparent) {
+      final.ChangeBitDepth(tbd);
+
+      // If alpha saturated at 0x7 (effectively opaque), just switch over to opaque...
+      if(final.A() == 0x7) {
+        return SetColor(c, false, tbd, obd);
+      }
+
+    } else {
+      final.A() = 255;
+      final.ChangeBitDepth(obd);
+    }
+
+    return final;
+  }
+
+  void Block::SetColorA(const Pixel &c, bool transparent) {
+    const uint8 transparentBitDepth[4] = { 3, 4, 4, 4 };
+    const uint8 opaqueBitDepth[4] = { 0, 5, 5, 5 };
+    m_ColorA = SetColor(c, transparent, transparentBitDepth, opaqueBitDepth);
+    m_ColorACached = true;
+  }
+
+  void Block::SetColorB(const Pixel &c, bool transparent) {
+    const uint8 transparentBitDepth[4] = { 3, 4, 4, 3 };
+    const uint8 opaqueBitDepth[4] = { 0, 5, 5, 4 };
+    m_ColorB = SetColor(c, transparent, transparentBitDepth, opaqueBitDepth);
+    m_ColorBCached = true;
+  }
+
   Pixel Block::GetColorB() {
     if(m_ColorBCached) {
       return m_ColorB;
@@ -109,6 +145,17 @@ namespace PVRTCC {
     return (m_LongData >> (texelIdx * 2)) & 0x3;
   }
 
+  // Overwrites the two modulation bits of texel texelIdx (0-15) with lerpVal
+  // (0-3); every other bit of the block keeps its current value.
+  void Block::SetLerpValue(uint32 texelIdx, uint8 lerpVal) {
+    assert(texelIdx <= 15);
+    assert(lerpVal < 4);
+
+    const uint32 shift = texelIdx * 2;
+    m_LongData &= ~(static_cast<uint64>(0x3) << shift);
+    m_LongData |= static_cast<uint64>(lerpVal & 0x3) << shift;
+  }
+
   Block::E2BPPSubMode Block::Get2BPPSubMode() const {
     uint8 first = GetLerpValue(0);
     if(!(first & 0x1)) {
@@ -145,4 +192,65 @@ namespace PVRTCC {
 
     return ret;
   }
+
+  // Writes the cached A and B colors into bytes 4-7 of the block, keeping the
+  // current mode bit, and returns the resulting 64-bit word.
+  uint64 Block::Pack() {
+    assert(m_ColorACached);
+    assert(m_ColorBCached);
+
+#ifndef NDEBUG
+    uint8 bitDepthA[4];
+    m_ColorA.GetBitDepth(bitDepthA);
+
+    uint32 sumA = 0;
+    for(int i = 0; i < 4; i++) {
+      sumA += bitDepthA[i];
+    }
+    assert(sumA == 15);
+
+    uint8 bitDepthB[4];
+    m_ColorB.GetBitDepth(bitDepthB);
+
+    uint32 sumB = 0;
+    for(int i = 0; i < 4; i++) {
+      sumB += bitDepthB[i];
+    }
+    assert(sumB == 14);
+#endif
+
+    uint8 aBits[2] = { 0, 0 };
+    uint8 bBits[2] = { 0, 0 };
+
+    m_ColorA.ToBits(aBits, 2);
+    m_ColorB.ToBits(bBits, 2, 1);
+
+    // Store the color bytes first, then apply the opacity flag on top of them.
+    m_ByteData[7] = aBits[1];
+    m_ByteData[6] = aBits[0];
+    if(m_ColorA.A() == 0xFF) {
+      m_ByteData[7] |= 0x80;
+    } else {
+      m_ByteData[7] &= 0x7f;
+    }
+
+    bool modeBit = GetModeBit();  // Saved before byte 4 is overwritten.
+    m_ByteData[5] = bBits[1];
+    m_ByteData[4] = bBits[0];
+    if(m_ColorB.A() == 0xFF) {
+      m_ByteData[5] |= 0x80;
+    } else {
+      m_ByteData[5] &= 0x7f;
+    }
+
+    if(modeBit) {
+      m_ByteData[4] |= 0x1;
+    } else {
+      m_ByteData[4] &= 0xFE;
+    }
+
+    // Modulation data should have already been set...
+    return m_LongData;
+  }
+
 }  // namespace PVRTCC
diff --git a/PVRTCEncoder/src/Block.h b/PVRTCEncoder/src/Block.h
index a948d8a..0173f09 100644
--- a/PVRTCEncoder/src/Block.h
+++ b/PVRTCEncoder/src/Block.h
@@ -61,15 +61,29 @@ namespace PVRTCC {
 
 class Block {
  public:
+  Block(): m_LongData(0) { }
   explicit Block(const uint8 *data);
 
+  // Accessors for the A and B colors of the block.
   Pixel GetColorA();
+  void SetColorA(const Pixel &, bool transparent=false);
+
   Pixel GetColorB();
+  void SetColorB(const Pixel &, bool transparent=false);
 
   bool GetModeBit() const {
     return static_cast<bool>((m_LongData >> 32) & 0x1);
   }
 
+  void SetModeBit(bool flag) {
+    const uint64 bit = 0x100000000L;
+    if(flag) {
+      m_LongData |= bit;
+    } else {
+      m_LongData &= ~bit;
+    }
+  }
+
   // For 2BPP PVRTC, if the mode bit is set, then we use the modulation data
   // as 2 bits for every other texel in the 8x4 block in a checkerboard pattern.
   // The interleaved texel data is decided by averaging nearby texel modulation
@@ -101,6 +115,11 @@ class Block {
   // 12 13 14 15
   uint8 GetLerpValue(uint32 texelIdx) const;
 
+  // Sets the values in the data for this block according to the texel and
+  // modulation value passed. This happens immediately (i.e. a call to Pack()
+  // will reflect these changes).
+  void SetLerpValue(uint32 texelIdx, uint8 lerpVal);
+
   // This returns the modulation value for the texel in the block interpreted as
   // 2BPP. If the modulation bit is not set, then it expects a number from 0-31
   // and does the same operation as GetLerpValue. If the modulation bit is set,
@@ -110,6 +129,12 @@ class Block {
   // global information.
   uint8 Get2BPPLerpValue(uint32 texelIdx) const;
 
+  // Returns the 64-bit word that represents this block. This function packs the
+  // A and B colors based on their bit depths and preserves the corresponding mode
+  // bits. The color modes are determined by whether or not the alpha channel of
+  // each block is fully opaque or not.
+  uint64 Pack();
+
  private:
   union {
     uint8 m_ByteData[8];
@@ -121,6 +146,11 @@ class Block {
 
   bool m_ColorBCached;
   Pixel m_ColorB;
+
+  // tbd -- transparent bit depth
+  // obd -- opaque bit depth
+  static Pixel SetColor(const Pixel &c, bool transparent,
+                        const uint8 (&tbd)[4], const uint8 (&obd)[4]);
 };
 
 }  // namespace PVRTCC
diff --git a/PVRTCEncoder/test/BlockTest.cpp b/PVRTCEncoder/test/BlockTest.cpp
index db04a18..5e47315 100644
--- a/PVRTCEncoder/test/BlockTest.cpp
+++ b/PVRTCEncoder/test/BlockTest.cpp
@@ -236,3 +236,94 @@ TEST(Block, Get2BPPSubMode) {
   b = PVRTCC::Block(data);
   EXPECT_EQ(b.Get2BPPSubMode(), PVRTCC::Block::e2BPPSubMode_Vertical);
 }
+
+TEST(Block, SetColorAandB) {
+  PVRTCC::Block b;
+  PVRTCC::Pixel color;
+  color.A() = 212;
+  color.R() = 200;
+  color.G() = 100;
+  color.B() = -120;
+  b.SetColorA(color);
+  PVRTCC::Pixel cA = b.GetColorA();
+
+  uint8 bitDepth[4] = { 0, 5, 5, 5 };
+  color.ChangeBitDepth(bitDepth);
+
+  EXPECT_FALSE(memcmp(&color, &cA, sizeof(color)));
+  
+  memset(bitDepth, 8, sizeof(bitDepth));
+  color.ChangeBitDepth(bitDepth);
+
+  color.A() = 212;
+  color.R() = 200;
+  color.G() = 100;
+  color.B() = -120;
+  b.SetColorB(color, true);
+  PVRTCC::Pixel cB = b.GetColorB();
+  
+  uint8 tBitDepth[4] = { 0, 5, 5, 4 };
+  color.ChangeBitDepth(tBitDepth);
+
+  EXPECT_FALSE(memcmp(&color, &cB, sizeof(color)));
+
+  memset(bitDepth, 8, sizeof(bitDepth));
+  color.ChangeBitDepth(bitDepth);
+
+  color.A() = 100;
+  color.R() = 200;
+  color.G() = 100;
+  color.B() = -120;
+  b.SetColorB(color, true);
+  PVRTCC::Pixel cC = b.GetColorB();
+  
+  uint8 uBitDepth[4] = { 3, 4, 4, 3 };
+  color.ChangeBitDepth(uBitDepth);
+
+  EXPECT_FALSE(memcmp(&color, &cC, sizeof(color)));
+}
+
+TEST(Block, SetLerpValue) {
+  PVRTCC::Block b;
+
+  for(int i = 0; i < 16; i++) {
+    b.SetLerpValue(i, i%4);
+  }
+
+  for(int i = 0; i < 16; i++) {
+    EXPECT_EQ(b.GetLerpValue(i), i % 4);
+  }
+}
+
+TEST(Block, PackBlock) {
+  PVRTCC::Block b;
+
+  PVRTCC::Pixel cA, cB;
+
+  cA.A() = 0xFF;
+  cA.R() = 0xFF;
+  cA.G() = 0x80;
+  cA.B() = 0x00;
+
+  cB.A() = 0x80;
+  cB.R() = 0x7F;
+  cB.G() = 0x00;
+  cB.B() = 0xFF;
+
+  b.SetColorA(cA);
+  b.SetColorB(cB, true);
+
+  for(int i = 0; i < 16; i++) {
+    b.SetLerpValue(i, i%4);
+  }
+
+  b.SetModeBit(false);
+  EXPECT_EQ(b.Pack(), 0xFE00480EE4E4E4E4UL);
+
+  b.SetModeBit(true);
+  EXPECT_EQ(b.Pack(), 0xFE00480FE4E4E4E4UL);
+
+  b.SetColorB(cB);
+  b.SetModeBit(false);
+  EXPECT_EQ(b.Pack(), 0xFE00C01EE4E4E4E4UL);
+}

From c6d7bdc670b752fe3914a05714598ed82f381b1c Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Tue, 24 Sep 2013 20:35:36 -0400
Subject: [PATCH 19/32] Very preliminary compressor

---
 PVRTCEncoder/src/Compressor.cpp | 221 +++++++++++++++++++++++++++++++-
 1 file changed, 217 insertions(+), 4 deletions(-)

diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp
index b26c51e..c985f61 100644
--- a/PVRTCEncoder/src/Compressor.cpp
+++ b/PVRTCEncoder/src/Compressor.cpp
@@ -52,11 +52,85 @@
 
 #include "PVRTCCompressor.h"
 
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <vector>
+
 #include "Pixel.h"
 #include "Image.h"
+#include "Block.h"
 
 namespace PVRTCC {
 
+  static uint32 Interleave(uint16 inx, uint16 iny) {
+    // Taken from:
+    // http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
+
+    static const uint32 B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF};
+    static const uint32 S[] = {1, 2, 4, 8};
+
+    uint32 x = static_cast<uint32>(inx);
+    uint32 y = static_cast<uint32>(iny);
+
+    x = (x | (x << S[3])) & B[3];
+    x = (x | (x << S[2])) & B[2];
+    x = (x | (x << S[1])) & B[1];
+    x = (x | (x << S[0])) & B[0];
+
+    y = (y | (y << S[3])) & B[3];
+    y = (y | (y << S[2])) & B[2];
+    y = (y | (y << S[1])) & B[1];
+    y = (y | (y << S[0])) & B[0];
+
+    return x | (y << 1);
+  }
+
+  template <typename T>
+  static T Clamp(const T &v, const T &low, const T &high) {
+    return ::std::min(::std::max(low, v), high);
+  }
+
+  template <typename T>
+  static T Lookup(const T *vals,
+                  uint32 x, uint32 y,
+                  uint32 width, uint32 height,
+                  const EWrapMode wrapMode) {
+    while(x >= width) {
+      if(wrapMode == eWrapMode_Wrap) {
+        x -= width;
+      } else {
+        x = width - 1;
+      }
+    }
+
+    while(x < 0) {
+      if(wrapMode == eWrapMode_Wrap) {
+        x += width;
+      } else {
+        x = 0;
+      }
+    }
+
+    while(y >= height) {
+      if(wrapMode == eWrapMode_Wrap) {
+        y -= height;
+      } else {
+        y = height - 1;
+      }
+    }
+
+    while(y < 0) {
+      if(wrapMode == eWrapMode_Wrap) {
+        y += height;
+      } else {
+        y = 0;
+      }
+    }
+
+    return vals[y * width + x];
+  }
+
   void Compress(const CompressionJob &dcj,
                 bool bTwoBitMode,
                 const EWrapMode wrapMode) {
@@ -83,9 +157,7 @@ namespace PVRTCC {
     // image features, then reupscale and compute deltas. Use deltas to generate
     // initial A & B images followed by modulation data.
     img.ContentAwareDownscale(1, 1, eWrapMode_Wrap, true);
-    img.DebugOutput("DownscaledOnce");
     img.ContentAwareDownscale(1, 1, eWrapMode_Wrap, false);
-    img.DebugOutput("DownscaledTwice");
 
     Image downscaled = img;
 
@@ -95,14 +167,155 @@ namespace PVRTCC {
     img.DebugOutput("Reconstruction");
 
     // Compute difference...
-    Image difference = img;
+    int16 difference[dcj.height * dcj.width * 4];
     for(uint32 j = 0; j < dcj.height; j++) {
       for(uint32 i = 0; i < dcj.width; i++) {
         for(uint32 c = 0; c < 4; c++) {
-          difference(i, j).Component(c) -= img(i, j).Component(c);
+          int16 o = original(i, j).Component(c);
+          int16 n = img(i, j).Component(c);
+          difference[j*dcj.width*4 + i*4 + c] = o - n;
         }
       }
     }
+
+    // Go over the 7x7 texel blocks and extract bounding box diagonals for each
+    // block. We should be able to choose which diagonal we want...
+    const uint32 kKernelSz = 7;
+    int16 maxDiff[dcj.height * dcj.width / 4];
+    int16 minDiff[dcj.height * dcj.width / 4];
+    for(uint32 j = 2; j < dcj.height; j += 4) {
+      for(uint32 i = 2; i < dcj.width; i += 4) {
+        const uint32 startX = i - (kKernelSz / 2);
+        const uint32 startY = j - (kKernelSz / 2);
+        for(uint32 c = 0; c < 4; c++) {
+          int32 pos = 0;
+          int32 neg = 0;
+          for(uint32 y = startY; y < startY + kKernelSz; y++) {
+            for(uint32 x = startX; x < startX + kKernelSz; x++) {
+              int16 val = Lookup(difference, x*4 + c, y, dcj.width*4, dcj.height, wrapMode);
+              if(val > 0) {
+                pos += val;
+              } else {
+                neg += val;
+              }
+            }
+          }
+
+          uint32 blockIdx = ((j-2)/4) * dcj.width + (i-2) + c;
+          assert(blockIdx < (dcj.width * dcj.height) / 4);
+          if(pos > -neg) {
+            maxDiff[blockIdx] = pos;
+            minDiff[blockIdx] = 0;
+          } else {
+            maxDiff[blockIdx] = 0;
+            minDiff[blockIdx] = neg;       
+          }
+        }
+      }
+    }
+
+    // Add maxDiff to image to get high signal, and lowdiff to image to
+    // get low signal...
+    Image imgA = downscaled;
+    Image imgB = downscaled;
+
+    for(uint32 j = 0; j < dcj.height / 4; j++) {
+      for(uint32 i = 0; i < dcj.width / 4; i++) {
+        for(uint32 c = 0; c < 4; c++) {
+          uint8 &a = imgA(i, j).Component(c);
+          a = Clamp<int16>(a + maxDiff[j*dcj.width/4 + i*4 + c], 0, 255);
+
+          uint8 &b = imgB(i, j).Component(c);
+          b = Clamp<int16>(b + minDiff[j*dcj.width/4 + i*4 + c], 0, 255);
+        }
+      }
+    }
+
+    imgA.DebugOutput("ImageA");
+    imgB.DebugOutput("ImageB");
+
+    // Determine modulation values...
+    Image upA = imgA;
+    Image upB = imgB;
+
+    upA.BilinearUpscale(2, 2, wrapMode);
+    upB.BilinearUpscale(2, 2, wrapMode);
+
+    assert(upA.GetHeight() == dcj.height && upA.GetWidth() == dcj.width);
+    assert(upB.GetHeight() == dcj.height && upB.GetWidth() == dcj.width);
+
+    upA.DebugOutput("UpscaledA");
+    upB.DebugOutput("UpscaledB");
+
+    // Choose the most appropriate modulation values for the two images...
+    std::vector<uint8> modValues;
+    modValues.reserve(dcj.width * dcj.height);
+    for(uint32 j = 0; j < dcj.height; j++) {
+      for(uint32 i = 0; i < dcj.width; i++) {
+        uint8 &mv = modValues[j * dcj.width + i];
+
+        const Pixel pa = upA(i, j);
+        const Pixel pb = upB(i, j);
+        const Pixel po = original(i, j);
+        
+        // !FIXME! there are two modulation modes... we're only using one.
+        uint8 modSteps[4] = { 0, 3, 5, 8 };
+        uint8 bestMod = 0;
+        uint32 bestError = 0xFFFFFFFF;
+        for(uint32 s = 0; s < 4; s++) {
+          uint32 error = 0;
+          for(uint32 c = 0; c < 4; c++) {
+            uint16 va = static_cast<uint16>(pa.Component(c));
+            uint16 vb = static_cast<uint16>(pb.Component(c));
+            uint16 vo = static_cast<uint16>(po.Component(c));
+
+            uint16 lerpVal = modSteps[s];
+            uint16 res = (va * (8 - lerpVal) + vb * lerpVal) / 8;
+            uint16 e = (res > vo)? res - vo : vo - res;
+            error += e * e;
+          }
+
+          if(error < bestError) {
+            bestError = error;
+            bestMod = modSteps[s];
+          }
+        }
+
+        mv = bestMod;
+      }
+    }
+
+    // Pack everything into a PVRTC blocks.
+    const uint32 blocksW = dcj.width / 4;
+    const uint32 blocksH = dcj.height / 4;
+    std::vector<uint64> blocks;
+    for(uint32 j = 0; j < blocksH; j++) {
+      for(uint32 i = 0; i < blocksW; i++) {
+        Block b;
+        b.SetColorA(imgA(i, j));
+        b.SetColorB(imgB(i, j));
+        for(uint32 t = 0; t < 16; t++) {
+          uint32 x = i + (t%4);
+          uint32 y = j + (t/4);
+          b.SetLerpValue(t, modValues[y*dcj.width + x]);
+        }
+        blocks.push_back(b.Pack());
+      }
+    }
+
+    // Spit out the blocks...
+    for(uint32 j = 0; j < blocksH; j++) {
+      for(uint32 i = 0; i < blocksW; i++) {
+
+        // The blocks are initially arranged in morton order. Let's
+        // linearize them...
+        uint32 idx = Interleave(j, i);
+
+        uint32 offset = idx * PVRTCC::kBlockSize;
+        uint64 *outPtr = reinterpret_cast<uint64 *>(dcj.outBuf + offset);
+        *outPtr = blocks[j * blocksW + i];
+      }
+    }
   }
 
 }  // namespace PVRTCC

From 79d2ad79ac0b41368d7f8c4404d86b757307b58a Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 25 Sep 2013 14:30:48 -0400
Subject: [PATCH 20/32] Fix some snafus on windows

---
 PVRTCEncoder/src/Image.cpp          | 9 +++++++--
 PVRTCEncoder/test/DecompTestPVR.cpp | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/PVRTCEncoder/src/Image.cpp b/PVRTCEncoder/src/Image.cpp
index c90a57c..f928bf3 100644
--- a/PVRTCEncoder/src/Image.cpp
+++ b/PVRTCEncoder/src/Image.cpp
@@ -56,6 +56,11 @@
 #include <cstring>
 #include <cstdio>
 
+#if _MSC_VER
+#  define _CRT_SECURE_NO_WARNINGS
+#  define snprintf _snprintf
+#endif
+
 #include "Pixel.h"
 
 #include "Core/include/Image.h"
@@ -303,8 +308,8 @@ const Pixel & Image::operator()(uint32 i, uint32 j) const {
 void Image::DebugOutput(const char *filename) const {
   uint32 *outPixels = new uint32[m_Width * m_Height];
   const uint8 fullDepth[4] = { 8, 8, 8, 8 };
-  for(int j = 0; j < m_Height; j++) {
-    for(int i = 0; i < m_Width; i++) {
+  for(uint32 j = 0; j < m_Height; j++) {
+    for(uint32 i = 0; i < m_Width; i++) {
       uint32 idx = j * m_Width + i;
       Pixel p = m_Pixels[idx];
       p.ChangeBitDepth(fullDepth);
diff --git a/PVRTCEncoder/test/DecompTestPVR.cpp b/PVRTCEncoder/test/DecompTestPVR.cpp
index e5a64f2..2d29aa1 100644
--- a/PVRTCEncoder/test/DecompTestPVR.cpp
+++ b/PVRTCEncoder/test/DecompTestPVR.cpp
@@ -73,7 +73,7 @@ class ImageTester {
     pvrtexture::CPVRTexture pvrTex(filename);
 
     const uint8 *data = static_cast<const uint8 *>(pvrTex.getDataPtr());
-    ASSERT_TRUE(data);
+    assert(data);
 
     const pvrtexture::CPVRTextureHeader &hdr = pvrTex.getHeader();
     const uint32 w = hdr.getWidth();
@@ -96,7 +96,7 @@ class ImageTester {
 
     uint32 *libPixels = static_cast<uint32 *>(pvrTex.getDataPtr());
 
-    for(int i = 0; i < w*h; i++) {
+    for(uint32 i = 0; i < w*h; i++) {
       EXPECT_EQ(PixelPrinter(libPixels[i]), PixelPrinter(outPixels[i]));
     }
 

From f19b324d28d5310506cab29d8525f303e2ba0037 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 25 Sep 2013 14:42:35 -0400
Subject: [PATCH 21/32] Make sure to use the proper directory when running
 RUN_TESTS from MSVC. Note, this doesn't work in Release.

---
 PVRTCEncoder/test/CMakeLists.txt | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/PVRTCEncoder/test/CMakeLists.txt b/PVRTCEncoder/test/CMakeLists.txt
index 5dd45b4..af7cf7c 100644
--- a/PVRTCEncoder/test/CMakeLists.txt
+++ b/PVRTCEncoder/test/CMakeLists.txt
@@ -101,8 +101,16 @@ IF(PVRTEXLIB_FOUND)
   TARGET_LINK_LIBRARIES(${TEST_NAME} FasTCIO)
   TARGET_LINK_LIBRARIES(${TEST_NAME} FasTCCore)
 
-  ADD_TEST(${TEST_NAME}
-    ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}
-    ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}
-  )
+  IF(MSVC)
+    ADD_TEST(${TEST_NAME}
+      ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}
+      ${CMAKE_CURRENT_BINARY_DIR}/Debug/${TEST_NAME}
+    )
+  ELSE()
+    ADD_TEST(${TEST_NAME}
+      ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}
+      ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}
+    )
+  ENDIF()
+
 ENDIF(PVRTEXLIB_FOUND)

From a58789904855f40af5a38a0185a12747ff98b82d Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 25 Sep 2013 15:06:45 -0400
Subject: [PATCH 22/32] Fix some compiler errors with MSVC (and honestly, other
 compilers should catch them, too.)

---
 PVRTCEncoder/src/Compressor.cpp | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp
index c985f61..937c239 100644
--- a/PVRTCEncoder/src/Compressor.cpp
+++ b/PVRTCEncoder/src/Compressor.cpp
@@ -92,7 +92,7 @@ namespace PVRTCC {
   }
 
   template <typename T>
-  static T Lookup(const T *vals,
+  static T Lookup(const ::std::vector<T> &vals,
                   uint32 x, uint32 y,
                   uint32 width, uint32 height,
                   const EWrapMode wrapMode) {
@@ -167,7 +167,8 @@ namespace PVRTCC {
     img.DebugOutput("Reconstruction");
 
     // Compute difference...
-    int16 difference[dcj.height * dcj.width * 4];
+    ::std::vector<int16> difference;
+    difference.reserve(dcj.height * dcj.width * 4);
     for(uint32 j = 0; j < dcj.height; j++) {
       for(uint32 i = 0; i < dcj.width; i++) {
         for(uint32 c = 0; c < 4; c++) {
@@ -181,8 +182,13 @@ namespace PVRTCC {
     // Go over the 7x7 texel blocks and extract bounding box diagonals for each
     // block. We should be able to choose which diagonal we want...
     const uint32 kKernelSz = 7;
-    int16 maxDiff[dcj.height * dcj.width / 4];
-    int16 minDiff[dcj.height * dcj.width / 4];
+    ::std::vector<int16> maxDiff;
+    ::std::vector<int16> minDiff;
+
+    const uint32 kNumBlockChannels = dcj.height * dcj.width / 4;
+    maxDiff.reserve(kNumBlockChannels);
+    minDiff.reserve(kNumBlockChannels);
+
     for(uint32 j = 2; j < dcj.height; j += 4) {
       for(uint32 i = 2; i < dcj.width; i += 4) {
         const uint32 startX = i - (kKernelSz / 2);
@@ -192,7 +198,8 @@ namespace PVRTCC {
           int32 neg = 0;
           for(uint32 y = startY; y < startY + kKernelSz; y++) {
             for(uint32 x = startX; x < startX + kKernelSz; x++) {
-              int16 val = Lookup(difference, x*4 + c, y, dcj.width*4, dcj.height, wrapMode);
+              int16 val = Lookup(difference, x*4 + c, y,
+                                 dcj.width*4, dcj.height, wrapMode);
               if(val > 0) {
                 pos += val;
               } else {
@@ -202,7 +209,7 @@ namespace PVRTCC {
           }
 
           uint32 blockIdx = ((j-2)/4) * dcj.width + (i-2) + c;
-          assert(blockIdx < (dcj.width * dcj.height) / 4);
+          assert(blockIdx < kNumBlockChannels);
           if(pos > -neg) {
             maxDiff[blockIdx] = pos;
             minDiff[blockIdx] = 0;
@@ -222,11 +229,12 @@ namespace PVRTCC {
     for(uint32 j = 0; j < dcj.height / 4; j++) {
       for(uint32 i = 0; i < dcj.width / 4; i++) {
         for(uint32 c = 0; c < 4; c++) {
+          const uint32 cIdx = j*dcj.width/4 + i*4 + c;
           uint8 &a = imgA(i, j).Component(c);
-          a = Clamp<int16>(a + maxDiff[j*dcj.width/4 + i*4 + c], 0, 255);
+          a = static_cast<uint8>(Clamp<int16>(a + maxDiff[cIdx], 0, 255));
 
           uint8 &b = imgB(i, j).Component(c);
-          b = Clamp<int16>(b + minDiff[j*dcj.width/4 + i*4 + c], 0, 255);
+          b = static_cast<uint8>(Clamp<int16>(b + minDiff[cIdx], 0, 255));
         }
       }
     }

From 986616daf695a399e56145ee7016d051428a0fbd Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 25 Sep 2013 16:36:34 -0400
Subject: [PATCH 23/32] Touch up windows command line tool to support PVRTC
 compression

---
 CLTool/src/clunix.cpp  |  8 +++++++-
 CLTool/src/clwin32.cpp | 30 ++++++++++++++++++++++++++++--
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/CLTool/src/clunix.cpp b/CLTool/src/clunix.cpp
index a064c1f..849a3de 100644
--- a/CLTool/src/clunix.cpp
+++ b/CLTool/src/clunix.cpp
@@ -244,7 +244,13 @@ int main(int argc, char **argv) {
   }
 
   Image cImg (*ci);
-  ImageFile cImgFile (strcat(basename, "-bc7.png"), eFileFormat_PNG, cImg);
+  if(format == eCompressionFormat_BPTC) {
+    strcat(basename, "-bc7.png");
+  } else if(format == eCompressionFormat_PVRTC) {
+    strcat(basename, "-pvrtc.png");
+  }
+
+  ImageFile cImgFile (basename, eFileFormat_PNG, cImg);
   cImgFile.Write();
 
   // Cleanup 
diff --git a/CLTool/src/clwin32.cpp b/CLTool/src/clwin32.cpp
index d746a22..20dfc29 100644
--- a/CLTool/src/clwin32.cpp
+++ b/CLTool/src/clwin32.cpp
@@ -54,6 +54,7 @@
 void PrintUsage() {
   fprintf(stderr, "Usage: tc [OPTIONS] imagefile\n");
   fprintf(stderr, "\n");
+  fprintf(stderr, "\t-f\t\tFormat to use. Either \"BPTC\" or \"PVRTC\". Default: BPTC\n");
   fprintf(stderr, "\t-l\t\tSave an output log.\n");
   fprintf(stderr, "\t-q <quality>\tSet compression quality level. Default: 50\n");
   fprintf(stderr, "\t-n <num>\tCompress the image num times and give the average time and PSNR. Default: 1\n");
@@ -93,11 +94,29 @@ int _tmain(int argc, _TCHAR* argv[])
   bool bUseSIMD = false;
   bool bSaveLog = false;
   bool bUseAtomics = false;
+  ECompressionFormat format = eCompressionFormat_BPTC;
   
   bool knowArg = false;
   do {
     knowArg = false;
 
+    if(strcmp(argv[fileArg], "-f") == 0) {
+      fileArg++;
+
+      if(fileArg == argc) {
+        PrintUsage();
+        exit(1);
+      } else {
+        if(!strcmp(argv[fileArg], "PVRTC")) {
+          format = eCompressionFormat_PVRTC;
+        }
+      }
+
+      fileArg++;
+      knowArg = true;
+      continue;
+    }
+
     if(strcmp(argv[fileArg], "-n") == 0) {
       fileArg++;
 
@@ -201,6 +220,7 @@ int _tmain(int argc, _TCHAR* argv[])
   }
   
   SCompressionSettings settings;
+  settings.format = format;
   settings.bUseSIMD = bUseSIMD;
   settings.bUseAtomics = bUseAtomics;
   settings.iNumThreads = numThreads;
@@ -226,10 +246,16 @@ int _tmain(int argc, _TCHAR* argv[])
   if(bSaveLog) {
     strcat_s(basename, ".log");
     statManager->ToFile(basename);
-  basename[strlen(basename) - 4] = '\0';
+    basename[strlen(basename) - 4] = '\0';
   }
-  strcat_s(basename, "-bc7.png");
+
   Image cImg (*ci);
+  if(format == eCompressionFormat_BPTC) {
+    strcat_s(basename, "-bc7.png");
+  } else if(format == eCompressionFormat_PVRTC) {
+    strcat_s(basename, "-pvrtc.png");
+  }
+
   ImageFile cImgFile (basename, eFileFormat_PNG, cImg);
   cImgFile.Write();
 

From 088481afe434b3b682bd8f376a489765758997af Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 25 Sep 2013 16:37:17 -0400
Subject: [PATCH 24/32] Meant resize instead of reserve.

---
 PVRTCEncoder/src/Compressor.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp
index 937c239..fd7298e 100644
--- a/PVRTCEncoder/src/Compressor.cpp
+++ b/PVRTCEncoder/src/Compressor.cpp
@@ -168,7 +168,7 @@ namespace PVRTCC {
 
     // Compute difference...
     ::std::vector<int16> difference;
-    difference.reserve(dcj.height * dcj.width * 4);
+    difference.resize(dcj.height * dcj.width * 4);
     for(uint32 j = 0; j < dcj.height; j++) {
       for(uint32 i = 0; i < dcj.width; i++) {
         for(uint32 c = 0; c < 4; c++) {
@@ -186,8 +186,8 @@ namespace PVRTCC {
     ::std::vector<int16> minDiff;
 
     const uint32 kNumBlockChannels = dcj.height * dcj.width / 4;
-    maxDiff.reserve(kNumBlockChannels);
-    minDiff.reserve(kNumBlockChannels);
+    maxDiff.resize(kNumBlockChannels);
+    minDiff.resize(kNumBlockChannels);
 
     for(uint32 j = 2; j < dcj.height; j += 4) {
       for(uint32 i = 2; i < dcj.width; i += 4) {
@@ -256,8 +256,8 @@ namespace PVRTCC {
     upB.DebugOutput("UpscaledB");
 
     // Choose the most appropriate modulation values for the two images...
-    std::vector<uint8> modValues;
-    modValues.reserve(dcj.width * dcj.height);
+    ::std::vector<uint8> modValues;
+    modValues.resize(dcj.width * dcj.height);
     for(uint32 j = 0; j < dcj.height; j++) {
       for(uint32 i = 0; i < dcj.width; i++) {
         uint8 &mv = modValues[j * dcj.width + i];

From 33a1dab0a9310a1e51abb1c49c2075c43300b13e Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 25 Sep 2013 16:37:34 -0400
Subject: [PATCH 25/32] The modulation value is the index.

---
 PVRTCEncoder/src/Compressor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp
index fd7298e..850f5d0 100644
--- a/PVRTCEncoder/src/Compressor.cpp
+++ b/PVRTCEncoder/src/Compressor.cpp
@@ -285,7 +285,7 @@ namespace PVRTCC {
 
           if(error < bestError) {
             bestError = error;
-            bestMod = modSteps[s];
+            bestMod = s;
           }
         }
 

From 2c8254d6c3c2f5a601e880ce436ae76df5d153ec Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 25 Sep 2013 16:38:10 -0400
Subject: [PATCH 26/32] Small optimization: reserve vector data before we
 populate blocks vector.

---
 PVRTCEncoder/src/Compressor.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp
index 850f5d0..2b167a6 100644
--- a/PVRTCEncoder/src/Compressor.cpp
+++ b/PVRTCEncoder/src/Compressor.cpp
@@ -296,7 +296,9 @@ namespace PVRTCC {
     // Pack everything into a PVRTC blocks.
     const uint32 blocksW = dcj.width / 4;
     const uint32 blocksH = dcj.height / 4;
+
     std::vector<uint64> blocks;
+    blocks.reserve(blocksW * blocksH);
     for(uint32 j = 0; j < blocksH; j++) {
       for(uint32 i = 0; i < blocksW; i++) {
         Block b;

From ba36ca34fdf5d8d6c1ee7e04b92dcdf4b72869ce Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 25 Sep 2013 20:07:10 -0400
Subject: [PATCH 27/32] Make sure to set the block stream order flag for sane
 debugging. This bug really needs to get fixed.

---
 CLTool/src/clunix.cpp  | 1 +
 CLTool/src/clwin32.cpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/CLTool/src/clunix.cpp b/CLTool/src/clunix.cpp
index 849a3de..cccc264 100644
--- a/CLTool/src/clunix.cpp
+++ b/CLTool/src/clunix.cpp
@@ -247,6 +247,7 @@ int main(int argc, char **argv) {
   if(format == eCompressionFormat_BPTC) {
     strcat(basename, "-bc7.png");
   } else if(format == eCompressionFormat_PVRTC) {
+    cImg.SetBlockStreamOrder(false);
     strcat(basename, "-pvrtc.png");
   }
 
diff --git a/CLTool/src/clwin32.cpp b/CLTool/src/clwin32.cpp
index 20dfc29..d05e12a 100644
--- a/CLTool/src/clwin32.cpp
+++ b/CLTool/src/clwin32.cpp
@@ -253,6 +253,7 @@ int _tmain(int argc, _TCHAR* argv[])
   if(format == eCompressionFormat_BPTC) {
     strcat_s(basename, "-bc7.png");
   } else if(format == eCompressionFormat_PVRTC) {
+    cImg.SetBlockStreamOrder(false);
     strcat_s(basename, "-pvrtc.png");
   }
 

From a57c40005ca3b5dfcf17ec87c573955abf488745 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 25 Sep 2013 20:07:44 -0400
Subject: [PATCH 28/32] Small bug: don't allocate more memory than we have to.

---
 PVRTCEncoder/src/Decompressor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/PVRTCEncoder/src/Decompressor.cpp b/PVRTCEncoder/src/Decompressor.cpp
index 48d2ecb..a367ada 100644
--- a/PVRTCEncoder/src/Decompressor.cpp
+++ b/PVRTCEncoder/src/Decompressor.cpp
@@ -304,10 +304,10 @@ namespace PVRTCC {
 
     // First, extract all of the block information...
     std::vector<Block> blocks;
-    blocks.reserve(w * h);
 
     const uint32 blocksW = bTwoBitMode? (w / 8) : (w / 4);
     const uint32 blocksH = h / 4;
+    blocks.reserve(blocksW * blocksH);
 
     for(uint32 j = 0; j < blocksH; j++) {
       for(uint32 i = 0; i < blocksW; i++) {

From 4de5f90edfaeb97a0912d936045bdd3250cd2e35 Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Wed, 25 Sep 2013 20:10:18 -0400
Subject: [PATCH 29/32] Some small bug fixes to our still awful compressor.

---
 PVRTCEncoder/src/Compressor.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp
index 2b167a6..19e1e0f 100644
--- a/PVRTCEncoder/src/Compressor.cpp
+++ b/PVRTCEncoder/src/Compressor.cpp
@@ -267,7 +267,7 @@ namespace PVRTCC {
         const Pixel po = original(i, j);
         
         // !FIXME! there are two modulation modes... we're only using one.
-        uint8 modSteps[4] = { 0, 3, 5, 8 };
+        uint8 modSteps[4] = { 8, 5, 3, 0 };
         uint8 bestMod = 0;
         uint32 bestError = 0xFFFFFFFF;
         for(uint32 s = 0; s < 4; s++) {
@@ -297,16 +297,19 @@ namespace PVRTCC {
     const uint32 blocksW = dcj.width / 4;
     const uint32 blocksH = dcj.height / 4;
 
+    assert(imgA.GetHeight() == blocksH);
+    assert(imgA.GetWidth() == blocksW);
+
     std::vector<uint64> blocks;
     blocks.reserve(blocksW * blocksH);
     for(uint32 j = 0; j < blocksH; j++) {
       for(uint32 i = 0; i < blocksW; i++) {
         Block b;
-        b.SetColorA(imgA(i, j));
-        b.SetColorB(imgB(i, j));
+        b.SetColorA(imgA(i, j), true);
+        b.SetColorB(imgB(i, j), true);
         for(uint32 t = 0; t < 16; t++) {
-          uint32 x = i + (t%4);
-          uint32 y = j + (t/4);
+          uint32 x = i*4 + (t%4);
+          uint32 y = j*4 + (t/4);
           b.SetLerpValue(t, modValues[y*dcj.width + x]);
         }
         blocks.push_back(b.Pack());
@@ -321,9 +324,8 @@ namespace PVRTCC {
         // linearize them...
         uint32 idx = Interleave(j, i);
 
-        uint32 offset = idx * PVRTCC::kBlockSize;
-        uint64 *outPtr = reinterpret_cast<uint64 *>(dcj.outBuf + offset);
-        *outPtr = blocks[j * blocksW + i];
+        uint64 *outPtr = reinterpret_cast<uint64 *>(dcj.outBuf);
+        outPtr[idx] = blocks[j*blocksW + i];
       }
     }
   }

From 264e447e8090e96c705d3d6e89a1497ab5ecb95b Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Thu, 26 Sep 2013 20:17:07 -0400
Subject: [PATCH 30/32] Deal with this bug once and for all. If we have an
 image in block stream order, then explicitly reorder it before doing any
 work. Then keep it that way. I probably could have fixed this in the amount
 of time I've wasted on it. -____-

---
 CLTool/src/clunix.cpp           |  4 ++-
 CLTool/src/clwin32.cpp          |  4 ++-
 Core/include/CompressedImage.h  |  2 ++
 Core/include/Image.h            | 11 ++++++-
 Core/src/Image.cpp              | 57 +++++++++++++++++++++++++++++++++
 PVRTCEncoder/src/Compressor.cpp | 10 ++----
 6 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/CLTool/src/clunix.cpp b/CLTool/src/clunix.cpp
index cccc264..2dd713d 100644
--- a/CLTool/src/clunix.cpp
+++ b/CLTool/src/clunix.cpp
@@ -206,6 +206,9 @@ int main(int argc, char **argv) {
   }
 
   const Image *img = file.GetImage();
+  if(format == eCompressionFormat_PVRTC) {
+    const_cast<Image *>(img)->SetBlockStreamOrder(false);
+  }
 
   int numBlocks = (img->GetWidth() * img->GetHeight())/16;
   BlockStatManager *statManager = NULL;
@@ -247,7 +250,6 @@ int main(int argc, char **argv) {
   if(format == eCompressionFormat_BPTC) {
     strcat(basename, "-bc7.png");
   } else if(format == eCompressionFormat_PVRTC) {
-    cImg.SetBlockStreamOrder(false);
     strcat(basename, "-pvrtc.png");
   }
 
diff --git a/CLTool/src/clwin32.cpp b/CLTool/src/clwin32.cpp
index d05e12a..53057a6 100644
--- a/CLTool/src/clwin32.cpp
+++ b/CLTool/src/clwin32.cpp
@@ -212,6 +212,9 @@ int _tmain(int argc, _TCHAR* argv[])
   }
 
   const Image *img = file.GetImage();
+  if(format == eCompressionFormat_PVRTC) {
+    const_cast<Image *>(img)->SetBlockStreamOrder(false);
+  }
 
   int numBlocks = (img->GetWidth() * img->GetHeight())/16;
   BlockStatManager *statManager = NULL;
@@ -253,7 +256,6 @@ int _tmain(int argc, _TCHAR* argv[])
   if(format == eCompressionFormat_BPTC) {
     strcat_s(basename, "-bc7.png");
   } else if(format == eCompressionFormat_PVRTC) {
-    cImg.SetBlockStreamOrder(false);
     strcat_s(basename, "-pvrtc.png");
   }
 
diff --git a/Core/include/CompressedImage.h b/Core/include/CompressedImage.h
index 8799d11..86021a7 100644
--- a/Core/include/CompressedImage.h
+++ b/Core/include/CompressedImage.h
@@ -97,6 +97,8 @@ class CompressedImage {
   // !FIXME! We should have a function to explicitly return the in/out buf
   // size for a given compressed image.
   bool DecompressImage(uint8 *outBuf, uint32 outBufSz) const;
+
+  ECompressionFormat GetFormat() const { return m_Format; }  // read-only accessor for m_Format
 };
 
 #endif // _COMPRESSED_IMAGE_H_
diff --git a/Core/include/Image.h b/Core/include/Image.h
index 7c2a35c..2ac3162 100644
--- a/Core/include/Image.h
+++ b/Core/include/Image.h
@@ -70,7 +70,13 @@ class Image {
   uint32 GetWidth() const { return m_Width; }
   uint32 GetHeight() const { return m_Height; }
 
-  void SetBlockStreamOrder(bool flag) { m_bBlockStreamOrder = flag; }
+  void SetBlockStreamOrder(bool flag) {  // reorders the pixel buffer itself, not just the flag
+    if(flag) {
+      ConvertToBlockStreamOrder();  // returns early if already in block stream order
+    } else {
+      ConvertFromBlockStreamOrder();  // returns early if already in row-major order
+    }
+  }
   bool GetBlockStreamOrder() const { return m_bBlockStreamOrder; }
 
  private:
@@ -80,6 +86,9 @@ class Image {
   bool m_bBlockStreamOrder;
 
   uint8 *m_PixelData;
+
+  void ConvertToBlockStreamOrder();
+  void ConvertFromBlockStreamOrder();
 };
 
 #endif // __TEXCOMP_IMAGE_H__
diff --git a/Core/src/Image.cpp b/Core/src/Image.cpp
index a918706..ac57763 100644
--- a/Core/src/Image.cpp
+++ b/Core/src/Image.cpp
@@ -117,6 +117,11 @@ Image::Image(const CompressedImage &ci)
     fprintf(stderr, "Error decompressing image!\n");
     return;
   }
+
+  // !HACK!
+  if(ci.GetFormat() == eCompressionFormat_PVRTC) {
+    m_bBlockStreamOrder = false;
+  }
 }
 
 Image::Image(const ImageLoader &loader) 
@@ -207,3 +212,55 @@ double Image::ComputePSNR(const CompressedImage &ci) const {
   delete unCompData;
   return PSNR;
 }
+
+void Image::ConvertToBlockStreamOrder() {
+  // Reorder pixels from row-major into 4x4-block order. No-op when already
+  // in block order or when there is no pixel data.
+  if(m_bBlockStreamOrder || !m_PixelData) return;
+
+  const uint32 w = GetWidth();
+  const uint32 h = GetHeight();
+  const uint32 *src = reinterpret_cast<const uint32 *>(m_PixelData);
+  // NOTE(review): only w*h entries are written when w and h are multiples
+  // of 4; the original 4x-larger allocation is kept unchanged.
+  uint32 *newPixelData = new uint32[w * h * 4];
+  for(uint32 j = 0; j < h; j+=4) {
+    for(uint32 i = 0; i < w; i+=4) {
+      const uint32 offset = ((j / 4) * (w / 4) + (i / 4)) * 16;  // block start
+      for(uint32 t = 0; t < 16; t++) {
+        newPixelData[offset + t] = src[(j + t / 4)*w + (i + t % 4)];
+      }
+    }
+  }
+  // newPixelData is created with new[], so release buffers with delete[];
+  // NOTE(review): the initial m_PixelData is presumably new[] too - confirm.
+  delete [] m_PixelData;
+  m_PixelData = reinterpret_cast<uint8 *>(newPixelData);
+  m_bBlockStreamOrder = true;
+}
+
+void Image::ConvertFromBlockStreamOrder() {
+  // Reorder pixels from 4x4-block order back into row-major order. No-op
+  // when already row-major or when there is no pixel data.
+  if(!m_bBlockStreamOrder || !m_PixelData) return;
+
+  const uint32 w = GetWidth();
+  const uint32 h = GetHeight();
+  const uint32 *src = reinterpret_cast<const uint32 *>(m_PixelData);
+  // NOTE(review): only w*h entries are written when w and h are multiples
+  // of 4; the original 4x-larger allocation is kept unchanged.
+  uint32 *newPixelData = new uint32[w * h * 4];
+  for(uint32 j = 0; j < h; j+=4) {
+    for(uint32 i = 0; i < w; i+=4) {
+      const uint32 offset = ((j / 4) * (w / 4) + (i / 4)) * 16;  // block start
+      for(uint32 t = 0; t < 16; t++) {
+        newPixelData[(j + t / 4)*w + (i + t % 4)] = src[offset + t];
+      }
+    }
+  }
+  // newPixelData is created with new[], so release buffers with delete[];
+  // NOTE(review): the initial m_PixelData is presumably new[] too - confirm.
+  delete [] m_PixelData;
+  m_PixelData = reinterpret_cast<uint8 *>(newPixelData);
+  m_bBlockStreamOrder = false;
+}
diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp
index 19e1e0f..66768da 100644
--- a/PVRTCEncoder/src/Compressor.cpp
+++ b/PVRTCEncoder/src/Compressor.cpp
@@ -137,14 +137,8 @@ namespace PVRTCC {
     Image img(dcj.height, dcj.width);
     uint32 nPixels = dcj.height * dcj.width;
     for(uint32 i = 0; i < nPixels; i++) {
-      // Assume block stream order (whyyyy)
-      uint32 blockIdx = i / 16;
-      uint32 blockWidth = dcj.width / 4;
-      uint32 blockX = blockIdx % blockWidth;
-      uint32 blockY = blockIdx / blockWidth;
-
-      uint32 x = blockX * 4 + (i % 4);
-      uint32 y = blockY * 4 + (i % 16) / 4;
+      uint32 x = i % dcj.width;
+      uint32 y = i / dcj.width;
 
       const uint32 *pixels = reinterpret_cast<const uint32 *>(dcj.inBuf);
       img(x, y).UnpackRGBA(pixels[i]);      

From 36ce4788212f489b5a29c92a3ca226e7ebf1b33f Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Thu, 26 Sep 2013 20:18:26 -0400
Subject: [PATCH 31/32] Refactor PSNR calculations.

---
 Core/src/Image.cpp | 48 ++++++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/Core/src/Image.cpp b/Core/src/Image.cpp
index ac57763..b755128 100644
--- a/Core/src/Image.cpp
+++ b/Core/src/Image.cpp
@@ -174,43 +174,41 @@ double Image::ComputePSNR(const CompressedImage &ci) const {
     return -1.0f;
   }
 
-  const double wr = 1.0;
-  const double wg = 1.0;
-  const double wb = 1.0;
+  //  const double w[3] = { 0.2126, 0.7152, 0.0722 };
+  const double w[3] = { 1.0, 1.0, 1.0 };
     
-  double MSE = 0.0;
+  double mse = 0.0;
   for(uint32 i = 0; i < imageSz; i+=4) {
 
     const unsigned char *pixelDataRaw = m_PixelData + i;
     const unsigned char *pixelDataUncomp = unCompData + i;
 
-    double rawAlphaScale = double(pixelDataRaw[3]) / 255.0;
-    double uncompAlphaScale = double(pixelDataUncomp[3]) / 255.0;
-    double dr = double(sad(rawAlphaScale * pixelDataRaw[0], uncompAlphaScale * pixelDataUncomp[0])) * wr;
-    double dg = double(sad(rawAlphaScale * pixelDataRaw[1], uncompAlphaScale * pixelDataUncomp[1])) * wg;
-    double db = double(sad(rawAlphaScale * pixelDataRaw[2], uncompAlphaScale * pixelDataUncomp[2])) * wb;
-    
-    const double pixelMSE = 
-      (double(dr) * double(dr)) + 
-      (double(dg) * double(dg)) + 
-      (double(db) * double(db));
-    
-    //fprintf(stderr, "Pixel MSE: %f\n", pixelMSE);
-    MSE += pixelMSE;
+    float r[4], u[4];
+    for(uint32 c = 0; c < 4; c++) {
+      if(c == 3) {
+        r[c] = pixelDataRaw[c] / 255.0;
+        u[c] = pixelDataUncomp[c] / 255.0;
+      } else {
+        r[c] = static_cast<double>(pixelDataRaw[c]) * w[c];
+        u[c] = static_cast<double>(pixelDataUncomp[c]) * w[c];
+      }
+    }
+
+    for(uint32 c = 0; c < 3; c++) {
+      double diff = (r[3] * r[c] - u[3] * u[c]);
+      mse += diff * diff;
+    }
   }
 
-  MSE /= (double(GetWidth()) * double(GetHeight()));
+  mse /= GetWidth() * GetHeight();
 
-  double MAXI = 
-    (255.0 * wr) * (255.0 * wr) + 
-    (255.0 * wg) * (255.0 * wg) + 
-    (255.0 * wb) * (255.0 * wb);
-
-  double PSNR = 10 * log10(MAXI/MSE);
+  const double C = 255.0 * 255.0;
+  double maxi = (w[0] + w[1] + w[2]) * C;
+  double psnr = 10 * log10(maxi/mse);
 
   // Cleanup
   delete unCompData;
-  return PSNR;
+  return psnr;
 }
 
 void Image::ConvertToBlockStreamOrder() {

From 8ea39890aca95dd7ad798104f4c16523f45cf5ac Mon Sep 17 00:00:00 2001
From: Pavel Krajcevski <pavel@cs.unc.edu>
Date: Thu, 26 Sep 2013 20:19:19 -0400
Subject: [PATCH 32/32] Update compressor to do a simple bounding box
 algorithm... results are still bad but better than what we've been getting.

---
 PVRTCEncoder/src/Compressor.cpp | 81 +++++++++------------------------
 1 file changed, 22 insertions(+), 59 deletions(-)

diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp
index 66768da..11b8340 100644
--- a/PVRTCEncoder/src/Compressor.cpp
+++ b/PVRTCEncoder/src/Compressor.cpp
@@ -91,11 +91,10 @@ namespace PVRTCC {
     return ::std::min(::std::max(low, v), high);
   }
 
-  template <typename T>
-  static T Lookup(const ::std::vector<T> &vals,
-                  uint32 x, uint32 y,
-                  uint32 width, uint32 height,
-                  const EWrapMode wrapMode) {
+  static const Pixel &Lookup(const Image &img,
+                             int32 x, int32 y,
+                             uint32 width, uint32 height,
+                             const EWrapMode wrapMode) {
     while(x >= width) {
       if(wrapMode == eWrapMode_Wrap) {
         x -= width;
@@ -128,7 +127,7 @@ namespace PVRTCC {
       }
     }
 
-    return vals[y * width + x];
+    return img(x, y);
   }
 
   void Compress(const CompressionJob &dcj,
@@ -173,62 +172,29 @@ namespace PVRTCC {
       }
     }
 
+    const uint32 blocksW = dcj.width / 4;
+    const uint32 blocksH = dcj.height / 4;
+
     // Go over the 7x7 texel blocks and extract bounding box diagonals for each
     // block. We should be able to choose which diagonal we want...
     const uint32 kKernelSz = 7;
-    ::std::vector<int16> maxDiff;
-    ::std::vector<int16> minDiff;
 
-    const uint32 kNumBlockChannels = dcj.height * dcj.width / 4;
-    maxDiff.resize(kNumBlockChannels);
-    minDiff.resize(kNumBlockChannels);
-
-    for(uint32 j = 2; j < dcj.height; j += 4) {
-      for(uint32 i = 2; i < dcj.width; i += 4) {
-        const uint32 startX = i - (kKernelSz / 2);
-        const uint32 startY = j - (kKernelSz / 2);
-        for(uint32 c = 0; c < 4; c++) {
-          int32 pos = 0;
-          int32 neg = 0;
-          for(uint32 y = startY; y < startY + kKernelSz; y++) {
-            for(uint32 x = startX; x < startX + kKernelSz; x++) {
-              int16 val = Lookup(difference, x*4 + c, y,
-                                 dcj.width*4, dcj.height, wrapMode);
-              if(val > 0) {
-                pos += val;
-              } else {
-                neg += val;
-              }
-            }
-          }
-
-          uint32 blockIdx = ((j-2)/4) * dcj.width + (i-2) + c;
-          assert(blockIdx < kNumBlockChannels);
-          if(pos > -neg) {
-            maxDiff[blockIdx] = pos;
-            minDiff[blockIdx] = 0;
-          } else {
-            maxDiff[blockIdx] = 0;
-            minDiff[blockIdx] = neg;       
-          }
-        }
-      }
-    }
-
-    // Add maxDiff to image to get high signal, and lowdiff to image to
-    // get low signal...
     Image imgA = downscaled;
     Image imgB = downscaled;
-
-    for(uint32 j = 0; j < dcj.height / 4; j++) {
-      for(uint32 i = 0; i < dcj.width / 4; i++) {
-        for(uint32 c = 0; c < 4; c++) {
-          const uint32 cIdx = j*dcj.width/4 + i*4 + c;
-          uint8 &a = imgA(i, j).Component(c);
-          a = static_cast<uint8>(Clamp<int16>(a + maxDiff[cIdx], 0, 255));
-
-          uint8 &b = imgB(i, j).Component(c);
-          b = static_cast<uint8>(Clamp<int16>(b + minDiff[cIdx], 0, 255));
+    for(uint32 j = 0; j < blocksH; j++) {  // one iteration per 4x4 block (blocksW x blocksH grid)
+      for(uint32 i = 0; i < blocksW; i++) {
+        int32 startX = i*4 + 2 - (kKernelSz / 2);  // window start; becomes -1 when i == 0 (assuming 32-bit int32/uint32)
+        int32 startY = j*4 + 2 - (kKernelSz / 2);  // likewise -1 when j == 0
+        for(int32 y = startY; y < startY + kKernelSz; y++) {  // NOTE(review): kKernelSz is uint32, so this compares as unsigned; with startY == -1 the body never runs
+          for(int32 x = startX; x < startX + kKernelSz; x++) {  // NOTE(review): same signed/unsigned issue when startX == -1
+            const Pixel &po = Lookup(original, x, y, dcj.width, dcj.height, wrapMode);
+            Pixel &pa = imgA(i, j);
+            Pixel &pb = imgB(i, j);
+            for(uint32 c = 0; c < 4; c++) {
+              pa.Component(c) = ::std::max(po.Component(c), pa.Component(c));  // imgA keeps the per-channel maximum seen so far
+              pb.Component(c) = ::std::min(po.Component(c), pb.Component(c));  // imgB keeps the per-channel minimum seen so far
+            }
+          }
         }
       }
     }
@@ -288,9 +254,6 @@ namespace PVRTCC {
     }
 
     // Pack everything into a PVRTC blocks.
-    const uint32 blocksW = dcj.width / 4;
-    const uint32 blocksH = dcj.height / 4;
-
     assert(imgA.GetHeight() == blocksH);
     assert(imgA.GetWidth() == blocksW);