Refactor CompressionJob struct.

In order to better facilitate the change from block stream order to non-block stream order, a lot of changes were introduced to the way that we feed texture data to the compressors. This data is embodied in the CompressionJob struct. We have made it so that the compression job holds both the in and out pointers for our uncompressed and compressed data. Furthermore, we have made sure that the struct also contains the format that it is compressing for, so that if any threading code would like to chop up a compression job into smaller chunks based on the format, it doesn't need to know the format explicitly; it just needs to know certain properties of the format. Moreover, the user can now define the start and end pixels of the region that we would like to compress. We can compress subsets of data by changing the in and out pointers and the width and height values. The compressors will read data linearly, based on the width of the given image, until they reach the end pixels.
2025-08-03 22:31:09 +00:00 · 2013-11-08 16:21:01 -05:00 · 2013-11-08 16:21:01 -05:00 · a80944901e
parent f70b26a47f
commit a80944901e
27 changed files with 567 additions and 438 deletions
--- a/BPTCEncoder/include/BC7Compressor.h
+++ b/BPTCEncoder/include/BC7Compressor.h
@ -114,13 +114,13 @@ namespace BC7C {

  // Compress the image given as RGBA data to BC7 format. Width and Height are
  // the dimensions of the image in pixels.
-  void Compress(const CompressionJob &);
+  void Compress(const FasTC::CompressionJob &);

  // Perform a compression while recording all of the choices the compressor
  // made into a list of statistics. We can use this to see whether or not
  // certain heuristics are working, such as whether or not certain modes are
  // being chosen more often than others, etc.
-  void CompressWithStats(const CompressionJob &, std::ostream *logStream);
+  void CompressWithStats(const FasTC::CompressionJob &, std::ostream *logStream);

 #ifdef HAS_SSE_41
  // Compress the image given as RGBA data to BC7 format using an algorithm
@ -135,12 +135,12 @@ namespace BC7C {
  // to compress a list of textures. If this function is called with the same
  // argument from multiple threads, they will work together to compress all of
  // the images in the list.
-  void CompressAtomic(CompressionJobList &);
+  void CompressAtomic(FasTC::CompressionJobList &);
 #endif

  // Decompress the image given as BC7 data to R8G8B8A8 format. Width and Height
  // are the dimensions of the image in pixels.
-  void Decompress(const DecompressionJob &);
+  void Decompress(const FasTC::DecompressionJob &);
 }  // namespace BC7C

 #endif  // BPTCENCODER_INCLUDE_BC7COMPRESSOR_H_
--- a/BPTCEncoder/src/BC7Compressor.cpp
+++ b/BPTCEncoder/src/BC7Compressor.cpp
@ -1622,23 +1622,32 @@ namespace BC7C {

  static void DecompressBC7Block(const uint8 block[16], uint32 outBuf[16]);

+  void GetBlock(const uint32 x, const uint32 y, const uint32 pixelsWide,  // Gathers the 4x4 pixel block whose top-left pixel is (x, y)
+                const uint32 *inPixels, uint32 block[16]) {  // from a row-major image that is pixelsWide pixels wide into block.
+    memcpy(block, inPixels + y*pixelsWide + x, 4 * sizeof(uint32));  // image row y   -> block[0..3]
+    memcpy(block + 4, inPixels + (y+1)*pixelsWide + x, 4 * sizeof(uint32));  // image row y+1 -> block[4..7]
+    memcpy(block + 8, inPixels + (y+2)*pixelsWide + x, 4 * sizeof(uint32));  // image row y+2 -> block[8..11]
+    memcpy(block + 12, inPixels + (y+3)*pixelsWide + x, 4 * sizeof(uint32));  // image row y+3 -> block[12..15]
+  }  // NOTE(review): no bounds checks; presumably callers guarantee that columns x..x+3 and rows y..y+3 lie inside inPixels -- confirm.
+
  // Compress an image using BC7 compression. Use the inBuf parameter to point
  // to an image in 4-byte RGBA format. The width and height parameters specify
  // the size of the image in pixels. The buffer pointed to by outBuf should be
  // large enough to store the compressed image. This implementation has a 4:1
  // compression ratio.
-  void Compress(const CompressionJob &cj) {
+  void Compress(const FasTC::CompressionJob &cj) {
    const uint32 *inPixels = reinterpret_cast<const uint32 *>(cj.InBuf());
-    unsigned char *outBuf = cj.OutBuf();
-    for(uint32 j = 0; j < cj.Height(); j += 4) {
-      for(uint32 i = 0; i < cj.Width(); i += 4) {
+    const uint32 kBlockSz = GetBlockSize(FasTC::eCompressionFormat_BPTC);
+    uint8 *outBuf = cj.OutBuf() + cj.CoordsToBlockIdx(cj.XStart(), cj.YStart()) * kBlockSz;
+
+    uint32 startX = cj.XStart();
+    bool done = false;
+
+    for(uint32 j = cj.YStart(); !done; j += 4) {
+      for(uint32 i = startX; !done && i < cj.Width(); i += 4) {

        uint32 block[16];
-        memcpy(block, inPixels + j*cj.RowBytes() + i, 4 * sizeof(uint32));
-        memcpy(block + 4, inPixels + (j+1)*cj.RowBytes() + i, 4 * sizeof(uint32));
-        memcpy(block + 8, inPixels + (j+2)*cj.RowBytes() + i, 4 * sizeof(uint32));
-        memcpy(block + 12, inPixels + (j+3)*cj.RowBytes() + i, 4 * sizeof(uint32));
-
+        GetBlock(i, j, cj.Width(), inPixels, block);
        CompressBC7Block(block, outBuf);

 #ifndef NDEBUG
@ -1666,8 +1675,10 @@ namespace BC7C {
        }
 #endif

-        outBuf += 16;
+        outBuf += kBlockSz;
+        done = i+4 >= cj.XEnd() && j+(i+4 == cj.Width()? 4 : 0) >= cj.YEnd();
      }
+      startX = 0;
    }
  }

@ -1691,24 +1702,28 @@ namespace BC7C {
 #endif

  // Variables used for synchronization in threadsafe implementation.
-  void CompressAtomic(CompressionJobList &cjl) {
+  void CompressAtomic(FasTC::CompressionJobList &cjl) {
    uint32 jobIdx;
    while((jobIdx = cjl.m_CurrentJobIndex) < cjl.GetNumJobs()) {
      // !HACK! ... Microsoft has this defined
      #undef GetJob

-      const CompressionJob *cj = cjl.GetJob(jobIdx);
-      const uint32 nBlocks = (cj->height * cj->width) / 16;
+      const FasTC::CompressionJob *cj = cjl.GetJob(jobIdx);
+      const uint32 nBlocks = (cj->Height() * cj->Width()) / 16;

      // Help finish whatever texture we're compressing before we start again on
      // my work...
      uint32 blockIdx;
      while((blockIdx = FetchAndAdd(&(cjl.m_CurrentBlockIndex))) < nBlocks &&
            *(cjl.GetFinishedFlag(jobIdx)) == 0) {
-        unsigned char *out = cj->outBuf + (16 * blockIdx);
-        const unsigned char *in = cj->inBuf + (64 * blockIdx);
+        unsigned char *out = cj->OutBuf() + (16 * blockIdx);

-        CompressBC7Block((const uint32 *)in, out);
+        uint32 block[16];
+        uint32 x = cj->XStart() + 4 * (blockIdx % (cj->Width() / 4));
+        uint32 y = cj->YStart() + 4 * (blockIdx / (cj->Width() / 4));
+        const uint32 *inPixels = reinterpret_cast<const uint32 *>(cj->InBuf());
+        GetBlock(x, y, cj->Width(), inPixels, block);
+        CompressBC7Block(block, out);
      }

      if(TestAndSet(cjl.GetFinishedFlag(jobIdx)) == 0) {
@ -1722,21 +1737,21 @@ namespace BC7C {
  }
 #endif  // HAS_ATOMICS

-  void CompressWithStats(const CompressionJob &cj, std::ostream *logStream) {
+  void CompressWithStats(const FasTC::CompressionJob &cj, std::ostream *logStream) {
    const uint32 *inPixels = reinterpret_cast<const uint32 *>(cj.InBuf());
-    unsigned char *outBuf = cj.OutBuf();
+    const uint32 kBlockSz = GetBlockSize(FasTC::eCompressionFormat_BPTC);
+    uint8 *outBuf = cj.OutBuf() + cj.CoordsToBlockIdx(cj.XStart(), cj.YStart()) * kBlockSz;

-    for(uint32 j = 0; j < cj.Height(); j += 4) {
-      for(uint32 i = 0; i < cj.Width(); i += 4) {
+    uint32 startX = cj.XStart();
+    bool done = false;
+    for(uint32 j = cj.YStart(); !done; j += 4) {
+      for(uint32 i = startX; !done && i < cj.Width(); i += 4) {

        uint32 block[16];
-        memcpy(block, inPixels + j*cj.RowBytes() + i, 4 * sizeof(uint32));
-        memcpy(block + 4, inPixels + (j+1)*cj.RowBytes() + i, 4 * sizeof(uint32));
-        memcpy(block + 8, inPixels + (j+2)*cj.RowBytes() + i, 4 * sizeof(uint32));
-        memcpy(block + 12, inPixels + (j+3)*cj.RowBytes() + i, 4 * sizeof(uint32));
+        GetBlock(i, j, cj.Width(), inPixels, block);

        if(logStream) {
-          uint64 blockIdx = reinterpret_cast<uint64>(inPixels + j*cj.Width() + i);
+          uint64 blockIdx = cj.CoordsToBlockIdx(i, j);
          CompressBC7Block(block, outBuf, BlockLogger(blockIdx, *logStream));
        } else {
          CompressBC7Block(block, outBuf);
@ -1749,9 +1764,9 @@ namespace BC7C {
        DecompressBC7Block(cmpData, unComp);
        const uint8* unCompData = reinterpret_cast<uint8 *>(unComp);

-        int diffSum = 0;
-        for(int i = 0; i < 64; i++) {
-          diffSum += sad(unCompData[i], inBlock[i]);
+        uint32 diffSum = 0;
+        for(uint32 k = 0; k < 64; k++) {
+          diffSum += sad(unCompData[k], inBlock[k]);
        }
        double blockError = static_cast<double>(diffSum) / 64.0;
        if(blockError > 50.0) {
@ -1761,7 +1776,10 @@ namespace BC7C {
 #endif

        outBuf += 16;
+        done = i+4 >= cj.XEnd() && j+(i+4 == cj.Width()? 4 : 0) >= cj.YEnd();
      }
+
+      startX = 0;
    }
  }

@ -2755,7 +2773,7 @@ namespace BC7C {
  }

  // Convert the image from a BC7 buffer to a RGBA8 buffer
-  void Decompress(const DecompressionJob &dj) {
+  void Decompress(const FasTC::DecompressionJob &dj) {

    const uint8 *inBuf = dj.InBuf();
    uint32 *outBuf = reinterpret_cast<uint32 *>(dj.OutBuf());
--- a/Base/include/CompressionFormat.h
+++ b/Base/include/CompressionFormat.h
@ -0,0 +1,103 @@
+/* FasTC
+ * Copyright (c) 2013 University of North Carolina at Chapel Hill.
+ * All rights reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation for educational, research, and non-profit purposes, without
+ * fee, and without a written agreement is hereby granted, provided that the
+ * above copyright notice, this paragraph, and the following four paragraphs
+ * appear in all copies.
+ *
+ * Permission to incorporate this software into commercial products may be
+ * obtained by contacting the authors or the Office of Technology Development
+ * at the University of North Carolina at Chapel Hill <otd@unc.edu>.
+ *
+ * This software program and documentation are copyrighted by the University of
+ * North Carolina at Chapel Hill. The software program and documentation are
+ * supplied "as is," without any accompanying services from the University of
+ * North Carolina at Chapel Hill or the authors. The University of North
+ * Carolina at Chapel Hill and the authors do not warrant that the operation of
+ * the program will be uninterrupted or error-free. The end-user understands
+ * that the program was developed for research purposes and is advised not to
+ * rely exclusively on the program for any reason.
+ *
+ * IN NO EVENT SHALL THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL OR THE
+ * AUTHORS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF
+ * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF NORTH CAROLINA
+ * AT CHAPEL HILL OR THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL AND THE AUTHORS SPECIFICALLY
+ * DISCLAIM ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY 
+ * STATUTORY WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON
+ * AN "AS IS" BASIS, AND THE UNIVERSITY  OF NORTH CAROLINA AT CHAPEL HILL AND
+ * THE AUTHORS HAVE NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, 
+ * ENHANCEMENTS, OR MODIFICATIONS.
+ *
+ * Please send all BUG REPORTS to <pavel@cs.unc.edu>.
+ *
+ * The authors may be contacted via:
+ *
+ * Pavel Krajcevski
+ * Dept of Computer Science
+ * 201 S Columbia St
+ * Frederick P. Brooks, Jr. Computer Science Bldg
+ * Chapel Hill, NC 27599-3175
+ * USA
+ * 
+ * <http://gamma.cs.unc.edu/FasTC/>
+ */
+
+#ifndef _BASE_INCLUDE_COMPRESSIONFORMAT_H_
+#define _BASE_INCLUDE_COMPRESSIONFORMAT_H_
+
+#include "TexCompTypes.h"
+
+namespace FasTC {
+
+  // The different supported compression formats
+  enum ECompressionFormat {
+    eCompressionFormat_DXT1,
+    eCompressionFormat_DXT5,
+    eCompressionFormat_ETC1,
+    eCompressionFormat_BPTC,
+    eCompressionFormat_PVRTC,
+
+    kNumCompressionFormats
+  };
+
+  // Writes the dimensions, in pixels, of one compressed block of the given format into outSz (two entries).
+  inline static void GetBlockDimensions(ECompressionFormat fmt, uint32 (&outSz)[2]) {
+    switch(fmt) {
+      default:  // values that are not listed below share the 4x4 case
+      case eCompressionFormat_DXT1:
+      case eCompressionFormat_DXT5:
+      case eCompressionFormat_BPTC:
+      case eCompressionFormat_PVRTC:
+      case eCompressionFormat_ETC1:
+        outSz[0] = 4;  // every format handled here reports 4x4 blocks
+        outSz[1] = 4;
+        break;
+    }
+  }
+
+  // Returns the size in bytes of one compressed block for the given format: 8 or 16.
+  inline static uint32 GetBlockSize(ECompressionFormat fmt) {
+    switch(fmt) {
+      default:  // values that are not listed below share the 8-byte case
+      case eCompressionFormat_DXT1:
+      case eCompressionFormat_PVRTC:
+      case eCompressionFormat_ETC1:
+        return 8;
+        break;  // unreachable: the return above always leaves the function
+
+      case eCompressionFormat_DXT5:
+      case eCompressionFormat_BPTC:
+        return 16;
+    }
+  }
+}  // namespace FasTC
+
+#endif // _BASE_INCLUDE_COMPRESSIONFORMAT_H_
--- a/Base/include/CompressionJob.h
+++ b/Base/include/CompressionJob.h
@ -45,6 +45,7 @@
 #define __COMPRESSION_JOBS_H__

 #include "TexCompTypes.h"
+#include "CompressionFormat.h"

 #ifdef _MSC_VER
 #   define ALIGN(x) __declspec( align(x) )
@ -53,84 +54,149 @@
 #endif
 #define ALIGN_SSE ALIGN(16)

-// This structure defines a compression job. Here, width and height are the dimensions
-// of the image in pixels. inBuf contains the R8G8B8A8 data that is to be compressed, and
-// outBuf will contain the compressed BC7 data.
-//
-// Implicit sizes:
-//    inBuf - (width * height * 4) bytes
-//    outBuf - (width * height) bytes
-struct CompressionJob {
- private:
-  const uint8 *m_InBuf;
-  uint8 *m_OutBuf;
-  const uint32 m_Width;
-  const uint32 m_Height;
-  const uint32 m_RowBytes;
+namespace FasTC {

- public:
-  const uint8 *InBuf() const { return m_InBuf; }
-  uint8 *OutBuf() const { return m_OutBuf; }
-  uint32 Width() const { return m_Width; }
-  uint32 Height() const { return m_Height; }
-  uint32 RowBytes() const { return m_RowBytes; }
+  // This class defines a compression job. Here, width and height are the dimensions
+  // of the image in pixels. inBuf contains the R8G8B8A8 data that is to be compressed, and
+  // outBuf will contain the data compressed in the job's format.
+  //
+  // Implicit sizes:
+  //    inBuf - (width * height * 4) bytes
+  //    outBuf - (width * height) bytes for formats with 16-byte blocks, half that for 8-byte blocks
+  class CompressionJob {
+   private:
+    ECompressionFormat m_Format;
+    const uint8 *m_InBuf;
+    uint8 *m_OutBuf;
+    uint32 m_Width;
+    uint32 m_Height;
+    uint32 m_XStart, m_XEnd;
+    uint32 m_YStart, m_YEnd;

-  CompressionJob(
-    const uint8 *_inBuf,
-    unsigned char *_outBuf,
-    const uint32 _width,
-    const uint32 _height)
-  : m_InBuf(_inBuf)
-  , m_OutBuf(_outBuf)
-  , m_Width(_width)
-  , m_Height(_height)
-  , m_RowBytes(_width)
-  { }
+   public:
+    ECompressionFormat Format() const { return m_Format; }
+    const uint8 *InBuf() const { return m_InBuf; }
+    uint8 *OutBuf() const { return m_OutBuf; }
+    uint32 Width() const { return m_Width; }
+    uint32 Height() const { return m_Height; }
+    uint32 XStart() const { return m_XStart; }
+    uint32 XEnd() const { return m_XEnd; }
+    uint32 YStart() const { return m_YStart; }
+    uint32 YEnd() const { return m_YEnd; }

-  CompressionJob(
-    const uint8 *_inBuf,
-    unsigned char *_outBuf,
-    const uint32 _width,
-    const uint32 _height,
-    const uint32 _rowbytes)
-  : m_InBuf(_inBuf)
-  , m_OutBuf(_outBuf)
-  , m_Width(_width)
-  , m_Height(_height)
-  , m_RowBytes(_rowbytes)
-  { }
-};
+    CompressionJob(
+      ECompressionFormat _fmt,
+      const uint8 *_inBuf,
+      unsigned char *_outBuf,
+      const uint32 _width,
+      const uint32 _height)
+      : m_Format(_fmt)
+      , m_InBuf(_inBuf)
+      , m_OutBuf(_outBuf)
+      , m_Width(_width)
+      , m_Height(_height)
+      , m_XStart(0), m_XEnd(_width)
+      , m_YStart(0), m_YEnd(_height)
+    { }
+
+    CompressionJob(
+      ECompressionFormat _fmt,
+      const uint8 *_inBuf,
+      unsigned char *_outBuf,
+      const uint32 _width,
+      const uint32 _height,
+      const uint32 _xOffset,
+      const uint32 _yOffset)
+      : m_Format(_fmt)
+      , m_InBuf(_inBuf)
+      , m_OutBuf(_outBuf)
+      , m_Width(_width)
+      , m_Height(_height)
+      , m_XStart(_xOffset), m_XEnd(_width)
+      , m_YStart(_yOffset), m_YEnd(_height)
+    { }
+
+    CompressionJob(
+      ECompressionFormat _fmt,
+      const uint8 *_inBuf,
+      unsigned char *_outBuf,
+      const uint32 _width,
+      const uint32 _height,
+      const uint32 _xOffset,
+      const uint32 _yOffset,
+      const uint32 _xEndpoint,
+      const uint32 _yEndpoint)
+      : m_Format(_fmt)
+      , m_InBuf(_inBuf)
+      , m_OutBuf(_outBuf)
+      , m_Width(_width)
+      , m_Height(_height)
+      , m_XStart(_xOffset), m_XEnd(_xEndpoint)
+      , m_YStart(_yOffset), m_YEnd(_yEndpoint)
+    { }
+
+    // Writes into out the x (out[0]) and y (out[1]) pixel coordinates of the top-left corner of
+    // block blockIdx, counting blocks row by row across the image for this job's format.
+    void BlockIdxToCoords(uint32 blockIdx, uint32 (&out)[2]) const {
+      uint32 blockDim[2];
+      GetBlockDimensions(Format(), blockDim);
+
+      const uint32 kNumBlocksX = Width() / blockDim[0];  // whole blocks per row; NOTE(review): 0 when Width() < blockDim[0], which makes the % and / below divide by zero
+
+      const uint32 blockX = blockIdx % kNumBlocksX;  // block column
+      const uint32 blockY = blockIdx / kNumBlocksX;  // block row
+
+      out[0] = blockX * blockDim[0];  // block units -> pixel units
+      out[1] = blockY * blockDim[1];
+    }
+
+    // Returns the index of the block that contains the pixel at (x, y), counting blocks row by
+    // row across the image for this job's format. This is the inverse of BlockIdxToCoords.
+    uint32 CoordsToBlockIdx(uint32 x, uint32 y) const {
+      uint32 blockDim[2];
+      GetBlockDimensions(Format(), blockDim);
+
+      const uint32 kNumBlocksX = Width() / blockDim[0];  // whole blocks per row (integer division)
+
+      const uint32 blockX = x / blockDim[0];  // block column; coordinates inside a block round down to it
+      const uint32 blockY = y / blockDim[1];  // block row
+
+      return blockY * kNumBlocksX + blockX;
+    }
+  };
  
-// This struct mirrors that for a compression job, but is used to decompress a BC7 stream. Here, inBuf
-// is a buffer of BC7 data, and outBuf is the destination where we will copy the decompressed R8G8B8A8 data
-struct DecompressionJob {
- private:
-  const uint8 *m_InBuf;
-  uint8 *m_OutBuf;
-  const uint32 m_Width;
-  const uint32 m_Height;
+  // This class mirrors that for a compression job, but is used to decompress a compressed stream. Here, inBuf
+  // is a buffer of data compressed in the job's format, and outBuf is the destination where we will copy the decompressed R8G8B8A8 data
+  class DecompressionJob {
+   private:
+    const ECompressionFormat m_Format;
+    const uint8 *m_InBuf;
+    uint8 *m_OutBuf;
+    const uint32 m_Width;
+    const uint32 m_Height;

- public:
-  const uint8 *InBuf() const { return m_InBuf; }
-  uint8 *OutBuf() const { return m_OutBuf; }
-  uint32 Width() const { return m_Width; }
-  uint32 Height() const { return m_Height; }
+   public:
+    const uint8 *InBuf() const { return m_InBuf; }
+    uint8 *OutBuf() const { return m_OutBuf; }
+    uint32 Width() const { return m_Width; }
+    uint32 Height() const { return m_Height; }
+    uint32 Format() const { return m_Format; }

-  DecompressionJob(
-    const uint8 *_inBuf,
-    unsigned char *_outBuf,
-    const uint32 _width,
-    const uint32 _height)
-  : m_InBuf(_inBuf)
-  , m_OutBuf(_outBuf)
-  , m_Width(_width)
-  , m_Height(_height)
-  { }
-};
+    DecompressionJob(
+      ECompressionFormat _fmt,
+      const uint8 *_inBuf, uint8 *_outBuf,
+      uint32 _width, uint32 _height)
+      : m_Format(_fmt)
+      , m_InBuf(_inBuf)
+      , m_OutBuf(_outBuf)
+      , m_Width(_width)
+      , m_Height(_height)
+      { }
+  };

-// A structure for maintaining a list of textures to compress.
-struct CompressionJobList {
-  public:
+  // A structure for maintaining a list of textures to compress.
+  class CompressionJobList {
+   public:

    // Initialize the list by specifying the total number of jobs that it will contain.
    // This constructor allocates the necessary memory to hold the array.
@ -154,7 +220,7 @@ struct CompressionJobList {
    const CompressionJob *GetJob(uint32 idx) const;
    uint32 *GetFinishedFlag(uint32 idx) const;
    
-  private:
+   private:
    CompressionJob *m_Jobs;
    uint32 m_NumJobs;
    const uint32 m_TotalNumJobs;
@ -163,9 +229,10 @@ struct CompressionJobList {
      ALIGN(32) uint32 m_flag;
    } *m_FinishedFlags;

-  public:
+   public:
    ALIGN(32) uint32 m_CurrentJobIndex;
    ALIGN(32) uint32 m_CurrentBlockIndex;
-};
+  };

+}  // namespace FasTC
 #endif // __COMPRESSION_JOBS_H__
--- a/Base/src/CompressionJob.cpp
+++ b/Base/src/CompressionJob.cpp
@ -48,6 +48,8 @@
 #include <cstring>
 #include <cassert>

+namespace FasTC {
+
 // Initialize the list by specifying the total number of jobs that it will contain.
 // This constructor allocates the necessary memory to hold the array.
 CompressionJobList::CompressionJobList(const uint32 nJobs) 
@ -128,3 +130,5 @@ uint32 *CompressionJobList::GetFinishedFlag(uint32 idx) const {

  return &(m_FinishedFlags[idx].m_flag);
 }
+
+}  // namespace FasTC
--- a/CLTool/src/clunix.cpp
+++ b/CLTool/src/clunix.cpp
@ -56,7 +56,7 @@
 void PrintUsage() {
  fprintf(stderr, "Usage: tc [OPTIONS] imagefile\n");
  fprintf(stderr, "\n");
-  fprintf(stderr, "\t-v\t\tVerbose mode: prints out Entropy, Mean Local Entropy, and MSSIM");
+  fprintf(stderr, "\t-v\t\tVerbose mode: prints out Entropy, Mean Local Entropy, and MSSIM\n");
  fprintf(stderr, "\t-f\t\tFormat to use. Either \"BPTC\", \"ETC1\", \"DXT1\", \"DXT5\", or \"PVRTC\". Default: BPTC\n");
  fprintf(stderr, "\t-l\t\tSave an output log.\n");
  fprintf(stderr, "\t-q <quality>\tSet compression quality level. Default: 50\n");
@ -104,7 +104,7 @@ int main(int argc, char **argv) {
  bool bUseAtomics = false;
  bool bUsePVRTexLib = false;
  bool bVerbose = false;
-  ECompressionFormat format = eCompressionFormat_BPTC;
+  FasTC::ECompressionFormat format = FasTC::eCompressionFormat_BPTC;

  bool knowArg = false;
  do {
@ -131,16 +131,16 @@ int main(int argc, char **argv) {
        exit(1);
      } else {
        if(!strcmp(argv[fileArg], "PVRTC")) {
-          format = eCompressionFormat_PVRTC;
+          format = FasTC::eCompressionFormat_PVRTC;
        } else if(!strcmp(argv[fileArg], "PVRTCLib")) {
-          format = eCompressionFormat_PVRTC;
+          format = FasTC::eCompressionFormat_PVRTC;
          bUsePVRTexLib = true;
        } else if(!strcmp(argv[fileArg], "ETC1")) {
-          format = eCompressionFormat_ETC1;
+          format = FasTC::eCompressionFormat_ETC1;
        } else if(!strcmp(argv[fileArg], "DXT1")) {
-          format = eCompressionFormat_DXT1;
+          format = FasTC::eCompressionFormat_DXT1;
        } else if(!strcmp(argv[fileArg], "DXT5")) {
-          format = eCompressionFormat_DXT5;
+          format = FasTC::eCompressionFormat_DXT5;
        }
      }

@ -286,13 +286,13 @@ int main(int argc, char **argv) {
    }
  }

-  if(format == eCompressionFormat_BPTC) {
+  if(format == FasTC::eCompressionFormat_BPTC) {
    strcat(basename, "-bc7.png");
-  } else if(format == eCompressionFormat_PVRTC) {
+  } else if(format == FasTC::eCompressionFormat_PVRTC) {
    strcat(basename, "-pvrtc.png");
-  } else if(format == eCompressionFormat_DXT1) {
+  } else if(format == FasTC::eCompressionFormat_DXT1) {
    strcat(basename, "-dxt1.png");
-  } else if(format == eCompressionFormat_ETC1) {
+  } else if(format == FasTC::eCompressionFormat_ETC1) {
    strcat(basename, "-etc1.png");
  }

--- a/Core/include/CompressedImage.h
+++ b/Core/include/CompressedImage.h
@ -45,22 +45,12 @@
 #define _COMPRESSED_IMAGE_H_

 #include "TexCompTypes.h"
-
-enum ECompressionFormat {
-  eCompressionFormat_DXT1,
-  eCompressionFormat_DXT5,
-  eCompressionFormat_ETC1,
-  eCompressionFormat_BPTC,
-  eCompressionFormat_PVRTC,
-
-  kNumCompressionFormats
-};
-
+#include "CompressionFormat.h"
 #include "Image.h"

 class CompressedImage : public FasTC::Image<FasTC::Pixel> {
 private:
-  ECompressionFormat m_Format;
+  FasTC::ECompressionFormat m_Format;
  uint8 *m_CompressedData;

 public:
@ -73,7 +63,7 @@ class CompressedImage : public FasTC::Image<FasTC::Pixel> {
  CompressedImage(
    const uint32 width,
    const uint32 height,
-    const ECompressionFormat format,
+    const FasTC::ECompressionFormat format,
    const uint8 *data
  );

@ -85,8 +75,8 @@ class CompressedImage : public FasTC::Image<FasTC::Pixel> {

  virtual void ComputePixels();

-  static uint32 GetCompressedSize(uint32 uncompressedSize, ECompressionFormat format);
-  static uint32 GetUncompressedSize(uint32 compressedSize, ECompressionFormat format) {
+  static uint32 GetCompressedSize(uint32 uncompressedSize, FasTC::ECompressionFormat format);
+  static uint32 GetUncompressedSize(uint32 compressedSize, FasTC::ECompressionFormat format) {
    uint32 cmp = GetCompressedSize(compressedSize, format);
    return compressedSize * (compressedSize / cmp);
  }
@ -104,7 +94,7 @@ class CompressedImage : public FasTC::Image<FasTC::Pixel> {
  // size for a given compressed image.
  bool DecompressImage(uint8 *outBuf, uint32 outBufSz) const;

-  ECompressionFormat GetFormat() const { return m_Format; }
+  FasTC::ECompressionFormat GetFormat() const { return m_Format; }
 };

 #endif // _COMPRESSED_IMAGE_H_
--- a/Core/include/TexComp.h
+++ b/Core/include/TexComp.h
@ -57,7 +57,7 @@ struct SCompressionSettings {
  SCompressionSettings(); // defaults

  // The compression format for the image.
-  ECompressionFormat format; 
+  FasTC::ECompressionFormat format; 

  // The flag that requests us to use SIMD, if it is available
  bool bUseSIMD;
--- a/Core/src/CompressedImage.cpp
+++ b/Core/src/CompressedImage.cpp
@ -56,6 +56,10 @@
 #include "DXTCompressor.h"
 #include "ETCCompressor.h"

+using FasTC::CompressionJob;
+using FasTC::DecompressionJob;
+using FasTC::ECompressionFormat;
+
 CompressedImage::CompressedImage( const CompressedImage &other )
  : Image(other)
  , m_Format(other.m_Format)
@ -109,17 +113,17 @@ bool CompressedImage::DecompressImage(unsigned char *outBuf, unsigned int outBuf
  assert(outBufSz == GetUncompressedSize());

  uint8 *byteData = reinterpret_cast<uint8 *>(m_CompressedData);
-  DecompressionJob dj (byteData, outBuf, GetWidth(), GetHeight());
+  DecompressionJob dj (m_Format, byteData, outBuf, GetWidth(), GetHeight());
  switch(m_Format) {
-    case eCompressionFormat_DXT1:
+    case FasTC::eCompressionFormat_DXT1:
      DXTC::DecompressDXT1(dj);
      break;

-    case eCompressionFormat_ETC1:
+    case FasTC::eCompressionFormat_ETC1:
      ETCC::Decompress(dj);
      break;

-    case eCompressionFormat_PVRTC:
+    case FasTC::eCompressionFormat_PVRTC:
    {
 #ifndef NDEBUG
      PVRTCC::Decompress(dj, false, PVRTCC::eWrapMode_Wrap, true);
@ -129,7 +133,7 @@ bool CompressedImage::DecompressImage(unsigned char *outBuf, unsigned int outBuf
    }
    break;

-    case eCompressionFormat_BPTC: 
+    case FasTC::eCompressionFormat_BPTC: 
    { 
      BC7C::Decompress(dj);
    }
@ -164,24 +168,15 @@ void CompressedImage::ComputePixels() {
 }

 uint32 CompressedImage::GetCompressedSize(uint32 uncompressedSize, ECompressionFormat format) {
-  assert(uncompressedSize % 8 == 0);
+  uint32 blockDim[2];
+  GetBlockDimensions(format, blockDim);

-  uint32 cmpDataSzNeeded = 0;
-  switch(format) {
-  default:
-    assert(!"Not implemented!");
-    // Fall through V
-  case eCompressionFormat_ETC1:
-  case eCompressionFormat_DXT1:
-  case eCompressionFormat_PVRTC:
-    cmpDataSzNeeded = uncompressedSize / 8;
-    break;
+  const uint32 uncompBlockSz = blockDim[0] * blockDim[1] * sizeof(uint32);
+  const uint32 blockSz = GetBlockSize(format);

-  case eCompressionFormat_DXT5:
-  case eCompressionFormat_BPTC:
-    cmpDataSzNeeded = uncompressedSize / 4;
-    break;
-  }
+  assert(uncompBlockSz % blockSz == 0);
+  const uint32 scale = uncompBlockSz / blockSz;

-  return cmpDataSzNeeded;
+  assert(uncompressedSize % blockSz == 0);
+  return uncompressedSize / scale;
 }
--- a/Core/src/CompressionFuncs.h
+++ b/Core/src/CompressionFuncs.h
@ -60,12 +60,12 @@
 // returns the compressed image data into outData. It is assumed that there is
 // enough space allocated for outData to store the compressed data. Allocation
 // is dependent on the compression format.
-typedef void (* CompressionFunc)(const CompressionJob &);
+typedef void (* CompressionFunc)(const FasTC::CompressionJob &);

 // A compression function format. It takes the raw data and image dimensions and 
 // returns the compressed image data into outData. It is assumed that there is
 // enough space allocated for outData to store the compressed data. Allocation
 // is dependent on the compression format.
-typedef void (* CompressionFuncWithStats)(const CompressionJob &, std::ostream *logStream);
+typedef void (* CompressionFuncWithStats)(const FasTC::CompressionJob &, std::ostream *logStream);

 #endif  // CORE_SRC_COMPRESSIONFUNCS_H_
--- a/Core/src/TexComp.cpp
+++ b/Core/src/TexComp.cpp
@ -62,6 +62,10 @@
 #include "ThreadGroup.h"
 #include "WorkerQueue.h"

+using FasTC::CompressionJob;
+using FasTC::CompressionJobList;
+using FasTC::ECompressionFormat;
+
 template <typename T>
 static void clamp(T &x, const T &minX, const T &maxX) {
  x = std::max(std::min(maxX, x), minX);
@ -86,7 +90,7 @@ static void CompressPVRTCLib(const CompressionJob &cj) {
 }

 SCompressionSettings:: SCompressionSettings()
-  : format(eCompressionFormat_BPTC)
+  : format(FasTC::eCompressionFormat_BPTC)
  , bUseSIMD(false)
  , iNumThreads(1)
  , iQuality(50)
@ -98,16 +102,16 @@ SCompressionSettings:: SCompressionSettings()
 static  CompressionFuncWithStats ChooseFuncFromSettingsWithStats(const SCompressionSettings &s) {
  switch(s.format) {

-    case eCompressionFormat_BPTC:
+    case FasTC::eCompressionFormat_BPTC:
    {
       return BC7C::CompressWithStats;
    }
    break;
    
-    case eCompressionFormat_ETC1:
-    case eCompressionFormat_DXT1:
-    case eCompressionFormat_DXT5:
-    case eCompressionFormat_PVRTC:
+    case FasTC::eCompressionFormat_ETC1:
+    case FasTC::eCompressionFormat_DXT1:
+    case FasTC::eCompressionFormat_DXT5:
+    case FasTC::eCompressionFormat_PVRTC:
    {
      // !FIXME! actually implement one of these methods...
      return NULL;
@ -124,7 +128,7 @@ static  CompressionFuncWithStats ChooseFuncFromSettingsWithStats(const SCompress

 static CompressionFunc ChooseFuncFromSettings(const SCompressionSettings &s) {
  switch(s.format) {
-    case eCompressionFormat_BPTC:
+    case FasTC::eCompressionFormat_BPTC:
    {
      BC7C::SetQualityLevel(s.iQuality);
 #ifdef HAS_SSE_41
@ -136,13 +140,13 @@ static CompressionFunc ChooseFuncFromSettings(const SCompressionSettings &s) {
    }
    break;

-    case eCompressionFormat_DXT1:
+    case FasTC::eCompressionFormat_DXT1:
      return DXTC::CompressImageDXT1;

-    case eCompressionFormat_DXT5:
+    case FasTC::eCompressionFormat_DXT5:
      return DXTC::CompressImageDXT5;

-    case eCompressionFormat_PVRTC:
+    case FasTC::eCompressionFormat_PVRTC:
    {
      if(s.bUsePVRTexLib) {
        return CompressPVRTCLib;
@ -151,7 +155,7 @@ static CompressionFunc ChooseFuncFromSettings(const SCompressionSettings &s) {
      }
    }

-    case eCompressionFormat_ETC1:
+    case FasTC::eCompressionFormat_ETC1:
      return ETCC::Compress_RG;

    default:
@ -168,11 +172,8 @@ static void ReportError(const char *msg) {
 }

 static double CompressImageInSerial(
-  const uint8 *imgData,
-  const uint32 imgWidth,
-  const uint32 imgHeight,
-  const SCompressionSettings &settings,
-  unsigned char *outBuf
+  const CompressionJob &job,
+  const SCompressionSettings &settings
 ) {
  CompressionFunc f = ChooseFuncFromSettings(settings);
  CompressionFuncWithStats fStats = ChooseFuncFromSettingsWithStats(settings);
@ -185,11 +186,10 @@ static double CompressImageInSerial(
    stopWatch.Reset();
    stopWatch.Start();

-    CompressionJob cj (imgData, outBuf, imgWidth, imgHeight);
    if(fStats && settings.logStream) {
-      (*fStats)(cj, settings.logStream);
+      (*fStats)(job, settings.logStream);
    } else {
-      (*f)(cj);
+      (*f)(job);
    }

    stopWatch.Stop();
@ -231,10 +231,8 @@ class AtomicThreadUnit : public TCCallable {
 };

 static double CompressImageWithAtomics(
-  const unsigned char *imgData,
-  const unsigned int width, const unsigned int height,
-  const SCompressionSettings &settings,
-  unsigned char *outBuf
+  const CompressionJob &cj,
+  const SCompressionSettings &settings
 ) {
  CompressionFunc f = ChooseFuncFromSettings(settings);
  
@ -242,7 +240,7 @@ static double CompressImageWithAtomics(
  const int nTimes = settings.iNumCompressions;
  CompressionJobList cjl (nTimes);
  for(int i = 0; i < nTimes; i++) {
-    if(!cjl.AddJob(CompressionJob(imgData, outBuf, height, width))) {
+    if(!cjl.AddJob(cj)) {
      assert(!"Error adding compression job to job list!");
    }
  }
@ -287,10 +285,11 @@ static double CompressImageWithAtomics(
 }
 #else  // HAS_ATOMICS
 static double CompressImageWithAtomics(
-  const unsigned char *imgData,
-  const unsigned int width, const unsigned int height,
-  const SCompressionSettings &settings,
-  unsigned char *outBuf
+  const CompressionJob &cj,
+  const SCompressionSettings &settings
 ) {
  fprintf(stderr, "Compiler does not support atomic operations!");
+  // Nothing was compressed, so report zero elapsed time. Without this the
+  // function flows off the end of a non-void body, which is undefined behavior.
+  return 0.0;
 }
@ -314,15 +310,13 @@ static double CompressThreadGroup(ThreadGroup &tgrp, const SCompressionSettings
    cmpTimeTotal += tgrp.GetStopWatch().TimeInMilliseconds();
  }

-  tgrp.CleanUpThreads();  
+  tgrp.CleanUpThreads();
  return cmpTimeTotal;
 }

 static double CompressImageWithThreads(
-  const unsigned char *imgData,
-  const unsigned int imgDataSz,
-  const SCompressionSettings &settings,
-  unsigned char *outBuf
+  const CompressionJob &job,                                     
+  const SCompressionSettings &settings
 ) {

  CompressionFunc f = ChooseFuncFromSettings(settings);
@ -330,11 +324,11 @@ static double CompressImageWithThreads(

  double cmpTimeTotal = 0.0;
  if(fStats && settings.logStream) {
-    ThreadGroup tgrp (settings.iNumThreads, imgData, imgDataSz, fStats, settings.logStream, outBuf);
+    ThreadGroup tgrp (settings.iNumThreads, job, fStats, settings.logStream);
    cmpTimeTotal = CompressThreadGroup(tgrp, settings);
  }
  else {
-    ThreadGroup tgrp (settings.iNumThreads, imgData, imgDataSz, f, outBuf);
+    ThreadGroup tgrp (settings.iNumThreads, job, f);
    cmpTimeTotal = CompressThreadGroup(tgrp, settings);
  }

@ -342,11 +336,14 @@ static double CompressImageWithThreads(
  return cmpTime;
 }

+static double RunWorkerQueue(WorkerQueue &wq) {
+  wq.Run();
+  return wq.GetStopWatch().TimeInMilliseconds();
+}
+
 static double CompressImageWithWorkerQueue(
-  const unsigned char *imgData,
-  const unsigned int imgDataSz,
-  const SCompressionSettings &settings,
-  unsigned char *outBuf
+  const CompressionJob &job,
+  const SCompressionSettings &settings
 ) {
  CompressionFunc f = ChooseFuncFromSettings(settings);
  CompressionFuncWithStats fStats = ChooseFuncFromSettingsWithStats(settings);
@ -357,29 +354,21 @@ static double CompressImageWithWorkerQueue(
      settings.iNumCompressions,
      settings.iNumThreads,
      settings.iJobSize,
-      imgData,
-      imgDataSz,
+      job,
      fStats,
-      settings.logStream,
-      outBuf
+      settings.logStream
    );
-
-    wq.Run();
-    cmpTimeTotal = wq.GetStopWatch().TimeInMilliseconds();
+    cmpTimeTotal = RunWorkerQueue(wq);
  }
  else {
    WorkerQueue wq (
      settings.iNumCompressions,
      settings.iNumThreads,
      settings.iJobSize,
-      imgData,
-      imgDataSz,
-      f,
-      outBuf
+      job,
+      f
    );
-
-    wq.Run();
-    cmpTimeTotal = wq.GetStopWatch().TimeInMilliseconds();
+    cmpTimeTotal = RunWorkerQueue(wq);
  }

  return cmpTimeTotal / double(settings.iNumCompressions);
@ -458,7 +447,7 @@ bool CompressImageData(
  }

  uint32 numThreads = settings.iNumThreads;
-  if(settings.format == eCompressionFormat_PVRTC &&
+  if(settings.format == FasTC::eCompressionFormat_PVRTC &&
     (settings.iNumThreads > 1 || settings.logStream)) {
    if(settings.iNumThreads > 1) {
      ReportError("WARNING - PVRTC compressor does not support multithreading.");
@ -483,22 +472,22 @@ bool CompressImageData(
    return false;
  }

-  CompressionFunc f = ChooseFuncFromSettings(settings);
-  if(f) {
+  if(ChooseFuncFromSettings(settings)) {
+
+    CompressionJob cj(settings.format, data, compressedData, width, height);

    double cmpMSTime = 0.0;
-
    if(numThreads > 1) {
      if(settings.bUseAtomics) {
-        cmpMSTime = CompressImageWithAtomics(data, width, height, settings, compressedData);
+        cmpMSTime = CompressImageWithAtomics(cj, settings);
      } else if(settings.iJobSize > 0) {
-        cmpMSTime = CompressImageWithWorkerQueue(data, dataSz, settings, compressedData);
+        cmpMSTime = CompressImageWithWorkerQueue(cj, settings);
      } else {
-        cmpMSTime = CompressImageWithThreads(data, dataSz, settings, compressedData);
+        cmpMSTime = CompressImageWithThreads(cj, settings);
      }
    }
    else {
-      cmpMSTime = CompressImageInSerial(data, width, height, settings, compressedData);
+      cmpMSTime = CompressImageInSerial(cj, settings);
    }

    // Report compression time
--- a/Core/src/ThreadGroup.cpp
+++ b/Core/src/ThreadGroup.cpp
@ -49,23 +49,22 @@
 #include <cassert>
 #include <iostream>

+using FasTC::CompressionJob;
+
 CmpThread::CmpThread() 
-  : m_StartBarrier(NULL)
-  , m_ParentCounter(NULL)
+  : m_ParentCounter(NULL)
+  , m_StartBarrier(NULL)
  , m_ParentCounterLock(NULL)
  , m_FinishCV(NULL)
-  , m_Width(0)
-  , m_Height(0)
+  , m_ParentExitFlag(NULL)
+  , m_Job(CompressionJob(FasTC::kNumCompressionFormats, NULL, NULL, 0, 0))
  , m_CmpFunc(NULL)
  , m_CmpFuncWithStats(NULL)
  , m_LogStream(NULL)
-  , m_OutBuf(NULL)
-  , m_InBuf(NULL)
-  , m_ParentExitFlag(NULL)
 { }

 void CmpThread::operator()() {
-  if(!m_OutBuf || !m_InBuf 
+  if(!m_Job.OutBuf() || !m_Job.InBuf() 
     || !m_ParentCounter || !m_ParentCounterLock || !m_FinishCV
     || !m_StartBarrier
     || !m_ParentExitFlag
@ -87,11 +86,10 @@ void CmpThread::operator()() {
      return;
    }

-    CompressionJob cj (m_InBuf, m_OutBuf, m_Width, m_Height);
    if(m_CmpFunc)
-      (*m_CmpFunc)(cj);
+      (*m_CmpFunc)(m_Job);
    else
-      (*m_CmpFuncWithStats)(cj, m_LogStream);
+      (*m_CmpFuncWithStats)(m_Job, m_LogStream);

    {
      TCLock lock(*m_ParentCounterLock);
@ -102,39 +100,19 @@ void CmpThread::operator()() {
  }
 }

-ThreadGroup::ThreadGroup( int numThreads, const unsigned char *inBuf, unsigned int inBufSz, CompressionFunc func, unsigned char *outBuf )
+ThreadGroup::ThreadGroup(uint32 numThreads,
+                         const CompressionJob &job,
+                         CompressionFunc func)
  : m_StartBarrier(new TCBarrier(numThreads + 1))
  , m_FinishMutex(new TCMutex())
  , m_FinishCV(new TCConditionVariable())
  , m_NumThreads(numThreads)
  , m_ActiveThreads(0)
-  , m_ImageDataSz(inBufSz)
-  , m_ImageData(inBuf)
-  , m_OutBuf(outBuf)
+  , m_Job(job)
  , m_ThreadState(eThreadState_Done)
  , m_ExitFlag(false)
-  , m_CompressedBlockSize(
-       (func == BC7C::Compress 
-#ifdef HAS_SSE_41
-  || func == BC7C::CompressImageBC7SIMD
-#endif
-       )? 
-         16 
-       : 
-         0
-  )
-  , m_UncompressedBlockSize(
-       (func == BC7C::Compress 
-#ifdef HAS_SSE_41
-  || func == BC7C::CompressImageBC7SIMD
-#endif
-       )? 
-         64 
-       : 
-         0
-  )
 { 
-  for(int i = 0; i < kMaxNumThreads; i++) {
+  for(uint32 i = 0; i < kMaxNumThreads; i++) {
    // Thread synchronization primitives
    m_Threads[i].m_ParentCounterLock = m_FinishMutex;
    m_Threads[i].m_FinishCV = m_FinishCV;
@ -146,37 +124,21 @@ ThreadGroup::ThreadGroup( int numThreads, const unsigned char *inBuf, unsigned i
 }

 ThreadGroup::ThreadGroup( 
-  int numThreads, 
-  const unsigned char *inBuf, 
-  unsigned int inBufSz, 
+  uint32 numThreads, 
+  const CompressionJob &job,
  CompressionFuncWithStats func, 
-  std::ostream *logStream,
-  unsigned char *outBuf 
+  std::ostream *logStream
 )
  : m_StartBarrier(new TCBarrier(numThreads + 1))
  , m_FinishMutex(new TCMutex())
  , m_FinishCV(new TCConditionVariable())
  , m_NumThreads(numThreads)
  , m_ActiveThreads(0)
-  , m_ImageDataSz(inBufSz)
-  , m_ImageData(inBuf)
-  , m_OutBuf(outBuf)
+  , m_Job(job)
  , m_ThreadState(eThreadState_Done)
  , m_ExitFlag(false)
-  , m_CompressedBlockSize(
-       (func == BC7C::CompressWithStats)? 
-         16 
-       : 
-         0
-  )
-  , m_UncompressedBlockSize(
-       (func == BC7C::CompressWithStats)? 
-         64 
-       : 
-         0
-  )
 { 
-  for(int i = 0; i < kMaxNumThreads; i++) {
+  for(uint32 i = 0; i < kMaxNumThreads; i++) {
    // Thread synchronization primitives
    m_Threads[i].m_ParentCounterLock = m_FinishMutex;
    m_Threads[i].m_FinishCV = m_FinishCV;
@ -209,10 +171,11 @@ bool ThreadGroup::PrepareThreads() {

  // Divide the job's blocks as evenly as possible among the threads. Each
  // thread is handed a sub-job bounded by start and end block coordinates.
-  int numBlocks = m_ImageDataSz / 64;
-
-  int blocksProcessed = 0;
-  int blocksPerThread = (numBlocks/m_NumThreads) + ((numBlocks % m_NumThreads)? 1 : 0);
+  uint32 blockDim[2];
+  GetBlockDimensions(m_Job.Format(), blockDim);
+  uint32 numBlocks = (m_Job.Width() * m_Job.Height()) / (blockDim[0] * blockDim[1]);
+  uint32 blocksProcessed = 0;
+  uint32 blocksPerThread = (numBlocks/m_NumThreads) + ((numBlocks % m_NumThreads)? 1 : 0);

  // Currently no threads are finished...
  m_ThreadsFinished = 0;
@ -226,11 +189,22 @@ bool ThreadGroup::PrepareThreads() {
      numBlocksThisThread = numBlocks - blocksProcessed;
    }

+    uint32 start[2], end[2];
+    m_Job.BlockIdxToCoords(blocksProcessed, start);
+    m_Job.BlockIdxToCoords(blocksProcessed + numBlocksThisThread, end);
+
+    // !TODO! This should be moved to a unit test...
+    assert(m_Job.CoordsToBlockIdx(start[0], start[1]) == blocksProcessed);
+    assert(m_Job.CoordsToBlockIdx(end[0], end[1]) == blocksProcessed + numBlocksThisThread);
+
+    CompressionJob cj(m_Job.Format(),
+                      m_Job.InBuf(), m_Job.OutBuf(),
+                      m_Job.Width(), m_Job.Height(),
+                      start[0], start[1],
+                      end[0], end[1]);
+
    CmpThread &t = m_Threads[m_ActiveThreads];
-    t.m_Height = 4;
-    t.m_Width = numBlocksThisThread * 4;
-    t.m_OutBuf = m_OutBuf + (blocksProcessed * m_CompressedBlockSize);
-    t.m_InBuf = m_ImageData + (blocksProcessed * m_UncompressedBlockSize);
+    t.m_Job = cj;

    blocksProcessed += numBlocksThisThread;
    
@ -280,7 +254,7 @@ bool ThreadGroup::CleanUpThreads() {
  m_StartBarrier->Wait();

  // Clean up.
-  for(int i = 0; i < m_ActiveThreads; i++) {
+  for(uint32 i = 0; i < m_ActiveThreads; i++) {
    m_ThreadHandles[i]->Join();
    delete m_ThreadHandles[i];
  }
--- a/Core/src/ThreadGroup.h
+++ b/Core/src/ThreadGroup.h
@ -54,26 +54,19 @@ struct CmpThread : public TCCallable {
  friend class ThreadGroup;  

 private:
-  TCBarrier *m_StartBarrier;
-
-  int *m_ParentCounter;
+  uint32 *m_ParentCounter;
  
+  TCBarrier *m_StartBarrier;
  TCMutex *m_ParentCounterLock;
  TCConditionVariable *m_FinishCV;

-  int m_Width;
-  int m_Height;
+  bool *m_ParentExitFlag;

+  FasTC::CompressionJob m_Job;
  CompressionFunc m_CmpFunc;
-
  CompressionFuncWithStats m_CmpFuncWithStats;
  std::ostream *m_LogStream;

-  unsigned char *m_OutBuf;
-  const unsigned char *m_InBuf;
-
-  bool *m_ParentExitFlag;
-
  CmpThread();

 public:
@ -83,21 +76,17 @@ public:

 class ThreadGroup {
 public:
-  ThreadGroup( 
-    int numThreads, 
-    const unsigned char *inBuf, 
-    unsigned int inBufSz, 
-    CompressionFunc func, 
-    unsigned char *outBuf
+  ThreadGroup(
+    uint32 numThreads,
+    const FasTC::CompressionJob &cj,
+    CompressionFunc func
  );

-  ThreadGroup( 
-    int numThreads, 
-    const unsigned char *inBuf, 
-    unsigned int inBufSz, 
-    CompressionFuncWithStats func, 
-    std::ostream *logStream,
-    unsigned char *outBuf
+  ThreadGroup(
+    uint32 numThreads,
+    const FasTC::CompressionJob &cj,
+    CompressionFuncWithStats func,
+    std::ostream *logStream
  );

  ~ThreadGroup();
@ -121,19 +110,16 @@ class ThreadGroup {
  TCMutex *const m_FinishMutex;
  TCConditionVariable *const m_FinishCV;

-  static const int kMaxNumThreads = 256;
+  static const uint32 kMaxNumThreads = 256;
  const int m_NumThreads;

-  int m_ActiveThreads;
-  int m_ThreadsFinished;
+  uint32 m_ActiveThreads;
+  uint32 m_ThreadsFinished;

  CmpThread m_Threads[kMaxNumThreads];
  TCThread *m_ThreadHandles[kMaxNumThreads];

-  // State variables.
-  const unsigned int m_ImageDataSz;
-  const unsigned char *const m_ImageData;
-  unsigned char *m_OutBuf;
+  FasTC::CompressionJob m_Job;

  StopWatch m_StopWatch;

@ -141,9 +127,6 @@ class ThreadGroup {
  bool m_ExitFlag;

  std::ostream *m_LogStream;
-
-  const unsigned int m_CompressedBlockSize;
-  const unsigned int m_UncompressedBlockSize;
 };

 #endif // _THREAD_GROUP_H_
--- a/Core/src/WorkerQueue.cpp
+++ b/Core/src/WorkerQueue.cpp
@ -51,6 +51,8 @@

 #include "BC7Compressor.h"

+using FasTC::CompressionJob;
+
 template <typename T>
 static inline void clamp(T &x, const T &min, const T &max) {
  if(x < min) x = min;
@ -98,10 +100,19 @@ void WorkerThread::operator()() {

      case eAction_DoWork:
      {
-        const uint8 *src = m_Parent->GetSrcForThread(m_ThreadIdx);
-        uint8 *dst = m_Parent->GetDstForThread(m_ThreadIdx);
+        const CompressionJob &job = m_Parent->GetCompressionJob();

-        CompressionJob cj (src, dst, 4 * m_Parent->GetNumBlocksForThread(m_ThreadIdx), 4);
+        uint32 start[2];
+        m_Parent->GetStartForThread(m_ThreadIdx, start);
+
+        uint32 end[2];
+        m_Parent->GetEndForThread(m_ThreadIdx, end);
+
+        CompressionJob cj (job.Format(),
+                           job.InBuf(), job.OutBuf(),
+                           job.Width(), job.Height(),
+                           start[0], start[1],
+                           end[0], end[1]);
        if(f)
          (*f)(cj);
        else
@ -128,10 +139,8 @@ WorkerQueue::WorkerQueue(
  uint32 numCompressions,
  uint32 numThreads,
  uint32 jobSize,
-  const uint8 *inBuf,
-  uint32 inBufSz,
-  CompressionFunc func,
-  uint8 *outBuf
+  const CompressionJob &job,
+  CompressionFunc func
 )
  : m_NumCompressions(0)
  , m_TotalNumCompressions(std::max(uint32(1), numCompressions))
@ -139,32 +148,22 @@ WorkerQueue::WorkerQueue(
  , m_WaitingThreads(0)
  , m_ActiveThreads(0)
  , m_JobSize(std::max(uint32(1), jobSize))
-  , m_InBufSz(inBufSz)
-  , m_InBuf(inBuf)
-  , m_OutBuf(outBuf)
+  , m_Job(job)
  , m_NextBlock(0)
  , m_CompressionFunc(func)
  , m_CompressionFuncWithStats(NULL)
  , m_LogStream(NULL)
 {
  clamp(m_NumThreads, uint32(1), uint32(kMaxNumWorkerThreads));
-
-#ifndef NDEBUG
-  if(m_InBufSz % 64) {
-    fprintf(stderr, "WorkerQueue.cpp -- WARNING: InBufSz not a multiple of 64. Are you sure that your image dimensions are correct?\n");
-  }
-#endif
 }

 WorkerQueue::WorkerQueue(
  uint32 numCompressions,
  uint32 numThreads, 
  uint32 jobSize,
-  const uint8 *inBuf, 
-  uint32 inBufSz, 
+  const CompressionJob &job,
  CompressionFuncWithStats func, 
-  std::ostream *logStream,
-  uint8 *outBuf
+  std::ostream *logStream
 )
  : m_NumCompressions(0)
  , m_TotalNumCompressions(std::max(uint32(1), numCompressions))
@ -172,21 +171,13 @@ WorkerQueue::WorkerQueue(
  , m_WaitingThreads(0)
  , m_ActiveThreads(0)
  , m_JobSize(std::max(uint32(1), jobSize))
-  , m_InBufSz(inBufSz)
-  , m_InBuf(inBuf)
-  , m_OutBuf(outBuf)
+  , m_Job(job)
  , m_NextBlock(0)
  , m_CompressionFunc(NULL)
  , m_CompressionFuncWithStats(func)
  , m_LogStream(logStream)
 {
  clamp(m_NumThreads, uint32(1), uint32(kMaxNumWorkerThreads));
-
-#ifndef NDEBUG
-  if(m_InBufSz % 64) {
-    fprintf(stderr, "WorkerQueue.cpp -- WARNING: InBufSz not a multiple of 64. Are you sure that your image dimensions are correct?\n");
-  }
-#endif
 }

 void WorkerQueue::Run() {
@ -234,7 +225,9 @@ WorkerThread::EAction WorkerQueue::AcceptThreadData(uint32 threadIdx) {
  }

  // How many blocks total do we have?
-  const uint32 totalBlocks = m_InBufSz / 64;
+  uint32 blockDim[2];
+  GetBlockDimensions(m_Job.Format(), blockDim);
+  const uint32 totalBlocks = (m_Job.Width() * m_Job.Height()) / (blockDim[0] * blockDim[1]);
  
  // Make sure we have exclusive access...
  TCLock lock(m_Mutex);
@ -273,28 +266,21 @@ WorkerThread::EAction WorkerQueue::AcceptThreadData(uint32 threadIdx) {
  return WorkerThread::eAction_DoWork;
 }

-const uint8 *WorkerQueue::GetSrcForThread(const int threadIdx) const {
-  assert(m_Offsets[threadIdx] >= 0);
+void WorkerQueue::GetStartForThread(const uint32 threadIdx, uint32 (&start)[2]) {
  assert(threadIdx >= 0);
  assert(threadIdx < int(m_NumThreads));
+  assert(m_Offsets[threadIdx] >= 0);

-  const uint32 inBufBlockSz = 16 * 4;
-  return m_InBuf + m_Offsets[threadIdx] * inBufBlockSz;
+  const uint32 blockIdx = m_Offsets[threadIdx];
+  m_Job.BlockIdxToCoords(blockIdx, start);
 }

-uint8 *WorkerQueue::GetDstForThread(const int threadIdx) const {
-  assert(m_Offsets[threadIdx] >= 0);
+void WorkerQueue::GetEndForThread(const uint32 threadIdx, uint32 (&end)[2]) {
  assert(threadIdx >= 0);
  assert(threadIdx < int(m_NumThreads));
-
-  const uint32 outBufBlockSz = 16;
-  return m_OutBuf + m_Offsets[threadIdx] * outBufBlockSz;
-}
-
-uint32 WorkerQueue::GetNumBlocksForThread(const int threadIdx) const {
  assert(m_Offsets[threadIdx] >= 0);
-  assert(threadIdx >= 0);
-  assert(threadIdx < int(m_NumThreads));
+  assert(m_NumBlocks[threadIdx] >= 0);

-  return m_NumBlocks[threadIdx];
+  const uint32 blockIdx = m_Offsets[threadIdx] + m_NumBlocks[threadIdx];
+  m_Job.BlockIdxToCoords(blockIdx, end);
 }
--- a/Core/src/WorkerQueue.h
+++ b/Core/src/WorkerQueue.h
@ -81,23 +81,19 @@ class WorkerQueue {
 public:
  WorkerQueue(
    uint32 numCompressions,
-    uint32 numThreads, 
+    uint32 numThreads,
    uint32 jobSize,
-    const uint8 *inBuf, 
-    uint32 inBufSz, 
-    CompressionFunc func, 
-    uint8 *outBuf
+    const FasTC::CompressionJob &job,
+    CompressionFunc func
  );

  WorkerQueue(
    uint32 numCompressions,
    uint32 numThreads, 
    uint32 jobSize,
-    const uint8 *inBuf, 
-    uint32 inBufSz, 
+    const FasTC::CompressionJob &job,
    CompressionFuncWithStats func, 
-    std::ostream *logStream,
-    uint8 *outBuf
+    std::ostream *logStream
  );

  ~WorkerQueue() { }
@ -113,9 +109,7 @@ class WorkerQueue {
  uint32 m_WaitingThreads;
  uint32 m_ActiveThreads;
  uint32 m_JobSize;
-  uint32 m_InBufSz;
-  const uint8 *m_InBuf;
-  uint8 *m_OutBuf;
+  FasTC::CompressionJob m_Job;

  TCConditionVariable m_CV;
  TCMutex m_Mutex;
@ -129,9 +123,9 @@ class WorkerQueue {
  WorkerThread *m_Workers[kMaxNumWorkerThreads];
  TCThread *m_ThreadHandles[kMaxNumWorkerThreads];

-  const uint8 *GetSrcForThread(const int threadIdx) const;
-  uint8 *GetDstForThread(const int threadIdx) const;
-  uint32 GetNumBlocksForThread(const int threadIdx) const;
+  const FasTC::CompressionJob &GetCompressionJob() const { return m_Job; }  // the job shared by all workers
+  void GetStartForThread(const uint32 threadIdx, uint32 (&start)[2]);  // coords of the thread's first block
+  void GetEndForThread(const uint32 threadIdx, uint32 (&end)[2]);  // coords of block (offset + count) for the thread
  
  const CompressionFunc m_CompressionFunc;
  CompressionFunc GetCompressionFunc() const { return m_CompressionFunc; }
--- a/DXTEncoder/include/DXTCompressor.h
+++ b/DXTEncoder/include/DXTCompressor.h
@ -16,10 +16,10 @@
 namespace DXTC
 {
  // DXT compressor (scalar version).
-  void CompressImageDXT1(const CompressionJob &);
-  void CompressImageDXT5(const CompressionJob &);
+  void CompressImageDXT1(const FasTC::CompressionJob &);
+  void CompressImageDXT5(const FasTC::CompressionJob &);

-  void DecompressDXT1(const DecompressionJob &);
+  void DecompressDXT1(const FasTC::DecompressionJob &);

  uint16 ColorTo565(const uint8* color);
  void EmitByte(uint8*& dest, uint8 b);
--- a/DXTEncoder/src/DXTCompressor.cpp
+++ b/DXTEncoder/src/DXTCompressor.cpp
@ -25,7 +25,7 @@
 namespace DXTC
 {
  // Function prototypes
-  void ExtractBlock(const uint8* inPtr, uint32 width, uint8* colorBlock);
+  void ExtractBlock(const uint32* inPtr, uint32 width, uint8* colorBlock);
  void GetMinMaxColors(const uint8* colorBlock, uint8* minColor, uint8* maxColor);
  void GetMinMaxColorsWithAlpha(const uint8* colorBlock, uint8* minColor, uint8* maxColor);
  void EmitColorIndices(const uint8* colorBlock, uint8*& outBuf, const uint8* minColor, const uint8* maxColor);
@ -35,23 +35,30 @@ namespace DXTC
  // 4-byte RGBA format. The width and height parameters specify the size of the image in pixels.
  // The buffer pointed to by outBuf should be large enough to store the compressed image. This
  // implementation has an 8:1 compression ratio.
-  void CompressImageDXT1(const CompressionJob &cj) {
+  void CompressImageDXT1(const FasTC::CompressionJob &cj) {
    uint8 block[64];
    uint8 minColor[4];
    uint8 maxColor[4];

-    uint8 *outBuf = cj.OutBuf();
-    const uint8 *inBuf = cj.InBuf();
-    for(int j = 0; j < cj.Height(); j += 4, inBuf += cj.Width() * 4 * 4)
-    {
-      for(int i = 0; i < cj.Width(); i += 4)
-      {
-        ExtractBlock(inBuf + i * 4, cj.Width(), block);
+    const uint32 kBlockSz = GetBlockSize(FasTC::eCompressionFormat_DXT1);
+    const uint32 startBlock = cj.CoordsToBlockIdx(cj.XStart(), cj.YStart());
+    uint8 *outBuf = cj.OutBuf() + startBlock * kBlockSz;
+
+    const uint32 *inPixels = reinterpret_cast<const uint32 *>(cj.InBuf());
+    uint32 startX = cj.XStart();
+    bool done = false;
+    for(uint32 j = cj.YStart(); !done; j += 4) {
+      for(uint32 i = startX; !done && i < cj.Width(); i += 4) {
+
+        const uint32 kOffset = j*cj.Width() + i;
+        ExtractBlock(inPixels + kOffset, cj.Width(), block);
        GetMinMaxColors(block, minColor, maxColor);
        EmitWord(outBuf, ColorTo565(maxColor));
        EmitWord(outBuf, ColorTo565(minColor));
        EmitColorIndices(block, outBuf, minColor, maxColor);
+        done = i+4 >= cj.XEnd() && j+(i+4 == cj.Width()? 4 : 0) >= cj.YEnd();  // NOTE(review): on a row's last block this holds as soon as j+4 >= YEnd whatever XEnd is, so a range ending mid-row on row YEnd looks cut short - confirm
      }
+      startX = 0;
    }
  }

@ -59,18 +66,23 @@ namespace DXTC
  // 4-byte RGBA format. The width and height parameters specify the size of the image in pixels.
  // The buffer pointed to by outBuf should be large enough to store the compressed image. This
  // implementation has an 4:1 compression ratio.
-  void CompressImageDXT5(const CompressionJob &cj) {
+  void CompressImageDXT5(const FasTC::CompressionJob &cj) {
    uint8 block[64];
    uint8 minColor[4];
    uint8 maxColor[4];

-    uint8 *outBuf = cj.OutBuf();
-    const uint8 *inBuf = cj.InBuf();
-    for(int j = 0; j < cj.Height(); j += 4, inBuf += cj.Width() * 4 * 4)
-    {
-      for(int i = 0; i < cj.Width(); i += 4)
-      {
-        ExtractBlock(inBuf + i * 4, cj.Width(), block);
+    const uint32 kBlockSz = GetBlockSize(FasTC::eCompressionFormat_DXT5);
+    const uint32 startBlock = cj.CoordsToBlockIdx(cj.XStart(), cj.YStart());
+    uint8 *outBuf = cj.OutBuf() + startBlock * kBlockSz;
+    
+    const uint32 *inPixels = reinterpret_cast<const uint32 *>(cj.InBuf());
+    uint32 startX = cj.XStart();
+    bool done = false;
+    for(uint32 j = cj.YStart(); !done; j += 4) {
+      for(uint32 i = startX; !done && i < cj.Width(); i += 4) {
+
+        const uint32 kOffset = j*cj.Width() + i;
+        ExtractBlock(inPixels + kOffset, cj.Width(), block);
        GetMinMaxColorsWithAlpha(block, minColor, maxColor);
        EmitByte(outBuf, maxColor[3]);
        EmitByte(outBuf, minColor[3]);
@ -78,6 +90,9 @@ namespace DXTC
        EmitWord(outBuf, ColorTo565(maxColor));
        EmitWord(outBuf, ColorTo565(minColor));
        EmitColorIndices(block, outBuf, minColor, maxColor);
+        done = i+4 >= cj.XEnd() && j+(i+4 == cj.Width()? 4 : 0) >= cj.YEnd();  // NOTE(review): on a row's last block this holds as soon as j+4 >= YEnd whatever XEnd is, so a range ending mid-row on row YEnd looks cut short - confirm
      }
+      // Only the first row begins at XStart; every later row begins at x = 0.
+      startX = 0;
    }
  }
@ -115,12 +128,12 @@ namespace DXTC

  // Extract a 4 by 4 block of pixels from inPtr and store it in colorBlock. The width parameter
  // specifies the size of the image in pixels.
-  void ExtractBlock(const uint8* inPtr, uint32 width, uint8* colorBlock)
+  void ExtractBlock(const uint32* inPtr, uint32 width, uint8* colorBlock)
  {
    for(int j = 0; j < 4; j++)
    {
      memcpy(&colorBlock[j * 4 * 4], inPtr, 4 * 4);
-      inPtr += width * 4;
+      inPtr += width;
    }
  }

@ -129,7 +142,7 @@ namespace DXTC
  // channel.
  void GetMinMaxColors(const uint8* colorBlock, uint8* minColor, uint8* maxColor)
  {
-    int32 i;
+    uint32 i;
    uint8 inset[3];

    minColor[0] = minColor[1] = minColor[2] = 255;
@ -177,7 +190,7 @@ namespace DXTC
  // the extents of the bounding box of the color space. This function includes the alpha channel.
  void GetMinMaxColorsWithAlpha(const uint8* colorBlock, uint8* minColor, uint8* maxColor)
  {
-    int32 i;
+    uint32 i;
    uint8 inset[4];

    minColor[0] = minColor[1] = minColor[2] = minColor[3] = 255;
@ -299,7 +312,7 @@ namespace DXTC

    colorBlock += 3;

-    for(int i = 0; i < 16; i++) {
+    for(uint32 i = 0; i < 16; i++) {
      uint8 a = colorBlock[i * 4];
      int32 b1 = (a <= ab1);
      int32 b2 = (a <= ab2);
--- a/DXTEncoder/src/DXTDecompressor.cpp
+++ b/DXTEncoder/src/DXTDecompressor.cpp
@ -90,7 +90,7 @@ namespace DXTC
    }
  }

-  void DecompressDXT1(const DecompressionJob &dcj)
+  void DecompressDXT1(const FasTC::DecompressionJob &dcj)
  {
    assert(!(dcj.Height() & 3));
    assert(!(dcj.Width() & 3));
@ -98,13 +98,13 @@ namespace DXTC
    uint32 blockW = dcj.Width() >> 2;
    uint32 blockH = dcj.Height() >> 2;

-    const uint32 blockSz = 8;
+    const uint32 blockSz = GetBlockSize(FasTC::eCompressionFormat_DXT1);

    uint32 *outPixels = reinterpret_cast<uint32 *>(dcj.OutBuf());

    uint32 outBlock[16];
-    for(int j = 0; j < blockH; j++) {
-      for(int i = 0; i < blockW; i++) {
+    for(uint32 j = 0; j < blockH; j++) {
+      for(uint32 i = 0; i < blockW; i++) {

        uint32 offset = (j * blockW + i) * blockSz;
        DecompressDXT1Block(dcj.InBuf() + offset, outBlock);
--- a/ETCEncoder/include/ETCCompressor.h
+++ b/ETCEncoder/include/ETCCompressor.h
@ -61,13 +61,13 @@ namespace ETCC {
  // Takes a stream of compressed ETC1 data and decompresses it into R8G8B8A8
  // format. The width and height must be specified in order to properly
  // decompress the data.
-  void Decompress(const DecompressionJob &);
+  void Decompress(const FasTC::DecompressionJob &);

  // Takes a stream of uncompressed RGBA8 data and compresses it into ETC1
  // version one. The width and height must be specified in order to properly
  // decompress the data. This uses the library created by Rich Geldreich found here:
  // https://code.google.com/p/rg-etc1
-  void Compress_RG(const CompressionJob &);
+  void Compress_RG(const FasTC::CompressionJob &);

 }  // namespace ETCC

--- a/ETCEncoder/src/Compressor.cpp
+++ b/ETCEncoder/src/Compressor.cpp
@ -52,32 +52,37 @@

 #include "rg_etc1.h"
 #include "ETCCompressor.h"
+#include <cstring>

 namespace ETCC {

-  void Compress_RG(const CompressionJob &cj) {
+  void Compress_RG(const FasTC::CompressionJob &cj) {

    rg_etc1::etc1_pack_params params;
    params.m_quality = rg_etc1::cLowQuality;
    rg_etc1::pack_etc1_block_init();

-    // Assume block-stream order
-    uint32 blockSizeX = cj.Width() / 4;
-    uint32 blockSizeY = cj.Height() / 4;
+    const uint32 kBlockSz = GetBlockSize(FasTC::eCompressionFormat_ETC1);
+    const uint32 startBlock = cj.CoordsToBlockIdx(cj.XStart(), cj.YStart());
+    uint8 *outBuf = cj.OutBuf() + startBlock * kBlockSz;
+    uint32 startX = cj.XStart();
+    bool done = false;
+    for(uint32 j = cj.YStart(); !done; j += 4) {
+      for(uint32 i = startX; !done && i < cj.Width(); i += 4) {

-    for(uint32 j = 0; j < blockSizeY; j++)
-    for(uint32 i = 0; i < blockSizeX; i++) {
-      uint32 pixels[16];
-      uint32 blockIdx = j*blockSizeX + i;
+        uint32 pixels[16];
+        const uint32 *inPixels = reinterpret_cast<const uint32 *>(cj.InBuf());
+        memcpy(pixels, inPixels + j*cj.Width() + i, 4 * sizeof(uint32));
+        memcpy(pixels + 4, inPixels + (j+1)*cj.Width() + i, 4 * sizeof(uint32));
+        memcpy(pixels + 8, inPixels + (j+2)*cj.Width() + i, 4 * sizeof(uint32));
+        memcpy(pixels + 12, inPixels + (j+3)*cj.Width() + i, 4 * sizeof(uint32));

-      for(uint32 y = 0; y < 4; y++) {
-        for(uint32 x = 0; x < 4; x++) {
-          const uint32 *in = reinterpret_cast<const uint32 *>(cj.InBuf());
-          pixels[y*4 + x] = in[(j*4 + y)*cj.Width() + (i*4 + x)];
-        }
+        pack_etc1_block(outBuf, pixels, params);
+
+        outBuf += kBlockSz;
+        done = i+4 >= cj.XEnd() && j+(i+4 == cj.Width()? 4 : 0) >= cj.YEnd();  // NOTE(review): on a row's last block this holds as soon as j+4 >= YEnd whatever XEnd is, so a range ending mid-row on row YEnd looks cut short - confirm
      }
-
-      pack_etc1_block(cj.OutBuf() + blockIdx * 8, pixels, params);
+      startX = 0;
    }
  }
 }  // namespace ETCC
--- a/ETCEncoder/src/Decompressor.cpp
+++ b/ETCEncoder/src/Decompressor.cpp
@ -55,7 +55,7 @@

 namespace ETCC {

-  void Decompress(const DecompressionJob &cj) {
+  void Decompress(const FasTC::DecompressionJob &cj) {

    uint32 blocksX = cj.Width() / 4;
    uint32 blocksY = cj.Height() / 4;
--- a/PVRTCEncoder/include/PVRTCCompressor.h
+++ b/PVRTCEncoder/include/PVRTCCompressor.h
@ -69,7 +69,7 @@ namespace PVRTCC {
  // Takes a stream of compressed PVRTC data and decompresses it into R8G8B8A8
  // format. The width and height must be specified in order to properly
  // decompress the data.
-  void Decompress(const DecompressionJob &,
+  void Decompress(const FasTC::DecompressionJob &,
                  bool bTwoBitMode = false,
                  const EWrapMode wrapMode = eWrapMode_Wrap,
                  bool bDebugImages = false);
@ -77,12 +77,12 @@ namespace PVRTCC {
  // Takes a stream of uncompressed RGBA8 data and compresses it into PVRTC
  // version one. The width and height must be specified in order to properly
  // decompress the data.
-  void Compress(const CompressionJob &,
+  void Compress(const FasTC::CompressionJob &,
                bool bTwoBitMode = false,
                const EWrapMode wrapMode = eWrapMode_Wrap);

 #ifdef PVRTEXLIB_FOUND
-  void CompressPVRLib(const CompressionJob &,
+  void CompressPVRLib(const FasTC::CompressionJob &,
                      bool bTwoBitMode = false,
                      const EWrapMode wrapMode = eWrapMode_Wrap);
 #endif
--- a/PVRTCEncoder/src/Compressor.cpp
+++ b/PVRTCEncoder/src/Compressor.cpp
@ -917,7 +917,7 @@ namespace PVRTCC {
  }
 #endif

-  void Compress(const CompressionJob &cj, bool bTwoBit, EWrapMode wrapMode) {
+  void Compress(const FasTC::CompressionJob &cj, bool bTwoBit, EWrapMode wrapMode) {
    const uint32 width = cj.Width();
    const uint32 height = cj.Height();

@ -925,6 +925,11 @@ namespace PVRTCC {
    assert((width & (width - 1)) == 0);
    assert((height & (height - 1)) == 0);

+    // This compressor only handles whole textures: the job must start at the
+    // origin and end at the full width and height (no sub-range split for threading).
+    assert(cj.XStart() == 0 && cj.YStart() == 0);
+    assert(cj.XEnd() == cj.Width() && cj.YEnd() == cj.Height());
+
    CompressionLabel *labels =
      (CompressionLabel *)calloc(width * height, sizeof(CompressionLabel));

--- a/PVRTCEncoder/src/CompressorPVRLib.cpp
+++ b/PVRTCEncoder/src/CompressorPVRLib.cpp
@ -60,9 +60,9 @@

 namespace PVRTCC {

-  void CompressPVRLib(const CompressionJob &cj,
-                  bool bTwoBitMode,
-                  const EWrapMode) {
+  void CompressPVRLib(const FasTC::CompressionJob &cj,
+                      bool bTwoBitMode,
+                      const EWrapMode) {
    pvrtexture::CPVRTextureHeader pvrTexHdr;
    pvrTexHdr.setPixelFormat(pvrtexture::PVRStandard8PixelType);
    pvrTexHdr.setWidth(cj.Width());
--- a/PVRTCEncoder/src/Decompressor.cpp
+++ b/PVRTCEncoder/src/Decompressor.cpp
@ -273,7 +273,7 @@ namespace PVRTCC {
    }
  }

-  void Decompress(const DecompressionJob &dcj,
+  void Decompress(const FasTC::DecompressionJob &dcj,
                  const bool bTwoBitMode,
                  const EWrapMode wrapMode,
                  bool bDebugImages) {
--- a/PVRTCEncoder/test/DecompTestPVR.cpp
+++ b/PVRTCEncoder/test/DecompTestPVR.cpp
@ -81,7 +81,8 @@ class ImageTester {

    uint32 *outPixels = new uint32[w * h];

-    DecompressionJob dcj(data, reinterpret_cast<uint8 *>(outPixels), w, h);
+    FasTC::DecompressionJob dcj(FasTC::eCompressionFormat_PVRTC,
+                                data, reinterpret_cast<uint8 *>(outPixels), w, h);
 #ifdef OUTPUT_DEBUG_IMAGE
    PVRTCC::Decompress(dcj, twobpp, PVRTCC::eWrapMode_Wrap, true);
 #else
--- a/PVRTCEncoder/test/DecompressorTest.cpp
+++ b/PVRTCEncoder/test/DecompressorTest.cpp
@ -56,6 +56,8 @@

 #include "PVRTCCompressor.h"

+static const FasTC::ECompressionFormat kFmt = FasTC::eCompressionFormat_PVRTC;
+
 TEST(Decompressor, DecompressWhite) {
  const uint32 kWidth = 32;
  const uint32 kHeight = 32;
@ -69,7 +71,7 @@ TEST(Decompressor, DecompressWhite) {

  uint8 outData[4 * kWidth * kHeight];

-  DecompressionJob dcj (pvrData, outData, kWidth, kHeight);
+  FasTC::DecompressionJob dcj (kFmt, pvrData, outData, kWidth, kHeight);
  PVRTCC::Decompress(dcj);

  for(uint32 i = 0; i < kWidth; i++) {
@ -94,7 +96,7 @@ TEST(Decompressor, DecompressGray) {

  uint8 outData[4 * kWidth * kHeight];

-  DecompressionJob dcj (pvrData, outData, kWidth, kHeight);
+  FasTC::DecompressionJob dcj (kFmt, pvrData, outData, kWidth, kHeight);
  PVRTCC::Decompress(dcj);

  for(uint32 i = 0; i < kWidth; i++) {