/* FasTC * Copyright (c) 2012 University of North Carolina at Chapel Hill. * All rights reserved. * * Permission to use, copy, modify, and distribute this software and its * documentation for educational, research, and non-profit purposes, without * fee, and without a written agreement is hereby granted, provided that the * above copyright notice, this paragraph, and the following four paragraphs * appear in all copies. * * Permission to incorporate this software into commercial products may be * obtained by contacting the authors or the Office of Technology Development * at the University of North Carolina at Chapel Hill . * * This software program and documentation are copyrighted by the University of * North Carolina at Chapel Hill. The software program and documentation are * supplied "as is," without any accompanying services from the University of * North Carolina at Chapel Hill or the authors. The University of North * Carolina at Chapel Hill and the authors do not warrant that the operation of * the program will be uninterrupted or error-free. The end-user understands * that the program was developed for research purposes and is advised not to * rely exclusively on the program for any reason. * * IN NO EVENT SHALL THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL OR THE * AUTHORS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, * OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF NORTH CAROLINA * AT CHAPEL HILL OR THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL AND THE AUTHORS SPECIFICALLY * DISCLAIM ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY * STATUTORY WARRANTY OF NON-INFRINGEMENT. 
THE SOFTWARE PROVIDED HEREUNDER IS ON * AN "AS IS" BASIS, AND THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL AND * THE AUTHORS HAVE NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, * ENHANCEMENTS, OR MODIFICATIONS. * * Please send all BUG REPORTS to . * * The authors may be contacted via: * * Pavel Krajcevski * Dept of Computer Science * 201 S Columbia St * Frederick P. Brooks, Jr. Computer Science Bldg * Chapel Hill, NC 27599-3175 * USA * * */ // The original license from the code available at the following location: // http://software.intel.com/en-us/vcsource/samples/fast-texture-compression // // This code has been modified significantly from the original. //------------------------------------------------------------------------------ // Copyright 2011 Intel Corporation // All Rights Reserved // // Permission is granted to use, copy, distribute and prepare derivative works // of this software for any purpose and without fee, provided, that the above // copyright notice and this statement appear in all copies. Intel makes no // representations about the suitability of this software for any purpose. THIS // SOFTWARE IS PROVIDED "AS IS." INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, // EXPRESS OR IMPLIED, AND ALL LIABILITY, INCLUDING CONSEQUENTIAL AND OTHER // INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE, INCLUDING LIABILITY FOR // INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE WARRANTIES OF // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Intel does not assume // any responsibility for any errors which may appear in this software nor any // responsibility to update it. 
// //------------------------------------------------------------------------------ #include "FasTC/BPTCCompressor.h" #include "FasTC/TexCompTypes.h" #include "FasTC/BitStream.h" using FasTC::BitStream; using FasTC::BitStreamReadOnly; #include "FasTC/Shapes.h" #include "CompressionMode.h" #include "BCLookupTables.h" #include "RGBAEndpoints.h" #ifdef HAS_MSVC_ATOMICS # include "Windows.h" #endif #ifdef _MSC_VER # undef min # undef max #endif // _MSC_VER #include #include #include #include #include #include #include #include #include #include #include enum EBlockStats { eBlockStat_Path, eBlockStat_Mode, eBlockStat_SingleShapeEstimate, eBlockStat_TwoShapeEstimate, eBlockStat_ThreeShapeEstimate, eBlockStat_ModeZeroEstimate, eBlockStat_ModeOneEstimate, eBlockStat_ModeTwoEstimate, eBlockStat_ModeThreeEstimate, eBlockStat_ModeFourEstimate, eBlockStat_ModeFiveEstimate, eBlockStat_ModeSixEstimate, eBlockStat_ModeSevenEstimate, eBlockStat_ModeZeroError, eBlockStat_ModeOneError, eBlockStat_ModeTwoError, eBlockStat_ModeThreeError, eBlockStat_ModeFourError, eBlockStat_ModeFiveError, eBlockStat_ModeSixError, eBlockStat_ModeSevenError, kNumBlockStats }; static const char *kBlockStatString[kNumBlockStats] = { "BlockStat_Path", "BlockStat_Mode", "BlockStat_SingleShapeEstimate", "BlockStat_TwoShapeEstimate", "BlockStat_ThreeShapeEstimate", "BlockStat_ModeZeroEstimate", "BlockStat_ModeOneEstimate", "BlockStat_ModeTwoEstimate", "BlockStat_ModeThreeEstimate", "BlockStat_ModeFourEstimate", "BlockStat_ModeFiveEstimate", "BlockStat_ModeSixEstimate", "BlockStat_ModeSevenEstimate", "BlockStat_ModeZeroError", "BlockStat_ModeOneError", "BlockStat_ModeTwoError", "BlockStat_ModeThreeError", "BlockStat_ModeFourError", "BlockStat_ModeFiveError", "BlockStat_ModeSixError", "BlockStat_ModeSevenError", }; namespace BPTCC { static const int kAnchorIdx2[kNumShapes2] = { 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 2, 8, 2, 2, 8, 8, 15, 2 , 8, 2, 2, 8, 8, 2, 2, 15, 15, 6, 8, 2, 8, 
15, 15, 2 , 8, 2, 2, 2, 15, 15, 6, 6 , 2, 6, 8, 15, 15, 2, 2, 15, 15, 15, 15, 15, 2, 2, 15 }; static const uint32 kWMValues[] = { 0x32b92180, 0x32ba3080, 0x31103200, 0x28103c80, 0x32bb3080, 0x25903600, 0x3530b900, 0x3b32b180, 0x34b5b98 }; static const uint32 kNumWMVals = sizeof(kWMValues) / sizeof(kWMValues[0]); static uint32 gWMVal = -1; static const int kAnchorIdx3[2][kNumShapes3] = { {3, 3, 15, 15, 8, 3, 15, 15, 8 , 8, 6, 6, 6, 5, 3, 3, 3 , 3, 8, 15, 3, 3, 6, 10, 5 , 8, 8, 6, 8, 5, 15, 15, 8 , 15, 3, 5, 6, 10, 8, 15, 15, 3, 15, 5, 15, 15, 15, 15, 3 , 15, 5, 5, 5, 8, 5, 10, 5 , 10, 8, 13, 15, 12, 3, 3 }, {15, 8, 8, 3, 15, 15, 3, 8, 15 , 15, 15, 15, 15, 15, 15, 8, 15 , 8, 15, 3, 15, 8, 15, 8, 3 , 15, 6, 10, 15, 15, 10, 8, 15 , 3, 15, 10, 10, 8, 9, 10, 6 , 15, 8, 15, 3, 6, 6, 8, 15 , 3, 15, 15, 15, 15, 15, 15, 15 , 15, 15, 15, 3, 15, 15, 8 } }; template static inline T sad(const T &a, const T &b) { return (a > b)? a - b : b - a; } static uint32 GetAnchorIndexForSubset( int subset, const int shapeIdx, const int nSubsets ) { int anchorIdx = 0; switch(subset) { case 1: { if(nSubsets == 2) { anchorIdx = kAnchorIdx2[shapeIdx]; } else { anchorIdx = kAnchorIdx3[0][shapeIdx]; } } break; case 2: { assert(nSubsets == 3); anchorIdx = kAnchorIdx3[1][shapeIdx]; } break; default: break; } return anchorIdx; } template static void insert(T* buf, int bufSz, T newVal, int idx = 0) { int safeIdx = std::min(bufSz-1, std::max(idx, 0)); for(int i = bufSz - 1; i > safeIdx; i--) { buf[i] = buf[i-1]; } buf[safeIdx] = newVal; } template static inline void swap(T &a, T &b) { T t = a; a = b; b = t; } const uint32 kInterpolationValues[4][16][2] = { { {64, 0}, {33, 31}, {0, 64}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} }, { {64, 0}, {43, 21}, {21, 43}, {0, 64}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} }, { {64, 0}, {55, 9}, {46, 18}, {37, 27}, {27, 37}, {18, 46}, {9, 55}, {0, 64}, 
{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} }, { {64, 0}, {60, 4}, {55, 9}, {51, 13}, {47, 17}, {43, 21}, {38, 26}, {34, 30}, {30, 34}, {26, 38}, {21, 43}, {17, 47}, {13, 51}, {9, 55}, {4, 60}, {0, 64}} }; CompressionMode::Attributes CompressionMode::kModeAttributes[kNumModes] = { // Mode 0 { 0, 4, 3, 3, 0, 4, 0, false, false, CompressionMode::ePBitType_NotShared }, // Mode 1 { 1, 6, 2, 3, 0, 6, 0, false, false, CompressionMode::ePBitType_Shared }, // Mode 2 { 2, 6, 3, 2, 0, 5, 0, false, false, CompressionMode::ePBitType_None }, // Mode 3 { 3, 6, 2, 2, 0, 7, 0, false, false, CompressionMode::ePBitType_NotShared }, // Mode 4 { 4, 0, 1, 2, 3, 5, 6, true, true, CompressionMode::ePBitType_None }, // Mode 5 { 5, 0, 1, 2, 2, 7, 8, true, false, CompressionMode::ePBitType_None }, // Mode 6 { 6, 0, 1, 4, 0, 7, 7, false, false, CompressionMode::ePBitType_NotShared }, // Mode 7 { 7, 6, 2, 2, 0, 5, 5, false, false, CompressionMode::ePBitType_NotShared }, }; ALIGN_SSE const float kErrorMetrics[kNumErrorMetrics][kNumColorChannels] = { { 1.0f, 1.0f, 1.0f, 1.0f }, { sqrtf(0.3f), sqrtf(0.56f), sqrtf(0.11f), 1.0f } }; const float *GetErrorMetric(ErrorMetric e) { return kErrorMetrics[e]; } void CompressionMode::ClampEndpointsToGrid( RGBAVector &p1, RGBAVector &p2, uint8 &bestPBitCombo ) const { const int nPbitCombos = GetNumPbitCombos(); const bool hasPbits = nPbitCombos > 1; const uint32 qmask = GetQuantizationMask(); ClampEndpoints(p1, p2); // !SPEED! This can be faster. 
float minDist = FLT_MAX; RGBAVector bp1, bp2; for(int i = 0; i < nPbitCombos; i++) { uint32 qp1, qp2; if(hasPbits) { qp1 = p1.ToPixel(qmask, GetPBitCombo(i)[0]); qp2 = p2.ToPixel(qmask, GetPBitCombo(i)[1]); } else { qp1 = p1.ToPixel(qmask); qp2 = p2.ToPixel(qmask); } RGBAVector np1 = RGBAVector(0, qp1); RGBAVector np2 = RGBAVector(0, qp2); RGBAVector d1 = np1 - p1; RGBAVector d2 = np2 - p2; float dist = (d1 * d1) + (d2 * d2); if(dist < minDist) { minDist = dist; bp1 = np1; bp2 = np2; bestPBitCombo = i; } } p1 = bp1; p2 = bp2; } double CompressionMode::CompressSingleColor( const RGBAVector &p, RGBAVector &p1, RGBAVector &p2, uint8 &bestPbitCombo ) const { const uint32 pixel = p.ToPixel(); float bestError = FLT_MAX; for(int pbi = 0; pbi < GetNumPbitCombos(); pbi++) { const int *pbitCombo = GetPBitCombo(pbi); uint32 dist[4] = { 0x0, 0x0, 0x0, 0x0 }; uint32 bestValI[kNumColorChannels]; uint32 bestValJ[kNumColorChannels]; memset(bestValI, 0xFF, sizeof(bestValI)); memset(bestValJ, 0xFF, sizeof(bestValJ)); for(uint32 ci = 0; ci < kNumColorChannels; ci++) { const uint8 val = (pixel >> (ci * 8)) & 0xFF; int nBits = m_Attributes->colorChannelPrecision; if(ci == 3) { nBits = GetAlphaChannelPrecision(); } // If we don't handle this channel, then it must be the full value (alpha) if(nBits == 0) { bestValI[ci] = bestValJ[ci] = 0xFF; dist[ci] = std::max(dist[ci], static_cast(0xFF - val)); continue; } const int nPossVals = (1 << nBits); int possValsH[256]; int possValsL[256]; // Do we have a pbit? 
const bool havepbit = GetPBitType() != ePBitType_None; if(havepbit) nBits++; for(int i = 0; i < nPossVals; i++) { int vh = i, vl = i; if(havepbit) { vh <<= 1; vl <<= 1; vh |= pbitCombo[1]; vl |= pbitCombo[0]; } possValsH[i] = (vh << (8 - nBits)); possValsH[i] |= (possValsH[i] >> nBits); possValsL[i] = (vl << (8 - nBits)); possValsL[i] |= (possValsL[i] >> nBits); } const uint8 bpi = GetNumberOfBitsPerIndex() - 1; const uint32 interpVal0 = kInterpolationValues[bpi][1][0]; const uint32 interpVal1 = kInterpolationValues[bpi][1][1]; // Find the closest interpolated val that to the given val... uint32 bestChannelDist = 0xFF; for(int i = 0; bestChannelDist > 0 && i < nPossVals; i++) for(int j = 0; bestChannelDist > 0 && j < nPossVals; j++) { const uint32 v1 = possValsL[i]; const uint32 v2 = possValsH[j]; const uint32 combo = (interpVal0*v1 + (interpVal1 * v2) + 32) >> 6; const uint32 err = (combo > val)? combo - val : val - combo; if(err < bestChannelDist) { bestChannelDist = err; bestValI[ci] = v1; bestValJ[ci] = v2; } } dist[ci] = std::max(bestChannelDist, dist[ci]); } const float *errorWeights = kErrorMetrics[this->m_ErrorMetric]; float error = 0.0; for(uint32 i = 0; i < kNumColorChannels; i++) { float e = static_cast(dist[i]) * errorWeights[i]; error += e * e; } if(error < bestError) { bestError = error; bestPbitCombo = pbi; for(uint32 ci = 0; ci < kNumColorChannels; ci++) { p1[ci] = static_cast(bestValI[ci]); p2[ci] = static_cast(bestValJ[ci]); } } } return bestError; } // Fast random number generator. 
See more information at // http://software.intel.com/en-us/articles/fast-random-number- // generator-on-the-intel-pentiumr-4-processor/ static uint32 g_seed = static_cast(time(NULL)); static inline uint32 fastrand() { g_seed = (214013 * g_seed + 2531011); return (g_seed>>16) & RAND_MAX; } static void ChangePointForDirWithoutPbitChange( RGBAVector &v, uint32 dir, const float step[kNumColorChannels] ) { if(dir % 2) { v.X() -= step[0]; } else { v.X() += step[0]; } if(((dir / 2) % 2)) { v.Y() -= step[1]; } else { v.Y() += step[1]; } if(((dir / 4) % 2)) { v.Z() -= step[2]; } else { v.Z() += step[2]; } if(((dir / 8) % 2)) { v.W() -= step[3]; } else { v.W() += step[3]; } } static void ChangePointForDirWithPbitChange( RGBAVector &v, uint32 dir, uint32 oldPbit, const float step[kNumColorChannels] ) { if(dir % 2 && oldPbit == 0) { v.X() -= step[0]; } else if(!(dir % 2) && oldPbit == 1) { v.X() += step[0]; } if(((dir / 2) % 2) && oldPbit == 0) { v.Y() -= step[1]; } else if(!((dir / 2) % 2) && oldPbit == 1) { v.Y() += step[1]; } if(((dir / 4) % 2) && oldPbit == 0) { v.Z() -= step[2]; } else if(!((dir / 4) % 2) && oldPbit == 1) { v.Z() += step[2]; } if(((dir / 8) % 2) && oldPbit == 0) { v.W() -= step[3]; } else if(!((dir / 8) % 2) && oldPbit == 1) { v.W() += step[3]; } } struct VisitedState { RGBAVector p1; RGBAVector p2; int pBitCombo; }; void CompressionMode::PickBestNeighboringEndpoints( const RGBACluster &cluster, const RGBAVector &p1, const RGBAVector &p2, const int curPbitCombo, RGBAVector &np1, RGBAVector &np2, int &nPbitCombo, const VisitedState *visitedStates, int nVisited, float stepSz ) const { // !SPEED! There might be a way to make this faster since we're working // with floating point values that are powers of two. We should be able // to just set the proper bits in the exponent and leave the mantissa to 0. 
float step[kNumColorChannels] = { stepSz * static_cast(1 << (8 - m_Attributes->colorChannelPrecision)), stepSz * static_cast(1 << (8 - m_Attributes->colorChannelPrecision)), stepSz * static_cast(1 << (8 - m_Attributes->colorChannelPrecision)), stepSz * static_cast(1 << (8 - GetAlphaChannelPrecision())) }; if(m_IsOpaque) { step[(GetRotationMode() + 3) % kNumColorChannels] = 0.0f; } // First, let's figure out the new pbit combo... if there's no pbit then we // don't need to worry about it. const bool hasPbits = GetPBitType() != ePBitType_None; if(hasPbits) { // If there is a pbit, then we must change it, because those will provide // the closest values to the current point. if(GetPBitType() == ePBitType_Shared) { nPbitCombo = (curPbitCombo + 1) % 2; } else { // Not shared... p1 needs to change and p2 needs to change... which means // that combo 0 gets rotated to combo 3, combo 1 gets rotated to combo 2 // and vice versa... nPbitCombo = 3 - curPbitCombo; } assert(GetPBitCombo(curPbitCombo)[0] + GetPBitCombo(nPbitCombo)[0] == 1); assert(GetPBitCombo(curPbitCombo)[1] + GetPBitCombo(nPbitCombo)[1] == 1); } bool visited = true; int infLoopPrevent = -1; while(visited && ++infLoopPrevent < 16) { for(int pt = 0; pt < 2; pt++) { const RGBAVector &p = (pt)? p1 : p2; RGBAVector &np = (pt)? np1 : np2; np = p; if(hasPbits) { const uint32 rdir = fastrand() % 16; const uint32 pbit = GetPBitCombo(curPbitCombo)[pt]; ChangePointForDirWithPbitChange(np, rdir, pbit, step); } else { ChangePointForDirWithoutPbitChange(np, fastrand() % 16, step); } for(uint32 i = 0; i < kNumColorChannels; i++) { np[i] = std::min(std::max(np[i], 0.0f), 255.0f); } } visited = false; for(int i = 0; i < nVisited; i++) { visited = visited || ( visitedStates[i].p1 == np1 && visitedStates[i].p2 == np2 && visitedStates[i].pBitCombo == nPbitCombo ); } } } // Fast generation of floats between 0 and 1. 
It generates a float // whose exponent forces the value to be between 1 and 2, then it // populates the mantissa with a random assortment of bits, and returns // the bytes interpreted as a float. This prevents two things: 1, a // division, and 2, a cast from an integer to a float. static inline float frand() { // RAND_MAX is 0x7FFF, which offers 15 bits // of precision. Therefore, we move the bits // into the top of the 23 bit mantissa, and // repeat the most significant bits of r in // the least significant of the mantissa const uint16 r = fastrand(); const uint32 m = (r << 8) | (r >> 7); const union { uint32 fltAsInt; float flt; } fltUnion = { (127 << 23) | m }; return fltUnion.flt - 1.0f; } #define COMPILE_ASSERT(x) extern int __compile_assert_[static_cast(x)]; COMPILE_ASSERT(RAND_MAX == 0x7FFF) bool CompressionMode::AcceptNewEndpointError( double newError, double oldError, float temp ) const { // Always accept better endpoints. if(newError < oldError) { return true; } const double p = exp((0.1f * (oldError - newError)) / temp); const double r = frand(); return r < p; } double CompressionMode::OptimizeEndpointsForCluster( const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, uint8 *bestIndices, uint8 &bestPbitCombo ) const { const uint32 nBuckets = (1 << GetNumberOfBitsPerIndex()); const uint32 qmask = GetQuantizationMask(); // Here we use simulated annealing to traverse the space of clusters to find // the best possible endpoints. double curError = cluster.QuantizedError( p1, p2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(bestPbitCombo), bestIndices ); int curPbitCombo = bestPbitCombo; double bestError = curError; // Clamp endpoints to the grid... 
uint32 qp1, qp2; if(GetPBitType() != ePBitType_None) { qp1 = p1.ToPixel(qmask, GetPBitCombo(bestPbitCombo)[0]); qp2 = p2.ToPixel(qmask, GetPBitCombo(bestPbitCombo)[1]); } else { qp1 = p1.ToPixel(qmask); qp2 = p2.ToPixel(qmask); } p1 = RGBAVector(0, qp1); p2 = RGBAVector(0, qp2); RGBAVector bp1 = p1, bp2 = p2; int lastVisitedState = 0; VisitedState visitedStates[kMaxAnnealingIterations]; visitedStates[lastVisitedState].p1 = p1; visitedStates[lastVisitedState].p2 = p2; visitedStates[lastVisitedState].pBitCombo = curPbitCombo; lastVisitedState++; const int maxEnergy = this->m_SASteps; for(int energy = 0; bestError > 0 && energy < maxEnergy; energy++) { float temp = static_cast(energy) / static_cast(maxEnergy-1); uint8 indices[kMaxNumDataPoints]; RGBAVector np1, np2; int nPbitCombo = 0; PickBestNeighboringEndpoints( cluster, p1, p2, curPbitCombo, np1, np2, nPbitCombo, visitedStates, lastVisitedState ); double error = cluster.QuantizedError( np1, np2, nBuckets, qmask, GetErrorMetric(), GetPBitCombo(nPbitCombo), indices ); if(AcceptNewEndpointError(error, curError, temp)) { curError = error; p1 = np1; p2 = np2; curPbitCombo = nPbitCombo; } if(error < bestError) { memcpy(bestIndices, indices, sizeof(indices)); bp1 = np1; bp2 = np2; bestPbitCombo = nPbitCombo; bestError = error; lastVisitedState = 0; visitedStates[lastVisitedState].p1 = np1; visitedStates[lastVisitedState].p2 = np2; visitedStates[lastVisitedState].pBitCombo = nPbitCombo; lastVisitedState++; // Restart... 
energy = 0; } } p1 = bp1; p2 = bp2; return bestError; } double CompressionMode::CompressCluster( const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, uint8 *bestIndices, uint8 *alphaIndices ) const { assert(GetModeNumber() == 4 || GetModeNumber() == 5); assert(GetNumberOfSubsets() == 1); assert(cluster.GetNumPoints() == kMaxNumDataPoints); assert(m_Attributes->alphaChannelPrecision > 0); // If all the points are the same in the cluster, then we need to figure out // what the best approximation to this point is.... if(cluster.AllSamePoint()) { assert(!"We should only be using this function in modes 4 & 5 that have a" "single subset, in which case single colors should have been" "detected much earlier."); const RGBAVector &p = cluster.GetPoint(0); uint8 dummyPbit = 0; double bestErr = CompressSingleColor(p, p1, p2, dummyPbit); // We're assuming all indices will be index 1... for(uint32 i = 0; i < cluster.GetNumPoints(); i++) { bestIndices[i] = 1; alphaIndices[i] = 1; } return cluster.GetNumPoints() * bestErr; } RGBACluster rgbCluster(cluster); float alphaVals[kMaxNumDataPoints] = {0}; float alphaMin = FLT_MAX, alphaMax = -FLT_MAX; for(uint32 i = 0; i < rgbCluster.GetNumPoints(); i++) { RGBAVector &v = rgbCluster.Point(i); switch(this->GetRotationMode()) { default: case 0: // Do nothing break; case 1: swap(v.R(), v.A()); break; case 2: swap(v.G(), v.A()); break; case 3: swap(v.B(), v.A()); break; } alphaVals[i] = v.A(); v.A() = 255.0f; alphaMin = std::min(alphaVals[i], alphaMin); alphaMax = std::max(alphaVals[i], alphaMax); } uint8 dummyPbit = 0; RGBAVector rgbp1, rgbp2; double rgbError = CompressCluster( rgbCluster, rgbp1, rgbp2, bestIndices, dummyPbit ); float a1 = alphaMin, a2 = alphaMax; double alphaError = DBL_MAX; typedef uint32 tInterpPair[2]; typedef tInterpPair tInterpLevel[16]; const tInterpLevel *interpVals = kInterpolationValues + (GetNumberOfBitsPerAlpha() - 1); const float weight = GetErrorMetric().A(); const uint32 nBuckets = (1 << 
GetNumberOfBitsPerAlpha()); // If they're the same, then we can get them exactly. if(a1 == a2) { const uint8 a1be = uint8(a1); const uint8 a2be = uint8(a2); // Mode 5 has 8 bits of precision for alpha. if(GetModeNumber() == 5) { for(uint32 i = 0; i < kMaxNumDataPoints; i++) alphaIndices[i] = 0; alphaError = 0.0; } else { assert(GetModeNumber() == 4); // Mode 4 can be treated like the 6 channel of DXT1 compression. if(Optimal6CompressDXT1[a1be][0][0]) { a1 = static_cast( (Optimal6CompressDXT1[a1be][1][1] << 2) | (Optimal6CompressDXT1[a1be][0][1] >> 4)); a2 = static_cast( (Optimal6CompressDXT1[a2be][1][2] << 2) | (Optimal6CompressDXT1[a2be][0][1] >> 4)); } else { a1 = static_cast( (Optimal6CompressDXT1[a1be][0][1] << 2) | (Optimal6CompressDXT1[a1be][0][1] >> 4)); a2 = static_cast( (Optimal6CompressDXT1[a2be][0][2] << 2) | (Optimal6CompressDXT1[a2be][0][1] >> 4)); } if(m_IndexMode == 1) { for(uint32 i = 0; i < kMaxNumDataPoints; i++) alphaIndices[i] = 1; } else { for(uint32 i = 0; i < kMaxNumDataPoints; i++) alphaIndices[i] = 2; } uint32 interp0 = (*interpVals)[alphaIndices[0] & 0xFF][0]; uint32 interp1 = (*interpVals)[alphaIndices[0] & 0xFF][1]; const uint32 a1i = static_cast(a1); const uint32 a2i = static_cast(a2); const uint8 ip = (((a1i * interp0) + (a2i * interp1) + 32) >> 6) & 0xFF; float pxError = weight * static_cast((a1be > ip)? a1be - ip : ip - a1be); pxError *= pxError; alphaError = 16 * pxError; } } else { // (a1 != a2) float vals[1<<3]; memset(vals, 0, sizeof(vals)); uint32 buckets[kMaxNumDataPoints]; // Figure out initial positioning. 
for(uint32 i = 0; i < nBuckets; i++) { const float fi = static_cast(i); const float fb = static_cast(nBuckets - 1); vals[i] = alphaMin + (fi/fb) * (alphaMax - alphaMin); } // Assign each value to a bucket for(uint32 i = 0; i < kMaxNumDataPoints; i++) { float minDist = 255.0f; for(uint32 j = 0; j < nBuckets; j++) { float dist = fabs(alphaVals[i] - vals[j]); if(dist < minDist) { minDist = dist; buckets[i] = j; } } } float npts[1 << 3]; // Do k-means bool fixed = false; while(!fixed) { memset(npts, 0, sizeof(npts)); float avg[1 << 3]; memset(avg, 0, sizeof(avg)); // Calculate average of each cluster for(uint32 i = 0; i < nBuckets; i++) { for(uint32 j = 0; j < kMaxNumDataPoints; j++) { if(buckets[j] == i) { avg[i] += alphaVals[j]; npts[i] += 1.0f; } } if(npts[i] > 0.0f) { avg[i] /= npts[i]; } } // Did we change anything? fixed = true; for(uint32 i = 0; i < nBuckets; i++) { fixed = fixed && (avg[i] == vals[i]); } // Reassign indices... memcpy(vals, avg, sizeof(vals)); // Reassign each value to a bucket for(uint32 i = 0; i < kMaxNumDataPoints; i++) { float minDist = 255.0f; for(uint32 j = 0; j < nBuckets; j++) { float dist = fabs(alphaVals[i] - vals[j]); if(dist < minDist) { minDist = dist; buckets[i] = j; } } } } // Do least squares fit of vals. 
float asq = 0.0, bsq = 0.0, ab = 0.0; float ax(0.0), bx(0.0); for(uint32 i = 0; i < nBuckets; i++) { const float fbi = static_cast(nBuckets - 1 - i); const float fb = static_cast(nBuckets - 1); const float fi = static_cast(i); float a = fbi / fb; float b = fi / fb; float n = npts[i]; float x = vals[i]; asq += n * a * a; bsq += n * b * b; ab += n * a * b; ax += x * a * n; bx += x * b * n; } float f = 1.0f / (asq * bsq - ab * ab); a1 = f * (ax * bsq - bx * ab); a2 = f * (bx * asq - ax * ab); // Clamp a1 = std::min(255.0f, std::max(0.0f, a1)); a2 = std::min(255.0f, std::max(0.0f, a2)); // Quantize const int8 maskSeed = -0x7F; const uint8 a1b = ::QuantizeChannel( uint8(a1), (maskSeed >> (GetAlphaChannelPrecision() - 1))); const uint8 a2b = ::QuantizeChannel( uint8(a2), (maskSeed >> (GetAlphaChannelPrecision() - 1))); // Compute error alphaError = 0.0; for(uint32 i = 0; i < kMaxNumDataPoints; i++) { uint8 val = uint8(alphaVals[i]); float minError = FLT_MAX; int bestBucket = -1; for(uint32 j = 0; j < nBuckets; j++) { uint32 interp0 = (*interpVals)[j][0]; uint32 interp1 = (*interpVals)[j][1]; uint32 a1i = static_cast(a1b); uint32 a2i = static_cast(a2b); const uint8 ip = (((a1i * interp0) + (a2i * interp1) + 32) >> 6) & 0xFF; float pxError = weight * static_cast((val > ip)? val - ip : ip - val); pxError *= pxError; if(pxError < minError) { minError = pxError; bestBucket = j; } } alphaError += minError; alphaIndices[i] = bestBucket; } } for(uint32 i = 0; i < kNumColorChannels; i++) { p1[i] = (i == (kNumColorChannels-1))? a1 : rgbp1[i]; p2[i] = (i == (kNumColorChannels-1))? a2 : rgbp2[i]; } return rgbError + alphaError; } double CompressionMode::CompressCluster( const RGBACluster &cluster, RGBAVector &p1, RGBAVector &p2, uint8 *bestIndices, uint8 &bestPbitCombo ) const { // If all the points are the same in the cluster, then we need to figure out // what the best approximation to this point is.... 
if(cluster.AllSamePoint()) { const RGBAVector &p = cluster.GetPoint(0); double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo); // We're assuming all indices will be index 1... for(uint32 i = 0; i < cluster.GetNumPoints(); i++) { bestIndices[i] = 1; } return cluster.GetNumPoints() * bestErr; } const uint32 nBuckets = (1 << GetNumberOfBitsPerIndex()); #if 1 RGBADir axis; cluster.GetPrincipalAxis(axis, NULL, NULL); float mindp = FLT_MAX, maxdp = -FLT_MAX; for(uint32 i = 0 ; i < cluster.GetNumPoints(); i++) { float dp = (cluster.GetPoint(i) - cluster.GetAvg()) * axis; if(dp < mindp) mindp = dp; if(dp > maxdp) maxdp = dp; } p1 = cluster.GetAvg() + mindp * axis; p2 = cluster.GetAvg() + maxdp * axis; #else cluster.GetBoundingBox(p1, p2); #endif ClampEndpoints(p1, p2); RGBAVector pts[1 << 4]; // At most 4 bits per index. uint32 numPts[1<<4]; assert(nBuckets <= 1 << 4); for(uint32 i = 0; i < nBuckets; i++) { float s = (static_cast(i) / static_cast(nBuckets - 1)); pts[i] = (1.0f - s) * p1 + s * p2; } assert(pts[0] == p1); assert(pts[nBuckets - 1] == p2); // Do k-means clustering... uint32 bucketIdx[kMaxNumDataPoints] = {0}; bool fixed = false; while(!fixed) { RGBAVector newPts[1 << 4]; // Assign each of the existing points to one of the buckets... for(uint32 i = 0; i < cluster.GetNumPoints(); i++) { int minBucket = -1; float minDist = FLT_MAX; for(uint32 j = 0; j < nBuckets; j++) { RGBAVector v = cluster.GetPoint(i) - pts[j]; float distSq = v * v; if(distSq < minDist) { minDist = distSq; minBucket = j; } } assert(minBucket >= 0); bucketIdx[i] = minBucket; } // Calculate new buckets based on centroids of clusters... for(uint32 i = 0; i < nBuckets; i++) { numPts[i] = 0; newPts[i] = RGBAVector(0.0f); for(uint32 j = 0; j < cluster.GetNumPoints(); j++) { if(bucketIdx[j] == i) { numPts[i]++; newPts[i] += cluster.GetPoint(j); } } // If there are no points in this cluster, then it should // remain the same as last time and avoid a divide by zero. 
if(0 != numPts[i]) newPts[i] /= static_cast(numPts[i]); } // If we haven't changed, then we're done. fixed = true; for(uint32 i = 0; i < nBuckets; i++) { if(pts[i] != newPts[i]) fixed = false; } // Assign the new points to be the old points. for(uint32 i = 0; i < nBuckets; i++) { pts[i] = newPts[i]; } } // If there's only one bucket filled, then just compress for that single color int numBucketsFilled = 0, lastFilledBucket = -1; for(uint32 i = 0; i < nBuckets; i++) { if(numPts[i] > 0) { numBucketsFilled++; lastFilledBucket = i; } } assert(numBucketsFilled > 0); if(1 == numBucketsFilled) { const RGBAVector &p = pts[lastFilledBucket]; double bestErr = CompressSingleColor(p, p1, p2, bestPbitCombo); // We're assuming all indices will be index 1... for(uint32 i = 0; i < cluster.GetNumPoints(); i++) { bestIndices[i] = 1; } return cluster.GetNumPoints() * bestErr; } // Now that we know the index of each pixel, we can assign the endpoints based // on a least squares fit of the clusters. For more information, take a look // at this article by NVidia: http://developer.download.nvidia.com/compute/ // cuda/1.1-Beta/x86_website/projects/dxtc/doc/cuda_dxtc.pdf float asq = 0.0, bsq = 0.0, ab = 0.0; RGBAVector ax(0.0), bx(0.0); for(uint32 i = 0; i < nBuckets; i++) { const RGBAVector x = pts[i]; const int n = numPts[i]; const float fbi = static_cast(nBuckets - 1 - i); const float fb = static_cast(nBuckets - 1); const float fi = static_cast(i); const float fn = static_cast(n); const float a = fbi / fb; const float b = fi / fb; asq += fn * a * a; bsq += fn * b * b; ab += fn * a * b; ax += x * a * fn; bx += x * b * fn; } float f = 1.0f / (asq * bsq - ab * ab); p1 = f * (ax * bsq - bx * ab); p2 = f * (bx * asq - ax * ab); ClampEndpointsToGrid(p1, p2, bestPbitCombo); #ifdef _DEBUG uint8 pBitCombo = bestPbitCombo; RGBAVector tp1 = p1, tp2 = p2; ClampEndpointsToGrid(tp1, tp2, pBitCombo); assert(p1 == tp1); assert(p2 == tp2); assert(pBitCombo == bestPbitCombo); #endif assert(bestPbitCombo 
>= 0); return OptimizeEndpointsForCluster( cluster, p1, p2, bestIndices, bestPbitCombo ); } void CompressionMode::Pack(Params ¶ms, BitStream &stream) const { const int kModeNumber = GetModeNumber(); const int nPartitionBits = GetNumberOfPartitionBits(); const int nSubsets = GetNumberOfSubsets(); // Mode # stream.WriteBits(1 << kModeNumber, kModeNumber + 1); // Partition # assert(!nPartitionBits || (((1 << nPartitionBits) - 1) & params.m_ShapeIdx) == params.m_ShapeIdx); stream.WriteBits(params.m_ShapeIdx, nPartitionBits); stream.WriteBits(params.m_RotationMode, m_Attributes->hasRotation? 2 : 0); stream.WriteBits(params.m_IndexMode, m_Attributes->hasIdxMode? 1 : 0); #ifdef _DEBUG for(int i = 0; i < kMaxNumDataPoints; i++) { int nSet = 0; for(int j = 0; j < nSubsets; j++) { if(params.m_Indices[j][i] < 255) nSet++; } assert(nSet == 1); } #endif // Get the quantization mask const uint32 qmask = GetQuantizationMask(); // Quantize the points... uint32 pixel1[kMaxNumSubsets], pixel2[kMaxNumSubsets]; for(int i = 0; i < nSubsets; i++) { switch(GetPBitType()) { default: case ePBitType_None: pixel1[i] = params.m_P1[i].ToPixel(qmask); pixel2[i] = params.m_P2[i].ToPixel(qmask); break; case ePBitType_Shared: case ePBitType_NotShared: pixel1[i] = params.m_P1[i].ToPixel(qmask, GetPBitCombo(params.m_PbitCombo[i])[0]); pixel2[i] = params.m_P2[i].ToPixel(qmask, GetPBitCombo(params.m_PbitCombo[i])[1]); break; } } // If the anchor index does not have 0 in the leading bit, then // we need to swap EVERYTHING. 
for(int sidx = 0; sidx < nSubsets; sidx++) { int anchorIdx = GetAnchorIndexForSubset(sidx, params.m_ShapeIdx, nSubsets); assert(params.m_Indices[sidx][anchorIdx] != 255); const int nAlphaIndexBits = GetNumberOfBitsPerAlpha(params.m_IndexMode); const int nIndexBits = GetNumberOfBitsPerIndex(params.m_IndexMode); if(params.m_Indices[sidx][anchorIdx] >> (nIndexBits - 1)) { std::swap(pixel1[sidx], pixel2[sidx]); int nIndexVals = 1 << nIndexBits; for(int i = 0; i < 16; i++) { params.m_Indices[sidx][i] = (nIndexVals - 1) - params.m_Indices[sidx][i]; } int nAlphaIndexVals = 1 << nAlphaIndexBits; if(m_Attributes->hasRotation) { for(int i = 0; i < 16; i++) { params.m_AlphaIndices[i] = (nAlphaIndexVals - 1) - params.m_AlphaIndices[i]; } } } const bool rotated = (params.m_AlphaIndices[anchorIdx] >> (nAlphaIndexBits - 1)) > 0; if(m_Attributes->hasRotation && rotated) { uint8 * bp1 = reinterpret_cast(&pixel1[sidx]); uint8 * bp2 = reinterpret_cast(&pixel2[sidx]); uint8 t = bp1[3]; bp1[3] = bp2[3]; bp2[3] = t; int nAlphaIndexVals = 1 << nAlphaIndexBits; for(int i = 0; i < 16; i++) { params.m_AlphaIndices[i] = (nAlphaIndexVals - 1) - params.m_AlphaIndices[i]; } } assert(!(params.m_Indices[sidx][anchorIdx] >> (nIndexBits - 1))); assert(!m_Attributes->hasRotation || !(params.m_AlphaIndices[anchorIdx] >> (nAlphaIndexBits - 1))); } // Get the quantized values... uint8 r1[kMaxNumSubsets], g1[kMaxNumSubsets], b1[kMaxNumSubsets], a1[kMaxNumSubsets]; uint8 r2[kMaxNumSubsets], g2[kMaxNumSubsets], b2[kMaxNumSubsets], a2[kMaxNumSubsets]; for(int i = 0; i < nSubsets; i++) { r1[i] = pixel1[i] & 0xFF; r2[i] = pixel2[i] & 0xFF; g1[i] = (pixel1[i] >> 8) & 0xFF; g2[i] = (pixel2[i] >> 8) & 0xFF; b1[i] = (pixel1[i] >> 16) & 0xFF; b2[i] = (pixel2[i] >> 16) & 0xFF; a1[i] = (pixel1[i] >> 24) & 0xFF; a2[i] = (pixel2[i] >> 24) & 0xFF; } // Write them out... 
const int nRedBits = m_Attributes->colorChannelPrecision; for(int i = 0; i < nSubsets; i++) { stream.WriteBits(r1[i] >> (8 - nRedBits), nRedBits); stream.WriteBits(r2[i] >> (8 - nRedBits), nRedBits); } const int nGreenBits = m_Attributes->colorChannelPrecision; for(int i = 0; i < nSubsets; i++) { stream.WriteBits(g1[i] >> (8 - nGreenBits), nGreenBits); stream.WriteBits(g2[i] >> (8 - nGreenBits), nGreenBits); } const int nBlueBits = m_Attributes->colorChannelPrecision; for(int i = 0; i < nSubsets; i++) { stream.WriteBits(b1[i] >> (8 - nBlueBits), nBlueBits); stream.WriteBits(b2[i] >> (8 - nBlueBits), nBlueBits); } const int nAlphaBits = m_Attributes->alphaChannelPrecision; for(int i = 0; i < nSubsets; i++) { stream.WriteBits(a1[i] >> (8 - nAlphaBits), nAlphaBits); stream.WriteBits(a2[i] >> (8 - nAlphaBits), nAlphaBits); } // Write out the best pbits.. if(GetPBitType() != ePBitType_None) { for(int s = 0; s < nSubsets; s++) { const int *pbits = GetPBitCombo(params.m_PbitCombo[s]); stream.WriteBits(pbits[0], 1); if(GetPBitType() != ePBitType_Shared) stream.WriteBits(pbits[1], 1); } } // If our index mode has changed, then we need to write the alpha indices // first. if(m_Attributes->hasIdxMode && params.m_IndexMode == 1) { assert(m_Attributes->hasRotation); for(int i = 0; i < 16; i++) { const int idx = params.m_AlphaIndices[i]; assert(GetAnchorIndexForSubset(0, params.m_ShapeIdx, nSubsets) == 0); assert(GetNumberOfBitsPerAlpha(params.m_IndexMode) == 2); assert(idx >= 0 && idx < (1 << 2)); assert(i != 0 || !(idx >> 1) || !"Leading bit of anchor index is not zero!"); stream.WriteBits(idx, (i == 0)? 
1 : 2); } for(int i = 0; i < 16; i++) { const int idx = params.m_Indices[0][i]; assert(GetSubsetForIndex(i, params.m_ShapeIdx, nSubsets) == 0); assert(GetAnchorIndexForSubset(0, params.m_ShapeIdx, nSubsets) == 0); assert(GetNumberOfBitsPerIndex(params.m_IndexMode) == 3); assert(idx >= 0 && idx < (1 << 3)); assert(i != 0 || !(idx >> 2) || !"Leading bit of anchor index is not zero!"); stream.WriteBits(idx, (i == 0)? 2 : 3); } } else { for(int i = 0; i < 16; i++) { const int subs = GetSubsetForIndex(i, params.m_ShapeIdx, nSubsets); const int idx = params.m_Indices[subs][i]; const int anchorIdx = GetAnchorIndexForSubset(subs, params.m_ShapeIdx, nSubsets); const int nBitsForIdx = GetNumberOfBitsPerIndex(params.m_IndexMode); assert(idx >= 0 && idx < (1 << nBitsForIdx)); assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!"); stream.WriteBits(idx, (i == anchorIdx)? nBitsForIdx - 1 : nBitsForIdx); } if(m_Attributes->hasRotation) { for(int i = 0; i < 16; i++) { const int idx = params.m_AlphaIndices[i]; const int anchorIdx = 0; const int nBitsForIdx = GetNumberOfBitsPerAlpha(params.m_IndexMode); assert(idx >= 0 && idx < (1 << nBitsForIdx)); assert(i != anchorIdx || !(idx >> (nBitsForIdx - 1)) || !"Leading bit of anchor index is not zero!"); stream.WriteBits(idx, (i == anchorIdx)? nBitsForIdx - 1 : nBitsForIdx); } } } assert(stream.GetBitsWritten() == 128); } double CompressionMode::Compress( Params ¶ms, const int shapeIdx, RGBACluster &cluster ) { const int kModeNumber = GetModeNumber(); const int nSubsets = GetNumberOfSubsets(); params = Params(shapeIdx); double totalErr = 0.0; for(int cidx = 0; cidx < nSubsets; cidx++) { uint8 indices[kMaxNumDataPoints] = {0}; cluster.SetPartition(cidx); if(m_Attributes->hasRotation) { assert(nSubsets == 1); uint8 alphaIndices[kMaxNumDataPoints]; double bestError = DBL_MAX; for(int rotMode = 0; rotMode < 4; rotMode++) { SetRotationMode(rotMode); const int nIdxModes = kModeNumber == 4? 
2 : 1; for(int idxMode = 0; idxMode < nIdxModes; idxMode++) { SetIndexMode(idxMode); RGBAVector v1, v2; double error = CompressCluster( cluster, v1, v2, indices, alphaIndices ); if(error < bestError) { bestError = error; memcpy(params.m_Indices[cidx], indices, sizeof(indices)); memcpy(params.m_AlphaIndices, alphaIndices, sizeof(alphaIndices)); params.m_RotationMode = rotMode; params.m_IndexMode = idxMode; params.m_P1[cidx] = v1; params.m_P2[cidx] = v2; } } } totalErr += bestError; } else { // ! m_Attributes->hasRotation // Compress this cluster totalErr += CompressCluster( cluster, params.m_P1[cidx], params.m_P2[cidx], indices, params.m_PbitCombo[cidx] ); // Map the indices to their proper position. int idx = 0; for(int i = 0; i < 16; i++) { int subs = GetSubsetForIndex(i, shapeIdx, GetNumberOfSubsets()); if(subs == cidx) { params.m_Indices[cidx][i] = indices[idx++]; } } } } return totalErr; } class BlockLogger { public: BlockLogger(uint64 blockIdx, std::ostream &os) : m_BlockIdx(blockIdx), m_Stream(os) { } template friend std::ostream &operator<<(const BlockLogger &bl, const T &v); uint64 m_BlockIdx; std::ostream &m_Stream; }; template std::ostream &operator<<(const BlockLogger &bl, const T &v) { std::stringstream ss; ss << bl.m_BlockIdx << ": " << v; return bl.m_Stream << ss.str(); } // Function prototypes static void CompressBC7Block( const uint32 x, const uint32 y, const uint32 block[16], uint8 *outBuf, const CompressionSettings = CompressionSettings() ); static void CompressBC7Block( const uint32 x, const uint32 y, const uint32 block[16], uint8 *outBuf, const BlockLogger &logStream, const CompressionSettings = CompressionSettings() ); // Returns true if the entire block is a single color. static bool AllOneColor(const uint32 block[16]) { const uint32 pixel = block[0]; for(int i = 1; i < 16; i++) { if( block[i] != pixel ) return false; } return true; } // Write out a transparent block. 
static void WriteTransparentBlock(BitStream &stream) { // Use mode 6 stream.WriteBits(1 << 6, 7); stream.WriteBits(0, 128-7); assert(stream.GetBitsWritten() == 128); } // Compresses a single color optimally and outputs the result. static void CompressOptimalColorBC7(uint32 pixel, BitStream &stream) { stream.WriteBits(1 << 5, 6); // Mode 5 stream.WriteBits(0, 2); // No rotation bits. uint8 r = pixel & 0xFF; uint8 g = (pixel >> 8) & 0xFF; uint8 b = (pixel >> 16) & 0xFF; uint8 a = (pixel >> 24) & 0xFF; // Red endpoints stream.WriteBits(Optimal7CompressBC7Mode5[r][0], 7); stream.WriteBits(Optimal7CompressBC7Mode5[r][1], 7); // Green endpoints stream.WriteBits(Optimal7CompressBC7Mode5[g][0], 7); stream.WriteBits(Optimal7CompressBC7Mode5[g][1], 7); // Blue endpoints stream.WriteBits(Optimal7CompressBC7Mode5[b][0], 7); stream.WriteBits(Optimal7CompressBC7Mode5[b][1], 7); // Alpha endpoints... are just the same. stream.WriteBits(a, 8); stream.WriteBits(a, 8); // Color indices are 1 for each pixel... // Anchor index is 0, so 1 bit for the first pixel, then // 01 for each following pixel giving the sequence of 31 bits: // ...010101011 stream.WriteBits(0xaaaaaaab, 31); // Alpha indices... stream.WriteBits(kWMValues[gWMVal = (gWMVal+1) % kNumWMVals], 31); } static void DecompressBC7Block(const uint8 block[16], uint32 outBuf[16]); void GetBlock(const uint32 x, const uint32 y, const uint32 pixelsWide, const uint32 *inPixels, uint32 block[16]) { memcpy(block, inPixels + y*pixelsWide + x, 4 * sizeof(uint32)); memcpy(block + 4, inPixels + (y+1)*pixelsWide + x, 4 * sizeof(uint32)); memcpy(block + 8, inPixels + (y+2)*pixelsWide + x, 4 * sizeof(uint32)); memcpy(block + 12, inPixels + (y+3)*pixelsWide + x, 4 * sizeof(uint32)); } // Compress an image using BC7 compression. Use the inBuf parameter to point // to an image in 4-byte RGBA format. The width and height parameters specify // the size of the image in pixels. 
The buffer pointed to by outBuf should be // large enough to store the compressed image. This implementation has an 4:1 // compression ratio. void Compress(const FasTC::CompressionJob &cj, CompressionSettings settings) { const uint32 *inPixels = reinterpret_cast(cj.InBuf()); const uint32 kBlockSz = GetBlockSize(FasTC::eCompressionFormat_BPTC); uint8 *outBuf = cj.OutBuf() + cj.CoordsToBlockIdx(cj.XStart(), cj.YStart()) * kBlockSz; const uint32 endY = std::min(cj.YEnd(), cj.Height() - 4); uint32 startX = cj.XStart(); for(uint32 j = cj.YStart(); j <= endY; j += 4) { const uint32 endX = j == cj.YEnd()? cj.XEnd() : cj.Width(); for(uint32 i = startX; i < endX; i += 4) { uint32 block[16]; GetBlock(i, j, cj.Width(), inPixels, block); CompressBC7Block(i, j, block, outBuf, settings); #ifndef NDEBUG const uint8 *inBlock = reinterpret_cast(block); const uint8 *cmpblock = reinterpret_cast(outBuf); uint32 unComp[16]; DecompressBC7Block(cmpblock, unComp); const uint8* unCompData = reinterpret_cast(unComp); double diffSum = 0.0; for(int k = 0; k < 64; k+=4) { double rdiff = sad(unCompData[k], inBlock[k]); double gdiff = sad(unCompData[k+1], inBlock[k+1]); double bdiff = sad(unCompData[k+2], inBlock[k+2]); double adiff = sad(unCompData[k+3], inBlock[k+3]); const double asrc = static_cast(inBlock[k+3]); const double adst = static_cast(unCompData[k+3]); double avga = ((asrc + adst)*0.5)/255.0; diffSum += (rdiff + gdiff + bdiff + adiff) * avga; } double blockError = static_cast(diffSum) / 64.0; if(blockError > 5.0) { fprintf(stderr, "WARNING: Block error very high" " at <%d, %d>: (%.2f)\n", i, j, blockError); } #endif outBuf += kBlockSz; } startX = 0; } } #ifdef HAS_ATOMICS #ifdef HAS_MSVC_ATOMICS static uint32 TestAndSet(uint32 *x) { return InterlockedExchange(x, 1); } static uint32 FetchAndAdd(uint32 *x) { return InterlockedIncrement(x)-1; } #elif defined HAS_GCC_ATOMICS static uint32 TestAndSet(uint32 *x) { return __sync_lock_test_and_set(x, 1); } static uint32 FetchAndAdd(uint32 
*x) { return __sync_fetch_and_add(x, 1); } #endif // Variables used for synchronization in threadsafe implementation. void CompressAtomic(FasTC::CompressionJobList &cjl) { uint32 jobIdx; while((jobIdx = cjl.m_CurrentJobIndex) < cjl.GetNumJobs()) { // !HACK! ... Microsoft has this defined #undef GetJob const FasTC::CompressionJob *cj = cjl.GetJob(jobIdx); const uint32 nBlocks = (cj->Height() * cj->Width()) / 16; // Help finish whatever texture we're compressing before we start again on // my work... uint32 blockIdx; while((blockIdx = FetchAndAdd(&(cjl.m_CurrentBlockIndex))) < nBlocks && *(cjl.GetFinishedFlag(jobIdx)) == 0) { unsigned char *out = cj->OutBuf() + (16 * blockIdx); uint32 block[16]; uint32 x = cj->XStart() + 4 * (blockIdx % (cj->Width() / 4)); uint32 y = cj->YStart() + 4 * (blockIdx / (cj->Width() / 4)); const uint32 *inPixels = reinterpret_cast(cj->InBuf()); GetBlock(x, y, cj->Width(), inPixels, block); CompressBC7Block(x, y, block, out); } if(TestAndSet(cjl.GetFinishedFlag(jobIdx)) == 0) { cjl.m_CurrentBlockIndex = 0; cjl.m_CurrentJobIndex++; } // Wait until this texture finishes. while(cjl.m_CurrentJobIndex == jobIdx) { } } } #endif // HAS_ATOMICS void CompressWithStats(const FasTC::CompressionJob &cj, std::ostream *logStream, CompressionSettings settings) { const uint32 *inPixels = reinterpret_cast(cj.InBuf()); const uint32 kBlockSz = GetBlockSize(FasTC::eCompressionFormat_BPTC); uint8 *outBuf = cj.OutBuf() + cj.CoordsToBlockIdx(cj.XStart(), cj.YStart()) * kBlockSz; uint32 startX = cj.XStart(); for(uint32 j = cj.YStart(); j <= cj.YEnd(); j += 4) { const uint32 endX = j == cj.YEnd()? 
cj.XEnd() : cj.Width(); for(uint32 i = startX; i < endX; i += 4) { uint32 block[16]; GetBlock(i, j, cj.Width(), inPixels, block); if(logStream) { uint64 blockIdx = cj.CoordsToBlockIdx(i, j); CompressBC7Block(i, j, block, outBuf, BlockLogger(blockIdx, *logStream), settings); } else { CompressBC7Block(i, j, block, outBuf, settings); } #ifndef NDEBUG const uint8 *inBlock = reinterpret_cast(block); const uint8 *cmpData = outBuf; uint32 unComp[16]; DecompressBC7Block(cmpData, unComp); const uint8* unCompData = reinterpret_cast(unComp); uint32 diffSum = 0; for(uint32 k = 0; k < 64; k++) { diffSum += sad(unCompData[k], inBlock[k]); } double blockError = static_cast(diffSum) / 64.0; if(blockError > 50.0) { fprintf(stderr, "WARNING: Block error very high" " (%.2f)\n", blockError); } #endif outBuf += 16; } startX = 0; } } static double EstimateTwoClusterError(ErrorMetric metric, RGBACluster &c) { RGBAVector Min, Max, v; c.GetBoundingBox(Min, Max); v = Max - Min; if(v * v == 0) { return 0.0; } const float *w = kErrorMetrics[metric]; double error = 0.0001; error += c.QuantizedError(Min, Max, 8, 0xFFFFFFFF, RGBAVector(w[0], w[1], w[2], w[3])); return error; } static double EstimateThreeClusterError(ErrorMetric metric, RGBACluster &c) { RGBAVector Min, Max, v; c.GetBoundingBox(Min, Max); v = Max - Min; if(v * v == 0) { return 0.0; } const float *w = kErrorMetrics[metric]; double error = 0.0001; error += c.QuantizedError(Min, Max, 4, 0xFFFFFFFF, RGBAVector(w[0], w[1], w[2], w[3])); return error; } static uint32 kTwoPartitionModes = static_cast(eBlockMode_One) | static_cast(eBlockMode_Three) | static_cast(eBlockMode_Seven); static uint32 kThreePartitionModes = static_cast(eBlockMode_Zero) | static_cast(eBlockMode_Two); static uint32 kAlphaModes = static_cast(eBlockMode_Four) | static_cast(eBlockMode_Five) | static_cast(eBlockMode_Six) | static_cast(eBlockMode_Seven); static ShapeSelection BoxSelection( uint32, uint32, const uint32 pixels[16], const void *userData ) { ErrorMetric 
metric = *(reinterpret_cast(userData)); ShapeSelection result; bool opaque = true; for(uint32 i = 0; i < 16; i++) { uint32 a = (pixels[i] >> 24) & 0xFF; opaque = opaque && (a >= 250); // For all intents and purposes... } // First we must figure out which shape to use. To do this, simply // see which shape has the smallest sum of minimum bounding spheres. double bestError[2] = { std::numeric_limits::max(), std::numeric_limits::max() }; RGBACluster cluster(pixels); result.m_NumIndices = 1; for(unsigned int i = 0; i < kNumShapes2; i++) { cluster.SetShapeIndex(i, 2); double err = 0.0; for(int ci = 0; ci < 2; ci++) { cluster.SetPartition(ci); err += EstimateTwoClusterError(metric, cluster); } if(err < bestError[0]) { bestError[0] = err; result.m_Shapes[0].m_Index = i; result.m_Shapes[0].m_NumPartitions = 2; } // If it's small, we'll take it! if(err < 1e-9) { result.m_SelectedModes = kTwoPartitionModes; return result; } } // There are not 3 subset blocks that support alpha, so only check these // if the entire block is opaque. if(!opaque) { result.m_SelectedModes &= kAlphaModes; return result; } // If it's opaque, we get more value out of mode 6 than modes // 4 and 5, so just ignore those. result.m_SelectedModes &= ~(static_cast(eBlockMode_Four) | static_cast(eBlockMode_Five)); result.m_NumIndices++; for(unsigned int i = 0; i < kNumShapes3; i++) { cluster.SetShapeIndex(i, 3); double err = 0.0; for(int ci = 0; ci < 3; ci++) { cluster.SetPartition(ci); err += EstimateThreeClusterError(metric, cluster); } if(err < bestError[1]) { bestError[1] = err; result.m_Shapes[1].m_Index = i; result.m_Shapes[1].m_NumPartitions = 3; } // If it's small, we'll take it! 
if(err < 1e-9) { result.m_SelectedModes = kThreePartitionModes; return result; } } return result; } static void CompressClusters(const ShapeSelection &selection, const uint32 pixels[16], const CompressionSettings &settings, uint8 *outBuf, double *errors, int *modeChosen) { RGBACluster cluster(pixels); double bestError = std::numeric_limits::max(); uint32 modes[8] = {0, 2, 1, 3, 7, 4, 5, 6}; uint32 bestMode = 8; CompressionMode::Params bestParams; uint32 selectedModes = selection.m_SelectedModes; uint32 numShapeIndices = std::min(5, selection.m_NumIndices); // If we don't have any indices, turn off two and three partition modes, // since the compressor will simply ignore the shapeIndex variable afterwards... if(numShapeIndices == 0) { numShapeIndices = 1; selectedModes &= ~(kTwoPartitionModes | kThreePartitionModes); } for(uint32 modeIdx = 0; modeIdx < 8; modeIdx++) { uint32 mode = modes[modeIdx]; if((selectedModes & (1 << mode)) == 0) { continue; } for(uint32 shapeIdx = 0; shapeIdx < numShapeIndices; shapeIdx++) { const Shape &shape = selection.m_Shapes[shapeIdx]; // If the shape doesn't support the number of subsets then skip it. uint32 nParts = CompressionMode::GetAttributesForMode(mode)->numSubsets; if(nParts != 1 && nParts != shape.m_NumPartitions) { continue; } // Block mode zero only has four bits for the partition index, // so if the chosen three-partition shape is not within this range, // then we shouldn't consider using this block mode... 
if(shape.m_Index >= 16 && mode == 0) { continue; } uint32 idx = shape.m_Index; cluster.SetShapeIndex(idx, nParts); CompressionMode::Params params; double error = CompressionMode(mode, settings).Compress(params, idx, cluster); if(errors) errors[mode] = std::min(error, errors[mode]); if(error < bestError) { bestError = error; bestMode = mode; bestParams = params; } } } assert(bestMode < 8); BitStream stream(outBuf, 128, 0); CompressionMode(bestMode, settings).Pack(bestParams, stream); if(modeChosen) *modeChosen = bestMode; } static void CompressBC7Block(const uint32 x, const uint32 y, const uint32 block[16], uint8 *outBuf, const CompressionSettings settings) { // All a single color? if(AllOneColor(block)) { BitStream bStrm(outBuf, 128, 0); CompressOptimalColorBC7(*block, bStrm); return; } RGBACluster blockCluster(block); bool transparent = true; for(uint32 i = 0; i < blockCluster.GetNumPoints(); i++) { const RGBAVector &p = blockCluster.GetPoint(i); if(p.A() > 0.0f) { transparent = false; break; } } // The whole block is transparent? 
if(transparent) { BitStream bStrm(outBuf, 128, 0); WriteTransparentBlock(bStrm); return; } ShapeSelectionFn selectionFn = BoxSelection; const void *userData = &settings.m_ErrorMetric; if(settings.m_ShapeSelectionFn != NULL) { selectionFn = settings.m_ShapeSelectionFn; userData = settings.m_ShapeSelectionUserData; } assert(selectionFn); ShapeSelection selection = selectionFn(x, y, block, userData); selection.m_SelectedModes &= settings.m_BlockModes; assert(selection.m_SelectedModes); CompressClusters(selection, block, settings, outBuf, NULL, NULL); } static double EstimateTwoClusterErrorStats( ErrorMetric metric, RGBACluster &c, double (&estimates)[2] ) { RGBAVector Min, Max, v; c.GetBoundingBox(Min, Max); v = Max - Min; if(v * v == 0) { estimates[0] = estimates[1] = 0.0; return 0.0; } const float *w = kErrorMetrics[metric]; const double err1 = c.QuantizedError( Min, Max, 8, 0xFFFCFCFC, RGBAVector(w[0], w[1], w[2], w[3]) ); if(err1 >= 0.0) { estimates[0] = err1; } else { estimates[0] = std::min(estimates[0], err1); } const double err3 = c.QuantizedError( Min, Max, 8, 0xFFFEFEFE, RGBAVector(w[0], w[1], w[2], w[3]) ); if(err3 >= 0.0) { estimates[1] = err3; } else { estimates[1] = std::min(estimates[1], err3); } double error = 0.0001; error += std::min(err1, err3); return error; } static double EstimateThreeClusterErrorStats( ErrorMetric metric, RGBACluster &c, double (&estimates)[2] ) { RGBAVector Min, Max, v; c.GetBoundingBox(Min, Max); v = Max - Min; if(v * v == 0) { estimates[0] = estimates[1] = 0.0; return 0.0; } const float *w = kErrorMetrics[metric]; const double err0 = 0.0001 + c.QuantizedError( Min, Max, 4, 0xFFF0F0F0, RGBAVector(w[0], w[1], w[2], w[3]) ); if(err0 >= 0.0) { estimates[0] = err0; } else { estimates[0] = std::min(estimates[0], err0); } const double err2 = 0.0001 + c.QuantizedError( Min, Max, 4, 0xFFF8F8F8, RGBAVector(w[0], w[1], w[2], w[3]) ); if(err2 >= 0.0) { estimates[1] = err2; } else { estimates[1] = std::min(estimates[1], err2); } double 
error = 0.0001; error += std::min(err0, err2); return error; } static void UpdateErrorEstimate(double *estimates, uint32 mode, double est) { assert(estimates); assert(mode >= 0); assert(mode < CompressionMode::kNumModes); if(estimates[mode] == -1.0 || est < estimates[mode]) { estimates[mode] = est; } } template static void PrintStat(const BlockLogger &lgr, const char *stat, const T &v) { std::stringstream ss; ss << stat << " -- " << v << std::endl; lgr << ss.str(); } // Compress a single block but collect statistics as well... static void CompressBC7Block( const uint32 x, const uint32 y, const uint32 block[16], uint8 *outBuf, const BlockLogger &logStream, const CompressionSettings settings ) { class RAIIStatSaver { private: const BlockLogger &m_Logger; int *m_ModePtr; double *m_Estimates; double *m_Errors; public: RAIIStatSaver(const BlockLogger &logger) : m_Logger(logger) , m_ModePtr(NULL), m_Estimates(NULL), m_Errors(NULL) { } void SetMode(int *modePtr) { m_ModePtr = modePtr; } void SetEstimates(double *estimates) { m_Estimates = estimates; } void SetErrors(double *errors) { m_Errors = errors; } ~RAIIStatSaver() { assert(m_ModePtr); assert(m_Estimates); assert(m_Errors); PrintStat(m_Logger, kBlockStatString[eBlockStat_Mode], *m_ModePtr); for(uint32 i = 0; i < CompressionMode::kNumModes; i++) { PrintStat(m_Logger, kBlockStatString[eBlockStat_ModeZeroEstimate + i], m_Estimates[i]); PrintStat(m_Logger, kBlockStatString[eBlockStat_ModeZeroError + i], m_Errors[i]); } } }; int bestMode = 0; double modeEstimate[CompressionMode::kNumModes]; double modeError[CompressionMode::kNumModes]; // reset global variables... bestMode = 0; for(uint32 i = 0; i < CompressionMode::kNumModes; i++) { modeError[i] = modeEstimate[i] = -1.0; } RAIIStatSaver __statsaver__(logStream); __statsaver__.SetMode(&bestMode); __statsaver__.SetEstimates(modeEstimate); __statsaver__.SetErrors(modeError); // All a single color? 
if(AllOneColor(block)) { BitStream bStrm(outBuf, 128, 0); CompressOptimalColorBC7(*block, bStrm); bestMode = 5; PrintStat(logStream, kBlockStatString[eBlockStat_Path], 0); return; } RGBACluster blockCluster(block); bool opaque = true; bool transparent = true; for(uint32 i = 0; i < blockCluster.GetNumPoints(); i++) { const RGBAVector &p = blockCluster.GetPoint(i); if(fabs(p.A() - 255.0f) > 1e-10) { opaque = false; } if(p.A() > 0.0f) { transparent = false; } } // The whole block is transparent? if(transparent) { BitStream bStrm(outBuf, 128, 0); WriteTransparentBlock(bStrm); bestMode = 6; PrintStat(logStream, kBlockStatString[eBlockStat_Path], 1); return; } // First, estimate the error it would take to compress a single line with // mode 6... { RGBAVector Min, Max, v; blockCluster.GetBoundingBox(Min, Max); v = Max - Min; if(v * v == 0) { modeEstimate[6] = 0.0; } else { const float *w = kErrorMetrics[settings.m_ErrorMetric]; const double err = 0.0001 + blockCluster.QuantizedError( Min, Max, 4, 0xFEFEFEFE, RGBAVector(w[0], w[1], w[2], w[3]) ); UpdateErrorEstimate(modeEstimate, 6, err); } } // First we must figure out which shape to use. To do this, simply // see which shape has the smallest sum of minimum bounding spheres. 
double bestError[2] = { DBL_MAX, DBL_MAX }; ShapeSelection selection; uint32 path = 0; selection.m_NumIndices = 1; for(unsigned int i = 0; i < kNumShapes2; i++) { blockCluster.SetShapeIndex(i, 2); double err = 0.0; double errEstimate[2] = { -1.0, -1.0 }; for(int ci = 0; ci < 2; ci++) { blockCluster.SetPartition(ci); double shapeEstimate[2] = { -1.0, -1.0 }; err += EstimateTwoClusterErrorStats(settings.m_ErrorMetric, blockCluster, shapeEstimate); for(int ei = 0; ei < 2; ei++) { if(shapeEstimate[ei] >= 0.0) { if(errEstimate[ei] == -1.0) { errEstimate[ei] = shapeEstimate[ei]; } else { errEstimate[ei] += shapeEstimate[ei]; } } } } if(errEstimate[0] != -1.0) { UpdateErrorEstimate(modeEstimate, 1, errEstimate[0]); } if(errEstimate[1] != -1.0) { UpdateErrorEstimate(modeEstimate, 3, errEstimate[1]); } if(err < bestError[0]) { PrintStat(logStream, kBlockStatString[eBlockStat_TwoShapeEstimate], err ); } if(err < bestError[0]) { bestError[0] = err; selection.m_Shapes[0].m_Index = i; selection.m_Shapes[0].m_NumPartitions = 2; } // If it's small, we'll take it! if(err < 1e-9) { path = 2; selection.m_SelectedModes = kTwoPartitionModes; break; } } // There are not 3 subset blocks that support alpha, so only check these // if the entire block is opaque. 
if(opaque) { selection.m_NumIndices++; for(unsigned int i = 0; i < kNumShapes3; i++) { blockCluster.SetShapeIndex(i, 3); double err = 0.0; double errEstimate[2] = { -1.0, -1.0 }; for(int ci = 0; ci < 3; ci++) { blockCluster.SetPartition(ci); double shapeEstimate[2] = { -1.0, -1.0 }; err += EstimateThreeClusterErrorStats(settings.m_ErrorMetric, blockCluster, shapeEstimate); for(int ei = 0; ei < 2; ei++) { if(shapeEstimate[ei] >= 0.0) { if(errEstimate[ei] == -1.0) { errEstimate[ei] = shapeEstimate[ei]; } else { errEstimate[ei] += shapeEstimate[ei]; } } } } if(errEstimate[0] != -1.0) { UpdateErrorEstimate(modeEstimate, 0, errEstimate[0]); } if(errEstimate[1] != -1.0) { UpdateErrorEstimate(modeEstimate, 2, errEstimate[1]); } if(err < bestError[1]) { PrintStat(logStream, kBlockStatString[eBlockStat_ThreeShapeEstimate], err ); } if(err < bestError[1]) { bestError[1] = err; selection.m_Shapes[1].m_Index = i; selection.m_Shapes[1].m_NumPartitions = 3; } // If it's small, we'll take it! if(err < 1e-9) { path = 2; selection.m_SelectedModes = kThreePartitionModes; break; } } } if(path == 0) path = 3; selection.m_SelectedModes &= settings.m_BlockModes; assert(selection.m_SelectedModes); CompressClusters(selection, block, settings, outBuf, modeError, &bestMode); PrintStat(logStream, kBlockStatString[eBlockStat_Path], path); } static void DecompressBC7Block(const uint8 block[16], uint32 outBuf[16]) { BitStreamReadOnly strm(block); uint32 mode = 0; while(!strm.ReadBit()) { mode++; } const CompressionMode::Attributes *attrs = CompressionMode::GetAttributesForMode(mode); const uint32 nSubsets = attrs->numSubsets; uint32 idxMode = 0; uint32 rotMode = 0; uint32 shapeIdx = 0; if ( nSubsets > 1 ) { shapeIdx = strm.ReadBits(mode == 0? 4 : 6); } else if( attrs->hasRotation ) { rotMode = strm.ReadBits(2); if( attrs->hasIdxMode ) { idxMode = strm.ReadBit(); } } assert(idxMode < 2); assert(rotMode < 4); assert(shapeIdx < ((mode == 0)? 
16U : 64U)); uint32 cp = attrs->colorChannelPrecision; const uint32 shift = 8 - cp; uint8 eps[3][2][4]; for(uint32 ch = 0; ch < 3; ch++) for(uint32 i = 0; i < nSubsets; i++) for(uint32 ep = 0; ep < 2; ep++) eps[i][ep][ch] = strm.ReadBits(cp) << shift; uint32 ap = attrs->alphaChannelPrecision; const uint32 ash = 8 - ap; if(ap == 0) { for(uint32 i = 0; i < nSubsets; i++) for(uint32 ep = 0; ep < 2; ep++) eps[i][ep][3] = 0xFF; } else { for(uint32 i = 0; i < nSubsets; i++) for(uint32 ep = 0; ep < 2; ep++) eps[i][ep][3] = strm.ReadBits(ap) << ash; } // Handle pbits switch(attrs->pbitType) { case CompressionMode::ePBitType_None: // Do nothing. break; case CompressionMode::ePBitType_Shared: cp += 1; ap += 1; for(uint32 i = 0; i < nSubsets; i++) { uint32 pbit = strm.ReadBit(); for(uint32 j = 0; j < 2; j++) for(uint32 ch = 0; ch < kNumColorChannels; ch++) { const uint32 prec = ch == 3? ap : cp; eps[i][j][ch] |= pbit << (8-prec); } } break; case CompressionMode::ePBitType_NotShared: cp += 1; ap += 1; for(uint32 i = 0; i < nSubsets; i++) for(uint32 j = 0; j < 2; j++) { uint32 pbit = strm.ReadBit(); for(uint32 ch = 0; ch < kNumColorChannels; ch++) { const uint32 prec = ch == 3? ap : cp; eps[i][j][ch] |= pbit << (8-prec); } } break; } // Quantize endpoints... for(uint32 i = 0; i < nSubsets; i++) for(uint32 j = 0; j < 2; j++) for(uint32 ch = 0; ch < kNumColorChannels; ch++) { const uint32 prec = ch == 3? ap : cp; eps[i][j][ch] |= eps[i][j][ch] >> prec; } // Figure out indices... 
uint32 alphaIndices[kMaxNumDataPoints]; uint32 colorIndices[kMaxNumDataPoints]; int nBitsPerAlpha = attrs->numBitsPerAlpha; int nBitsPerColor = attrs->numBitsPerIndex; uint32 idxPrec = attrs->numBitsPerIndex; for(uint32 i = 0; i < kMaxNumDataPoints; i++) { uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets); int idx = 0; if(GetAnchorIndexForSubset(subset, shapeIdx, nSubsets) == i) { idx = strm.ReadBits(idxPrec - 1); } else { idx = strm.ReadBits(idxPrec); } colorIndices[i] = idx; } idxPrec = attrs->numBitsPerAlpha; if(idxPrec == 0) { memcpy(alphaIndices, colorIndices, sizeof(alphaIndices)); } else { for(uint32 i = 0; i < kMaxNumDataPoints; i++) { uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets); int idx = 0; if(GetAnchorIndexForSubset(subset, shapeIdx, nSubsets) == i) { idx = strm.ReadBits(idxPrec - 1); } else { idx = strm.ReadBits(idxPrec); } alphaIndices[i] = idx; } if(idxMode) { for(uint32 i = 0; i < kMaxNumDataPoints; i++) { swap(alphaIndices[i], colorIndices[i]); } swap(nBitsPerAlpha, nBitsPerColor); } } assert(strm.GetBitsRead() == 128); // Get final colors by interpolating... 
for(uint32 i = 0; i < kMaxNumDataPoints; i++) { const uint32 subset = GetSubsetForIndex(i, shapeIdx, nSubsets); uint32 &pixel = outBuf[i]; pixel = 0; for(int ch = 0; ch < 4; ch++) { if(ch == 3 && nBitsPerAlpha > 0) { uint32 i0 = kInterpolationValues[nBitsPerAlpha - 1][alphaIndices[i]][0]; uint32 i1 = kInterpolationValues[nBitsPerAlpha - 1][alphaIndices[i]][1]; const uint32 ep1 = static_cast(eps[subset][0][3]); const uint32 ep2 = static_cast(eps[subset][1][3]); const uint8 ip = (((ep1 * i0 + ep2 * i1) + 32) >> 6) & 0xFF; pixel |= ip << 24; } else { uint32 i0 = kInterpolationValues[nBitsPerColor - 1][colorIndices[i]][0]; uint32 i1 = kInterpolationValues[nBitsPerColor - 1][colorIndices[i]][1]; const uint32 ep1 = static_cast(eps[subset][0][ch]); const uint32 ep2 = static_cast(eps[subset][1][ch]); const uint8 ip = (((ep1 * i0 + ep2 * i1) + 32) >> 6) & 0xFF; pixel |= ip << (8*ch); } } // Swap colors if necessary... uint8 *pb = reinterpret_cast(&pixel); switch(rotMode) { default: case 0: // Do nothing break; case 1: swap(pb[0], pb[3]); break; case 2: swap(pb[1], pb[3]); break; case 3: swap(pb[2], pb[3]); break; } } } // Convert the image from a BC7 buffer to a RGBA8 buffer void Decompress(const FasTC::DecompressionJob &dj) { const uint8 *inBuf = dj.InBuf(); uint32 *outBuf = reinterpret_cast(dj.OutBuf()); for(unsigned int j = 0; j < dj.Height(); j += 4) { for(unsigned int i = 0; i < dj.Width(); i += 4) { uint32 pixels[16]; DecompressBC7Block(inBuf, pixels); memcpy(outBuf + j*dj.Width() + i, pixels, 4 * sizeof(pixels[0])); memcpy(outBuf + (j+1)*dj.Width() + i, pixels+4, 4 * sizeof(pixels[0])); memcpy(outBuf + (j+2)*dj.Width() + i, pixels+8, 4 * sizeof(pixels[0])); memcpy(outBuf + (j+3)*dj.Width() + i, pixels+12, 4 * sizeof(pixels[0])); inBuf += 16; } } } } // namespace BPTCC