From bcf7c5c389ae62748fcbb35d7ceca2a5df661e5e Mon Sep 17 00:00:00 2001 From: Pavel Krajcevski Date: Tue, 15 Oct 2013 10:32:38 -0400 Subject: [PATCH 1/8] Some more compiler error and warning fixes. --- Base/include/CompressionJob.h | 4 ++-- Base/include/VectorBase.h | 4 ++-- PVRTCEncoder/src/PVRTCImage.cpp | 2 +- PVRTCEncoder/src/PVRTCImage.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Base/include/CompressionJob.h b/Base/include/CompressionJob.h index 1e3b25c..2543ada 100755 --- a/Base/include/CompressionJob.h +++ b/Base/include/CompressionJob.h @@ -115,10 +115,10 @@ struct CompressionJobList { bool AddJob(const CompressionJob &); // Get the maximum number of jobs that this list can hold. - const uint32 GetTotalNumJobs() const { return m_TotalNumJobs; } + uint32 GetTotalNumJobs() const { return m_TotalNumJobs; } // Get the current number of jobs in the list. - const uint32 GetNumJobs() const { return m_NumJobs; } + uint32 GetNumJobs() const { return m_NumJobs; } // Returns the specified job. 
const CompressionJob *GetJob(uint32 idx) const; diff --git a/Base/include/VectorBase.h b/Base/include/VectorBase.h index 45b0fba..dc78d01 100644 --- a/Base/include/VectorBase.h +++ b/Base/include/VectorBase.h @@ -195,7 +195,7 @@ namespace FasTC { static inline VectorType ScalarMultiply(const VectorType &v, const ScalarType &s) { VectorType a; for(int i = 0; i < VectorType::Size; i++) - a(i) = static_cast(v(i) * s); + a(i) = static_cast(v(i) * s); return a; } @@ -212,7 +212,7 @@ namespace FasTC { static inline VectorType ScalarDivide(const VectorType &v, const ScalarType &s) { VectorType a; for(int i = 0; i < VectorType::Size; i++) - a(i) = static_cast(v(i) / s); + a(i) = static_cast(v(i) / s); return a; } diff --git a/PVRTCEncoder/src/PVRTCImage.cpp b/PVRTCEncoder/src/PVRTCImage.cpp index a992df3..c06fbf4 100644 --- a/PVRTCEncoder/src/PVRTCImage.cpp +++ b/PVRTCEncoder/src/PVRTCImage.cpp @@ -502,7 +502,7 @@ const FasTC::Pixel &Image::GetPixel(int32 i, int32 j, EWrapMode wrapMode) const return GetPixels()[GetPixelIndex(i, j, wrapMode)]; } -const uint32 Image::GetPixelIndex(int32 i, int32 j, EWrapMode wrapMode) const { +uint32 Image::GetPixelIndex(int32 i, int32 j, EWrapMode wrapMode) const { while(i < 0) { if(wrapMode == eWrapMode_Clamp) { i = 0; diff --git a/PVRTCEncoder/src/PVRTCImage.h b/PVRTCEncoder/src/PVRTCImage.h index cc5b96c..44058b2 100644 --- a/PVRTCEncoder/src/PVRTCImage.h +++ b/PVRTCEncoder/src/PVRTCImage.h @@ -100,7 +100,7 @@ class Image : public FasTC::Image { private: FasTC::Pixel *m_FractionalPixels; - const uint32 GetPixelIndex(int32 i, int32 j, EWrapMode wrapMode = eWrapMode_Clamp) const; + uint32 GetPixelIndex(int32 i, int32 j, EWrapMode wrapMode = eWrapMode_Clamp) const; const FasTC::Pixel &GetPixel(int32 i, int32 j, EWrapMode wrapMode = eWrapMode_Clamp) const; }; From 838d1f7b6ebc964f4382c21088907936be95faed Mon Sep 17 00:00:00 2001 From: Pavel Krajcevski Date: Tue, 15 Oct 2013 13:56:40 -0400 Subject: [PATCH 2/8] Some small refactoring. 
--- PVRTCEncoder/src/Compressor.cpp | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp index 37277d5..80b0301 100644 --- a/PVRTCEncoder/src/Compressor.cpp +++ b/PVRTCEncoder/src/Compressor.cpp @@ -487,11 +487,8 @@ namespace PVRTCC { } static FasTC::Pixel BilerpPixels(uint32 x, uint32 y, - const FasTC::Pixel &p, FasTC::Pixel &fp, - const FasTC::Pixel &topLeft, - const FasTC::Pixel &topRight, - const FasTC::Pixel &bottomLeft, - const FasTC::Pixel &bottomRight) { + const FasTC::Pixel &topLeft, const FasTC::Pixel &topRight, + const FasTC::Pixel &bottomLeft, const FasTC::Pixel &bottomRight) { const uint32 highXWeight = x; const uint32 lowXWeight = 4 - x; @@ -510,13 +507,12 @@ namespace PVRTCC { const FasTC::Pixel br = bottomRight * bottomRightWeight; const FasTC::Pixel sum = tl + tr + bl + br; + FasTC::Pixel fp; for(uint32 c = 0; c < 4; c++) { fp.Component(c) = sum.Component(c) & 15; } - FasTC::Pixel tmp(p); - tmp = sum / (16); - + FasTC::Pixel tmp(sum / 16); tmp.A() = (tmp.A() << 4) | tmp.A(); tmp.G() = (tmp.G() << 3) | (tmp.G() >> 2); tmp.B() = (tmp.B() << 3) | (tmp.B() >> 2); @@ -553,16 +549,10 @@ namespace PVRTCC { const uint32 *pixels = reinterpret_cast(inBuf); - // Make sure the bit depth matches the original... - FasTC::Pixel p; - uint8 bitDepth[4] = { 4, 5, 5, 5 }; - p.ChangeBitDepth(bitDepth); - - // Save fractional bits - FasTC::Pixel fp; - uint8 fpDepths[4] = { 4, 4, 4, 4 }; - fp.ChangeBitDepth(fpDepths); - + // !SPEED! When we're iterating over the blocks here, we don't need to load from outBlocks + // every iteration of the loop. Once we finish with a block, topLeft becomes topRight and + // bottomLeft becomes bottomRight. Also, when we go to the next row, bottomRight becomes + // topLeft. 
for(uint32 j = 0; j < blocksH; j++) { for(uint32 i = 0; i < blocksW; i++) { @@ -609,8 +599,8 @@ namespace PVRTCC { for(uint32 x = 0; x < 4; x++) { uint32 pixelX = (i*4 + 2 + x) & (w - 1); uint32 pixelY = (j*4 + 2 + y) & (h - 1); - FasTC::Pixel colorA = BilerpPixels(x, y, p, fp, topLeftA, topRightA, bottomLeftA, bottomRightA); - FasTC::Pixel colorB = BilerpPixels(x, y, p, fp, topLeftB, topRightB, bottomLeftB, bottomRightB); + FasTC::Pixel colorA = BilerpPixels(x, y, topLeftA, topRightA, bottomLeftA, bottomRightA); + FasTC::Pixel colorB = BilerpPixels(x, y, topLeftB, topRightB, bottomLeftB, bottomRightB); FasTC::Pixel original(pixels[pixelY * w + pixelX]); // !FIXME! there are two modulation modes... we're only using one. From 4f2db726b7c21f468ace8f27359560775e518421 Mon Sep 17 00:00:00 2001 From: Pavel Krajcevski Date: Wed, 16 Oct 2013 10:40:56 -0400 Subject: [PATCH 3/8] Get rid of unnecessary include --- PVRTCEncoder/src/PVRTCImage.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/PVRTCEncoder/src/PVRTCImage.cpp b/PVRTCEncoder/src/PVRTCImage.cpp index c06fbf4..2300176 100644 --- a/PVRTCEncoder/src/PVRTCImage.cpp +++ b/PVRTCEncoder/src/PVRTCImage.cpp @@ -66,7 +66,6 @@ #include "Pixel.h" using FasTC::Pixel; -#include "../../Base/include/Image.h" #include "../../IO/include/ImageFile.h" template From 674c18b9d9cd92fcc6b8ebe99ce3f33e723495f5 Mon Sep 17 00:00:00 2001 From: Pavel Krajcevski Date: Wed, 16 Oct 2013 16:38:53 -0400 Subject: [PATCH 4/8] Add DXT encoder from J.M.P. 
Van Waveren --- CLTool/src/clunix.cpp | 10 +- CMakeLists.txt | 2 +- Core/CMakeLists.txt | 3 + Core/src/CompressedImage.cpp | 7 +- Core/src/TexComp.cpp | 10 + DXTEncoder/CMakeLists.txt | 62 +++ DXTEncoder/include/DXTCompressor.h | 34 ++ DXTEncoder/src/DXTCompressor.cpp | 322 ++++++++++++++ DXTEncoder/src/DXTCompressorSSE2DLL.cpp | 552 ++++++++++++++++++++++++ DXTEncoder/src/DXTDecompressor.cpp | 120 ++++++ 10 files changed, 1118 insertions(+), 4 deletions(-) create mode 100755 DXTEncoder/CMakeLists.txt create mode 100755 DXTEncoder/include/DXTCompressor.h create mode 100755 DXTEncoder/src/DXTCompressor.cpp create mode 100755 DXTEncoder/src/DXTCompressorSSE2DLL.cpp create mode 100644 DXTEncoder/src/DXTDecompressor.cpp diff --git a/CLTool/src/clunix.cpp b/CLTool/src/clunix.cpp index a53f0f1..5df3ac6 100644 --- a/CLTool/src/clunix.cpp +++ b/CLTool/src/clunix.cpp @@ -56,7 +56,7 @@ void PrintUsage() { fprintf(stderr, "Usage: tc [OPTIONS] imagefile\n"); fprintf(stderr, "\n"); - fprintf(stderr, "\t-f\t\tFormat to use. Either \"BPTC\" or \"PVRTC\". Default: BPTC\n"); + fprintf(stderr, "\t-f\t\tFormat to use. Either \"BPTC\", \"DXT1\", \"DXT5\", or \"PVRTC\". Default: BPTC\n"); fprintf(stderr, "\t-l\t\tSave an output log.\n"); fprintf(stderr, "\t-q \tSet compression quality level. Default: 50\n"); fprintf(stderr, "\t-n \tCompress the image num times and give the average time and PSNR. 
Default: 1\n"); @@ -133,6 +133,10 @@ int main(int argc, char **argv) { } else if(!strcmp(argv[fileArg], "PVRTCLib")) { format = eCompressionFormat_PVRTC; bUsePVRTexLib = true; + } else if(!strcmp(argv[fileArg], "DXT1")) { + format = eCompressionFormat_DXT1; + } else if(!strcmp(argv[fileArg], "DXT5")) { + format = eCompressionFormat_DXT5; } } @@ -218,7 +222,7 @@ int main(int argc, char **argv) { } FasTC::Image<> img(*file.GetImage()); - if(format == eCompressionFormat_PVRTC) { + if(format != eCompressionFormat_BPTC) { img.SetBlockStreamOrder(false); } @@ -271,6 +275,8 @@ int main(int argc, char **argv) { strcat(basename, "-bc7.png"); } else if(format == eCompressionFormat_PVRTC) { strcat(basename, "-pvrtc.png"); + } else if(format == eCompressionFormat_DXT1) { + strcat(basename, "-dxt1.png"); } ImageFile cImgFile (basename, eFileFormat_PNG, *ci); diff --git a/CMakeLists.txt b/CMakeLists.txt index 32ab81a..836c08a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,7 +93,7 @@ SET(CMAKE_MODULE_PATH "${FasTC_SOURCE_DIR}/CMakeModules" ${CMAKE_MODULE_PATH}) FIND_PACKAGE(PVRTexLib) SET(FASTC_DIRECTORIES - Base Core IO BPTCEncoder PVRTCEncoder + Base Core IO BPTCEncoder PVRTCEncoder DXTEncoder ) FOREACH(DIR ${FASTC_DIRECTORIES}) diff --git a/Core/CMakeLists.txt b/Core/CMakeLists.txt index 571cb94..89420fc 100644 --- a/Core/CMakeLists.txt +++ b/Core/CMakeLists.txt @@ -77,6 +77,8 @@ ENDIF() INCLUDE_DIRECTORIES( ${FasTC_SOURCE_DIR}/Base/include ) INCLUDE_DIRECTORIES( ${FasTC_SOURCE_DIR} ) +INCLUDE_DIRECTORIES( ${FasTC_SOURCE_DIR}/DXTEncoder/include ) + INCLUDE_DIRECTORIES( ${FasTC_SOURCE_DIR}/PVRTCEncoder/include ) INCLUDE_DIRECTORIES( ${FasTC_BINARY_DIR}/PVRTCEncoder/include) @@ -160,6 +162,7 @@ ADD_LIBRARY( FasTCCore TARGET_LINK_LIBRARIES( FasTCCore FasTCBase ) TARGET_LINK_LIBRARIES( FasTCCore FasTCIO ) +TARGET_LINK_LIBRARIES( FasTCCore DXTEncoder ) TARGET_LINK_LIBRARIES( FasTCCore BPTCEncoder ) TARGET_LINK_LIBRARIES( FasTCCore PVRTCEncoder ) diff --git 
a/Core/src/CompressedImage.cpp b/Core/src/CompressedImage.cpp index 66b64e7..f1c9303 100644 --- a/Core/src/CompressedImage.cpp +++ b/Core/src/CompressedImage.cpp @@ -53,6 +53,7 @@ #include "TexCompTypes.h" #include "BC7Compressor.h" #include "PVRTCCompressor.h" +#include "DXTCompressor.h" CompressedImage::CompressedImage( const CompressedImage &other ) : Image(other) @@ -74,7 +75,7 @@ CompressedImage::CompressedImage( ) : FasTC::Image<>(width, height, reinterpret_cast(NULL), - format != eCompressionFormat_PVRTC) + format == eCompressionFormat_BPTC) , m_Format(format) , m_CompressedData(0) { @@ -111,6 +112,10 @@ bool CompressedImage::DecompressImage(unsigned char *outBuf, unsigned int outBuf uint8 *byteData = reinterpret_cast(m_CompressedData); DecompressionJob dj (byteData, outBuf, GetWidth(), GetHeight()); switch(m_Format) { + case eCompressionFormat_DXT1: + DXTC::DecompressDXT1(dj); + break; + case eCompressionFormat_PVRTC: { #ifndef NDEBUG diff --git a/Core/src/TexComp.cpp b/Core/src/TexComp.cpp index 5b36d50..c052d45 100644 --- a/Core/src/TexComp.cpp +++ b/Core/src/TexComp.cpp @@ -50,6 +50,7 @@ #include #include +#include "DXTCompressor.h" #include "BC7Compressor.h" #include "CompressionFuncs.h" #include "Image.h" @@ -95,12 +96,15 @@ SCompressionSettings:: SCompressionSettings() static CompressionFuncWithStats ChooseFuncFromSettingsWithStats(const SCompressionSettings &s) { switch(s.format) { + case eCompressionFormat_BPTC: { return BC7C::CompressWithStats; } break; + case eCompressionFormat_DXT1: + case eCompressionFormat_DXT5: case eCompressionFormat_PVRTC: { // !FIXME! actually implement one of these methods... 
@@ -130,6 +134,12 @@ static CompressionFunc ChooseFuncFromSettings(const SCompressionSettings &s) { } break; + case eCompressionFormat_DXT1: + return DXTC::CompressImageDXT1; + + case eCompressionFormat_DXT5: + return DXTC::CompressImageDXT5; + case eCompressionFormat_PVRTC: { if(s.bUsePVRTexLib) { diff --git a/DXTEncoder/CMakeLists.txt b/DXTEncoder/CMakeLists.txt new file mode 100755 index 0000000..9774136 --- /dev/null +++ b/DXTEncoder/CMakeLists.txt @@ -0,0 +1,62 @@ +# FasTC +# Copyright (c) 2012 University of North Carolina at Chapel Hill. All rights reserved. +# +# Permission to use, copy, modify, and distribute this software and its documentation for educational, +# research, and non-profit purposes, without fee, and without a written agreement is hereby granted, +# provided that the above copyright notice, this paragraph, and the following four paragraphs appear +# in all copies. +# +# Permission to incorporate this software into commercial products may be obtained by contacting the +# authors or the Office of Technology Development at the University of North Carolina at Chapel Hill . +# +# This software program and documentation are copyrighted by the University of North Carolina at Chapel Hill. +# The software program and documentation are supplied "as is," without any accompanying services from the +# University of North Carolina at Chapel Hill or the authors. The University of North Carolina at Chapel Hill +# and the authors do not warrant that the operation of the program will be uninterrupted or error-free. The +# end-user understands that the program was developed for research purposes and is advised not to rely +# exclusively on the program for any reason. 
+# +# IN NO EVENT SHALL THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL OR THE AUTHORS BE LIABLE TO ANY PARTY FOR +# DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE +# USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL OR THE +# AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL AND THE AUTHORS SPECIFICALLY DISCLAIM ANY WARRANTIES, INCLUDING, +# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY +# STATUTORY WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY +# OF NORTH CAROLINA AT CHAPEL HILL AND THE AUTHORS HAVE NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, +# ENHANCEMENTS, OR MODIFICATIONS. +# +# Please send all BUG REPORTS to . +# +# The authors may be contacted via: +# +# Pavel Krajcevski +# Dept of Computer Science +# 201 S Columbia St +# Frederick P. Brooks, Jr. Computer Science Bldg +# Chapel Hill, NC 27599-3175 +# USA +# +# + +INCLUDE_DIRECTORIES(${FasTC_SOURCE_DIR}/Base/include) + +INCLUDE_DIRECTORIES(${FasTC_SOURCE_DIR}/DXTEncoder/include) +INCLUDE_DIRECTORIES(${FasTC_BINARY_DIR}/DXTEncoder/include) + +SET( HEADERS + include/DXTCompressor.h +) + +SET( SOURCES + src/DXTCompressor.cpp + src/DXTDecompressor.cpp +) + +ADD_LIBRARY( DXTEncoder + ${HEADERS} + ${SOURCES} +) + +TARGET_LINK_LIBRARIES( DXTEncoder FasTCBase ) diff --git a/DXTEncoder/include/DXTCompressor.h b/DXTEncoder/include/DXTCompressor.h new file mode 100755 index 0000000..c26f60b --- /dev/null +++ b/DXTEncoder/include/DXTCompressor.h @@ -0,0 +1,34 @@ +/* + This code is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This code is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. +*/ + +#include "TexCompTypes.h" +#include "CompressionJob.h" + +namespace DXTC +{ + // DXT compressor (scalar version). + void CompressImageDXT1(const CompressionJob &); + void CompressImageDXT5(const CompressionJob &); + + void DecompressDXT1(const DecompressionJob &); + + uint16 ColorTo565(const uint8* color); + void EmitByte(uint8*& dest, uint8 b); + void EmitWord(uint8*& dest, uint16 s); + void EmitDoubleWord(uint8*& dest, uint32 i); + +#if 0 + // DXT compressor (SSE2 version). + void CompressImageDXT1SSE2(const uint8* inBuf, uint8* outBuf, uint32 width, uint32 height); + void CompressImageDXT5SSE2(const uint8* inBuf, uint8* outBuf, uint32 width, uint32 height); +#endif +} diff --git a/DXTEncoder/src/DXTCompressor.cpp b/DXTEncoder/src/DXTCompressor.cpp new file mode 100755 index 0000000..e556354 --- /dev/null +++ b/DXTEncoder/src/DXTCompressor.cpp @@ -0,0 +1,322 @@ +/* + This code is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This code is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. +*/ + +// Refer to "Real-Time DXT Compression" by J.M.P. van Waveren for a more thorough discussion of the +// algorithms used in this code. + +#include "DXTCompressor.h" +#include +#include +#include + +#define INSET_SHIFT 4 // Inset the bounding box with (range >> shift). 
+#define C565_5_MASK 0xF8 // 0xFF minus last three bits +#define C565_6_MASK 0xFC // 0xFF minus last two bits + +namespace DXTC +{ + // Function prototypes + void ExtractBlock(const uint8* inPtr, uint32 width, uint8* colorBlock); + void GetMinMaxColors(const uint8* colorBlock, uint8* minColor, uint8* maxColor); + void GetMinMaxColorsWithAlpha(const uint8* colorBlock, uint8* minColor, uint8* maxColor); + void EmitColorIndices(const uint8* colorBlock, uint8*& outBuf, const uint8* minColor, const uint8* maxColor); + void EmitAlphaIndices(const uint8* colorBlock, uint8*& outBuf, const uint8 minAlpha, const uint8 maxAlpha); + + // Compress an image using DXT1 compression. Use the inBuf parameter to point to an image in + // 4-byte RGBA format. The width and height parameters specify the size of the image in pixels. + // The buffer pointed to by outBuf should be large enough to store the compressed image. This + // implementation has an 8:1 compression ratio. + void CompressImageDXT1(const CompressionJob &cj) { + uint8 block[64]; + uint8 minColor[4]; + uint8 maxColor[4]; + + uint8 *outBuf = cj.outBuf; + const uint8 *inBuf = cj.inBuf; + for(int j = 0; j < cj.height; j += 4, inBuf += cj.width * 4 * 4) + { + for(int i = 0; i < cj.width; i += 4) + { + ExtractBlock(inBuf + i * 4, cj.width, block); + GetMinMaxColors(block, minColor, maxColor); + EmitWord(outBuf, ColorTo565(maxColor)); + EmitWord(outBuf, ColorTo565(minColor)); + EmitColorIndices(block, outBuf, minColor, maxColor); + } + } + } + + // Compress an image using DXT5 compression. Use the inBuf parameter to point to an image in + // 4-byte RGBA format. The width and height parameters specify the size of the image in pixels. + // The buffer pointed to by outBuf should be large enough to store the compressed image. This + // implementation has a 4:1 compression ratio. 
+ void CompressImageDXT5(const CompressionJob &cj) { + uint8 block[64]; + uint8 minColor[4]; + uint8 maxColor[4]; + + uint8 *outBuf = cj.outBuf; + const uint8 *inBuf = cj.inBuf; + for(int j = 0; j < cj.height; j += 4, inBuf += cj.width * 4 * 4) + { + for(int i = 0; i < cj.width; i += 4) + { + ExtractBlock(inBuf + i * 4, cj.width, block); + GetMinMaxColorsWithAlpha(block, minColor, maxColor); + EmitByte(outBuf, maxColor[3]); + EmitByte(outBuf, minColor[3]); + EmitAlphaIndices(block, outBuf, minColor[3], maxColor[3]); + EmitWord(outBuf, ColorTo565(maxColor)); + EmitWord(outBuf, ColorTo565(minColor)); + EmitColorIndices(block, outBuf, minColor, maxColor); + } + } + } + + // Convert a color in 24-bit RGB888 format to 16-bit RGB565 format. + uint16 ColorTo565(const uint8* color) + { + return ((color[0] >> 3) << 11) | ((color[1] >> 2) << 5) | (color[2] >> 3); + } + + // Write a single byte to dest. + void EmitByte(uint8*& dest, uint8 b) + { + dest[0] = b; + dest += 1; + } + + // Write a word to dest. + void EmitWord(uint8*& dest, uint16 s) + { + dest[0] = (s >> 0) & 255; + dest[1] = (s >> 8) & 255; + dest += 2; + } + + // Write a double word to dest. + void EmitDoubleWord(uint8*& dest, uint32 i) + { + dest[0] = (i >> 0) & 255; + dest[1] = (i >> 8) & 255; + dest[2] = (i >> 16) & 255; + dest[3] = (i >> 24) & 255; + dest += 4; + } + + // Extract a 4 by 4 block of pixels from inPtr and store it in colorBlock. The width parameter + // specifies the size of the image in pixels. + void ExtractBlock(const uint8* inPtr, uint32 width, uint8* colorBlock) + { + for(int j = 0; j < 4; j++) + { + memcpy(&colorBlock[j * 4 * 4], inPtr, 4 * 4); + inPtr += width * 4; + } + } + + // Find a line of best fit through the color space of colorBlock. The line is approximated using + // the extents of the bounding box of the color space. This function does not include the alpha + // channel. 
+ void GetMinMaxColors(const uint8* colorBlock, uint8* minColor, uint8* maxColor) + { + int32 i; + uint8 inset[3]; + + minColor[0] = minColor[1] = minColor[2] = 255; + maxColor[0] = maxColor[1] = maxColor[2] = 0; + + // Find the bounding box (defined by minimum and maximum color). + for(i = 0; i < 16; i++) { + if(colorBlock[i * 4 + 0] < minColor[0]) { + minColor[0] = colorBlock[i * 4 + 0]; + } + if(colorBlock[i * 4 + 1] < minColor[1]) { + minColor[1] = colorBlock[i * 4 + 1]; + } + if(colorBlock[i * 4 + 2] < minColor[2]) { + minColor[2] = colorBlock[i * 4 + 2]; + } + if(colorBlock[i * 4 + 0] > maxColor[0]) { + maxColor[0] = colorBlock[i * 4 + 0]; + } + if(colorBlock[i * 4 + 1] > maxColor[1]) { + maxColor[1] = colorBlock[i * 4 + 1]; + } + if(colorBlock[i * 4 + 2] > maxColor[2]) { + maxColor[2] = colorBlock[i * 4 + 2]; + } + } + + // Inset the bounding box by 1/16 of its size. (i.e. shift right by 4). + inset[0] = (maxColor[0] - minColor[0]) >> INSET_SHIFT; + inset[1] = (maxColor[1] - minColor[1]) >> INSET_SHIFT; + inset[2] = (maxColor[2] - minColor[2]) >> INSET_SHIFT; + + // Clamp the inset bounding box to 255. + minColor[0] = (minColor[0] + inset[0] <= 255) ? minColor[0] + inset[0] : 255; + minColor[1] = (minColor[1] + inset[1] <= 255) ? minColor[1] + inset[1] : 255; + minColor[2] = (minColor[2] + inset[2] <= 255) ? minColor[2] + inset[2] : 255; + + // Clamp the inset bounding box to 0. + maxColor[0] = (maxColor[0] >= inset[0]) ? maxColor[0] - inset[0] : 0; + maxColor[1] = (maxColor[1] >= inset[1]) ? maxColor[1] - inset[1] : 0; + maxColor[2] = (maxColor[2] >= inset[2]) ? maxColor[2] - inset[2] : 0; + } + + // Find a line of best fit through the color space of colorBlock. The line is approximated using + // the extents of the bounding box of the color space. This function includes the alpha channel. 
+ void GetMinMaxColorsWithAlpha(const uint8* colorBlock, uint8* minColor, uint8* maxColor) + { + int32 i; + uint8 inset[4]; + + minColor[0] = minColor[1] = minColor[2] = minColor[3] = 255; + maxColor[0] = maxColor[1] = maxColor[2] = maxColor[3] = 0; + + // Find the bounding box (defined by minimum and maximum color). + for(i = 0; i < 16; i++) { + if(colorBlock[i * 4 + 0] < minColor[0]) { + minColor[0] = colorBlock[i * 4 + 0]; + } + if(colorBlock[i * 4 + 1] < minColor[1]) { + minColor[1] = colorBlock[i * 4 + 1]; + } + if(colorBlock[i * 4 + 2] < minColor[2]) { + minColor[2] = colorBlock[i * 4 + 2]; + } + if(colorBlock[i * 4 + 3] < minColor[3]) { + minColor[3] = colorBlock[i * 4 + 3]; + } + if(colorBlock[i * 4 + 0] > maxColor[0]) { + maxColor[0] = colorBlock[i * 4 + 0]; + } + if(colorBlock[i * 4 + 1] > maxColor[1]) { + maxColor[1] = colorBlock[i * 4 + 1]; + } + if(colorBlock[i * 4 + 2] > maxColor[2]) { + maxColor[2] = colorBlock[i * 4 + 2]; + } + if(colorBlock[i * 4 + 3] > maxColor[3]) { + maxColor[3] = colorBlock[i * 4 + 3]; + } + } + + // Inset the bounding box by 1/16 of its size. (i.e. shift right by 4). + inset[0] = (maxColor[0] - minColor[0]) >> INSET_SHIFT; + inset[1] = (maxColor[1] - minColor[1]) >> INSET_SHIFT; + inset[2] = (maxColor[2] - minColor[2]) >> INSET_SHIFT; + inset[3] = (maxColor[3] - minColor[3]) >> INSET_SHIFT; + + // Clamp the inset bounding box to 255. + minColor[0] = (minColor[0] + inset[0] <= 255) ? minColor[0] + inset[0] : 255; + minColor[1] = (minColor[1] + inset[1] <= 255) ? minColor[1] + inset[1] : 255; + minColor[2] = (minColor[2] + inset[2] <= 255) ? minColor[2] + inset[2] : 255; + minColor[3] = (minColor[3] + inset[3] <= 255) ? minColor[3] + inset[3] : 255; + + // Clamp the inset bounding box to 0. + maxColor[0] = (maxColor[0] >= inset[0]) ? maxColor[0] - inset[0] : 0; + maxColor[1] = (maxColor[1] >= inset[1]) ? maxColor[1] - inset[1] : 0; + maxColor[2] = (maxColor[2] >= inset[2]) ? 
maxColor[2] - inset[2] : 0; + maxColor[3] = (maxColor[3] >= inset[3]) ? maxColor[3] - inset[3] : 0; + } + + // Quantize the pixels of the colorBlock to 4 colors that lie on the line through the color space + // of colorBlock. The parameters minColor and maxColor approximate the line through the color + // space. 32 bits (2 bits per pixel) are written to outBuf, which represent the indices of the 4 + // colors. This function does not include the alpha channel. + void EmitColorIndices(const uint8* colorBlock, uint8*& outBuf, const uint8* minColor, const uint8* maxColor) + { + uint16 colors[4][4]; + uint32 result = 0; + + colors[0][0] = (maxColor[0] & C565_5_MASK) | (maxColor[0] >> 5); + colors[0][1] = (maxColor[1] & C565_6_MASK) | (maxColor[1] >> 6); + colors[0][2] = (maxColor[2] & C565_5_MASK) | (maxColor[2] >> 5); + colors[1][0] = (minColor[0] & C565_5_MASK) | (minColor[0] >> 5); + colors[1][1] = (minColor[1] & C565_6_MASK) | (minColor[1] >> 6); + colors[1][2] = (minColor[2] & C565_5_MASK) | (minColor[2] >> 5); + colors[2][0] = (2 * colors[0][0] + 1 * colors[1][0]) / 3; + colors[2][1] = (2 * colors[0][1] + 1 * colors[1][1]) / 3; + colors[2][2] = (2 * colors[0][2] + 1 * colors[1][2]) / 3; + colors[3][0] = (1 * colors[0][0] + 2 * colors[1][0]) / 3; + colors[3][1] = (1 * colors[0][1] + 2 * colors[1][1]) / 3; + colors[3][2] = (1 * colors[0][2] + 2 * colors[1][2]) / 3; + + for(int i = 15; i >= 0; i--) { + int32 c0 = colorBlock[i * 4 + 0]; + int32 c1 = colorBlock[i * 4 + 1]; + int32 c2 = colorBlock[i * 4 + 2]; + + int32 d0 = abs(colors[0][0] - c0) + abs(colors[0][1] - c1) + abs(colors[0][2] - c2); + int32 d1 = abs(colors[1][0] - c0) + abs(colors[1][1] - c1) + abs(colors[1][2] - c2); + int32 d2 = abs(colors[2][0] - c0) + abs(colors[2][1] - c1) + abs(colors[2][2] - c2); + int32 d3 = abs(colors[3][0] - c0) + abs(colors[3][1] - c1) + abs(colors[3][2] - c2); + + int32 b0 = d0 > d3; + int32 b1 = d1 > d2; + int32 b2 = d0 > d2; + int32 b3 = d1 > d3; + int32 b4 = d2 > d3; + + 
int32 x0 = b1 & b2; + int32 x1 = b0 & b3; + int32 x2 = b0 & b4; + + result |= (x2 | ((x0 | x1) << 1)) << (i << 1); + } + + EmitDoubleWord(outBuf, result); + } + + // Quantize the alpha channel of the pixels in colorBlock to 8 alpha values that are equally + // spaced along the interval defined by minAlpha and maxAlpha. 48 bits (3 bits per alpha) are + // written to outBuf, which represent the indices of the 8 alpha values. + void EmitAlphaIndices(const uint8* colorBlock, uint8*& outBuf, const uint8 minAlpha, const uint8 maxAlpha) + { + assert(maxAlpha >= minAlpha); + + uint8 indices[16]; + + uint8 mid = (maxAlpha - minAlpha) / (2 * 7); + + uint8 ab1 = minAlpha + mid; + uint8 ab2 = (6 * maxAlpha + 1 * minAlpha) / 7 + mid; + uint8 ab3 = (5 * maxAlpha + 2 * minAlpha) / 7 + mid; + uint8 ab4 = (4 * maxAlpha + 3 * minAlpha) / 7 + mid; + uint8 ab5 = (3 * maxAlpha + 4 * minAlpha) / 7 + mid; + uint8 ab6 = (2 * maxAlpha + 5 * minAlpha) / 7 + mid; + uint8 ab7 = (1 * maxAlpha + 6 * minAlpha) / 7 + mid; + + colorBlock += 3; + + for(int i = 0; i < 16; i++) { + uint8 a = colorBlock[i * 4]; + int32 b1 = (a <= ab1); + int32 b2 = (a <= ab2); + int32 b3 = (a <= ab3); + int32 b4 = (a <= ab4); + int32 b5 = (a <= ab5); + int32 b6 = (a <= ab6); + int32 b7 = (a <= ab7); + int32 index = (b1 + b2 + b3 + b4 + b5 + b6 + b7 + 1) & 7; + indices[i] = index ^ (2 > index); + } + + EmitByte(outBuf, (indices[0] >> 0) | (indices[1] << 3) | (indices[2] << 6)); + EmitByte(outBuf, (indices[2] >> 2) | (indices[3] << 1) | (indices[4] << 4) | (indices[ 5] << 7)); + EmitByte(outBuf, (indices[5] >> 1) | (indices[6] << 2) | (indices[7] << 5)); + EmitByte(outBuf, (indices[8] >> 0) | (indices[9] << 3) | (indices[10] << 6)); + EmitByte(outBuf, (indices[10] >> 2) | (indices[11] << 1) | (indices[12] << 4) | (indices[13] << 7)); + EmitByte(outBuf, (indices[13] >> 1) | (indices[14] << 2) | (indices[15] << 5)); + } +} diff --git a/DXTEncoder/src/DXTCompressorSSE2DLL.cpp b/DXTEncoder/src/DXTCompressorSSE2DLL.cpp new 
file mode 100755 index 0000000..28b75dd --- /dev/null +++ b/DXTEncoder/src/DXTCompressorSSE2DLL.cpp @@ -0,0 +1,552 @@ +/* + This code is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This code is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. +*/ + +// Refer to "Real-Time DXT Compression" by J.M.P. van Waveren for a more thorough discussion of the +// algorithms used in this code. + +#include "DXTCompressorDLL.h" +#include +#include +#include +#include + +#define ALIGN16(x) __declspec(align(16)) x +#define INSET_SHIFT 4 // Inset the bounding box with (range >> shift). +#define C565_5_MASK 0xF8 // 0xFF minus last three bits +#define C565_6_MASK 0xFC // 0xFF minus last two bits +#define R_SHUFFLE_D( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 )) + +namespace DXTC +{ + // SSE2 Constants + ALIGN16(static const BYTE SIMD_byte_0[16]) = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + ALIGN16(static const BYTE SIMD_byte_1[16]) = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }; + ALIGN16(static const BYTE SIMD_byte_2[16]) = { 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 }; + ALIGN16(static const BYTE SIMD_byte_7[16]) = { 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 }; + + ALIGN16(static const BYTE SIMD_byte_colorMask[16]) = { C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00, C565_5_MASK, C565_6_MASK, C565_5_MASK, 0x00, 0x00, 0x00, 0x00, 0x00 }; 
+ ALIGN16(static const WORD SIMD_word_0[8]) = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + + ALIGN16(static const WORD SIMD_word_1[8]) = { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }; + ALIGN16(static const WORD SIMD_word_2[8]) = { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }; + ALIGN16(static const WORD SIMD_word_div_by_3[8]) = { (1 << 16) / 3 + 1, (1 << 16) / 3 + 1, (1 << 16) / 3 + 1, (1 << 16) / 3 + 1, (1 << 16) / 3 + 1, (1 << 16) / 3 + 1, (1 << 16) / 3 + 1, (1 << 16) / 3 + 1 }; + + ALIGN16(static const WORD SIMD_word_div_by_7[8]) = { (1 << 16) / 7 + 1, (1 << 16) / 7 + 1, (1 << 16) / 7 + 1, (1 << 16) / 7 + 1, (1 << 16) / 7 + 1, (1 << 16) / 7 + 1, (1 << 16) / 7 + 1, (1 << 16) / 7 + 1 }; + ALIGN16(static const WORD SIMD_word_div_by_14[8]) = { (1 << 16) / 14 + 1, (1 << 16) / 14 + 1, (1 << 16) / 14 + 1, (1 << 16) / 14 + 1, (1 << 16) / 14 + 1, (1 << 16) / 14 + 1, (1 << 16) / 14 + 1, (1 << 16) / 14 + 1 }; + + ALIGN16(static const WORD SIMD_word_scale66554400[8]) = { 6, 6, 5, 5, 4, 4, 0, 0 }; + ALIGN16(static const WORD SIMD_word_scale11223300[8]) = { 1, 1, 2, 2, 3, 3, 0, 0 }; + + ALIGN16(static const DWORD SIMD_dword_alpha_bit_mask0[4]) = { 7 << 0, 0, 7 << 0, 0 }; + ALIGN16(static const DWORD SIMD_dword_alpha_bit_mask1[4]) = { 7 << 3, 0, 7 << 3, 0 }; + ALIGN16(static const DWORD SIMD_dword_alpha_bit_mask2[4]) = { 7 << 6, 0, 7 << 6, 0 }; + ALIGN16(static const DWORD SIMD_dword_alpha_bit_mask3[4]) = { 7 << 9, 0, 7 << 9, 0 }; + ALIGN16(static const DWORD SIMD_dword_alpha_bit_mask4[4]) = { 7 << 12, 0, 7 << 12, 0 }; + ALIGN16(static const DWORD SIMD_dword_alpha_bit_mask5[4]) = { 7 << 15, 0, 7 << 15, 0 }; + ALIGN16(static const DWORD SIMD_dword_alpha_bit_mask6[4]) = { 7 << 18, 0, 7 << 18, 0 }; + ALIGN16(static const DWORD SIMD_dword_alpha_bit_mask7[4]) = { 7 << 21, 0, 7 << 21, 0 }; + + static void ExtractBlock(const BYTE* inPtr, int width, BYTE* colorBlock); + static void GetMinMaxColors(const BYTE* 
colorBlock, BYTE* minColor, BYTE* maxColor); + static void EmitColorIndices(const BYTE* colorBlock, BYTE*& outBuf, const BYTE* minColor, const BYTE* maxColor); + static void EmitAlphaIndices(const BYTE* colorBlock, BYTE*& outBuf, const BYTE minAlpha, const BYTE maxAlpha); + + // Compress an image using SSE2-optimized DXT1 compression. Use the inBuf parameter to point to an + // image in 4-byte RGBA format. The address pointed to by inBuf must be 16-byte aligned. The width + // and height parameters specify the size of the image in pixels (the image is walked in whole 4x4 blocks, so both should be multiples of 4). The buffer pointed to by outBuf + // must be 16-byte aligned and should be large enough to store the compressed image. This + // implementation has an 8:1 compression ratio (each 64-byte block becomes two 16-bit colors plus 32 bits of indices). + void CompressImageDXT1SSE2(const BYTE* inBuf, BYTE* outBuf, int width, int height) + { + ALIGN16(BYTE block[64]); + ALIGN16(BYTE minColor[4]); + ALIGN16(BYTE maxColor[4]); + + for(int j = 0; j < height; j += 4, inBuf += width * 4 * 4) + { + for(int i = 0; i < width; i += 4) + { + ExtractBlock(inBuf + i * 4, width, block); + GetMinMaxColors(block, minColor, maxColor); + EmitWord(outBuf, ColorTo565(maxColor)); + EmitWord(outBuf, ColorTo565(minColor)); + EmitColorIndices(block, outBuf, minColor, maxColor); + } + } + } + + // Compress an image using SSE2-optimized DXT5 compression. Use the inBuf parameter to point to an + // image in 4-byte RGBA format. The address pointed to by inBuf must be 16-byte aligned. The width + // and height parameters specify the size of the image in pixels (the image is walked in whole 4x4 blocks, so both should be multiples of 4). The buffer pointed to by outBuf + // must be 16-byte aligned and should be large enough to store the compressed image. This + // implementation has a 4:1 compression ratio.
+ void CompressImageDXT5SSE2(const BYTE* inBuf, BYTE* outBuf, int width, int height) + { + ALIGN16(BYTE block[64]); + ALIGN16(BYTE minColor[4]); + ALIGN16(BYTE maxColor[4]); + + for(int j = 0; j < height; j += 4, inBuf += width * 4 * 4) + { + for(int i = 0; i < width; i += 4) + { + ExtractBlock(inBuf + i * 4, width, block); + GetMinMaxColors(block, minColor, maxColor); + EmitByte(outBuf, maxColor[3]); + EmitByte(outBuf, minColor[3]); + EmitAlphaIndices(block, outBuf, minColor[3], maxColor[3]); + EmitWord(outBuf, ColorTo565(maxColor)); + EmitWord(outBuf, ColorTo565(minColor)); + EmitColorIndices(block, outBuf, minColor, maxColor); + } + } + } + + // Compress the blocks assigned to this task using SSE2-optimized DXT1 compression. + VOID CompressImageDXT1SSE2Task(VOID* taskData, INT taskContext, UINT taskId, UINT taskCount) + { + const DXTTaskData* data = (const DXTTaskData*)taskData; + + // Scratch buffers for the block currently being compressed. + ALIGN16(BYTE block[64]); + ALIGN16(BYTE minColor[4]); + ALIGN16(BYTE maxColor[4]); + + // Iterate over the blocks assigned to this task. + for (int blockOffset = 0; blockOffset < data->kBlocksPerTask; ++blockOffset) + { + // Stop once the block index runs past the last block. + const INT blockIndex = (INT)taskId * data->kBlocksPerTask + blockOffset; + if(blockIndex >= data->numBlocks) + { + break; + } + + // Compute the offsets into the input and output buffers.
+ const INT blockWidth = data->width / 4; + const INT blockRow = blockIndex / blockWidth; + const INT blockCol = blockIndex % blockWidth; + const INT inOffset = blockRow * blockWidth * 4 * 4 * 4 + blockCol * 4 * 4; + const INT outOffset = blockIndex * 8; + const BYTE* inBuf = data->inBuf + inOffset; + BYTE* outBuf = data->outBuf + outOffset; + + ExtractBlock(inBuf, data->width, block); + GetMinMaxColors(block, minColor, maxColor); + EmitWord(outBuf, ColorTo565(maxColor)); + EmitWord(outBuf, ColorTo565(minColor)); + EmitColorIndices(block, outBuf, minColor, maxColor); + } + } + + // Compress the blocks assigned to this task using SSE2-optimized DXT5 compression. + VOID CompressImageDXT5SSE2Task(VOID* taskData, INT taskContext, UINT taskId, UINT taskCount) + { + const DXTTaskData* data = (const DXTTaskData*)taskData; + + // Scratch buffers for the block currently being compressed. + ALIGN16(BYTE block[64]); + ALIGN16(BYTE minColor[4]); + ALIGN16(BYTE maxColor[4]); + + // Iterate over the blocks assigned to this task. + for (int blockOffset = 0; blockOffset < data->kBlocksPerTask; ++blockOffset) + { + // Stop once the block index runs past the last block. + const INT blockIndex = (INT)taskId * data->kBlocksPerTask + blockOffset; + if(blockIndex >= data->numBlocks) + { + break; + } + + // Compute the offsets into the input and output buffers.
+ const INT blockWidth = data->width / 4; + const INT blockRow = blockIndex / blockWidth; + const INT blockCol = blockIndex % blockWidth; + const INT inOffset = blockRow * blockWidth * 4 * 4 * 4 + blockCol * 4 * 4; + const INT outOffset = blockIndex * 16; + const BYTE* inBuf = data->inBuf + inOffset; + BYTE* outBuf = data->outBuf + outOffset; + + ExtractBlock(inBuf, data->width, block); + GetMinMaxColors(block, minColor, maxColor); + EmitByte(outBuf, maxColor[3]); + EmitByte(outBuf, minColor[3]); + EmitAlphaIndices(block, outBuf, minColor[3], maxColor[3]); + EmitWord(outBuf, ColorTo565(maxColor)); + EmitWord(outBuf, ColorTo565(minColor)); + EmitColorIndices(block, outBuf, minColor, maxColor); + } + } + + // Extract a 4 by 4 block of pixels from inPtr and store it in colorBlock. The width parameter + // specifies the width of the image in pixels (4 bytes per pixel); inPtr and every row it steps to are read with aligned 16-byte loads. + void ExtractBlock(const BYTE* inPtr, int width, BYTE* colorBlock) + { + // Compute the stride in bytes between consecutive image rows. + const int stride = width * 4; + + // Copy the first row of pixels from inPtr into colorBlock. + _mm_store_si128((__m128i*)colorBlock, _mm_load_si128((__m128i*)inPtr)); + inPtr += stride; + + // Copy the second row of pixels from inPtr into colorBlock. + _mm_store_si128((__m128i*)(colorBlock + 16), _mm_load_si128((__m128i*)inPtr)); + inPtr += stride; + + // Copy the third row of pixels from inPtr into colorBlock. + _mm_store_si128((__m128i*)(colorBlock + 32), _mm_load_si128((__m128i*)inPtr)); + inPtr += stride; + + // Copy the fourth row of pixels from inPtr into colorBlock. + _mm_store_si128((__m128i*)(colorBlock + 48), _mm_load_si128((__m128i*)inPtr)); + } + + // Find a line of best fit through the color space of colorBlock. The line is approximated using + // the extents of the bounding box of the color space, inset by (range >> INSET_SHIFT). The min/max is taken per byte, so the alpha + // channel is included as well (the DXT5 callers read minColor[3] and maxColor[3]). + void GetMinMaxColors(const BYTE* colorBlock, BYTE* minColor, BYTE* maxColor) + { + // Compute the min/max of each column of pixels.
+ __m128i min = _mm_load_si128((__m128i*)colorBlock); + __m128i max = _mm_load_si128((__m128i*)colorBlock); + min = _mm_min_epu8(min, *((__m128i*)(colorBlock + 16))); + max = _mm_max_epu8(max, *((__m128i*)(colorBlock + 16))); + min = _mm_min_epu8(min, *((__m128i*)(colorBlock + 32))); + max = _mm_max_epu8(max, *((__m128i*)(colorBlock + 32))); + min = _mm_min_epu8(min, *((__m128i*)(colorBlock + 48))); + max = _mm_max_epu8(max, *((__m128i*)(colorBlock + 48))); + + // Compute the min/max of the 1st and 3rd DWORD and the 2nd and 4th DWORD. + __m128i minShuf = _mm_shuffle_epi32(min, R_SHUFFLE_D(2, 3, 2, 3)); + __m128i maxShuf = _mm_shuffle_epi32(max, R_SHUFFLE_D(2, 3, 2, 3)); + min = _mm_min_epu8(min, minShuf); + max = _mm_max_epu8(max, maxShuf); + + // Compute the min/max of the 1st and 2nd DWORD. + minShuf = _mm_shufflelo_epi16(min, R_SHUFFLE_D(2, 3, 2, 3)); + maxShuf = _mm_shufflelo_epi16(max, R_SHUFFLE_D(2, 3, 2, 3)); + min = _mm_min_epu8(min, minShuf); + max = _mm_max_epu8(max, maxShuf); + + // Compute the inset value (widen to 16-bit lanes first so the subtraction cannot wrap). + const __m128i zero = _mm_setzero_si128(); + min = _mm_unpacklo_epi8(min, zero); + max = _mm_unpacklo_epi8(max, zero); + __m128i inset = _mm_sub_epi16(max, min); + inset = _mm_srli_epi16(inset, INSET_SHIFT); + + // Inset the bounding box. + min = _mm_add_epi16(min, inset); + max = _mm_sub_epi16(max, inset); + + // Store the bounding box (4 bytes each are written to minColor and maxColor). + min = _mm_packus_epi16(min, min); + max = _mm_packus_epi16(max, max); + *((int*)minColor) = _mm_cvtsi128_si32(min); + *((int*)maxColor) = _mm_cvtsi128_si32(max); + } + + // Quantize the pixels of the colorBlock to 4 colors that lie on the line through the color space + // of colorBlock. The parameters minColor and maxColor approximate the line through the color + // space. 32 bits (2 bits per pixel) are written to outBuf, which represent the indices of the 4 + // colors. This function does not include the alpha channel.
+ void EmitColorIndices(const BYTE* colorBlock, BYTE*& outBuf, const BYTE* minColor, const BYTE* maxColor) + { + const __m128i RGB565Mask = _mm_load_si128((__m128i*)SIMD_byte_colorMask); + const __m128i zero = _mm_setzero_si128(); + + // Find 4 colors on the line through maxColor and minColor. + // Compute color0 (maxColor). + __m128i color0 = _mm_cvtsi32_si128(*((int*)maxColor)); + color0 = _mm_and_si128(color0, RGB565Mask); + color0 = _mm_unpacklo_epi8(color0, zero); + __m128i redBlue = _mm_shufflelo_epi16(color0, R_SHUFFLE_D(0, 3, 2, 3)); + __m128i green = _mm_shufflelo_epi16(color0, R_SHUFFLE_D(3, 1, 3, 3)); + redBlue = _mm_srli_epi16(redBlue, 5); + green = _mm_srli_epi16(green, 6); + color0 = _mm_or_si128(color0, redBlue); + color0 = _mm_or_si128(color0, green); + + // Compute color1 (minColor). + __m128i color1 = _mm_cvtsi32_si128(*((int*)minColor)); + color1 = _mm_and_si128(color1, RGB565Mask); + color1 = _mm_unpacklo_epi8(color1, zero); + redBlue = _mm_shufflelo_epi16(color1, R_SHUFFLE_D(0, 3, 2, 3)); + green = _mm_shufflelo_epi16(color1, R_SHUFFLE_D(3, 1, 3, 3)); + redBlue = _mm_srli_epi16(redBlue, 5); + green = _mm_srli_epi16(green, 6); + color1 = _mm_or_si128(color1, redBlue); + color1 = _mm_or_si128(color1, green); + + // Compute and pack color3. + __m128i color3 = _mm_add_epi16(color1, color1); + color3 = _mm_add_epi16(color0, color3); + color3 = _mm_mulhi_epi16(color3, *((__m128i*)SIMD_word_div_by_3)); + color3 = _mm_packus_epi16(color3, zero); + color3 = _mm_shuffle_epi32(color3, R_SHUFFLE_D(0, 1, 0, 1)); + + // Compute and pack color2. + __m128i color2 = _mm_add_epi16(color0, color0); + color2 = _mm_add_epi16(color2, color1); + color2 = _mm_mulhi_epi16(color2, *((__m128i*)SIMD_word_div_by_3)); + color2 = _mm_packus_epi16(color2, zero); + color2 = _mm_shuffle_epi32(color2, R_SHUFFLE_D(0, 1, 0, 1)); + + // Pack color1. + color1 = _mm_packus_epi16(color1, zero); + color1 = _mm_shuffle_epi32(color1, R_SHUFFLE_D(0, 1, 0, 1)); + + // Pack color0. 
+ color0 = _mm_packus_epi16(color0, zero); + color0 = _mm_shuffle_epi32(color0, R_SHUFFLE_D(0, 1, 0, 1)); + + // Assign a color index for each of the 16 colors in the colorblock. + // This loop iterates twice (computes 8 indexes per iteration). + __m128i result = zero; + for(int i = 32; i >= 0; i -= 32) + { + // Load 4 colors. + __m128i colorHi = _mm_loadl_epi64((__m128i*)(colorBlock + i)); + colorHi = _mm_shuffle_epi32(colorHi, R_SHUFFLE_D(0, 2, 1, 3)); + __m128i colorLo = _mm_loadl_epi64((__m128i*)(colorBlock + i + 8)); + colorLo = _mm_shuffle_epi32(colorLo, R_SHUFFLE_D(0, 2, 1, 3)); + + // Compute the sum of absolute differences for each color. + __m128i dHi = _mm_sad_epu8(colorHi, color0); + __m128i dLo = _mm_sad_epu8(colorLo, color0); + __m128i d0 = _mm_packs_epi32(dHi, dLo); + dHi = _mm_sad_epu8(colorHi, color1); + dLo = _mm_sad_epu8(colorLo, color1); + __m128i d1 = _mm_packs_epi32(dHi, dLo); + dHi = _mm_sad_epu8(colorHi, color2); + dLo = _mm_sad_epu8(colorLo, color2); + __m128i d2 = _mm_packs_epi32(dHi, dLo); + dHi = _mm_sad_epu8(colorHi, color3); + dLo = _mm_sad_epu8(colorLo, color3); + __m128i d3 = _mm_packs_epi32(dHi, dLo); + + // Load 4 more colors. + colorHi = _mm_loadl_epi64((__m128i*)(colorBlock + i + 16)); + colorHi = _mm_shuffle_epi32(colorHi, R_SHUFFLE_D(0, 2, 1, 3)); + colorLo = _mm_loadl_epi64((__m128i*)(colorBlock + i + 24)); + colorLo = _mm_shuffle_epi32(colorLo, R_SHUFFLE_D(0, 2, 1, 3)); + + // Compute the sum of absolute differences for each color. Pack result into previous 4 results. 
+ dHi = _mm_sad_epu8(colorHi, color0); + dLo = _mm_sad_epu8(colorLo, color0); + dLo = _mm_packs_epi32(dHi, dLo); + d0 = _mm_packs_epi32(d0, dLo); + dHi = _mm_sad_epu8(colorHi, color1); + dLo = _mm_sad_epu8(colorLo, color1); + dLo = _mm_packs_epi32(dHi, dLo); + d1 = _mm_packs_epi32(d1, dLo); + dHi = _mm_sad_epu8(colorHi, color2); + dLo = _mm_sad_epu8(colorLo, color2); + dLo = _mm_packs_epi32(dHi, dLo); + d2 = _mm_packs_epi32(d2, dLo); + dHi = _mm_sad_epu8(colorHi, color3); + dLo = _mm_sad_epu8(colorLo, color3); + dLo = _mm_packs_epi32(dHi, dLo); + d3 = _mm_packs_epi32(d3, dLo); + + // Compare the distances. + __m128i b0 = _mm_cmpgt_epi16(d0, d3); + __m128i b1 = _mm_cmpgt_epi16(d1, d2); + __m128i b2 = _mm_cmpgt_epi16(d0, d2); + __m128i b3 = _mm_cmpgt_epi16(d1, d3); + __m128i b4 = _mm_cmpgt_epi16(d2, d3); + + // Compute the color index. + __m128i x0 = _mm_and_si128(b2, b1); + __m128i x1 = _mm_and_si128(b3, b0); + __m128i x2 = _mm_and_si128(b4, b0); + __m128i indexBit0 = _mm_or_si128(x0, x1); + indexBit0 = _mm_and_si128(indexBit0, *((__m128i*)SIMD_word_2)); + __m128i indexBit1 = _mm_and_si128(x2, *((__m128i*)SIMD_word_1)); + __m128i index = _mm_or_si128(indexBit1, indexBit0); + + // Pack the index into the result. + __m128i indexHi = _mm_shuffle_epi32(index, R_SHUFFLE_D(2, 3, 0, 1)); + indexHi = _mm_unpacklo_epi16(indexHi, *((__m128i*)SIMD_word_0)); + indexHi = _mm_slli_epi32(indexHi, 8); + __m128i indexLo = _mm_unpacklo_epi16(index, *((__m128i*)SIMD_word_0)); + result = _mm_slli_epi32(result, 16); + result = _mm_or_si128(result, indexHi); + result = _mm_or_si128(result, indexLo); + } + + // Pack the 16 2-bit color indices into a single 32-bit value. 
+ __m128i result1 = _mm_shuffle_epi32(result, R_SHUFFLE_D(1, 2, 3, 0)); + __m128i result2 = _mm_shuffle_epi32(result, R_SHUFFLE_D(2, 3, 0, 1)); + __m128i result3 = _mm_shuffle_epi32(result, R_SHUFFLE_D(3, 0, 1, 2)); + result1 = _mm_slli_epi32(result1, 2); + result2 = _mm_slli_epi32(result2, 4); + result3 = _mm_slli_epi32(result3, 6); + result = _mm_or_si128(result, result1); + result = _mm_or_si128(result, result2); + result = _mm_or_si128(result, result3); + + // Store the result. + *((int*)outBuf) = _mm_cvtsi128_si32(result); + + outBuf += 4; + } + + // Quantize the alpha channel of the pixels in colorBlock to 8 alpha values that are equally + // spaced along the interval defined by minAlpha and maxAlpha. 48 bits (3 bits per alpha) are + // written to outBuf, which represent the indices of the 8 alpha values. + void EmitAlphaIndices(const BYTE* colorBlock, BYTE*& outBuf, const BYTE minAlpha, const BYTE maxAlpha) + { + // Pack the alpha values of the first two rows of colorBlock. + __m128i alpha1 = _mm_load_si128((__m128i*)colorBlock); + alpha1 = _mm_srli_epi32(alpha1, 24); + __m128i alpha2 = _mm_load_si128((__m128i*)(colorBlock + 16)); + alpha2 = _mm_srli_epi32(alpha2, 24); + alpha1 = _mm_packus_epi16(alpha1, alpha2); + + // Pack the alpha values of the last two rows of colorBlock. + __m128i alpha3 = _mm_load_si128((__m128i*)(colorBlock + 32)); + alpha3 = _mm_srli_epi32(alpha3, 24); + __m128i alpha4 = _mm_load_si128((__m128i*)(colorBlock + 48)); + alpha4 = _mm_srli_epi32(alpha4, 24); + alpha3 = _mm_packus_epi16(alpha3, alpha4); + + // Pack all 16 alpha values together. + __m128i alpha = _mm_packus_epi16(alpha1, alpha3); + + // Unpack the maximum alpha value. + __m128i max = _mm_cvtsi32_si128(maxAlpha); + max = _mm_shufflelo_epi16(max, R_SHUFFLE_D(0, 0, 0, 0)); + max = _mm_shuffle_epi32(max, R_SHUFFLE_D(0, 0, 0, 0)); + + // Unpack the minimum alpha value. 
+ __m128i min = _mm_cvtsi32_si128(minAlpha); + min = _mm_shufflelo_epi16(min, R_SHUFFLE_D(0, 0, 0, 0)); + min = _mm_shuffle_epi32(min, R_SHUFFLE_D(0, 0, 0, 0)); + + // Compute the midpoint offset between any two interpolated alpha values. + __m128i mid = _mm_sub_epi16(max, min); + mid = _mm_mulhi_epi16(mid, *((__m128i*)SIMD_word_div_by_14)); + + // Compute the first midpoint. + __m128i ab1 = min; + ab1 = _mm_add_epi16(ab1, mid); + ab1 = _mm_packus_epi16(ab1, ab1); + + // Compute the next three midpoints. + __m128i max456 = _mm_mullo_epi16(max, *((__m128i*)SIMD_word_scale66554400)); + __m128i min123 = _mm_mullo_epi16(min, *((__m128i*)SIMD_word_scale11223300)); + __m128i ab234 = _mm_add_epi16(max456, min123); + ab234 = _mm_mulhi_epi16(ab234, *((__m128i*)SIMD_word_div_by_7)); + ab234 = _mm_add_epi16(ab234, mid); + __m128i ab2 = _mm_shuffle_epi32(ab234, R_SHUFFLE_D(0, 0, 0, 0)); + ab2 = _mm_packus_epi16(ab2, ab2); + __m128i ab3 = _mm_shuffle_epi32(ab234, R_SHUFFLE_D(1, 1, 1, 1)); + ab3 = _mm_packus_epi16(ab3, ab3); + __m128i ab4 = _mm_shuffle_epi32(ab234, R_SHUFFLE_D(2, 2, 2, 2)); + ab4 = _mm_packus_epi16(ab4, ab4); + + // Compute the last three midpoints. + __m128i max123 = _mm_mullo_epi16(max, *((__m128i*)SIMD_word_scale11223300)); + __m128i min456 = _mm_mullo_epi16(min, *((__m128i*)SIMD_word_scale66554400)); + __m128i ab567 = _mm_add_epi16(max123, min456); + ab567 = _mm_mulhi_epi16(ab567, *((__m128i*)SIMD_word_div_by_7)); + ab567 = _mm_add_epi16(ab567, mid); + __m128i ab5 = _mm_shuffle_epi32(ab567, R_SHUFFLE_D(2, 2, 2, 2)); + ab5 = _mm_packus_epi16(ab5, ab5); + __m128i ab6 = _mm_shuffle_epi32(ab567, R_SHUFFLE_D(1, 1, 1, 1)); + ab6 = _mm_packus_epi16(ab6, ab6); + __m128i ab7 = _mm_shuffle_epi32(ab567, R_SHUFFLE_D(0, 0, 0, 0)); + ab7 = _mm_packus_epi16(ab7, ab7); + + // Compare the alpha values to the midpoints. 
+ __m128i b1 = _mm_min_epu8(ab1, alpha); + b1 = _mm_cmpeq_epi8(b1, alpha); + b1 = _mm_and_si128(b1, *((__m128i*)SIMD_byte_1)); + __m128i b2 = _mm_min_epu8(ab2, alpha); + b2 = _mm_cmpeq_epi8(b2, alpha); + b2 = _mm_and_si128(b2, *((__m128i*)SIMD_byte_1)); + __m128i b3 = _mm_min_epu8(ab3, alpha); + b3 = _mm_cmpeq_epi8(b3, alpha); + b3 = _mm_and_si128(b3, *((__m128i*)SIMD_byte_1)); + __m128i b4 = _mm_min_epu8(ab4, alpha); + b4 = _mm_cmpeq_epi8(b4, alpha); + b4 = _mm_and_si128(b4, *((__m128i*)SIMD_byte_1)); + __m128i b5 = _mm_min_epu8(ab5, alpha); + b5 = _mm_cmpeq_epi8(b5, alpha); + b5 = _mm_and_si128(b5, *((__m128i*)SIMD_byte_1)); + __m128i b6 = _mm_min_epu8(ab6, alpha); + b6 = _mm_cmpeq_epi8(b6, alpha); + b6 = _mm_and_si128(b6, *((__m128i*)SIMD_byte_1)); + __m128i b7 = _mm_min_epu8(ab7, alpha); + b7 = _mm_cmpeq_epi8(b7, alpha); + b7 = _mm_and_si128(b7, *((__m128i*)SIMD_byte_1)); + + // Compute the alpha indexes. + __m128i index = _mm_adds_epu8(b1, b2); + index = _mm_adds_epu8(index, b3); + index = _mm_adds_epu8(index, b4); + index = _mm_adds_epu8(index, b5); + index = _mm_adds_epu8(index, b6); + index = _mm_adds_epu8(index, b7); + + // Convert natural index ordering to DXT index ordering. + __m128i byte1 = _mm_load_si128((__m128i*)SIMD_byte_1); + index = _mm_adds_epu8(index, byte1); + __m128i byte7 = _mm_load_si128((__m128i*)SIMD_byte_7); + index = _mm_and_si128(index, byte7); + __m128i byte2 = _mm_load_si128((__m128i*)SIMD_byte_2); + __m128i swapMinMax = _mm_cmpgt_epi8(byte2, index); + swapMinMax = _mm_and_si128(swapMinMax, byte1); + index = _mm_xor_si128(index, swapMinMax); + + // Pack the 16 3-bit indices into 6 bytes. 
+ __m128i alphaBitMask0 = _mm_load_si128((__m128i*)SIMD_dword_alpha_bit_mask0); + __m128i index0 = _mm_and_si128(index, alphaBitMask0); + __m128i index1 = _mm_srli_epi64(index, 8 - 3); + __m128i alphaBitMask1 = _mm_load_si128((__m128i*)SIMD_dword_alpha_bit_mask1); + index1 = _mm_and_si128(index1, alphaBitMask1); + __m128i index2 = _mm_srli_epi64(index, 16 - 6); + __m128i alphaBitMask2 = _mm_load_si128((__m128i*)SIMD_dword_alpha_bit_mask2); + index2 = _mm_and_si128(index2, alphaBitMask2); + __m128i index3 = _mm_srli_epi64(index, 24 - 9); + __m128i alphaBitMask3 = _mm_load_si128((__m128i*)SIMD_dword_alpha_bit_mask3); + index3 = _mm_and_si128(index3, alphaBitMask3); + __m128i index4 = _mm_srli_epi64(index, 32 - 12); + __m128i alphaBitMask4 = _mm_load_si128((__m128i*)SIMD_dword_alpha_bit_mask4); + index4 = _mm_and_si128(index4, alphaBitMask4); + __m128i index5 = _mm_srli_epi64(index, 40 - 15); + __m128i alphaBitMask5 = _mm_load_si128((__m128i*)SIMD_dword_alpha_bit_mask5); + index5 = _mm_and_si128(index5, alphaBitMask5); + __m128i index6 = _mm_srli_epi64(index, 48 - 18); + __m128i alphaBitMask6 = _mm_load_si128((__m128i*)SIMD_dword_alpha_bit_mask6); + index6 = _mm_and_si128(index6, alphaBitMask6); + __m128i index7 = _mm_srli_epi64(index, 56 - 21); + __m128i alphaBitMask7 = _mm_load_si128((__m128i*)SIMD_dword_alpha_bit_mask7); + index7 = _mm_and_si128(index7, alphaBitMask7); + index = _mm_or_si128(index0, index1); + index = _mm_or_si128(index, index2); + index = _mm_or_si128(index, index3); + index = _mm_or_si128(index, index4); + index = _mm_or_si128(index, index5); + index = _mm_or_si128(index, index6); + index = _mm_or_si128(index, index7); + + // Store the indexes. 
+ *((int*)outBuf) = _mm_cvtsi128_si32(index); + index = _mm_shuffle_epi32(index, R_SHUFFLE_D(2, 3, 0, 1)); + *((int*)(outBuf + 3)) = _mm_cvtsi128_si32(index); + + outBuf += 6; + } +} \ No newline at end of file diff --git a/DXTEncoder/src/DXTDecompressor.cpp b/DXTEncoder/src/DXTDecompressor.cpp new file mode 100644 index 0000000..0493c22 --- /dev/null +++ b/DXTEncoder/src/DXTDecompressor.cpp @@ -0,0 +1,120 @@ +/* FasTC + * Copyright (c) 2013 University of North Carolina at Chapel Hill. + * All rights reserved. + * + * Permission to use, copy, modify, and distribute this software and its + * documentation for educational, research, and non-profit purposes, without + * fee, and without a written agreement is hereby granted, provided that the + * above copyright notice, this paragraph, and the following four paragraphs + * appear in all copies. + * + * Permission to incorporate this software into commercial products may be + * obtained by contacting the authors or the Office of Technology Development + * at the University of North Carolina at Chapel Hill . + * + * This software program and documentation are copyrighted by the University of + * North Carolina at Chapel Hill. The software program and documentation are + * supplied "as is," without any accompanying services from the University of + * North Carolina at Chapel Hill or the authors. The University of North + * Carolina at Chapel Hill and the authors do not warrant that the operation of + * the program will be uninterrupted or error-free. The end-user understands + * that the program was developed for research purposes and is advised not to + * rely exclusively on the program for any reason. 
+ * + * IN NO EVENT SHALL THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL OR THE + * AUTHORS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF + * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF NORTH CAROLINA + * AT CHAPEL HILL OR THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL AND THE AUTHORS SPECIFICALLY + * DISCLAIM ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE AND ANY + * STATUTORY WARRANTY OF NON-INFRINGEMENT. THE SOFTWARE PROVIDED HEREUNDER IS ON + * AN "AS IS" BASIS, AND THE UNIVERSITY OF NORTH CAROLINA AT CHAPEL HILL AND + * THE AUTHORS HAVE NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, + * ENHANCEMENTS, OR MODIFICATIONS. + * + * Please send all BUG REPORTS to . + * + * The authors may be contacted via: + * + * Pavel Krajcevski + * Dept of Computer Science + * 201 S Columbia St + * Frederick P. Brooks, Jr. 
Computer Science Bldg + * Chapel Hill, NC 27599-3175 + * USA + * + * + */ + +#include "DXTCompressor.h" +#include +#include +#include + +#include "Pixel.h" + +namespace DXTC +{ + void DecompressDXT1Block(const uint8 *block, uint32 *outBuf) { + uint16 colorA = block[0]; + colorA <<= 8; + colorA |= block[1]; + + uint16 colorB = block[2]; + colorB <<= 8; + colorB |= block[3]; + + uint32 mod = reinterpret_cast(block + 4)[0]; + + uint8 kFiveSixFive[4] = { 0, 5, 6, 5 }; + FasTC::Pixel a, b, c, d; + a.FromBits(reinterpret_cast(&colorA), kFiveSixFive); + b.FromBits(reinterpret_cast(&colorB), kFiveSixFive); + + uint8 kFullDepth[4] = {8, 8, 8, 8}; + a.ChangeBitDepth(kFullDepth); + b.ChangeBitDepth(kFullDepth); + + d = (a + b*2) / 3; + c = (a*2 + b) / 3; + + FasTC::Pixel *colors[4] = { &a, &b, &c, &d }; + + uint32 *outPixels = reinterpret_cast(outBuf); + for(uint32 i = 0; i < 16; i++) { + outPixels[i] = colors[(mod >> (i*2)) & 3]->Pack(); + } + } + + void DecompressDXT1(const DecompressionJob &dcj) + { + assert(!(dcj.height & 3)); + assert(!(dcj.width & 3)); + + uint32 blockW = dcj.width >> 2; + uint32 blockH = dcj.height >> 2; + + const uint32 blockSz = 8; + + uint32 *outPixels = reinterpret_cast(dcj.outBuf); + + uint32 outBlock[16]; + for(int j = 0; j < blockH; j++) { + for(int i = 0; i < blockW; i++) { + + uint32 offset = (j * blockW + i) * blockSz; + DecompressDXT1Block(dcj.inBuf + offset, outBlock); + + for(uint32 y = 0; y < 4; y++) + for(uint32 x = 0; x < 4; x++) { + offset = (j*4 + y)*dcj.width + ((i*4)+x); + outPixels[offset] = outBlock[y*4 + x]; + } + } + } + } +} From 54f5030aeb6f12d04b06af8d5eecdfdda3ee65e6 Mon Sep 17 00:00:00 2001 From: Pavel Krajcevski Date: Wed, 16 Oct 2013 19:35:34 -0400 Subject: [PATCH 5/8] Add some minor code changes. 
--- PVRTCEncoder/src/Compressor.cpp | 103 +++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 16 deletions(-) diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp index 80b0301..d7179d9 100644 --- a/PVRTCEncoder/src/Compressor.cpp +++ b/PVRTCEncoder/src/Compressor.cpp @@ -347,6 +347,14 @@ namespace PVRTCC { } } l.distance = newDist; +#if 0 + // We've already visited this label, but we should have dilated from here, + // so try and dilate now... + if(l.distance < 4 && nbs[4]->distance == 0) { + nbs[4]->distance = l.distance + 1; + nbs[4]->Combine(l); + } +#endif } static void LabelImageBackward(CompressionLabel *labels, @@ -382,6 +390,58 @@ namespace PVRTCC { } } +#if 0 + static void DilateImage(CompressionLabel *labels, uint32 w, uint32 h) { + for(uint32 j = 0; j < h; j++) + for(uint32 i = 0; i < w; i++) { + uint32 idx = j*w + i; + + uint32 minLowDist = labels[idx].lowLabel.distance == 0? 5 : labels[idx].lowLabel.distance - 1; + uint32 minHighDist = labels[idx].highLabel.distance == 0? 
5 : labels[idx].highLabel.distance - 1; + + for(int32 y = 0; y < 3; y++) + for(int32 x = 0; x < 3; x++) { + uint32 cidx = ((j + y + h-1) & (h-1))*w + ((i+x+w-1) & (w-1)); + + if(labels[cidx].lowLabel.distance > 0) + minLowDist = ::std::min(minLowDist, labels[cidx].lowLabel.distance); + + if(labels[cidx].highLabel.distance > 0) + minHighDist = ::std::min(minHighDist, labels[cidx].highLabel.distance); + } + + if(minLowDist != labels[idx].lowLabel.distance - 1) { + labels[idx].lowLabel.nLabels = 0; + } + + if(minHighDist != labels[idx].highLabel.distance - 1) { + labels[idx].highLabel.nLabels = 0; + } + + for(int32 y = 0; y < 3; y++) + for(int32 x = 0; x < 3; x++) { + uint32 cidx = ((j + y + h-1) & (h-1))*w + ((i+x+w-1) & (w-1)); + + if(minLowDist > 0 && labels[cidx].lowLabel.distance == minLowDist) { + labels[idx].lowLabel.Combine(labels[cidx].lowLabel); + } + + if(minHighDist > 0 && labels[cidx].highLabel.distance == minHighDist) { + labels[idx].highLabel.Combine(labels[cidx].highLabel); + } + } + + if(minLowDist > 0 && minLowDist < 5) { + labels[idx].lowLabel.distance = minLowDist + 1; + } + + if(minHighDist > 0 && minHighDist < 5) { + labels[idx].highLabel.distance = minHighDist + 1; + } + } + } +#endif + static FasTC::Color CollectLabel(const uint32 *pixels, const Label &label) { FasTC::Color ret; uint32 nPs = 0; @@ -648,21 +708,9 @@ namespace PVRTCC { } } - void Compress(const CompressionJob &cj, bool bTwoBit, EWrapMode wrapMode) { - const uint32 width = cj.width; - const uint32 height = cj.height; - - // Make sure that width and height are a power of two. - assert((width & (width - 1)) == 0); - assert((height & (height - 1)) == 0); - - CompressionLabel *labels = - (CompressionLabel *)calloc(width * height, sizeof(CompressionLabel)); - - // First traverse forward... 
- LabelImageForward(labels, cj.inBuf, width, height); - #ifndef NDEBUG + void DebugOutputLabels(const char *outputPrefix, const CompressionLabel *labels, + uint32 width, uint32 height) { Image highForwardLabels(width, height); Image lowForwardLabels(width, height); @@ -689,8 +737,29 @@ namespace PVRTCC { } } - highForwardLabels.DebugOutput("HighForwardLabels"); - lowForwardLabels.DebugOutput("LowForwardLabels"); + ::std::string prefix(outputPrefix); + + highForwardLabels.DebugOutput((prefix + ::std::string("HighLabels")).c_str()); + lowForwardLabels.DebugOutput((prefix + ::std::string("LowLabels")).c_str()); + } +#endif + + void Compress(const CompressionJob &cj, bool bTwoBit, EWrapMode wrapMode) { + const uint32 width = cj.width; + const uint32 height = cj.height; + + // Make sure that width and height are a power of two. + assert((width & (width - 1)) == 0); + assert((height & (height - 1)) == 0); + + CompressionLabel *labels = + (CompressionLabel *)calloc(width * height, sizeof(CompressionLabel)); + + // First traverse forward... + LabelImageForward(labels, cj.inBuf, width, height); + +#ifndef NDEBUG + DebugOutputLabels("Forward-", labels, width, height); Image highForwardImg(width, height); Image lowForwardImg(width, height); @@ -737,6 +806,8 @@ namespace PVRTCC { LabelImageBackward(labels, width, height); #ifndef NDEBUG + DebugOutputLabels("Backward-", labels, width, height); + Image highImg(width, height); Image lowImg(width, height); for(uint32 j = 0; j < height; j++) { From f597ec2f77894f6ba841a9ffeb8ae5b181366b63 Mon Sep 17 00:00:00 2001 From: Pavel Krajcevski Date: Fri, 18 Oct 2013 04:12:32 -0400 Subject: [PATCH 6/8] Add entropy calculation to images. 
--- Base/include/Image.h | 3 +++ Base/src/Image.cpp | 56 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/Base/include/Image.h b/Base/include/Image.h index 0b56aed..ab9c974 100644 --- a/Base/include/Image.h +++ b/Base/include/Image.h @@ -107,6 +107,9 @@ namespace FasTC { double ComputePSNR(Image *other); double ComputeSSIM(Image *other); + + double ComputeEntropy(); + double ComputeMeanLocalEntropy(); // Function to allow derived classes to populate the pixel array. // This may involve decompressing a compressed image or otherwise diff --git a/Base/src/Image.cpp b/Base/src/Image.cpp index 2f913a2..bc44449 100644 --- a/Base/src/Image.cpp +++ b/Base/src/Image.cpp @@ -406,6 +406,62 @@ double Image::ComputeSSIM(Image *other) { return mssim / static_cast(w * h); } +template +double Image::ComputeMeanLocalEntropy() { + const uint32 kKernelSz = 15; + const uint32 kHalfKernelSz = kKernelSz / 2; + Image entropyIdx(GetWidth() - kKernelSz + 1, GetHeight() - kKernelSz + 1); + for(uint32 j = kHalfKernelSz; j < GetHeight() - kHalfKernelSz; j++) { + for(uint32 i = kHalfKernelSz; i < GetWidth() - kHalfKernelSz; i++) { + + Image subImg(kKernelSz, kKernelSz); + for(uint32 y = 0; y < kKernelSz; y++) + for(uint32 x = 0; x < kKernelSz; x++) { + subImg(x, y) = (*this)(i - kHalfKernelSz + x, j - kHalfKernelSz + y); + } + entropyIdx(i-kHalfKernelSz, j-kHalfKernelSz) = + static_cast(subImg.ComputeEntropy()); + } + } + + double sum = 0; + for(uint32 j = 0; j < entropyIdx.GetHeight(); j++) + for(uint32 i = 0; i < entropyIdx.GetWidth(); i++) { + sum += static_cast(entropyIdx(i, j)); + } + return sum / (entropyIdx.GetHeight() * entropyIdx.GetWidth()); +} + +template +double Image::ComputeEntropy() { + uint32 hist[256]; + memset(hist, 0, sizeof(hist)); + + ComputePixels(); + + Image intensity(GetWidth(), GetHeight()); + ConvertTo(intensity); + + for(uint32 j = 0; j < GetHeight(); j++) { + for(uint32 i = 0; i < GetWidth(); i++) { + float iflt = 
static_cast(intensity(i, j)); + uint32 iv = static_cast(iflt * 255.0f + 0.5f); + assert(iv < 256); + + hist[iv]++; + } + } + + double ret = 0; + for(uint32 i = 0; i < 256; i++) { + if(hist[i] > 0) { + float p = static_cast(hist[i]) / static_cast(GetHeight() * GetWidth()); + ret += p * log2(p); + } + } + return -ret; +} + // !FIXME! These won't work for non-RGBA8 data. template void Image::ConvertToBlockStreamOrder() { From 22246810d67bc31f8f742d9cabb9ce582a4a0a31 Mon Sep 17 00:00:00 2001 From: Pavel Krajcevski Date: Fri, 18 Oct 2013 04:12:49 -0400 Subject: [PATCH 7/8] Report entropy when compressing a texture. --- CLTool/src/clunix.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CLTool/src/clunix.cpp b/CLTool/src/clunix.cpp index 5df3ac6..7f5dd68 100644 --- a/CLTool/src/clunix.cpp +++ b/CLTool/src/clunix.cpp @@ -226,6 +226,9 @@ int main(int argc, char **argv) { img.SetBlockStreamOrder(false); } + fprintf(stdout, "Entropy: %.5f\n", img.ComputeEntropy()); + fprintf(stdout, "Mean Local Entropy: %.5f\n", img.ComputeMeanLocalEntropy()); + std::ofstream logFile; ThreadSafeStreambuf streamBuf(logFile); std::ostream logStream(&streamBuf); From 3bb68cd8ad2003f8260fc6b77258a1731ff078aa Mon Sep 17 00:00:00 2001 From: Pavel Krajcevski Date: Fri, 18 Oct 2013 04:13:12 -0400 Subject: [PATCH 8/8] Debug refactoring. 
--- PVRTCEncoder/src/Compressor.cpp | 218 +++++++++++++++++--------------- 1 file changed, 115 insertions(+), 103 deletions(-) diff --git a/PVRTCEncoder/src/Compressor.cpp b/PVRTCEncoder/src/Compressor.cpp index d7179d9..921e141 100644 --- a/PVRTCEncoder/src/Compressor.cpp +++ b/PVRTCEncoder/src/Compressor.cpp @@ -188,7 +188,7 @@ namespace PVRTCC { uint32 idx = static_cast(xx) + width * static_cast(yy); uint8 ix = static_cast(255.0f * LookupIntensity(labels, pixels, idx) + 0.5f); - if(ix > i0) { + if(ix >= i0) { ng++; } @@ -709,38 +709,107 @@ namespace PVRTCC { } #ifndef NDEBUG + typedef FasTC::Pixel (*LabelFunc)(const CompressionLabel &); + + static const uint32 *gDbgPixels = NULL; + void DebugOutputImage(const char *imageName, const CompressionLabel *labels, + uint32 width, uint32 height, LabelFunc func) { + Image output(width, height); + for(uint32 j = 0; j < height; j++) + for(uint32 i = 0; i < width; i++) { + output(i, j) = func(labels[j*width + i]); + } + + output.DebugOutput(imageName); + } + + static const FasTC::Color kLabelPalette[4] = { + FasTC::Color(0.0, 0.0, 1.0, 1.0), + FasTC::Color(1.0, 0.0, 1.0, 1.0), + FasTC::Color(1.0, 0.0, 0.0, 1.0), + FasTC::Color(1.0, 1.0, 0.0, 1.0) + }; + + static FasTC::Pixel HighLabelDistance(const CompressionLabel &l) { + FasTC::Pixel ret; + const Label &hl = l.highLabel; + if(hl.distance > 0) { + ret.Unpack(kLabelPalette[hl.distance-1].Pack()); + } + return ret; + } + + static FasTC::Pixel HighPixel(const CompressionLabel &l) { + assert(gDbgPixels); + FasTC::Pixel ret; + const Label &hl = l.highLabel; + if(hl.distance > 0) { + FasTC::Color c; + uint32 nPs = 0; + for(uint32 p = 0; p < hl.nLabels; p++) { + FasTC::Color pc; pc.Unpack(gDbgPixels[hl.idxs[p]]); + c += pc * static_cast(hl.times[p]); + nPs += hl.times[p]; + } + c /= nPs; + ret.Unpack(c.Pack()); + } + return ret; + } + + static FasTC::Pixel LowPixel(const CompressionLabel &l) { + assert(gDbgPixels); + FasTC::Pixel ret; + const Label &ll = l.lowLabel; + 
if(ll.distance > 0) { + FasTC::Color c; + uint32 nPs = 0; + for(uint32 p = 0; p < ll.nLabels; p++) { + FasTC::Color pc; pc.Unpack(gDbgPixels[ll.idxs[p]]); + c += pc * static_cast(ll.times[p]); + nPs += ll.times[p]; + } + c /= nPs; + ret.Unpack(c.Pack()); + } + return ret; + } + + static FasTC::Pixel LowLabelDistance(const CompressionLabel &l) { + FasTC::Pixel ret; + const Label &ll = l.lowLabel; + if(ll.distance > 0) { + ret.Unpack(kLabelPalette[ll.distance-1].Pack()); + } + return ret; + } + + static FasTC::Pixel LabelIntensity(const CompressionLabel &l) { + assert(l.intensity <= 1.0f && l.intensity >= 0.0f); + uint32 iv = static_cast(l.intensity * 255.0f + 0.5); + assert(iv < 256); + return FasTC::Pixel(static_cast(0xFF000000 | (iv) | (iv << 8) | (iv << 16))); + } + + static FasTC::Pixel ExtremaLabels(const CompressionLabel &l) { + assert(!(l.highLabel.distance == 1 && l.lowLabel.distance == 1)); + + if(l.highLabel.distance == 1) { + return FasTC::Pixel(0xFF00FF00U); + } + + if(l.lowLabel.distance == 1) { + return FasTC::Pixel(0xFFFF0000U); + } + + return LabelIntensity(l); + } + void DebugOutputLabels(const char *outputPrefix, const CompressionLabel *labels, uint32 width, uint32 height) { - Image highForwardLabels(width, height); - Image lowForwardLabels(width, height); - - const FasTC::Color kLabelPalette[4] = { - FasTC::Color(0.0, 0.0, 1.0, 1.0), - FasTC::Color(1.0, 0.0, 1.0, 1.0), - FasTC::Color(1.0, 0.0, 0.0, 1.0), - FasTC::Color(1.0, 1.0, 0.0, 1.0) - }; - - for(uint32 j = 0; j < height; j++) { - for(uint32 i = 0; i < width; i++) { - const CompressionLabel &l = labels[j*width + i]; - - const Label &hl = l.highLabel; - if(hl.distance > 0) { - highForwardLabels(i, j).Unpack(kLabelPalette[hl.distance-1].Pack()); - } - - const Label &ll = l.lowLabel; - if(ll.distance > 0) { - lowForwardLabels(i, j).Unpack(kLabelPalette[ll.distance-1].Pack()); - } - } - } - ::std::string prefix(outputPrefix); - - highForwardLabels.DebugOutput((prefix + 
::std::string("HighLabels")).c_str()); - lowForwardLabels.DebugOutput((prefix + ::std::string("LowLabels")).c_str()); + DebugOutputImage((prefix + ::std::string("HighLabels")).c_str(), labels, width, height, HighLabelDistance); + DebugOutputImage((prefix + ::std::string("LowLabels")).c_str(), labels, width, height, LowLabelDistance); } #endif @@ -759,47 +828,22 @@ namespace PVRTCC { LabelImageForward(labels, cj.inBuf, width, height); #ifndef NDEBUG + gDbgPixels = reinterpret_cast(cj.inBuf); + + Image original(width, height); + for(uint32 j = 0; j < height; j++) + for(uint32 i = 0; i < width; i++) { + original(i, j).Unpack(gDbgPixels[j*width + i]); + } + original.DebugOutput("Original"); + + DebugOutputImage("Intensity", labels, width, height, LabelIntensity); + DebugOutputImage("Labels", labels, width, height, ExtremaLabels); + DebugOutputLabels("Forward-", labels, width, height); - Image highForwardImg(width, height); - Image lowForwardImg(width, height); - const uint32 *pixels = reinterpret_cast(cj.inBuf); - for(uint32 j = 0; j < height; j++) { - for(uint32 i = 0; i < width; i++) { - const CompressionLabel &l = labels[j*width + i]; - - const Label &hl = l.highLabel; - if(hl.distance > 0) { - FasTC::Color c; - uint32 nPs = 0; - for(uint32 p = 0; p < hl.nLabels; p++) { - FasTC::Color pc; pc.Unpack(pixels[hl.idxs[p]]); - c += pc * static_cast(hl.times[p]); - nPs += hl.times[p]; - } - c /= nPs; - highForwardImg(i, j).Unpack(c.Pack()); - } - - const Label &ll = l.lowLabel; - if(ll.distance > 0) { - FasTC::Color c; - uint32 nPs = 0; - for(uint32 p = 0; p < ll.nLabels; p++) { - FasTC::Color pc; pc.Unpack(pixels[ll.idxs[p]]); - c += pc * static_cast(ll.times[p]); - nPs += ll.times[p]; - } - c /= nPs; - lowForwardImg(i, j).Unpack(c.Pack()); - } - } - } - - highForwardImg.DebugOutput("HighForwardImg"); - lowForwardImg.DebugOutput("LowForwardImg"); - - std::cout << "Output Forward images." 
<< std::endl; + DebugOutputImage("HighForwardImg", labels, width, height, HighPixel); + DebugOutputImage("LowForwardImg", labels, width, height, LowPixel); #endif // Then traverse backward... @@ -808,40 +852,8 @@ namespace PVRTCC { #ifndef NDEBUG DebugOutputLabels("Backward-", labels, width, height); - Image highImg(width, height); - Image lowImg(width, height); - for(uint32 j = 0; j < height; j++) { - for(uint32 i = 0; i < width; i++) { - const CompressionLabel &l = labels[j*width + i]; - - const Label &hl = l.highLabel; - if(hl.distance > 0) { - FasTC::Color c; - for(uint32 p = 0; p < hl.nLabels; p++) { - FasTC::Color pc; pc.Unpack(pixels[hl.idxs[p]]); - c += pc; - } - c /= hl.nLabels; - highImg(i, j).Unpack(c.Pack()); - } - - const Label &ll = l.lowLabel; - if(ll.distance > 0) { - FasTC::Color c; - for(uint32 p = 0; p < ll.nLabels; p++) { - FasTC::Color pc; pc.Unpack(pixels[ll.idxs[p]]); - c += pc; - } - c /= ll.nLabels; - lowImg(i, j).Unpack(c.Pack()); - } - } - } - - highImg.DebugOutput("HighImg"); - lowImg.DebugOutput("LowImg"); - - std::cout << "Output images." << std::endl; + DebugOutputImage("HighImg", labels, width, height, HighPixel); + DebugOutputImage("LowImg", labels, width, height, LowPixel); #endif // Then combine everything...