Fixed alpha blending for the MMX blit functions

I see the Remarks of function SDL_BlitSurface shows that "when SDL_BLENDMODE_BLEND, we have dstA = srcA + (dstA * (1-srcA))". however, I tested some pictures but the result implies "dstA=arcA" actually. I stepped into the source code, and found after I set SDL_BLENDMODE_BLEND for the source surface, the final blit function is BlitRGBtoRGBPixelAlphaMMX when I use SDL_BlitSurface on my computer. And I found these codes:

    else if (alpha == amask) {
    /* opaque alpha -- copy RGB, keep dst alpha */
    *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);

The same code is used in BlitRGBtoRGBPixelAlphaMMX3DNOW and BlitRGBtoRGBPixelAlpha. So I think they still keep dst alpha.

Best regards,
Jianyu Guan
This commit is contained in:
Sam Lantinga 2013-08-16 06:59:19 -07:00
parent 65728477af
commit 89bc80f1ae

View file

@ -337,15 +337,14 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
Uint32 amask = sf->Amask; Uint32 amask = sf->Amask;
Uint32 ashift = sf->Ashift; Uint32 ashift = sf->Ashift;
Uint64 multmask; Uint64 multmask, multmask2;
__m64 src1, dst1, mm_alpha, mm_zero, dmask; __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
multmask = 0xFFFF; multmask = 0x00FF;
multmask <<= (ashift * 2); multmask <<= (ashift * 2);
multmask = ~multmask; multmask2 = 0x00FF00FF00FF00FF;
dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
while (height--) { while (height--) {
/* *INDENT-OFF* */ /* *INDENT-OFF* */
@ -353,9 +352,8 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
Uint32 alpha = *srcp & amask; Uint32 alpha = *srcp & amask;
if (alpha == 0) { if (alpha == 0) {
/* do nothing */ /* do nothing */
} else if (alpha == amask) { } else if (alpha == amask || (*dstp & amask) == 0) {
/* opaque alpha -- copy RGB, keep dst alpha */ *dstp = *srcp;
*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
} else { } else {
src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/ src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
@ -366,15 +364,17 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */ mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha*/
mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha*/
/* blend */ /* blend */
src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */ src1 = _mm_mullo_pi16(src1, mm_alpha);
src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */ src1 = _mm_srli_pi16(src1, 8);
src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */ dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */ dst1 = _mm_srli_pi16(dst1, 8);
dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ dst1 = _mm_add_pi16(src1, dst1);
dst1 = _mm_packs_pu16(dst1, mm_zero);
*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
} }
@ -481,23 +481,24 @@ BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
compositioning used (>>8 instead of /255) doesn't handle compositioning used (>>8 instead of /255) doesn't handle
it correctly. Also special-case alpha=0 for speed? it correctly. Also special-case alpha=0 for speed?
Benchmark this! */ Benchmark this! */
if(alpha) { if (alpha) {
if(alpha == SDL_ALPHA_OPAQUE) { if (alpha == SDL_ALPHA_OPAQUE) {
*dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); *dstp = *srcp;
} else { } else {
/* /*
* take out the middle component (green), and process * take out the middle component (green), and process
* the other two in parallel. One multiply less. * the other two in parallel. One multiply less.
*/ */
d = *dstp; d = *dstp;
dalpha = d & 0xff000000; dalpha = d >> 24;
s1 = s & 0xff00ff; s1 = s & 0xff00ff;
d1 = d & 0xff00ff; d1 = d & 0xff00ff;
d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
s &= 0xff00; s &= 0xff00;
d &= 0xff00; d &= 0xff00;
d = (d + ((s - d) * alpha >> 8)) & 0xff00; d = (d + ((s - d) * alpha >> 8)) & 0xff00;
*dstp = d1 | d | dalpha; dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
*dstp = d1 | d | (dalpha << 24);
} }
} }
++srcp; ++srcp;
@ -524,15 +525,14 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
Uint32 amask = sf->Amask; Uint32 amask = sf->Amask;
Uint32 ashift = sf->Ashift; Uint32 ashift = sf->Ashift;
Uint64 multmask; Uint64 multmask, multmask2;
__m64 src1, dst1, mm_alpha, mm_zero, dmask; __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
multmask = 0xFFFF; multmask = 0x00FF;
multmask <<= (ashift * 2); multmask <<= (ashift * 2);
multmask = ~multmask; multmask2 = 0x00FF00FF00FF00FF;
dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
while (height--) { while (height--) {
/* *INDENT-OFF* */ /* *INDENT-OFF* */
@ -545,9 +545,8 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
alpha = *srcp & amask; alpha = *srcp & amask;
if (alpha == 0) { if (alpha == 0) {
/* do nothing */ /* do nothing */
} else if (alpha == amask) { } else if (alpha == amask || (*dstp & amask) == 0) {
/* copy RGB, keep dst alpha */ *dstp = *srcp;
*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
} else { } else {
src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/ src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
@ -558,15 +557,18 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */ mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha*/
mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha*/
/* blend */ /* blend */
src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */ src1 = _mm_mullo_pi16(src1, mm_alpha);
src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */ src1 = _mm_srli_pi16(src1, 8);
src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */ dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */ dst1 = _mm_srli_pi16(dst1, 8);
dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ dst1 = _mm_add_pi16(src1, dst1);
dst1 = _mm_packs_pu16(dst1, mm_zero);
*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
} }