Fixed alpha blending for the MMX blit functions. The Remarks section for the function SDL_BlitSurface says that "when SDL_BLENDMODE_BLEND, we have dstA = srcA + (dstA * (1-srcA))". However, I tested some pictures and the result actually implies "dstA=arcA". I stepped into the source code and found that, after I set SDL_BLENDMODE_BLEND for the source surface, the blit function finally used by SDL_BlitSurface on my computer is BlitRGBtoRGBPixelAlphaMMX. There I found this code: else if (alpha == amask) { /* opaque alpha -- copy RGB, keep dst alpha */ *dstp = (*srcp & chanmask) | (*dstp & ~chanmask); The same code is used in BlitRGBtoRGBPixelAlphaMMX3DNOW and BlitRGBtoRGBPixelAlpha, so I think they all still keep the destination alpha. Best regards, Jianyu Guan
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index 3fb89c0..e53f99a 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -337,15 +337,14 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
Uint32 amask = sf->Amask;
Uint32 ashift = sf->Ashift;
- Uint64 multmask;
+ Uint64 multmask, multmask2;
- __m64 src1, dst1, mm_alpha, mm_zero, dmask;
+ __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
- multmask = 0xFFFF;
- multmask <<= (ashift * 2);
- multmask = ~multmask;
- dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
+ multmask = 0x00FF;
+ multmask <<= (ashift * 2);
+ multmask2 = 0x00FF00FF00FF00FF;
while (height--) {
/* *INDENT-OFF* */
@@ -353,9 +352,8 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
Uint32 alpha = *srcp & amask;
if (alpha == 0) {
/* do nothing */
- } else if (alpha == amask) {
- /* opaque alpha -- copy RGB, keep dst alpha */
- *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
+ } else if (alpha == amask || (*dstp & amask) == 0) {
+ *dstp = *srcp;
} else {
src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
@@ -366,15 +364,17 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
- mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
- mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
+ mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
+ mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha*/
+ mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha*/
/* blend */
- src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
- src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
- src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
- dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
- dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
+ src1 = _mm_mullo_pi16(src1, mm_alpha);
+ src1 = _mm_srli_pi16(src1, 8);
+ dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
+ dst1 = _mm_srli_pi16(dst1, 8);
+ dst1 = _mm_add_pi16(src1, dst1);
+ dst1 = _mm_packs_pu16(dst1, mm_zero);
*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
}
@@ -481,23 +481,24 @@ BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
compositioning used (>>8 instead of /255) doesn't handle
it correctly. Also special-case alpha=0 for speed?
Benchmark this! */
- if(alpha) {
- if(alpha == SDL_ALPHA_OPAQUE) {
- *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
+ if (alpha) {
+ if (alpha == SDL_ALPHA_OPAQUE) {
+ *dstp = *srcp;
} else {
/*
* take out the middle component (green), and process
* the other two in parallel. One multiply less.
*/
d = *dstp;
- dalpha = d & 0xff000000;
+ dalpha = d >> 24;
s1 = s & 0xff00ff;
d1 = d & 0xff00ff;
d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
s &= 0xff00;
d &= 0xff00;
d = (d + ((s - d) * alpha >> 8)) & 0xff00;
- *dstp = d1 | d | dalpha;
+ dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
+ *dstp = d1 | d | (dalpha << 24);
}
}
++srcp;
@@ -524,15 +525,14 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
Uint32 amask = sf->Amask;
Uint32 ashift = sf->Ashift;
- Uint64 multmask;
+ Uint64 multmask, multmask2;
- __m64 src1, dst1, mm_alpha, mm_zero, dmask;
+ __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
- multmask = 0xFFFF;
+ multmask = 0x00FF;
multmask <<= (ashift * 2);
- multmask = ~multmask;
- dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
+ multmask2 = 0x00FF00FF00FF00FF;
while (height--) {
/* *INDENT-OFF* */
@@ -545,9 +545,8 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
alpha = *srcp & amask;
if (alpha == 0) {
/* do nothing */
- } else if (alpha == amask) {
- /* copy RGB, keep dst alpha */
- *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
+ } else if (alpha == amask || (*dstp & amask) == 0) {
+ *dstp = *srcp;
} else {
src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
@@ -558,15 +557,18 @@ BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
- mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
- mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
+ mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
+ mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha*/
+ mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha*/
+
/* blend */
- src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
- src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
- src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
- dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
- dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
+ src1 = _mm_mullo_pi16(src1, mm_alpha);
+ src1 = _mm_srli_pi16(src1, 8);
+ dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
+ dst1 = _mm_srli_pi16(dst1, 8);
+ dst1 = _mm_add_pi16(src1, dst1);
+ dst1 = _mm_packs_pu16(dst1, mm_zero);
*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
}