ARM: SIMD assembly optimization for function BlitARGBto565PixelAlpha
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index 07dd980..a1d087d 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -390,6 +390,21 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
#endif /* __MMX__ */
#if SDL_ARM_SIMD_BLITTERS
+void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
+
+/* Adapter between the SDL_BlitInfo calling convention and the assembly
+ * routine declared above: unpacks the blit rectangle and forwards it. */
+static void
+BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)
+{
+ const int32_t cols = info->dst_w;
+
+ /* Both strides are handed over in pixels: row width plus the skip value
+ * shifted down by the pixel size (>> 1 for uint16_t, >> 2 for uint32_t);
+ * the skips are presumably byte counts - confirm against SDL_BlitInfo. */
+ BlitARGBto565PixelAlphaARMSIMDAsm(cols,
+ info->dst_h,
+ (uint16_t *)info->dst,
+ cols + (info->dst_skip >> 1),
+ (uint32_t *)info->src,
+ cols + (info->src_skip >> 2));
+}
+
void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
static void
@@ -1301,6 +1316,15 @@ SDL_CalculateBlitA(SDL_Surface * surface)
}
case 2:
+#if SDL_ARM_SIMD_BLITTERS
+ if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
+ && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
+ && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
+ || (sf->Bmask == 0xff && df->Bmask == 0x1f))
+ && SDL_HasARMSIMD())
+ return BlitARGBto565PixelAlphaARMSIMD;
+ else
+#endif
if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
&& sf->Gmask == 0xff00
&& ((sf->Rmask == 0xff && df->Rmask == 0x1f)
diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S
index 2d65887..6dcbbe5 100644
--- a/src/video/arm/pixman-arm-simd-asm.S
+++ b/src/video/arm/pixman-arm-simd-asm.S
@@ -166,3 +166,200 @@ generate_composite_function \
RGBtoRGBPixelAlpha_process_tail
/******************************************************************************/
+
+.macro ARGBto565PixelAlpha_init /* init hook handed to generate_composite_function: builds the two packed red/blue constants */
+ line_saved_regs STRIDE_D, STRIDE_S, ORIG_W /* helper defined outside this view; the pixel macros below reuse exactly these three registers (alpha temp, ghalf, misc temp) */
+ mov MASK, #0x001f /* 5-bit field mask in the low halfword */
+ mov STRIDE_M, #0x0010 /* 16 in the low halfword: rounding term added before the final >> 10 */
+ orr MASK, MASK, MASK, lsl #16 /* MASK = 0x001f001f, passed later as rbmask */
+ orr STRIDE_M, STRIDE_M, STRIDE_M, lsl #16 /* STRIDE_M = 0x00100010, passed later as rbhalf */
+.endm
+
+.macro ARGBto565PixelAlpha_newline /* newline hook handed to generate_composite_function: loads the green rounding constant */
+ mov STRIDE_S, #0x0200 /* STRIDE_S = 0x0200, passed later as ghalf; STRIDE_S is named in line_saved_regs, so presumably it does not survive from one line to the next - confirm against the framework header */
+.endm
+
+/* Blend one 32bpp source pixel over one 16bpp (5-6-5) destination pixel
+ * using the top 5 bits of the source alpha.
+ * On entry:
+ * s holds 1 32bpp source pixel
+ * d holds 1 16bpp destination pixel
+ * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
+ * other registers are temporaries
+ * On exit:
+ * Constant registers preserved
+ * low halfword of d holds the blended pixel (upper halfword is not cleaned)
+ * s and the temporaries are overwritten
+ */
+
+.macro ARGBto565PixelAlpha_1pixel_translucent s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
+ mov alpha, s, lsr #27 /* alpha = top 5 bits of source bits 24-31 (0..31) */
+ and misc, s, #0xfc00 /* misc = top 6 bits of source green, still at bits 10-15 */
+ and g, d, #0x07e0 /* g = destination green field, in place at bits 5-10 */
+ pkhbt rb, d, d, lsl #5 /* rb = d bits 0-15 in the low half, d bits 11-26 in the high half */
+ rsb misc, g, misc, lsr #5 /* misc = source green (moved to bits 5-10) minus destination green */
+ and s, rbmask, s, lsr #3 /* s = top 5 bits of source bytes 0 and 2, at bits 0-4 and 16-20 */
+ and rb, rbmask, rb /* rb = destination bits 0-4 (at 0-4) and bits 11-15 (at 16-20) */
+ sub s, s, rb /* s = source minus destination for both 5-bit fields in one subtract */
+ smlabb misc, misc, alpha, ghalf /* misc = green difference * alpha + ghalf (signed 16x16 multiply-accumulate) */
+ mla s, s, alpha, rbhalf /* s = packed difference * alpha + rbhalf */
+ add misc, misc, misc, lsl #5 /* misc *= 33 */
+ add g, g, misc, asr #10 /* g += misc >> 10 (33/1024 is approximately 1/31, 31 being the largest 5-bit alpha) */
+ add s, s, s, lsl #5 /* s *= 33 */
+ and g, g, #0x07e0 /* keep only the green field */
+ add rb, rb, s, asr #10 /* rb += s >> 10, same scaling as green */
+ and rb, rb, rbmask /* keep only the two 5-bit fields */
+ pkhbt rb, rb, rb, lsl #11 /* low half unchanged; the field at bits 16-20 moves to bits 27-31 */
+ orr d, rb, g /* d = bits 0-4 field | green (the other field is still parked at 27-31) */
+ orr d, d, rb, lsr #16 /* bring bits 27-31 down to bits 11-15, completing the 16bpp pixel */
+.endm
+
+/* Convert one 32bpp source pixel to 16bpp (5-6-5); the incoming destination
+ * value is not read.
+ * On entry:
+ * s holds 1 32bpp source pixel
+ * d is overwritten without being read
+ * rbmask holds 0x001f001f
+ * On exit:
+ * Constant registers preserved
+ * low halfword of d holds the converted pixel (upper halfword is not cleaned)
+ * s is overwritten
+ */
+
+.macro ARGBto565PixelAlpha_1pixel_opaque s, d, rbmask
+ and d, rbmask, s, lsr #3 /* d = top 5 bits of source bytes 0 and 2, at bits 0-4 and 16-20 */
+ and s, s, #0xfc00 /* s = top 6 bits of source green, at bits 10-15 */
+ orr d, d, d, lsr #5 /* copy the bits 16-20 field down to bits 11-15 */
+ orr d, d, s, lsr #5 /* insert green at bits 5-10 */
+.endm
+
+/* Blend two 32bpp source pixels over two 16bpp (5-6-5) destination pixels and
+ * store both results. The two per-pixel computations are interleaved and
+ * registers are recycled part-way through, so instruction order matters.
+ * On entry:
+ * s1, s2 hold 2 32bpp source pixels
+ * d holds 2 16bpp destination pixels (low halfword pairs with s1, high with s2)
+ * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
+ * other registers are temporaries
+ * DST already points past the pair (the stores use offsets -4 and -2)
+ * On exit:
+ * Constant registers preserved
+ * Blended results have been written through destination pointer
+ * s1, s2, d and the temporaries are overwritten
+ */
+
+.macro ARGBto565PixelAlpha_2pixels_translucent s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
+ mov alpha, s1, lsr #27 /* pixel 1: alpha = top 5 bits of source bits 24-31 */
+ and misc, s1, #0xfc00 /* pixel 1: top 6 bits of source green, at bits 10-15 */
+ and g, d, #0x07e0 /* pixel 1: destination green, in place at bits 5-10 */
+ pkhbt rb, d, d, lsl #5 /* pixel 1: rb = d bits 0-15 (low half), d bits 11-26 (high half) */
+ rsb misc, g, misc, lsr #5 /* pixel 1: green difference, source minus destination, at bits 5-10 scale */
+ and s1, rbmask, s1, lsr #3 /* pixel 1: top 5 bits of source bytes 0 and 2, at bits 0-4 and 16-20 */
+ and rb, rbmask, rb /* pixel 1: destination 5-bit fields at bits 0-4 and 16-20 */
+ sub s1, s1, rb /* pixel 1: packed source minus destination */
+ smlabb misc, misc, alpha, ghalf /* pixel 1: green difference * alpha + ghalf */
+ mla s1, s1, alpha, rbhalf /* pixel 1: packed difference * alpha + rbhalf */
+ uxth d, d, ror #16 /* d = second destination pixel (former high halfword), zero-extended */
+ add misc, misc, misc, lsl #5 /* pixel 1: misc *= 33 */
+ mov alpha, s2, lsr #27 /* pixel 2: alpha (register is free again after the two multiplies above) */
+ add g, g, misc, asr #10 /* pixel 1: g += misc >> 10 (33/1024 is approximately 1/31) */
+ add s1, s1, s1, lsl #5 /* pixel 1: s1 *= 33 */
+ and g, g, #0x07e0 /* pixel 1: keep only the green field */
+ add rb, rb, s1, asr #10 /* pixel 1: rb += s1 >> 10 */
+ and rb, rb, rbmask /* pixel 1: keep only the two 5-bit fields */
+ and misc, s2, #0xfc00 /* pixel 2: top 6 bits of source green, at bits 10-15 */
+ pkhbt rb, rb, rb, lsl #11 /* pixel 1: field at bits 16-20 moves to bits 27-31 */
+ and s1, d, #0x07e0 /* pixel 2: destination green; s1 is recycled as the green register */
+ pkhbt d, d, d, lsl #5 /* pixel 2: d is recycled as the packed 5-bit-field register */
+ rsb misc, s1, misc, lsr #5 /* pixel 2: green difference, source minus destination */
+ and s2, rbmask, s2, lsr #3 /* pixel 2: top 5 bits of source bytes 0 and 2 */
+ and d, rbmask, d /* pixel 2: destination 5-bit fields at bits 0-4 and 16-20 */
+ sub s2, s2, d /* pixel 2: packed source minus destination */
+ smlabb misc, misc, alpha, ghalf /* pixel 2: green difference * alpha + ghalf */
+ mla s2, s2, alpha, rbhalf /* pixel 2: packed difference * alpha + rbhalf */
+ orr alpha, rb, g /* pixel 1: start assembling the result in alpha (its last use as alpha was just above) */
+ add misc, misc, misc, lsl #5 /* pixel 2: misc *= 33 */
+ orr alpha, alpha, rb, lsr #16 /* pixel 1: bring bits 27-31 down to bits 11-15 */
+ add s1, s1, misc, asr #10 /* pixel 2: green += misc >> 10 */
+ add s2, s2, s2, lsl #5 /* pixel 2: s2 *= 33 */
+ and s1, s1, #0x07e0 /* pixel 2: keep only the green field */
+ add d, d, s2, asr #10 /* pixel 2: packed fields += s2 >> 10 */
+ and d, d, rbmask /* pixel 2: keep only the two 5-bit fields */
+ strh alpha, [DST, #-4] /* store blended pixel 1 */
+ pkhbt d, d, d, lsl #11 /* pixel 2: field at bits 16-20 moves to bits 27-31 */
+ orr alpha, d, s1 /* pixel 2: bits 0-4 field | green */
+ orr alpha, alpha, d, lsr #16 /* pixel 2: bring bits 27-31 down to bits 11-15 */
+ strh alpha, [DST, #-2] /* store blended pixel 2 */
+.endm
+
+/* Convert two 32bpp source pixels to 16bpp (5-6-5) and store them; the
+ * existing destination pixels are not read.
+ * On entry:
+ * s1, s2 hold 2 32bpp source pixels
+ * rbmask holds 0x001f001f
+ * other registers (d, g) are temporaries
+ * DST already points past the pair (the stores use offsets -4 and -2)
+ * On exit:
+ * Constant registers preserved
+ * Converted results have been written through destination pointer
+ * s1, d and g are overwritten; s2 is only read
+ */
+
+.macro ARGBto565PixelAlpha_2pixels_opaque s1, s2, d, rbmask, g
+ and g, s1, #0xfc00 /* pixel 1: top 6 bits of source green, at bits 10-15 */
+ and d, rbmask, s1, lsr #3 /* pixel 1: top 5 bits of source bytes 0 and 2, at bits 0-4 and 16-20 */
+ and s1, rbmask, s2, lsr #3 /* pixel 2: same extraction; s1 is recycled as the pixel 2 result register */
+ orr d, d, d, lsr #5 /* pixel 1: copy the bits 16-20 field down to bits 11-15 */
+ orr d, d, g, lsr #5 /* pixel 1: insert green at bits 5-10 */
+ and g, s2, #0xfc00 /* pixel 2: top 6 bits of source green */
+ strh d, [DST, #-4] /* store converted pixel 1 (low halfword only) */
+ orr s1, s1, s1, lsr #5 /* pixel 2: copy the bits 16-20 field down to bits 11-15 */
+ orr s1, s1, g, lsr #5 /* pixel 2: insert green at bits 5-10 */
+ strh s1, [DST, #-2] /* store converted pixel 2 (low halfword only) */
+.endm
+
+.macro ARGBto565PixelAlpha_2pixels_head /* loads one source/destination pixel pair and leaves Z set when both source alphas are zero */
+ ldrd WK0, WK1, [SRC], #8 /* two 32bpp source pixels; SRC post-increments by 8 bytes */
+ ldr WK2, [DST], #4 /* two 16bpp destination pixels in one word; DST post-increments by 4 bytes */
+ orr SCRATCH, WK0, WK1 /* OR: bits 24-31 are zero only if both alpha bytes are zero */
+ and ORIG_W, WK0, WK1 /* AND: bits 24-31 are 0xff only if both alpha bytes are 0xff */
+ tst SCRATCH, #0xff000000 /* Z = 1 when both pixels are fully transparent */
+.endm
+
+.macro ARGBto565PixelAlpha_2pixels_tail /* consumes the flags and WK0-WK2/ORIG_W left by 2pixels_head and picks the skip, convert or blend path */
+ beq 20f @ all transparent
+ cmp ORIG_W, #0xff000000 /* unsigned >= holds only when the ANDed alpha byte is 0xff */
+ bhs 10f @ all opaque
+ ARGBto565PixelAlpha_2pixels_translucent WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W /* general blend; also reached when just one pixel of the pair is opaque or transparent */
+ b 20f
+10: ARGBto565PixelAlpha_2pixels_opaque WK0, WK1, WK2, MASK, SCRATCH /* both opaque: format conversion only */
+20: /* join point; on the all-transparent path nothing has been stored */
+.endm
+
+.macro ARGBto565PixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head half of the per-block processor; of the parameters only numbytes is used in this body */
+ .if numbytes == 16 /* 16 destination bytes = 8 pixels: two complete pairs here, a third and the final head below */
+ ARGBto565PixelAlpha_2pixels_head
+ ARGBto565PixelAlpha_2pixels_tail
+ ARGBto565PixelAlpha_2pixels_head
+ ARGBto565PixelAlpha_2pixels_tail
+ .endif
+ .if numbytes >= 8 /* one more complete pair for the 8- and 16-byte cases */
+ ARGBto565PixelAlpha_2pixels_head
+ ARGBto565PixelAlpha_2pixels_tail
+ .endif
+ .if numbytes >= 4 /* last pair: load and test only; the matching 2pixels_tail is emitted by process_tail */
+ ARGBto565PixelAlpha_2pixels_head
+ .else // numbytes == 2
+ ldr WK0, [SRC], #4 /* single 32bpp source pixel; SRC post-increments by 4 bytes */
+ ldrh WK2, [DST], #2 /* single 16bpp destination pixel; DST post-increments by 2 bytes */
+ tst WK0, #0xff000000 /* Z = 1 when the pixel is fully transparent; consumed by process_tail */
+ .endif
+.endm
+
+.macro ARGBto565PixelAlpha_process_tail cond, numbytes, firstreg /* tail half of the per-block processor; finishes the pixel(s) loaded last by process_head, relying on the flags it left */
+ .if numbytes >= 4 /* a pixel pair is pending from the final 2pixels_head */
+ ARGBto565PixelAlpha_2pixels_tail
+ .else // numbytes == 2
+ beq 20f @ all transparent
+ cmp WK0, #0xff000000 /* unsigned >= holds only when the alpha byte is 0xff */
+ bhs 10f @ opaque
+ ARGBto565PixelAlpha_1pixel_translucent WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W /* blend; result is left in the low halfword of WK2 */
+ b 19f
+10: ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK /* conversion only; result is left in the low halfword of WK2 */
+19: strh WK2, [DST, #-2] /* shared store for the blend and opaque paths; DST was already advanced in process_head */
+20: /* transparent pixels arrive here without storing */
+ .endif
+.endm
+
+generate_composite_function \
+ BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, /* exported symbol; 32, 0, 16 are presumably source/mask/destination bits per pixel - confirm against the macro definition */ \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, /* flags are defined by the framework header, not in this view; the process macros above do read DST, alter the condition flags, issue their own strh stores and overwrite WK0 */ \
+ 2, /* prefetch distance */ \
+ ARGBto565PixelAlpha_init, /* init: builds MASK and STRIDE_M constants */ \
+ ARGBto565PixelAlpha_newline, /* newline: loads the STRIDE_S constant */ \
+ nop_macro, /* cleanup */ \
+ ARGBto565PixelAlpha_process_head, /* process_head: loads pixels and sets flags */ \
+ ARGBto565PixelAlpha_process_tail /* process_tail: blends or converts and stores */