Commit 7ac733f0254997b4bb9b578f8ea2ff4817125627

Ben Avison 2019-10-24T21:15:21

ARM: SIMD assembly optimization for BGR-to-RGB 32bpp normal blits

diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c
index c3de8ea..6ad79c8 100644
--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -41,7 +41,8 @@
 enum blit_features {
 	BLIT_FEATURE_HAS_MMX = 1,
 	BLIT_FEATURE_HAS_ALTIVEC = 2,
-	BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH = 4
+	BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH = 4,
+	BLIT_FEATURE_HAS_ARM_SIMD = 8	/* distinct bit: the GetBlitFeatures() macro ORs it in when SDL_HasARMSIMD() is true */
 };
 
 #if SDL_ALTIVEC_BLITTERS
@@ -931,7 +932,24 @@ GetBlitFeatures(void)
 #endif
 #else
 /* Feature 1 is has-MMX */
-#define GetBlitFeatures() (SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0)
+#define GetBlitFeatures() ((SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0) | (SDL_HasARMSIMD() ? BLIT_FEATURE_HAS_ARM_SIMD : 0))	/* Feature 8 is has-ARM-SIMD */
+#endif
+
+#if SDL_ARM_SIMD_BLITTERS
+void Blit_BGR888_RGB888ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
+/* Unpacks an SDL_BlitInfo into the argument list of the assembly routine declared above. */
+static void
+Blit_BGR888_RGB888ARMSIMD(SDL_BlitInfo * info)
+{
+	int32_t width = info->dst_w;	/* one width/height pair (the destination's) is passed for both surfaces */
+	int32_t height = info->dst_h;
+	uint32_t *dstp = (uint32_t *)info->dst;
+	int32_t dststride = width + (info->dst_skip >> 2);	/* skip / 4 added to width; presumably converts a byte skip to 32-bit pixels -- TODO confirm */
+	uint32_t *srcp = (uint32_t *)info->src;
+	int32_t srcstride = width + (info->src_skip >> 2);	/* same computation for the source side */
+
+	Blit_BGR888_RGB888ARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
+}
 #endif
 
 /* This is now endian dependent */
@@ -3270,6 +3288,10 @@ static const struct blit_table normal_blit_4[] = {
     {0x00000000, 0x00000000, 0x00000000, 2, 0x0000F800, 0x000007E0, 0x0000001F,
      BLIT_FEATURE_HAS_ALTIVEC, Blit_RGB888_RGB565Altivec, NO_ALPHA},
 #endif
+#if SDL_ARM_SIMD_BLITTERS
+    {0x000000FF, 0x0000FF00, 0x00FF0000, 4, 0x00FF0000, 0x0000FF00, 0x000000FF,
+     BLIT_FEATURE_HAS_ARM_SIMD, Blit_BGR888_RGB888ARMSIMD, NO_ALPHA | COPY_ALPHA },
+#endif
     /* 4->3 with same rgb triplet */
     {0x000000FF, 0x0000FF00, 0x00FF0000, 3, 0x000000FF, 0x0000FF00, 0x00FF0000,
      0, Blit_3or4_to_3or4__same_rgb, NO_ALPHA | SET_ALPHA},
diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S
index 6dcbbe5..81e38c4 100644
--- a/src/video/arm/pixman-arm-simd-asm.S
+++ b/src/video/arm/pixman-arm-simd-asm.S
@@ -363,3 +363,45 @@ generate_composite_function \
     nop_macro, /* cleanup */ \
     ARGBto565PixelAlpha_process_head, \
     ARGBto565PixelAlpha_process_tail
+
+ /******************************************************************************/
+
+.macro BGR888toRGB888_1pixel cond, reg, tmp  /* swaps bytes 0 and 2 of WK&reg in place; bytes 1 and 3 keep their position; tmp is overwritten */
+        uxtb16&cond  tmp, WK&reg, ror #8  /* tmp = bytes 3 and 1 of the pixel, each zero-extended to a halfword */
+        uxtb16&cond  WK&reg, WK&reg, ror #16  /* WK = byte 0 in the high halfword, byte 2 in the low halfword */
+        orr&cond     WK&reg, WK&reg, tmp, lsl #8  /* merge: result bytes (3..0) = b3, b0, b1, b2 */
+.endm
+
+.macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2  /* same byte 0 <-> byte 2 swap as the 1-pixel macro, applied to WK&reg1 and WK&reg2 */
+        uxtb16&cond  tmp1, WK&reg1, ror #8  /* tmp1 = bytes 3 and 1 of pixel 1 */
+        uxtb16&cond  WK&reg1, WK&reg1, ror #16  /* pixel 1: byte 0 to high halfword, byte 2 to low halfword */
+        uxtb16&cond  tmp2, WK&reg2, ror #8  /* tmp2 = bytes 3 and 1 of pixel 2 */
+        uxtb16&cond  WK&reg2, WK&reg2, ror #16  /* pixel 2: byte 0 to high halfword, byte 2 to low halfword */
+        orr&cond     WK&reg1, WK&reg1, tmp1, lsl #8  /* the two pixels are interleaved, presumably to hide result latency -- TODO confirm */
+        orr&cond     WK&reg2, WK&reg2, tmp2, lsl #8  /* result bytes (3..0) = b3, b0, b1, b2 for each pixel */
+.endm
+
+.macro BGR888toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload  /* head stage: only forwards to pixld; unaligned_mask and preload are not used in this body */
+        pixld   cond, numbytes, firstreg, SRC, unaligned_src  /* pixld is defined outside this hunk; presumably loads numbytes from SRC into WK registers starting at firstreg -- TODO confirm */
+.endm
+
+.macro BGR888toRGB888_process_tail  cond, numbytes, firstreg  /* tail stage: picks the conversion macro from the assemble-time value of numbytes */
+ .if numbytes >= 8
+        BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M  /* MASK and STRIDE_M are passed as the temporaries, so both get overwritten */
+  .if numbytes == 16
+        BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M  /* third and fourth registers of a 16-byte group */
+  .endif
+ .else @ numbytes == 4
+        BGR888toRGB888_1pixel cond, %(firstreg+0), MASK  /* single register; only MASK is needed as a temporary */
+ .endif
+.endm
+
+generate_composite_function \
+    Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, /* symbol name, then presumably src, mask, dst bits per pixel -- TODO confirm against the macro definition */ \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    2, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    BGR888toRGB888_process_head, /* load stage defined above */ \
+    BGR888toRGB888_process_tail  /* byte-swap stage defined above */