ARM: SIMD assembly optimization for BGR-to-RGB 32bpp normal blits
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c
index c3de8ea..6ad79c8 100644
--- a/src/video/SDL_blit_N.c
+++ b/src/video/SDL_blit_N.c
@@ -41,7 +41,8 @@
enum blit_features {
BLIT_FEATURE_HAS_MMX = 1,
BLIT_FEATURE_HAS_ALTIVEC = 2,
- BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH = 4
+ BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH = 4,
+ BLIT_FEATURE_HAS_ARM_SIMD = 8
};
#if SDL_ALTIVEC_BLITTERS
@@ -931,7 +932,24 @@ GetBlitFeatures(void)
#endif
#else
/* Feature 1 is has-MMX */
-#define GetBlitFeatures() (SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0)
+#define GetBlitFeatures() ((SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0) | (SDL_HasARMSIMD() ? BLIT_FEATURE_HAS_ARM_SIMD : 0))
+#endif
+
+#if SDL_ARM_SIMD_BLITTERS
+void Blit_BGR888_RGB888ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
+
+static void
+Blit_BGR888_RGB888ARMSIMD(SDL_BlitInfo * info) /* C shim: unpacks the blit info and hands it to the assembly routine */
+{
+ int32_t width = info->dst_w; /* forwarded as the w argument */
+ int32_t height = info->dst_h; /* forwarded as the h argument */
+ uint32_t *dstp = (uint32_t *)info->dst;
+ int32_t dststride = width + (info->dst_skip >> 2); /* stride in uint32_t units; assumes dst_skip is a byte count - TODO confirm */
+ uint32_t *srcp = (uint32_t *)info->src;
+ int32_t srcstride = width + (info->src_skip >> 2); /* same conversion for the source; assumes src_skip is a byte count - TODO confirm */
+
+ Blit_BGR888_RGB888ARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
+}
#endif
/* This is now endian dependent */
@@ -3270,6 +3288,10 @@ static const struct blit_table normal_blit_4[] = {
{0x00000000, 0x00000000, 0x00000000, 2, 0x0000F800, 0x000007E0, 0x0000001F,
BLIT_FEATURE_HAS_ALTIVEC, Blit_RGB888_RGB565Altivec, NO_ALPHA},
#endif
+#if SDL_ARM_SIMD_BLITTERS
+ {0x000000FF, 0x0000FF00, 0x00FF0000, 4, 0x00FF0000, 0x0000FF00, 0x000000FF,
+ BLIT_FEATURE_HAS_ARM_SIMD, Blit_BGR888_RGB888ARMSIMD, NO_ALPHA | COPY_ALPHA },
+#endif
/* 4->3 with same rgb triplet */
{0x000000FF, 0x0000FF00, 0x00FF0000, 3, 0x000000FF, 0x0000FF00, 0x00FF0000,
0, Blit_3or4_to_3or4__same_rgb, NO_ALPHA | SET_ALPHA},
diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S
index 6dcbbe5..81e38c4 100644
--- a/src/video/arm/pixman-arm-simd-asm.S
+++ b/src/video/arm/pixman-arm-simd-asm.S
@@ -363,3 +363,45 @@ generate_composite_function \
nop_macro, /* cleanup */ \
ARGBto565PixelAlpha_process_head, \
ARGBto565PixelAlpha_process_tail
+
+ /******************************************************************************/
+
+.macro BGR888toRGB888_1pixel cond, reg, tmp /* exchange bytes 0 and 2 of one 32-bit pixel, bytes 1 and 3 stay put */
+ uxtb16&cond tmp, WK&reg, ror #8 /* tmp = original bytes 1 and 3, one per halfword */
+ uxtb16&cond WK&reg, WK&reg, ror #16 /* original byte 2 lands in byte 0, original byte 0 in byte 2 */
+ orr&cond WK&reg, WK&reg, tmp, lsl #8 /* merge bytes 1 and 3 back at their original positions */
+.endm
+
+.macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2 /* same byte 0/2 exchange as the 1-pixel macro, on two registers, interleaved */
+ uxtb16&cond tmp1, WK&reg1, ror #8 /* tmp1 = bytes 1 and 3 of the first pixel */
+ uxtb16&cond WK&reg1, WK&reg1, ror #16 /* first pixel: bytes 0 and 2 exchanged */
+ uxtb16&cond tmp2, WK&reg2, ror #8 /* tmp2 = bytes 1 and 3 of the second pixel */
+ uxtb16&cond WK&reg2, WK&reg2, ror #16 /* second pixel: bytes 0 and 2 exchanged */
+ orr&cond WK&reg1, WK&reg1, tmp1, lsl #8 /* restore bytes 1 and 3 of the first pixel */
+ orr&cond WK&reg2, WK&reg2, tmp2, lsl #8 /* restore bytes 1 and 3 of the second pixel */
+.endm
+
+.macro BGR888toRGB888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* head stage: fetch only; unaligned_mask and preload are not used in this body */
+ pixld cond, numbytes, firstreg, SRC, unaligned_src /* presumably loads numbytes from SRC into WK registers starting at firstreg - pixld is defined elsewhere, confirm */
+.endm
+
+.macro BGR888toRGB888_process_tail cond, numbytes, firstreg /* tail stage: converts the pixels in the WK registers numbered from firstreg */
+ .if numbytes >= 8
+ BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M /* MASK and STRIDE_M are passed as the two temporaries */
+ .if numbytes == 16
+ BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M /* second register pair, only for 16-byte chunks */
+ .endif
+ .else @ numbytes == 4
+ BGR888toRGB888_1pixel cond, %(firstreg+0), MASK /* single pixel, one temporary is enough */
+ .endif
+.endm
+
+generate_composite_function \
+ Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, /* symbol name; 32, 0, 32 are presumably src, mask and dst bits per pixel - confirm against the macro definition */ \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+ 2, /* prefetch distance */ \
+ nop_macro, /* init */ \
+ nop_macro, /* newline */ \
+ nop_macro, /* cleanup */ \
+ BGR888toRGB888_process_head, /* fetch stage defined in this file */ \
+ BGR888toRGB888_process_tail