Commit becc649ae2cf681bbbb28e977e64782b1f002eba

Ben Avison 2019-10-24T21:15:35

ARM: assembly optimization for SDL_FillRect

diff --git a/src/video/SDL_fillrect.c b/src/video/SDL_fillrect.c
index 899a1a5..4d9d3f0 100644
--- a/src/video/SDL_fillrect.c
+++ b/src/video/SDL_fillrect.c
@@ -22,6 +22,7 @@
 
 #include "SDL_video.h"
 #include "SDL_blit.h"
+#include "SDL_cpuinfo.h"
 
 
 #ifdef __SSE__
@@ -280,6 +281,28 @@ SDL_FillRects(SDL_Surface * dst, const SDL_Rect * rects, int count,
         return SDL_SetError("SDL_FillRects() passed NULL rects");
     }
 
+#if SDL_ARM_SIMD_BLITTERS
+    if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
+        void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+        void FillRect16ARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
+        void FillRect32ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
+        switch (dst->format->BytesPerPixel) {
+        case 1:
+            FillRect8ARMSIMDAsm(rect->w, rect->h, (uint8_t *) pixels, dst->pitch >> 0, color);
+            break;
+        case 2:
+            FillRect16ARMSIMDAsm(rect->w, rect->h, (uint16_t *) pixels, dst->pitch >> 1, color);
+            break;
+        case 4:
+            FillRect32ARMSIMDAsm(rect->w, rect->h, (uint32_t *) pixels, dst->pitch >> 2, color);
+            break;
+        }
+
+        SDL_UnlockSurface(dst);
+        return(0);
+    }
+#endif
+
     switch (dst->format->BytesPerPixel) {
     case 1:
         {
diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S
index 81e38c4..769c213 100644
--- a/src/video/arm/pixman-arm-simd-asm.S
+++ b/src/video/arm/pixman-arm-simd-asm.S
@@ -47,6 +47,74 @@
 
 /******************************************************************************/
 
+.macro FillRect32_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro FillRect16_init
+        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
+        orr     SRC, SRC, lsl #16
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro FillRect8_init
+        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
+        orr     SRC, SRC, lsl #8
+        orr     SRC, SRC, lsl #16
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro FillRect_process_tail  cond, numbytes, firstreg
+    WK4     .req    SRC
+    WK5     .req    STRIDE_S
+    WK6     .req    MASK
+    WK7     .req    STRIDE_M
+        pixst   cond, numbytes, 4, DST
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    FillRect32ARMSIMDAsm, 0, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    FillRect32_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    FillRect_process_tail
+
+generate_composite_function \
+    FillRect16ARMSIMDAsm, 0, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    FillRect16_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    FillRect_process_tail
+
+generate_composite_function \
+    FillRect8ARMSIMDAsm, 0, 0, 8, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    FillRect8_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    FillRect_process_tail
+
+/******************************************************************************/
+
 /* This differs from the over_8888_8888 routine in Pixman in that the destination
  * alpha component is always left unchanged, and RGB components are not
  * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that