Commit 72f8044a42aed3f1c9fbcb58c47fa3d4417a9ba7

Ben Avison 2019-10-24T21:17:52

ARM: NEON assembly optimization for SDL_FillRect
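
For illustration only (not part of the patch): the new entry points take the width and height in pixels, a pointer to the top-left destination pixel, the stride in pixels (hence the dst->pitch >> log2(BytesPerPixel) shifts in the C hunk below), and the fill value, which the assembly replicates across NEON quad registers. A minimal sketch of an equivalent caller is shown here; FillRectNEON is a hypothetical helper name and the error string is illustrative, while the entry-point names, argument order, and pitch shifts are taken from the patch.

    #include <stdint.h>
    #include "SDL.h"

    /* Prototypes as declared in the patch below. */
    void FillRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
    void FillRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
    void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);

    /* Hypothetical helper, shown only to illustrate the calling convention;
     * SDL performs the equivalent dispatch inline in SDL_FillRects(). */
    static int FillRectNEON(SDL_Surface *dst, const SDL_Rect *rect, Uint32 color)
    {
        /* Address of the top-left pixel of the (already clipped) rectangle. */
        Uint8 *pixels = (Uint8 *) dst->pixels + rect->y * dst->pitch +
                        rect->x * dst->format->BytesPerPixel;

        switch (dst->format->BytesPerPixel) {
        case 1: /* stride in pixels == pitch in bytes at 8 bpp */
            FillRect8ARMNEONAsm(rect->w, rect->h, (uint8_t *) pixels, dst->pitch >> 0, (uint8_t) color);
            return 0;
        case 2:
            FillRect16ARMNEONAsm(rect->w, rect->h, (uint16_t *) pixels, dst->pitch >> 1, (uint16_t) color);
            return 0;
        case 4:
            FillRect32ARMNEONAsm(rect->w, rect->h, (uint32_t *) pixels, dst->pitch >> 2, (uint32_t) color);
            return 0;
        default: /* 24 bpp is deliberately not handled by the NEON path */
            return SDL_SetError("FillRectNEON: unsupported BytesPerPixel");
        }
    }

The assembly handles unaligned leading/trailing pixels and short rows itself, so a caller only needs to supply a clipped rectangle within a locked surface.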

diff --git a/src/video/SDL_fillrect.c b/src/video/SDL_fillrect.c
index 4d9d3f0..5bd43e4 100644
--- a/src/video/SDL_fillrect.c
+++ b/src/video/SDL_fillrect.c
@@ -281,6 +281,27 @@ SDL_FillRects(SDL_Surface * dst, const SDL_Rect * rects, int count,
         return SDL_SetError("SDL_FillRects() passed NULL rects");
     }
 
+#if SDL_ARM_NEON_BLITTERS
+    if (SDL_HasNEON() && dst->format->BytesPerPixel != 3) {
+        void FillRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+        void FillRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
+        void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
+        switch (dst->format->BytesPerPixel) {
+        case 1:
+            FillRect8ARMNEONAsm(rect->w, rect->h, (uint8_t *) pixels, dst->pitch >> 0, color);
+            break;
+        case 2:
+            FillRect16ARMNEONAsm(rect->w, rect->h, (uint16_t *) pixels, dst->pitch >> 1, color);
+            break;
+        case 4:
+            FillRect32ARMNEONAsm(rect->w, rect->h, (uint32_t *) pixels, dst->pitch >> 2, color);
+            break;
+        }
+
+        SDL_UnlockSurface(dst);
+        return(0);
+    }
+#endif
 #if SDL_ARM_SIMD_BLITTERS
     if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
         void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
diff --git a/src/video/arm/pixman-arm-neon-asm.S b/src/video/arm/pixman-arm-neon-asm.S
index 1fcf3c1..ab9bcce 100644
--- a/src/video/arm/pixman-arm-neon-asm.S
+++ b/src/video/arm/pixman-arm-neon-asm.S
@@ -95,6 +95,134 @@
 
 /******************************************************************************/
 
+/* We can actually do significantly better than the Pixman macros, at least for
+ * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
+ * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
+ */
+
+.macro generate_fillrect_function name, bpp, log2Bpp
+/*
+ * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+ * On entry:
+ * a1 = width, pixels
+ * a2 = height, rows
+ * a3 = pointer to top-left destination pixel
+ * a4 = stride, pixels
+ * [sp] = pixel value to fill with
+ * Within the function:
+ * v1 = width remaining
+ * v2 = vst offset
+ * v3 = alternate pointer
+ * ip = data ARM register
+ */
+pixman_asm_function name
+    vld1.\bpp   {d0[],d1[]}, [sp]
+    sub         a4, a1
+    vld1.\bpp   {d2[],d3[]}, [sp]
+    cmp         a1, #(15+64) >> \log2Bpp
+    push        {v1-v3,lr}
+    vmov        ip, s0
+    blo         51f
+
+    /* Long-row case */
+    mov         v2, #64
+1:  mov         v1, a1
+    ands        v3, a3, #15
+    beq         2f
+    /* Leading pixels */
+    rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
+    sub         v1, v1, v3, lsr #\log2Bpp
+    rbit        v3, v3
+.if bpp <= 16
+.if bpp == 8
+    tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
+    strneb      ip, [a3], #1
+    tst         v3, #1<<30
+.else
+    tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
+.endif
+    strneh      ip, [a3], #2
+.endif
+    movs        v3, v3, lsl #3
+    vstmcs      a3!, {s0}
+    vstmmi      a3!, {d0}
+2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
+    add         v3, a3, #32
+    /* Inner loop */
+3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
+    subs        v1, v1, #64 >> \log2Bpp
+    vst1.\bpp   {q0-q1}, [v3 :128], v2
+    bhs         3b
+    /* Trailing pixels */
+4:  movs        v1, v1, lsl #27 + \log2Bpp
+    bcc         5f
+    vst1.\bpp   {q0-q1}, [a3 :128]!
+5:  bpl         6f
+    vst1.\bpp   {q0}, [a3 :128]!
+6:  movs        v1, v1, lsl #2
+    vstmcs      a3!, {d0}
+    vstmmi      a3!, {s0}
+.if bpp <= 16
+    movs        v1, v1, lsl #2
+    strcsh      ip, [a3], #2
+.if bpp == 8
+    strmib      ip, [a3], #1
+.endif
+.endif
+    subs        a2, a2, #1
+    add         a3, a3, a4, lsl #\log2Bpp
+    bhi         1b
+    pop         {v1-v3,pc}
+
+    /* Short-row case */
+51: movs        v1, a1
+.if bpp == 8
+    tst         a3, #3
+    beq         53f
+52: subs        v1, v1, #1
+    blo         57f
+    strb        ip, [a3], #1
+    tst         a3, #3
+    bne         52b
+.elseif bpp == 16
+    tstne       a3, #2
+    subne       v1, v1, #1
+    strneh      ip, [a3], #2
+.endif
+53: cmp         v1, #32 >> \log2Bpp
+    bcc         54f
+    vst1.\bpp   {q0-q1}, [a3]!
+    sub         v1, v1, #32 >> \log2Bpp
+    /* Trailing pixels */
+54: movs        v1, v1, lsl #27 + \log2Bpp
+    bcc         55f
+    vst1.\bpp   {q0-q1}, [a3]!
+55: bpl         56f
+    vst1.\bpp   {q0}, [a3]!
+56: movs        v1, v1, lsl #2
+    vstmcs      a3!, {d0}
+    vstmmi      a3!, {s0}
+.if bpp <= 16
+    movs        v1, v1, lsl #2
+    strcsh      ip, [a3], #2
+.if bpp == 8
+    strmib      ip, [a3], #1
+.endif
+.endif
+    subs        a2, a2, #1
+    add         a3, a3, a4, lsl #\log2Bpp
+    bhi         51b
+57: pop         {v1-v3,pc}
+
+.endfunc
+.endm
+
+generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
+generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
+generate_fillrect_function FillRect8ARMNEONAsm,  8,  0
+
+/******************************************************************************/
+
 .macro RGBtoRGBPixelAlpha_process_pixblock_head
     vmvn        d30, d3  /* get inverted source alpha */
     vmov        d31, d7  /* dest alpha is always unchanged */