ARM: NEON assembly optimization for SDL_FillRect
diff --git a/src/video/SDL_fillrect.c b/src/video/SDL_fillrect.c
index 4d9d3f0..5bd43e4 100644
--- a/src/video/SDL_fillrect.c
+++ b/src/video/SDL_fillrect.c
@@ -281,6 +281,27 @@ SDL_FillRects(SDL_Surface * dst, const SDL_Rect * rects, int count,
return SDL_SetError("SDL_FillRects() passed NULL rects");
}
+#if SDL_ARM_NEON_BLITTERS
+ if (SDL_HasNEON() && dst->format->BytesPerPixel != 3) {
+ void FillRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+ void FillRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
+ void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
+ switch (dst->format->BytesPerPixel) {
+ case 1:
+ FillRect8ARMNEONAsm(rect->w, rect->h, (uint8_t *) pixels, dst->pitch >> 0, color);
+ break;
+ case 2:
+ FillRect16ARMNEONAsm(rect->w, rect->h, (uint16_t *) pixels, dst->pitch >> 1, color);
+ break;
+ case 4:
+ FillRect32ARMNEONAsm(rect->w, rect->h, (uint32_t *) pixels, dst->pitch >> 2, color);
+ break;
+ }
+
+ SDL_UnlockSurface(dst);
+ return(0);
+ }
+#endif
#if SDL_ARM_SIMD_BLITTERS
if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
diff --git a/src/video/arm/pixman-arm-neon-asm.S b/src/video/arm/pixman-arm-neon-asm.S
index 1fcf3c1..ab9bcce 100644
--- a/src/video/arm/pixman-arm-neon-asm.S
+++ b/src/video/arm/pixman-arm-neon-asm.S
@@ -95,6 +95,134 @@
/******************************************************************************/
+/* We can actually do significantly better than the Pixman macros, at least for
+ * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
+ * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
+ */
+
+.macro generate_fillrect_function name, bpp, log2Bpp
+/*
+ * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+ * On entry:
+ * a1 = width, pixels
+ * a2 = height, rows
+ * a3 = pointer to top-left destination pixel
+ * a4 = stride, pixels
+ * [sp] = pixel value to fill with
+ * Within the function:
+ * v1 = width remaining
+ * v2 = vst offset
+ * v3 = alternate pointer
+ * ip = data ARM register
+ */
+pixman_asm_function \name
+ vld1.\bpp {d0[],d1[]}, [sp]
+ sub a4, a1
+ vld1.\bpp {d2[],d3[]}, [sp]
+ cmp a1, #(15+64) >> \log2Bpp
+ push {v1-v3,lr}
+ vmov ip, s0
+ blo 51f
+
+ /* Long-row case */
+ mov v2, #64
+1: mov v1, a1
+ ands v3, a3, #15
+ beq 2f
+ /* Leading pixels */
+ rsb v3, v3, #16 /* number of leading bytes until 16-byte aligned */
+ sub v1, v1, v3, lsr #\log2Bpp
+ rbit v3, v3
+.if \bpp <= 16
+.if \bpp == 8
+ tst a3, #1 /* bit 0 unaffected by rsb so can avoid register interlock */
+ strneb ip, [a3], #1
+ tst v3, #1<<30
+.else
+ tst a3, #2 /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
+.endif
+ strneh ip, [a3], #2
+.endif
+ movs v3, v3, lsl #3
+ vstmcs a3!, {s0}
+ vstmmi a3!, {d0}
+2: sub v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
+ add v3, a3, #32
+ /* Inner loop */
+3: vst1.\bpp {q0-q1}, [a3 :128], v2
+ subs v1, v1, #64 >> \log2Bpp
+ vst1.\bpp {q0-q1}, [v3 :128], v2
+ bhs 3b
+ /* Trailing pixels */
+4: movs v1, v1, lsl #27 + \log2Bpp
+ bcc 5f
+ vst1.\bpp {q0-q1}, [a3 :128]!
+5: bpl 6f
+ vst1.\bpp {q0}, [a3 :128]!
+6: movs v1, v1, lsl #2
+ vstmcs a3!, {d0}
+ vstmmi a3!, {s0}
+.if \bpp <= 16
+ movs v1, v1, lsl #2
+ strcsh ip, [a3], #2
+.if \bpp == 8
+ strmib ip, [a3], #1
+.endif
+.endif
+ subs a2, a2, #1
+ add a3, a3, a4, lsl #\log2Bpp
+ bhi 1b
+ pop {v1-v3,pc}
+
+ /* Short-row case */
+51: movs v1, a1
+.if \bpp == 8
+ tst a3, #3
+ beq 53f
+52: subs v1, v1, #1
+ blo 57f
+ strb ip, [a3], #1
+ tst a3, #3
+ bne 52b
+.elseif \bpp == 16
+ tstne a3, #2
+ subne v1, v1, #1
+ strneh ip, [a3], #2
+.endif
+53: cmp v1, #32 >> \log2Bpp
+ bcc 54f
+ vst1.\bpp {q0-q1}, [a3]!
+ sub v1, v1, #32 >> \log2Bpp
+ /* Trailing pixels */
+54: movs v1, v1, lsl #27 + \log2Bpp
+ bcc 55f
+ vst1.\bpp {q0-q1}, [a3]!
+55: bpl 56f
+ vst1.\bpp {q0}, [a3]!
+56: movs v1, v1, lsl #2
+ vstmcs a3!, {d0}
+ vstmmi a3!, {s0}
+.if \bpp <= 16
+ movs v1, v1, lsl #2
+ strcsh ip, [a3], #2
+.if \bpp == 8
+ strmib ip, [a3], #1
+.endif
+.endif
+ subs a2, a2, #1
+ add a3, a3, a4, lsl #\log2Bpp
+ bhi 51b
+57: pop {v1-v3,pc}
+
+.endfunc
+.endm
+
+generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
+generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
+generate_fillrect_function FillRect8ARMNEONAsm, 8, 0
+
+/******************************************************************************/
+
.macro RGBtoRGBPixelAlpha_process_pixblock_head
vmvn d30, d3 /* get inverted source alpha */
vmov d31, d7 /* dest alpha is always unchanged */