ARM: assembly optimization for SDL_FillRect
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
diff --git a/src/video/SDL_fillrect.c b/src/video/SDL_fillrect.c
index 899a1a5..4d9d3f0 100644
--- a/src/video/SDL_fillrect.c
+++ b/src/video/SDL_fillrect.c
@@ -22,6 +22,7 @@
#include "SDL_video.h"
#include "SDL_blit.h"
+#include "SDL_cpuinfo.h"
#ifdef __SSE__
@@ -280,6 +281,28 @@ SDL_FillRects(SDL_Surface * dst, const SDL_Rect * rects, int count,
return SDL_SetError("SDL_FillRects() passed NULL rects");
}
+#if SDL_ARM_SIMD_BLITTERS
+ if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
+ void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
+ void FillRect16ARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
+ void FillRect32ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
+ switch (dst->format->BytesPerPixel) {
+ case 1:
+ FillRect8ARMSIMDAsm(rect->w, rect->h, (uint8_t *) pixels, dst->pitch >> 0, color);
+ break;
+ case 2:
+ FillRect16ARMSIMDAsm(rect->w, rect->h, (uint16_t *) pixels, dst->pitch >> 1, color);
+ break;
+ case 4:
+ FillRect32ARMSIMDAsm(rect->w, rect->h, (uint32_t *) pixels, dst->pitch >> 2, color);
+ break;
+ }
+
+ SDL_UnlockSurface(dst);
+ return(0);
+ }
+#endif
+
switch (dst->format->BytesPerPixel) {
case 1:
{
diff --git a/src/video/arm/pixman-arm-simd-asm.S b/src/video/arm/pixman-arm-simd-asm.S
index 81e38c4..769c213 100644
--- a/src/video/arm/pixman-arm-simd-asm.S
+++ b/src/video/arm/pixman-arm-simd-asm.S
@@ -47,6 +47,74 @@
/******************************************************************************/
+.macro FillRect32_init
+ ldr SRC, [sp, #ARGS_STACK_OFFSET]
+ mov STRIDE_S, SRC
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro FillRect16_init
+ ldrh SRC, [sp, #ARGS_STACK_OFFSET]
+ orr SRC, SRC, lsl #16
+ mov STRIDE_S, SRC
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro FillRect8_init
+ ldrb SRC, [sp, #ARGS_STACK_OFFSET]
+ orr SRC, SRC, lsl #8
+ orr SRC, SRC, lsl #16
+ mov STRIDE_S, SRC
+ mov MASK, SRC
+ mov STRIDE_M, SRC
+.endm
+
+.macro FillRect_process_tail cond, numbytes, firstreg
+ WK4 .req SRC
+ WK5 .req STRIDE_S
+ WK6 .req MASK
+ WK7 .req STRIDE_M
+ pixst cond, numbytes, 4, DST
+ .unreq WK4
+ .unreq WK5
+ .unreq WK6
+ .unreq WK7
+.endm
+
+generate_composite_function \
+ FillRect32ARMSIMDAsm, 0, 0, 32, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+ 0, /* prefetch distance doesn't apply */ \
+ FillRect32_init \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ FillRect_process_tail
+
+generate_composite_function \
+ FillRect16ARMSIMDAsm, 0, 0, 16, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+ 0, /* prefetch distance doesn't apply */ \
+ FillRect16_init \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ FillRect_process_tail
+
+generate_composite_function \
+ FillRect8ARMSIMDAsm, 0, 0, 8, \
+ FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+ 0, /* prefetch distance doesn't apply */ \
+ FillRect8_init \
+ nop_macro, /* newline */ \
+ nop_macro /* cleanup */ \
+ nop_macro /* process head */ \
+ FillRect_process_tail
+
+/******************************************************************************/
+
/* This differs from the over_8888_8888 routine in Pixman in that the destination
* alpha component is always left unchanged, and RGB components are not
* premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that