ARM: NEON assembly optimization for function BlitARGBto565PixelAlpha
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index 53a71f3..e6b99a7 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -422,6 +422,21 @@ BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
#endif
#if SDL_ARM_NEON_BLITTERS
+void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); /* defined in src/video/arm/pixman-arm-neon-asm.S */
+/* SDL blit entry point: unpacks an SDL_BlitInfo into width, height, pixel pointers and row strides, then calls the NEON assembly routine that alpha-blends 32-bit source pixels onto a 16-bit destination. */
+static void
+BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info)
+{
+ int32_t width = info->dst_w; /* both surfaces are walked using the destination's dimensions */
+ int32_t height = info->dst_h;
+ uint16_t *dstp = (uint16_t *)info->dst;
+ int32_t dststride = width + (info->dst_skip >> 1); /* row stride in 16-bit pixels; assumes dst_skip is a byte count - TODO confirm */
+ uint32_t *srcp = (uint32_t *)info->src;
+ int32_t srcstride = width + (info->src_skip >> 2); /* row stride in 32-bit pixels; assumes src_skip is a byte count - TODO confirm */
+
+ BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
+}
+
void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
static void
@@ -1333,14 +1348,21 @@ SDL_CalculateBlitA(SDL_Surface * surface)
}
case 2:
-#if SDL_ARM_SIMD_BLITTERS
+#if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
&& sf->Gmask == 0xff00 && df->Gmask == 0x7e0
&& ((sf->Rmask == 0xff && df->Rmask == 0x1f)
- || (sf->Bmask == 0xff && df->Bmask == 0x1f))
- && SDL_HasARMSIMD())
+ || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
+ {
+#if SDL_ARM_NEON_BLITTERS
+ if (SDL_HasNEON())
+ return BlitARGBto565PixelAlphaARMNEON;
+#endif
+#if SDL_ARM_SIMD_BLITTERS
+ if (SDL_HasARMSIMD())
return BlitARGBto565PixelAlphaARMSIMD;
- else
+#endif
+ }
#endif
if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
&& sf->Gmask == 0xff00
diff --git a/src/video/arm/pixman-arm-neon-asm.S b/src/video/arm/pixman-arm-neon-asm.S
index 72fd3a2..1fcf3c1 100644
--- a/src/video/arm/pixman-arm-neon-asm.S
+++ b/src/video/arm/pixman-arm-neon-asm.S
@@ -157,3 +157,91 @@ generate_composite_function \
RGBtoRGBPixelAlpha_process_pixblock_head, \
RGBtoRGBPixelAlpha_process_pixblock_tail, \
RGBtoRGBPixelAlpha_process_pixblock_tail_head
+
+ /******************************************************************************/
+/* Head stage for one block of 8 pixels. Register use mirrors the loads in the tail_head macro: d0-d3 = source bytes 0..3 of each pixel (d3 = alpha, assuming little-endian with Amask 0xff000000), q2 = 8 destination 16-bit pixels. Leaves un-normalised blend sums in q13 (green), q14 (low field) and q15 (high field). */
+.macro ARGBto565PixelAlpha_process_pixblock_head
+ vmvn d6, d3 /* d6 = ~alpha, i.e. 255 - alpha */
+ vshr.u8 d1, #2 /* source byte 1 (green, per Gmask 0xff00) reduced to 6 bits */
+ vshr.u8 d3, #3 /* alpha reduced to 5 bits */
+ vshr.u8 d0, #3 /* source byte 0 reduced to 5 bits */
+ vshrn.u16 d7, q2, #3 /* d7 = destination bits 3..10; the 6-bit green field sits in the top 6 bits */
+ vshrn.u16 d25, q2, #8 /* d25 = destination bits 8..15; the high 5-bit field sits in the top 5 bits */
+ vbic.i16 q2, #0xe0 /* clear bits 5..7 so the low byte carries only the low 5-bit field */
+ vshr.u8 d6, #3 /* inverse alpha reduced to 5 bits; equals 31 - (alpha >> 3) */
+ vshr.u8 d7, #2 /* destination green as a 6-bit value */
+ vshr.u8 d2, #3 /* source byte 2 reduced to 5 bits */
+ vmovn.u16 d24, q2 /* d24 = destination low 5-bit field */
+ vshr.u8 d25, #3 /* destination high 5-bit field */
+ vmull.u8 q13, d1, d3 /* q13 = src green * alpha5 ... */
+ vmlal.u8 q13, d7, d6 /* ... + dst green * inverse alpha5 */
+ vmull.u8 q14, d0, d3 /* q14 = src byte 0 * alpha5 ... */
+ vmlal.u8 q14, d24, d6 /* ... + dst low field * inverse alpha5 */
+ vmull.u8 q15, d2, d3 /* q15 = src byte 2 * alpha5 ... */
+ vmlal.u8 q15, d25, d6 /* ... + dst high field * inverse alpha5 */
+.endm
+/* Tail stage: scales the 16-bit blend sums in q13-q15 back to channel range (the two 5-bit weights add up to 31) and packs them into 8 destination-format 5:6:5 pixels in q14. */
+.macro ARGBto565PixelAlpha_process_pixblock_tail
+ vsra.u16 q13, #5 /* x += x >> 5; combined with the rounding shift below this approximates x / 31 */
+ vsra.u16 q14, #5
+ vsra.u16 q15, #5
+ vrshr.u16 q13, #5 /* rounding shift right by 5 */
+ vrshr.u16 q14, #5
+ vrshr.u16 q15, #5
+ vsli.u16 q14, q13, #5 /* insert green at bit 5, keeping the low 5 bits already in q14 */
+ vsli.u16 q14, q15, #11 /* insert the high field at bit 11, keeping the low 11 bits */
+.endm
+/* Combined stage: the tail instruction sequence for the current block, interleaved with the loads for the next block and PF-prefixed prefetch bookkeeping, followed by the head instruction sequence for the next block. Statement order matters: q14 is stored before the head part overwrites it. */
+.macro ARGBto565PixelAlpha_process_pixblock_tail_head
+ vld4.8 {d0-d3}, [SRC]! /* load the next 8 source pixels, de-interleaved by byte into d0..d3 */
+ PF add PF_X, PF_X, #8 /* PF lines advance and wrap the prefetch position; presumably emitted only when prefetch is enabled - confirm against the PF macro */
+ vsra.u16 q13, #5 /* tail part begins: normalise the sums of the current block */
+ PF tst PF_CTL, #0xF
+ vsra.u16 q14, #5
+ PF addne PF_X, PF_X, #8
+ vsra.u16 q15, #5
+ PF subne PF_CTL, PF_CTL, #1
+ vrshr.u16 q13, #5
+ PF cmp PF_X, ORIG_W
+ vrshr.u16 q14, #5
+ PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift] /* prefetch source data ahead of the read pointer */
+ vrshr.u16 q15, #5
+ PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift] /* prefetch destination data ahead of the read pointer */
+ vld1.8 {d4-d5}, [DST_R]! /* load the next 8 destination pixels into q2 */
+ PF subge PF_X, PF_X, ORIG_W
+ vsli.u16 q14, q13, #5
+ PF subges PF_CTL, PF_CTL, #0x10
+ vsli.u16 q14, q15, #11
+ PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! /* on wrap, step the source prefetch pointer by one row stride */
+ vst1.8 {q14}, [DST_W :128]! /* store the finished block (128-bit alignment qualifier) before q14 is reused */
+ vmvn d6, d3 /* head part begins: same instruction sequence as the head macro, applied to the block just loaded */
+ vshr.u8 d1, #2
+ vshr.u8 d3, #3
+ vshr.u8 d0, #3
+ vshrn.u16 d7, q2, #3
+ vshrn.u16 d25, q2, #8
+ vbic.i16 q2, #0xe0
+ PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]! /* on wrap, step the destination prefetch pointer by one row stride */
+ vshr.u8 d6, #3
+ vshr.u8 d7, #2
+ vshr.u8 d2, #3
+ vmovn.u16 d24, q2
+ vshr.u8 d25, #3
+ vmull.u8 q13, d1, d3
+ vmlal.u8 q13, d7, d6
+ vmull.u8 q14, d0, d3
+ vmlal.u8 q14, d24, d6
+ vmull.u8 q15, d2, d3
+ vmlal.u8 q15, d25, d6
+.endm
+/* Instantiates the BlitARGBto565PixelAlphaARMNEONAsm routine from the three ARGBto565PixelAlpha pixblock macros via generate_composite_function. */
+generate_composite_function \
+ BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, /* presumably source, mask and destination bits per pixel (0 = no mask) - TODO confirm against the macro definition */ \
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 6, /* prefetch distance */ \
+ default_init, \
+ default_cleanup, \
+ ARGBto565PixelAlpha_process_pixblock_head, \
+ ARGBto565PixelAlpha_process_pixblock_tail, \
+ ARGBto565PixelAlpha_process_pixblock_tail_head