pixman-arm-simd-asm.S

Branch :
Show log
Commit
Author : Ben Avison
Date : 2019-10-24 21:15:50
Hash : 74846657
Message : ARM: SIMD optimization for 4:4:4:4 to 8:8:8:8 normal blits
src/video/arm/pixman-arm-simd-asm.S
/*
 * Copyright (c) 2016 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

	.text
	.arch armv6
	.object_arch armv4
	.arm
	.altmacro
	.p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 *   cond           ARM condition code for code block
 *   numbytes       Number of output bytes that should be generated this time
 *   firstreg       First WK register in which to place output
 *   unaligned_src  Whether to use non-wordaligned loads of source image
 *   unaligned_mask Whether to use non-wordaligned loads of mask image
 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 */

/******************************************************************************/

.macro FillRect32_init
        /* Read one 32-bit word from the stacked-argument slot and keep a copy
         * in each of the four registers that FillRect_process_tail aliases as
         * WK4-WK7 and stores from. */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_M, SRC
        mov     MASK, SRC
        mov     STRIDE_S, SRC
.endm

.macro FillRect16_init
        /* Read one halfword from the stacked-argument slot, duplicate it into
         * both halves of SRC, then copy the word into the other three
         * registers that FillRect_process_tail aliases as WK5-WK7. */
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, SRC, lsl #16
        mov     STRIDE_M, SRC
        mov     MASK, SRC
        mov     STRIDE_S, SRC
.endm

.macro FillRect8_init
        /* Read one byte from the stacked-argument slot and replicate it into
         * all four byte lanes of SRC (first to 16 bits, then to 32 bits). */
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, SRC, lsl #8
        orr     SRC, SRC, SRC, lsl #16
        /* Copy the word into the other three registers that
         * FillRect_process_tail aliases as WK5-WK7. */
        mov     STRIDE_M, SRC
        mov     MASK, SRC
        mov     STRIDE_S, SRC
.endm

.macro FillRect_process_tail  cond, numbytes, firstreg
    /* Temporarily name the four registers filled by the FillRect*_init macros
     * as WK4-WK7 so that pixst can store from them. The firstreg argument is
     * not used: the store always starts at WK4. */
    WK7     .req    STRIDE_M
    WK6     .req    MASK
    WK5     .req    STRIDE_S
    WK4     .req    SRC
        pixst   cond, numbytes, 4, DST
    /* Drop the aliases again so they do not leak out of this macro */
    .unreq  WK7
    .unreq  WK6
    .unreq  WK5
    .unreq  WK4
.endm

/* Solid fill using FillRect32_init (one 32-bit value replicated into four
 * registers) and FillRect_process_tail (which performs the store).
 * Arguments follow the same order as the other generate_composite_function
 * invocations in this file: name, three bit-depth values (presumably
 * source, mask, destination - confirm against pixman-arm-simd-asm.h), flags,
 * prefetch distance, then the init / newline / cleanup / process-head /
 * process-tail macros. Every argument is explicitly comma-separated rather
 * than relying on whitespace to split macro arguments.
 */
generate_composite_function \
    FillRect32ARMSIMDAsm, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    FillRect32_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    FillRect_process_tail

/* Solid fill using FillRect16_init (one 16-bit value replicated across four
 * registers) and FillRect_process_tail (which performs the store).
 * Arguments follow the same order as the other generate_composite_function
 * invocations in this file: name, three bit-depth values (presumably
 * source, mask, destination - confirm against pixman-arm-simd-asm.h), flags,
 * prefetch distance, then the init / newline / cleanup / process-head /
 * process-tail macros. Every argument is explicitly comma-separated rather
 * than relying on whitespace to split macro arguments.
 */
generate_composite_function \
    FillRect16ARMSIMDAsm, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    FillRect16_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    FillRect_process_tail

/* Solid fill using FillRect8_init (one 8-bit value replicated across four
 * registers) and FillRect_process_tail (which performs the store).
 * Arguments follow the same order as the other generate_composite_function
 * invocations in this file: name, three bit-depth values (presumably
 * source, mask, destination - confirm against pixman-arm-simd-asm.h), flags,
 * prefetch distance, then the init / newline / cleanup / process-head /
 * process-tail macros. Every argument is explicitly comma-separated rather
 * than relying on whitespace to split macro arguments.
 */
generate_composite_function \
    FillRect8ARMSIMDAsm, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    FillRect8_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    FillRect_process_tail

/******************************************************************************/

/* This differs from the over_8888_8888 routine in Pixman in that the destination
 * alpha component is always left unchanged, and RGB components are not
 * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that
 * renormalisation is done by multiplying by 257/256 (with rounding) rather than
 * simply shifting right by 8 bits - removing the need to special-case alpha=0xff.
 */

.macro RGBtoRGBPixelAlpha_init
        /* STRIDE_S and ORIG_W are used as scratch registers by the process
         * macros, so they are declared via line_saved_regs.
         * NOTE(review): presumably this makes the framework spill and restore
         * them around each line - confirm in pixman-arm-simd-asm.h. */
        line_saved_regs STRIDE_S, ORIG_W
        /* MASK = 0x80: the rounding constant passed as the half argument of
         * RGBtoRGBPixelAlpha_1pixel_translucent */
        mov     MASK, #0x80
.endm

/* Blend one source pixel into one destination pixel, leaving the top byte
 * (alpha) of the destination unchanged.
 *   s          source pixel; overwritten with its top byte (s >> 24)
 *   d          destination pixel, updated in place
 *   tmp0-tmp3  scratch registers, corrupted
 *   half       rounding constant added by each multiply-accumulate (callers
 *              in this file pass MASK, set to 0x80 by the init macro)
 * For each of the three low bytes: delta = (s_byte - d_byte) * alpha + half,
 * then delta += delta >> 8, and bits 15:8 of delta are added to the
 * destination byte with a per-byte modular add.
 */
.macro RGBtoRGBPixelAlpha_1pixel_translucent  s, d, tmp0, tmp1, tmp2, tmp3, half
        /* tmp0 = byte 0 of s minus byte 0 of d */
        uxtb    tmp3, s
        uxtb    tmp0, d
        sub     tmp0, tmp3, tmp0
        /* tmp1 = byte 2 of s minus byte 2 of d */
        uxtb    tmp3, s, ror #16
        uxtb    tmp1, d, ror #16
        sub     tmp1, tmp3, tmp1
        /* tmp2 = byte 1 of s minus byte 1 of d; s is reduced to its alpha
         * byte in between, once byte 1 has been extracted from it */
        uxtb    tmp3, s, ror #8
        mov     s, s, lsr #24
        uxtb    tmp2, d, ror #8
        sub     tmp2, tmp3, tmp2
        /* Signed 16x16 multiply-accumulate: delta * alpha + half */
        smlabb  tmp0, tmp0, s, half
        smlabb  tmp1, tmp1, s, half
        smlabb  tmp2, tmp2, s, half
        /* x += x >> 8, i.e. the 257/256 renormalisation */
        add     tmp0, tmp0, asr #8
        add     tmp1, tmp1, asr #8
        add     tmp2, tmp2, asr #8
        /* Pack: tmp0 low halfword stays, tmp1 low halfword goes on top */
        pkhbt   tmp0, tmp0, tmp1, lsl #16
        /* Byte 1 of tmp2 is already in the position of destination byte 1 */
        and     tmp2, tmp2, #0xff00
        /* Move bits 15:8 of each halfword down into bytes 0 and 2 */
        uxtb16  tmp0, tmp0, ror #8
        orr     tmp0, tmp0, tmp2
        /* Byte 3 of tmp0 is zero, so the destination top byte is unchanged */
        uadd8   d, d, tmp0
.endm

.macro RGBtoRGBPixelAlpha_1pixel_opaque  s, d
        /* d = (d & 0xff000000) | (s & 0x00ffffff): take the three low bytes
         * from the source and keep the top byte of the destination.
         * s is left with its top byte cleared. */
        bic     s, s, #0xff000000
        and     d, d, #0xff000000
        orr     d, d, s
.endm

/* Load the pixels for one block and leave the condition flags set for the
 * "all transparent" test at the start of RGBtoRGBPixelAlpha_process_tail.
 * SRC and DST are post-incremented past the whole block.
 *   16 bytes: four source words go to WK0, WK1, STRIDE_S, STRIDE_M, but only
 *             the first two destination words are loaded (WK2, WK3)
 *    8 bytes: two source words in WK0, WK1 and two destination words in
 *             WK2, WK3
 *    4 bytes: one source word in WK0, one destination word in WK2
 * For the multi-pixel cases SCRATCH receives the OR and ORIG_W the AND of all
 * source words. The cond, firstreg, unaligned_src, unaligned_mask and preload
 * arguments are not used.
 */
.macro RGBtoRGBPixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        ldm     SRC!, {WK0, WK1}
        ldm     SRC!, {STRIDE_S, STRIDE_M}
        ldrd    WK2, WK3, [DST], #16
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        orr     SCRATCH, SCRATCH, STRIDE_S
        and     ORIG_W, ORIG_W, STRIDE_S
        orr     SCRATCH, SCRATCH, STRIDE_M
        and     ORIG_W, ORIG_W, STRIDE_M
        /* Z is set when the top byte of every source word is zero */
        tst     SCRATCH, #0xff000000
 .elseif numbytes == 8
        ldm     SRC!, {WK0, WK1}
        ldm     DST!, {WK2, WK3}
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        /* Z is set when the top byte of both source words is zero */
        tst     SCRATCH, #0xff000000
 .else // numbytes == 4
        ldr     WK0, [SRC], #4
        ldr     WK2, [DST], #4
        /* Z is set when the top byte of the source word is zero */
        tst     WK0, #0xff000000
 .endif
.endm

/* Finish the block started by RGBtoRGBPixelAlpha_process_head and store it.
 * Expects the flags from the head's final tst: if Z is set every source
 * alpha byte was zero and nothing is written. Otherwise the AND of the
 * source words (ORIG_W, or WK0 itself for a single pixel) is compared with
 * 0xff000000: unsigned higher-or-same means every alpha byte is 0xff and the
 * opaque path is taken, else each pixel is blended by the translucent macro.
 * Stores use negative offsets because the head already advanced DST.
 * Numeric local labels: 10 = opaque path, 19 = final store, 20 = exit.
 * The cond and firstreg arguments are not used.
 */
.macro RGBtoRGBPixelAlpha_process_tail  cond, numbytes, firstreg
        beq     20f @ all transparent
 .if numbytes == 16
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        /* Pixels 0 and 1. STRIDE_S and STRIDE_M, which held source pixels 2
         * and 3 for the alpha test, are reused as scratch from here on. */
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        strd    WK2, WK3, [DST, #-16]
        /* Reload source pixels 2 and 3 and fetch destination pixels 2 and 3 */
        ldrd    WK0, WK1, [SRC, #-8]
        ldrd    WK2, WK3, [DST, #-8]
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
        strd    WK2, WK3, [DST, #-16]
        ldrd    WK0, WK1, [SRC, #-8]
        ldrd    WK2, WK3, [DST, #-8]
        RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
19:     strd    WK2, WK3, [DST, #-8]
 .elseif numbytes == 8
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
19:     strd    WK2, WK3, [DST, #-8]
 .else // numbytes == 4
        cmp     WK0, #0xff000000
        bhs     10f @ opaque
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
19:     str     WK2, [DST, #-4]
 .endif
20:
.endm

/* Per-pixel-alpha blit built from the RGBtoRGBPixelAlpha_* macros.
 * The process macros contain branches and compare instructions and store
 * their own results, which matches the FLAG_BRANCH_OVER,
 * FLAG_PROCESS_CORRUPTS_PSR and FLAG_PROCESS_DOES_STORE flags; WK0 is
 * overwritten by the blend macros (FLAG_PROCESS_CORRUPTS_WK0).
 * NOTE(review): the three numbers after the name are presumably the source,
 * mask and destination bits per pixel - confirm in pixman-arm-simd-asm.h.
 */
generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    RGBtoRGBPixelAlpha_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    RGBtoRGBPixelAlpha_process_head, \
    RGBtoRGBPixelAlpha_process_tail

/******************************************************************************/

.macro ARGBto565PixelAlpha_init
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
        /* MASK = 0x001f001f, passed to the pixel macros as rbmask */
        mov     MASK, #0x001f
        orr     MASK, MASK, MASK, lsl #16
        /* STRIDE_M = 0x00100010, passed to the pixel macros as rbhalf */
        mov     STRIDE_M, #0x0010
        orr     STRIDE_M, STRIDE_M, STRIDE_M, lsl #16
.endm

.macro ARGBto565PixelAlpha_newline
        /* STRIDE_S = 0x0200, passed to the translucent blend macros as the
         * ghalf constant.
         * NOTE(review): presumably set on every line, rather than once in the
         * init macro, because STRIDE_S is listed in line_saved_regs and does
         * not keep its value across lines - confirm against the framework in
         * pixman-arm-simd-asm.h. */
        mov     STRIDE_S, #0x0200
.endm

/* On entry:
 * s holds 1 32bpp source pixel
 * d holds 1 16bpp destination pixel
 * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
 * other registers (alpha, rb, g, misc) are temporaries
 * On exit:
 * Constant registers preserved
 * The bottom halfword of d holds the blended 16bpp pixel (the top halfword
 * is not cleared); s and the temporaries are corrupted
 */

.macro ARGBto565PixelAlpha_1pixel_translucent  s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
        /* alpha = top 5 bits of the source alpha byte */
        mov     alpha, s, lsr #27
        /* misc = top 6 bits of source byte 1, still at bits 15:10 */
        and     misc, s, #0xfc00
        /* g = 6-bit destination field at bits 10:5 */
        and     g, d, #0x07e0
        /* rb = d with a copy of d bits 15:11 placed at bits 20:16 */
        pkhbt   rb, d, d, lsl #5
        /* misc = (source field moved down to bits 10:5) - g */
        rsb     misc, g, misc, lsr #5
        /* s = top 5 bits of source byte 2 at bits 20:16 and of source byte 0
         * at bits 4:0 */
        and     s, rbmask, s, lsr #3
        /* rb = destination 5-bit fields in the same two positions */
        and     rb, rbmask, rb
        /* Both 5-bit differences computed in one subtraction */
        sub     s, s, rb
        /* difference * alpha + rounding constant */
        smlabb  misc, misc, alpha, ghalf
        mla     s, s, alpha, rbhalf
        /* Multiply by 33 and shift right by 10 (scale by 33/1024), then add
         * the scaled difference to the destination field */
        add     misc, misc, misc, lsl #5
        add     g, g, misc, asr #10
        add     s, s, s, lsl #5
        and     g, g, #0x07e0
        add     rb, rb, s, asr #10
        and     rb, rb, rbmask
        /* Keep bits 4:0 in place and move bits 20:16 up to bits 31:27 */
        pkhbt   rb, rb, rb, lsl #11
        orr     d, rb, g
        /* Bring bits 31:27 down to bits 15:11 to complete the 16bpp pixel */
        orr     d, d, rb, lsr #16
.endm

/* On entry:
 * s holds 1 32bpp source pixel
 * d holds 1 16bpp destination pixel (its old value is not used)
 * rbmask holds 0x001f001f
 * On exit:
 * Constant registers preserved
 * The bottom halfword of d holds the converted 16bpp pixel (the top halfword
 * is not cleared); s is corrupted
 */

.macro ARGBto565PixelAlpha_1pixel_opaque  s, d, rbmask
        /* d = top 5 bits of source byte 2 at bits 20:16 and of source byte 0
         * at bits 4:0. This must read s before the next instruction masks it. */
        and     d, rbmask, s, lsr #3
        /* s = top 6 bits of source byte 1, at bits 15:10 */
        and     s, s, #0xfc00
        /* Copy the field at bits 20:16 down to bits 15:11 */
        orr     d, d, d, lsr #5
        /* Insert the 6-bit field at bits 10:5 */
        orr     d, d, s, lsr #5
.endm

/* On entry:
 * s1, s2 hold 2 32bpp source pixels
 * d holds 2 16bpp destination pixels (first pixel in the bottom halfword)
 * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
 * other registers (alpha, rb, g, misc) are temporaries
 * DST has already been advanced past the two destination pixels
 * On exit:
 * Constant registers preserved
 * Blended results have been written through destination pointer
 * (halfword stores at DST-4 and DST-2); s1, s2, d and the temporaries are
 * corrupted
 */

.macro ARGBto565PixelAlpha_2pixels_translucent  s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
        /* Two copies of the single-pixel blend sequence, interleaved.
         * Instructions at the normal indent handle the first pixel (s1 and
         * the bottom halfword of d); instructions indented two extra columns
         * handle the second pixel (s2 and the top halfword of d).
         * For the second pixel, d is reused for the packed 5-bit fields, s1
         * for the 6-bit field, and alpha finally holds each packed result
         * that is stored. */
        mov     alpha, s1, lsr #27
        and     misc, s1, #0xfc00
        and     g, d, #0x07e0
        pkhbt   rb, d, d, lsl #5
        rsb     misc, g, misc, lsr #5
        and     s1, rbmask, s1, lsr #3
        and     rb, rbmask, rb
        sub     s1, s1, rb
        smlabb  misc, misc, alpha, ghalf
        mla     s1, s1, alpha, rbhalf
          uxth    d, d, ror #16
        add     misc, misc, misc, lsl #5
          mov     alpha, s2, lsr #27
        add     g, g, misc, asr #10
        add     s1, s1, s1, lsl #5
        and     g, g, #0x07e0
        add     rb, rb, s1, asr #10
        and     rb, rb, rbmask
          and     misc, s2, #0xfc00
        pkhbt   rb, rb, rb, lsl #11
          and     s1, d, #0x07e0
          pkhbt   d, d, d, lsl #5
          rsb     misc, s1, misc, lsr #5
          and     s2, rbmask, s2, lsr #3
          and     d, rbmask, d
          sub     s2, s2, d
          smlabb  misc, misc, alpha, ghalf
          mla     s2, s2, alpha, rbhalf
        orr     alpha, rb, g
          add     misc, misc, misc, lsl #5
        orr     alpha, alpha, rb, lsr #16
          add     s1, s1, misc, asr #10
          add     s2, s2, s2, lsl #5
          and     s1, s1, #0x07e0
          add     d, d, s2, asr #10
          and     d, d, rbmask
        strh    alpha, [DST, #-4]
          pkhbt   d, d, d, lsl #11
          orr     alpha, d, s1
          orr     alpha, alpha, d, lsr #16
          strh    alpha, [DST, #-2]
.endm

/* On entry:
 * s1, s2 hold 2 32bpp source pixels
 * rbmask holds 0x001f001f
 * other registers (d, g) are temporaries; the old value of d is not used
 * DST has already been advanced past the two destination pixels
 * On exit:
 * Constant registers preserved
 * Converted results have been written through destination pointer
 * (halfword stores at DST-4 and DST-2); s1, d and g are corrupted
 */

.macro ARGBto565PixelAlpha_2pixels_opaque  s1, s2, d, rbmask, g
        /* Two copies of the single-pixel conversion, interleaved.
         * Instructions at the normal indent handle the first pixel;
         * instructions indented two extra columns handle the second pixel,
         * reusing s1 as its output register once s1 has been fully read. */
        and     g, s1, #0xfc00
        and     d, rbmask, s1, lsr #3
          and     s1, rbmask, s2, lsr #3
        orr     d, d, d, lsr #5
        orr     d, d, g, lsr #5
          and     g, s2, #0xfc00
        strh    d, [DST, #-4]
          orr     s1, s1, s1, lsr #5
          orr     s1, s1, g, lsr #5
          strh    s1, [DST, #-2]
.endm

/* Load two 32bpp source pixels into WK0, WK1 and the matching pair of 16bpp
 * destination pixels into WK2, post-incrementing SRC by 8 and DST by 4.
 * SCRATCH receives the OR and ORIG_W the AND of the two source words; the
 * final tst leaves Z set when both source alpha bytes are zero, which
 * ARGBto565PixelAlpha_2pixels_tail tests first.
 */
.macro ARGBto565PixelAlpha_2pixels_head
        ldrd    WK0, WK1, [SRC], #8
        ldr     WK2, [DST], #4
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        tst     SCRATCH, #0xff000000
.endm

/* Finish the pair loaded by ARGBto565PixelAlpha_2pixels_head.
 * Expects the flags from the head's tst. Nothing is written when both alpha
 * bytes are zero; the opaque conversion is used when the AND of the two
 * source words (ORIG_W) is at least 0xff000000, i.e. both alpha bytes are
 * 0xff; otherwise both pixels go through the translucent blend.
 * Both pixel macros store their own results. Labels 10 and 20 are numeric
 * local labels, so the macro can be expanded several times in succession.
 */
.macro ARGBto565PixelAlpha_2pixels_tail
        beq     20f @ all transparent
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        ARGBto565PixelAlpha_2pixels_translucent  WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
        b       20f
10:     ARGBto565PixelAlpha_2pixels_opaque  WK0, WK1, WK2, MASK, SCRATCH
20:
.endm

/* Process all but the last pixel pair of the block completely, then load
 * the last pair (or the single last pixel) and leave the flags set for
 * ARGBto565PixelAlpha_process_tail.
 * numbytes counts 16bpp output bytes, so each head/tail pair below accounts
 * for 4 of them:
 *   16 bytes: three complete pairs here, fourth pair loaded
 *    8 bytes: one complete pair here, second pair loaded
 *    4 bytes: the only pair loaded
 *    2 bytes: one source word and one destination halfword loaded
 * The cond, firstreg, unaligned_src, unaligned_mask and preload arguments
 * are not used.
 */
.macro ARGBto565PixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
 .endif
 .if numbytes >= 8
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
 .endif
 .if numbytes >= 4
        ARGBto565PixelAlpha_2pixels_head
 .else // numbytes == 2
        ldr     WK0, [SRC], #4
        ldrh    WK2, [DST], #2
        /* Z is set when the source alpha byte is zero */
        tst     WK0, #0xff000000
 .endif
.endm

/* Finish the last pixel pair (or single pixel) loaded by
 * ARGBto565PixelAlpha_process_head, using the flags it left behind.
 * For a single pixel: nothing is written when the alpha byte is zero, the
 * opaque conversion is used when WK0 is at least 0xff000000 (alpha byte
 * 0xff), otherwise the pixel is blended; the result is stored as a halfword
 * at DST-2 because the head already advanced DST.
 * Numeric local labels: 10 = opaque path, 19 = store, 20 = exit.
 * The cond and firstreg arguments are not used.
 */
.macro ARGBto565PixelAlpha_process_tail  cond, numbytes, firstreg
 .if numbytes >= 4
        ARGBto565PixelAlpha_2pixels_tail
 .else // numbytes == 2
        beq     20f @ all transparent
        cmp     WK0, #0xff000000
        bhs     10f @ opaque
        ARGBto565PixelAlpha_1pixel_translucent  WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
        b       19f
10:     ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK
19:     strh    WK2, [DST, #-2]
20:
 .endif
.endm

/* Per-pixel-alpha blit from 32bpp source pixels to 16bpp destination pixels,
 * built from the ARGBto565PixelAlpha_* macros.
 * The process macros contain branches and compare instructions and store
 * their own results, which matches the FLAG_BRANCH_OVER,
 * FLAG_PROCESS_CORRUPTS_PSR and FLAG_PROCESS_DOES_STORE flags; WK0 is
 * overwritten by the pixel macros (FLAG_PROCESS_CORRUPTS_WK0).
 * NOTE(review): the three numbers after the name are presumably the source,
 * mask and destination bits per pixel - confirm in pixman-arm-simd-asm.h.
 */
generate_composite_function \
    BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    ARGBto565PixelAlpha_init, \
    ARGBto565PixelAlpha_newline, \
    nop_macro, /* cleanup */ \
    ARGBto565PixelAlpha_process_head, \
    ARGBto565PixelAlpha_process_tail

 /******************************************************************************/

/* Swap bytes 0 and 2 of one WK register in place, leaving bytes 1 and 3
 * where they are.
 *   cond  condition code suffix applied to every instruction
 *   reg   number of the WK register to convert
 *   tmp   scratch register, corrupted
 */
.macro BGR888toRGB888_1pixel cond, reg, tmp
        /* tmp = bytes 1 and 3, moved down to bytes 0 and 2 */
        uxtb16&cond  tmp, WK&reg, ror #8
        /* WK = old byte 2 in byte 0 and old byte 0 in byte 2 */
        uxtb16&cond  WK&reg, WK&reg, ror #16
        /* Put bytes 1 and 3 back in their original positions */
        orr&cond     WK&reg, WK&reg, tmp, lsl #8
.endm

/* Swap bytes 0 and 2 of two WK registers in place, leaving bytes 1 and 3
 * where they are. This is the single-pixel sequence applied to two
 * registers, with the two instruction streams interleaved.
 *   cond        condition code suffix applied to every instruction
 *   reg1, reg2  numbers of the WK registers to convert
 *   tmp1, tmp2  scratch registers, corrupted
 */
.macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2
        uxtb16&cond  tmp1, WK&reg1, ror #8
        uxtb16&cond  WK&reg1, WK&reg1, ror #16
        uxtb16&cond  tmp2, WK&reg2, ror #8
        uxtb16&cond  WK&reg2, WK&reg2, ror #16
        orr&cond     WK&reg1, WK&reg1, tmp1, lsl #8
        orr&cond     WK&reg2, WK&reg2, tmp2, lsl #8
.endm

/* Load numbytes bytes of source pixels into the WK registers starting at
 * firstreg, using the pixld helper. The unaligned_mask and preload arguments
 * are not used.
 */
.macro BGR888toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

/* Convert the pixels loaded by BGR888toRGB888_process_head in place, one
 * WK register per pixel: two registers for 8 bytes, four for 16 bytes and a
 * single register otherwise. MASK and STRIDE_M are used as scratch
 * registers. This macro does not store the result itself.
 */
.macro BGR888toRGB888_process_tail  cond, numbytes, firstreg
 .if numbytes >= 8
        BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M
  .if numbytes == 16
        BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M
  .endif
 .else @ numbytes == 4
        BGR888toRGB888_1pixel cond, %(firstreg+0), MASK
 .endif
.endm

/* Blit that swaps bytes 0 and 2 of every 32-bit pixel, built from the
 * BGR888toRGB888_* macros. Those macros apply the cond suffix to every
 * instruction and do not touch SCRATCH, which matches FLAG_COND_EXEC and
 * FLAG_PROCESS_PRESERVES_SCRATCH; no FLAG_PROCESS_DOES_STORE is given and
 * the process macros contain no store.
 * NOTE(review): the three numbers after the name are presumably the source,
 * mask and destination bits per pixel - confirm in pixman-arm-simd-asm.h.
 */
generate_composite_function \
    Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    BGR888toRGB888_process_head, \
    BGR888toRGB888_process_tail

/******************************************************************************/

.macro RGB444toRGB888_init
        /* GE[3:0] = 0101, so that each SEL in the pixel macros takes bytes 0
         * and 2 from its first source operand and bytes 1 and 3 from its
         * second source operand */
        msr     CPSR_s, #0x50000
        /* MASK keeps the low nibble of every byte */
        ldr     MASK, =0x0f0f0f0f
.endm

/* Expand one 16-bit pixel (four 4-bit components) in the bottom halfword of
 * a WK register into a 32-bit pixel with four 8-bit components, in place.
 * Each nibble is duplicated into both halves of its byte.
 *   reg   number of the WK register holding the pixel
 *   mask  register holding 0x0f0f0f0f
 *   tmp   scratch register, corrupted
 * Requires the GE flags to be 0101, as set up by RGB444toRGB888_init.
 * The trailing comments show the bit layout of each result.
 */
.macro RGB444toRGB888_1pixel reg, mask, tmp
        pkhbt   WK&reg, WK&reg, WK&reg, lsl #12      @ 0000aaaarrrrggggaaaarrrrggggbbbb
        and     WK&reg, mask, WK&reg                 @ 0000aaaa0000gggg0000rrrr0000bbbb
        orr     WK&reg, WK&reg, WK&reg, lsl #4       @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
        pkhtb   tmp, WK&reg, WK&reg, asr #8          @ aaaaaaaaggggggggggggggggrrrrrrrr
        pkhbt   WK&reg, WK&reg, WK&reg, lsl #8       @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
        sel     WK&reg, WK&reg, tmp                  @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
.endm

/* Expand two 16-bit pixels packed in one WK register into two 32-bit pixels.
 *   in          number of the WK register holding both input pixels; in the
 *               trailing comments lower-case letters are the pixel in the
 *               bottom halfword and upper-case letters the pixel in the top
 *   out1, out2  numbers of the WK registers receiving the bottom-halfword
 *               and top-halfword pixel respectively
 *   mask        register holding 0x0f0f0f0f
 *   tmp1, tmp2  scratch registers, corrupted
 * The input register is only read by the first two instructions, before
 * either output is written, so it may be the same register as an output.
 * Requires the GE flags to be 0101, as set up by RGB444toRGB888_init.
 */
.macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2
        and     tmp1, mask, WK&in                    @ 0000RRRR0000BBBB0000rrrr0000bbbb
        and     tmp2, mask, WK&in, lsr #4            @ 0000AAAA0000GGGG0000aaaa0000gggg
        orr     tmp1, tmp1, tmp1, lsl #4             @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb
        orr     tmp2, tmp2, tmp2, lsl #4             @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg
        pkhtb   WK&out2, tmp2, tmp1, asr #16         @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB
        pkhbt   WK&out1, tmp1, tmp2, lsl #16         @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
        pkhtb   tmp2, WK&out2, WK&out2, asr #8       @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR
        pkhtb   tmp1, WK&out1, WK&out1, asr #8       @ aaaaaaaaggggggggggggggggrrrrrrrr
        pkhbt   WK&out1, WK&out1, WK&out1, lsl #8    @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
        pkhbt   WK&out2, WK&out2, WK&out2, lsl #8    @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB
        sel     WK&out1, WK&out1, tmp1               @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
        sel     WK&out2, WK&out2, tmp2               @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* Load the source pixels for one block with the pixld helper. numbytes is
 * the number of output bytes to generate; only numbytes/2 bytes are loaded,
 * because each 16-bit source pixel expands to a 32-bit output pixel. The
 * unaligned_mask and preload arguments are not used.
 */
.macro RGB444toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes/2, firstreg, SRC, unaligned_src
.endm

/* Expand the 16-bit pixels loaded by RGB444toRGB888_process_head into 32-bit
 * pixels held in consecutive WK registers starting at firstreg. STRIDE_M and
 * SCRATCH are used as scratch registers. The cond argument is not used.
 */
.macro RGB444toRGB888_process_tail  cond, numbytes, firstreg
 .if numbytes == 16
        /* Expand the second source word first: its outputs go to firstreg+2
         * and firstreg+3, so firstreg+1 is free to receive an output of the
         * next expansion */
        RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH
 .endif
 .if numbytes >= 8
        /* First source word expands into firstreg+0 and firstreg+1 */
        RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH
 .else @ numbytes == 4
        RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH
 .endif
.endm

/* Blit that expands 16-bit pixels with 4-bit components into 32-bit pixels
 * with 8-bit components, built from the RGB444toRGB888_* macros. Those
 * macros do not apply the cond argument to their instructions, and this
 * invocation passes FLAG_BRANCH_OVER rather than FLAG_COND_EXEC.
 * NOTE(review): the three numbers after the name are presumably the source,
 * mask and destination bits per pixel - confirm in pixman-arm-simd-asm.h.
 */
generate_composite_function \
    Blit_RGB444_RGB888ARMSIMDAsm, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    2, /* prefetch distance */ \
    RGB444toRGB888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    RGB444toRGB888_process_head, \
    RGB444toRGB888_process_tail
kc3-lang/SDL/src/video/arm/pixman-arm-simd-asm.S

Commit

kc3-lang/SDL /src/video/arm/pixman-arm-simd-asm.S