Edit

kc3-lang/SDL/src/video/arm/pixman-arm-simd-asm.S

Branch :

  • Show log

    Commit

  • Author : Ben Avison
    Date : 2019-10-24 21:15:50
    Hash : 74846657
    Message : ARM: SIMD optimization for 4:4:4:4 to 8:8:8:8 normal blits

  • src/video/arm/pixman-arm-simd-asm.S
  • /*
     * Copyright (c) 2016 RISC OS Open Ltd
     *
     * This software is provided 'as-is', without any express or implied
     * warranty.  In no event will the authors be held liable for any damages
     * arising from the use of this software.
     *
     * Permission is granted to anyone to use this software for any purpose,
     * including commercial applications, and to alter it and redistribute it
     * freely, subject to the following restrictions:
     *
     * 1. The origin of this software must not be misrepresented; you must not
     *    claim that you wrote the original software. If you use this software
     *    in a product, an acknowledgment in the product documentation would be
     *    appreciated but is not required.
     * 2. Altered source versions must be plainly marked as such, and must not be
     *    misrepresented as being the original software.
     * 3. This notice may not be removed or altered from any source distribution.
     */
    
    /* Prevent the stack from becoming executable: on Linux/ELF targets an
     * empty .note.GNU-stack section is emitted. */
    #if defined(__linux__) && defined(__ELF__)
    .section .note.GNU-stack,"",%progbits
    #endif
    
    /* Assembler setup for all the code below:
     *   .arch armv6         accept ARMv6 instructions (the macros in this file
     *                       use uxtb16, pkhbt, sel, uadd8, smlabb, ...)
     *   .object_arch armv4  override the architecture recorded in the object
     *                       file attributes
     *   .arm                assemble ARM (not Thumb) encodings
     *   .altmacro           alternate macro syntax; the macros below pass
     *                       %(expression) arguments
     *   .p2align 2          align to a 4-byte boundary
     */
    	.text
    	.arch armv6
    	.object_arch armv4
    	.arm
    	.altmacro
    	.p2align 2
    
    #include "pixman-arm-asm.h"
    #include "pixman-arm-simd-asm.h"
    
    /* A head macro should do all processing which results in an output of up to
     * 16 bytes, as far as the final load instruction. The corresponding tail macro
     * should complete the processing of the up-to-16 bytes. The calling macro will
     * sometimes choose to insert a preload or a decrement of X between them.
     *   cond           ARM condition code for code block
     *   numbytes       Number of output bytes that should be generated this time
     *   firstreg       First WK register in which to place output
     *   unaligned_src  Whether to use non-wordaligned loads of source image
     *   unaligned_mask Whether to use non-wordaligned loads of mask image
     *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
     */
    
    /******************************************************************************/
    
    /* Init for the 32bpp fill: load the 32-bit fill value from the stack
     * argument slot into SRC and copy it to STRIDE_S, MASK and STRIDE_M.
     * FillRect_process_tail stores from these four registers (it aliases
     * them as WK4-WK7).
     */
    .macro FillRect32_init
            ldr     SRC, [sp, #ARGS_STACK_OFFSET]
            mov     STRIDE_M, SRC
            mov     MASK, SRC
            mov     STRIDE_S, SRC
    .endm
    
    /* Init for the 16bpp fill: load the 16-bit fill value from the stack
     * argument slot, duplicate it into both halfwords of SRC, then copy the
     * resulting word to STRIDE_S, MASK and STRIDE_M (the registers that
     * FillRect_process_tail stores from).
     */
    .macro FillRect16_init
            ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
            orr     SRC, SRC, SRC, lsl #16          /* value in both halfwords */
            mov     STRIDE_M, SRC
            mov     MASK, SRC
            mov     STRIDE_S, SRC
    .endm
    
    /* Init for the 8bpp fill: load the 8-bit fill value from the stack
     * argument slot, replicate it into all four bytes of SRC, then copy the
     * resulting word to STRIDE_S, MASK and STRIDE_M (the registers that
     * FillRect_process_tail stores from).
     */
    .macro FillRect8_init
            ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
            orr     SRC, SRC, SRC, lsl #8           /* value in bytes 0-1 */
            orr     SRC, SRC, SRC, lsl #16          /* value in bytes 0-3 */
            mov     STRIDE_M, SRC
            mov     MASK, SRC
            mov     STRIDE_S, SRC
    .endm
    
    /* Store step shared by all three fill routines.
     * SRC, STRIDE_S, MASK and STRIDE_M (all loaded with the fill pattern by
     * the init macro) are temporarily given the names WK4-WK7, and pixst is
     * invoked with register index 4 - the firstreg argument is not used.
     * pixst is defined outside this file; NOTE(review): it presumably stores
     * numbytes bytes from WK4 onwards through DST - confirm in the header.
     */
    .macro FillRect_process_tail  cond, numbytes, firstreg
        WK7     .req    STRIDE_M
        WK6     .req    MASK
        WK5     .req    STRIDE_S
        WK4     .req    SRC
            pixst   cond, numbytes, 4, DST
        .unreq  WK7
        .unreq  WK6
        .unreq  WK5
        .unreq  WK4
    .endm
    
    /* 32bpp rectangle fill.
     * Arguments, in order: function name; source, mask and destination bits
     * per pixel; flags; prefetch distance; then the init, newline, cleanup,
     * process-head and process-tail macros.
     * Every argument is separated by a comma, as in the other
     * generate_composite_function invocations in this file, so the argument
     * list does not depend on the assembler splitting macro arguments at
     * whitespace.
     */
    generate_composite_function \
        FillRect32ARMSIMDAsm, 0, 0, 32, \
        FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
        0, /* prefetch distance doesn't apply */ \
        FillRect32_init, \
        nop_macro, /* newline */ \
        nop_macro, /* cleanup */ \
        nop_macro, /* process head */ \
        FillRect_process_tail
    
    /* 16bpp rectangle fill.
     * Arguments, in order: function name; source, mask and destination bits
     * per pixel; flags; prefetch distance; then the init, newline, cleanup,
     * process-head and process-tail macros.
     * Every argument is separated by a comma, as in the other
     * generate_composite_function invocations in this file, so the argument
     * list does not depend on the assembler splitting macro arguments at
     * whitespace.
     */
    generate_composite_function \
        FillRect16ARMSIMDAsm, 0, 0, 16, \
        FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
        0, /* prefetch distance doesn't apply */ \
        FillRect16_init, \
        nop_macro, /* newline */ \
        nop_macro, /* cleanup */ \
        nop_macro, /* process head */ \
        FillRect_process_tail
    
    /* 8bpp rectangle fill.
     * Arguments, in order: function name; source, mask and destination bits
     * per pixel; flags; prefetch distance; then the init, newline, cleanup,
     * process-head and process-tail macros.
     * Every argument is separated by a comma, as in the other
     * generate_composite_function invocations in this file, so the argument
     * list does not depend on the assembler splitting macro arguments at
     * whitespace.
     */
    generate_composite_function \
        FillRect8ARMSIMDAsm, 0, 0, 8, \
        FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
        0, /* prefetch distance doesn't apply */ \
        FillRect8_init, \
        nop_macro, /* newline */ \
        nop_macro, /* cleanup */ \
        nop_macro, /* process head */ \
        FillRect_process_tail
    
    /******************************************************************************/
    
    /* This differs from the over_8888_8888 routine in Pixman in that the destination
     * alpha component is always left unchanged, and RGB components are not
     * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that
     * renormalisation is done by multiplying by 257/256 (with rounding) rather than
     * simply shifting right by 8 bits - removing the need to special-case alpha=0xff.
     */
    
    /* One-time setup for BlitRGBtoRGBPixelAlphaARMSIMDAsm.
     * line_saved_regs is defined outside this file; NOTE(review): it is
     * presumably what allows STRIDE_S and ORIG_W to be reused as working
     * registers by the head/tail macros below - confirm in the header.
     * MASK is set to 0x80; the tail macro passes it as the "half" rounding
     * constant of RGBtoRGBPixelAlpha_1pixel_translucent.
     */
    .macro RGBtoRGBPixelAlpha_init
            line_saved_regs STRIDE_S, ORIG_W
            mov     MASK, #0x80
    .endm
    
    /* Blend one source pixel into one destination pixel.
     *   s          source pixel; overwritten (left holding s >> 24, the alpha)
     *   d          destination pixel, updated in place; byte 3 is unchanged
     *   tmp0-tmp3  scratch registers, all overwritten
     *   half       register holding the rounding constant added to each product
     * For each of bytes 0, 1 and 2 this computes
     *   x = (s.byte - d.byte) * alpha + half;  x += x >> 8;
     * and adds bits 15:8 of x to the destination byte (modulo 256, via UADD8).
     * UADD8 also rewrites the GE flags.
     */
    .macro RGBtoRGBPixelAlpha_1pixel_translucent  s, d, tmp0, tmp1, tmp2, tmp3, half
            uxtb    tmp3, s                         /* source byte 0 */
            uxtb    tmp0, d                         /* destination byte 0 */
            sub     tmp0, tmp3, tmp0                /* tmp0 = byte-0 difference */
            uxtb    tmp3, s, ror #16                /* source byte 2 */
            uxtb    tmp1, d, ror #16                /* destination byte 2 */
            sub     tmp1, tmp3, tmp1                /* tmp1 = byte-2 difference */
            uxtb    tmp3, s, ror #8                 /* source byte 1 */
            mov     s, s, lsr #24                   /* s = source alpha (byte 3) */
            uxtb    tmp2, d, ror #8                 /* destination byte 1 */
            sub     tmp2, tmp3, tmp2                /* tmp2 = byte-1 difference */
            /* difference * alpha + half, for each of the three bytes */
            smlabb  tmp0, tmp0, s, half
            smlabb  tmp1, tmp1, s, half
            smlabb  tmp2, tmp2, s, half
            /* x += x >> 8: the multiply-by-257/256 renormalisation */
            add     tmp0, tmp0, asr #8
            add     tmp1, tmp1, asr #8
            add     tmp2, tmp2, asr #8
            /* Gather bits 15:8 of each result into its byte lane */
            pkhbt   tmp0, tmp0, tmp1, lsl #16       /* low half: byte-0 result, high half: byte-2 result */
            and     tmp2, tmp2, #0xff00             /* byte-1 result is already in bits 15:8 */
            uxtb16  tmp0, tmp0, ror #8              /* bits 15:8 of each half -> bytes 0 and 2 */
            orr     tmp0, tmp0, tmp2                /* byte 3 of tmp0 is zero */
            uadd8   d, d, tmp0                      /* per-byte add, no carry between bytes */
    .endm
    
    /* Fully opaque case for one pixel: d keeps its own byte 3 and takes
     * bytes 0-2 from s.
     *   s  source pixel; its byte 3 is cleared as a side effect
     *   d  destination pixel, updated in place
     */
    .macro RGBtoRGBPixelAlpha_1pixel_opaque  s, d
            and     d, d, #0xff000000               /* keep destination byte 3 only */
            bic     s, s, #0xff000000               /* drop source byte 3 */
            orr     d, d, s
    .endm
    
    /* Head step: load source and destination pixels, post-incrementing SRC
     * and DST, and test the source alpha bytes.
     * On exit the Z flag is set when every loaded source pixel has a zero
     * byte 3. In the 8- and 16-byte cases ORIG_W holds the AND of all loaded
     * source pixels, which the tail macro uses for its all-opaque test.
     * In the 16-byte case only the first two pixels of each image stay in
     * WK0-WK3; source pixels 3 and 4 are loaded into STRIDE_S/STRIDE_M just
     * for the alpha tests, and the tail macro reloads them from memory.
     */
    .macro RGBtoRGBPixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
     .if numbytes == 16
            ldm     SRC!, {WK0, WK1}
            ldm     SRC!, {STRIDE_S, STRIDE_M}
            ldrd    WK2, WK3, [DST], #16
            /* SCRATCH = OR of the four source pixels, ORIG_W = AND of them */
            orr     SCRATCH, WK0, WK1
            and     ORIG_W, WK0, WK1
            orr     SCRATCH, SCRATCH, STRIDE_S
            and     ORIG_W, ORIG_W, STRIDE_S
            orr     SCRATCH, SCRATCH, STRIDE_M
            and     ORIG_W, ORIG_W, STRIDE_M
            tst     SCRATCH, #0xff000000
     .elseif numbytes == 8
            ldm     SRC!, {WK0, WK1}
            ldm     DST!, {WK2, WK3}
            orr     SCRATCH, WK0, WK1
            and     ORIG_W, WK0, WK1
            tst     SCRATCH, #0xff000000
     .else // numbytes == 4
            ldr     WK0, [SRC], #4
            ldr     WK2, [DST], #4
            tst     WK0, #0xff000000
     .endif
    .endm
    
    /* Tail step: blend the pixels fetched by RGBtoRGBPixelAlpha_process_head
     * and store them behind the already-advanced DST pointer.
     * Relies on the Z flag left by the TST at the end of the head macro.
     * Local labels: 10 = every pixel has byte 3 equal to 0xff,
     *               19 = final store, 20 = done (nothing stored when all
     *               source alphas are zero).
     * STRIDE_S, STRIDE_M, SCRATCH and ORIG_W are passed as temporaries to the
     * translucent blend and MASK as its rounding constant.
     */
    .macro RGBtoRGBPixelAlpha_process_tail  cond, numbytes, firstreg
            beq     20f @ all transparent
     .if numbytes == 16
            /* ORIG_W is the AND of all source pixels: unsigned >= 0xff000000
             * means every alpha byte is 0xff */
            cmp     ORIG_W, #0xff000000
            bhs     10f @ all opaque
            RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
            RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
            strd    WK2, WK3, [DST, #-16]
            /* fetch pixels 3 and 4 of both images from memory */
            ldrd    WK0, WK1, [SRC, #-8]
            ldrd    WK2, WK3, [DST, #-8]
            RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
            RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
            b       19f
    10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
            RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
            strd    WK2, WK3, [DST, #-16]
            ldrd    WK0, WK1, [SRC, #-8]
            ldrd    WK2, WK3, [DST, #-8]
            RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
            RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
    19:     strd    WK2, WK3, [DST, #-8]
     .elseif numbytes == 8
            cmp     ORIG_W, #0xff000000
            bhs     10f @ all opaque
            RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
            RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
            b       19f
    10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
            RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
    19:     strd    WK2, WK3, [DST, #-8]
     .else // numbytes == 4
            /* single pixel: test the source pixel itself */
            cmp     WK0, #0xff000000
            bhs     10f @ opaque
            RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
            b       19f
    10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
    19:     str     WK2, [DST, #-4]
     .endif
    20:
    .endm
    
    /* 32bpp-to-32bpp per-pixel alpha blit, built from the
     * RGBtoRGBPixelAlpha_* macros above.
     * Arguments, in order: function name; source, mask and destination bits
     * per pixel; flags; prefetch distance; then the init, newline, cleanup,
     * process-head and process-tail macros.
     */
    generate_composite_function \
        BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \
        FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
        2, /* prefetch distance */ \
        RGBtoRGBPixelAlpha_init, \
        nop_macro, /* newline */ \
        nop_macro, /* cleanup */ \
        RGBtoRGBPixelAlpha_process_head, \
        RGBtoRGBPixelAlpha_process_tail
    
    /******************************************************************************/
    
    /* One-time setup for BlitARGBto565PixelAlphaARMSIMDAsm: build the two
     * constants that the blend macros expect in MASK and STRIDE_M.
     *   MASK     = 0x001f001f  (passed as rbmask)
     *   STRIDE_M = 0x00100010  (passed as rbhalf)
     * line_saved_regs is defined outside this file; NOTE(review): it is
     * presumably what allows STRIDE_D, STRIDE_S and ORIG_W to be reused as
     * working registers - confirm in the header.
     */
    .macro ARGBto565PixelAlpha_init
            line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
            mov     MASK, #0x001f
            orr     MASK, MASK, MASK, lsl #16
            mov     STRIDE_M, #0x0010
            orr     STRIDE_M, STRIDE_M, STRIDE_M, lsl #16
    .endm
    
    /* Per-line setup: load STRIDE_S with 0x0200, the value the blend macros
     * take as their ghalf argument.
     */
    .macro ARGBto565PixelAlpha_newline
            mov     STRIDE_S, #0x0200
    .endm
    
    /* On entry:
     * s holds 1 32bpp source pixel
     * d holds 1 16bpp destination pixel
     * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
     * other registers are temporaries
     * On exit:
     * The low halfword of d holds the blended 16bpp pixel (the high halfword
     * is not cleared)
     * s, alpha, rb, g and misc are overwritten
     * Constant registers preserved
     *
     * Each channel is computed as d + (((s - d) * alpha + half) * 33 >> 10),
     * with alpha taken from the top 5 bits of the source pixel. The two 5-bit
     * fields are processed together, one per halfword of a register; the
     * 6-bit field is processed on its own, in place at bits 10:5.
     */
    
    .macro ARGBto565PixelAlpha_1pixel_translucent  s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
            mov     alpha, s, lsr #27               /* 5-bit alpha */
            and     misc, s, #0xfc00                /* top 6 bits of source byte 1 */
            and     g, d, #0x07e0                   /* destination bits 10:5 */
            pkhbt   rb, d, d, lsl #5                /* high half gets d bits 15:11 in its low 5 bits */
            rsb     misc, g, misc, lsr #5           /* misc = (misc >> 5) - g */
            and     s, rbmask, s, lsr #3            /* top 5 bits of source bytes 0 and 2 */
            and     rb, rbmask, rb                  /* destination bits 4:0 and 15:11, one per halfword */
            sub     s, s, rb                        /* both 5-bit differences at once */
            smlabb  misc, misc, alpha, ghalf        /* difference * alpha + half */
            mla     s, s, alpha, rbhalf
            add     misc, misc, misc, lsl #5        /* x * 33 ... */
            add     g, g, misc, asr #10             /* ... >> 10, added to the destination field */
            add     s, s, s, lsl #5
            and     g, g, #0x07e0
            add     rb, rb, s, asr #10
            and     rb, rb, rbmask
            pkhbt   rb, rb, rb, lsl #11             /* move the high-half field up to bits 15:11 of the high half */
            orr     d, rb, g
            orr     d, d, rb, lsr #16               /* merge the high half into bits 15:11 of the low half */
    .endm
    
    /* On entry:
     * s holds 1 32bpp source pixel
     * d is overwritten (the previous destination pixel is not used)
     * rbmask holds 0x001f001f
     * On exit:
     * The low halfword of d holds the source pixel converted to 16bpp (the
     * high halfword is not cleared)
     * s is overwritten
     * Constant registers preserved
     */
    
    .macro ARGBto565PixelAlpha_1pixel_opaque  s, d, rbmask
            and     d, rbmask, s, lsr #3            /* top 5 bits of source bytes 0 and 2, one per halfword */
            and     s, s, #0xfc00                   /* top 6 bits of source byte 1 */
            orr     d, d, d, lsr #5                 /* bring the high-half field down to bits 15:11 */
            orr     d, d, s, lsr #5                 /* 6-bit field into bits 10:5 */
    .endm
    
    /* On entry:
     * s1, s2 hold 2 32bpp source pixels
     * d holds 2 16bpp destination pixels
     * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
     * other registers are temporaries
     * DST has already been advanced past the two destination pixels
     * On exit:
     * Constant registers preserved
     * Blended results have been written through destination pointer
     * (halfword stores at [DST, #-4] and [DST, #-2])
     * s1, s2, d, alpha, rb, g and misc are overwritten
     *
     * This is the 1-pixel translucent sequence applied twice, with the two
     * instances interleaved: the instructions with the extra indent belong
     * to the second pixel (s2 with the high halfword of d). Registers are
     * reused between the two: d becomes the second destination pixel, s1 its
     * 6-bit field, and alpha is finally used to assemble each output pixel.
     */
    
    .macro ARGBto565PixelAlpha_2pixels_translucent  s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
            mov     alpha, s1, lsr #27
            and     misc, s1, #0xfc00
            and     g, d, #0x07e0
            pkhbt   rb, d, d, lsl #5
            rsb     misc, g, misc, lsr #5
            and     s1, rbmask, s1, lsr #3
            and     rb, rbmask, rb
            sub     s1, s1, rb
            smlabb  misc, misc, alpha, ghalf
            mla     s1, s1, alpha, rbhalf
              uxth    d, d, ror #16                 /* d = second destination pixel */
            add     misc, misc, misc, lsl #5
              mov     alpha, s2, lsr #27            /* alpha of the second pixel */
            add     g, g, misc, asr #10
            add     s1, s1, s1, lsl #5
            and     g, g, #0x07e0
            add     rb, rb, s1, asr #10
            and     rb, rb, rbmask
              and     misc, s2, #0xfc00
            pkhbt   rb, rb, rb, lsl #11
              and     s1, d, #0x07e0                /* s1 reused: bits 10:5 of the second destination pixel */
              pkhbt   d, d, d, lsl #5
              rsb     misc, s1, misc, lsr #5
              and     s2, rbmask, s2, lsr #3
              and     d, rbmask, d
              sub     s2, s2, d
              smlabb  misc, misc, alpha, ghalf
              mla     s2, s2, alpha, rbhalf
            orr     alpha, rb, g                    /* alpha reused: assemble the first output pixel */
              add     misc, misc, misc, lsl #5
            orr     alpha, alpha, rb, lsr #16
              add     s1, s1, misc, asr #10
              add     s2, s2, s2, lsl #5
              and     s1, s1, #0x07e0
              add     d, d, s2, asr #10
              and     d, d, rbmask
            strh    alpha, [DST, #-4]               /* store first output pixel */
              pkhbt   d, d, d, lsl #11
              orr     alpha, d, s1
              orr     alpha, alpha, d, lsr #16
              strh    alpha, [DST, #-2]             /* store second output pixel */
    .endm
    
    /* On entry:
     * s1, s2 hold 2 32bpp source pixels
     * rbmask holds 0x001f001f
     * other registers are temporaries (d and g; the previous contents of d
     * are not used)
     * DST has already been advanced past the two destination pixels
     * On exit:
     * Constant registers preserved
     * Converted results have been written through destination pointer
     * (halfword stores at [DST, #-4] and [DST, #-2])
     * s1, d and g are overwritten
     *
     * The instructions with the extra indent belong to the second pixel,
     * which is assembled in s1 once the first pixel no longer needs it.
     */
    
    .macro ARGBto565PixelAlpha_2pixels_opaque  s1, s2, d, rbmask, g
            and     g, s1, #0xfc00                  /* top 6 bits of byte 1, first pixel */
            and     d, rbmask, s1, lsr #3           /* top 5 bits of bytes 0 and 2, first pixel */
              and     s1, rbmask, s2, lsr #3        /* same for the second pixel, into s1 */
            orr     d, d, d, lsr #5
            orr     d, d, g, lsr #5
              and     g, s2, #0xfc00
            strh    d, [DST, #-4]
              orr     s1, s1, s1, lsr #5
              orr     s1, s1, g, lsr #5
              strh    s1, [DST, #-2]
    .endm
    
    /* Load two 32bpp source pixels into WK0/WK1 and two 16bpp destination
     * pixels into WK2, post-incrementing SRC by 8 and DST by 4.
     * On exit the Z flag is set when both source pixels have a zero byte 3,
     * and ORIG_W holds the AND of the two source pixels for the all-opaque
     * test in ARGBto565PixelAlpha_2pixels_tail. SCRATCH is overwritten.
     */
    .macro ARGBto565PixelAlpha_2pixels_head
            ldrd    WK0, WK1, [SRC], #8
            ldr     WK2, [DST], #4
            orr     SCRATCH, WK0, WK1
            and     ORIG_W, WK0, WK1
            tst     SCRATCH, #0xff000000
    .endm
    
    /* Process the pixel pair loaded by ARGBto565PixelAlpha_2pixels_head.
     * Uses the Z flag and ORIG_W left by that macro:
     *   - both alphas zero: nothing is stored;
     *   - ORIG_W >= 0xff000000 (both alpha bytes are 0xff): opaque conversion;
     *   - otherwise: translucent blend.
     * Both paths store their results themselves. MASK, STRIDE_M and STRIDE_S
     * supply the rbmask, rbhalf and ghalf constants; STRIDE_D, WK3, SCRATCH
     * and ORIG_W are used as temporaries.
     * Local labels: 10 = opaque path, 20 = done.
     */
    .macro ARGBto565PixelAlpha_2pixels_tail
            beq     20f @ all transparent
            cmp     ORIG_W, #0xff000000
            bhs     10f @ all opaque
            ARGBto565PixelAlpha_2pixels_translucent  WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
            b       20f
    10:     ARGBto565PixelAlpha_2pixels_opaque  WK0, WK1, WK2, MASK, SCRATCH
    20:
    .endm
    
    /* Head step for numbytes output bytes (16bpp, so numbytes/2 pixels).
     * Pixels are handled in pairs. Every pair except the last is loaded and
     * completely processed here (head followed by tail); for the last pair
     * only the load/test part is emitted, and its processing is left to
     * ARGBto565PixelAlpha_process_tail:
     *   numbytes == 16: three complete pairs, then the head of the fourth
     *   numbytes == 8:  one complete pair, then the head of the second
     *   numbytes == 4:  the head of the only pair
     *   numbytes == 2:  one source pixel into WK0, one destination halfword
     *                   into WK2, and a test of the source alpha byte
     */
    .macro ARGBto565PixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
     .if numbytes == 16
            ARGBto565PixelAlpha_2pixels_head
            ARGBto565PixelAlpha_2pixels_tail
            ARGBto565PixelAlpha_2pixels_head
            ARGBto565PixelAlpha_2pixels_tail
     .endif
     .if numbytes >= 8
            ARGBto565PixelAlpha_2pixels_head
            ARGBto565PixelAlpha_2pixels_tail
     .endif
     .if numbytes >= 4
            ARGBto565PixelAlpha_2pixels_head
     .else // numbytes == 2
            ldr     WK0, [SRC], #4
            ldrh    WK2, [DST], #2
            tst     WK0, #0xff000000
     .endif
    .endm
    
    /* Tail step: finish the last pixel pair (or the single pixel) whose load
     * and alpha test were emitted at the end of
     * ARGBto565PixelAlpha_process_head. Relies on the flags left by the TST
     * that ends the head macro.
     * Single-pixel case local labels: 10 = opaque, 19 = store, 20 = done
     * (nothing is stored when the source alpha byte is zero).
     */
    .macro ARGBto565PixelAlpha_process_tail  cond, numbytes, firstreg
     .if numbytes >= 4
            ARGBto565PixelAlpha_2pixels_tail
     .else // numbytes == 2
            beq     20f @ all transparent
            cmp     WK0, #0xff000000
            bhs     10f @ opaque
            ARGBto565PixelAlpha_1pixel_translucent  WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
            b       19f
    10:     ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK
    19:     strh    WK2, [DST, #-2]
    20:
     .endif
    .endm
    
    /* 32bpp-to-16bpp per-pixel alpha blit, built from the
     * ARGBto565PixelAlpha_* macros above.
     * Arguments, in order: function name; source, mask and destination bits
     * per pixel; flags; prefetch distance; then the init, newline, cleanup,
     * process-head and process-tail macros.
     */
    generate_composite_function \
        BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \
        FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
        2, /* prefetch distance */ \
        ARGBto565PixelAlpha_init, \
        ARGBto565PixelAlpha_newline, \
        nop_macro, /* cleanup */ \
        ARGBto565PixelAlpha_process_head, \
        ARGBto565PixelAlpha_process_tail
    
     /******************************************************************************/
    
    /* Swap bytes 0 and 2 of one pixel in place; bytes 1 and 3 keep their
     * positions.
     *   cond  condition code appended to every instruction
     *   reg   index of the WK register holding the pixel
     *   tmp   scratch register, overwritten
     */
    .macro BGR888toRGB888_1pixel cond, reg, tmp
            uxtb16&cond  tmp, WK&reg, ror #8          /* tmp = 00 b3 00 b1 */
            uxtb16&cond  WK&reg, WK&reg, ror #16      /* pixel = 00 b0 00 b2 */
            orr&cond     WK&reg, WK&reg, tmp, lsl #8  /* pixel = b3 b0 b1 b2 */
    .endm
    
    /* Two-pixel version of BGR888toRGB888_1pixel: swap bytes 0 and 2 of the
     * pixels in WK<reg1> and WK<reg2> in place, with the two instruction
     * sequences interleaved.
     *   cond        condition code appended to every instruction
     *   reg1, reg2  indices of the WK registers holding the pixels
     *   tmp1, tmp2  scratch registers, overwritten
     */
    .macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2
            uxtb16&cond  tmp1, WK&reg1, ror #8
            uxtb16&cond  WK&reg1, WK&reg1, ror #16
            uxtb16&cond  tmp2, WK&reg2, ror #8
            uxtb16&cond  WK&reg2, WK&reg2, ror #16
            orr&cond     WK&reg1, WK&reg1, tmp1, lsl #8
            orr&cond     WK&reg2, WK&reg2, tmp2, lsl #8
    .endm
    
    /* Head step: hand the load over to pixld with the source pointer SRC.
     * pixld is defined outside this file; NOTE(review): it presumably loads
     * numbytes bytes into WK<firstreg> onwards - confirm in the header.
     * The unaligned_mask and preload arguments are not used.
     */
    .macro BGR888toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
            pixld   cond, numbytes, firstreg, SRC, unaligned_src
    .endm
    
    /* Tail step: swap bytes 0 and 2 of every loaded pixel, in place in
     * WK<firstreg> onwards. MASK and STRIDE_M serve as scratch registers.
     *   numbytes >= 8:  WK<firstreg+0> and WK<firstreg+1> as a pair
     *   otherwise:      WK<firstreg+0> alone
     *   numbytes == 16: additionally WK<firstreg+2> and WK<firstreg+3>
     * Nothing is stored here; the converted pixels stay in the WK registers.
     */
    .macro BGR888toRGB888_process_tail  cond, numbytes, firstreg
     .if numbytes >= 8
            BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M
     .else /* numbytes == 4 */
            BGR888toRGB888_1pixel cond, %(firstreg+0), MASK
     .endif
     .if numbytes == 16
            BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M
     .endif
    .endm
    
    /* 32bpp-to-32bpp blit that swaps bytes 0 and 2 of every pixel, built
     * from the BGR888toRGB888_* macros above.
     * Arguments, in order: function name; source, mask and destination bits
     * per pixel; flags; prefetch distance; then the init, newline, cleanup,
     * process-head and process-tail macros.
     */
    generate_composite_function \
        Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, \
        FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
        2, /* prefetch distance */ \
        nop_macro, /* init */ \
        nop_macro, /* newline */ \
        nop_macro, /* cleanup */ \
        BGR888toRGB888_process_head, \
        BGR888toRGB888_process_tail
    
    /******************************************************************************/
    
    /* One-time setup for Blit_RGB444_RGB888ARMSIMDAsm.
     * MASK is loaded with 0x0f0f0f0f (the "ldr =" form takes the constant
     * from a literal pool). The GE flags are set to 0101 so that each SEL in
     * the conversion macros takes bytes 0 and 2 from its first operand and
     * bytes 1 and 3 from its second.
     * NOTE(review): the conversion depends on GE staying 0101 for the whole
     * blit - confirm nothing in the generated function body alters it.
     */
    .macro RGB444toRGB888_init
            ldr     MASK, =0x0f0f0f0f
            /* Set GE[3:0] to 0101 so SEL instructions do what we want */
            msr     CPSR_s, #0x50000
    .endm
    
    /* Expand one 16bpp 4:4:4:4 pixel (in the low halfword of WK<reg>) to
     * 8:8:8:8 in place, duplicating each 4-bit field into both nibbles of
     * its output byte.
     *   reg   index of the WK register holding the pixel
     *   mask  register holding 0x0f0f0f0f
     *   tmp   scratch register, overwritten
     * Requires the GE flags to be 0101 (set by RGB444toRGB888_init).
     * The bit diagrams show the register contents after each instruction.
     */
    .macro RGB444toRGB888_1pixel reg, mask, tmp
            pkhbt   WK&reg, WK&reg, WK&reg, lsl #12      @ 0000aaaarrrrggggaaaarrrrggggbbbb
            and     WK&reg, mask, WK&reg                 @ 0000aaaa0000gggg0000rrrr0000bbbb
            orr     WK&reg, WK&reg, WK&reg, lsl #4       @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
            pkhtb   tmp, WK&reg, WK&reg, asr #8          @ aaaaaaaaggggggggggggggggrrrrrrrr
            pkhbt   WK&reg, WK&reg, WK&reg, lsl #8       @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
            sel     WK&reg, WK&reg, tmp                  @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
    .endm
    
    /* Expand two 16bpp 4:4:4:4 pixels packed in WK<in> to two 8:8:8:8
     * pixels.
     *   in          index of the WK register holding both input pixels
     *   out1        index of the WK register receiving the pixel from the
     *               low halfword (lower-case letters in the diagrams)
     *   out2        index of the WK register receiving the pixel from the
     *               high halfword (upper-case letters in the diagrams)
     *   mask        register holding 0x0f0f0f0f
     *   tmp1, tmp2  scratch registers, overwritten
     * WK<in> is read only by the first two instructions, so one of the
     * outputs may be the same register as the input.
     * Requires the GE flags to be 0101 (set by RGB444toRGB888_init).
     */
    .macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2
            and     tmp1, mask, WK&in                    @ 0000RRRR0000BBBB0000rrrr0000bbbb
            and     tmp2, mask, WK&in, lsr #4            @ 0000AAAA0000GGGG0000aaaa0000gggg
            orr     tmp1, tmp1, tmp1, lsl #4             @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb
            orr     tmp2, tmp2, tmp2, lsl #4             @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg
            pkhtb   WK&out2, tmp2, tmp1, asr #16         @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB
            pkhbt   WK&out1, tmp1, tmp2, lsl #16         @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
            pkhtb   tmp2, WK&out2, WK&out2, asr #8       @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR
            pkhtb   tmp1, WK&out1, WK&out1, asr #8       @ aaaaaaaaggggggggggggggggrrrrrrrr
            pkhbt   WK&out1, WK&out1, WK&out1, lsl #8    @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
            pkhbt   WK&out2, WK&out2, WK&out2, lsl #8    @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB
            sel     WK&out1, WK&out1, tmp1               @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
            sel     WK&out2, WK&out2, tmp2               @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB
    .endm
    
    /* Head step: hand the load over to pixld with the source pointer SRC.
     * The byte count passed is numbytes/2 because numbytes counts output
     * bytes and the source has half as many bits per pixel as the
     * destination (16 against 32).
     * pixld is defined outside this file; NOTE(review): it presumably loads
     * that many bytes into WK<firstreg> onwards - confirm in the header.
     */
    .macro RGB444toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
            pixld   cond, numbytes/2, firstreg, SRC, unaligned_src
    .endm
    
    /* Tail step: expand the loaded 16bpp pixels to 32bpp in the WK
     * registers. STRIDE_M and SCRATCH serve as scratch registers and MASK
     * supplies the 0x0f0f0f0f constant.
     *   numbytes == 16: WK<firstreg+1> -> WK<firstreg+2>, WK<firstreg+3>
     *   numbytes >= 8:  WK<firstreg+0> -> WK<firstreg+0>, WK<firstreg+1>
     *   otherwise:      the single pixel in WK<firstreg+0>, in place
     * The order matters in the 16-byte case: WK<firstreg+1> must be
     * expanded first, because expanding WK<firstreg+0> overwrites it.
     */
    .macro RGB444toRGB888_process_tail  cond, numbytes, firstreg
     .if numbytes == 16
            RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH
     .endif
     .if numbytes >= 8
            RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH
     .else /* numbytes == 4 */
            RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH
     .endif
    .endm
    
    /* 16bpp 4:4:4:4 to 32bpp 8:8:8:8 blit, built from the RGB444toRGB888_*
     * macros above.
     * Arguments, in order: function name; source, mask and destination bits
     * per pixel; flags; prefetch distance; then the init, newline, cleanup,
     * process-head and process-tail macros.
     */
    generate_composite_function \
        Blit_RGB444_RGB888ARMSIMDAsm, 16, 0, 32, \
        FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
        2, /* prefetch distance */ \
        RGB444toRGB888_init, \
        nop_macro, /* newline */ \
        nop_macro, /* cleanup */ \
        RGB444toRGB888_process_head, \
        RGB444toRGB888_process_tail