mirror of
https://github.com/Ryujinx/SDL.git
synced 2025-01-25 21:11:05 +00:00
ARM: SIMD assembly optimization for function BlitARGBto565PixelAlpha
This commit is contained in:
parent
57723b83e8
commit
0eaa52cedf
|
@ -390,6 +390,21 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
|
||||||
#endif /* __MMX__ */
|
#endif /* __MMX__ */
|
||||||
|
|
||||||
#if SDL_ARM_SIMD_BLITTERS
|
#if SDL_ARM_SIMD_BLITTERS
|
||||||
|
void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
|
||||||
|
|
||||||
|
static void
|
||||||
|
BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)
|
||||||
|
{
|
||||||
|
int32_t width = info->dst_w;
|
||||||
|
int32_t height = info->dst_h;
|
||||||
|
uint16_t *dstp = (uint16_t *)info->dst;
|
||||||
|
int32_t dststride = width + (info->dst_skip >> 1);
|
||||||
|
uint32_t *srcp = (uint32_t *)info->src;
|
||||||
|
int32_t srcstride = width + (info->src_skip >> 2);
|
||||||
|
|
||||||
|
BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
|
||||||
|
}
|
||||||
|
|
||||||
void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
|
void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -1301,6 +1316,15 @@ SDL_CalculateBlitA(SDL_Surface * surface)
|
||||||
}
|
}
|
||||||
|
|
||||||
case 2:
|
case 2:
|
||||||
|
#if SDL_ARM_SIMD_BLITTERS
|
||||||
|
if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
|
||||||
|
&& sf->Gmask == 0xff00 && df->Gmask == 0x7e0
|
||||||
|
&& ((sf->Rmask == 0xff && df->Rmask == 0x1f)
|
||||||
|
|| (sf->Bmask == 0xff && df->Bmask == 0x1f))
|
||||||
|
&& SDL_HasARMSIMD())
|
||||||
|
return BlitARGBto565PixelAlphaARMSIMD;
|
||||||
|
else
|
||||||
|
#endif
|
||||||
if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
|
if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
|
||||||
&& sf->Gmask == 0xff00
|
&& sf->Gmask == 0xff00
|
||||||
&& ((sf->Rmask == 0xff && df->Rmask == 0x1f)
|
&& ((sf->Rmask == 0xff && df->Rmask == 0x1f)
|
||||||
|
|
|
@ -166,3 +166,200 @@ generate_composite_function \
|
||||||
RGBtoRGBPixelAlpha_process_tail
|
RGBtoRGBPixelAlpha_process_tail
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
|
||||||
|
.macro ARGBto565PixelAlpha_init
|
||||||
|
line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
|
||||||
|
mov MASK, #0x001f
|
||||||
|
mov STRIDE_M, #0x0010
|
||||||
|
orr MASK, MASK, MASK, lsl #16
|
||||||
|
orr STRIDE_M, STRIDE_M, STRIDE_M, lsl #16
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro ARGBto565PixelAlpha_newline
|
||||||
|
mov STRIDE_S, #0x0200
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* On entry:
|
||||||
|
* s1 holds 1 32bpp source pixel
|
||||||
|
* d holds 1 16bpp destination pixel
|
||||||
|
* rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
|
||||||
|
* other registers are temporaries
|
||||||
|
* On exit:
|
||||||
|
* Constant registers preserved
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro ARGBto565PixelAlpha_1pixel_translucent s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
|
||||||
|
mov alpha, s, lsr #27
|
||||||
|
and misc, s, #0xfc00
|
||||||
|
and g, d, #0x07e0
|
||||||
|
pkhbt rb, d, d, lsl #5
|
||||||
|
rsb misc, g, misc, lsr #5
|
||||||
|
and s, rbmask, s, lsr #3
|
||||||
|
and rb, rbmask, rb
|
||||||
|
sub s, s, rb
|
||||||
|
smlabb misc, misc, alpha, ghalf
|
||||||
|
mla s, s, alpha, rbhalf
|
||||||
|
add misc, misc, misc, lsl #5
|
||||||
|
add g, g, misc, asr #10
|
||||||
|
add s, s, s, lsl #5
|
||||||
|
and g, g, #0x07e0
|
||||||
|
add rb, rb, s, asr #10
|
||||||
|
and rb, rb, rbmask
|
||||||
|
pkhbt rb, rb, rb, lsl #11
|
||||||
|
orr d, rb, g
|
||||||
|
orr d, d, rb, lsr #16
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* On entry:
|
||||||
|
* s1 holds 1 32bpp source pixel
|
||||||
|
* d holds 1 16bpp destination pixel
|
||||||
|
* rbmask holds 0x001f001f
|
||||||
|
* On exit:
|
||||||
|
* Constant registers preserved
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro ARGBto565PixelAlpha_1pixel_opaque s, d, rbmask
|
||||||
|
and d, rbmask, s, lsr #3
|
||||||
|
and s, s, #0xfc00
|
||||||
|
orr d, d, d, lsr #5
|
||||||
|
orr d, d, s, lsr #5
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* On entry:
|
||||||
|
* s1, s2 hold 2 32bpp source pixels
|
||||||
|
* d holds 2 16bpp destination pixels
|
||||||
|
* rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
|
||||||
|
* other registers are temporaries
|
||||||
|
* On exit:
|
||||||
|
* Constant registers preserved
|
||||||
|
* Blended results have been written through destination pointer
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro ARGBto565PixelAlpha_2pixels_translucent s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
|
||||||
|
mov alpha, s1, lsr #27
|
||||||
|
and misc, s1, #0xfc00
|
||||||
|
and g, d, #0x07e0
|
||||||
|
pkhbt rb, d, d, lsl #5
|
||||||
|
rsb misc, g, misc, lsr #5
|
||||||
|
and s1, rbmask, s1, lsr #3
|
||||||
|
and rb, rbmask, rb
|
||||||
|
sub s1, s1, rb
|
||||||
|
smlabb misc, misc, alpha, ghalf
|
||||||
|
mla s1, s1, alpha, rbhalf
|
||||||
|
uxth d, d, ror #16
|
||||||
|
add misc, misc, misc, lsl #5
|
||||||
|
mov alpha, s2, lsr #27
|
||||||
|
add g, g, misc, asr #10
|
||||||
|
add s1, s1, s1, lsl #5
|
||||||
|
and g, g, #0x07e0
|
||||||
|
add rb, rb, s1, asr #10
|
||||||
|
and rb, rb, rbmask
|
||||||
|
and misc, s2, #0xfc00
|
||||||
|
pkhbt rb, rb, rb, lsl #11
|
||||||
|
and s1, d, #0x07e0
|
||||||
|
pkhbt d, d, d, lsl #5
|
||||||
|
rsb misc, s1, misc, lsr #5
|
||||||
|
and s2, rbmask, s2, lsr #3
|
||||||
|
and d, rbmask, d
|
||||||
|
sub s2, s2, d
|
||||||
|
smlabb misc, misc, alpha, ghalf
|
||||||
|
mla s2, s2, alpha, rbhalf
|
||||||
|
orr alpha, rb, g
|
||||||
|
add misc, misc, misc, lsl #5
|
||||||
|
orr alpha, alpha, rb, lsr #16
|
||||||
|
add s1, s1, misc, asr #10
|
||||||
|
add s2, s2, s2, lsl #5
|
||||||
|
and s1, s1, #0x07e0
|
||||||
|
add d, d, s2, asr #10
|
||||||
|
and d, d, rbmask
|
||||||
|
strh alpha, [DST, #-4]
|
||||||
|
pkhbt d, d, d, lsl #11
|
||||||
|
orr alpha, d, s1
|
||||||
|
orr alpha, alpha, d, lsr #16
|
||||||
|
strh alpha, [DST, #-2]
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* On entry:
|
||||||
|
* s1, s2 hold 2 32bpp source pixels
|
||||||
|
* rbmask holds 0x001f001f
|
||||||
|
* other registers are temporaries
|
||||||
|
* On exit:
|
||||||
|
* Constant registers preserved
|
||||||
|
* Blended results have been written through destination pointer
|
||||||
|
*/
|
||||||
|
|
||||||
|
.macro ARGBto565PixelAlpha_2pixels_opaque s1, s2, d, rbmask, g
|
||||||
|
and g, s1, #0xfc00
|
||||||
|
and d, rbmask, s1, lsr #3
|
||||||
|
and s1, rbmask, s2, lsr #3
|
||||||
|
orr d, d, d, lsr #5
|
||||||
|
orr d, d, g, lsr #5
|
||||||
|
and g, s2, #0xfc00
|
||||||
|
strh d, [DST, #-4]
|
||||||
|
orr s1, s1, s1, lsr #5
|
||||||
|
orr s1, s1, g, lsr #5
|
||||||
|
strh s1, [DST, #-2]
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro ARGBto565PixelAlpha_2pixels_head
|
||||||
|
ldrd WK0, WK1, [SRC], #8
|
||||||
|
ldr WK2, [DST], #4
|
||||||
|
orr SCRATCH, WK0, WK1
|
||||||
|
and ORIG_W, WK0, WK1
|
||||||
|
tst SCRATCH, #0xff000000
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro ARGBto565PixelAlpha_2pixels_tail
|
||||||
|
beq 20f @ all transparent
|
||||||
|
cmp ORIG_W, #0xff000000
|
||||||
|
bhs 10f @ all opaque
|
||||||
|
ARGBto565PixelAlpha_2pixels_translucent WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
|
||||||
|
b 20f
|
||||||
|
10: ARGBto565PixelAlpha_2pixels_opaque WK0, WK1, WK2, MASK, SCRATCH
|
||||||
|
20:
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro ARGBto565PixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
|
||||||
|
.if numbytes == 16
|
||||||
|
ARGBto565PixelAlpha_2pixels_head
|
||||||
|
ARGBto565PixelAlpha_2pixels_tail
|
||||||
|
ARGBto565PixelAlpha_2pixels_head
|
||||||
|
ARGBto565PixelAlpha_2pixels_tail
|
||||||
|
.endif
|
||||||
|
.if numbytes >= 8
|
||||||
|
ARGBto565PixelAlpha_2pixels_head
|
||||||
|
ARGBto565PixelAlpha_2pixels_tail
|
||||||
|
.endif
|
||||||
|
.if numbytes >= 4
|
||||||
|
ARGBto565PixelAlpha_2pixels_head
|
||||||
|
.else // numbytes == 2
|
||||||
|
ldr WK0, [SRC], #4
|
||||||
|
ldrh WK2, [DST], #2
|
||||||
|
tst WK0, #0xff000000
|
||||||
|
.endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro ARGBto565PixelAlpha_process_tail cond, numbytes, firstreg
|
||||||
|
.if numbytes >= 4
|
||||||
|
ARGBto565PixelAlpha_2pixels_tail
|
||||||
|
.else // numbytes == 2
|
||||||
|
beq 20f @ all transparent
|
||||||
|
cmp WK0, #0xff000000
|
||||||
|
bhs 10f @ opaque
|
||||||
|
ARGBto565PixelAlpha_1pixel_translucent WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
|
||||||
|
b 19f
|
||||||
|
10: ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK
|
||||||
|
19: strh WK2, [DST, #-2]
|
||||||
|
20:
|
||||||
|
.endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
generate_composite_function \
|
||||||
|
BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \
|
||||||
|
FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
|
||||||
|
2, /* prefetch distance */ \
|
||||||
|
ARGBto565PixelAlpha_init, \
|
||||||
|
ARGBto565PixelAlpha_newline, \
|
||||||
|
nop_macro, /* cleanup */ \
|
||||||
|
ARGBto565PixelAlpha_process_head, \
|
||||||
|
ARGBto565PixelAlpha_process_tail
|
||||||
|
|
Loading…
Reference in a new issue