ARM: NEON assembly optimization for SDL_FillRect

This commit is contained in:
Ben Avison 2019-10-24 21:17:52 -04:00
parent 1187b013a5
commit 72f8044a42
2 changed files with 149 additions and 0 deletions

View file

@ -281,6 +281,27 @@ SDL_FillRects(SDL_Surface * dst, const SDL_Rect * rects, int count,
return SDL_SetError("SDL_FillRects() passed NULL rects"); return SDL_SetError("SDL_FillRects() passed NULL rects");
} }
#if SDL_ARM_NEON_BLITTERS
/* Fast path: when the CPU reports NEON support, hand the fill to the
 * hand-written NEON assembly routines. Only 1-, 2- and 4-byte pixels have a
 * routine, so 3-byte pixel formats are excluded and fall through to the
 * code below. */
if (SDL_HasNEON() && dst->format->BytesPerPixel != 3) {
/* Prototypes of the assembly entry points, one per pixel size. */
void FillRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
void FillRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
/* The assembly routines take the stride in pixels, so the surface pitch is
 * shifted right by log2(bytes per pixel) in each case. */
switch (dst->format->BytesPerPixel) {
case 1:
FillRect8ARMNEONAsm(rect->w, rect->h, (uint8_t *) pixels, dst->pitch >> 0, color);
break;
case 2:
FillRect16ARMNEONAsm(rect->w, rect->h, (uint16_t *) pixels, dst->pitch >> 1, color);
break;
case 4:
FillRect32ARMNEONAsm(rect->w, rect->h, (uint32_t *) pixels, dst->pitch >> 2, color);
break;
}
/* NOTE(review): presumably pairs with a lock taken earlier in the enclosing
 * function (not visible here) -- confirm. */
SDL_UnlockSurface(dst);
/* Fill handled; skip the remaining fill paths and report success (0). */
return(0);
}
#endif
#if SDL_ARM_SIMD_BLITTERS #if SDL_ARM_SIMD_BLITTERS
if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) { if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src); void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);

View file

@ -95,6 +95,134 @@
/******************************************************************************/ /******************************************************************************/
/* We can actually do significantly better than the Pixman macros, at least for
* the case of fills, by using a carefully scheduled inner loop. Cortex-A53
* shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
*/
.macro generate_fillrect_function name, bpp, log2Bpp
/*
 * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 * (dst and src are 16-bit or 32-bit types for the wider instantiations.)
 *
 * Fills a rectangle of w x h pixels with a constant pixel value.
 * Macro arguments:
 * name    = symbol of the generated function
 * bpp     = bits per pixel (8, 16 or 32)
 * log2Bpp = log2 of the pixel size in bytes; converts pixels <-> bytes
 *
 * On entry:
 * a1 = width, pixels
 * a2 = height, rows
 * a3 = pointer to top-left destination pixel
 * a4 = stride, pixels
 * [sp] = pixel value to fill with
 * Within the function:
 * a4 = stride - width, pixels (distance from end of one row to start of next)
 * v1 = width remaining
 * v2 = vst offset
 * v3 = alternate pointer
 * ip = data ARM register
 * q0-q1 = pixel value replicated into every lane
 *
 * Both row loops test the row count at the bottom, so one row is always
 * written: callers must pass h >= 1.
 * NOTE(review): the leading-pixel code only corrects misalignment in whole
 * pixels, so dst is assumed to be aligned to the pixel size -- confirm
 * against callers.
 */
pixman_asm_function name
    vld1.\bpp {d0[],d1[]}, [sp]      /* q0 = fill value in every lane */
    sub     a4, a1                   /* a4 = stride - width */
    vld1.\bpp {d2[],d3[]}, [sp]      /* q1 = same, so q0-q1 is one 32-byte store unit */
    cmp     a1, #(15+64) >> \log2Bpp /* enough for 15 alignment bytes plus one 64-byte inner-loop pass? */
    push    {v1-v3,lr}               /* after the loads above, which read the argument at the entry sp */
    vmov    ip, s0                   /* ip = fill value for the scalar byte/halfword stores */
    blo     51f                      /* flags are still those of the cmp: narrow rows use the short-row code */

    /* Long-row case */
    mov     v2, #64                  /* post-increment applied to each inner-loop pointer */
1:  mov     v1, a1
    ands    v3, a3, #15
    beq     2f                       /* row start already 16-byte aligned */
    /* Leading pixels */
    rsb     v3, v3, #16  /* number of leading bytes until 16-byte aligned */
    sub     v1, v1, v3, lsr #\log2Bpp
    rbit    v3, v3       /* bits 0-3 of the byte count move to bits 31-28, ready to be shifted into the flags */
.if \bpp <= 16
.if \bpp == 8
    tst     a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
    strneb  ip, [a3], #1
    tst     v3, #1<<30   /* bit 1 of the leading byte count */
.else
    tst     a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
.endif
    strneh  ip, [a3], #2
.endif
    movs    v3, v3, lsl #3           /* C = 4-byte bit, N = 8-byte bit of the leading byte count */
    vstmcs  a3!, {s0}
    vstmmi  a3!, {d0}
2:  sub     v1, v1, #64 >> \log2Bpp  /* simplifies inner loop termination */
    add     v3, a3, #32              /* second pointer covers the upper 32 bytes of each 64-byte pass */
    /* Inner loop */
3:  vst1.\bpp {q0-q1}, [a3 :128], v2
    subs    v1, v1, #64 >> \log2Bpp
    vst1.\bpp {q0-q1}, [v3 :128], v2
    bhs     3b
    /* Trailing pixels */
    /* v1 is now (pixels left) minus one 64-byte pass, so its low bits give the
     * 0..63 bytes still to write; shift them into C and N two at a time. */
4:  movs    v1, v1, lsl #27 + \log2Bpp /* C = 32-byte bit, N = 16-byte bit */
    bcc     5f
    vst1.\bpp {q0-q1}, [a3 :128]!
5:  bpl     6f
    vst1.\bpp {q0}, [a3 :128]!
6:  movs    v1, v1, lsl #2           /* C = 8-byte bit, N = 4-byte bit */
    vstmcs  a3!, {d0}
    vstmmi  a3!, {s0}
.if \bpp <= 16
    movs    v1, v1, lsl #2           /* C = 2-byte bit, N = 1-byte bit */
    strcsh  ip, [a3], #2
.if \bpp == 8
    strmib  ip, [a3], #1
.endif
.endif
    subs    a2, a2, #1
    add     a3, a3, a4, lsl #\log2Bpp /* skip to the start of the next row (flags untouched) */
    bhi     1b
    pop     {v1-v3,pc}

    /* Short-row case */
51: movs    v1, a1
.if \bpp == 8
    /* Store single bytes until the pointer is word aligned. */
    tst     a3, #3
    beq     53f
52: subs    v1, v1, #1
    blo     57f                      /* row exhausted while aligning: a3 is already at the end of the row */
    strb    ip, [a3], #1
    tst     a3, #3
    bne     52b
.elseif \bpp == 16
    /* If the row is non-empty and starts on an odd halfword, store one pixel
     * to reach word alignment. */
    tstne   a3, #2
    subne   v1, v1, #1
    strneh  ip, [a3], #2
.endif
53: cmp     v1, #32 >> \log2Bpp
    bcc     54f
    vst1.\bpp {q0-q1}, [a3]!         /* first 32 bytes; the bit tests below cover the rest */
    sub     v1, v1, #32 >> \log2Bpp
    /* Trailing pixels */
54: movs    v1, v1, lsl #27 + \log2Bpp /* C = 32-byte bit, N = 16-byte bit */
    bcc     55f
    vst1.\bpp {q0-q1}, [a3]!
55: bpl     56f
    vst1.\bpp {q0}, [a3]!
56: movs    v1, v1, lsl #2           /* C = 8-byte bit, N = 4-byte bit */
    vstmcs  a3!, {d0}
    vstmmi  a3!, {s0}
.if \bpp <= 16
    movs    v1, v1, lsl #2           /* C = 2-byte bit, N = 1-byte bit */
    strcsh  ip, [a3], #2
.if \bpp == 8
    strmib  ip, [a3], #1
.endif
.endif
    /* End of row. The early exit from the byte-alignment loop also lands here
     * so that the remaining rows are still filled; it used to branch straight
     * to the return, which dropped every row after the first. */
57: subs    a2, a2, #1
    add     a3, a3, a4, lsl #\log2Bpp /* skip to the start of the next row (flags untouched) */
    bhi     51b
    pop     {v1-v3,pc}
.endfunc
.endm
/* Instantiate the fill routine once per supported pixel size.
 * Arguments: function name, bits per pixel, log2 of bytes per pixel. */
generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
generate_fillrect_function FillRect8ARMNEONAsm, 8, 0
/******************************************************************************/
.macro RGBtoRGBPixelAlpha_process_pixblock_head .macro RGBtoRGBPixelAlpha_process_pixblock_head
vmvn d30, d3 /* get inverted source alpha */ vmvn d30, d3 /* get inverted source alpha */
vmov d31, d7 /* dest alpha is always unchanged */ vmov d31, d7 /* dest alpha is always unchanged */