mirror of
https://github.com/librempeg/librempeg
synced 2024-11-22 00:51:37 +00:00
swscale/aarch64/yuv2rgb: add neon yuv42{0,2}p -> gbrp unscaled colorspace converters
checkasm --bench on a Raspberry Pi 5 Model B Rev 1.0:
yuv420p_gbrp_128_c: 1243.0
yuv420p_gbrp_128_neon: 453.5
yuv420p_gbrp_1920_c: 18165.5
yuv420p_gbrp_1920_neon: 6700.0
yuv422p_gbrp_128_c: 1463.5
yuv422p_gbrp_128_neon: 471.5
yuv422p_gbrp_1920_c: 21343.7
yuv422p_gbrp_1920_neon: 6743.5

Signed-off-by: Paul B Mahol <onemda@gmail.com>
This commit is contained in:
parent
8bee248b80
commit
93ae315a8d
@ -52,11 +52,41 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
|
||||
c->yuv2rgb_y_coeff); \
|
||||
} \
|
||||
|
||||
/*
 * Declares the NEON routine ff_<ifmt>_to_<ofmt>_neon for an input with
 * separate Y, U and V planes and three separate destination planes
 * (dst, dst1, dst2), and defines the static wrapper
 * <ifmt>_to_<ofmt>_neon_wrapper that forwards one slice to it.
 *
 * The wrapper passes c->srcW and srcSliceH as width and height, advances
 * each of dst[0], dst[1] and dst[2] by srcSliceY rows of its own stride,
 * and hands over c->yuv2rgb_y_offset >> 6 and c->yuv2rgb_y_coeff together
 * with a local coefficient table built from YUV_TO_RGB_TABLE.
 * dst[0] maps to dst, dst[1] to dst1 and dst[2] to dst2.
 * The wrapper returns whatever the NEON routine returns.
 */
#define DECLARE_FF_YUVX_TO_GBRP_FUNCS(ifmt, ofmt) \
|
||||
int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
|
||||
uint8_t *dst, int linesize, \
|
||||
const uint8_t *srcY, int linesizeY, \
|
||||
const uint8_t *srcU, int linesizeU, \
|
||||
const uint8_t *srcV, int linesizeV, \
|
||||
const int16_t *table, \
|
||||
int y_offset, \
|
||||
int y_coeff, \
|
||||
uint8_t *dst1, int linesize1, \
|
||||
uint8_t *dst2, int linesize2); \
|
||||
\
|
||||
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[], \
|
||||
int srcStride[], int srcSliceY, int srcSliceH, \
|
||||
uint8_t *dst[], int dstStride[]) { \
|
||||
const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
|
||||
\
|
||||
return ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH, \
|
||||
dst[0] + srcSliceY * dstStride[0], dstStride[0], \
|
||||
src[0], srcStride[0], \
|
||||
src[1], srcStride[1], \
|
||||
src[2], srcStride[2], \
|
||||
yuv2rgb_table, \
|
||||
c->yuv2rgb_y_offset >> 6, \
|
||||
c->yuv2rgb_y_coeff, \
|
||||
dst[1] + srcSliceY * dstStride[1], dstStride[1], \
|
||||
dst[2] + srcSliceY * dstStride[2], dstStride[2]); \
|
||||
} \
|
||||
|
||||
/*
 * Instantiates the declaration + wrapper pair for every output format
 * handled for one planar YUV input format: argb, rgba, abgr and bgra via
 * DECLARE_FF_YUVX_TO_RGBX_FUNCS, and gbrp via DECLARE_FF_YUVX_TO_GBRP_FUNCS.
 */
#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr) \
|
||||
DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra) \
|
||||
DECLARE_FF_YUVX_TO_GBRP_FUNCS(yuvx, gbrp) \
|
||||
|
||||
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
|
||||
DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
|
||||
@ -83,11 +113,38 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],
|
||||
c->yuv2rgb_y_coeff); \
|
||||
} \
|
||||
|
||||
/*
 * Declares the NEON routine ff_<ifmt>_to_<ofmt>_neon for an input with a
 * Y plane and a single chroma plane (srcC) and three separate destination
 * planes (dst, dst1, dst2), and defines the static wrapper
 * <ifmt>_to_<ofmt>_neon_wrapper that forwards one slice to it.
 *
 * The wrapper passes c->srcW and srcSliceH as width and height, src[0] as
 * srcY and src[1] as srcC, advances each of dst[0], dst[1] and dst[2] by
 * srcSliceY rows of its own stride, and hands over
 * c->yuv2rgb_y_offset >> 6 and c->yuv2rgb_y_coeff together with a local
 * coefficient table built from YUV_TO_RGB_TABLE.
 * The wrapper returns whatever the NEON routine returns.
 */
#define DECLARE_FF_NVX_TO_GBRP_FUNCS(ifmt, ofmt) \
|
||||
int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
|
||||
uint8_t *dst, int linesize, \
|
||||
const uint8_t *srcY, int linesizeY, \
|
||||
const uint8_t *srcC, int linesizeC, \
|
||||
const int16_t *table, \
|
||||
int y_offset, \
|
||||
int y_coeff, \
|
||||
uint8_t *dst1, int linesize1, \
|
||||
uint8_t *dst2, int linesize2); \
|
||||
\
|
||||
static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[], \
|
||||
int srcStride[], int srcSliceY, int srcSliceH, \
|
||||
uint8_t *dst[], int dstStride[]) { \
|
||||
const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
|
||||
\
|
||||
return ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH, \
|
||||
dst[0] + srcSliceY * dstStride[0], dstStride[0], \
|
||||
src[0], srcStride[0], src[1], srcStride[1], \
|
||||
yuv2rgb_table, \
|
||||
c->yuv2rgb_y_offset >> 6, \
|
||||
c->yuv2rgb_y_coeff, \
|
||||
dst[1] + srcSliceY * dstStride[1], dstStride[1], \
|
||||
dst[2] + srcSliceY * dstStride[2], dstStride[2]); \
|
||||
} \
|
||||
|
||||
/*
 * Instantiates the declaration + wrapper pair for every output format
 * handled for one NV-style input format: argb, rgba, abgr and bgra via
 * DECLARE_FF_NVX_TO_RGBX_FUNCS, and gbrp via DECLARE_FF_NVX_TO_GBRP_FUNCS.
 */
#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr) \
|
||||
DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra) \
|
||||
DECLARE_FF_NVX_TO_GBRP_FUNCS(nvx, gbrp) \
|
||||
|
||||
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
|
||||
DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
|
||||
@ -110,6 +167,7 @@ DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
|
||||
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd); \
|
||||
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd); \
|
||||
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd); \
|
||||
SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, gbrp, GBRP, accurate_rnd); \
|
||||
} while (0)
|
||||
|
||||
static void get_unscaled_swscale_neon(SwsContext *c) {
|
||||
|
@ -30,23 +30,43 @@
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro load_args_nv12
|
||||
// Load the two extra destination-plane arguments from the stack and convert
// their linesizes into per-row padding.
// Macro arguments are the stack byte offsets of dst1, linesize1, dst2 and
// linesize2. When __APPLE__ is defined every offset is reduced by 8.
// NOTE(review): presumably because that ABI packs the preceding int stack
// arguments more tightly - confirm against the platform calling convention.
// Results: x10 = dst1, w12 = padding1, x15 = dst2, w16 = padding2.
.macro load_dst1_dst2 dst1 linesize1 dst2 linesize2
|
||||
#if defined(__APPLE__)
|
||||
#define DST_OFFSET 8
|
||||
#else
|
||||
#define DST_OFFSET 0
|
||||
#endif
|
||||
ldr x10, [sp, #\dst1 - DST_OFFSET]
|
||||
ldr w12, [sp, #\linesize1 - DST_OFFSET]
|
||||
ldr x15, [sp, #\dst2 - DST_OFFSET]
|
||||
ldr w16, [sp, #\linesize2 - DST_OFFSET]
|
||||
#undef DST_OFFSET
|
||||
sub w12, w12, w0 // w12 = linesize1 - width (padding1)
|
||||
sub w16, w16, w0 // w16 = linesize2 - width (padding2)
|
||||
.endm
|
||||
|
||||
// Load the stack-passed arguments for the NV12 input variant and turn the
// linesizes into per-row padding values.
// \ofmt selects the destination layout: for gbrp the two extra plane
// pointers are loaded via load_dst1_dst2 and the dst padding is
// linesize - width; for every other format it is linesize - width * 4.
.macro load_args_nv12 ofmt
|
||||
ldr x8, [sp] // table
|
||||
load_yoff_ycoeff 8, 16 // y_offset, y_coeff
|
||||
ld1 {v1.1d}, [x8]
|
||||
dup v0.8h, w10
|
||||
dup v3.8h, w9
|
||||
.ifc \ofmt,gbrp
|
||||
load_dst1_dst2 24, 32, 40, 48
|
||||
sub w3, w3, w0 // w3 = linesize - width (padding)
|
||||
.else
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
.endif
|
||||
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
|
||||
sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
|
||||
neg w11, w0 // w11 = -width
|
||||
.endm
|
||||
|
||||
.macro load_args_nv21
|
||||
load_args_nv12
|
||||
.macro load_args_nv21 ofmt
|
||||
load_args_nv12 \ofmt
|
||||
.endm
|
||||
|
||||
.macro load_args_yuv420p
|
||||
.macro load_args_yuv420p ofmt
|
||||
ldr x13, [sp] // srcV
|
||||
ldr w14, [sp, #8] // linesizeV
|
||||
ldr x8, [sp, #16] // table
|
||||
@ -54,7 +74,12 @@
|
||||
ld1 {v1.1d}, [x8]
|
||||
dup v0.8h, w10
|
||||
dup v3.8h, w9
|
||||
.ifc \ofmt,gbrp
|
||||
load_dst1_dst2 40, 48, 56, 64
|
||||
sub w3, w3, w0 // w3 = linesize - width (padding)
|
||||
.else
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
.endif
|
||||
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
|
||||
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
|
||||
sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
|
||||
@ -62,7 +87,7 @@
|
||||
neg w11, w11
|
||||
.endm
|
||||
|
||||
.macro load_args_yuv422p
|
||||
.macro load_args_yuv422p ofmt
|
||||
ldr x13, [sp] // srcV
|
||||
ldr w14, [sp, #8] // linesizeV
|
||||
ldr x8, [sp, #16] // table
|
||||
@ -70,7 +95,12 @@
|
||||
ld1 {v1.1d}, [x8]
|
||||
dup v0.8h, w10
|
||||
dup v3.8h, w9
|
||||
.ifc \ofmt,gbrp
|
||||
load_dst1_dst2 40, 48, 56, 64
|
||||
sub w3, w3, w0 // w3 = linesize - width (padding)
|
||||
.else
|
||||
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
|
||||
.endif
|
||||
sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
|
||||
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
|
||||
sub w14, w14, w0, lsr #1 // w14 = linesizeV - width / 2 (paddingV)
|
||||
@ -100,9 +130,9 @@
|
||||
.endm
|
||||
|
||||
// End-of-row chroma step: advance srcC (x6) by paddingC (w7) when bit 0 of
// the height counter w1 is set, otherwise by w11 (-width).
// NOTE(review): the -width step presumably rewinds srcC so the same chroma
// row is reused for the next luma row - confirm against the row loop.
.macro increment_nv12
|
||||
ands w15, w1, #1
|
||||
csel w16, w7, w11, ne // incC = (h & 1) ? paddingC : -width
|
||||
add x6, x6, w16, sxtw // srcC += incC
|
||||
ands w17, w1, #1
|
||||
csel w17, w7, w11, ne // incC = (h & 1) ? paddingC : -width
|
||||
add x6, x6, w17, sxtw // srcC += incC
|
||||
.endm
|
||||
|
||||
.macro increment_nv21
|
||||
@ -110,10 +140,10 @@
|
||||
.endm
|
||||
|
||||
// End-of-row chroma step: advance srcU (x6) and srcV (x13) by their row
// padding (w7 / w14) when bit 0 of the height counter w1 is set, otherwise
// by w11 (given as -width/2 in the per-line comments).
// NOTE(review): the negative step presumably rewinds both pointers so the
// same chroma row is reused for the next luma row - confirm against the loop.
.macro increment_yuv420p
|
||||
ands w15, w1, #1
|
||||
csel w16, w7, w11, ne // incU = (h & 1) ? paddingU : -width/2
|
||||
ands w17, w1, #1
|
||||
csel w17, w7, w11, ne // incU = (h & 1) ? paddingU : -width/2
|
||||
add x6, x6, w17, sxtw // srcU += incU
|
||||
csel w17, w14, w11, ne // incV = (h & 1) ? paddingV : -width/2
|
||||
add x6, x6, w16, sxtw // srcU += incU
|
||||
add x13, x13, w17, sxtw // srcV += incV
|
||||
.endm
|
||||
|
||||
@ -122,7 +152,7 @@
|
||||
add x13, x13, w14, sxtw // srcV += incV
|
||||
.endm
|
||||
|
||||
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
|
||||
.macro compute_rgb r1 g1 b1 r2 g2 b2
|
||||
add v20.8h, v26.8h, v20.8h // Y1 + R1
|
||||
add v21.8h, v27.8h, v21.8h // Y2 + R2
|
||||
add v22.8h, v26.8h, v22.8h // Y1 + G1
|
||||
@ -135,13 +165,18 @@
|
||||
sqrshrun \g2, v23.8h, #1 // clip_u8((Y2 + G1) >> 1)
|
||||
sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1)
|
||||
sqrshrun \b2, v25.8h, #1 // clip_u8((Y2 + B1) >> 1)
|
||||
.endm
|
||||
|
||||
// Compute the R, G and B outputs for both pixel halves via compute_rgb,
// then fill both alpha registers (\a1 and \a2) with the constant 255.
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
|
||||
compute_rgb \r1, \g1, \b1, \r2, \g2, \b2
|
||||
movi \a1, #255 // alpha for the first half
|
||||
movi \a2, #255 // alpha for the second half
|
||||
.endm
|
||||
|
||||
.macro declare_func ifmt ofmt
|
||||
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
|
||||
load_args_\ifmt
|
||||
load_args_\ifmt \ofmt
|
||||
|
||||
mov w9, w1
|
||||
1:
|
||||
mov w8, w0 // w8 = width
|
||||
@ -185,11 +220,22 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
|
||||
compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
|
||||
.endif
|
||||
|
||||
.ifc \ofmt,gbrp
|
||||
compute_rgb v18.8b,v4.8b,v6.8b, v19.8b,v5.8b,v7.8b
|
||||
st1 { v4.8b, v5.8b }, [x2], #16
|
||||
st1 { v6.8b, v7.8b }, [x10], #16
|
||||
st1 { v18.8b, v19.8b }, [x15], #16
|
||||
.else
|
||||
st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
|
||||
st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
|
||||
.endif
|
||||
subs w8, w8, #16 // width -= 16
|
||||
b.gt 2b
|
||||
add x2, x2, w3, sxtw // dst += padding
|
||||
.ifc \ofmt,gbrp
|
||||
add x10, x10, w12, sxtw // dst1 += padding1
|
||||
add x15, x15, w16, sxtw // dst2 += padding2
|
||||
.endif
|
||||
add x4, x4, w5, sxtw // srcY += paddingY
|
||||
increment_\ifmt
|
||||
subs w1, w1, #1 // height -= 1
|
||||
@ -204,6 +250,7 @@ endfunc
|
||||
declare_func \ifmt, rgba
|
||||
declare_func \ifmt, abgr
|
||||
declare_func \ifmt, bgra
|
||||
declare_func \ifmt, gbrp
|
||||
.endm
|
||||
|
||||
declare_rgb_funcs nv12
|
||||
|
Loading…
Reference in New Issue
Block a user