mirror of
https://github.com/librempeg/librempeg
synced 2024-11-23 03:28:27 +00:00
lavc/h264dsp: reuse the RISC-V V IDCT DC add functions
This reuses the DC bypass functions from within the multi-block IDCT functions, so that the DC bypass path leverages vector code as well. As an added bonus, the caller functions can now rely on the callee functions to preserve their parameters, thus cutting down on stack spills.

Signed-off-by: Paul B Mahol <onemda@gmail.com>
This commit is contained in:
parent
9403e7888b
commit
54532de3e8
@ -536,7 +536,7 @@ endconst
|
||||
.macro idct4_adds type, depth
|
||||
func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
|
||||
csrwi vxrm, 0
|
||||
addi sp, sp, -96
|
||||
addi sp, sp, -64
|
||||
lla t0, ff_h264_scan8
|
||||
sd s0, (sp)
|
||||
li t1, 32 * (\depth / 8)
|
||||
@ -547,14 +547,6 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
|
||||
sd s3, 32(sp)
|
||||
sd s4, 40(sp)
|
||||
sd s5, 48(sp)
|
||||
sd s6, 56(sp)
|
||||
sd s7, 64(sp)
|
||||
.if \depth > 8
|
||||
sd s8, 72(sp)
|
||||
sd s9, 80(sp)
|
||||
mv s8, a5
|
||||
mv s9, a6
|
||||
.endif
|
||||
vsetivli zero, 16, e8, m1, ta, ma
|
||||
vle8.v v8, (t0)
|
||||
.if \depth == 8
|
||||
@ -583,8 +575,8 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
|
||||
li s1, 16
|
||||
mv s4, a0
|
||||
mv s5, a1
|
||||
mv s6, a2
|
||||
mv s7, a3
|
||||
mv a1, a2
|
||||
mv a2, a3
|
||||
1:
|
||||
andi t0, s2, 1
|
||||
addi s1, s1, -1
|
||||
@ -594,12 +586,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
|
||||
.endif
|
||||
lw t2, (s5) # block_offset[i]
|
||||
andi t1, s3, 1
|
||||
mv a1, s6
|
||||
mv a2, s7
|
||||
add a0, s4, t2
|
||||
.if \depth > 8
|
||||
mv a5, s8
|
||||
.endif
|
||||
.ifc \type, 16
|
||||
bnez t1, 2f # if (nnz == 1 && block[i * 16])
|
||||
.else
|
||||
@ -611,23 +598,13 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
|
||||
.ifnc \type, 16
|
||||
beqz t1, 3f # if (block[i * 16])
|
||||
.endif
|
||||
.if \depth == 8
|
||||
call ff_h264_idct_dc_add_\depth\()_c
|
||||
.else
|
||||
jalr s9
|
||||
.endif
|
||||
jal ff_h264_idct4_dc_add_\depth\()_rvv
|
||||
3:
|
||||
srli s3, s3, 1
|
||||
addi s5, s5, 4
|
||||
addi s6, s6, 16 * 2 * (\depth / 8)
|
||||
addi a1, a1, 16 * 2 * (\depth / 8)
|
||||
bnez s1, 1b
|
||||
|
||||
.if \depth > 8
|
||||
ld s9, 80(sp)
|
||||
ld s8, 72(sp)
|
||||
.endif
|
||||
ld s7, 64(sp)
|
||||
ld s6, 56(sp)
|
||||
ld s5, 48(sp)
|
||||
ld s4, 40(sp)
|
||||
ld s3, 32(sp)
|
||||
@ -635,7 +612,7 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
|
||||
ld s1, 16(sp)
|
||||
ld ra, 8(sp)
|
||||
ld s0, 0(sp)
|
||||
addi sp, sp, 96
|
||||
addi sp, sp, 64
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
@ -646,7 +623,7 @@ idct4_adds 16intra, \depth
|
||||
|
||||
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||
csrwi vxrm, 0
|
||||
addi sp, sp, -96
|
||||
addi sp, sp, -64
|
||||
lla t0, ff_h264_scan8
|
||||
sd s0, (sp)
|
||||
li t1, 4 * 32 * (\depth / 8)
|
||||
@ -658,14 +635,6 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||
sd s3, 32(sp)
|
||||
sd s4, 40(sp)
|
||||
sd s5, 48(sp)
|
||||
sd s6, 56(sp)
|
||||
sd s7, 64(sp)
|
||||
.if \depth > 8
|
||||
sd s8, 72(sp)
|
||||
sd s9, 80(sp)
|
||||
mv s8, a5
|
||||
mv s9, a6
|
||||
.endif
|
||||
vsetivli zero, 4, e8, mf4, ta, ma
|
||||
vlse8.v v8, (t0), t2
|
||||
.if \depth == 8
|
||||
@ -689,8 +658,8 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||
li s1, 4
|
||||
mv s4, a0
|
||||
mv s5, a1
|
||||
mv s6, a2
|
||||
mv s7, a3
|
||||
mv a1, a2
|
||||
mv a2, a3
|
||||
1:
|
||||
andi t0, s2, 1
|
||||
addi s1, s1, -1
|
||||
@ -698,33 +667,23 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||
beqz t0, 3f # if (nnz)
|
||||
lw t2, (s5) # block_offset[i]
|
||||
andi t1, s3, 1
|
||||
mv a1, s6
|
||||
mv a2, s7
|
||||
add a0, s4, t2
|
||||
.if \depth > 8
|
||||
mv a5, s8
|
||||
.endif
|
||||
bnez t1, 2f # if (nnz == 1 && block[i * 16])
|
||||
jal .Lidct8_add_\depth\()_rvv
|
||||
j 3f
|
||||
2:
|
||||
.if \depth == 8
|
||||
call ff_h264_idct8_dc_add_\depth\()_c
|
||||
j 3f
|
||||
.else
|
||||
jalr s9
|
||||
j 4f # idct8_add_16 updates a1
|
||||
.endif
|
||||
2:
|
||||
jal ff_h264_idct8_dc_add_\depth\()_rvv
|
||||
3:
|
||||
addi a1, a1, 4 * 16 * 2 * (\depth / 8)
|
||||
4:
|
||||
srli s3, s3, 1
|
||||
addi s5, s5, 4 * 4
|
||||
addi s6, s6, 4 * 16 * 2 * (\depth / 8)
|
||||
bnez s1, 1b
|
||||
|
||||
.if \depth > 8
|
||||
ld s9, 80(sp)
|
||||
ld s8, 72(sp)
|
||||
.endif
|
||||
ld s7, 64(sp)
|
||||
ld s6, 56(sp)
|
||||
ld s5, 48(sp)
|
||||
ld s4, 40(sp)
|
||||
ld s3, 32(sp)
|
||||
@ -732,7 +691,7 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||
ld s1, 16(sp)
|
||||
ld ra, 8(sp)
|
||||
ld s0, 0(sp)
|
||||
addi sp, sp, 96
|
||||
addi sp, sp, 64
|
||||
ret
|
||||
endfunc
|
||||
.endr
|
||||
@ -740,19 +699,16 @@ endfunc
|
||||
.irp depth, 9, 10, 12, 14
|
||||
func ff_h264_idct_add16_\depth\()_rvv, zve32x
|
||||
li a5, (1 << \depth) - 1
|
||||
lla a6, ff_h264_idct_dc_add_\depth\()_c
|
||||
j ff_h264_idct_add16_16_rvv
|
||||
endfunc
|
||||
|
||||
func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
|
||||
li a5, (1 << \depth) - 1
|
||||
lla a6, ff_h264_idct_dc_add_\depth\()_c
|
||||
j ff_h264_idct_add16intra_16_rvv
|
||||
endfunc
|
||||
|
||||
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||
li a5, (1 << \depth) - 1
|
||||
lla a6, ff_h264_idct8_dc_add_\depth\()_c
|
||||
j ff_h264_idct8_add4_16_rvv
|
||||
endfunc
|
||||
.endr
|
||||
|
Loading…
Reference in New Issue
Block a user