;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "config.asm"
%include "libavutil/x86/x86util.asm"

%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif

SECTION_RODATA 64

const_float_abs_mask:   times 8 dd 0x7fffffff
const_align_abs_edge:   times 8 dd 0

const_float_0_5:        times 8 dd 0.5
const_float_1:          times 8 dd 1.0
const_float_sign_mask:  times 8 dd 0x80000000

const_int32_offsets:            ; 0, 4, 8, ... 28 - the byte offset of each dword lane
%rep 8
        dd $-const_int32_offsets
%endrep
SECTION .text

;
; Set up a high register to be used
; for holding a memory constant
;
; %1 - mov instruction to use (e.g. movaps or movdqa)
; %2 - the register to be used, assumes it is >= mm8
; %3 - name of the constant.
;
; Subsequent instructions use the constant in the form
; "addps m0, mm_const_name", which is turned into:
; "addps m0, [const_name]" on 32 bit arch or
; "addps m0, m8" on 64 bit arch
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
    %define  mm_%3  %2
    %{1}     %2, [%3]               ; movaps m8, [const_name]
%else
    %define  mm_%3  [%3]
%endif
%endmacro
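
; For example (a hedged illustration mirroring the calls made in
; PVQ_FAST_SEARCH below; it is not extra build code):
;   SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
;   addps m6, mm_const_float_0_5    ; expands to "addps m6, m8" on x86_64
;                                   ; (num_mmregs > 8), or to
;                                   ; "addps m6, [const_float_0_5]" on x86_32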

;
; Set up the position independent code (PIC)
; base address of a constant
;
; %1 - instruction used to load the address (lea)
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Subsequent instructions use the base address in the form
; "movaps m0, [pic_base_constant_name + r4]", which is turned into
; "movaps m0, [r5 + r4]" if PIC is enabled, or
; "movaps m0, [constant_name + r4]" if text relocations are used.
%macro SET_PIC_BASE 3 ; lea_op, reg, const_label
%ifdef PIC
    %{1}     %2, [%3]               ; lea r5, [rip+const]
    %define  pic_base_%3 %2
%else
    %define  pic_base_%3 %3
%endif
%endmacro
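
; For example (a hedged illustration matching its use in PVQ_FAST_SEARCH below):
;   SET_PIC_BASE lea, r5, const_align_abs_edge
;   movups m2, [pic_base_const_align_abs_edge + r4 - mmsize]
; which resolves to [r5 + r4 - mmsize] with PIC enabled,
; or to [const_align_abs_edge + r4 - mmsize] without it.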

%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
    addps         m6, mm_const_float_0_5  ; Syy_norm += 1.0/2
    pxor          m1, m1                  ; max_idx
    xorps         m3, m3                  ; p_max
    xor           r4d, r4d
align 16
%%distortion_search:
    movd          xm2, dword r4d          ; movd zero extends
%ifidn %1, add
    movaps        m4, [tmpY + r4]         ; y[i]
    movaps        m5, [tmpX + r4]         ; X[i]

%if USE_APPROXIMATION == 1
    xorps         m0, m0
    cmpps         m0, m0, m5, 4           ; m0 = (X[i] != 0.0)
%endif

    addps         m4, m6                  ; m4 = Syy_new = y[i] + Syy_norm
    addps         m5, m7                  ; m5 = Sxy_new = X[i] + Sxy_norm

%if USE_APPROXIMATION == 1
    andps         m5, m0                  ; if (X[i] == 0) Sxy_new = 0; prevents approximation error from setting pulses in the array padding.
%endif

%else
    movaps        m5, [tmpY + r4]         ; m5 = y[i]

    xorps         m0, m0                  ; m0 = 0;
    cmpps         m0, m0, m5, 1           ; m0 = (0 < y)

    subps         m4, m6, m5              ; m4 = Syy_new = Syy_norm - y[i]
    subps         m5, m7, [tmpX + r4]     ; m5 = Sxy_new = Sxy_norm - X[i]
    andps         m5, m0                  ; (0 < y) ? m5 : 0
%endif

%if USE_APPROXIMATION == 1
    rsqrtps       m4, m4
    mulps         m5, m4                  ; m5 = p = Sxy_new*approx(1/sqrt(Syy))
%else
    mulps         m5, m5
    divps         m5, m4                  ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
    VPBROADCASTD  m2, xm2                 ; m2 = i (all lanes get the same value; the per-lane offset is added later)

    cmpps         m0, m3, m5, 1           ; m0 = (m3 < m5) = (p_max < p)
    maxps         m3, m5                  ; m3 = max(p_max, p)
    ; maxps here is faster than blendvps, despite blend having lower latency.

    pand          m2, m0                  ; This version seems faster than sse41 pblendvb
    pmaxsw        m1, m2                  ; SSE2 signed word, so it works for N < 32768/4

    add           r4d, mmsize
    cmp           r4d, Nd
    jb            %%distortion_search

    por           m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
    movdqa        m4, m1                  ; needed for the aligned y[max_idx] += 1.0 processing

%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)

    vextractf128  xm5, ym3, 1             ; xmm5 = ymm3[1x128] = ymm3[255..128b]
    cmpps         xm0, xm3, xm5, 1        ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

    vextracti128  xm2, ym1, 1             ; xmm2 = ymm1[1x128] = ymm1[255..128b]
    BLENDVPS      xm3, xm5, xm0           ; p = m0 ? p[1x128] : p[0x128]
    PBLENDVB      xm1, xm2, xm0           ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
%endif

; Merge parallel maximums round 4 (2 vs 2)
    ; m3 = p[3210]
    movhlps       xm5, xm3                ; m5 = p[xx32]
    cmpps         xm0, xm3, xm5, 1        ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

    pshufd        xm2, xm1, q3232
    BLENDVPS      xm3, xm5, xm0           ; p = m0 ? p[3,2] : p[1,0]
    PBLENDVB      xm1, xm2, xm0           ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]

; Merge parallel maximums final round (1 vs 1)
    shufps        xm0, xm3, xm3, q1111    ; m0 = m3[1] = p[1]
    cmpss         xm0, xm3, 5             ; m0 = !(m0 < m3) = ( p[1] >= p[0] )

    pshufd        xm2, xm1, q1111
    PBLENDVB      xm1, xm2, xm0

    movd          dword r4d, xm1          ; zero extends to the rest of r4q

    VBROADCASTSS  m3, [tmpX + r4]
    %{1}ps        m7, m3                  ; Sxy += X[max_idx]

    VBROADCASTSS  m5, [tmpY + r4]
    %{1}ps        m6, m5                  ; Syy += Y[max_idx]

    ; We have to update a single element in Y[i].
    ; However, writing 4 bytes and then doing a 16 byte load in the inner loop
    ; could cause a stall due to breaking store forwarding.
    VPBROADCASTD  m1, xm1
    pcmpeqd       m1, m1, m4              ; exactly 1 element matches max_idx and this finds it

    and           r4d, ~(mmsize-1)        ; align the address down, so the value pointed to by max_idx is inside a mmsize load
    movaps        m5, [tmpY + r4]         ; m5 = Y[y3...ym...y0]
    andps         m1, mm_const_float_1    ; m1 = [ 0...1.0...0]
    %{1}ps        m5, m1                  ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
    movaps        [tmpY + r4], m5         ; Y[max_idx] +-= 1.0;
%endmacro
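
;
; Reference scalar sketch of one PULSES_SEARCH iteration (illustrative
; pseudocode only, not assembled). The "add" form is shown; the "sub" form
; mirrors it with subtraction and skips positions where y[i] == 0.
; Note that Syy_norm holds Syy/2, so adding a pulse changes it by y[i] + 0.5:
;
;   Syy_norm += 0.5;
;   p_max = 0; max_idx = 0;
;   for (i = 0; i < N; i++) {
;       Syy_new = Syy_norm + y[i];
;       Sxy_new = Sxy_norm + X[i];
;       p = Sxy_new*Sxy_new/Syy_new;    // or Sxy_new*approx(1/sqrt(Syy_new))
;       if (p > p_max) { p_max = p; max_idx = i; }
;   }
;   Sxy_norm += X[max_idx];
;   Syy_norm += y[max_idx];
;   y[max_idx] += 1.0;
;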

;
; We need one more register for
; PIC-relative addressing. Use this
; to count it in cglobal.
;
%ifdef PIC
    %define num_pic_regs 1
%else
    %define num_pic_regs 0
%endif

;
; Pyramid Vector Quantization Search implementation
;
; float * inX   - Unaligned (SIMD) access; it will be overread,
;                 but the extra data is masked away.
; int32 * outY  - Must be an aligned and padded buffer.
;                 It is also used as a temp buffer.
; uint32 K      - Number of pulses to have after quantization.
; uint32 N      - Number of vector elements. Must be 0 < N < 256.
;
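; For reference, the C-side view of the entry points generated below
; (a hedged sketch assuming the usual cglobal name mangling, e.g.
; ff_pvq_search_approx_sse2; the float return value is Syy, the sum of
; y[i]*y[i] of the produced vector):
;
;   float ff_pvq_search_approx_sse2(float *inX, int32_t *outY, uint32_t K, uint32_t N);
;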
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq

    movaps        m0, [const_float_abs_mask]
    shl           Nd, 2                   ; N *= sizeof(float); the 32 bit operation also zeroes the high 32 bits in 64 bit mode.
    mov           r4d, Nd

    neg           r4d
    and           r4d, mmsize-1

    SET_PIC_BASE  lea, r5, const_align_abs_edge ; rip+const
    movups        m2, [pic_base_const_align_abs_edge + r4 - mmsize]

    add           Nd, r4d                 ; N = align(N, mmsize)

    lea           r4d, [Nd - mmsize]      ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
    movups        m1, [inXq + r4]
    andps         m1, m2
    movaps        [tmpX + r4], m1         ; Sx = abs( X[N-1] ), overread lanes are masked to zero

align 16
%%loop_abs_sum:
    sub           r4d, mmsize
    jc            %%end_loop_abs_sum

    movups        m2, [inXq + r4]
    andps         m2, m0

    movaps        [tmpX + r4], m2         ; tmpX[i] = abs( X[i] )
    addps         m1, m2                  ; Sx += abs( X[i] )
    jmp           %%loop_abs_sum

align 16
%%end_loop_abs_sum:

    HSUMPS        m1, m2                  ; m1 = Sx

    xorps         m0, m0
    comiss        xm0, xm1
    jz            %%zero_input            ; if (Sx == 0) goto zero_input

    cvtsi2ss      xm0, dword Kd           ; m0 = K
%if USE_APPROXIMATION == 1
    rcpss         xm1, xm1                ; m1 = approx(1/Sx)
    mulss         xm0, xm1                ; m0 = K*(1/Sx)
%else
    divss         xm0, xm1                ; b = K/Sx
%endif

    VBROADCASTSS  m0, xm0

    lea           r4d, [Nd - mmsize]
    pxor          m5, m5                  ; Sy  ( Sum of abs(y[i]) )
    xorps         m6, m6                  ; Syy ( Sum of y[i]*y[i] )
    xorps         m7, m7                  ; Sxy ( Sum of X[i]*y[i] )
align 16
%%loop_guess:
    movaps        m1, [tmpX + r4]         ; m1 = X[i]
    mulps         m2, m0, m1              ; m2 = b*X[i]
    cvtps2dq      m2, m2                  ; yt = (int)lrintf( b*X[i] )
    paddd         m5, m2                  ; Sy += yt
    cvtdq2ps      m2, m2                  ; yt = (float)yt
    mulps         m1, m2                  ; m1 = X[i]*yt
    movaps        [tmpY + r4], m2         ; y[i] = m2
    addps         m7, m1                  ; Sxy += m1;
    mulps         m2, m2                  ; m2 = yt*yt
    addps         m6, m2                  ; Syy += m2

    sub           r4d, mmsize
    jnc           %%loop_guess
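
    ; Scalar sketch of the projection stage above (illustrative pseudocode
    ; only, not assembled; tmpX already holds abs(X[i])):
    ;   b = K / Sx;                         // Sx = sum of abs(X[i])
    ;   for (i = 0; i < N; i++) {
    ;       y[i] = lrintf(b * abs(X[i]));   // initial pulse guess
    ;       Sy  += y[i];
    ;       Sxy += abs(X[i]) * y[i];
    ;       Syy += y[i] * y[i];
    ;   }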

    HSUMPS        m6, m1                  ; Syy_norm
    HADDD         m5, m4                  ; pulses

    movd          dword r4d, xm5          ; zero extends to the rest of r4q

    sub           Kd, r4d                 ; K -= pulses; the 32 bit operation also zeroes the high 32 bits in 64 bit mode.
    jz            %%finish                ; K - pulses == 0

    SET_HI_REG_MM_CONSTANT movaps, m8,  const_float_0_5
    SET_HI_REG_MM_CONSTANT movaps, m9,  const_float_1
    SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
    ; Use Syy/2 in the distortion parameter calculations.
    ; It saves pre- and post-calculation corrections to the Y[] values
    ; (adding a pulse changes Syy by 2*y+1, so the halved sum changes by y + 0.5).
    ; Same precision, since the float mantissa is normalized.
    ; The SQRT approximation does differ.
    HSUMPS        m7, m0                  ; Sxy_norm
    mulps         m6, mm_const_float_0_5

    jc            %%remove_pulses_loop    ; K - pulses < 0

align 16                                  ; K - pulses > 0
%%add_pulses_loop:

    PULSES_SEARCH add                     ; m6 Syy_norm ; m7 Sxy_norm

    sub           Kd, 1
    jnz           %%add_pulses_loop

    addps         m6, m6                  ; Syy *= 2

    jmp           %%finish

align 16
%%remove_pulses_loop:

    PULSES_SEARCH sub                     ; m6 Syy_norm ; m7 Sxy_norm

    add           Kd, 1
    jnz           %%remove_pulses_loop

    addps         m6, m6                  ; Syy *= 2

align 16
%%finish:
    lea           r4d, [Nd - mmsize]
    movaps        m2, [const_float_sign_mask]

align 16
%%restore_sign_loop:
    movaps        m0, [tmpY + r4]         ; m0 = Y[i]
    movups        m1, [inXq + r4]         ; m1 = X[i]
    andps         m1, m2                  ; m1 = sign(X[i])
    orps          m0, m1                  ; m0 = Y[i]*sign
    cvtps2dq      m3, m0                  ; m3 = (int)m0
    movaps        [outYq + r4], m3

    sub           r4d, mmsize
    jnc           %%restore_sign_loop
%%return:

%if ARCH_X86_64 == 0    ; sbrdsp
    movss         r0m, xm6                ; return (float)Syy_norm
    fld           dword r0m
%else
    movaps        m0, m6                  ; return (float)Syy_norm
%endif

    RET

align 16
%%zero_input:
    lea           r4d, [Nd - mmsize]
    xorps         m0, m0
%%zero_loop:
    movaps        [outYq + r4], m0
    sub           r4d, mmsize
    jnc           %%zero_loop

    movaps        m6, [const_float_1]
    jmp           %%return
%endmacro

; If 1, use float ops that give half precision but execute in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; which makes the full precision code only about 2% slower.
; Opus also uses the rsqrt approximation in its intrinsics code.
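; With USE_APPROXIMATION == 1 the kernels use rcpss/rsqrtps plus multiplies;
; with 0 they use divss/divps instead (see the K/Sx setup and PULSES_SEARCH above).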
%define USE_APPROXIMATION 1

INIT_XMM sse2
PVQ_FAST_SEARCH _approx

INIT_XMM sse4
PVQ_FAST_SEARCH _approx

%define USE_APPROXIMATION 0

INIT_XMM avx
PVQ_FAST_SEARCH _exact