avcodec/ac3: Implement sum_square_butterfly_float for aarch64 NEON

Signed-off-by: Geoff Hill <geoff@geoffhill.org> Signed-off-by: Martin Storsjö <martin@martin.st> Signed-off-by: Paul B Mahol <onemda@gmail.com>
2024-11-22 00:51:37 +00:00 · 2024-04-06 07:26:13 -07:00 · 2024-04-06 07:26:13 -07:00 · 9b7f43c04a
commit 9b7f43c04a
parent dc8865d219
3 changed files with 61 additions and 0 deletions
--- a/libavcodec/aarch64/ac3dsp_init_aarch64.c
+++ b/libavcodec/aarch64/ac3dsp_init_aarch64.c
@ -32,6 +32,10 @@ void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
                                            const int32_t *coef0,
                                            const int32_t *coef1,
                                            int len);
 /**
  * NEON routine (implemented in ac3dsp_neon.S) that accumulates four
  * sums of squares over two float coefficient arrays:
  *   sum[0] = sum of coef0[i]^2
  *   sum[1] = sum of coef1[i]^2
  *   sum[2] = sum of (coef0[i] + coef1[i])^2
  *   sum[3] = sum of (coef0[i] - coef1[i])^2
  *
  * @param sum   output array of 4 floats, fully overwritten
  * @param coef0 first input array, at least len floats
  * @param coef1 second input array, at least len floats
  * @param len   number of coefficients; the assembly processes 4 per
  *              iteration and always runs at least one iteration, so it
  *              must be positive (presumably a multiple of 4 -- confirm
  *              against callers)
  */
 void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
                                            const float *coef0,
                                            const float *coef1,
                                            int len);
 /* Point the AC3 DSP function table entries at their NEON implementations. */
 av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
 {
@ -42,4 +46,5 @@ av_cold void ff_ac3dsp_init_aarch64(AC3DSPContext *c)
    c->extract_exponents = ff_ac3_extract_exponents_neon;
    c->float_to_fixed24 = ff_float_to_fixed24_neon;
    c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
    /* Float counterpart of the int32 butterfly sum; defined in ac3dsp_neon.S. */
    c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
 }
--- a/libavcodec/aarch64/ac3dsp_neon.S
+++ b/libavcodec/aarch64/ac3dsp_neon.S
@ -87,3 +87,33 @@ function ff_ac3_sum_square_butterfly_int32_neon, export=1
        st1             {v0.1d-v3.1d}, [x0]
        ret
 endfunc
 // Accumulate four sums of squares over two float coefficient arrays:
 //   sum[0] = sum(coef0[i]^2)
 //   sum[1] = sum(coef1[i]^2)
 //   sum[2] = sum((coef0[i] + coef1[i])^2)
 //   sum[3] = sum((coef0[i] - coef1[i])^2)
 // Arguments: x0 = sum (4 floats out), x1 = coef0, x2 = coef1, w3 = len.
 // The loop body runs before len is tested and consumes 4 floats per pass,
 // so len must be positive; a len that is not a multiple of 4 is rounded up.
 function ff_ac3_sum_square_butterfly_float_neon, export=1
        // v0..v3 hold four per-lane partial sums for sum[0]..sum[3].
        movi            v0.4s, #0
        movi            v1.4s, #0
        movi            v2.4s, #0
        movi            v3.4s, #0
 1:      ld1             {v30.4s}, [x1], #16             // 4 floats of coef0
        ld1             {v31.4s}, [x2], #16             // 4 floats of coef1
        fadd            v16.4s, v30.4s, v31.4s          // coef0 + coef1
        fsub            v17.4s, v30.4s, v31.4s          // coef0 - coef1
        fmla            v0.4s, v30.4s, v30.4s
        fmla            v1.4s, v31.4s, v31.4s
        fmla            v2.4s, v16.4s, v16.4s
        fmla            v3.4s, v17.4s, v17.4s
        subs            w3, w3, #4
        b.gt            1b
        // Horizontal reduction of all four accumulators at once. Each result
        // is still (lane0 + lane1) + (lane2 + lane3), the same order as
        // reducing every accumulator on its own, so the values are unchanged.
        faddp           v0.4s, v0.4s, v1.4s             // v0 pairs | v1 pairs
        faddp           v2.4s, v2.4s, v3.4s             // v2 pairs | v3 pairs
        faddp           v0.4s, v0.4s, v2.4s             // sum[0], sum[1], sum[2], sum[3]
        st1             {v0.4s}, [x0]                   // single 16-byte store
        ret
 endfunc
--- a/tests/checkasm/ac3dsp.c
+++ b/tests/checkasm/ac3dsp.c
@ -165,6 +165,31 @@ static void check_ac3_sum_square_butterfly_int32(AC3DSPContext *c) {
    report("ac3_sum_square_butterfly_int32");
 }
 /*
  * checkasm test for AC3DSPContext.sum_square_butterfly_float: feeds the
  * same random float inputs to the reference and the implementation under
  * test, compares the four resulting sums, then benchmarks the new one.
  */
 static void check_ac3_sum_square_butterfly_float(AC3DSPContext *c) {
    LOCAL_ALIGNED_32(float, lt, [ELEMS]);
    LOCAL_ALIGNED_32(float, rt, [ELEMS]);
    LOCAL_ALIGNED_16(float, v1, [4]);   /* sums from the reference */
    LOCAL_ALIGNED_16(float, v2, [4]);   /* sums from the tested function */
    declare_func(void, float[4], const float *, const float *, int);
    randomize_float(lt, ELEMS);
    randomize_float(rt, ELEMS);
    /* The name must match the one used in report() below so that
     * name-based test and benchmark filtering finds this function. */
    if (check_func(c->sum_square_butterfly_float,
                   "ac3_sum_square_butterfly_float")) {
        call_ref(v1, lt, rt, ELEMS);
        call_new(v2, lt, rt, ELEMS);
        /* Compare with a ULP tolerance rather than exactly; presumably the
         * optimized code may sum in a different order than the reference. */
        if (!float_near_ulp_array(v1, v2, 10, 4))
            fail();
        bench_new(v2, lt, rt, ELEMS);
    }
    report("ac3_sum_square_butterfly_float");
 }
 /* Run the individual ac3dsp checks, all sharing one AC3DSPContext. */
 void checkasm_check_ac3dsp(void)
 {
    AC3DSPContext c;
@ -174,4 +199,5 @@ void checkasm_check_ac3dsp(void)
    check_ac3_extract_exponents(&c);
    check_float_to_fixed24(&c);
    check_ac3_sum_square_butterfly_int32(&c);
    /* Float variant of the butterfly sum-of-squares check. */
    check_ac3_sum_square_butterfly_float(&c);
 }