librempeg/libavcodec/h264qpel.c
Niklas Haas 6851ef29df avcodec/riscv: add h264 qpel
Benched on K230 for VLEN 128, SpaceMIT for VLEN 256. Variants for 4
width have no speedup for VLEN 256 vs VLEN 128 on available hardware,
so were disabled.

                        C      RVV128          C     RVV256
avg_h264_qpel_4_mc00_8  33.9   33.6   (1.01x)
avg_h264_qpel_4_mc01_8  218.8  89.1   (2.46x)
avg_h264_qpel_4_mc02_8  218.8  79.8   (2.74x)
avg_h264_qpel_4_mc03_8  218.8  89.1   (2.46x)
avg_h264_qpel_4_mc10_8  172.3  126.1  (1.37x)
avg_h264_qpel_4_mc11_8  339.1  190.8  (1.78x)
avg_h264_qpel_4_mc12_8  533.6  357.6  (1.49x)
avg_h264_qpel_4_mc13_8  348.4  190.8  (1.83x)
avg_h264_qpel_4_mc20_8  144.8  116.8  (1.24x)
avg_h264_qpel_4_mc21_8  478.1  385.6  (1.24x)
avg_h264_qpel_4_mc22_8  348.4  283.6  (1.23x)
avg_h264_qpel_4_mc23_8  478.1  394.6  (1.21x)
avg_h264_qpel_4_mc30_8  172.6  126.1  (1.37x)
avg_h264_qpel_4_mc31_8  339.4  191.1  (1.78x)
avg_h264_qpel_4_mc32_8  542.9  357.6  (1.52x)
avg_h264_qpel_4_mc33_8  339.4  191.1  (1.78x)
avg_h264_qpel_8_mc00_8  116.8  42.9   (2.72x)  123.6  50.6   (2.44x)
avg_h264_qpel_8_mc01_8  774.4  163.1  (4.75x)  779.8  165.1  (4.72x)
avg_h264_qpel_8_mc02_8  774.4  154.1  (5.03x)  779.8  144.3  (5.40x)
avg_h264_qpel_8_mc03_8  774.4  163.3  (4.74x)  779.8  165.3  (4.72x)
avg_h264_qpel_8_mc10_8  617.1  237.3  (2.60x)  613.1  227.6  (2.69x)
avg_h264_qpel_8_mc11_8  1209.3 376.4  (3.21x)  1206.8 363.1  (3.32x)
avg_h264_qpel_8_mc12_8  1913.3 598.6  (3.20x)  1894.3 561.1  (3.38x)
avg_h264_qpel_8_mc13_8  1218.6 376.4  (3.24x)  1217.1 363.1  (3.35x)
avg_h264_qpel_8_mc20_8  524.4  228.1  (2.30x)  519.3  227.6  (2.28x)
avg_h264_qpel_8_mc21_8  1709.6 681.9  (2.51x)  1707.1 644.3  (2.65x)
avg_h264_qpel_8_mc22_8  1274.3 459.6  (2.77x)  1279.8 436.1  (2.93x)
avg_h264_qpel_8_mc23_8  1700.3 672.6  (2.53x)  1706.8 644.6  (2.65x)
avg_h264_qpel_8_mc30_8  607.6  246.6  (2.46x)  623.6  238.1  (2.62x)
avg_h264_qpel_8_mc31_8  1209.6 376.4  (3.21x)  1206.8 363.1  (3.32x)
avg_h264_qpel_8_mc32_8  1904.1 607.9  (3.13x)  1894.3 571.3  (3.32x)
avg_h264_qpel_8_mc33_8  1209.6 376.1  (3.22x)  1206.8 363.1  (3.32x)
avg_h264_qpel_16_mc00_8 431.9  89.1   (4.85x)  436.1  71.3   (6.12x)
avg_h264_qpel_16_mc01_8 2894.6 376.1  (7.70x)  2842.3 300.6  (9.46x)
avg_h264_qpel_16_mc02_8 2987.3 348.4  (8.57x)  2967.3 290.1  (10.23x)
avg_h264_qpel_16_mc03_8 2885.3 376.4  (7.67x)  2842.3 300.6  (9.46x)
avg_h264_qpel_16_mc10_8 2404.1 524.4  (4.58x)  2404.8 456.8  (5.26x)
avg_h264_qpel_16_mc11_8 4709.4 811.6  (5.80x)  4675.6 706.8  (6.62x)
avg_h264_qpel_16_mc12_8 7477.9 1274.3 (5.87x)  7436.1 1061.1 (7.01x)
avg_h264_qpel_16_mc13_8 4718.6 820.6  (5.75x)  4655.1 706.8  (6.59x)
avg_h264_qpel_16_mc20_8 2052.1 487.1  (4.21x)  2071.3 446.3  (4.64x)
avg_h264_qpel_16_mc21_8 7440.6 1422.6 (5.23x)  6727.8 1217.3 (5.53x)
avg_h264_qpel_16_mc22_8 5051.9 950.4  (5.32x)  5071.6 790.3  (6.42x)
avg_h264_qpel_16_mc23_8 6764.9 1422.3 (4.76x)  6748.6 1217.3 (5.54x)
avg_h264_qpel_16_mc30_8 2413.1 524.4  (4.60x)  2415.1 467.3  (5.17x)
avg_h264_qpel_16_mc31_8 4681.6 839.1  (5.58x)  4675.6 727.6  (6.43x)
avg_h264_qpel_16_mc32_8 8579.6 1292.8 (6.64x)  7436.3 1071.3 (6.94x)
avg_h264_qpel_16_mc33_8 5375.9 829.9  (6.48x)  4665.3 717.3  (6.50x)
put_h264_qpel_4_mc00_8  24.4   24.4   (1.00x)
put_h264_qpel_4_mc01_8  987.4  79.8   (12.37x)
put_h264_qpel_4_mc02_8  190.8  79.8   (2.39x)
put_h264_qpel_4_mc03_8  209.6  89.1   (2.35x)
put_h264_qpel_4_mc10_8  163.3  117.1  (1.39x)
put_h264_qpel_4_mc11_8  339.4  181.6  (1.87x)
put_h264_qpel_4_mc12_8  533.6  348.4  (1.53x)
put_h264_qpel_4_mc13_8  339.4  190.8  (1.78x)
put_h264_qpel_4_mc20_8  126.3  116.8  (1.08x)
put_h264_qpel_4_mc21_8  468.9  376.1  (1.25x)
put_h264_qpel_4_mc22_8  330.1  274.4  (1.20x)
put_h264_qpel_4_mc23_8  468.9  376.1  (1.25x)
put_h264_qpel_4_mc30_8  163.3  126.3  (1.29x)
put_h264_qpel_4_mc31_8  339.1  191.1  (1.77x)
put_h264_qpel_4_mc32_8  533.6  348.4  (1.53x)
put_h264_qpel_4_mc33_8  339.4  181.8  (1.87x)
put_h264_qpel_8_mc00_8  98.6   33.6   (2.93x)  92.3   40.1   (2.30x)
put_h264_qpel_8_mc01_8  737.1  153.8  (4.79x)  738.1  144.3  (5.12x)
put_h264_qpel_8_mc02_8  663.1  135.3  (4.90x)  665.1  134.1  (4.96x)
put_h264_qpel_8_mc03_8  737.4  154.1  (4.79x)  1508.8 144.3  (10.46x)
put_h264_qpel_8_mc10_8  598.4  237.1  (2.52x)  592.3  227.6  (2.60x)
put_h264_qpel_8_mc11_8  1172.3 357.9  (3.28x)  1175.6 342.3  (3.43x)
put_h264_qpel_8_mc12_8  1867.1 589.1  (3.17x)  1863.1 561.1  (3.32x)
put_h264_qpel_8_mc13_8  1172.6 366.9  (3.20x)  1175.6 352.8  (3.33x)
put_h264_qpel_8_mc20_8  450.4  218.8  (2.06x)  446.3  206.8  (2.16x)
put_h264_qpel_8_mc21_8  1672.3 663.1  (2.52x)  1675.6 633.8  (2.64x)
put_h264_qpel_8_mc22_8  1144.6 1200.1 (0.95x)  1144.3 425.6  (2.69x)
put_h264_qpel_8_mc23_8  1672.6 672.4  (2.49x)  1665.3 634.1  (2.63x)
put_h264_qpel_8_mc30_8  598.6  237.3  (2.52x)  613.1  227.6  (2.69x)
put_h264_qpel_8_mc31_8  1172.3 376.1  (3.12x)  1175.6 352.6  (3.33x)
put_h264_qpel_8_mc32_8  1857.8 598.6  (3.10x)  1863.1 561.1  (3.32x)
put_h264_qpel_8_mc33_8  1172.3 376.1  (3.12x)  1175.6 352.8  (3.33x)
put_h264_qpel_16_mc00_8 320.6  61.4   (5.22x)  321.3  60.8   (5.28x)
put_h264_qpel_16_mc01_8 2774.3 339.1  (8.18x)  2759.1 279.8  (9.86x)
put_h264_qpel_16_mc02_8 2589.1 320.6  (8.08x)  2571.6 269.3  (9.55x)
put_h264_qpel_16_mc03_8 2774.3 339.4  (8.17x)  2738.1 290.1  (9.44x)
put_h264_qpel_16_mc10_8 2274.3 487.4  (4.67x)  2290.1 436.1  (5.25x)
put_h264_qpel_16_mc11_8 5237.1 792.9  (6.60x)  4529.8 685.8  (6.61x)
put_h264_qpel_16_mc12_8 7357.6 1255.8 (5.86x)  7352.8 1040.1 (7.07x)
put_h264_qpel_16_mc13_8 4579.9 792.9  (5.78x)  4571.6 686.1  (6.66x)
put_h264_qpel_16_mc20_8 1802.1 459.6  (3.92x)  1800.6 425.6  (4.23x)
put_h264_qpel_16_mc21_8 6644.6 2246.6 (2.96x)  6644.3 1196.6 (5.55x)
put_h264_qpel_16_mc22_8 4589.1 913.4  (5.02x)  4592.3 769.3  (5.97x)
put_h264_qpel_16_mc23_8 6644.6 1394.6 (4.76x)  6634.1 1196.6 (5.54x)
put_h264_qpel_16_mc30_8 2274.3 496.6  (4.58x)  2290.1 456.8  (5.01x)
put_h264_qpel_16_mc31_8 5255.6 802.1  (6.55x)  4550.8 706.8  (6.44x)
put_h264_qpel_16_mc32_8 7376.1 1265.1 (5.83x)  7352.8 1050.6 (7.00x)
put_h264_qpel_16_mc33_8 4579.9 802.1  (5.71x)  4561.1 696.3  (6.55x)

Signed-off-by: Niklas Haas <git@haasn.dev>
Signed-off-by: J. Dekker <jdek@itanimul.li>
Signed-off-by: Paul B Mahol <onemda@gmail.com>
2024-09-30 19:13:05 +02:00

115 lines
3.7 KiB
C

/*
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
* Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "h264qpel.h"
#define pixeltmp int16_t
#define BIT_DEPTH 8
#include "h264qpel_template.c"
#undef BIT_DEPTH
#define BIT_DEPTH 9
#include "h264qpel_template.c"
#undef BIT_DEPTH
#define BIT_DEPTH 10
#include "h264qpel_template.c"
#undef BIT_DEPTH
#undef pixeltmp
#define pixeltmp int32_t
#define BIT_DEPTH 12
#include "h264qpel_template.c"
#undef BIT_DEPTH
#define BIT_DEPTH 14
#include "h264qpel_template.c"
#undef BIT_DEPTH
av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
{
#undef FUNCC
#define FUNCC(f, depth) f ## _ ## depth ## _c
#define dspfunc2(PFX, IDX, NUM, depth) \
c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth); \
c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth); \
c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth); \
c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth); \
c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth); \
c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth); \
c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth); \
c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth); \
c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth); \
c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth); \
c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth); \
c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth); \
c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth); \
c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth); \
c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth); \
c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
#define SET_QPEL(depth) \
dspfunc2(put_h264_qpel, 0, 16, depth); \
dspfunc2(put_h264_qpel, 1, 8, depth); \
dspfunc2(put_h264_qpel, 2, 4, depth); \
dspfunc2(put_h264_qpel, 3, 2, depth); \
dspfunc2(avg_h264_qpel, 0, 16, depth); \
dspfunc2(avg_h264_qpel, 1, 8, depth); \
dspfunc2(avg_h264_qpel, 2, 4, depth)
switch (bit_depth) {
default:
SET_QPEL(8);
break;
case 9:
SET_QPEL(9);
break;
case 10:
SET_QPEL(10);
break;
case 12:
SET_QPEL(12);
break;
case 14:
SET_QPEL(14);
break;
}
#if ARCH_AARCH64
ff_h264qpel_init_aarch64(c, bit_depth);
#elif ARCH_ARM
ff_h264qpel_init_arm(c, bit_depth);
#elif ARCH_PPC
ff_h264qpel_init_ppc(c, bit_depth);
#elif ARCH_RISCV
ff_h264qpel_init_riscv(c, bit_depth);
#elif ARCH_X86
ff_h264qpel_init_x86(c, bit_depth);
#elif ARCH_MIPS
ff_h264qpel_init_mips(c, bit_depth);
#elif ARCH_LOONGARCH64
ff_h264qpel_init_loongarch(c, bit_depth);
#endif
}