x86/tx_float: add 15xN PFA FFT AVX SIMD
~4x faster than the C version. The shuffles in the 15pt dim1 are seriously expensive. Not happy with it, but I'm content for now. It can easily be converted to pure AVX by removing all vpermpd/vpermps instructions.
parent 3241e9225c
commit ace42cf581
@@ -704,3 +704,326 @@ Also note that this function requires a special iteration way, due to coefficients
beginning to overlap, particularly `[o1]` with `[0]` after the second iteration.
To iterate further, set `z = &z[16]` via `z += 8` for the second iteration. After
the 4th iteration, the layout resets, so repeat the same.

# 15-point AVX FFT transform

The 15-point transform is based on the following unrolling. The input
must be permuted via the following loop:

``` C
for (int k = 0; k < s->sub[0].len; k++) {
    int cnt = 0;
    int tmp[15];
    memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));

    /* Group the indices by their residue mod 3: first 1, 4, 7, 10, 13, ... */
    for (int i = 1; i < 15; i += 3) {
        s->map[k*15 + cnt] = tmp[i];
        cnt++;
    }
    /* ... then 2, 5, 8, 11, 14, ... */
    for (int i = 2; i < 15; i += 3) {
        s->map[k*15 + cnt] = tmp[i];
        cnt++;
    }
    /* ... then 0, 3, 6, 9, 12. */
    for (int i = 0; i < 15; i += 3) {
        s->map[k*15 + cnt] = tmp[i];
        cnt++;
    }
    /* Move the first index of each group to the front; those three feed the
     * DC 3-point transform, while the remaining four of each group stay
     * contiguous for the SIMD 3-point transforms. */
    memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int));
    memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int));
    s->map[k*15 + 1] = tmp[2];
    s->map[k*15 + 2] = tmp[0];
}
```

This separates the ACs from the DCs and flips the SIMD direction so as to
perform 5x3pt transforms at once, followed by 3x5pt transforms.

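As a reference, the following small standalone program (not part of the patch)
applies the loop above to an identity map and prints the resulting order; the
first three entries feed the DC 3-point transform, while the remaining twelve
feed the four SIMD 3-point transforms:

``` C
#include <stdio.h>
#include <string.h>

/* Prints: 1 2 0 4 7 10 13 5 8 11 14 3 6 9 12 */
int main(void)
{
    int map[15], tmp[15], cnt = 0;

    for (int i = 0; i < 15; i++)
        map[i] = i;
    memcpy(tmp, map, sizeof(tmp));

    for (int i = 1; i < 15; i += 3)
        map[cnt++] = tmp[i];
    for (int i = 2; i < 15; i += 3)
        map[cnt++] = tmp[i];
    for (int i = 0; i < 15; i += 3)
        map[cnt++] = tmp[i];

    memmove(&map[7], &map[6], 4*sizeof(int));
    memmove(&map[3], &map[1], 4*sizeof(int));
    map[1] = tmp[2];
    map[2] = tmp[0];

    for (int i = 0; i < 15; i++)
        printf("%d ", map[i]);
    printf("\n");

    return 0;
}
```

The unrolled single 15-point transform then looks as follows:
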
``` C
static av_always_inline void fft15(TXComplex *out, TXComplex *in,
                                   ptrdiff_t stride)
{
    const TXSample *tab = TX_TAB(ff_tx_tab_53);
    TXComplex q[20];
    TXComplex dc[3], pc[32];
    TXComplex y[32], k[32];
    TXComplex t[32];
    TXComplex r[32];
    TXComplex z0[32];

    /* DC */
    pc[0].re = in[ 1].im - in[ 0].im;
    pc[0].im = in[ 1].re - in[ 0].re;
    pc[1].re = in[ 1].re + in[ 0].re;
    pc[1].im = in[ 1].im + in[ 0].im;

    dc[0].re = in[2].re + pc[1].re;
    dc[0].im = in[2].im + pc[1].im;

    pc[0].re = tab[ 8] * pc[0].re;
    pc[0].im = tab[ 9] * pc[0].im;
    pc[1].re = tab[10] * pc[1].re;
    pc[1].im = tab[11] * pc[1].im;

    dc[1].re = pc[0].re + pc[1].re;
    dc[1].im = pc[0].im + pc[1].im;
    dc[2].re = pc[1].re - pc[0].re;
    dc[2].im = pc[1].im - pc[0].im;

    dc[1].re = in[2].re - dc[1].re;
    dc[1].im = in[2].im + dc[1].im;
    dc[2].re = in[2].re - dc[2].re;
    dc[2].im = in[2].im + dc[2].im;

    /* ACs */
    q[0].im = in[ 3].re - in[ 7].re; // NOTE THE ORDER
    q[0].re = in[ 3].im - in[ 7].im;
    q[1].im = in[ 4].re - in[ 8].re;
    q[1].re = in[ 4].im - in[ 8].im;
    q[2].im = in[ 5].re - in[ 9].re;
    q[2].re = in[ 5].im - in[ 9].im;
    q[3].re = in[ 6].im - in[10].im;
    q[3].im = in[ 6].re - in[10].re;

    q[4].re = in[ 3].re + in[ 7].re;
    q[4].im = in[ 3].im + in[ 7].im;
    q[5].re = in[ 4].re + in[ 8].re;
    q[5].im = in[ 4].im + in[ 8].im;
    q[6].re = in[ 5].re + in[ 9].re;
    q[6].im = in[ 5].im + in[ 9].im;
    q[7].re = in[ 6].re + in[10].re;
    q[7].im = in[ 6].im + in[10].im;

    y[0].re = in[11].re + q[4].re;
    y[0].im = in[11].im + q[4].im;
    y[1].re = in[12].re + q[5].re;
    y[1].im = in[12].im + q[5].im;
    y[2].re = in[13].re + q[6].re;
    y[2].im = in[13].im + q[6].im;
    y[3].re = in[14].re + q[7].re;
    y[3].im = in[14].im + q[7].im;

    q[0].re = tab[ 8] * q[0].re;
    q[0].im = tab[ 9] * q[0].im;
    q[1].re = tab[ 8] * q[1].re;
    q[1].im = tab[ 9] * q[1].im;
    q[2].re = tab[ 8] * q[2].re;
    q[2].im = tab[ 9] * q[2].im;
    q[3].re = tab[ 8] * q[3].re;
    q[3].im = tab[ 9] * q[3].im;

    q[4].re = tab[10] * q[4].re;
    q[4].im = tab[11] * q[4].im;
    q[5].re = tab[10] * q[5].re;
    q[5].im = tab[11] * q[5].im;
    q[6].re = tab[10] * q[6].re;
    q[6].im = tab[11] * q[6].im;
    q[7].re = tab[10] * q[7].re;
    q[7].im = tab[11] * q[7].im;

    k[0].re = q[4].re - q[0].re;
    k[0].im = q[4].im - q[0].im;
    k[1].re = q[5].re - q[1].re;
    k[1].im = q[5].im - q[1].im;
    k[2].re = q[6].re - q[2].re;
    k[2].im = q[6].im - q[2].im;
    k[3].re = q[7].re - q[3].re;
    k[3].im = q[7].im - q[3].im;

    k[4].re = q[4].re + q[0].re;
    k[4].im = q[4].im + q[0].im;
    k[5].re = q[5].re + q[1].re;
    k[5].im = q[5].im + q[1].im;
    k[6].re = q[6].re + q[2].re;
    k[6].im = q[6].im + q[2].im;
    k[7].re = q[7].re + q[3].re;
    k[7].im = q[7].im + q[3].im;

    k[0].re = in[11].re - k[0].re;
    k[0].im = in[11].im + k[0].im;
    k[1].re = in[12].re - k[1].re;
    k[1].im = in[12].im + k[1].im;
    k[2].re = in[13].re - k[2].re;
    k[2].im = in[13].im + k[2].im;
    k[3].re = in[14].re - k[3].re;
    k[3].im = in[14].im + k[3].im;

    k[4].re = in[11].re - k[4].re;
    k[4].im = in[11].im + k[4].im;
    k[5].re = in[12].re - k[5].re;
    k[5].im = in[12].im + k[5].im;
    k[6].re = in[13].re - k[6].re;
    k[6].im = in[13].im + k[6].im;
    k[7].re = in[14].re - k[7].re;
    k[7].im = in[14].im + k[7].im;

    /* 15pt start here */
    t[0].re = y[3].re + y[0].re;
    t[0].im = y[3].im + y[0].im;
    t[1].re = y[2].re + y[1].re;
    t[1].im = y[2].im + y[1].im;
    t[2].re = y[1].re - y[2].re;
    t[2].im = y[1].im - y[2].im;
    t[3].re = y[0].re - y[3].re;
    t[3].im = y[0].im - y[3].im;

    t[4].re = k[3].re + k[0].re;
    t[4].im = k[3].im + k[0].im;
    t[5].re = k[2].re + k[1].re;
    t[5].im = k[2].im + k[1].im;
    t[6].re = k[1].re - k[2].re;
    t[6].im = k[1].im - k[2].im;
    t[7].re = k[0].re - k[3].re;
    t[7].im = k[0].im - k[3].im;

    t[ 8].re = k[7].re + k[4].re;
    t[ 8].im = k[7].im + k[4].im;
    t[ 9].re = k[6].re + k[5].re;
    t[ 9].im = k[6].im + k[5].im;
    t[10].re = k[5].re - k[6].re;
    t[10].im = k[5].im - k[6].im;
    t[11].re = k[4].re - k[7].re;
    t[11].im = k[4].im - k[7].im;

    out[ 0*stride].re = dc[0].re + t[0].re + t[ 1].re;
    out[ 0*stride].im = dc[0].im + t[0].im + t[ 1].im;
    out[10*stride].re = dc[1].re + t[4].re + t[ 5].re;
    out[10*stride].im = dc[1].im + t[4].im + t[ 5].im;
    out[ 5*stride].re = dc[2].re + t[8].re + t[ 9].re;
    out[ 5*stride].im = dc[2].im + t[8].im + t[ 9].im;

    r[0].re = t[0].re * tab[0];
    r[0].im = t[0].im * tab[1];
    r[1].re = t[1].re * tab[0];
    r[1].im = t[1].im * tab[1];
    r[2].re = t[2].re * tab[4];
    r[2].im = t[2].im * tab[5];
    r[3].re = t[3].re * tab[4];
    r[3].im = t[3].im * tab[5];

    r[4].re = t[4].re * tab[0];
    r[4].im = t[4].im * tab[1];
    r[5].re = t[5].re * tab[0];
    r[5].im = t[5].im * tab[1];
    r[6].re = t[6].re * tab[4];
    r[6].im = t[6].im * tab[5];
    r[7].re = t[7].re * tab[4];
    r[7].im = t[7].im * tab[5];

    r[ 8].re = t[ 8].re * tab[0];
    r[ 8].im = t[ 8].im * tab[1];
    r[ 9].re = t[ 9].re * tab[0];
    r[ 9].im = t[ 9].im * tab[1];
    r[10].re = t[10].re * tab[4];
    r[10].im = t[10].im * tab[5];
    r[11].re = t[11].re * tab[4];
    r[11].im = t[11].im * tab[5];

    t[0].re = t[0].re * tab[2];
    t[0].im = t[0].im * tab[3];
    t[1].re = t[1].re * tab[2];
    t[1].im = t[1].im * tab[3];
    t[2].re = t[2].re * tab[6];
    t[2].im = t[2].im * tab[7];
    t[3].re = t[3].re * tab[6];
    t[3].im = t[3].im * tab[7];

    t[4].re = t[4].re * tab[2];
    t[4].im = t[4].im * tab[3];
    t[5].re = t[5].re * tab[2];
    t[5].im = t[5].im * tab[3];
    t[6].re = t[6].re * tab[6];
    t[6].im = t[6].im * tab[7];
    t[7].re = t[7].re * tab[6];
    t[7].im = t[7].im * tab[7];

    t[ 8].re = t[ 8].re * tab[2];
    t[ 8].im = t[ 8].im * tab[3];
    t[ 9].re = t[ 9].re * tab[2];
    t[ 9].im = t[ 9].im * tab[3];
    t[10].re = t[10].re * tab[6];
    t[10].im = t[10].im * tab[7];
    t[11].re = t[11].re * tab[6];
    t[11].im = t[11].im * tab[7];

    r[0].re = r[0].re - t[1].re;
    r[0].im = r[0].im - t[1].im;
    r[1].re = r[1].re - t[0].re;
    r[1].im = r[1].im - t[0].im;
    r[2].re = r[2].re - t[3].re;
    r[2].im = r[2].im - t[3].im;
    r[3].re = r[3].re + t[2].re;
    r[3].im = r[3].im + t[2].im;

    r[4].re = r[4].re - t[5].re;
    r[4].im = r[4].im - t[5].im;
    r[5].re = r[5].re - t[4].re;
    r[5].im = r[5].im - t[4].im;
    r[6].re = r[6].re - t[7].re;
    r[6].im = r[6].im - t[7].im;
    r[7].re = r[7].re + t[6].re;
    r[7].im = r[7].im + t[6].im;

    r[ 8].re = r[ 8].re - t[ 9].re;
    r[ 8].im = r[ 8].im - t[ 9].im;
    r[ 9].re = r[ 9].re - t[ 8].re;
    r[ 9].im = r[ 9].im - t[ 8].im;
    r[10].re = r[10].re - t[11].re;
    r[10].im = r[10].im - t[11].im;
    r[11].re = r[11].re + t[10].re;
    r[11].im = r[11].im + t[10].im;

    z0[ 0].re = r[ 3].im + r[ 0].re;
    z0[ 0].im = r[ 3].re + r[ 0].im;
    z0[ 1].re = r[ 2].im + r[ 1].re;
    z0[ 1].im = r[ 2].re + r[ 1].im;
    z0[ 2].re = r[ 1].im - r[ 2].re;
    z0[ 2].im = r[ 1].re - r[ 2].im;
    z0[ 3].re = r[ 0].im - r[ 3].re;
    z0[ 3].im = r[ 0].re - r[ 3].im;

    z0[ 4].re = r[ 7].im + r[ 4].re;
    z0[ 4].im = r[ 7].re + r[ 4].im;
    z0[ 5].re = r[ 6].im + r[ 5].re;
    z0[ 5].im = r[ 6].re + r[ 5].im;
    z0[ 6].re = r[ 5].im - r[ 6].re;
    z0[ 6].im = r[ 5].re - r[ 6].im;
    z0[ 7].re = r[ 4].im - r[ 7].re;
    z0[ 7].im = r[ 4].re - r[ 7].im;

    z0[ 8].re = r[11].im + r[ 8].re;
    z0[ 8].im = r[11].re + r[ 8].im;
    z0[ 9].re = r[10].im + r[ 9].re;
    z0[ 9].im = r[10].re + r[ 9].im;
    z0[10].re = r[ 9].im - r[10].re;
    z0[10].im = r[ 9].re - r[10].im;
    z0[11].re = r[ 8].im - r[11].re;
    z0[11].im = r[ 8].re - r[11].im;

    out[ 6*stride].re = dc[0].re + z0[0].re;
    out[ 6*stride].im = dc[0].im + z0[3].re;
    out[12*stride].re = dc[0].re + z0[2].im;
    out[12*stride].im = dc[0].im + z0[1].im;
    out[ 3*stride].re = dc[0].re + z0[1].re;
    out[ 3*stride].im = dc[0].im + z0[2].re;
    out[ 9*stride].re = dc[0].re + z0[3].im;
    out[ 9*stride].im = dc[0].im + z0[0].im;

    out[ 1*stride].re = dc[1].re + z0[4].re;
    out[ 1*stride].im = dc[1].im + z0[7].re;
    out[ 7*stride].re = dc[1].re + z0[6].im;
    out[ 7*stride].im = dc[1].im + z0[5].im;
    out[13*stride].re = dc[1].re + z0[5].re;
    out[13*stride].im = dc[1].im + z0[6].re;
    out[ 4*stride].re = dc[1].re + z0[7].im;
    out[ 4*stride].im = dc[1].im + z0[4].im;

    out[11*stride].re = dc[2].re + z0[8].re;
    out[11*stride].im = dc[2].im + z0[11].re;
    out[ 2*stride].re = dc[2].re + z0[10].im;
    out[ 2*stride].im = dc[2].im + z0[9].im;
    out[ 8*stride].re = dc[2].re + z0[9].re;
    out[ 8*stride].im = dc[2].im + z0[10].re;
    out[14*stride].re = dc[2].re + z0[11].im;
    out[14*stride].im = dc[2].im + z0[8].im;
}
```

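The function above computes a single 15-point slice. In the full 15xN
prime-factor transform it is applied N times with `stride` set to N, so its
outputs land interleaved in a temporary buffer; the buffer is then finished
with N-point sub-transforms and a final output permute (the `.dim1`, `.dim2`
and `.post` sections of the assembly below follow the same structure). The
following is only a rough sketch of that flow; `fft_n`, `sub_map` and
`out_map` are illustrative placeholders, not the actual libavutil/tx
internals:

``` C
/* Conceptual 15xN PFA driver; every name except fft15() is a placeholder. */
static void fft_pfa_15xN_sketch(TXComplex *out, const TXComplex *in,
                                TXComplex *tmp, int n,
                                const int *in_map,  /* 15*n entries, built by the loop above */
                                const int *sub_map, /* the n-point sub-transform's input map */
                                const int *out_map) /* 15*n entries, final output order      */
{
    TXComplex col[15];

    /* dim1: n 15-point transforms, outputs interleaved with stride n */
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < 15; j++)
            col[j] = in[in_map[i*15 + j]];
        fft15(&tmp[sub_map[i]], col, n);
    }

    /* dim2: 15 in-place n-point sub-transforms over the temporary buffer */
    for (int i = 0; i < 15; i++)
        fft_n(&tmp[i*n], &tmp[i*n]);

    /* finally, reorder into the caller's output buffer */
    for (int i = 0; i < 15*n; i++)
        out[i] = tmp[out_map[i]];
}
```
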
@@ -48,9 +48,9 @@ SR_TABLE(65536);
SR_TABLE(131072);

/* Other factors' tables */
TABLE_DEF(53, 8);
TABLE_DEF( 7, 6);
TABLE_DEF( 9, 8);
TABLE_DEF(53, 12);
TABLE_DEF( 7, 6);
TABLE_DEF( 9, 8);

typedef struct FFSRTabsInitOnce {
    void (*func)(void);
@@ -104,19 +104,26 @@ static FFSRTabsInitOnce sr_tabs_init_once[] = {
    { TX_TAB(ff_tx_init_tab_131072), AV_ONCE_INIT },
};

static void TX_TAB(ff_tx_init_tab_53)(void)
static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
{
    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 12));
    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 12));
    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 6));
    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(8 * M_PI / 6));
    TX_TAB(ff_tx_tab_53)[4] = RESCALE(cos(2 * M_PI / 5));
    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(8 * M_PI / 5));
    TX_TAB(ff_tx_tab_53)[6] = RESCALE(cos(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(6 * M_PI / 5));
    /* 5pt, doubled to eliminate AVX lane shuffles */
    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5));
    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5));
    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5));
    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5));
    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));

    /* 3pt */
    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6));
    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6));
}
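For reference, the resulting layout is summarized below (a descriptive note, not part of the patch); each 5-point coefficient is stored twice so that the multiplier vectors line up with interleaved {re, im} complex data without the extra AVX lane shuffles mentioned in the comment above:

/* Layout of ff_tx_tab_53[] after this commit:
 *  [ 0][ 1]  cos(2*M_PI/5)   (doubled)  \
 *  [ 2][ 3]  cos(2*M_PI/10)  (doubled)   | 5pt coefficients
 *  [ 4][ 5]  sin(2*M_PI/5)   (doubled)   |
 *  [ 6][ 7]  sin(2*M_PI/10)  (doubled)  /
 *  [ 8][ 9]  cos(2*M_PI/12)  (doubled)  \  3pt coefficients, indexed as
 *  [10]      cos(2*M_PI/6)               | tab[8..11] by the 3-point code
 *  [11]      cos(8*M_PI/6)              /  (see fft3() below and fft15() above)
 */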

static void TX_TAB(ff_tx_init_tab_7)(void)
static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
{
    TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7));
    TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7));

@@ -126,7 +133,7 @@ static void TX_TAB(ff_tx_init_tab_7)(void)
    TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
}

static void TX_TAB(ff_tx_init_tab_9)(void)
static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
{
    TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3));
    TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3));

@@ -189,19 +196,19 @@ static av_always_inline void fft3(TXComplex *out, TXComplex *in,
    out[0*stride].im = in[0].im + tmp[1].im;

#ifdef TX_INT32
    mtmp[0] = (int64_t)tab[0] * tmp[0].re;
    mtmp[1] = (int64_t)tab[1] * tmp[0].im;
    mtmp[2] = (int64_t)tab[2] * tmp[1].re;
    mtmp[3] = (int64_t)tab[2] * tmp[1].im;
    mtmp[0] = (int64_t)tab[ 8] * tmp[0].re;
    mtmp[1] = (int64_t)tab[ 9] * tmp[0].im;
    mtmp[2] = (int64_t)tab[10] * tmp[1].re;
    mtmp[3] = (int64_t)tab[10] * tmp[1].im;
    out[1*stride].re = in[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
    out[1*stride].im = in[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
    out[2*stride].re = in[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
    out[2*stride].im = in[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
#else
    tmp[0].re = tab[0] * tmp[0].re;
    tmp[0].im = tab[1] * tmp[0].im;
    tmp[1].re = tab[2] * tmp[1].re;
    tmp[1].im = tab[2] * tmp[1].im;
    tmp[0].re = tab[ 8] * tmp[0].re;
    tmp[0].im = tab[ 9] * tmp[0].im;
    tmp[1].re = tab[10] * tmp[1].re;
    tmp[1].im = tab[10] * tmp[1].im;
    out[1*stride].re = in[0].re - tmp[1].re + tmp[0].re;
    out[1*stride].im = in[0].im - tmp[1].im - tmp[0].im;
    out[2*stride].re = in[0].re - tmp[1].re - tmp[0].re;

@@ -224,10 +231,10 @@ static av_always_inline void NAME(TXComplex *out, TXComplex *in, \
    out[D0*stride].re = in[0].re + t[0].re + t[2].re; \
    out[D0*stride].im = in[0].im + t[0].im + t[2].im; \
\
    SMUL(t[4].re, t[0].re, tab[4], tab[6], t[2].re, t[0].re); \
    SMUL(t[4].im, t[0].im, tab[4], tab[6], t[2].im, t[0].im); \
    CMUL(t[5].re, t[1].re, -tab[5], -tab[7], t[3].re, t[1].re); \
    CMUL(t[5].im, t[1].im, -tab[5], -tab[7], t[3].im, t[1].im); \
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \
\
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \

@@ -51,6 +51,8 @@ cextern tab_ %+ i %+ _float ; ff_tab_i_float...
%assign i (i << 1)
%endrep

cextern tab_53_float

struc AVTXContext
    .len: resd 1 ; Length
    .inv resd 1 ; Inverse flag
@@ -87,6 +89,9 @@ s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, CO
s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2

s15_perm: dd 0, 6, 5, 3, 2, 4, 7, 1

mask_mmmmmmpp: dd NEG, NEG, NEG, NEG, NEG, NEG, POS, POS
mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
@@ -1565,3 +1570,278 @@ cglobal mdct_sr_inv_float, 4, 13, 16, 272, ctx, out, in, stride, len, lut, exp,
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
IMDCT_FN avx2
%endif

%macro PFA_15_FN 2
INIT_YMM %1
%if %2
cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                                tgt5, stride3, stride5, btmp
%else
cglobal fft_pfa_15xM_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                            tgt5, stride3, stride5, btmp
%endif

%if %2
    PUSH inq
    PUSH tgt5q
    PUSH stride3q
    PUSH stride5q
    PUSH btmpq
%endif

    mov btmpq, outq

    mov outq, [ctxq + AVTXContext.tmp]
%if !%2
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]
%endif

    ; Load stride (second transform's length) and second transform's LUT
    mov tmpq, [ctxq + AVTXContext.sub]
    movsxd strideq, dword [tmpq + AVTXContext.len]
    mov mapq, [tmpq + AVTXContext.map]

    shl strideq, 3
    imul stride3q, strideq, 3
    imul stride5q, strideq, 5

    movaps m13, [mask_mmmmmmpp]            ; mmmmmmpp
    vpermpd m12, m13, q0033                ; ppppmmmm
    vextractf128 xm11, m13, 1              ; mmpp
    movaps m10, [ff_tx_tab_53_float]       ; tab5
    movaps xm9, [ff_tx_tab_53_float + 32]  ; tab3
    movaps m8, [s15_perm]

.dim1:
    mov tmpd, [mapq]
    lea tgtq, [outq + tmpq*8]

%if %2
    movups xm0, [inq]
%else
    LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15  ; in[0,1].reim
%endif

    shufps xm1, xm0, xm0, q3223  ; in[1].imrereim
    shufps xm0, xm0, xm0, q1001  ; in[0].imrereim

    xorps xm1, xm11
    addps xm1, xm0               ; pc[0,1].imre

%if %2
    movddup xm14, [inq + 16]     ; in[2].reimreim
%else
    mov tmpd, [lutq + 8]
    movddup xm14, [inq + tmpq*8] ; in[2].reimreim
%endif
    shufps xm0, xm1, xm1, q3232  ; pc[1].reimreim
    addps xm0, xm14              ; dc[0].reimreim

    mulps xm1, xm9               ; tab[0123]*pc[01]

    shufpd xm5, xm1, xm1, 01b    ; pc[1,0].reim
    xorps xm1, xm11
    addps xm1, xm1, xm5
    addsubps xm1, xm14, xm1      ; dc[1,2].reim

%if %2
    movups m2, [inq + mmsize*0 + 24]
    movups m3, [inq + mmsize*1 + 24]
%else
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m14, m15
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
%endif

    subps m7, m2, m3             ; q[0-3].imre
    addps m6, m2, m3             ; q[4-7]
    shufps m7, m7, m7, q2301     ; q[0-3].reim

%if %2
    movups m4, [inq + mmsize*2 + 24]
%else
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m14, m15
%endif

    addps m5, m4, m6             ; y[0-3]

    vpermpd m14, m9, q1111       ; tab[23232323]
    vbroadcastsd m15, xm9        ; tab[01010101]

    mulps m6, m14
    mulps m7, m15

    subps m2, m6, m7             ; k[0-3]
    addps m3, m6, m7             ; k[4-7]

    addsubps m6, m4, m2          ; k[0-3]
    addsubps m7, m4, m3          ; k[4-7]

    ; 15pt from here on
    vpermpd m2, m5, q0123        ; y[3-0]
    vpermpd m3, m6, q0123        ; k[3-0]
    vpermpd m4, m7, q0123        ; k[7-4]

    xorps m5, m12
    xorps m6, m12
    xorps m7, m12

    addps m2, m5                 ; t[0-3]
    addps m3, m6                 ; t[4-7]
    addps m4, m7                 ; t[8-11]

    movlhps xm14, xm2            ; out[0]
    unpcklpd xm7, xm3, xm4       ; out[10,5]
    unpckhpd xm5, xm3, xm4       ; out[10,5]

    addps xm14, xm2              ; out[0]
    addps xm7, xm5               ; out[10,5]
    addps xm14, xm0              ; out[0]
    addps xm7, xm1               ; out[10,5]

    movhps [tgtq], xm14              ; out[0]
    movhps [tgtq + stride5q*1], xm7  ; out[5]
    movlps [tgtq + stride5q*2], xm7  ; out[10]

    shufps m14, m10, m10, q3232  ; tab5 4 5 4 5 8 9 8 9
    shufps m15, m10, m10, q1010  ; tab5 6 7 6 7 10 11 10 11

    mulps m5, m2, m14            ; t[0-3]
    mulps m6, m3, m14            ; t[4-7]
    mulps m7, m4, m14            ; t[8-11]

    mulps m2, m15                ; r[0-3]
    mulps m3, m15                ; r[4-7]
    mulps m4, m15                ; r[8-11]

    shufps m5, m5, m5, q1032     ; t[1,0,3,2].reim
    shufps m6, m6, m6, q1032     ; t[5,4,7,6].reim
    shufps m7, m7, m7, q1032     ; t[9,8,11,10].reim

    lea tgt5q, [tgtq + stride5q]
    lea tmpq, [tgtq + stride5q*2]

    xorps m5, m13
    xorps m6, m13
    xorps m7, m13

    addps m2, m5                 ; r[0,1,2,3]
    addps m3, m6                 ; r[4,5,6,7]
    addps m4, m7                 ; r[8,9,10,11]

    shufps m5, m2, m2, q2301
    shufps m6, m3, m3, q2301
    shufps m7, m4, m4, q2301

    xorps m2, m12
    xorps m3, m12
    xorps m4, m12

    vpermpd m5, m5, q0123
    vpermpd m6, m6, q0123
    vpermpd m7, m7, q0123

    addps m5, m2
    addps m6, m3
    addps m7, m4

    vpermps m5, m8, m5
    vpermps m6, m8, m6
    vpermps m7, m8, m7

    vbroadcastsd m0, xm0         ; dc[0]
    vpermpd m2, m1, q1111        ; dc[2]
    vbroadcastsd m1, xm1         ; dc[1]

    addps m0, m5
    addps m1, m6
    addps m2, m7

    vextractf128 xm3, m0, 1
    vextractf128 xm4, m1, 1
    vextractf128 xm5, m2, 1

    movlps [tgtq + strideq*1], xm1
    movhps [tgtq + strideq*2], xm2
    movlps [tgtq + stride3q*1], xm3
    movhps [tgtq + strideq*4], xm4
    movlps [tgtq + stride3q*2], xm0
    movlps [tgtq + strideq*8], xm5
    movhps [tgtq + stride3q*4], xm0
    movhps [tgt5q + strideq*2], xm1
    movhps [tgt5q + strideq*4], xm3
    movlps [tmpq + strideq*1], xm2
    movlps [tmpq + stride3q*1], xm4
    movhps [tmpq + strideq*4], xm5

%if %2
    add inq, mmsize*3 + 24
%else
    add lutq, (mmsize/2)*3 + 12
%endif
    add mapq, 4
    sub lenq, 15
    jg .dim1

    ; Second transform setup
    mov stride5q, ctxq                               ; backup original context
    movsxd stride3q, dword [ctxq + AVTXContext.len]  ; full length
    mov tgt5q, [ctxq + AVTXContext.fn]               ; subtransform's jump point

    mov inq, outq                                    ; in-place transform
    mov ctxq, [ctxq + AVTXContext.sub]               ; load subtransform's context
    mov lutq, [ctxq + AVTXContext.map]               ; load subtransform's map
    movsxd lenq, dword [ctxq + AVTXContext.len]      ; load subtransform's length

.dim2:
    call tgt5q                                       ; call the FFT
    lea inq, [inq + lenq*8]
    lea outq, [outq + lenq*8]
    sub stride3q, lenq
    jg .dim2

    mov ctxq, stride5q                               ; restore original context
    mov lutq, [ctxq + AVTXContext.map]
    mov inq, [ctxq + AVTXContext.tmp]
    movsxd lenq, dword [ctxq + AVTXContext.len]      ; full length

    lea stride3q, [lutq + lenq*4]                    ; second part of the LUT
    mov stride5q, lenq
    mov tgt5q, btmpq

.post:
    LOAD64_LUT m0, inq, stride3q, 0, tmpq, m8, m9
    movups [tgt5q], m0

    add tgt5q, mmsize
    add stride3q, mmsize/2
    sub stride5q, mmsize/8
    jg .post

%if %2
    mov outq, btmpq
    POP btmpq
    POP stride5q
    POP stride3q
    POP tgt5q
    POP inq
    ret
%else
    RET
%endif

%if %2
cglobal fft_pfa_15xM_ns_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                               tgt5, stride3, stride5, btmp
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]

    call mangle(fft_pfa_15xM_asm_float)
    RET
%endif
%endmacro

%if ARCH_X86_64
PFA_15_FN avx2, 0
PFA_15_FN avx2, 1
%endif

@@ -43,6 +43,9 @@ TX_DECL_FN(fft_sr_ns, fma3)
TX_DECL_FN(fft_sr, avx2)
TX_DECL_FN(fft_sr_ns, avx2)

TX_DECL_FN(fft_pfa_15xM, avx2)
TX_DECL_FN(fft_pfa_15xM_ns, avx2)

TX_DECL_FN(mdct_sr_inv, avx2)

TX_DECL_FN(fft2_asm, sse3)

@@ -57,6 +60,8 @@ TX_DECL_FN(fft32_asm, fma3)
TX_DECL_FN(fft_sr_asm, fma3)
TX_DECL_FN(fft_sr_asm, avx2)

TX_DECL_FN(fft_pfa_15xM_asm, avx2)

#define DECL_INIT_FN(basis, interleave) \
static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
                                               const FFTXCodelet *cd, \
@@ -102,6 +107,61 @@ static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
    return 0;
}

static av_cold int fft_pfa_init(AVTXContext *s,
                                const FFTXCodelet *cd,
                                uint64_t flags,
                                FFTXCodeletOptions *opts,
                                int len, int inv,
                                const void *scale)
{
    int ret;
    int sub_len = len / cd->factors[0];
    FFTXCodeletOptions sub_opts = { .invert_lookup = 0 };

    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
    flags |= AV_TX_INPLACE;       /* in-place */
    flags |= FF_TX_PRESHUFFLE;    /* This function handles the permute step */
    flags |= FF_TX_ASM_CALL;      /* We want an assembly function, not C */

    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
                                sub_len, inv, scale)))
        return ret;

    if ((ret = ff_tx_gen_compound_mapping(s, cd->factors[0], sub_len)))
        return ret;

    if (cd->factors[0] == 15) {
        for (int k = 0; k < s->sub[0].len; k++) {
            int cnt = 0;
            int tmp[15];
            memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));
            for (int i = 1; i < 15; i += 3) {
                s->map[k*15 + cnt] = tmp[i];
                cnt++;
            }
            for (int i = 2; i < 15; i += 3) {
                s->map[k*15 + cnt] = tmp[i];
                cnt++;
            }
            for (int i = 0; i < 15; i += 3) {
                s->map[k*15 + cnt] = tmp[i];
                cnt++;
            }
            memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int));
            memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int));
            s->map[k*15 + 1] = tmp[2];
            s->map[k*15 + 2] = tmp[0];
        }
    }

    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
        return AVERROR(ENOMEM);

    TX_TAB(ff_tx_init_tabs)(len / sub_len);

    return 0;
}

const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
    TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0),
    TX_DEF(fft2_asm, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3,
@@ -150,6 +210,7 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
    TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
           AV_CPU_FLAG_AVXSLOW),

#if HAVE_AVX2_EXTERNAL
    TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 320, b8_i2, avx2, AVX2, 0,
           AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),

@@ -158,6 +219,13 @@ const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
    TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
           AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),

    TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 320, fft_pfa_init, avx2, AVX2,
           AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    TX_DEF(fft_pfa_15xM_asm, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 384, fft_pfa_init, avx2, AVX2,
           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, TX_FACTOR_ANY, 384, fft_pfa_init, avx2, AVX2,
           AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),

    TX_DEF(mdct_sr_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2,
           FF_TX_INVERSE_ONLY, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
#endif

@@ -24,7 +24,7 @@

#include <stdlib.h>

#define EPS 0.00005
#define EPS 0.0005

#define SCALE_NOOP(x) (x)
#define SCALE_INT20(x) (av_clip64(lrintf((x) * 2147483648.0), INT32_MIN, INT32_MAX) >> 12)

@@ -40,7 +40,7 @@
    } while (0)

static const int check_lens[] = {
    2, 4, 8, 16, 32, 64, 1024, 16384,
    2, 4, 8, 16, 32, 64, 120, 960, 1024, 1920, 16384,
};

static AVTXContext *tx_refs[AV_TX_NB][2 /* Direction */][FF_ARRAY_ELEMS(check_lens)] = { 0 };

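The lengths added to the test list above (120, 960, 1920) are 15xN compound
sizes that the new codelets can handle. As a closing illustration, here is a
minimal sketch of driving such a transform through the public libavutil API;
whether the AVX2 PFA codelet is actually selected depends on the CPU flags at
runtime:

``` C
#include <libavutil/error.h>
#include <libavutil/mem.h>
#include <libavutil/tx.h>

/* Run a forward 960-point (15x64) single-precision FFT. */
int run_960pt_fft(void)
{
    AVTXContext *ctx = NULL;
    av_tx_fn tx = NULL;
    float scale = 1.0f;
    AVComplexFloat *in, *out;
    int ret = av_tx_init(&ctx, &tx, AV_TX_FLOAT_FFT, 0, 960, &scale, 0);
    if (ret < 0)
        return ret;

    in  = av_calloc(960, sizeof(*in));
    out = av_calloc(960, sizeof(*out));
    if (!in || !out) {
        ret = AVERROR(ENOMEM);
    } else {
        in[1].re = 1.0f;                          /* arbitrary test input */
        tx(ctx, out, in, sizeof(AVComplexFloat)); /* stride is in bytes   */
    }

    av_freep(&in);
    av_freep(&out);
    av_tx_uninit(&ctx);
    return ret;
}
```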