mirror of
https://github.com/librempeg/librempeg
synced 2024-11-22 00:51:37 +00:00
avfilter: add hflip x86 SIMD
Signed-off-by: Paul B Mahol <onemda@gmail.com>
This commit is contained in:
parent
d1d6f965d8
commit
86fda8be3f
38
libavfilter/hflip.h
Normal file
38
libavfilter/hflip.h
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2007 Benoit Fouet
|
||||||
|
* Copyright (c) 2010 Stefano Sabatini
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef AVFILTER_HFLIP_H
|
||||||
|
#define AVFILTER_HFLIP_H
|
||||||
|
|
||||||
|
#include "avfilter.h"
|
||||||
|
|
||||||
|
/**
 * State of the hflip filter. It is declared in this header so that both the
 * generic C code and the x86 init code can access it.
 */
typedef struct FlipContext {
|
||||||
|
const AVClass *class;
|
||||||
|
int max_step[4]; ///< max pixel step for each plane, expressed as a number of bytes
|
||||||
|
int planewidth[4]; ///< width of each plane
|
||||||
|
int planeheight[4]; ///< height of each plane
|
||||||
|
|
||||||
|
/// Per-plane row flipper: writes w pixels to dst in forward order while
/// reading them backwards starting at src, i.e. src has to point at the
/// last pixel of the input row.
void (*flip_line[4])(const uint8_t *src, uint8_t *dst, int w);
|
||||||
|
} FlipContext;
|
||||||
|
|
||||||
|
/**
 * Install x86 SIMD row flippers into s->flip_line where one is available.
 *
 * @param s         filter context whose flip_line entries may be overridden
 * @param step      pixel step in bytes for each plane
 * @param nb_planes number of leading entries of step / flip_line to consider
 */
void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes);
|
||||||
|
|
||||||
|
#endif /* AVFILTER_HFLIP_H */
|
@ -29,6 +29,7 @@
|
|||||||
#include "libavutil/opt.h"
|
#include "libavutil/opt.h"
|
||||||
#include "avfilter.h"
|
#include "avfilter.h"
|
||||||
#include "formats.h"
|
#include "formats.h"
|
||||||
|
#include "hflip.h"
|
||||||
#include "internal.h"
|
#include "internal.h"
|
||||||
#include "video.h"
|
#include "video.h"
|
||||||
#include "libavutil/pixdesc.h"
|
#include "libavutil/pixdesc.h"
|
||||||
@ -36,13 +37,6 @@
|
|||||||
#include "libavutil/intreadwrite.h"
|
#include "libavutil/intreadwrite.h"
|
||||||
#include "libavutil/imgutils.h"
|
#include "libavutil/imgutils.h"
|
||||||
|
|
||||||
typedef struct FlipContext {
|
|
||||||
const AVClass *class;
|
|
||||||
int max_step[4]; ///< max pixel step for each plane, expressed as a number of bytes
|
|
||||||
int planewidth[4]; ///< width of each plane
|
|
||||||
int planeheight[4]; ///< height of each plane
|
|
||||||
} FlipContext;
|
|
||||||
|
|
||||||
static const AVOption hflip_options[] = {
|
static const AVOption hflip_options[] = {
|
||||||
{ NULL }
|
{ NULL }
|
||||||
};
|
};
|
||||||
@ -67,12 +61,77 @@ static int query_formats(AVFilterContext *ctx)
|
|||||||
return ff_set_common_formats(ctx, pix_fmts);
|
return ff_set_common_formats(ctx, pix_fmts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Mirror one row of 1-byte pixels.
 *
 * @param src points at the LAST pixel of the input row; it is read backwards
 * @param dst points at the first pixel of the output row
 * @param w   number of pixels to copy
 */
static void hflip_byte_c(const uint8_t *src, uint8_t *dst, int w)
{
    int x = 0;

    while (x < w) {
        dst[x] = src[-x];
        x++;
    }
}
|
||||||
|
|
||||||
|
/**
 * Mirror one row of 2-byte pixels.
 *
 * @param ssrc points at the LAST 16-bit pixel of the input row; read backwards
 * @param ddst points at the first pixel of the output row
 * @param w    number of 16-bit pixels to copy
 */
static void hflip_short_c(const uint8_t *ssrc, uint8_t *ddst, int w)
{
    const uint16_t *in16  = (const uint16_t *)ssrc;
    uint16_t       *out16 = (uint16_t *)ddst;
    int x = 0;

    while (x < w) {
        out16[x] = in16[-x];
        x++;
    }
}
|
||||||
|
|
||||||
|
/**
 * Mirror one row of 4-byte pixels.
 *
 * @param ssrc points at the LAST 32-bit pixel of the input row; read backwards
 * @param ddst points at the first pixel of the output row
 * @param w    number of 32-bit pixels to copy
 */
static void hflip_dword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
{
    const uint32_t *in32  = (const uint32_t *)ssrc;
    uint32_t       *out32 = (uint32_t *)ddst;
    int x = 0;

    while (x < w) {
        out32[x] = in32[-x];
        x++;
    }
}
|
||||||
|
|
||||||
|
/**
 * Mirror one row of 3-byte pixels. The byte order inside each pixel is kept.
 *
 * @param src points at the first byte of the LAST pixel of the input row
 * @param dst points at the first pixel of the output row
 * @param w   number of 3-byte pixels to copy
 */
static void hflip_b24_c(const uint8_t *src, uint8_t *dst, int w)
{
    const uint8_t *rd = src;
    uint8_t *wr = dst;
    int x;

    for (x = 0; x < w; x++) {
        /* fetch the whole pixel before storing it */
        const uint8_t b0 = rd[0];
        const uint8_t b1 = rd[1];
        const uint8_t b2 = rd[2];

        wr[0] = b0;
        wr[1] = b1;
        wr[2] = b2;

        wr += 3;
        rd -= 3;
    }
}
|
||||||
|
|
||||||
|
/**
 * Mirror one row of 6-byte pixels. The byte order inside each pixel is kept.
 *
 * @param src points at the first byte of the LAST pixel of the input row
 * @param dst points at the first pixel of the output row
 * @param w   number of 6-byte pixels to copy
 */
static void hflip_b48_c(const uint8_t *src, uint8_t *dst, int w)
{
    const uint8_t *rd = src;
    uint8_t *wr = dst;
    int x;

    for (x = 0; x < w; x++) {
        uint8_t px[6];
        int k;

        /* fetch the whole pixel before storing it */
        for (k = 0; k < 6; k++)
            px[k] = rd[k];
        for (k = 0; k < 6; k++)
            wr[k] = px[k];

        wr += 6;
        rd -= 6;
    }
}
|
||||||
|
|
||||||
|
/**
 * Mirror one row of 8-byte pixels.
 *
 * @param ssrc points at the LAST 64-bit pixel of the input row; read backwards
 * @param ddst points at the first pixel of the output row
 * @param w    number of 64-bit pixels to copy
 */
static void hflip_qword_c(const uint8_t *ssrc, uint8_t *ddst, int w)
{
    const uint64_t *in64  = (const uint64_t *)ssrc;
    uint64_t       *out64 = (uint64_t *)ddst;
    int x = 0;

    while (x < w) {
        out64[x] = in64[-x];
        x++;
    }
}
|
||||||
|
|
||||||
static int config_props(AVFilterLink *inlink)
|
static int config_props(AVFilterLink *inlink)
|
||||||
{
|
{
|
||||||
FlipContext *s = inlink->dst->priv;
|
FlipContext *s = inlink->dst->priv;
|
||||||
const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
|
const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format);
|
||||||
const int hsub = pix_desc->log2_chroma_w;
|
const int hsub = pix_desc->log2_chroma_w;
|
||||||
const int vsub = pix_desc->log2_chroma_h;
|
const int vsub = pix_desc->log2_chroma_h;
|
||||||
|
int nb_planes, i;
|
||||||
|
|
||||||
av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc);
|
av_image_fill_max_pixsteps(s->max_step, NULL, pix_desc);
|
||||||
s->planewidth[0] = s->planewidth[3] = inlink->w;
|
s->planewidth[0] = s->planewidth[3] = inlink->w;
|
||||||
@ -80,6 +139,24 @@ static int config_props(AVFilterLink *inlink)
|
|||||||
s->planeheight[0] = s->planeheight[3] = inlink->h;
|
s->planeheight[0] = s->planeheight[3] = inlink->h;
|
||||||
s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub);
|
s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, vsub);
|
||||||
|
|
||||||
|
nb_planes = av_pix_fmt_count_planes(inlink->format);
|
||||||
|
|
||||||
|
for (i = 0; i < nb_planes; i++) {
|
||||||
|
switch (s->max_step[i]) {
|
||||||
|
case 1: s->flip_line[i] = hflip_byte_c; break;
|
||||||
|
case 2: s->flip_line[i] = hflip_short_c; break;
|
||||||
|
case 3: s->flip_line[i] = hflip_b24_c; break;
|
||||||
|
case 4: s->flip_line[i] = hflip_dword_c; break;
|
||||||
|
case 6: s->flip_line[i] = hflip_b48_c; break;
|
||||||
|
case 8: s->flip_line[i] = hflip_qword_c; break;
|
||||||
|
default:
|
||||||
|
return AVERROR_BUG;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ARCH_X86)
|
||||||
|
ff_hflip_init_x86(s, s->max_step, nb_planes);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -94,7 +171,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs)
|
|||||||
AVFrame *in = td->in;
|
AVFrame *in = td->in;
|
||||||
AVFrame *out = td->out;
|
AVFrame *out = td->out;
|
||||||
uint8_t *inrow, *outrow;
|
uint8_t *inrow, *outrow;
|
||||||
int i, j, plane, step;
|
int i, plane, step;
|
||||||
|
|
||||||
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
|
for (plane = 0; plane < 4 && in->data[plane] && in->linesize[plane]; plane++) {
|
||||||
const int width = s->planewidth[plane];
|
const int width = s->planewidth[plane];
|
||||||
@ -107,45 +184,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int job, int nb_jobs)
|
|||||||
outrow = out->data[plane] + start * out->linesize[plane];
|
outrow = out->data[plane] + start * out->linesize[plane];
|
||||||
inrow = in ->data[plane] + start * in->linesize[plane] + (width - 1) * step;
|
inrow = in ->data[plane] + start * in->linesize[plane] + (width - 1) * step;
|
||||||
for (i = start; i < end; i++) {
|
for (i = start; i < end; i++) {
|
||||||
switch (step) {
|
s->flip_line[plane](inrow, outrow, width);
|
||||||
case 1:
|
|
||||||
for (j = 0; j < width; j++)
|
|
||||||
outrow[j] = inrow[-j];
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 2:
|
|
||||||
{
|
|
||||||
uint16_t *outrow16 = (uint16_t *)outrow;
|
|
||||||
uint16_t * inrow16 = (uint16_t *) inrow;
|
|
||||||
for (j = 0; j < width; j++)
|
|
||||||
outrow16[j] = inrow16[-j];
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 3:
|
|
||||||
{
|
|
||||||
uint8_t *in = inrow;
|
|
||||||
uint8_t *out = outrow;
|
|
||||||
for (j = 0; j < width; j++, out += 3, in -= 3) {
|
|
||||||
int32_t v = AV_RB24(in);
|
|
||||||
AV_WB24(out, v);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 4:
|
|
||||||
{
|
|
||||||
uint32_t *outrow32 = (uint32_t *)outrow;
|
|
||||||
uint32_t * inrow32 = (uint32_t *) inrow;
|
|
||||||
for (j = 0; j < width; j++)
|
|
||||||
outrow32[j] = inrow32[-j];
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
for (j = 0; j < width; j++)
|
|
||||||
memcpy(outrow + j*step, inrow - j*step, step);
|
|
||||||
}
|
|
||||||
|
|
||||||
inrow += in ->linesize[plane];
|
inrow += in ->linesize[plane];
|
||||||
outrow += out->linesize[plane];
|
outrow += out->linesize[plane];
|
||||||
|
@ -5,6 +5,7 @@ OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o
|
|||||||
OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
|
OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
|
||||||
OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o
|
OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o
|
||||||
OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o
|
OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o
|
||||||
|
OBJS-$(CONFIG_HFLIP_FILTER) += x86/vf_hflip_init.o
|
||||||
OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
|
OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
|
||||||
OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
|
OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
|
||||||
OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o
|
OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o
|
||||||
@ -32,6 +33,7 @@ X86ASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o
|
|||||||
X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
|
X86ASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o
|
||||||
X86ASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o
|
X86ASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o
|
||||||
X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o
|
X86ASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o
|
||||||
|
X86ASM-OBJS-$(CONFIG_HFLIP_FILTER) += x86/vf_hflip.o
|
||||||
X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
|
X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
|
||||||
X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
|
X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
|
||||||
X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
|
X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
|
||||||
|
108
libavfilter/x86/vf_hflip.asm
Normal file
108
libavfilter/x86/vf_hflip.asm
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
;*****************************************************************************
|
||||||
|
;* x86-optimized functions for hflip filter
|
||||||
|
;*
|
||||||
|
;* Copyright (C) 2017 Paul B Mahol
|
||||||
|
;*
|
||||||
|
;* This file is part of FFmpeg.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
;* modify it under the terms of the GNU Lesser General Public
|
||||||
|
;* License as published by the Free Software Foundation; either
|
||||||
|
;* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
;*
|
||||||
|
;* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
;* Lesser General Public License for more details.
|
||||||
|
;*
|
||||||
|
;* You should have received a copy of the GNU Lesser General Public
|
||||||
|
;* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
;*****************************************************************************
|
||||||
|
|
||||||
|
%include "libavutil/x86/x86util.asm"
|
||||||
|
|
||||||
|
SECTION_RODATA
|
||||||
|
|
||||||
|
pb_flip_byte: db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
|
||||||
|
pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
|
||||||
|
|
||||||
|
SECTION .text
|
||||||
|
|
||||||
|
INIT_XMM ssse3
|
||||||
|
; Horizontally flip a row of w bytes.
; src is read at negative offsets only, so it has to point at the last
; byte of the input row; dst is written at offsets 0 .. w-1.
; x = running output offset, r = tail length, v = scratch for the scalar copy.
cglobal hflip_byte, 3, 6, 3, src, dst, w, x, v, r
|
||||||
|
; m0 = shuffle mask that reverses the byte order of a register
mova m0, [pb_flip_byte]
|
||||||
|
mov xq, 0
|
||||||
|
; NOTE(review): reloads w as a 32-bit value, presumably so that wq carries
; no stale upper bits -- confirm
mov wd, dword wm
|
||||||
|
; r = w mod (2 * mmsize): bytes left over after the vector loop
mov rq, wq
|
||||||
|
and rq, 2 * mmsize - 1
|
||||||
|
; fewer than 2 * mmsize bytes in total: do everything in the scalar loop
cmp wq, 2 * mmsize
|
||||||
|
jl .loop1
|
||||||
|
; w = byte count covered by the vector loop
sub wq, rq
|
||||||
|
|
||||||
|
; vector loop: 2 * mmsize bytes per iteration
.loop0:
|
||||||
|
; x is negated temporarily to address the source backwards
neg xq
|
||||||
|
; load the two mmsize-byte groups that end at src - x
movu m1, [srcq + xq - mmsize + 1]
|
||||||
|
movu m2, [srcq + xq - 2 * mmsize + 1]
|
||||||
|
pshufb m1, m0
|
||||||
|
pshufb m2, m0
|
||||||
|
neg xq
|
||||||
|
movu [dstq + xq ], m1
|
||||||
|
movu [dstq + xq + mmsize], m2
|
||||||
|
add xq, mmsize * 2
|
||||||
|
cmp xq, wq
|
||||||
|
jl .loop0
|
||||||
|
|
||||||
|
; no tail bytes: done
cmp rq, 0
|
||||||
|
je .end
|
||||||
|
; restore the full width as the bound for the scalar loop
add wq, rq
|
||||||
|
|
||||||
|
; scalar loop: one byte per iteration; the bound is checked after the copy
.loop1:
|
||||||
|
neg xq
|
||||||
|
mov vb, [srcq + xq]
|
||||||
|
neg xq
|
||||||
|
mov [dstq + xq], vb
|
||||||
|
add xq, 1
|
||||||
|
cmp xq, wq
|
||||||
|
jl .loop1
|
||||||
|
.end:
|
||||||
|
RET
|
||||||
|
|
||||||
|
; Horizontally flip a row of w 16-bit pixels.
; src is read at offsets <= +1 only, so it has to point at the last pixel
; of the input row; dst is written from offset 0 upwards.
; x = running output offset in bytes, r = tail length in bytes,
; v = scratch for the scalar copy.
cglobal hflip_short, 3, 6, 3, src, dst, w, x, v, r
|
||||||
|
; m0 = shuffle mask that reverses the order of the 16-bit words while
; keeping the two bytes of each word in place
mova m0, [pb_flip_short]
|
||||||
|
mov xq, 0
|
||||||
|
; NOTE(review): reloads w as a 32-bit value, presumably so that wq carries
; no stale upper bits -- confirm
mov wd, dword wm
|
||||||
|
; from here on w is a byte count (2 bytes per pixel)
add wq, wq
|
||||||
|
; r = byte count mod (2 * mmsize): bytes left over after the vector loop
mov rq, wq
|
||||||
|
and rq, 2 * mmsize - 1
|
||||||
|
; fewer than 2 * mmsize bytes in total: do everything in the scalar loop
cmp wq, 2 * mmsize
|
||||||
|
jl .loop1
|
||||||
|
; w = byte count covered by the vector loop
sub wq, rq
|
||||||
|
|
||||||
|
; vector loop: 2 * mmsize bytes per iteration
.loop0:
|
||||||
|
; x is negated temporarily to address the source backwards
neg xq
|
||||||
|
; load the two mmsize-byte groups that end with the pixel at src - x
movu m1, [srcq + xq - mmsize + 2]
|
||||||
|
movu m2, [srcq + xq - 2 * mmsize + 2]
|
||||||
|
pshufb m1, m0
|
||||||
|
pshufb m2, m0
|
||||||
|
neg xq
|
||||||
|
movu [dstq + xq ], m1
|
||||||
|
movu [dstq + xq + mmsize], m2
|
||||||
|
add xq, mmsize * 2
|
||||||
|
cmp xq, wq
|
||||||
|
jl .loop0
|
||||||
|
|
||||||
|
; no tail bytes: done
cmp rq, 0
|
||||||
|
je .end
|
||||||
|
; restore the full byte count as the bound for the scalar loop
add wq, rq
|
||||||
|
|
||||||
|
; scalar loop: one 16-bit pixel per iteration; the bound is checked after
; the copy
.loop1:
|
||||||
|
neg xq
|
||||||
|
mov vw, [srcq + xq]
|
||||||
|
neg xq
|
||||||
|
mov [dstq + xq], vw
|
||||||
|
add xq, 2
|
||||||
|
cmp xq, wq
|
||||||
|
jl .loop1
|
||||||
|
.end:
|
||||||
|
RET
|
41
libavfilter/x86/vf_hflip_init.c
Normal file
41
libavfilter/x86/vf_hflip_init.c
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2017 Paul B Mahol
|
||||||
|
*
|
||||||
|
* This file is part of FFmpeg.
|
||||||
|
*
|
||||||
|
* FFmpeg is free software; you can redistribute it and/or
|
||||||
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
|
* License as published by the Free Software Foundation; either
|
||||||
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* FFmpeg is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
* Lesser General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public
|
||||||
|
* License along with FFmpeg; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "libavutil/attributes.h"
|
||||||
|
#include "libavutil/cpu.h"
|
||||||
|
#include "libavutil/x86/cpu.h"
|
||||||
|
#include "libavfilter/hflip.h"
|
||||||
|
|
||||||
|
/* Row flippers for 1-byte and 2-byte pixels; both are assigned to
 * FlipContext.flip_line entries by ff_hflip_init_x86().
 * NOTE(review): presumably these are the SSSE3 routines from
 * x86/vf_hflip.asm -- confirm. */
void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
|
||||||
|
void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
|
||||||
|
|
||||||
|
/**
 * Override the per-plane row flippers with SSSE3 versions where possible.
 *
 * Only planes whose pixel step is 1 or 2 bytes have a SIMD routine; every
 * other entry of s->flip_line is left as the caller set it.
 *
 * @param s         filter context whose flip_line table is updated
 * @param step      pixel step in bytes for each plane
 * @param nb_planes number of planes to examine
 */
av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
{
    const int cpu_flags = av_get_cpu_flags();
    int plane;

    /* Without SSSE3 there is nothing to install. */
    if (!EXTERNAL_SSSE3(cpu_flags))
        return;

    for (plane = 0; plane < nb_planes; plane++) {
        switch (step[plane]) {
        case 1:
            s->flip_line[plane] = ff_hflip_byte_ssse3;
            break;
        case 2:
            s->flip_line[plane] = ff_hflip_short_ssse3;
            break;
        default:
            break;
        }
    }
}
|
Loading…
Reference in New Issue
Block a user