librempeg/libswscale/x86/yuv2rgb.c
Ramiro Polla 8bee248b80 swscale/x86/yuv2rgb: add ssse3 yuv42{0,2}p -> gbrp unscaled colorspace converters
Note: this implementation is limited to x86_64 due to general purpose
      register pressure.

checkasm --bench on an Intel(R) Core(TM) i5-5300U CPU @ 2.30GHz:
yuv420p_gbrp_8_c: 118.5
yuv420p_gbrp_8_ssse3: 93.3
yuv420p_gbrp_128_c: 1068.3
yuv420p_gbrp_128_ssse3: 319.3
yuv420p_gbrp_1080_c: 8841.8
yuv420p_gbrp_1080_ssse3: 2211.8
yuv420p_gbrp_1920_c: 15903.8
yuv420p_gbrp_1920_ssse3: 3814.3
yuv422p_gbrp_8_c: 144.8
yuv422p_gbrp_8_ssse3: 93.8
yuv422p_gbrp_128_c: 1395.8
yuv422p_gbrp_128_ssse3: 313.0
yuv422p_gbrp_1080_c: 11551.5
yuv422p_gbrp_1080_ssse3: 2240.8
yuv422p_gbrp_1920_c: 20585.3
yuv422p_gbrp_1920_ssse3: 5249.5
yuva420p_gbrp_8_c: 117.5
yuva420p_gbrp_8_ssse3: 92.0
yuva420p_gbrp_128_c: 1593.0
yuva420p_gbrp_128_ssse3: 319.3
yuva420p_gbrp_1080_c: 8694.5
yuva420p_gbrp_1080_ssse3: 2186.0
yuva420p_gbrp_1920_c: 15946.5
yuva420p_gbrp_1920_ssse3: 3805.3

Signed-off-by: Paul B Mahol <onemda@gmail.com>
2024-08-30 18:02:41 +02:00

282 lines
11 KiB
C

/*
* software YUV to RGB converter
*
* Copyright (C) 2001-2007 Michael Niedermayer
* Copyright (C) 2009-2010 Konstantin Shishkov
*
* MMX/MMXEXT template stuff (needed for fast movntq support),
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#if HAVE_X86ASM
#define YUV2RGB_LOOP(depth) \
h_size = (c->dstW + 7) & ~7; \
if (h_size * depth > FFABS(dstStride[0])) \
h_size -= 8; \
\
vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
\
for (y = 0; y < srcSliceH; y++) { \
uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
const uint8_t *py = src[0] + y * srcStride[0]; \
const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
extern void ff_yuv_420_rgb24_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void ff_yuv_420_bgr24_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void ff_yuv_420_rgb15_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void ff_yuv_420_rgb16_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void ff_yuv_420_rgb32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void ff_yuv_420_bgr32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
extern void ff_yuva_420_rgb32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index, const uint8_t *pa_2index);
extern void ff_yuva_420_bgr32_ssse3(x86_reg index, uint8_t *image, const uint8_t *pu_index,
const uint8_t *pv_index, const uint64_t *pointer_c_dither,
const uint8_t *py_2index, const uint8_t *pa_2index);
#if ARCH_X86_64
extern void ff_yuv_420_gbrp24_ssse3(x86_reg index, uint8_t *image, uint8_t *dst_b, uint8_t *dst_r,
const uint8_t *pu_index, const uint8_t *pv_index,
const uint64_t *pointer_c_dither,
const uint8_t *py_2index);
#endif
static inline int yuv420_rgb15_ssse3(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(2)
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither8[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
ff_yuv_420_rgb15_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int yuv420_rgb16_ssse3(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(2)
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither4[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
ff_yuv_420_rgb16_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int yuv420_rgb32_ssse3(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
ff_yuv_420_rgb32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int yuv420_bgr32_ssse3(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
ff_yuv_420_bgr32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int yuva420_rgb32_ssse3(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
ff_yuva_420_rgb32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
}
return srcSliceH;
}
static inline int yuva420_bgr32_ssse3(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(4)
const uint8_t *pa = src[3] + y * srcStride[3];
ff_yuva_420_bgr32_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index, pa - 2 * index);
}
return srcSliceH;
}
static inline int yuv420_rgb24_ssse3(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
ff_yuv_420_rgb24_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
static inline int yuv420_bgr24_ssse3(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
YUV2RGB_LOOP(3)
ff_yuv_420_bgr24_ssse3(index, image, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
#if ARCH_X86_64
static inline int yuv420_gbrp_ssse3(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
h_size = (c->dstW + 7) & ~7;
if (h_size * 3 > FFABS(dstStride[0]))
h_size -= 8;
vshift = c->srcFormat != AV_PIX_FMT_YUV422P;
for (y = 0; y < srcSliceH; y++) {
uint8_t *dst_g = dst[0] + (y + srcSliceY) * dstStride[0];
uint8_t *dst_b = dst[1] + (y + srcSliceY) * dstStride[1];
uint8_t *dst_r = dst[2] + (y + srcSliceY) * dstStride[2];
const uint8_t *py = src[0] + y * srcStride[0];
const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1];
const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2];
x86_reg index = -h_size / 2;
ff_yuv_420_gbrp24_ssse3(index, dst_g, dst_b, dst_r, pu - index, pv - index, &(c->redDither), py - 2 * index);
}
return srcSliceH;
}
#endif
#endif /* HAVE_X86ASM */
av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
{
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSSE3(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
return yuva420_rgb32_ssse3;
#endif
break;
} else
return yuv420_rgb32_ssse3;
case AV_PIX_FMT_BGR32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if CONFIG_SWSCALE_ALPHA
return yuva420_bgr32_ssse3;
#endif
break;
} else
return yuv420_bgr32_ssse3;
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_ssse3;
case AV_PIX_FMT_BGR24:
return yuv420_bgr24_ssse3;
case AV_PIX_FMT_RGB565:
return yuv420_rgb16_ssse3;
case AV_PIX_FMT_RGB555:
return yuv420_rgb15_ssse3;
#if ARCH_X86_64
case AV_PIX_FMT_GBRP:
return yuv420_gbrp_ssse3;
#endif
}
}
#endif /* HAVE_X86ASM */
return NULL;
}