mirror of
https://github.com/librempeg/librempeg
synced 2024-11-23 03:28:27 +00:00
bfb28b5ce8
x64 always has MMX, MMXEXT, SSE and SSE2, and this means that some functions for MMX, MMXEXT and 3dnow are always overridden by other functions (unless one e.g. explicitly disables SSE2) for x64. So given that the only systems that benefit from these functions are truly ancient 32-bit x86s, they are removed. Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
113 lines
3.8 KiB
NASM
113 lines
3.8 KiB
NASM
;******************************************************************************
|
|
;* SIMD-optimized IDCT-related routines
|
|
;* Copyright (c) 2008 Loren Merritt
|
|
;* Copyright (c) 2003-2013 Michael Niedermayer
|
|
;* Copyright (c) 2013 Daniel Kang
|
|
;*
|
|
;* This file is part of FFmpeg.
|
|
;*
|
|
;* FFmpeg is free software; you can redistribute it and/or
|
|
;* modify it under the terms of the GNU Lesser General Public
|
|
;* License as published by the Free Software Foundation; either
|
|
;* version 2.1 of the License, or (at your option) any later version.
|
|
;*
|
|
;* FFmpeg is distributed in the hope that it will be useful,
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
;* Lesser General Public License for more details.
|
|
;*
|
|
;* You should have received a copy of the GNU Lesser General Public
|
|
;* License along with FFmpeg; if not, write to the Free Software
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
;******************************************************************************
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
SECTION_RODATA
|
|
|
|
cextern pb_80
|
|
|
|
SECTION .text
|
|
|
|
;--------------------------------------------------------------------------
|
|
;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
|
|
; ptrdiff_t line_size)
|
|
;--------------------------------------------------------------------------
|
|
|
|
; Convert four mmsize-sized chunks of 16-bit coefficients to bytes and store
; them as four 8-byte pixel rows.
;
; %1 = byte offset into the coefficient block
;
; Expects, set up by the invoking function:
;   blockq  = pointer to the coefficients
;   pixelsq = address of the first destination row
;   lsizeq  = line stride in bytes
;   lsize3q = 3 * lsizeq
;   m0      = per-byte bias added after packing
; Clobbers: m1, m2
;
; With the XMM setup used by the invoking function (mmsize = 16) each chunk
; holds eight 16-bit coefficients, i.e. one row of 8 pixels.
%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1
    mova      m1, [blockq+mmsize*0+%1]    ; chunk 0
    mova      m2, [blockq+mmsize*2+%1]    ; chunk 2
    ; Signed-saturating pack to bytes: the low 8 bytes come from the register
    ; (chunks 0/2), the high 8 bytes from the memory operand (chunks 1/3).
    packsswb  m1, [blockq+mmsize*1+%1]
    packsswb  m2, [blockq+mmsize*3+%1]
    paddb     m1, m0                      ; wrap-around byte add of the bias
    paddb     m2, m0
    movq      [pixelsq+lsizeq*0], m1      ; low  half -> row 0
    movhps    [pixelsq+lsizeq*1], m1      ; high half -> row 1
    movq      [pixelsq+lsizeq*2], m2      ; low  half -> row 2
    movhps    [pixelsq+lsize3q ], m2      ; high half -> row 3
%endmacro
|
|
|
|
; SSE2 implementation of ff_put_signed_pixels_clamped().
;
; cglobal arguments: 3 function arguments, 4 general-purpose registers,
; 3 XMM registers.
;   blockq  = const int16_t *block
;   pixelsq = uint8_t *pixels
;   lsizeq  = ptrdiff_t line_size
;   lsize3q = scratch, holds 3 * line_size
;   m0      = pb_80 constant, m1/m2 = temporaries used by the macro
;
; pb_80 is declared cextern; presumably 0x80 in every byte, which would turn
; the signed-saturated bytes into unsigned pixels -- confirm at its definition.
INIT_XMM sse2
cglobal put_signed_pixels_clamped, 3, 4, 3, block, pixels, lsize, lsize3
    mova      m0, [pb_80]
    lea       lsize3q, [lsizeq*3]         ; stride * 3 for addressing the 4th row
    PUT_SIGNED_PIXELS_CLAMPED_HALF 0      ; first four rows
    lea       pixelsq, [pixelsq+lsizeq*4] ; advance destination by four rows
    PUT_SIGNED_PIXELS_CLAMPED_HALF 64     ; next four rows (64 bytes further into block)
    RET
|
|
|
|
;--------------------------------------------------------------------------
|
|
; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
|
|
; ptrdiff_t line_size);
|
|
;--------------------------------------------------------------------------
|
|
; Convert four mmsize-sized chunks of 16-bit coefficients to unsigned bytes
; and store them as four 8-byte pixel rows.
;
; %1 = block offset (in bytes)
;
; Expects, set up by the invoking function:
;   blockq  = pointer to the coefficients
;   pixelsq = address of the first destination row
;   lsizeq  = line stride in bytes
;   lsize3q = 3 * lsizeq
; Clobbers: m0, m1
%macro PUT_PIXELS_CLAMPED_HALF 1
    mova      m0, [blockq+mmsize*0+%1]    ; chunk 0
    mova      m1, [blockq+mmsize*2+%1]    ; chunk 2
    ; Unsigned-saturating pack (clamps each word to 0..255): the low 8 bytes
    ; come from the register, the high 8 bytes from the memory operand.
    packuswb  m0, [blockq+mmsize*1+%1]
    packuswb  m1, [blockq+mmsize*3+%1]
    movq      [pixelsq], m0               ; low  half -> row 0
    movhps    [lsizeq+pixelsq], m0        ; high half -> row 1
    movq      [2*lsizeq+pixelsq], m1      ; low  half -> row 2
    movhps    [lsize3q+pixelsq], m1       ; high half -> row 3
%endmacro
|
|
|
|
; SSE2 implementation of ff_put_pixels_clamped().
;
; cglobal arguments: 3 function arguments, 4 general-purpose registers,
; 2 XMM registers.
;   blockq  = const int16_t *block
;   pixelsq = uint8_t *pixels
;   lsizeq  = ptrdiff_t line_size
;   lsize3q = scratch, holds 3 * line_size
;   m0/m1   = temporaries used by PUT_PIXELS_CLAMPED_HALF
INIT_XMM sse2
cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
    lea       lsize3q, [lsizeq*3]         ; stride * 3 for addressing the 4th row
    PUT_PIXELS_CLAMPED_HALF 0             ; first four rows
    lea       pixelsq, [pixelsq+lsizeq*4] ; advance destination by four rows
    PUT_PIXELS_CLAMPED_HALF 64            ; next four rows (64 bytes further into block)
    RET
|
|
|
|
;--------------------------------------------------------------------------
|
|
; void ff_add_pixels_clamped(const int16_t *block, uint8_t *pixels,
|
|
; ptrdiff_t line_size);
|
|
;--------------------------------------------------------------------------
|
|
; Add two mmsize-sized chunks of 16-bit coefficients to two existing 8-byte
; pixel rows and write the clamped result back in place.
;
; %1 = block offset (in bytes)
;
; Expects, set up by the invoking function:
;   blockq  = pointer to the coefficients
;   pixelsq = address of the first of the two rows
;   lsizeq  = line stride in bytes
;   m4      = all zero (used to widen bytes to words)
; Clobbers: m0, m1, m2, m3
%macro ADD_PIXELS_CLAMPED 1
    mova      m0, [blockq+mmsize*0+%1]    ; coefficients for row 0
    mova      m1, [blockq+mmsize*1+%1]    ; coefficients for row 1
    movq      m2, [pixelsq]               ; 8 current pixels of row 0
    movq      m3, [pixelsq+lsizeq]        ; 8 current pixels of row 1
    punpcklbw m2, m4                      ; interleave with zero: bytes -> words
    punpcklbw m3, m4
    paddsw    m0, m2                      ; signed-saturating 16-bit add
    paddsw    m1, m3
    packuswb  m0, m1                      ; clamp to 0..255; row 0 low half, row 1 high half
    movq      [pixelsq], m0
    movhps    [pixelsq+lsizeq], m0
%endmacro
|
|
|
|
; SSE2 implementation of ff_add_pixels_clamped().
;
; cglobal arguments: 3 function arguments, 3 general-purpose registers,
; 5 XMM registers.
;   blockq  = const int16_t *block
;   pixelsq = uint8_t *pixels (read and written)
;   lsizeq  = ptrdiff_t line_size
;   m4      = zero, m0-m3 = temporaries used by ADD_PIXELS_CLAMPED
;
; Each macro invocation handles two rows (32 bytes of block), so four
; invocations are issued with the destination advanced by two rows in between.
INIT_XMM sse2
cglobal add_pixels_clamped, 3, 3, 5, block, pixels, lsize
    pxor      m4, m4                      ; zero register required by the macro
    ADD_PIXELS_CLAMPED 0
    lea       pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32
    lea       pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64
    lea       pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96
    RET
|