mirror of
https://github.com/librempeg/librempeg
synced 2024-11-21 16:44:05 +00:00
x86/intreadwrite: use intrinsics instead of inline asm for AV_ZERO128
When called inside a loop, the inline asm version results in one pxor unnecessarily emitted per iteration, as the contents of the __asm__() block are opaque to the compiler's instruction scheduler. This is not the case with intrinsics, where pxor will be emitted once with any half-decent compiler. This also has the benefit of removing any SSE -> AVX penalty that may happen when the compiler emits VEX encoded instructions. Signed-off-by: James Almer <jamrial@gmail.com> Signed-off-by: Paul B Mahol <onemda@gmail.com>
This commit is contained in:
parent
1d123905ad
commit
ea24343c81
3
configure
vendored
3
configure
vendored
@ -2308,6 +2308,7 @@ HEADERS_LIST="
|
||||
|
||||
INTRINSICS_LIST="
|
||||
intrinsics_neon
|
||||
intrinsics_sse2
|
||||
"
|
||||
|
||||
MATH_FUNCS="
|
||||
@ -2737,6 +2738,7 @@ armv6t2_deps="arm"
|
||||
armv8_deps="aarch64"
|
||||
neon_deps_any="aarch64 arm"
|
||||
intrinsics_neon_deps="neon"
|
||||
intrinsics_sse2_deps="sse2"
|
||||
vfp_deps="arm"
|
||||
vfpv3_deps="vfp"
|
||||
setend_deps="arm"
|
||||
@ -6432,6 +6434,7 @@ elif enabled loongarch; then
|
||||
fi
|
||||
|
||||
check_cc intrinsics_neon arm_neon.h "int16x8_t test = vdupq_n_s16(0)"
|
||||
check_cc intrinsics_sse2 emmintrin.h "__m128i test = _mm_setzero_si128()"
|
||||
|
||||
check_ldflags -Wl,--as-needed
|
||||
check_ldflags -Wl,-z,noexecstack
|
||||
|
@ -22,6 +22,9 @@
|
||||
#define AVUTIL_X86_INTREADWRITE_H
|
||||
|
||||
#include <stdint.h>
/* config.h must come first: it defines HAVE_INTRINSICS_SSE2, which the
 * conditional include below tests. */
#include "config.h"
#if HAVE_INTRINSICS_SSE2
#include <emmintrin.h>
#endif
#include "libavutil/attributes.h"
|
||||
|
||||
@ -43,20 +46,16 @@ static av_always_inline void AV_COPY128(void *d, const void *s)
|
||||
|
||||
#endif /* __SSE__ */
|
||||
|
||||
#ifdef __SSE2__
|
||||
#if HAVE_INTRINSICS_SSE2
|
||||
|
||||
#define AV_ZERO128 AV_ZERO128
|
||||
/**
 * Zero the 16 bytes starting at d using SSE2 intrinsics.
 *
 * Only the intrinsic path is kept: an inline-asm pxor/movdqa pair is opaque
 * to the compiler, so it cannot be hoisted out of loops, and it would merely
 * duplicate the store performed here.
 *
 * @param d destination buffer; _mm_store_si128() performs an aligned store,
 *          so d must be 16-byte aligned
 */
static av_always_inline void AV_ZERO128(void *d)
{
    _mm_store_si128((__m128i *)d, _mm_setzero_si128());
}
|
||||
|
||||
#endif /* __SSE2__ */
|
||||
#endif /* HAVE_INTRINSICS_SSE2 */
|
||||
|
||||
#endif /* HAVE_MMX */
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user