mirror of
https://github.com/librempeg/librempeg
synced 2024-11-22 00:51:37 +00:00
compat/cuda: switch from powf to __powf intrinsic
The powf builtin crashes older versions of clang, so implement the (faster) __powf intrinsic manually and use it instead. The code this generates is identical to what nvcc emits.
This commit is contained in:
parent
73fada029c
commit
416923346a
@ -182,11 +182,11 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
|
||||
/* Single-precision overload of fabs(); forwards to __builtin_fabsf. */
static inline __device__ float fabs(float a)
{
    return __builtin_fabsf(a);
}
|
||||
/* Double-precision overload of fabs(); forwards to __builtin_fabs. */
static inline __device__ double fabs(double a)
{
    return __builtin_fabs(a);
}
|
||||
/* Device-side sqrtf(); forwards to __builtin_sqrtf. */
static inline __device__ float sqrtf(float a)
{
    return __builtin_sqrtf(a);
}
|
||||
/* Device-side powf(a, y); forwards both arguments to __builtin_powf. */
static inline __device__ float powf(float a, float y)
{
    return __builtin_powf(a, y);
}
|
||||
|
||||
/*
 * Device-side __saturatef(); forwards to the __nvvm_saturate_f builtin.
 * NOTE(review): presumably clamps to [0, 1] like the nvcc intrinsic of the
 * same name -- confirm against the builtin's documentation.
 */
static inline __device__ float __saturatef(float a)
{
    return __nvvm_saturate_f(a);
}
|
||||
/* Device-side __sinf(); forwards to the __nvvm_sin_approx_f builtin. */
static inline __device__ float __sinf(float a)
{
    return __nvvm_sin_approx_f(a);
}
|
||||
/* Device-side __cosf(); forwards to the __nvvm_cos_approx_f builtin. */
static inline __device__ float __cosf(float a)
{
    return __nvvm_cos_approx_f(a);
}
|
||||
/*
 * Device-side __expf(): rewrites e^a as 2^(a * log2(e)) and evaluates the
 * power of two with the __nvvm_ex2_approx_f builtin.
 */
static inline __device__ float __expf(float a)
{
    /* log2(e), evaluated in double precision and then narrowed to float. */
    const float log2_e = (float)__builtin_log2(__builtin_exp(1));

    return __nvvm_ex2_approx_f(a * log2_e);
}
|
||||
/*
 * Device-side __powf(): evaluates a^b as 2^(b * log2(a)) using the
 * __nvvm_lg2_approx_f and __nvvm_ex2_approx_f builtins.
 * NOTE(review): the log2 step presumably yields NaN for a negative base, so
 * this is not a drop-in replacement for powf() on signed values -- confirm
 * against the PTX documentation for lg2.approx.
 */
static inline __device__ float __powf(float a, float b)
{
    const float log2_a = __nvvm_lg2_approx_f(a);

    return __nvvm_ex2_approx_f(log2_a * b);
}
|
||||
|
||||
#endif /* COMPAT_CUDA_CUDA_RUNTIME_H */
|
||||
|
@ -34,9 +34,9 @@ extern "C"
|
||||
/**
 * Sum of the squared differences of the x, y and z components of two float4
 * values. The w component is not read.
 *
 * Each difference is squared by plain multiplication, once per component.
 * __powf() computes its result from a base-2 logarithm of its first argument,
 * which has no real value for the negative differences that occur whenever a
 * component of first_yuv is smaller than the matching one of second_yuv, so
 * it must not be used for squaring here.
 *
 * @param first_yuv  first value
 * @param second_yuv second value
 * @return (dx * dx) + (dy * dy) + (dz * dz)
 */
__device__ static inline float norm_squared(float4 first_yuv, float4 second_yuv)
{
    const float dx = first_yuv.x - second_yuv.x;
    const float dy = first_yuv.y - second_yuv.y;
    const float dz = first_yuv.z - second_yuv.z;

    return dx * dx + dy * dy + dz * dz;
}
|
||||
|
||||
@ -52,7 +52,7 @@ __device__ static inline float calculate_w(int x, int y, int r, int c,
|
||||
float sigma_space, float sigma_color)
|
||||
{
|
||||
float first_term, second_term;
|
||||
first_term = (powf(x - r, 2) + powf(y - c, 2)) / (2 * sigma_space * sigma_space);
|
||||
first_term = (__powf(x - r, 2) + __powf(y - c, 2)) / (2 * sigma_space * sigma_space);
|
||||
second_term = norm_squared(pixel_value, neighbor_value) / (2 * sigma_color * sigma_color);
|
||||
return __expf(-first_term - second_term);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user