You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
15143 lines
550 KiB
15143 lines
550 KiB
#pragma clang diagnostic ignored "-Weverything"
|
|
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h"
|
|
# 1 "<built-in>" 1
|
|
# 1 "<built-in>" 3
|
|
# 845 "<built-in>" 3
|
|
# 1 "<command line>" 1
|
|
# 1 "<built-in>" 2
|
|
# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 1 3
|
|
# 33 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
|
|
extern "C" {
|
|
__attribute__((__visibility__("default")))
|
|
__attribute__((weak))
|
|
__attribute__((noreturn))
|
|
__attribute__((device)) void __cxa_pure_virtual(void) {
|
|
__builtin_trap();
|
|
}
|
|
__attribute__((__visibility__("default")))
|
|
__attribute__((weak))
|
|
__attribute__((noreturn))
|
|
__attribute__((device)) void __cxa_deleted_virtual(void) {
|
|
__builtin_trap();
|
|
}
|
|
}
|
|
# 57 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
|
|
typedef long unsigned int size_t;
|
|
# 74 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
|
|
typedef long unsigned int __hip_size_t;
|
|
|
|
|
|
extern "C" {
|
|
|
|
|
|
|
|
extern "C" __attribute__((device)) unsigned long long __ockl_dm_alloc(unsigned long long __size);
|
|
extern "C" __attribute__((device)) void __ockl_dm_dealloc(unsigned long long __addr);
|
|
# 95 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
|
|
__attribute__((weak)) inline __attribute__((device)) void *malloc(__hip_size_t __size) {
|
|
return (void *) __ockl_dm_alloc(__size);
|
|
}
|
|
__attribute__((weak)) inline __attribute__((device)) void free(void *__ptr) {
|
|
__ockl_dm_dealloc((unsigned long long)__ptr);
|
|
}
|
|
# 124 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
|
|
}
|
|
|
|
|
|
# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_libdevice_declares.h" 1 3
|
|
# 14 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_libdevice_declares.h" 3
|
|
extern "C" {
|
|
|
|
|
|
|
|
__attribute__((device)) __attribute__((const)) float __ocml_acos_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_acosh_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_asin_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_asinh_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_atan2_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_atan_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_atanh_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_cbrt_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_ceil_f32(float);
|
|
__attribute__((device)) __attribute__((const)) __attribute__((device)) float __ocml_copysign_f32(float,
|
|
float);
|
|
__attribute__((device)) float __ocml_cos_f32(float);
|
|
__attribute__((device)) float __ocml_native_cos_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) __attribute__((device)) float __ocml_cosh_f32(float);
|
|
__attribute__((device)) float __ocml_cospi_f32(float);
|
|
__attribute__((device)) float __ocml_i0_f32(float);
|
|
__attribute__((device)) float __ocml_i1_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_erfc_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_erfcinv_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_erfcx_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_erf_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_erfinv_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_exp10_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_native_exp10_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_exp2_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_exp_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_native_exp_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_expm1_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_fabs_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_fdim_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_floor_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_fma_f32(float, float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_fmax_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_fmin_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) __attribute__((device)) float __ocml_fmod_f32(float,
|
|
float);
|
|
__attribute__((device)) float __ocml_frexp_f32(float,
|
|
__attribute__((address_space(5))) int *);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_hypot_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_ilogb_f32(float);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_isfinite_f32(float);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_isinf_f32(float);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_isnan_f32(float);
|
|
__attribute__((device)) float __ocml_j0_f32(float);
|
|
__attribute__((device)) float __ocml_j1_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_ldexp_f32(float, int);
|
|
__attribute__((device)) float __ocml_lgamma_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_log10_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_native_log10_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_log1p_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_log2_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_native_log2_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_logb_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_log_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_native_log_f32(float);
|
|
__attribute__((device)) float __ocml_modf_f32(float,
|
|
__attribute__((address_space(5))) float *);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_nearbyint_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_nextafter_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_len3_f32(float, float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_len4_f32(float, float, float,
|
|
float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_ncdf_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_ncdfinv_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_pow_f32(float, float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_pown_f32(float, int);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_rcbrt_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_remainder_f32(float, float);
|
|
__attribute__((device)) float __ocml_remquo_f32(float, float,
|
|
__attribute__((address_space(5))) int *);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_rhypot_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_rint_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
|
|
float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_round_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_rsqrt_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_scalb_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_scalbn_f32(float, int);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_signbit_f32(float);
|
|
__attribute__((device)) float __ocml_sincos_f32(float,
|
|
__attribute__((address_space(5))) float *);
|
|
__attribute__((device)) float __ocml_sincospi_f32(float,
|
|
__attribute__((address_space(5))) float *);
|
|
__attribute__((device)) float __ocml_sin_f32(float);
|
|
__attribute__((device)) float __ocml_native_sin_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_sinh_f32(float);
|
|
__attribute__((device)) float __ocml_sinpi_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_native_sqrt_f32(float);
|
|
__attribute__((device)) float __ocml_tan_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) float __ocml_tanh_f32(float);
|
|
__attribute__((device)) float __ocml_tgamma_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_trunc_f32(float);
|
|
__attribute__((device)) float __ocml_y0_f32(float);
|
|
__attribute__((device)) float __ocml_y1_f32(float);
|
|
|
|
|
|
__attribute__((device)) __attribute__((const)) float __ocml_add_rte_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_add_rtn_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_add_rtp_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_add_rtz_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_sub_rte_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_mul_rte_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_div_rte_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_div_rtn_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_div_rtp_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_div_rtz_f32(float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rte_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
|
|
__attribute__((device)) __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
|
|
|
|
__attribute__((device)) inline __attribute__((const)) float
|
|
__llvm_amdgcn_cos_f32(float __x) {
|
|
return __builtin_amdgcn_cosf(__x);
|
|
}
|
|
__attribute__((device)) inline __attribute__((const)) float
|
|
__llvm_amdgcn_rcp_f32(float __x) {
|
|
return __builtin_amdgcn_rcpf(__x);
|
|
}
|
|
__attribute__((device)) inline __attribute__((const)) float
|
|
__llvm_amdgcn_rsq_f32(float __x) {
|
|
return __builtin_amdgcn_rsqf(__x);
|
|
}
|
|
__attribute__((device)) inline __attribute__((const)) float
|
|
__llvm_amdgcn_sin_f32(float __x) {
|
|
return __builtin_amdgcn_sinf(__x);
|
|
}
|
|
|
|
|
|
|
|
|
|
__attribute__((device)) __attribute__((const)) double __ocml_acos_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_acosh_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_asin_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_asinh_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_atan2_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_atan_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_atanh_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_cbrt_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_ceil_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_copysign_f64(double, double);
|
|
__attribute__((device)) double __ocml_cos_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_cosh_f64(double);
|
|
__attribute__((device)) double __ocml_cospi_f64(double);
|
|
__attribute__((device)) double __ocml_i0_f64(double);
|
|
__attribute__((device)) double __ocml_i1_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_erfc_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_erfcinv_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_erfcx_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_erf_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_erfinv_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_exp10_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_exp2_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_exp_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_expm1_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_fabs_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_fdim_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_floor_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_fma_f64(double, double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_fmax_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_fmin_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_fmod_f64(double, double);
|
|
__attribute__((device)) double __ocml_frexp_f64(double,
|
|
__attribute__((address_space(5))) int *);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_hypot_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_ilogb_f64(double);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_isfinite_f64(double);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_isinf_f64(double);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_isnan_f64(double);
|
|
__attribute__((device)) double __ocml_j0_f64(double);
|
|
__attribute__((device)) double __ocml_j1_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_ldexp_f64(double, int);
|
|
__attribute__((device)) double __ocml_lgamma_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_log10_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_log1p_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_log2_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_logb_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_log_f64(double);
|
|
__attribute__((device)) double __ocml_modf_f64(double,
|
|
__attribute__((address_space(5))) double *);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_nearbyint_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_nextafter_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_len3_f64(double, double,
|
|
double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_len4_f64(double, double, double,
|
|
double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_ncdf_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_ncdfinv_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_pow_f64(double, double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_pown_f64(double, int);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_rcbrt_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_remainder_f64(double, double);
|
|
__attribute__((device)) double __ocml_remquo_f64(double, double,
|
|
__attribute__((address_space(5))) int *);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_rhypot_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_rint_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_rlen3_f64(double, double,
|
|
double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_rlen4_f64(double, double,
|
|
double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_round_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_rsqrt_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_scalb_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_scalbn_f64(double, int);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_signbit_f64(double);
|
|
__attribute__((device)) double __ocml_sincos_f64(double,
|
|
__attribute__((address_space(5))) double *);
|
|
__attribute__((device)) double
|
|
__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *);
|
|
__attribute__((device)) double __ocml_sin_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_sinh_f64(double);
|
|
__attribute__((device)) double __ocml_sinpi_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_f64(double);
|
|
__attribute__((device)) double __ocml_tan_f64(double);
|
|
__attribute__((device)) __attribute__((pure)) double __ocml_tanh_f64(double);
|
|
__attribute__((device)) double __ocml_tgamma_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_trunc_f64(double);
|
|
__attribute__((device)) double __ocml_y0_f64(double);
|
|
__attribute__((device)) double __ocml_y1_f64(double);
|
|
|
|
|
|
__attribute__((device)) __attribute__((const)) double __ocml_add_rte_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_add_rtn_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_add_rtp_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_add_rtz_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_sub_rte_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_mul_rte_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_div_rte_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_div_rtn_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_div_rtp_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_div_rtz_f64(double, double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rte_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_fma_rte_f64(double, double,
|
|
double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
|
|
double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
|
|
double);
|
|
__attribute__((device)) __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
|
|
double);
|
|
|
|
__attribute__((device)) inline __attribute__((const)) double
|
|
__llvm_amdgcn_rcp_f64(double __x) {
|
|
return __builtin_amdgcn_rcp(__x);
|
|
}
|
|
__attribute__((device)) inline __attribute__((const)) double
|
|
__llvm_amdgcn_rsq_f64(double __x) {
|
|
return __builtin_amdgcn_rsq(__x);
|
|
}
|
|
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
|
|
__attribute__((device)) _Float16 __ocml_cos_f16(_Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
|
|
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
|
|
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
|
|
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
|
|
_Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_isinf_f16(_Float16);
|
|
__attribute__((device)) __attribute__((const)) int __ocml_isnan_f16(_Float16);
|
|
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
|
|
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
|
|
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
|
|
__attribute__((device)) _Float16 __ocml_sin_f16(_Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
|
|
__attribute__((device)) __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
|
|
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
|
|
|
|
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
|
|
typedef short __2i16 __attribute__((ext_vector_type(2)));
|
|
|
|
|
|
__attribute__((device)) __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
|
|
float c, bool s);
|
|
|
|
|
|
|
|
|
|
__attribute__((device)) __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
|
|
__attribute__((device)) __2f16 __ocml_cos_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((const))
|
|
__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
|
|
__attribute__((device)) __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
|
|
__attribute__((device)) inline __2f16
|
|
__llvm_amdgcn_rcp_2f16(__2f16 __x)
|
|
{
|
|
return (__2f16)(__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y));
|
|
}
|
|
__attribute__((device)) __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
|
|
__attribute__((device)) __2f16 __ocml_sin_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
|
|
__attribute__((device)) __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16);
|
|
|
|
|
|
}
|
|
# 128 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
|
|
# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 1 3
|
|
# 94 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long unsigned int __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) {
|
|
long unsigned int __r = 0;
|
|
while (*__tagp != '\0') {
|
|
char __tmp = *__tagp;
|
|
|
|
if (__tmp >= '0' && __tmp <= '7')
|
|
__r = (__r * 8u) + __tmp - '0';
|
|
else
|
|
return 0;
|
|
|
|
++__tagp;
|
|
}
|
|
|
|
return __r;
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long unsigned int __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) {
|
|
long unsigned int __r = 0;
|
|
while (*__tagp != '\0') {
|
|
char __tmp = *__tagp;
|
|
|
|
if (__tmp >= '0' && __tmp <= '9')
|
|
__r = (__r * 10u) + __tmp - '0';
|
|
else
|
|
return 0;
|
|
|
|
++__tagp;
|
|
}
|
|
|
|
return __r;
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long unsigned int __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) {
|
|
long unsigned int __r = 0;
|
|
while (*__tagp != '\0') {
|
|
char __tmp = *__tagp;
|
|
|
|
if (__tmp >= '0' && __tmp <= '9')
|
|
__r = (__r * 16u) + __tmp - '0';
|
|
else if (__tmp >= 'a' && __tmp <= 'f')
|
|
__r = (__r * 16u) + __tmp - 'a' + 10;
|
|
else if (__tmp >= 'A' && __tmp <= 'F')
|
|
__r = (__r * 16u) + __tmp - 'A' + 10;
|
|
else
|
|
return 0;
|
|
|
|
++__tagp;
|
|
}
|
|
|
|
return __r;
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long unsigned int __make_mantissa(const char *__tagp __attribute__((nonnull))) {
|
|
if (*__tagp == '0') {
|
|
++__tagp;
|
|
|
|
if (*__tagp == 'x' || *__tagp == 'X')
|
|
return __make_mantissa_base16(__tagp);
|
|
else
|
|
return __make_mantissa_base8(__tagp);
|
|
}
|
|
|
|
return __make_mantissa_base10(__tagp);
|
|
}
|
|
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
int abs(int __x) {
|
|
int __sgn = __x >> (sizeof(int) * 8 - 1);
|
|
return (__x ^ __sgn) - __sgn;
|
|
}
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long labs(long __x) {
|
|
long __sgn = __x >> (sizeof(long) * 8 - 1);
|
|
return (__x ^ __sgn) - __sgn;
|
|
}
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long long llabs(long long __x) {
|
|
long long __sgn = __x >> (sizeof(long long) * 8 - 1);
|
|
return (__x ^ __sgn) - __sgn;
|
|
}
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float acosf(float __x) { return __ocml_acos_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float acoshf(float __x) { return __ocml_acosh_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float asinf(float __x) { return __ocml_asin_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float asinhf(float __x) { return __ocml_asinh_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float atanf(float __x) { return __ocml_atan_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float atanhf(float __x) { return __ocml_atanh_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float cbrtf(float __x) { return __ocml_cbrt_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float ceilf(float __x) { return __ocml_ceil_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float copysignf(float __x, float __y) { return __ocml_copysign_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float cosf(float __x) { return __ocml_cos_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float coshf(float __x) { return __ocml_cosh_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float cospif(float __x) { return __ocml_cospi_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float erfcf(float __x) { return __ocml_erfc_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float erfcxf(float __x) { return __ocml_erfcx_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float erff(float __x) { return __ocml_erf_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float erfinvf(float __x) { return __ocml_erfinv_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float exp10f(float __x) { return __ocml_exp10_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float exp2f(float __x) { return __ocml_exp2_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float expf(float __x) { return __ocml_exp_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float expm1f(float __x) { return __ocml_expm1_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float fabsf(float __x) { return __builtin_fabsf(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Fast single-precision division: evaluates the plain quotient __x / __y.
float fdividef(float __x, float __y) {
  float __quot = __x / __y;
  return __quot;
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float floorf(float __x) { return __ocml_floor_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float fmaf(float __x, float __y, float __z) {
|
|
return __ocml_fma_f32(__x, __y, __z);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Decompose __x into a normalized fraction (returned) and a power-of-two
// exponent stored through __nptr — C library frexp semantics.
float frexpf(float __x, int *__nptr) {
  // OCML takes a private (address_space(5)) out-pointer, so the exponent is
  // staged in a stack temporary instead of passing the generic __nptr through.
  int __tmp;
  float __r =
      __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
  *__nptr = __tmp;

  return __r;
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
int ilogbf(float __x) { return __ocml_ilogb_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
bool __finitef(float __x) { return __ocml_isfinite_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
bool __isinff(float __x) { return __ocml_isinf_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
bool __isnanf(float __x) { return __ocml_isnan_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float j0f(float __x) { return __ocml_j0_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float j1f(float __x) { return __ocml_j1_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Bessel function of the first kind of integer order __n, single precision.
// Orders 0 and 1 come straight from OCML; higher orders use the forward
// recurrence J(i+1, x) = (2i / x) * J(i, x) - J(i-1, x).
// NOTE(review): forward recurrence for J is numerically unstable when
// __n is large relative to __x, and a negative __n skips the loop and
// returns J1 — this mirrors upstream __clang_hip_math.h.
float jnf(int __n, float __x) {
  if (__n == 0)
    return j0f(__x);
  if (__n == 1)
    return j1f(__x);

  float __x0 = j0f(__x);   // J(i-1, x)
  float __x1 = j1f(__x);   // J(i, x)
  for (int __i = 1; __i < __n; ++__i) {
    float __x2 = (2 * __i) / __x * __x1 - __x0;  // J(i+1, x)
    __x0 = __x1;
    __x1 = __x2;
  }

  return __x1;
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float lgammaf(float __x) { return __ocml_lgamma_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long long int llrintf(float __x) { return __ocml_rint_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long long int llroundf(float __x) { return __ocml_round_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float log10f(float __x) { return __ocml_log10_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float log1pf(float __x) { return __ocml_log1p_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float log2f(float __x) { return __ocml_log2_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float log2fi(int __x) { return __ocml_log2_f32((float) __x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float logbf(float __x) { return __ocml_logb_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float logf(float __x) { return __ocml_log_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long int lrintf(float __x) { return __ocml_rint_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long int lroundf(float __x) { return __ocml_round_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float modff(float __x, float *__iptr) {
|
|
float __tmp;
|
|
|
|
|
|
|
|
float __r =
|
|
__ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
|
|
*__iptr = __tmp;
|
|
return __r;
|
|
}
|
|
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Build a single-precision quiet NaN whose mantissa payload is parsed from
// __tagp (via __make_mantissa). The bit layout is spelled out explicitly so
// the payload bits can be set — <math.h> NAN carries no payload.
float nanf(const char *__tagp __attribute__((nonnull))) {
  union {
    float val;
    struct ieee_float {              // IEEE-754 binary32, LSB-first fields
      unsigned int mantissa : 22;    // payload bits below the quiet bit
      unsigned int quiet : 1;        // quiet-NaN flag (top mantissa bit)
      unsigned int exponent : 8;
      unsigned int sign : 1;
    } bits;
  } __tmp;
  static_assert((sizeof(__tmp.val)) == (sizeof(__tmp.bits)), "");

  __tmp.bits.sign = 0u;
  __tmp.bits.exponent = ~0u;         // all-ones exponent marks Inf/NaN
  __tmp.bits.quiet = 1u;
  __tmp.bits.mantissa = __make_mantissa(__tagp);

  return __tmp.val;
}
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float nextafterf(float __x, float __y) {
|
|
return __ocml_nextafter_f32(__x, __y);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float norm3df(float __x, float __y, float __z) {
|
|
return __ocml_len3_f32(__x, __y, __z);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float norm4df(float __x, float __y, float __z, float __w) {
|
|
return __ocml_len4_f32(__x, __y, __z, __w);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float normcdff(float __x) { return __ocml_ncdf_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Euclidean norm of the __dim-element vector __a: sqrt of the sum of
// squares. NOTE(review): accumulates squares directly, so it can
// overflow/underflow where hypot-style scaling would not — mirrors
// upstream __clang_hip_math.h.
float normf(int __dim,
            const float *__a) {
  float __r = 0;
  while (__dim--) {
    __r += __a[0] * __a[0];
    ++__a;
  }

  return __ocml_sqrt_f32(__r);
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Integer exponentiation by repeated squaring.
// A negative exponent is not representable in integer arithmetic; matching
// the original contract, -1 is returned in that case.
int powii(int __base, int __exp) {
  if (__exp < 0)
    return -1;

  int __acc = 1;
  while (__exp != 0) {
    if (__exp & 1)
      __acc *= __base;        // fold in the current exponent bit
    __exp >>= 1;
    if (__exp != 0)
      __base *= __base;       // square only while bits remain (as original)
  }
  return __acc;
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float remainderf(float __x, float __y) {
|
|
return __ocml_remainder_f32(__x, __y);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// IEEE remainder of __x/__y; also stores the low bits of the integral
// quotient through __quo — C library remquo semantics.
float remquof(float __x, float __y, int *__quo) {
  // Stage the quotient in a stack temporary: OCML wants an
  // address_space(5) (private) pointer, not the generic __quo.
  int __tmp;
  float __r = __ocml_remquo_f32(
      __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
  *__quo = __tmp;

  return __r;
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float rintf(float __x) { return __ocml_rint_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float rnorm3df(float __x, float __y, float __z) {
|
|
return __ocml_rlen3_f32(__x, __y, __z);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float rnorm4df(float __x, float __y, float __z, float __w) {
|
|
return __ocml_rlen4_f32(__x, __y, __z, __w);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float rnormf(int __dim,
|
|
const float *__a) {
|
|
float __r = 0;
|
|
while (__dim--) {
|
|
__r += __a[0] * __a[0];
|
|
++__a;
|
|
}
|
|
|
|
return __ocml_rsqrt_f32(__r);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float roundf(float __x) { return __ocml_round_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float scalblnf(float __x, long int __n) {
|
|
return (__n < 9223372036854775807L) ? __ocml_scalbn_f32(__x, __n)
|
|
: __ocml_scalb_f32(__x, __n);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
bool __signbitf(float __x) { return __ocml_signbit_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Compute sin and cos of __x with one OCML call: the sine is the return
// value, the cosine comes back through an out-parameter.
void sincosf(float __x, float *__sinptr, float *__cosptr) {
  // OCML's out-parameter must be an address_space(5) (private) pointer, so
  // the cosine is staged in a stack temporary rather than using __cosptr.
  float __tmp;
  *__sinptr =
      __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
  *__cosptr = __tmp;
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
void sincospif(float __x, float *__sinptr, float *__cosptr) {
|
|
float __tmp;
|
|
|
|
|
|
|
|
*__sinptr = __ocml_sincospi_f32(
|
|
__x, (__attribute__((address_space(5))) float *)&__tmp);
|
|
*__cosptr = __tmp;
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float sinf(float __x) { return __ocml_sin_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float sinhf(float __x) { return __ocml_sinh_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float sinpif(float __x) { return __ocml_sinpi_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float sqrtf(float __x) { return __ocml_sqrt_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float tanf(float __x) { return __ocml_tan_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float tanhf(float __x) { return __ocml_tanh_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float tgammaf(float __x) { return __ocml_tgamma_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float truncf(float __x) { return __ocml_trunc_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float y0f(float __x) { return __ocml_y0_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float y1f(float __x) { return __ocml_y1_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float ynf(int __n, float __x) {
|
|
|
|
|
|
|
|
|
|
if (__n == 0)
|
|
return y0f(__x);
|
|
if (__n == 1)
|
|
return y1f(__x);
|
|
|
|
float __x0 = y0f(__x);
|
|
float __x1 = y1f(__x);
|
|
for (int __i = 1; __i < __n; ++__i) {
|
|
float __x2 = (2 * __i) / __x * __x1 - __x0;
|
|
__x0 = __x1;
|
|
__x1 = __x2;
|
|
}
|
|
|
|
return __x1;
|
|
}
|
|
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __expf(float __x) { return __ocml_native_exp_f32(__x); }
|
|
# 627 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Round-to-nearest single-precision add; maps directly onto the '+'
// operator, which already rounds to nearest-even on the device.
float __fadd_rn(float __x, float __y) {
  float __sum = __x + __y;
  return __sum;
}
|
|
# 641 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __fdiv_rn(float __x, float __y) { return __x / __y; }
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __fdividef(float __x, float __y) { return __x / __y; }
|
|
# 666 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __fmaf_rn(float __x, float __y, float __z) {
|
|
return __ocml_fma_f32(__x, __y, __z);
|
|
}
|
|
# 682 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __fmul_rn(float __x, float __y) { return __x * __y; }
|
|
# 696 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Round-to-nearest single-precision reciprocal, expressed as an ordinary
// IEEE division of 1.0f by __x.
float __frcp_rn(float __x) {
  const float __one = 1.0f;
  return __one / __x;
}
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); }
|
|
# 713 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
|
|
# 727 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __fsub_rn(float __x, float __y) { return __x - __y; }
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __log10f(float __x) { return __ocml_native_log10_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __log2f(float __x) { return __ocml_native_log2_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __logf(float __x) { return __ocml_native_log_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Clamp __x to the unit interval [0, 1]. A NaN input fails both
// comparisons and is returned unchanged, exactly as the original nested
// conditional expression behaved.
float __saturatef(float __x) {
  if (__x < 0)
    return 0;
  if (__x > 1)
    return 1;
  return __x;
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
void __sincosf(float __x, float *__sinptr, float *__cosptr) {
|
|
*__sinptr = __ocml_native_sin_f32(__x);
|
|
*__cosptr = __ocml_native_cos_f32(__x);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
float __tanf(float __x) { return __ocml_tan_f32(__x); }
|
|
|
|
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double acos(double __x) { return __ocml_acos_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double acosh(double __x) { return __ocml_acosh_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double asin(double __x) { return __ocml_asin_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double asinh(double __x) { return __ocml_asinh_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double atan(double __x) { return __ocml_atan_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double atanh(double __x) { return __ocml_atanh_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double cbrt(double __x) { return __ocml_cbrt_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double ceil(double __x) { return __ocml_ceil_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double copysign(double __x, double __y) {
|
|
return __ocml_copysign_f64(__x, __y);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double cos(double __x) { return __ocml_cos_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double cosh(double __x) { return __ocml_cosh_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double cospi(double __x) { return __ocml_cospi_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double erf(double __x) { return __ocml_erf_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double erfc(double __x) { return __ocml_erfc_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double erfcx(double __x) { return __ocml_erfcx_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double erfinv(double __x) { return __ocml_erfinv_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double exp(double __x) { return __ocml_exp_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double exp10(double __x) { return __ocml_exp10_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double exp2(double __x) { return __ocml_exp2_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double expm1(double __x) { return __ocml_expm1_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double fabs(double __x) { return __builtin_fabs(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double floor(double __x) { return __ocml_floor_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double fma(double __x, double __y, double __z) {
|
|
return __ocml_fma_f64(__x, __y, __z);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double frexp(double __x, int *__nptr) {
|
|
int __tmp;
|
|
|
|
|
|
|
|
double __r =
|
|
__ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
|
|
*__nptr = __tmp;
|
|
return __r;
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
int ilogb(double __x) { return __ocml_ilogb_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
bool __finite(double __x) { return __ocml_isfinite_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
bool __isinf(double __x) { return __ocml_isinf_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
bool __isnan(double __x) { return __ocml_isnan_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double j0(double __x) { return __ocml_j0_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double j1(double __x) { return __ocml_j1_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Bessel function of the first kind of integer order __n, double precision.
// Orders 0 and 1 come straight from OCML; higher orders use the forward
// recurrence J(i+1, x) = (2i / x) * J(i, x) - J(i-1, x).
// NOTE(review): forward recurrence for J is numerically unstable when
// __n is large relative to __x, and a negative __n skips the loop and
// returns J1 — mirrors upstream __clang_hip_math.h.
double jn(int __n, double __x) {
  if (__n == 0)
    return j0(__x);
  if (__n == 1)
    return j1(__x);

  double __x0 = j0(__x);   // J(i-1, x)
  double __x1 = j1(__x);   // J(i, x)
  for (int __i = 1; __i < __n; ++__i) {
    double __x2 = (2 * __i) / __x * __x1 - __x0;  // J(i+1, x)
    __x0 = __x1;
    __x1 = __x2;
  }
  return __x1;
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double lgamma(double __x) { return __ocml_lgamma_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long long int llrint(double __x) { return __ocml_rint_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long long int llround(double __x) { return __ocml_round_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double log(double __x) { return __ocml_log_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double log10(double __x) { return __ocml_log10_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double log1p(double __x) { return __ocml_log1p_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double log2(double __x) { return __ocml_log2_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double logb(double __x) { return __ocml_logb_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long int lrint(double __x) { return __ocml_rint_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
long int lround(double __x) { return __ocml_round_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Split __x into fractional (returned) and integral parts — C library modf
// semantics. The integral part is staged in a stack temporary because OCML
// requires an address_space(5) (private) out-pointer, then copied to
// __iptr.
double modf(double __x, double *__iptr) {
  double __tmp;
  double __r =
      __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
  *__iptr = __tmp;

  return __r;
}
|
|
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Build a double-precision quiet NaN whose mantissa payload is parsed from
// __tagp (via __make_mantissa). The bit layout is spelled out explicitly so
// the payload bits can be set.
double nan(const char *__tagp) {
  union {
    double val;
    struct ieee_double {                  // IEEE-754 binary64, LSB-first
      long unsigned int mantissa : 51;    // payload bits below the quiet bit
      unsigned int quiet : 1;             // quiet-NaN flag (top mantissa bit)
      unsigned int exponent : 11;
      unsigned int sign : 1;
    } bits;
  } __tmp;
  static_assert((sizeof(__tmp.val)) == (sizeof(__tmp.bits)), "");

  __tmp.bits.sign = 0u;
  __tmp.bits.exponent = ~0u;              // all-ones exponent marks Inf/NaN
  __tmp.bits.quiet = 1u;
  __tmp.bits.mantissa = __make_mantissa(__tagp);

  return __tmp.val;
}
|
|
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double nextafter(double __x, double __y) {
|
|
return __ocml_nextafter_f64(__x, __y);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double norm(int __dim,
|
|
const double *__a) {
|
|
double __r = 0;
|
|
while (__dim--) {
|
|
__r += __a[0] * __a[0];
|
|
++__a;
|
|
}
|
|
|
|
return __ocml_sqrt_f64(__r);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double norm3d(double __x, double __y, double __z) {
|
|
return __ocml_len3_f64(__x, __y, __z);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double norm4d(double __x, double __y, double __z, double __w) {
|
|
return __ocml_len4_f64(__x, __y, __z, __w);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double normcdf(double __x) { return __ocml_ncdf_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double remainder(double __x, double __y) {
|
|
return __ocml_remainder_f64(__x, __y);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double remquo(double __x, double __y, int *__quo) {
|
|
int __tmp;
|
|
|
|
|
|
|
|
double __r = __ocml_remquo_f64(
|
|
__x, __y, (__attribute__((address_space(5))) int *)&__tmp);
|
|
*__quo = __tmp;
|
|
|
|
return __r;
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double rint(double __x) { return __ocml_rint_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double rnorm(int __dim,
|
|
const double *__a) {
|
|
double __r = 0;
|
|
while (__dim--) {
|
|
__r += __a[0] * __a[0];
|
|
++__a;
|
|
}
|
|
|
|
return __ocml_rsqrt_f64(__r);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double rnorm3d(double __x, double __y, double __z) {
|
|
return __ocml_rlen3_f64(__x, __y, __z);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double rnorm4d(double __x, double __y, double __z, double __w) {
|
|
return __ocml_rlen4_f64(__x, __y, __z, __w);
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double round(double __x) { return __ocml_round_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double scalbln(double __x, long int __n) {
|
|
return (__n < 9223372036854775807L) ? __ocml_scalbn_f64(__x, __n)
|
|
: __ocml_scalb_f64(__x, __n);
|
|
}
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double scalbn(double __x, int __n) { return __ocml_scalbn_f64(__x, __n); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
bool __signbit(double __x) { return __ocml_signbit_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double sin(double __x) { return __ocml_sin_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
void sincos(double __x, double *__sinptr, double *__cosptr) {
|
|
double __tmp;
|
|
|
|
|
|
|
|
*__sinptr = __ocml_sincos_f64(
|
|
__x, (__attribute__((address_space(5))) double *)&__tmp);
|
|
*__cosptr = __tmp;
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
void sincospi(double __x, double *__sinptr, double *__cosptr) {
|
|
double __tmp;
|
|
|
|
|
|
|
|
*__sinptr = __ocml_sincospi_f64(
|
|
__x, (__attribute__((address_space(5))) double *)&__tmp);
|
|
*__cosptr = __tmp;
|
|
}
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double sinh(double __x) { return __ocml_sinh_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double sinpi(double __x) { return __ocml_sinpi_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double sqrt(double __x) { return __ocml_sqrt_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double tan(double __x) { return __ocml_tan_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double tanh(double __x) { return __ocml_tanh_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double tgamma(double __x) { return __ocml_tgamma_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double trunc(double __x) { return __ocml_trunc_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double y0(double __x) { return __ocml_y0_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double y1(double __x) { return __ocml_y1_f64(__x); }
|
|
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Bessel function of the second kind of integer order __n, double
// precision. Orders 0 and 1 come from OCML; higher orders use the forward
// recurrence Y(i+1, x) = (2i / x) * Y(i, x) - Y(i-1, x).
// NOTE(review): a negative __n skips the loop and returns Y1 — mirrors
// upstream __clang_hip_math.h.
double yn(int __n, double __x) {
  if (__n == 0)
    return y0(__x);
  if (__n == 1)
    return y1(__x);

  double __x0 = y0(__x);   // Y(i-1, x)
  double __x1 = y1(__x);   // Y(i, x)
  for (int __i = 1; __i < __n; ++__i) {
    double __x2 = (2 * __i) / __x * __x1 - __x0;  // Y(i+1, x)
    __x0 = __x1;
    __x1 = __x2;
  }

  return __x1;
}
|
|
# 1190 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double __dadd_rn(double __x, double __y) { return __x + __y; }
|
|
# 1212 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double __ddiv_rn(double __x, double __y) { return __x / __y; }
|
|
# 1234 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double __dmul_rn(double __x, double __y) { return __x * __y; }
|
|
# 1248 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double __drcp_rn(double __x) { return 1.0 / __x; }
|
|
# 1262 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
|
|
# 1284 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
// Round-to-nearest double-precision subtract; maps directly onto the '-'
// operator, which already rounds to nearest-even on the device.
double __dsub_rn(double __x, double __y) {
  double __diff = __x - __y;
  return __diff;
}
|
|
# 1306 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
double __fma_rn(double __x, double __y, double __z) {
|
|
return __ocml_fma_f64(__x, __y, __z);
|
|
}
|
|
# 1325 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
|
|
// Generic minimum of two values of the same type; on a tie the second
// argument is returned (the comparison is strict '<').
template <class T>
static __attribute__((device)) inline __attribute__((always_inline))
T min(T __lhs, T __rhs) {
  return (__lhs < __rhs) ? __lhs : __rhs;
}
|
|
|
|
// Generic maximum of two values of the same type; on a tie the second
// argument is returned (the comparison is strict '>').
template <class T>
static __attribute__((device)) inline __attribute__((always_inline))
T max(T __lhs, T __rhs) {
  return (__lhs > __rhs) ? __lhs : __rhs;
}
|
|
|
|
|
|
// Integer minimum overload (keeps int/int calls from going through the
// template above).
static __attribute__((device)) inline __attribute__((always_inline))
int min(int __lhs, int __rhs) {
  return (__lhs < __rhs) ? __lhs : __rhs;
}
|
|
// Integer maximum overload (keeps int/int calls from going through the
// template above).
static __attribute__((device)) inline __attribute__((always_inline))
int max(int __lhs, int __rhs) {
  return (__lhs > __rhs) ? __lhs : __rhs;
}
|
|
|
|
// Single-precision maximum, delegating to fmaxf.
static __attribute__((device)) inline __attribute__((always_inline))
float max(float __a, float __b) {
  return fmaxf(__a, __b);
}
|
|
|
|
// Double-precision maximum, delegating to fmax.
static __attribute__((device)) inline __attribute__((always_inline))
double max(double __a, double __b) {
  return fmax(__a, __b);
}
|
|
|
|
// Single-precision minimum, delegating to fminf.
static __attribute__((device)) inline __attribute__((always_inline))
float min(float __a, float __b) {
  return fminf(__a, __b);
}
|
|
|
|
// Double-precision minimum, delegating to fmin.
static __attribute__((device)) inline __attribute__((always_inline))
double min(double __a, double __b) {
  return fmin(__a, __b);
}
|
|
# 129 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
|
|
# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_stdlib.h" 1 3
|
|
# 130 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
|
|
|
|
|
|
# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 1 3
|
|
# 41 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline)) double abs(double __x) { return ::fabs(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float abs(float __x) { return ::fabsf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) long long abs(long long __n) { return ::llabs(__n); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) long abs(long __n) { return ::labs(__n); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float fma(float __x, float __y, float __z) {
|
|
return ::fmaf(__x, __y, __z);
|
|
}
|
|
# 61 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float frexp(float __arg, int *__exp) {
|
|
return ::frexpf(__arg, __exp);
|
|
}
|
|
# 93 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline)) bool isinf(float __x) { return ::__isinff(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) bool isinf(double __x) { return ::__isinf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) bool isfinite(float __x) { return ::__finitef(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) bool isfinite(double __x) { return ::__finite(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) bool isnan(float __x) { return ::__isnanf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) bool isnan(double __x) { return ::__isnan(__x); }
|
|
|
|
|
|
|
|
|
|
|
|
// Quiet (non-signalling) FP comparison and classification overloads,
// each forwarded to the corresponding compiler builtin.
static __attribute__((device)) inline __attribute__((always_inline))
bool isgreater(float __a, float __b) { return __builtin_isgreater(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool isgreater(double __a, double __b) { return __builtin_isgreater(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool isgreaterequal(float __a, float __b) { return __builtin_isgreaterequal(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool isgreaterequal(double __a, double __b) { return __builtin_isgreaterequal(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool isless(float __a, float __b) { return __builtin_isless(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool isless(double __a, double __b) { return __builtin_isless(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool islessequal(float __a, float __b) { return __builtin_islessequal(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool islessequal(double __a, double __b) { return __builtin_islessequal(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool islessgreater(float __a, float __b) { return __builtin_islessgreater(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool islessgreater(double __a, double __b) { return __builtin_islessgreater(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool isnormal(float __v) { return __builtin_isnormal(__v); }

static __attribute__((device)) inline __attribute__((always_inline))
bool isnormal(double __v) { return __builtin_isnormal(__v); }

static __attribute__((device)) inline __attribute__((always_inline))
bool isunordered(float __a, float __b) { return __builtin_isunordered(__a, __b); }

static __attribute__((device)) inline __attribute__((always_inline))
bool isunordered(double __a, double __b) { return __builtin_isunordered(__a, __b); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float modf(float __x, float *__iptr) {
|
|
return ::modff(__x, __iptr);
|
|
}
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float pow(float __base, int __iexp) {
|
|
return ::powif(__base, __iexp);
|
|
}
|
|
static __attribute__((device)) inline __attribute__((always_inline)) double pow(double __base, int __iexp) {
|
|
return ::powi(__base, __iexp);
|
|
}
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float remquo(float __x, float __y, int *__quo) {
|
|
return ::remquof(__x, __y, __quo);
|
|
}
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float scalbln(float __x, long int __n) {
|
|
return ::scalblnf(__x, __n);
|
|
}
|
|
static __attribute__((device)) inline __attribute__((always_inline)) bool signbit(float __x) { return ::__signbitf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) bool signbit(double __x) { return ::__signbit(__x); }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Half-precision (_Float16) overloads, forwarded to the OCML f16 helpers.
static __attribute__((device)) inline __attribute__((always_inline))
_Float16 fma(_Float16 __a, _Float16 __b, _Float16 __c) {
  return __ocml_fma_f16(__a, __b, __c);
}

static __attribute__((device)) inline __attribute__((always_inline))
_Float16 pow(_Float16 __base, int __iexp) {
  return __ocml_pown_f16(__base, __iexp);
}
|
|
# 202 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float acos(float __x) { return acosf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float acosh(float __x) { return acoshf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float asin(float __x) { return asinf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float asinh(float __x) { return asinhf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float atan(float __x) { return atanf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float atan2(float __x, float __y) { return atan2f(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float atanh(float __x) { return atanhf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float cbrt(float __x) { return cbrtf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float ceil(float __x) { return ceilf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float copysign(float __x, float __y) { return copysignf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float cos(float __x) { return cosf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float cosh(float __x) { return coshf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float erf(float __x) { return erff(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float erfc(float __x) { return erfcf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float exp(float __x) { return expf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float exp2(float __x) { return exp2f(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float expm1(float __x) { return expm1f(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float fabs(float __x) { return fabsf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float fdim(float __x, float __y) { return fdimf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float floor(float __x) { return floorf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float fmax(float __x, float __y) { return fmaxf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float fmin(float __x, float __y) { return fminf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float fmod(float __x, float __y) { return fmodf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float hypot(float __x, float __y) { return hypotf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) int ilogb(float __x) { return ilogbf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float ldexp(float __x, int __y) { return ldexpf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float lgamma(float __x) { return lgammaf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float log(float __x) { return logf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float log10(float __x) { return log10f(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float log1p(float __x) { return log1pf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float log2(float __x) { return log2f(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float logb(float __x) { return logbf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) long long llrint(float __x) { return llrintf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) long long llround(float __x) { return llroundf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) long lrint(float __x) { return lrintf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) long lround(float __x) { return lroundf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float nearbyint(float __x) { return nearbyintf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float nextafter(float __x, float __y) { return nextafterf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float pow(float __x, float __y) { return powf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float remainder(float __x, float __y) { return remainderf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float rint(float __x) { return rintf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float round(float __x) { return roundf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float scalbn(float __x, int __y) { return scalbnf(__x, __y); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float sin(float __x) { return sinf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float sinh(float __x) { return sinhf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float sqrt(float __x) { return sqrtf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float tan(float __x) { return tanf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float tanh(float __x) { return tanhf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float tgamma(float __x) { return tgammaf(__x); }
|
|
static __attribute__((device)) inline __attribute__((always_inline)) float trunc(float __x) { return truncf(__x); }
|
|
# 265 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
|
|
// Minimal enable_if: the nested 'type' typedef exists only when the
// compile-time condition holds, enabling SFINAE overload selection below.
template <bool __Cond, class __T = void> struct __hip_enable_if {};

// True case: expose the member typedef.
template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
|
|
|
|
namespace __hip {
|
|
template <class _Tp> struct is_integral {
|
|
enum { value = 0 };
|
|
};
|
|
template <> struct is_integral<bool> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<char> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<signed char> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<unsigned char> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<wchar_t> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<short> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<unsigned short> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<int> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<unsigned int> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<long> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<unsigned long> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<long long> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_integral<unsigned long long> {
|
|
enum { value = 1 };
|
|
};
|
|
|
|
|
|
template <class _Tp> struct is_arithmetic {
|
|
enum { value = 0 };
|
|
};
|
|
template <> struct is_arithmetic<bool> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<char> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<signed char> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<unsigned char> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<wchar_t> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<short> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<unsigned short> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<int> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<unsigned int> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<long> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<unsigned long> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<long long> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<unsigned long long> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<float> {
|
|
enum { value = 1 };
|
|
};
|
|
template <> struct is_arithmetic<double> {
|
|
enum { value = 1 };
|
|
};
|
|
|
|
struct true_type {
|
|
static const __attribute__((constant)) bool value = true;
|
|
};
|
|
struct false_type {
|
|
static const __attribute__((constant)) bool value = false;
|
|
};
|
|
|
|
template <typename __T, typename __U> struct is_same : public false_type {};
|
|
template <typename __T> struct is_same<__T, __T> : public true_type {};
|
|
|
|
template <typename __T> struct add_rvalue_reference { typedef __T &&type; };
|
|
|
|
template <typename __T> typename add_rvalue_reference<__T>::type declval();
|
|
|
|
|
|
|
|
|
|
template <class _Tp> struct __numeric_type {
|
|
static void __test(...);
|
|
static _Float16 __test(_Float16);
|
|
static float __test(float);
|
|
static double __test(char);
|
|
static double __test(int);
|
|
static double __test(unsigned);
|
|
static double __test(long);
|
|
static double __test(unsigned long);
|
|
static double __test(long long);
|
|
static double __test(unsigned long long);
|
|
static double __test(double);
|
|
|
|
static double __test(long double);
|
|
|
|
typedef decltype(__test(declval<_Tp>())) type;
|
|
static const bool value = !is_same<type, void>::value;
|
|
};
|
|
|
|
template <> struct __numeric_type<void> { static const bool value = true; };
|
|
|
|
template <class _A1, class _A2 = void, class _A3 = void,
|
|
bool = __numeric_type<_A1>::value &&__numeric_type<_A2>::value
|
|
&&__numeric_type<_A3>::value>
|
|
class __promote_imp {
|
|
public:
|
|
static const bool value = false;
|
|
};
|
|
|
|
template <class _A1, class _A2, class _A3>
|
|
class __promote_imp<_A1, _A2, _A3, true> {
|
|
private:
|
|
typedef typename __promote_imp<_A1>::type __type1;
|
|
typedef typename __promote_imp<_A2>::type __type2;
|
|
typedef typename __promote_imp<_A3>::type __type3;
|
|
|
|
public:
|
|
typedef decltype(__type1() + __type2() + __type3()) type;
|
|
static const bool value = true;
|
|
};
|
|
|
|
template <class _A1, class _A2> class __promote_imp<_A1, _A2, void, true> {
|
|
private:
|
|
typedef typename __promote_imp<_A1>::type __type1;
|
|
typedef typename __promote_imp<_A2>::type __type2;
|
|
|
|
public:
|
|
typedef decltype(__type1() + __type2()) type;
|
|
static const bool value = true;
|
|
};
|
|
|
|
template <class _A1> class __promote_imp<_A1, void, void, true> {
|
|
public:
|
|
typedef typename __numeric_type<_A1>::type type;
|
|
static const bool value = true;
|
|
};
|
|
|
|
template <class _A1, class _A2 = void, class _A3 = void>
|
|
class __promote : public __promote_imp<_A1, _A2, _A3> {};
|
|
|
|
}
|
|
# 478 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
|
|
// Templated catch-all overloads:
//  - unary math functions accept any integral argument by converting it to
//    double (SFINAE via __hip_enable_if + __hip::is_integral);
//  - binary math functions accept any mix of arithmetic types and evaluate
//    in the common promoted type (__hip::__promote).
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type acos(_Ti __v) { return ::acos((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type acosh(_Ti __v) { return ::acosh((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type asin(_Ti __v) { return ::asin((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type asinh(_Ti __v) { return ::asinh((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type atan(_Ti __v) { return ::atan((double)__v); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type atan2(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return atan2((_Tr)__a, (_Tr)__b); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type atanh(_Ti __v) { return ::atanh((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type cbrt(_Ti __v) { return ::cbrt((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type ceil(_Ti __v) { return ::ceil((double)__v); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type copysign(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return copysign((_Tr)__a, (_Tr)__b); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type cos(_Ti __v) { return ::cos((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type cosh(_Ti __v) { return ::cosh((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type erf(_Ti __v) { return ::erf((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type erfc(_Ti __v) { return ::erfc((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type exp(_Ti __v) { return ::exp((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type exp2(_Ti __v) { return ::exp2((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type expm1(_Ti __v) { return ::expm1((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type fabs(_Ti __v) { return ::fabs((double)__v); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type fdim(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return fdim((_Tr)__a, (_Tr)__b); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type floor(_Ti __v) { return ::floor((double)__v); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type fmax(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return fmax((_Tr)__a, (_Tr)__b); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type fmin(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return fmin((_Tr)__a, (_Tr)__b); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type fmod(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return fmod((_Tr)__a, (_Tr)__b); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type hypot(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return hypot((_Tr)__a, (_Tr)__b); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, int>::type ilogb(_Ti __v) { return ::ilogb((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, bool>::type isfinite(_Ti __v) { return ::isfinite((double)__v); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type isgreater(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return isgreater((_Tr)__a, (_Tr)__b); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type isgreaterequal(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return isgreaterequal((_Tr)__a, (_Tr)__b); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, bool>::type isinf(_Ti __v) { return ::isinf((double)__v); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type isless(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return isless((_Tr)__a, (_Tr)__b); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type islessequal(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return islessequal((_Tr)__a, (_Tr)__b); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type islessgreater(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return islessgreater((_Tr)__a, (_Tr)__b); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, bool>::type isnan(_Ti __v) { return ::isnan((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, bool>::type isnormal(_Ti __v) { return ::isnormal((double)__v); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type isunordered(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return isunordered((_Tr)__a, (_Tr)__b); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type lgamma(_Ti __v) { return ::lgamma((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type log(_Ti __v) { return ::log((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type log10(_Ti __v) { return ::log10((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type log1p(_Ti __v) { return ::log1p((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type log2(_Ti __v) { return ::log2((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type logb(_Ti __v) { return ::logb((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, long long>::type llrint(_Ti __v) { return ::llrint((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, long long>::type llround(_Ti __v) { return ::llround((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, long>::type lrint(_Ti __v) { return ::lrint((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, long>::type lround(_Ti __v) { return ::lround((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type nearbyint(_Ti __v) { return ::nearbyint((double)__v); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type nextafter(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return nextafter((_Tr)__a, (_Tr)__b); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type pow(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return pow((_Tr)__a, (_Tr)__b); }
template <typename _Ta, typename _Tb> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<_Ta>::value && __hip::is_arithmetic<_Tb>::value, typename __hip::__promote<_Ta, _Tb>::type>::type remainder(_Ta __a, _Tb __b) { typedef typename __hip::__promote<_Ta, _Tb>::type _Tr; return remainder((_Tr)__a, (_Tr)__b); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type rint(_Ti __v) { return ::rint((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type round(_Ti __v) { return ::round((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, bool>::type signbit(_Ti __v) { return ::signbit((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type sin(_Ti __v) { return ::sin((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type sinh(_Ti __v) { return ::sinh((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type sqrt(_Ti __v) { return ::sqrt((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type tan(_Ti __v) { return ::tan((double)__v); }
template <typename _Ti> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<_Ti>::value, double>::type tanh(_Ti __v) { return ::tanh((double)__v); }
|
|
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type tgamma(__T __x) { return ::tgamma((double)__x); }
|
|
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type trunc(__T __x) { return ::trunc((double)__x); }
|
|
|
|
|
|
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type max(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return max((__result_type)__x, (__result_type)__y); }
|
|
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type min(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return min((__result_type)__x, (__result_type)__y); }
|
|
|
|
|
|
|
|
template <typename __T1, typename __T2, typename __T3>
|
|
static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<
|
|
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
|
|
__hip::is_arithmetic<__T3>::value,
|
|
typename __hip::__promote<__T1, __T2, __T3>::type>::type
|
|
fma(__T1 __x, __T2 __y, __T3 __z) {
|
|
typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
|
|
return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z);
|
|
}
|
|
# 568 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
|
|
template <typename __T>
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
|
|
frexp(__T __x, int *__exp) {
|
|
return ::frexp((double)__x, __exp);
|
|
}
|
|
|
|
template <typename __T>
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
|
|
ldexp(__T __x, int __exp) {
|
|
return ::ldexp((double)__x, __exp);
|
|
}
|
|
|
|
template <typename __T>
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
|
|
modf(__T __x, double *__exp) {
|
|
return ::modf((double)__x, __exp);
|
|
}
|
|
|
|
|
|
template <typename __T1, typename __T2>
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
|
|
__hip::is_arithmetic<__T2>::value,
|
|
typename __hip::__promote<__T1, __T2>::type>::type
|
|
remquo(__T1 __x, __T2 __y, int *__quo) {
|
|
typedef typename __hip::__promote<__T1, __T2>::type __result_type;
|
|
return ::remquo((__result_type)__x, (__result_type)__y, __quo);
|
|
}
|
|
# 610 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
|
|
template <typename __T>
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
|
|
scalbln(__T __x, long int __exp) {
|
|
return ::scalbln((double)__x, __exp);
|
|
}
|
|
|
|
template <typename __T>
|
|
static __attribute__((device)) inline __attribute__((always_inline))
|
|
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
|
|
scalbn(__T __x, int __exp) {
|
|
return ::scalbn((double)__x, __exp);
|
|
}
|
|
# 133 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
|
|
# 2 "<built-in>" 2
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
|
|
|
|
|
|
|
|
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 1 3
|
|
# 58 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 3
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/include/hip/hip_version.h" 1 3
|
|
# 59 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 1 3
|
|
# 27 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 3
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
|
|
# 97 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 3
|
|
#pragma clang diagnostic pop
|
|
# 60 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
|
|
|
|
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 1 3
|
|
# 32 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_common.h" 1 3
|
|
# 33 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 2 3
|
|
# 43 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
|
|
// C-linkage hooks used by the AMD debugger API to identify this HIP build.
extern "C" {


# 54 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3


// Human-readable build name of the HIP runtime.
const char* amd_dbgapi_get_build_name();


# 63 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3


// Git hash the runtime was built from.
const char* amd_dbgapi_get_git_hash();


# 72 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3


// Numeric build identifier.
size_t amd_dbgapi_get_build_id();





}
|
|
# 92 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
|
|
// Fixed-width integer typedefs for device compilation without <cstdint>.
// NOTE(review): the 64-bit types are spelled `long long` here, whereas
// glibc's <stdint.h> uses `long` on LP64 hosts; both are 64 bits wide on
// the targets this header serves, but the typedefs are distinct types.
typedef unsigned int uint32_t;


typedef unsigned long long uint64_t;


typedef signed int int32_t;


typedef signed long long int64_t;


// Re-export the aliases into namespace std, matching how <cstdint> would
// expose them.
namespace std {


using ::uint32_t;


using ::uint64_t;


using ::int32_t;


using ::int64_t;


}
|
|
# 124 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 1 3
|
|
# 27 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 3
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 1 3
|
|
# 31 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/host_defines.h" 1 3
|
|
# 38 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/host_defines.h" 3
|
|
// Minimal re-implementation of the <cstdint>/<type_traits> machinery the
// HIP headers rely on, for environments where the host standard library
// is unavailable (hipRTC).
namespace __hip_internal {
// Fixed-width integer aliases.
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long long int64_t;

// Compile-time constant wrapper, modeled on std::integral_constant.
template <class _Tp, _Tp __v> struct integral_constant {
  static constexpr const _Tp value = __v;
  typedef _Tp value_type;
  typedef integral_constant type;
  constexpr operator value_type() const { return value; }
  constexpr value_type operator()() const { return value; }
};
// Out-of-class definition required pre-C++17 for odr-use of `value`.
template <class _Tp, _Tp __v> constexpr const _Tp integral_constant<_Tp, __v>::value;

typedef integral_constant<bool, true> true_type;
typedef integral_constant<bool, false> false_type;

template <bool B> using bool_constant = integral_constant<bool, B>;
// Redundant re-typedefs through the alias; kept from the original header
// (a typedef naming the same type is legal and harmless).
typedef bool_constant<true> true_type;
typedef bool_constant<false> false_type;

// SFINAE helper, modeled on std::enable_if.
template <bool __B, class __T = void> struct enable_if {};
template <class __T> struct enable_if<true, __T> { typedef __T type; };

// Maps a bool to true_type/false_type.
template <bool _B> struct true_or_false_type : public false_type {};
template <> struct true_or_false_type<true> : public true_type {};

// is_integral: explicit specializations for each builtin integer type.
template <class _Tp> struct is_integral : public false_type {};
template <> struct is_integral<bool> : public true_type {};
template <> struct is_integral<char> : public true_type {};
template <> struct is_integral<signed char> : public true_type {};
template <> struct is_integral<unsigned char> : public true_type {};
template <> struct is_integral<wchar_t> : public true_type {};
template <> struct is_integral<short> : public true_type {};
template <> struct is_integral<unsigned short> : public true_type {};
template <> struct is_integral<int> : public true_type {};
template <> struct is_integral<unsigned int> : public true_type {};
template <> struct is_integral<long> : public true_type {};
template <> struct is_integral<unsigned long> : public true_type {};
template <> struct is_integral<long long> : public true_type {};
template <> struct is_integral<unsigned long long> : public true_type {};

// is_arithmetic: the integral set plus float and double.
template <class _Tp> struct is_arithmetic : public false_type {};
template <> struct is_arithmetic<bool> : public true_type {};
template <> struct is_arithmetic<char> : public true_type {};
template <> struct is_arithmetic<signed char> : public true_type {};
template <> struct is_arithmetic<unsigned char> : public true_type {};
template <> struct is_arithmetic<wchar_t> : public true_type {};
template <> struct is_arithmetic<short> : public true_type {};
template <> struct is_arithmetic<unsigned short> : public true_type {};
template <> struct is_arithmetic<int> : public true_type {};
template <> struct is_arithmetic<unsigned int> : public true_type {};
template <> struct is_arithmetic<long> : public true_type {};
template <> struct is_arithmetic<unsigned long> : public true_type {};
template <> struct is_arithmetic<long long> : public true_type {};
template <> struct is_arithmetic<unsigned long long> : public true_type {};
template <> struct is_arithmetic<float> : public true_type {};
template <> struct is_arithmetic<double> : public true_type {};

template <typename _Tp> struct is_floating_point : public false_type {};
template <> struct is_floating_point<float> : public true_type {};
template <> struct is_floating_point<double> : public true_type {};
template <> struct is_floating_point<long double> : public true_type {};

template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};

// is_signed: false for non-arithmetic types; for arithmetic types,
// detected by whether _Tp(-1) compares less than _Tp(0).
template <typename _Tp, bool = is_arithmetic<_Tp>::value>
struct is_signed : public false_type {};
template <typename _Tp>
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};

// Forward declarations standing in for <iosfwd>.
template <typename _CharT> struct char_traits;
template <typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
template <typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
typedef basic_istream<char> istream;
typedef basic_ostream<char> ostream;

// Thin wrappers over compiler trait intrinsics.
template <typename _Tp>
struct is_standard_layout
    : public integral_constant<bool, __is_standard_layout(_Tp)> {};

template <typename _Tp>
struct is_trivial
    : public integral_constant<bool, __is_trivial(_Tp)> {};
}
|
|
// Public aliases re-exporting the __hip_internal fixed-width integers
// under the reserved __hip_* names used throughout the HIP headers.
typedef __hip_internal::uint8_t __hip_uint8_t;


typedef __hip_internal::uint16_t __hip_uint16_t;


typedef __hip_internal::uint32_t __hip_uint32_t;


typedef __hip_internal::uint64_t __hip_uint64_t;


typedef __hip_internal::int8_t __hip_int8_t;


typedef __hip_internal::int16_t __hip_int16_t;


typedef __hip_internal::int32_t __hip_int32_t;


typedef __hip_internal::int64_t __hip_int64_t;
|
|
# 32 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 2 3
|
|
# 52 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
|
|
namespace std {
|
|
using ::size_t;
|
|
|
|
template <class _Tp, _Tp __v> struct integral_constant {
|
|
static constexpr const _Tp value = __v;
|
|
typedef _Tp value_type;
|
|
typedef integral_constant type;
|
|
constexpr operator value_type() const { return value; }
|
|
constexpr value_type operator()() const { return value; }
|
|
};
|
|
template <class _Tp, _Tp __v> constexpr const _Tp integral_constant<_Tp, __v>::value;
|
|
|
|
typedef integral_constant<bool, true> true_type;
|
|
typedef integral_constant<bool, false> false_type;
|
|
|
|
template <bool B> using bool_constant = integral_constant<bool, B>;
|
|
typedef bool_constant<true> true_type;
|
|
typedef bool_constant<false> false_type;
|
|
|
|
template <bool __B, class __T = void> struct enable_if {};
|
|
template <class __T> struct enable_if<true, __T> { typedef __T type; };
|
|
|
|
template<bool _B> struct true_or_false_type : public false_type {};
|
|
template<> struct true_or_false_type<true> : public true_type {};
|
|
|
|
template <class _Tp> struct is_integral : public false_type {};
|
|
template <> struct is_integral<bool> : public true_type {};
|
|
template <> struct is_integral<char> : public true_type {};
|
|
template <> struct is_integral<signed char> : public true_type {};
|
|
template <> struct is_integral<unsigned char> : public true_type {};
|
|
template <> struct is_integral<wchar_t> : public true_type {};
|
|
template <> struct is_integral<short> : public true_type {};
|
|
template <> struct is_integral<unsigned short> : public true_type {};
|
|
template <> struct is_integral<int> : public true_type {};
|
|
template <> struct is_integral<unsigned int> : public true_type {};
|
|
template <> struct is_integral<long> : public true_type {};
|
|
template <> struct is_integral<unsigned long> : public true_type {};
|
|
template <> struct is_integral<long long> : public true_type {};
|
|
template <> struct is_integral<unsigned long long> : public true_type {};
|
|
|
|
template <class _Tp> struct is_arithmetic : public false_type {};
|
|
template <> struct is_arithmetic<bool> : public true_type {};
|
|
template <> struct is_arithmetic<char> : public true_type {};
|
|
template <> struct is_arithmetic<signed char> : public true_type {};
|
|
template <> struct is_arithmetic<unsigned char> : public true_type {};
|
|
template <> struct is_arithmetic<wchar_t> : public true_type {};
|
|
template <> struct is_arithmetic<short> : public true_type {};
|
|
template <> struct is_arithmetic<unsigned short> : public true_type {};
|
|
template <> struct is_arithmetic<int> : public true_type {};
|
|
template <> struct is_arithmetic<unsigned int> : public true_type {};
|
|
template <> struct is_arithmetic<long> : public true_type {};
|
|
template <> struct is_arithmetic<unsigned long> : public true_type {};
|
|
template <> struct is_arithmetic<long long> : public true_type {};
|
|
template <> struct is_arithmetic<unsigned long long> : public true_type {};
|
|
template <> struct is_arithmetic<float> : public true_type {};
|
|
template <> struct is_arithmetic<double> : public true_type {};
|
|
|
|
template<typename _Tp> struct is_floating_point : public false_type {};
|
|
template<> struct is_floating_point<float> : public true_type {};
|
|
template<> struct is_floating_point<double> : public true_type {};
|
|
template<> struct is_floating_point<long double> : public true_type {};
|
|
|
|
template <typename __T, typename __U> struct is_same : public false_type {};
|
|
template <typename __T> struct is_same<__T, __T> : public true_type {};
|
|
|
|
template<typename _Tp, bool = is_arithmetic<_Tp>::value>
|
|
struct is_signed : public false_type {};
|
|
template<typename _Tp>
|
|
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};
|
|
|
|
template <class _T1, class _T2> struct is_convertible
|
|
: public true_or_false_type<__is_convertible_to(_T1, _T2)> {};
|
|
|
|
template<typename _CharT> struct char_traits;
|
|
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
|
|
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
|
|
typedef basic_istream<char> istream;
|
|
typedef basic_ostream<char> ostream;
|
|
|
|
template <typename __T> struct is_scalar : public integral_constant<bool, __is_scalar(__T)> {};
|
|
}
|
|
|
|
|
|
namespace hip_impl {
// Smallest power of two that is >= x.
//
// Fix: the original evaluated __builtin_clz(x - 1u) unconditionally, which
// is undefined behavior for x <= 1 (__builtin_clz of 0 is undefined, and
// x == 0 wraps to UINT_MAX first).  Handle that range explicitly; the
// result for every previously well-defined input is unchanged.
inline
constexpr
unsigned int next_pot(unsigned int x) {
    return (x <= 1u) ? 1u : 1u << (32u - __builtin_clz(x - 1u));
}
}
|
|
|
|
// Primary template for rank-n HIP vector storage; specialized below for
// n = 1, 2, 3 and 4.
template<typename T, unsigned int n> struct HIP_vector_base;
|
|
|
|
template<typename T>
|
|
struct HIP_vector_base<T, 1> {
|
|
using Native_vec_ = T __attribute__((ext_vector_type(1)));
|
|
|
|
union {
|
|
Native_vec_ data;
|
|
struct {
|
|
T x;
|
|
};
|
|
};
|
|
|
|
using value_type = T;
|
|
|
|
__attribute__((device))
|
|
HIP_vector_base() = default;
|
|
__attribute__((device))
|
|
explicit
|
|
constexpr
|
|
HIP_vector_base(T x_) noexcept : data{x_} {}
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(const HIP_vector_base&) = default;
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(HIP_vector_base&&) = default;
|
|
__attribute__((device))
|
|
~HIP_vector_base() = default;
|
|
__attribute__((device))
|
|
HIP_vector_base& operator=(const HIP_vector_base&) = default;
|
|
};
|
|
|
|
template<typename T>
|
|
struct HIP_vector_base<T, 2> {
|
|
using Native_vec_ = T __attribute__((ext_vector_type(2)));
|
|
|
|
union
|
|
|
|
|
|
|
|
{
|
|
Native_vec_ data;
|
|
struct {
|
|
T x;
|
|
T y;
|
|
};
|
|
};
|
|
|
|
using value_type = T;
|
|
|
|
__attribute__((device))
|
|
HIP_vector_base() = default;
|
|
__attribute__((device))
|
|
explicit
|
|
constexpr
|
|
HIP_vector_base(T x_) noexcept : data{x_, x_} {}
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(T x_, T y_) noexcept : data{x_, y_} {}
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(const HIP_vector_base&) = default;
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(HIP_vector_base&&) = default;
|
|
__attribute__((device))
|
|
~HIP_vector_base() = default;
|
|
__attribute__((device))
|
|
HIP_vector_base& operator=(const HIP_vector_base&) = default;
|
|
};
|
|
|
|
template<typename T>
|
|
struct HIP_vector_base<T, 3> {
|
|
struct Native_vec_ {
|
|
T d[3];
|
|
|
|
__attribute__((device))
|
|
Native_vec_() = default;
|
|
|
|
__attribute__((device))
|
|
explicit
|
|
constexpr
|
|
Native_vec_(T x_) noexcept : d{x_, x_, x_} {}
|
|
__attribute__((device))
|
|
constexpr
|
|
Native_vec_(T x_, T y_, T z_) noexcept : d{x_, y_, z_} {}
|
|
__attribute__((device))
|
|
constexpr
|
|
Native_vec_(const Native_vec_&) = default;
|
|
__attribute__((device))
|
|
constexpr
|
|
Native_vec_(Native_vec_&&) = default;
|
|
__attribute__((device))
|
|
~Native_vec_() = default;
|
|
|
|
__attribute__((device))
|
|
Native_vec_& operator=(const Native_vec_&) = default;
|
|
__attribute__((device))
|
|
Native_vec_& operator=(Native_vec_&&) = default;
|
|
|
|
__attribute__((device))
|
|
T& operator[](unsigned int idx) noexcept { return d[idx]; }
|
|
__attribute__((device))
|
|
T operator[](unsigned int idx) const noexcept { return d[idx]; }
|
|
|
|
__attribute__((device))
|
|
Native_vec_& operator+=(const Native_vec_& x_) noexcept
|
|
{
|
|
for (auto i = 0u; i != 3u; ++i) d[i] += x_.d[i];
|
|
return *this;
|
|
}
|
|
__attribute__((device))
|
|
Native_vec_& operator-=(const Native_vec_& x_) noexcept
|
|
{
|
|
for (auto i = 0u; i != 3u; ++i) d[i] -= x_.d[i];
|
|
return *this;
|
|
}
|
|
|
|
__attribute__((device))
|
|
Native_vec_& operator*=(const Native_vec_& x_) noexcept
|
|
{
|
|
for (auto i = 0u; i != 3u; ++i) d[i] *= x_.d[i];
|
|
return *this;
|
|
}
|
|
__attribute__((device))
|
|
Native_vec_& operator/=(const Native_vec_& x_) noexcept
|
|
{
|
|
for (auto i = 0u; i != 3u; ++i) d[i] /= x_.d[i];
|
|
return *this;
|
|
}
|
|
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
Native_vec_ operator-() const noexcept
|
|
{
|
|
auto r{*this};
|
|
for (auto&& x : r.d) x = -x;
|
|
return r;
|
|
}
|
|
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
Native_vec_ operator~() const noexcept
|
|
{
|
|
auto r{*this};
|
|
for (auto&& x : r.d) x = ~x;
|
|
return r;
|
|
}
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
Native_vec_& operator%=(const Native_vec_& x_) noexcept
|
|
{
|
|
for (auto i = 0u; i != 3u; ++i) d[i] %= x_.d[i];
|
|
return *this;
|
|
}
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
Native_vec_& operator^=(const Native_vec_& x_) noexcept
|
|
{
|
|
for (auto i = 0u; i != 3u; ++i) d[i] ^= x_.d[i];
|
|
return *this;
|
|
}
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
Native_vec_& operator|=(const Native_vec_& x_) noexcept
|
|
{
|
|
for (auto i = 0u; i != 3u; ++i) d[i] |= x_.d[i];
|
|
return *this;
|
|
}
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
Native_vec_& operator&=(const Native_vec_& x_) noexcept
|
|
{
|
|
for (auto i = 0u; i != 3u; ++i) d[i] &= x_.d[i];
|
|
return *this;
|
|
}
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
Native_vec_& operator>>=(const Native_vec_& x_) noexcept
|
|
{
|
|
for (auto i = 0u; i != 3u; ++i) d[i] >>= x_.d[i];
|
|
return *this;
|
|
}
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
Native_vec_& operator<<=(const Native_vec_& x_) noexcept
|
|
{
|
|
for (auto i = 0u; i != 3u; ++i) d[i] <<= x_.d[i];
|
|
return *this;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
using Vec3_cmp = int __attribute__((vector_size(4 * sizeof(int))));
|
|
|
|
__attribute__((device))
|
|
Vec3_cmp operator==(const Native_vec_& x_) const noexcept
|
|
{
|
|
return Vec3_cmp{d[0] == x_.d[0], d[1] == x_.d[1], d[2] == x_.d[2]};
|
|
}
|
|
};
|
|
|
|
union {
|
|
Native_vec_ data;
|
|
struct {
|
|
T x;
|
|
T y;
|
|
T z;
|
|
};
|
|
};
|
|
|
|
using value_type = T;
|
|
|
|
__attribute__((device))
|
|
HIP_vector_base() = default;
|
|
__attribute__((device))
|
|
explicit
|
|
constexpr
|
|
HIP_vector_base(T x_) noexcept : data{x_, x_, x_} {}
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(T x_, T y_, T z_) noexcept : data{x_, y_, z_} {}
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(const HIP_vector_base&) = default;
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(HIP_vector_base&&) = default;
|
|
__attribute__((device))
|
|
~HIP_vector_base() = default;
|
|
|
|
__attribute__((device))
|
|
HIP_vector_base& operator=(const HIP_vector_base&) = default;
|
|
__attribute__((device))
|
|
HIP_vector_base& operator=(HIP_vector_base&&) = default;
|
|
};
|
|
|
|
template<typename T>
|
|
struct HIP_vector_base<T, 4> {
|
|
using Native_vec_ = T __attribute__((ext_vector_type(4)));
|
|
|
|
union
|
|
|
|
|
|
|
|
{
|
|
Native_vec_ data;
|
|
struct {
|
|
T x;
|
|
T y;
|
|
T z;
|
|
T w;
|
|
};
|
|
};
|
|
|
|
using value_type = T;
|
|
|
|
__attribute__((device))
|
|
HIP_vector_base() = default;
|
|
__attribute__((device))
|
|
explicit
|
|
constexpr
|
|
HIP_vector_base(T x_) noexcept : data{x_, x_, x_, x_} {}
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(T x_, T y_, T z_, T w_) noexcept : data{x_, y_, z_, w_} {}
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(const HIP_vector_base&) = default;
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_base(HIP_vector_base&&) = default;
|
|
__attribute__((device))
|
|
~HIP_vector_base() = default;
|
|
__attribute__((device))
|
|
HIP_vector_base& operator=(const HIP_vector_base&) = default;
|
|
};
|
|
|
|
template<typename T, unsigned int rank>
|
|
struct HIP_vector_type : public HIP_vector_base<T, rank> {
|
|
using HIP_vector_base<T, rank>::data;
|
|
using typename HIP_vector_base<T, rank>::Native_vec_;
|
|
|
|
__attribute__((device))
|
|
HIP_vector_type() = default;
|
|
template<
|
|
typename U,
|
|
typename std::enable_if<
|
|
std::is_convertible<U, T>::value>::type* = nullptr>
|
|
__attribute__((device))
|
|
explicit
|
|
constexpr
|
|
HIP_vector_type(U x_) noexcept
|
|
: HIP_vector_base<T, rank>{static_cast<T>(x_)}
|
|
{}
|
|
template<
|
|
typename... Us,
|
|
typename std::enable_if<
|
|
(rank > 1) && sizeof...(Us) == rank>::type* = nullptr>
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_type(Us... xs) noexcept
|
|
: HIP_vector_base<T, rank>{static_cast<T>(xs)...}
|
|
{}
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_type(const HIP_vector_type&) = default;
|
|
__attribute__((device))
|
|
constexpr
|
|
HIP_vector_type(HIP_vector_type&&) = default;
|
|
__attribute__((device))
|
|
~HIP_vector_type() = default;
|
|
|
|
__attribute__((device))
|
|
HIP_vector_type& operator=(const HIP_vector_type&) = default;
|
|
__attribute__((device))
|
|
HIP_vector_type& operator=(HIP_vector_type&&) = default;
|
|
|
|
|
|
__attribute__((device))
|
|
HIP_vector_type& operator++() noexcept
|
|
{
|
|
return *this += HIP_vector_type{1};
|
|
}
|
|
__attribute__((device))
|
|
HIP_vector_type operator++(int) noexcept
|
|
{
|
|
auto tmp(*this);
|
|
++*this;
|
|
return tmp;
|
|
}
|
|
|
|
__attribute__((device))
|
|
HIP_vector_type& operator--() noexcept
|
|
{
|
|
return *this -= HIP_vector_type{1};
|
|
}
|
|
__attribute__((device))
|
|
HIP_vector_type operator--(int) noexcept
|
|
{
|
|
auto tmp(*this);
|
|
--*this;
|
|
return tmp;
|
|
}
|
|
|
|
__attribute__((device))
|
|
HIP_vector_type& operator+=(const HIP_vector_type& x) noexcept
|
|
{
|
|
data += x.data;
|
|
return *this;
|
|
}
|
|
template<
|
|
typename U,
|
|
typename std::enable_if<
|
|
std::is_convertible<U, T>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type& operator+=(U x) noexcept
|
|
{
|
|
return *this += HIP_vector_type{x};
|
|
}
|
|
|
|
__attribute__((device))
|
|
HIP_vector_type& operator-=(const HIP_vector_type& x) noexcept
|
|
{
|
|
data -= x.data;
|
|
return *this;
|
|
}
|
|
template<
|
|
typename U,
|
|
typename std::enable_if<
|
|
std::is_convertible<U, T>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type& operator-=(U x) noexcept
|
|
{
|
|
return *this -= HIP_vector_type{x};
|
|
}
|
|
|
|
__attribute__((device))
|
|
HIP_vector_type& operator*=(const HIP_vector_type& x) noexcept
|
|
{
|
|
data *= x.data;
|
|
return *this;
|
|
}
|
|
|
|
friend __attribute__((device)) inline constexpr HIP_vector_type operator*(
|
|
HIP_vector_type x, const HIP_vector_type& y) noexcept
|
|
{
|
|
return HIP_vector_type{ x } *= y;
|
|
}
|
|
|
|
template<
|
|
typename U,
|
|
typename std::enable_if<
|
|
std::is_convertible<U, T>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type& operator*=(U x) noexcept
|
|
{
|
|
return *this *= HIP_vector_type{x};
|
|
}
|
|
|
|
friend __attribute__((device)) inline constexpr HIP_vector_type operator/(
|
|
HIP_vector_type x, const HIP_vector_type& y) noexcept
|
|
{
|
|
return HIP_vector_type{ x } /= y;
|
|
}
|
|
|
|
__attribute__((device))
|
|
HIP_vector_type& operator/=(const HIP_vector_type& x) noexcept
|
|
{
|
|
data /= x.data;
|
|
return *this;
|
|
}
|
|
template<
|
|
typename U,
|
|
typename std::enable_if<
|
|
std::is_convertible<U, T>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type& operator/=(U x) noexcept
|
|
{
|
|
return *this /= HIP_vector_type{x};
|
|
}
|
|
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type operator-() const noexcept
|
|
{
|
|
auto tmp(*this);
|
|
tmp.data = -tmp.data;
|
|
return tmp;
|
|
}
|
|
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type operator~() const noexcept
|
|
{
|
|
HIP_vector_type r{*this};
|
|
r.data = ~r.data;
|
|
return r;
|
|
}
|
|
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type& operator%=(const HIP_vector_type& x) noexcept
|
|
{
|
|
data %= x.data;
|
|
return *this;
|
|
}
|
|
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type& operator^=(const HIP_vector_type& x) noexcept
|
|
{
|
|
data ^= x.data;
|
|
return *this;
|
|
}
|
|
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type& operator|=(const HIP_vector_type& x) noexcept
|
|
{
|
|
data |= x.data;
|
|
return *this;
|
|
}
|
|
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type& operator&=(const HIP_vector_type& x) noexcept
|
|
{
|
|
data &= x.data;
|
|
return *this;
|
|
}
|
|
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type& operator>>=(const HIP_vector_type& x) noexcept
|
|
{
|
|
data >>= x.data;
|
|
return *this;
|
|
}
|
|
|
|
template<
|
|
typename U = T,
|
|
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
|
|
__attribute__((device))
|
|
HIP_vector_type& operator<<=(const HIP_vector_type& x) noexcept
|
|
{
|
|
data <<= x.data;
|
|
return *this;
|
|
}
|
|
};
|
|
|
|
template<typename T, unsigned int n>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator+(
|
|
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} += y;
|
|
}
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator+(
|
|
const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} += HIP_vector_type<T, n>{y};
|
|
}
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator+(
|
|
U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} += y;
|
|
}
|
|
|
|
template<typename T, unsigned int n>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator-(
|
|
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} -= y;
|
|
}
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator-(
|
|
const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} -= HIP_vector_type<T, n>{y};
|
|
}
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator-(
|
|
U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} -= y;
|
|
}
|
|
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator*(
|
|
const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} *= HIP_vector_type<T, n>{y};
|
|
}
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator*(
|
|
U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} *= y;
|
|
}
|
|
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator/(
|
|
const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} /= HIP_vector_type<T, n>{y};
|
|
}
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator/(
|
|
U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} /= y;
|
|
}
|
|
|
|
// NOTE(review): despite the name, this returns true only when NO element of
// x[0..n] is zero — it is the "all lanes set" reduction used by operator==
// over per-lane comparison masks. The name is kept for compatibility.
template<typename V>
__attribute__((device))
inline
constexpr
bool _hip_any_zero(const V& x, int n) noexcept
{
    return n == -1 || (x[n] != 0 && _hip_any_zero(x, n - 1));
}
|
|
|
|
template<typename T, unsigned int n>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
bool operator==(
|
|
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return _hip_any_zero(x.data == y.data, n - 1);
|
|
}
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
bool operator==(const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return x == HIP_vector_type<T, n>{y};
|
|
}
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
bool operator==(U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} == y;
|
|
}
|
|
|
|
template<typename T, unsigned int n>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
bool operator!=(
|
|
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return !(x == y);
|
|
}
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
bool operator!=(const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return !(x == y);
|
|
}
|
|
template<typename T, unsigned int n, typename U>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
bool operator!=(U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return !(x == y);
|
|
}
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator%(
|
|
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} %= y;
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator%(
|
|
const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} %= HIP_vector_type<T, n>{y};
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator%(
|
|
U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} %= y;
|
|
}
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator^(
|
|
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} ^= y;
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator^(
|
|
const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} ^= HIP_vector_type<T, n>{y};
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator^(
|
|
U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} ^= y;
|
|
}
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator|(
|
|
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} |= y;
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator|(
|
|
const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} |= HIP_vector_type<T, n>{y};
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator|(
|
|
U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} |= y;
|
|
}
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator&(
|
|
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} &= y;
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator&(
|
|
const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} &= HIP_vector_type<T, n>{y};
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator&(
|
|
U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} &= y;
|
|
}
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator>>(
|
|
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} >>= y;
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator>>(
|
|
const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} >>= HIP_vector_type<T, n>{y};
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator>>(
|
|
U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} >>= y;
|
|
}
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator<<(
|
|
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} <<= y;
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator<<(
|
|
const HIP_vector_type<T, n>& x, U y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} <<= HIP_vector_type<T, n>{y};
|
|
}
|
|
template<
|
|
typename T,
|
|
unsigned int n,
|
|
typename U,
|
|
typename std::enable_if<std::is_arithmetic<U>::value>::type,
|
|
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
|
|
__attribute__((device))
|
|
inline
|
|
constexpr
|
|
HIP_vector_type<T, n> operator<<(
|
|
U x, const HIP_vector_type<T, n>& y) noexcept
|
|
{
|
|
return HIP_vector_type<T, n>{x} <<= y;
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
|
|
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 1 && rankU >= 1),
|
|
const HIP_vector_type<T, rankT>>::type
|
|
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
|
|
return HIP_vector_type<T, rankT>(static_cast<T>(u.x));
|
|
};
|
|
|
|
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
|
|
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 2 && rankU == 1),
|
|
const HIP_vector_type<T, rankT>>::type
|
|
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
|
|
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(0));
|
|
};
|
|
|
|
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
|
|
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 2 && rankU >= 2),
|
|
const HIP_vector_type<T, rankT>>::type
|
|
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
|
|
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(u.y));
|
|
};
|
|
|
|
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
|
|
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 4 && rankU == 1),
|
|
const HIP_vector_type<T, rankT>>::type
|
|
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
|
|
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(0),
|
|
static_cast<T>(0), static_cast<T>(0));
|
|
};
|
|
|
|
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
|
|
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 4 && rankU == 2),
|
|
const HIP_vector_type<T, rankT>>::type
|
|
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
|
|
return HIP_vector_type<T, rankT>(static_cast<T>(u.x), static_cast<T>(u.y),
|
|
static_cast<T>(0), static_cast<T>(0));
|
|
};
|
|
|
|
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
|
|
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 4 && rankU == 4),
|
|
const HIP_vector_type<T, rankT>>::type
|
|
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
|
|
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(u.y),
|
|
static_cast<T>(u.z), static_cast<T>(u.w));
|
|
};
|
|
# 1135 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
|
|
// CUDA-compatible short-vector alias names, all built on HIP_vector_type<T, n>
// with n in {1, 2, 3, 4}.
using uchar1 = HIP_vector_type<unsigned char, 1>; using uchar2 = HIP_vector_type<unsigned char, 2>; using uchar3 = HIP_vector_type<unsigned char, 3>; using uchar4 = HIP_vector_type<unsigned char, 4>;;
using char1 = HIP_vector_type<char, 1>; using char2 = HIP_vector_type<char, 2>; using char3 = HIP_vector_type<char, 3>; using char4 = HIP_vector_type<char, 4>;;
using ushort1 = HIP_vector_type<unsigned short, 1>; using ushort2 = HIP_vector_type<unsigned short, 2>; using ushort3 = HIP_vector_type<unsigned short, 3>; using ushort4 = HIP_vector_type<unsigned short, 4>;;
using short1 = HIP_vector_type<short, 1>; using short2 = HIP_vector_type<short, 2>; using short3 = HIP_vector_type<short, 3>; using short4 = HIP_vector_type<short, 4>;;
using uint1 = HIP_vector_type<unsigned int, 1>; using uint2 = HIP_vector_type<unsigned int, 2>; using uint3 = HIP_vector_type<unsigned int, 3>; using uint4 = HIP_vector_type<unsigned int, 4>;;
using int1 = HIP_vector_type<int, 1>; using int2 = HIP_vector_type<int, 2>; using int3 = HIP_vector_type<int, 3>; using int4 = HIP_vector_type<int, 4>;;
using ulong1 = HIP_vector_type<unsigned long, 1>; using ulong2 = HIP_vector_type<unsigned long, 2>; using ulong3 = HIP_vector_type<unsigned long, 3>; using ulong4 = HIP_vector_type<unsigned long, 4>;;
using long1 = HIP_vector_type<long, 1>; using long2 = HIP_vector_type<long, 2>; using long3 = HIP_vector_type<long, 3>; using long4 = HIP_vector_type<long, 4>;;
using ulonglong1 = HIP_vector_type<unsigned long long, 1>; using ulonglong2 = HIP_vector_type<unsigned long long, 2>; using ulonglong3 = HIP_vector_type<unsigned long long, 3>; using ulonglong4 = HIP_vector_type<unsigned long long, 4>;;
using longlong1 = HIP_vector_type<long long, 1>; using longlong2 = HIP_vector_type<long long, 2>; using longlong3 = HIP_vector_type<long long, 3>; using longlong4 = HIP_vector_type<long long, 4>;;
using float1 = HIP_vector_type<float, 1>; using float2 = HIP_vector_type<float, 2>; using float3 = HIP_vector_type<float, 3>; using float4 = HIP_vector_type<float, 4>;;
using double1 = HIP_vector_type<double, 1>; using double2 = HIP_vector_type<double, 2>; using double3 = HIP_vector_type<double, 3>; using double4 = HIP_vector_type<double, 4>;;
|
|
# 2117 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
|
|
static inline __attribute__((device)) uchar1 make_uchar1(unsigned char x) { uchar1 r{x}; return r; };
|
|
static inline __attribute__((device)) uchar2 make_uchar2(unsigned char x, unsigned char y) { uchar2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z) { uchar3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) { uchar4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) char1 make_char1(signed char x) { char1 r{x}; return r; };
|
|
static inline __attribute__((device)) char2 make_char2(signed char x, signed char y) { char2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) char3 make_char3(signed char x, signed char y, signed char z) { char3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) char4 make_char4(signed char x, signed char y, signed char z, signed char w) { char4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) ushort1 make_ushort1(unsigned short x) { ushort1 r{x}; return r; };
|
|
static inline __attribute__((device)) ushort2 make_ushort2(unsigned short x, unsigned short y) { ushort2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z) { ushort3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w) { ushort4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) short1 make_short1(signed short x) { short1 r{x}; return r; };
|
|
static inline __attribute__((device)) short2 make_short2(signed short x, signed short y) { short2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) short3 make_short3(signed short x, signed short y, signed short z) { short3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) short4 make_short4(signed short x, signed short y, signed short z, signed short w) { short4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) uint1 make_uint1(unsigned int x) { uint1 r{x}; return r; };
|
|
static inline __attribute__((device)) uint2 make_uint2(unsigned int x, unsigned int y) { uint2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z) { uint3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) { uint4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) int1 make_int1(signed int x) { int1 r{x}; return r; };
|
|
static inline __attribute__((device)) int2 make_int2(signed int x, signed int y) { int2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) int3 make_int3(signed int x, signed int y, signed int z) { int3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) int4 make_int4(signed int x, signed int y, signed int z, signed int w) { int4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) float1 make_float1(float x) { float1 r{x}; return r; };
|
|
static inline __attribute__((device)) float2 make_float2(float x, float y) { float2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) float3 make_float3(float x, float y, float z) { float3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) float4 make_float4(float x, float y, float z, float w) { float4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) double1 make_double1(double x) { double1 r{x}; return r; };
|
|
static inline __attribute__((device)) double2 make_double2(double x, double y) { double2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) double3 make_double3(double x, double y, double z) { double3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) double4 make_double4(double x, double y, double z, double w) { double4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) ulong1 make_ulong1(unsigned long x) { ulong1 r{x}; return r; };
|
|
static inline __attribute__((device)) ulong2 make_ulong2(unsigned long x, unsigned long y) { ulong2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) ulong3 make_ulong3(unsigned long x, unsigned long y, unsigned long z) { ulong3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) ulong4 make_ulong4(unsigned long x, unsigned long y, unsigned long z, unsigned long w) { ulong4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) long1 make_long1(signed long x) { long1 r{x}; return r; };
|
|
static inline __attribute__((device)) long2 make_long2(signed long x, signed long y) { long2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) long3 make_long3(signed long x, signed long y, signed long z) { long3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) long4 make_long4(signed long x, signed long y, signed long z, signed long w) { long4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) ulonglong1 make_ulonglong1(unsigned long long x) { ulonglong1 r{x}; return r; };
|
|
static inline __attribute__((device)) ulonglong2 make_ulonglong2(unsigned long long x, unsigned long long y) { ulonglong2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) ulonglong3 make_ulonglong3(unsigned long long x, unsigned long long y, unsigned long long z) { ulonglong3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) ulonglong4 make_ulonglong4(unsigned long long x, unsigned long long y, unsigned long long z, unsigned long long w) { ulonglong4 r{x, y, z, w}; return r; };
|
|
|
|
static inline __attribute__((device)) longlong1 make_longlong1(signed long long x) { longlong1 r{x}; return r; };
|
|
static inline __attribute__((device)) longlong2 make_longlong2(signed long long x, signed long long y) { longlong2 r{x, y}; return r; };
|
|
static inline __attribute__((device)) longlong3 make_longlong3(signed long long x, signed long long y, signed long long z) { longlong3 r{x, y, z}; return r; };
|
|
static inline __attribute__((device)) longlong4 make_longlong4(signed long long x, signed long long y, signed long long z, signed long long w) { longlong4 r{x, y, z, w}; return r; };
|
|
# 28 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 2 3
|
|
|
|
|
|
__attribute__((device)) inline static char __ldg(const char* ptr) { return *ptr; }
|
|
|
|
__attribute__((device)) inline static char2 __ldg(const char2* ptr) { return *ptr; }
|
|
|
|
__attribute__((device)) inline static char4 __ldg(const char4* ptr) { return *ptr; }
|
|
|
|
__attribute__((device)) inline static signed char __ldg(const signed char* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static unsigned char __ldg(const unsigned char* ptr) { return ptr[0]; }
|
|
|
|
|
|
__attribute__((device)) inline static short __ldg(const short* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static short2 __ldg(const short2* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static short4 __ldg(const short4* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static unsigned short __ldg(const unsigned short* ptr) { return ptr[0]; }
|
|
|
|
|
|
__attribute__((device)) inline static int __ldg(const int* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static int2 __ldg(const int2* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static int4 __ldg(const int4* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static unsigned int __ldg(const unsigned int* ptr) { return ptr[0]; }
|
|
|
|
|
|
__attribute__((device)) inline static long __ldg(const long* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static unsigned long __ldg(const unsigned long* ptr) { return ptr[0]; }
|
|
|
|
|
|
__attribute__((device)) inline static long long __ldg(const long long* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static longlong2 __ldg(const longlong2* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static unsigned long long __ldg(const unsigned long long* ptr) { return ptr[0]; }
|
|
|
|
|
|
__attribute__((device)) inline static uchar2 __ldg(const uchar2* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static uchar4 __ldg(const uchar4* ptr) { return ptr[0]; }
|
|
|
|
|
|
__attribute__((device)) inline static ushort2 __ldg(const ushort2* ptr) { return ptr[0]; }
|
|
|
|
|
|
__attribute__((device)) inline static uint2 __ldg(const uint2* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static uint4 __ldg(const uint4* ptr) { return ptr[0]; }
|
|
|
|
|
|
__attribute__((device)) inline static ulonglong2 __ldg(const ulonglong2* ptr) { return ptr[0]; }
|
|
|
|
|
|
__attribute__((device)) inline static float __ldg(const float* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static float2 __ldg(const float2* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static float4 __ldg(const float4* ptr) { return ptr[0]; }
|
|
|
|
|
|
__attribute__((device)) inline static double __ldg(const double* ptr) { return ptr[0]; }
|
|
|
|
__attribute__((device)) inline static double2 __ldg(const double2* ptr) { return ptr[0]; }
|
|
# 125 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 2 3
|
|
# 250 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
|
|
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
|
|
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
|
|
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
|
|
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
|
|
struct __HIP_BlockIdx {
|
|
__attribute__((device))
|
|
std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); }
|
|
};
|
|
struct __HIP_BlockDim {
|
|
__attribute__((device))
|
|
std::uint32_t operator()(std::uint32_t x) const noexcept {
|
|
return __ockl_get_local_size(x);
|
|
}
|
|
};
|
|
struct __HIP_GridDim {
|
|
__attribute__((device))
|
|
std::uint32_t operator()(std::uint32_t x) const noexcept {
|
|
return __ockl_get_num_groups(x);
|
|
}
|
|
};
|
|
struct __HIP_ThreadIdx {
|
|
__attribute__((device))
|
|
std::uint32_t operator()(std::uint32_t x) const noexcept {
|
|
return __ockl_get_local_id(x);
|
|
}
|
|
};
|
|
|
|
|
|
// Grid/block extent triple (CUDA-compatible); unspecified dimensions
// default to 1 so dim3(64) means a 64 x 1 x 1 launch shape.
typedef struct dim3 {
  uint32_t x;
  uint32_t y;
  uint32_t z;

  constexpr __attribute__((device)) dim3(uint32_t x_ = 1, uint32_t y_ = 1, uint32_t z_ = 1)
      : x(x_), y(y_), z(z_) {}
} dim3;
|
|
|
|
|
|
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_global_size(unsigned int);
|
|
|
|
|
|
// Lazily-evaluated coordinate triple used to implement threadIdx/blockIdx/
// blockDim/gridDim. F is one of the __HIP_* functors above; each axis is a
// distinct empty tag type, which lets expressions such as
// gridDim.x * blockDim.x be overloaded on (see the operator* overloads below).
template <typename F> struct __HIP_Coordinates {
  // Result type of a coordinate query (std::uint32_t for the functors above).
  using R = decltype(F{}(0));

  // Axis tags: converting one to R performs the actual runtime query F{}(i).
  struct __X {
    __attribute__((device)) operator R() const noexcept { return F{}(0); }
    // NOTE(review): returns the sum without storing it — the tag carries no
    // state; presumably kept for source compatibility with `idx += n` code.
    __attribute__((device)) R operator+=(const R& rhs) { return F{}(0) + rhs; }
  };
  struct __Y {
    __attribute__((device)) operator R() const noexcept { return F{}(1); }
    __attribute__((device)) R operator+=(const R& rhs) { return F{}(1) + rhs; }
  };
  struct __Z {
    __attribute__((device)) operator R() const noexcept { return F{}(2); }
    __attribute__((device)) R operator+=(const R& rhs) { return F{}(2) + rhs; }
  };

  // Weak so the per-template static definitions can coexist across
  // translation units.
  __attribute__((weak))
  __attribute__((device)) static constexpr __X x{};
  __attribute__((weak))
  __attribute__((device)) static constexpr __Y y{};
  __attribute__((weak))
  __attribute__((device)) static constexpr __Z z{};

  // Whole-triple conversion: each member read triggers its query.
  __attribute__((device)) operator dim3() const { return dim3(x, y, z); }
};
|
|
|
|
// Out-of-class definitions for the weak static constexpr axis members
// (required when building before C++17 inline-variable semantics).
template <typename F>
constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
template <typename F>
constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
template <typename F>
constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
|
|
|
|
// Fusion overloads: gridDim.<axis> * blockDim.<axis> (in either operand
// order) is answered with a single __ockl_get_global_size query instead of
// two separate dispatch-packet reads. Relies on the runtime invariant that
// global size == num_groups * local_size for each dimension.
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
                        __HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
  return __ockl_get_global_size(0);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
                        __HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
  return __ockl_get_global_size(0);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
                        __HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
  return __ockl_get_global_size(1);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
                        __HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
  return __ockl_get_global_size(1);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
                        __HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
  return __ockl_get_global_size(2);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
                        __HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
  return __ockl_get_global_size(2);
}
|
|
|
|
// CUDA-style built-in coordinate objects. These are empty constexpr tags;
// reading .x/.y/.z performs the corresponding __ockl_* query at that point.
static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
|
|
|
|
|
|
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
|
|
|
|
|
|
|
|
|
|
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
|
|
|
|
|
|
|
|
|
|
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
|
|
|
|
|
|
|
|
|
|
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
|
|
# 63 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
|
|
# 73 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 3
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_vector_types.h" 1 3
|
|
# 74 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
|
|
# 6 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 1 3
|
|
# 37 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 3
|
|
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 1 3
|
|
# 55 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wshadow"
|
|
// Device-side bfloat16 ("brain float") type: 1 sign bit, 8 exponent bits and
// 7 mantissa bits -- i.e. exactly the upper 16 bits of an IEEE-754 binary32.
struct hip_bfloat16
{
    __hip_uint16_t data;  // raw bfloat16 bit pattern

    // Tag type used to select the truncating (round-toward-zero) conversion
    // overloads instead of the default round-to-nearest-even ones.
    enum truncate_t
    {
        truncate
    };

    // Trivially default-constructible (required by the is_trivial assertion
    // that follows this struct); 'data' is left uninitialized.
    __attribute__((device)) hip_bfloat16() = default;

    // Construct from float using round-to-nearest-even.
    explicit __attribute__((device)) hip_bfloat16(float f)
        : data(float_to_bfloat16(f))
    {
    }

    // Construct from float by truncating the low mantissa bits
    // (tag-dispatched overload; the tag value itself is unused).
    explicit __attribute__((device)) hip_bfloat16(float f, truncate_t)
        : data(truncate_float_to_bfloat16(f))
    {
    }

    // Widen to float: place the 16 stored bits into the upper half of a
    // binary32 and zero-fill the rest. This conversion is exact.
    __attribute__((device)) operator float() const
    {
        // Union-based type punning (well-defined in C; supported by clang
        // in C++ device code).
        union
        {
            uint32_t int32;
            float fp32;
        } u = {uint32_t(data) << 16};
        return u.fp32;
    }

    // Assign from float using round-to-nearest-even.
    __attribute__((device)) hip_bfloat16 &operator=(const float& f)
    {
        data = float_to_bfloat16(f);
        return *this;
    }

    // Named factory: round-to-nearest-even conversion.
    static __attribute__((device)) hip_bfloat16 round_to_bfloat16(float f)
    {
        hip_bfloat16 output;
        output.data = float_to_bfloat16(f);
        return output;
    }

    // Named factory: truncating conversion (tag parameter is unused).
    static __attribute__((device)) hip_bfloat16 round_to_bfloat16(float f, truncate_t)
    {
        hip_bfloat16 output;
        output.data = truncate_float_to_bfloat16(f);
        return output;
    }

private:
    // Convert a binary32 value to bfloat16 bits with round-to-nearest-even.
    static __attribute__((device)) __hip_uint16_t float_to_bfloat16(float f)
    {
        // Note: union initialization initializes the FIRST member, so fp32
        // must stay first here.
        union
        {
            float fp32;
            uint32_t int32;
        } u = {f};
        if(~u.int32 & 0x7f800000)
        {
            // Exponent is not all-ones => finite (or subnormal) input.
            // Add 0x7fff plus the LSB of the would-be truncated result so
            // that exact ties round to even. A carry may ripple into the
            // exponent field, which correctly rounds up to the next binade
            // (or to infinity).
            u.int32 += 0x7fff + ((u.int32 >> 16) & 1);
        }
        else if(u.int32 & 0xffff)
        {
            // Exponent all-ones with nonzero low mantissa bits => NaN whose
            // payload would be truncated away. Set bit 16 (a mantissa bit
            // that survives the shift) so the result remains a NaN.
            u.int32 |= 0x10000;
        }
        // Infinities (and NaNs with high-mantissa payload) fall through
        // unchanged; the top 16 bits are already the correct bfloat16.
        return __hip_uint16_t(u.int32 >> 16);
    }

    // Convert a binary32 value to bfloat16 bits by truncation
    // (round-toward-zero).
    static __attribute__((device)) __hip_uint16_t truncate_float_to_bfloat16(float f)
    {
        union
        {
            float fp32;
            uint32_t int32;
        } u = {f};
        // Keep the top 16 bits; additionally OR in 1 when the input is a
        // NaN whose payload lives only in the truncated low bits (exponent
        // all-ones and low 16 bits nonzero), so NaN-ness is preserved.
        return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
    }
};
|
|
#pragma clang diagnostic pop
|
|
|
|
// C-visible mirror of hip_bfloat16: a plain struct with the same single
// 16-bit payload, usable from C code that cannot see the C++ class.
typedef struct
{
    __hip_uint16_t data;  // raw bfloat16 bit pattern
} hip_bfloat16_public;
|
|
|
|
// hip_bfloat16 must remain standard-layout and trivial so that its in-memory
// representation is interchangeable with plain C structs across the API.
static_assert(__hip_internal::is_standard_layout<hip_bfloat16>{},
              "hip_bfloat16 is not a standard layout type, and thus is "
              "incompatible with C.");

static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
              "hip_bfloat16 is not a trivial type, and thus is "
              "incompatible with C.");
|
|
# 189 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
|
|
// Unary plus: identity.
inline __attribute__((device)) hip_bfloat16 operator+(hip_bfloat16 a)
{
    return a;
}

// Unary minus: flip the sign bit of the copied operand.
inline __attribute__((device)) hip_bfloat16 operator-(hip_bfloat16 a)
{
    a.data = a.data ^ 0x8000;
    return a;
}

// Binary arithmetic is carried out in float precision, then rounded back to
// bfloat16 (round-to-nearest-even) by the converting constructor.
inline __attribute__((device)) hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b)
{
    const float lhs = static_cast<float>(a);
    const float rhs = static_cast<float>(b);
    return hip_bfloat16(lhs + rhs);
}

inline __attribute__((device)) hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b)
{
    const float lhs = static_cast<float>(a);
    const float rhs = static_cast<float>(b);
    return hip_bfloat16(lhs - rhs);
}

inline __attribute__((device)) hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b)
{
    const float lhs = static_cast<float>(a);
    const float rhs = static_cast<float>(b);
    return hip_bfloat16(lhs * rhs);
}

inline __attribute__((device)) hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b)
{
    const float lhs = static_cast<float>(a);
    const float rhs = static_cast<float>(b);
    return hip_bfloat16(lhs / rhs);
}
|
|
inline __attribute__((device)) bool operator<(hip_bfloat16 a, hip_bfloat16 b)
|
|
{
|
|
return float(a) < float(b);
|
|
}
|
|
inline __attribute__((device)) bool operator==(hip_bfloat16 a, hip_bfloat16 b)
|
|
{
|
|
return float(a) == float(b);
|
|
}
|
|
inline __attribute__((device)) bool operator>(hip_bfloat16 a, hip_bfloat16 b)
|
|
{
|
|
return b < a;
|
|
}
|
|
inline __attribute__((device)) bool operator<=(hip_bfloat16 a, hip_bfloat16 b)
|
|
{
|
|
return !(a > b);
|
|
}
|
|
inline __attribute__((device)) bool operator!=(hip_bfloat16 a, hip_bfloat16 b)
|
|
{
|
|
return !(a == b);
|
|
}
|
|
inline __attribute__((device)) bool operator>=(hip_bfloat16 a, hip_bfloat16 b)
|
|
{
|
|
return !(a < b);
|
|
}
|
|
// Compound assignment: delegate to the corresponding binary operator and
// store the rounded result back into the left-hand operand.
inline __attribute__((device)) hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b)
{
    a = a + b;
    return a;
}

inline __attribute__((device)) hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b)
{
    a = a - b;
    return a;
}

inline __attribute__((device)) hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b)
{
    a = a * b;
    return a;
}

inline __attribute__((device)) hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b)
{
    a = a / b;
    return a;
}
|
|
// Prefix increment/decrement: adjust by 1.0f (rounded through bfloat16).
inline __attribute__((device)) hip_bfloat16& operator++(hip_bfloat16& a)
{
    a += hip_bfloat16(1.0f);
    return a;
}

inline __attribute__((device)) hip_bfloat16& operator--(hip_bfloat16& a)
{
    a -= hip_bfloat16(1.0f);
    return a;
}

// Postfix forms return the value held before the adjustment.
inline __attribute__((device)) hip_bfloat16 operator++(hip_bfloat16& a, int)
{
    const hip_bfloat16 previous = a;
    ++a;
    return previous;
}

inline __attribute__((device)) hip_bfloat16 operator--(hip_bfloat16& a, int)
{
    const hip_bfloat16 previous = a;
    --a;
    return previous;
}
|
|
|
|
namespace std
|
|
{
|
|
constexpr __attribute__((device)) bool isinf(hip_bfloat16 a)
|
|
{
|
|
return !(~a.data & 0x7f80) && !(a.data & 0x7f);
|
|
}
|
|
constexpr __attribute__((device)) bool isnan(hip_bfloat16 a)
|
|
{
|
|
return !(~a.data & 0x7f80) && +(a.data & 0x7f);
|
|
}
|
|
constexpr __attribute__((device)) bool iszero(hip_bfloat16 a)
|
|
{
|
|
return !(a.data & 0x7fff);
|
|
}
|
|
}
|
|
# 38 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 2 3
|
|
# 7 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
|
|
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wreserved-id-macro"
|
|
#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
|
|
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
|
|
#pragma clang diagnostic ignored "-Wundef"
|
|
#define __device__ __attribute__((device))
|
|
#define __host__ __attribute__((host))
|
|
#define __global__ __attribute__((global))
|
|
#define __constant__ __attribute__((constant))
|
|
#define __shared__ __attribute__((shared))
|
|
#define __align__(x) __attribute__((aligned(x)))
|
|
#if !defined(__has_feature) || !__has_feature(cuda_noinline_keyword)
|
|
#define __noinline__ __attribute__((noinline))
|
|
#endif
|
|
#define __forceinline__ inline __attribute__((always_inline))
|
|
#if __HIP_NO_IMAGE_SUPPORT
|
|
#define __hip_img_chk__ __attribute__((unavailable("The image/texture API not supported on the device")))
|
|
#else
|
|
#define __hip_img_chk__
|
|
#endif
|
|
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
|
|
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
|
|
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
|
|
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
|
|
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
|
|
#define select_impl_(_1, _2, impl_, ...) impl_
|
|
#define __launch_bounds__(...) \
|
|
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
|
|
#define HIP_INCLUDE_HIP_HIP_RUNTIME_H
|
|
#define _HIP_BFLOAT16_H_
|
|
#define HIP_INCLUDE_HIP_MATH_FUNCTIONS_H
|
|
#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
|
|
#if !__HIP_NO_STD_DEFS__
|
|
#if defined(__HIPRTC_PTRDIFF_T_IS_LONG_LONG__) && __HIPRTC_PTRDIFF_T_IS_LONG_LONG__==1
|
|
typedef long long ptrdiff_t;
|
|
#else
|
|
typedef __PTRDIFF_TYPE__ ptrdiff_t;
|
|
#endif
|
|
typedef long clock_t;
|
|
namespace std {
|
|
using ::ptrdiff_t;
|
|
using ::clock_t;
|
|
}
|
|
#endif // __HIP_NO_STD_DEFS__
|
|
#pragma clang diagnostic pop/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_HIP_COMMON_H
|
|
#define HIP_INCLUDE_HIP_HIP_COMMON_H
|
|
|
|
#if defined(__clang__)
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
|
|
#endif
|
|
// Common code included at start of every hip file.
|
|
// Auto enable __HIP_PLATFORM_AMD__ if compiling on AMD platform
|
|
// Other compiler (GCC,ICC,etc) need to set one of these macros explicitly
|
|
#if defined(__clang__) && defined(__HIP__)
|
|
#ifndef __HIP_PLATFORM_AMD__
|
|
#define __HIP_PLATFORM_AMD__
|
|
#endif
|
|
#endif // defined(__clang__) && defined(__HIP__)
|
|
|
|
// Auto enable __HIP_PLATFORM_NVIDIA__ if compiling with NVIDIA platform
|
|
#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__) && !defined(__HIP__))
|
|
#ifndef __HIP_PLATFORM_NVIDIA__
|
|
#define __HIP_PLATFORM_NVIDIA__
|
|
#endif
|
|
|
|
#ifdef __CUDACC__
|
|
#define __HIPCC__
|
|
#endif
|
|
|
|
#endif //__NVCC__
|
|
|
|
// Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path
|
|
#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) || \
|
|
(defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0)
|
|
#define __HIP_DEVICE_COMPILE__ 1
|
|
#endif
|
|
|
|
#ifdef __GNUC__
|
|
#define HIP_PUBLIC_API __attribute__ ((visibility ("default")))
|
|
#define HIP_INTERNAL_EXPORTED_API __attribute__ ((visibility ("default")))
|
|
#else
|
|
#define HIP_PUBLIC_API
|
|
#define HIP_INTERNAL_EXPORTED_API
|
|
#endif
|
|
|
|
#if __HIP_DEVICE_COMPILE__ == 0
|
|
// 32-bit Atomics
|
|
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (0)
|
|
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (0)
|
|
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
|
|
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
|
|
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
|
|
|
|
// 64-bit Atomics
|
|
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (0)
|
|
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
|
|
|
|
// Doubles
|
|
#define __HIP_ARCH_HAS_DOUBLES__ (0)
|
|
|
|
// Warp cross-lane operations
|
|
#define __HIP_ARCH_HAS_WARP_VOTE__ (0)
|
|
#define __HIP_ARCH_HAS_WARP_BALLOT__ (0)
|
|
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (0)
|
|
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
|
|
|
|
// Sync
|
|
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
|
|
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
|
|
|
|
// Misc
|
|
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
|
|
#define __HIP_ARCH_HAS_3DGRID__ (0)
|
|
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
|
|
#endif
|
|
|
|
#if defined(__clang__)
|
|
#pragma clang diagnostic pop
|
|
#endif
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_LIBRARY_TYPES_H
|
|
#define HIP_INCLUDE_HIP_LIBRARY_TYPES_H
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/hip_common.h>
|
|
#endif
|
|
|
|
#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
|
|
|
|
typedef enum hipDataType {
|
|
HIP_R_32F = 0,
|
|
HIP_R_64F = 1,
|
|
HIP_R_16F = 2,
|
|
HIP_R_8I = 3,
|
|
HIP_C_32F = 4,
|
|
HIP_C_64F = 5,
|
|
HIP_C_16F = 6,
|
|
HIP_C_8I = 7,
|
|
HIP_R_8U = 8,
|
|
HIP_C_8U = 9,
|
|
HIP_R_32I = 10,
|
|
HIP_C_32I = 11,
|
|
HIP_R_32U = 12,
|
|
HIP_C_32U = 13,
|
|
HIP_R_16BF = 14,
|
|
HIP_C_16BF = 15,
|
|
HIP_R_4I = 16,
|
|
HIP_C_4I = 17,
|
|
HIP_R_4U = 18,
|
|
HIP_C_4U = 19,
|
|
HIP_R_16I = 20,
|
|
HIP_C_16I = 21,
|
|
HIP_R_16U = 22,
|
|
HIP_C_16U = 23,
|
|
HIP_R_64I = 24,
|
|
HIP_C_64I = 25,
|
|
HIP_R_64U = 26,
|
|
HIP_C_64U = 27,
|
|
// HIP specific Data Types
|
|
HIP_R_8F_E4M3_FNUZ = 1000,
|
|
HIP_R_8F_E5M2_FNUZ = 1001
|
|
} hipDataType;
|
|
|
|
typedef enum hipLibraryPropertyType {
|
|
HIP_LIBRARY_MAJOR_VERSION,
|
|
HIP_LIBRARY_MINOR_VERSION,
|
|
HIP_LIBRARY_PATCH_LEVEL
|
|
} hipLibraryPropertyType;
|
|
|
|
#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
|
|
#include "library_types.h"
|
|
#else
|
|
#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
|
|
#endif
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_DRIVER_TYPES_H
|
|
#define HIP_INCLUDE_HIP_DRIVER_TYPES_H
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/hip_common.h>
|
|
#endif
|
|
|
|
#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
|
|
#include "driver_types.h"
|
|
#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#ifndef __cplusplus
|
|
#include <stdbool.h>
|
|
#endif
|
|
#endif // !defined(__HIPCC_RTC__)
|
|
typedef void* hipDeviceptr_t;
|
|
// How the channels of an array/texture element are interpreted.
typedef enum hipChannelFormatKind {
    hipChannelFormatKindSigned = 0,    // signed integer channels
    hipChannelFormatKindUnsigned = 1,  // unsigned integer channels
    hipChannelFormatKindFloat = 2,     // floating-point channels
    hipChannelFormatKindNone = 3       // no channel data
}hipChannelFormatKind;
// Per-channel bit widths plus the shared interpretation kind describing one
// array/texture element (cf. hipCreateChannelDesc).
typedef struct hipChannelFormatDesc {
    int x;  // bits in the x channel
    int y;  // bits in the y channel
    int z;  // bits in the z channel
    int w;  // bits in the w channel
    enum hipChannelFormatKind f;  // channel interpretation
}hipChannelFormatDesc;
|
|
#define HIP_TRSA_OVERRIDE_FORMAT 0x01
|
|
#define HIP_TRSF_READ_AS_INTEGER 0x01
|
|
#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
|
|
#define HIP_TRSF_SRGB 0x10
|
|
|
|
typedef struct hipArray* hipArray_t;
|
|
typedef const struct hipArray* hipArray_const_t;
|
|
typedef enum hipArray_Format {
|
|
HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
|
|
HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
|
|
HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
|
|
HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
|
|
HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
|
|
HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
|
|
HIP_AD_FORMAT_HALF = 0x10,
|
|
HIP_AD_FORMAT_FLOAT = 0x20
|
|
}hipArray_Format;
|
|
typedef struct HIP_ARRAY_DESCRIPTOR {
|
|
size_t Width;
|
|
size_t Height;
|
|
enum hipArray_Format Format;
|
|
unsigned int NumChannels;
|
|
}HIP_ARRAY_DESCRIPTOR;
|
|
typedef struct HIP_ARRAY3D_DESCRIPTOR {
|
|
size_t Width;
|
|
size_t Height;
|
|
size_t Depth;
|
|
enum hipArray_Format Format;
|
|
unsigned int NumChannels;
|
|
unsigned int Flags;
|
|
}HIP_ARRAY3D_DESCRIPTOR;
|
|
#if !defined(__HIPCC_RTC__)
|
|
typedef struct hip_Memcpy2D {
|
|
size_t srcXInBytes;
|
|
size_t srcY;
|
|
hipMemoryType srcMemoryType;
|
|
const void* srcHost;
|
|
hipDeviceptr_t srcDevice;
|
|
hipArray_t srcArray;
|
|
size_t srcPitch;
|
|
size_t dstXInBytes;
|
|
size_t dstY;
|
|
hipMemoryType dstMemoryType;
|
|
void* dstHost;
|
|
hipDeviceptr_t dstDevice;
|
|
hipArray_t dstArray;
|
|
size_t dstPitch;
|
|
size_t WidthInBytes;
|
|
size_t Height;
|
|
} hip_Memcpy2D;
|
|
#endif // !defined(__HIPCC_RTC__)
|
|
typedef struct hipMipmappedArray {
|
|
void* data;
|
|
struct hipChannelFormatDesc desc;
|
|
unsigned int type;
|
|
unsigned int width;
|
|
unsigned int height;
|
|
unsigned int depth;
|
|
unsigned int min_mipmap_level;
|
|
unsigned int max_mipmap_level;
|
|
unsigned int flags;
|
|
enum hipArray_Format format;
|
|
unsigned int num_channels;
|
|
} hipMipmappedArray;
|
|
typedef struct hipMipmappedArray* hipMipmappedArray_t;
|
|
typedef hipMipmappedArray_t hipmipmappedArray;
|
|
typedef const struct hipMipmappedArray* hipMipmappedArray_const_t;
|
|
/**
|
|
* hip resource types
|
|
*/
|
|
typedef enum hipResourceType {
|
|
hipResourceTypeArray = 0x00,
|
|
hipResourceTypeMipmappedArray = 0x01,
|
|
hipResourceTypeLinear = 0x02,
|
|
hipResourceTypePitch2D = 0x03
|
|
}hipResourceType;
|
|
typedef enum HIPresourcetype_enum {
|
|
HIP_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */
|
|
HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
|
|
HIP_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */
|
|
HIP_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */
|
|
} HIPresourcetype, hipResourcetype;
|
|
/**
|
|
* hip address modes
|
|
*/
|
|
typedef enum HIPaddress_mode_enum {
|
|
HIP_TR_ADDRESS_MODE_WRAP = 0,
|
|
HIP_TR_ADDRESS_MODE_CLAMP = 1,
|
|
HIP_TR_ADDRESS_MODE_MIRROR = 2,
|
|
HIP_TR_ADDRESS_MODE_BORDER = 3
|
|
} HIPaddress_mode;
|
|
/**
|
|
* hip filter modes
|
|
*/
|
|
typedef enum HIPfilter_mode_enum {
|
|
HIP_TR_FILTER_MODE_POINT = 0,
|
|
HIP_TR_FILTER_MODE_LINEAR = 1
|
|
} HIPfilter_mode;
|
|
/**
|
|
* Texture descriptor
|
|
*/
|
|
typedef struct HIP_TEXTURE_DESC_st {
|
|
HIPaddress_mode addressMode[3]; /**< Address modes */
|
|
HIPfilter_mode filterMode; /**< Filter mode */
|
|
unsigned int flags; /**< Flags */
|
|
unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */
|
|
HIPfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
|
|
float mipmapLevelBias; /**< Mipmap level bias */
|
|
float minMipmapLevelClamp; /**< Mipmap minimum level clamp */
|
|
float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */
|
|
float borderColor[4]; /**< Border Color */
|
|
int reserved[12];
|
|
} HIP_TEXTURE_DESC;
|
|
/**
|
|
* hip texture resource view formats
|
|
*/
|
|
typedef enum hipResourceViewFormat {
|
|
hipResViewFormatNone = 0x00,
|
|
hipResViewFormatUnsignedChar1 = 0x01,
|
|
hipResViewFormatUnsignedChar2 = 0x02,
|
|
hipResViewFormatUnsignedChar4 = 0x03,
|
|
hipResViewFormatSignedChar1 = 0x04,
|
|
hipResViewFormatSignedChar2 = 0x05,
|
|
hipResViewFormatSignedChar4 = 0x06,
|
|
hipResViewFormatUnsignedShort1 = 0x07,
|
|
hipResViewFormatUnsignedShort2 = 0x08,
|
|
hipResViewFormatUnsignedShort4 = 0x09,
|
|
hipResViewFormatSignedShort1 = 0x0a,
|
|
hipResViewFormatSignedShort2 = 0x0b,
|
|
hipResViewFormatSignedShort4 = 0x0c,
|
|
hipResViewFormatUnsignedInt1 = 0x0d,
|
|
hipResViewFormatUnsignedInt2 = 0x0e,
|
|
hipResViewFormatUnsignedInt4 = 0x0f,
|
|
hipResViewFormatSignedInt1 = 0x10,
|
|
hipResViewFormatSignedInt2 = 0x11,
|
|
hipResViewFormatSignedInt4 = 0x12,
|
|
hipResViewFormatHalf1 = 0x13,
|
|
hipResViewFormatHalf2 = 0x14,
|
|
hipResViewFormatHalf4 = 0x15,
|
|
hipResViewFormatFloat1 = 0x16,
|
|
hipResViewFormatFloat2 = 0x17,
|
|
hipResViewFormatFloat4 = 0x18,
|
|
hipResViewFormatUnsignedBlockCompressed1 = 0x19,
|
|
hipResViewFormatUnsignedBlockCompressed2 = 0x1a,
|
|
hipResViewFormatUnsignedBlockCompressed3 = 0x1b,
|
|
hipResViewFormatUnsignedBlockCompressed4 = 0x1c,
|
|
hipResViewFormatSignedBlockCompressed4 = 0x1d,
|
|
hipResViewFormatUnsignedBlockCompressed5 = 0x1e,
|
|
hipResViewFormatSignedBlockCompressed5 = 0x1f,
|
|
hipResViewFormatUnsignedBlockCompressed6H = 0x20,
|
|
hipResViewFormatSignedBlockCompressed6H = 0x21,
|
|
hipResViewFormatUnsignedBlockCompressed7 = 0x22
|
|
}hipResourceViewFormat;
|
|
typedef enum HIPresourceViewFormat_enum
|
|
{
|
|
HIP_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */
|
|
HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */
|
|
HIP_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */
|
|
HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */
|
|
HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */
|
|
HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */
|
|
HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */
|
|
HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */
|
|
HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */
|
|
HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */
|
|
HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */
|
|
HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */
|
|
HIP_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */
|
|
HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */
|
|
HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */
|
|
HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */
|
|
HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */
|
|
HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */
|
|
HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */
|
|
HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */
|
|
HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */
|
|
HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */
|
|
HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */
|
|
HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */
|
|
HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */
|
|
HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */
|
|
HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */
|
|
HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */
|
|
HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */
|
|
HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */
|
|
HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */
|
|
HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */
|
|
HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
|
|
HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */
|
|
HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */
|
|
} HIPresourceViewFormat;
|
|
/**
|
|
* HIP resource descriptor
|
|
*/
|
|
typedef struct hipResourceDesc {
|
|
enum hipResourceType resType;
|
|
union {
|
|
struct {
|
|
hipArray_t array;
|
|
} array;
|
|
struct {
|
|
hipMipmappedArray_t mipmap;
|
|
} mipmap;
|
|
struct {
|
|
void* devPtr;
|
|
struct hipChannelFormatDesc desc;
|
|
size_t sizeInBytes;
|
|
} linear;
|
|
struct {
|
|
void* devPtr;
|
|
struct hipChannelFormatDesc desc;
|
|
size_t width;
|
|
size_t height;
|
|
size_t pitchInBytes;
|
|
} pitch2D;
|
|
} res;
|
|
}hipResourceDesc;
|
|
typedef struct HIP_RESOURCE_DESC_st
|
|
{
|
|
HIPresourcetype resType; /**< Resource type */
|
|
union {
|
|
struct {
|
|
hipArray_t hArray; /**< HIP array */
|
|
} array;
|
|
struct {
|
|
hipMipmappedArray_t hMipmappedArray; /**< HIP mipmapped array */
|
|
} mipmap;
|
|
struct {
|
|
hipDeviceptr_t devPtr; /**< Device pointer */
|
|
hipArray_Format format; /**< Array format */
|
|
unsigned int numChannels; /**< Channels per array element */
|
|
size_t sizeInBytes; /**< Size in bytes */
|
|
} linear;
|
|
struct {
|
|
hipDeviceptr_t devPtr; /**< Device pointer */
|
|
hipArray_Format format; /**< Array format */
|
|
unsigned int numChannels; /**< Channels per array element */
|
|
size_t width; /**< Width of the array in elements */
|
|
size_t height; /**< Height of the array in elements */
|
|
size_t pitchInBytes; /**< Pitch between two rows in bytes */
|
|
} pitch2D;
|
|
struct {
|
|
int reserved[32];
|
|
} reserved;
|
|
} res;
|
|
unsigned int flags; /**< Flags (must be zero) */
|
|
} HIP_RESOURCE_DESC;
|
|
/**
|
|
* hip resource view descriptor
|
|
*/
|
|
struct hipResourceViewDesc {
|
|
enum hipResourceViewFormat format;
|
|
size_t width;
|
|
size_t height;
|
|
size_t depth;
|
|
unsigned int firstMipmapLevel;
|
|
unsigned int lastMipmapLevel;
|
|
unsigned int firstLayer;
|
|
unsigned int lastLayer;
|
|
};
|
|
/**
|
|
* Resource view descriptor
|
|
*/
|
|
typedef struct HIP_RESOURCE_VIEW_DESC_st
|
|
{
|
|
HIPresourceViewFormat format; /**< Resource view format */
|
|
size_t width; /**< Width of the resource view */
|
|
size_t height; /**< Height of the resource view */
|
|
size_t depth; /**< Depth of the resource view */
|
|
unsigned int firstMipmapLevel; /**< First defined mipmap level */
|
|
unsigned int lastMipmapLevel; /**< Last defined mipmap level */
|
|
unsigned int firstLayer; /**< First layer index */
|
|
unsigned int lastLayer; /**< Last layer index */
|
|
unsigned int reserved[16];
|
|
} HIP_RESOURCE_VIEW_DESC;
|
|
/**
|
|
* Memory copy types
|
|
*
|
|
*/
|
|
#if !defined(__HIPCC_RTC__)
|
|
typedef enum hipMemcpyKind {
|
|
hipMemcpyHostToHost = 0, ///< Host-to-Host Copy
|
|
hipMemcpyHostToDevice = 1, ///< Host-to-Device Copy
|
|
hipMemcpyDeviceToHost = 2, ///< Device-to-Host Copy
|
|
hipMemcpyDeviceToDevice = 3, ///< Device-to-Device Copy
|
|
hipMemcpyDefault =
|
|
4 ///< Runtime will automatically determine copy-kind based on virtual addresses.
|
|
} hipMemcpyKind;
|
|
// Pointer plus pitch information for padded 2D/3D allocations
// (cf. hipMallocPitch / hipMalloc3D).
typedef struct hipPitchedPtr {
    void* ptr;     // base address of the allocation
    size_t pitch;  // byte pitch between successive rows (includes padding)
    size_t xsize;  // logical width of the allocation
    size_t ysize;  // logical height of the allocation
}hipPitchedPtr;
|
|
// 3D size descriptor used by the memcpy3D/malloc3D APIs.
typedef struct hipExtent {
    size_t width;   // width in elements when referring to array memory,
                    // in bytes when referring to linear memory
    size_t height;  // height in rows
    size_t depth;   // depth in slices
}hipExtent;
|
|
// 3D offset (position) used by the memcpy3D APIs.
typedef struct hipPos {
    size_t x;  // offset along the x (width) axis
    size_t y;  // offset along the y (height) axis
    size_t z;  // offset along the z (depth) axis
}hipPos;
|
|
typedef struct hipMemcpy3DParms {
|
|
hipArray_t srcArray;
|
|
struct hipPos srcPos;
|
|
struct hipPitchedPtr srcPtr;
|
|
hipArray_t dstArray;
|
|
struct hipPos dstPos;
|
|
struct hipPitchedPtr dstPtr;
|
|
struct hipExtent extent;
|
|
enum hipMemcpyKind kind;
|
|
} hipMemcpy3DParms;
|
|
typedef struct HIP_MEMCPY3D {
|
|
size_t srcXInBytes;
|
|
size_t srcY;
|
|
size_t srcZ;
|
|
size_t srcLOD;
|
|
hipMemoryType srcMemoryType;
|
|
const void* srcHost;
|
|
hipDeviceptr_t srcDevice;
|
|
hipArray_t srcArray;
|
|
size_t srcPitch;
|
|
size_t srcHeight;
|
|
size_t dstXInBytes;
|
|
size_t dstY;
|
|
size_t dstZ;
|
|
size_t dstLOD;
|
|
hipMemoryType dstMemoryType;
|
|
void* dstHost;
|
|
hipDeviceptr_t dstDevice;
|
|
hipArray_t dstArray;
|
|
size_t dstPitch;
|
|
size_t dstHeight;
|
|
size_t WidthInBytes;
|
|
size_t Height;
|
|
size_t Depth;
|
|
} HIP_MEMCPY3D;
|
|
static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
|
|
size_t ysz) {
|
|
struct hipPitchedPtr s;
|
|
s.ptr = d;
|
|
s.pitch = p;
|
|
s.xsize = xsz;
|
|
s.ysize = ysz;
|
|
return s;
|
|
}
|
|
static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
|
|
struct hipPos p;
|
|
p.x = x;
|
|
p.y = y;
|
|
p.z = z;
|
|
return p;
|
|
}
|
|
static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
|
|
struct hipExtent e;
|
|
e.width = w;
|
|
e.height = h;
|
|
e.depth = d;
|
|
return e;
|
|
}
|
|
typedef enum hipFunction_attribute {
|
|
HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
|
|
HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
|
|
HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES,
|
|
HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
|
|
HIP_FUNC_ATTRIBUTE_NUM_REGS,
|
|
HIP_FUNC_ATTRIBUTE_PTX_VERSION,
|
|
HIP_FUNC_ATTRIBUTE_BINARY_VERSION,
|
|
HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA,
|
|
HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
|
|
HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT,
|
|
HIP_FUNC_ATTRIBUTE_MAX
|
|
} hipFunction_attribute;
|
|
|
|
typedef enum hipPointer_attribute {
|
|
HIP_POINTER_ATTRIBUTE_CONTEXT = 1, ///< The context on which a pointer was allocated
|
|
///< @warning - not supported in HIP
|
|
HIP_POINTER_ATTRIBUTE_MEMORY_TYPE, ///< memory type describing location of a pointer
|
|
HIP_POINTER_ATTRIBUTE_DEVICE_POINTER,///< address at which the pointer is allocated on device
|
|
HIP_POINTER_ATTRIBUTE_HOST_POINTER, ///< address at which the pointer is allocated on host
|
|
HIP_POINTER_ATTRIBUTE_P2P_TOKENS, ///< A pair of tokens for use with linux kernel interface
|
|
///< @warning - not supported in HIP
|
|
HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS, ///< Synchronize every synchronous memory operation
|
|
///< initiated on this region
|
|
HIP_POINTER_ATTRIBUTE_BUFFER_ID, ///< Unique ID for an allocated memory region
|
|
HIP_POINTER_ATTRIBUTE_IS_MANAGED, ///< Indicates if the pointer points to managed memory
|
|
HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL,///< device ordinal of a device on which a pointer
|
|
///< was allocated or registered
|
|
HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE, ///< if this pointer maps to an allocation
|
|
///< that is suitable for hipIpcGetMemHandle
|
|
///< @warning - not supported in HIP
|
|
HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,///< Starting address for this requested pointer
|
|
HIP_POINTER_ATTRIBUTE_RANGE_SIZE, ///< Size of the address range for this requested pointer
|
|
HIP_POINTER_ATTRIBUTE_MAPPED, ///< tells if this pointer is in a valid address range
|
|
///< that is mapped to a backing allocation
|
|
HIP_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES,///< Bitmask of allowed hipmemAllocationHandleType
|
|
///< for this allocation @warning - not supported in HIP
|
|
HIP_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE, ///< returns if the memory referenced by
|
|
///< this pointer can be used with the GPUDirect RDMA API
|
|
///< @warning - not supported in HIP
|
|
HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS, ///< Returns the access flags the device associated with
|
|
///< for the corresponding memory referenced by the ptr
|
|
HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE ///< Returns the mempool handle for the allocation if
|
|
///< it was allocated from a mempool
|
|
///< @warning - not supported in HIP
|
|
} hipPointer_attribute;
|
|
|
|
#endif // !defined(__HIPCC_RTC__)
|
|
#else
|
|
#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
|
|
#endif
|
|
#endif
|
|
/*
|
|
Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
/**
|
|
* @file surface_types.h
|
|
* @brief Defines surface types for HIP runtime.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_SURFACE_TYPES_H
|
|
#define HIP_INCLUDE_HIP_SURFACE_TYPES_H
|
|
|
|
#if defined(__clang__)
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wreserved-identifier"
|
|
#endif
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/driver_types.h>
|
|
#endif
|
|
|
|
/**
|
|
* An opaque value that represents a hip surface object
|
|
*/
|
|
struct __hip_surface;
|
|
typedef struct __hip_surface* hipSurfaceObject_t;
|
|
|
|
/**
|
|
* hip surface reference
|
|
*/
|
|
/**
 * hip surface reference: thin wrapper holding the opaque surface object handle.
 */
struct surfaceReference {
    hipSurfaceObject_t surfaceObject;  ///< Opaque handle of the surface this reference binds to
};
|
|
|
|
/**
|
|
* hip surface boundary modes
|
|
*/
|
|
/**
 * hip surface boundary modes: behavior for out-of-range surface coordinates.
 */
enum hipSurfaceBoundaryMode {
    hipBoundaryModeZero = 0,   ///< Out-of-range accesses read as zero
    hipBoundaryModeTrap = 1,   ///< Out-of-range accesses trap (fault)
    hipBoundaryModeClamp = 2   ///< Out-of-range coordinates are clamped into range
};
|
|
|
|
#if defined(__clang__)
|
|
#pragma clang diagnostic pop
|
|
#endif
|
|
|
|
#endif /* !HIP_INCLUDE_HIP_SURFACE_TYPES_H */
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
|
|
#define HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/hip_common.h>
|
|
#include <hip/driver_types.h>
|
|
#include <hip/amd_detail/amd_hip_vector_types.h>
|
|
#endif
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" HIP_PUBLIC_API
|
|
hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
|
|
|
|
/**
 * Builds a channel descriptor for a single 16-bit floating point (half) channel.
 * The half width is expressed via sizeof(unsigned short), which is its storage type.
 */
static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
    const int halfBits = (int)sizeof(unsigned short) * 8;
    return hipCreateChannelDesc(halfBits, 0, 0, 0, hipChannelFormatKindFloat);
}
|
|
|
|
/**
 * Builds a channel descriptor for a 1-component half texture
 * (identical layout to hipCreateChannelDescHalf()).
 */
static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
    const int halfBits = (int)sizeof(unsigned short) * 8;
    return hipCreateChannelDesc(halfBits, 0, 0, 0, hipChannelFormatKindFloat);
}
|
|
|
|
/**
 * Builds a channel descriptor for a 2-component half texture (x and y channels).
 */
static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
    const int halfBits = (int)sizeof(unsigned short) * 8;
    return hipCreateChannelDesc(halfBits, halfBits, 0, 0, hipChannelFormatKindFloat);
}
|
|
|
|
/**
 * Builds a channel descriptor for a 4-component half texture (x, y, z, w channels).
 */
static inline hipChannelFormatDesc hipCreateChannelDescHalf4() {
    const int halfBits = (int)sizeof(unsigned short) * 8;
    return hipCreateChannelDesc(halfBits, halfBits, halfBits, halfBits, hipChannelFormatKindFloat);
}
|
|
|
|
/**
 * Fallback channel-descriptor factory: any element type without an explicit
 * specialization below maps to an empty descriptor of kind
 * hipChannelFormatKindNone (all channel widths zero).
 */
template <typename T>
static inline hipChannelFormatDesc hipCreateChannelDesc() {
    return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
|
|
int e = (int)sizeof(char) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
|
|
int e = (int)sizeof(signed char) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
|
|
int e = (int)sizeof(unsigned char) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
|
|
int e = (int)sizeof(unsigned char) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
|
|
int e = (int)sizeof(signed char) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
|
|
int e = (int)sizeof(unsigned char) * 8;
|
|
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
|
|
int e = (int)sizeof(signed char) * 8;
|
|
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
#ifndef __GNUC__ // vector3 is the same as vector4
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
|
|
int e = (int)sizeof(unsigned char) * 8;
|
|
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
|
|
int e = (int)sizeof(signed char) * 8;
|
|
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
|
}
|
|
#endif
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
|
|
int e = (int)sizeof(unsigned char) * 8;
|
|
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
|
|
int e = (int)sizeof(signed char) * 8;
|
|
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
|
|
int e = (int)sizeof(unsigned short) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
|
|
int e = (int)sizeof(signed short) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
|
|
int e = (int)sizeof(unsigned short) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
|
|
int e = (int)sizeof(signed short) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
|
|
int e = (int)sizeof(unsigned short) * 8;
|
|
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
|
|
int e = (int)sizeof(signed short) * 8;
|
|
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
#ifndef __GNUC__
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
|
|
int e = (int)sizeof(unsigned short) * 8;
|
|
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
|
|
int e = (int)sizeof(signed short) * 8;
|
|
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
|
}
|
|
#endif
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
|
|
int e = (int)sizeof(unsigned short) * 8;
|
|
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
|
|
int e = (int)sizeof(signed short) * 8;
|
|
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
|
|
int e = (int)sizeof(unsigned int) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
|
|
int e = (int)sizeof(signed int) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
|
|
int e = (int)sizeof(unsigned int) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
|
|
int e = (int)sizeof(signed int) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
|
|
int e = (int)sizeof(unsigned int) * 8;
|
|
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
|
|
int e = (int)sizeof(signed int) * 8;
|
|
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
#ifndef __GNUC__
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
|
|
int e = (int)sizeof(unsigned int) * 8;
|
|
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
|
|
int e = (int)sizeof(signed int) * 8;
|
|
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
|
}
|
|
#endif
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
|
|
int e = (int)sizeof(unsigned int) * 8;
|
|
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
|
|
int e = (int)sizeof(signed int) * 8;
|
|
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
|
|
int e = (int)sizeof(float) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
|
|
int e = (int)sizeof(float) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
|
|
int e = (int)sizeof(float) * 8;
|
|
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
|
|
}
|
|
|
|
#ifndef __GNUC__
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
|
|
int e = (int)sizeof(float) * 8;
|
|
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
|
|
}
|
|
#endif
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
|
|
int e = (int)sizeof(float) * 8;
|
|
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
|
|
}
|
|
|
|
#if !defined(__LP64__)
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
|
|
int e = (int)sizeof(unsigned long) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
|
|
int e = (int)sizeof(signed long) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
|
|
int e = (int)sizeof(unsigned long) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
|
|
int e = (int)sizeof(signed long) * 8;
|
|
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
|
|
int e = (int)sizeof(unsigned long) * 8;
|
|
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
|
|
int e = (int)sizeof(signed long) * 8;
|
|
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
|
|
}
|
|
|
|
#ifndef __GNUC__
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
|
|
int e = (int)sizeof(unsigned long) * 8;
|
|
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
|
|
int e = (int)sizeof(signed long) * 8;
|
|
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
|
|
}
|
|
#endif
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
|
|
int e = (int)sizeof(unsigned long) * 8;
|
|
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
|
|
}
|
|
|
|
template <>
|
|
inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
|
|
int e = (int)sizeof(signed long) * 8;
|
|
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
|
|
}
|
|
#endif /* !__LP64__ */
|
|
|
|
#else
|
|
|
|
struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
|
|
enum hipChannelFormatKind f);
|
|
|
|
#endif /* __cplusplus */
|
|
|
|
#endif /* !HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H */
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_TEXTURE_TYPES_H
|
|
#define HIP_INCLUDE_HIP_TEXTURE_TYPES_H
|
|
|
|
#if defined(__clang__)
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wreserved-identifier"
|
|
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
|
|
#pragma clang diagnostic ignored "-Wc++98-compat"
|
|
#endif
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/hip_common.h>
|
|
#endif
|
|
|
|
#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
|
|
#include "texture_types.h"
|
|
#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
|
|
/*******************************************************************************
|
|
* *
|
|
* *
|
|
* *
|
|
*******************************************************************************/
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <limits.h>
|
|
#include <hip/channel_descriptor.h>
|
|
#include <hip/driver_types.h>
|
|
#endif // !defined(__HIPCC_RTC__)
|
|
|
|
#define hipTextureType1D 0x01
|
|
#define hipTextureType2D 0x02
|
|
#define hipTextureType3D 0x03
|
|
#define hipTextureTypeCubemap 0x0C
|
|
#define hipTextureType1DLayered 0xF1
|
|
#define hipTextureType2DLayered 0xF2
|
|
#define hipTextureTypeCubemapLayered 0xFC
|
|
|
|
/**
|
|
* Should be same as HSA_IMAGE_OBJECT_SIZE_DWORD/HSA_SAMPLER_OBJECT_SIZE_DWORD
|
|
*/
|
|
#define HIP_IMAGE_OBJECT_SIZE_DWORD 12
|
|
#define HIP_SAMPLER_OBJECT_SIZE_DWORD 8
|
|
#define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD
|
|
#define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD)
|
|
|
|
/**
|
|
* An opaque value that represents a hip texture object
|
|
*/
|
|
struct __hip_texture;
|
|
typedef struct __hip_texture* hipTextureObject_t;
|
|
|
|
/**
|
|
* hip texture address modes
|
|
*/
|
|
enum hipTextureAddressMode {
|
|
hipAddressModeWrap = 0,
|
|
hipAddressModeClamp = 1,
|
|
hipAddressModeMirror = 2,
|
|
hipAddressModeBorder = 3
|
|
};
|
|
|
|
/**
|
|
* hip texture filter modes
|
|
*/
|
|
enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 };
|
|
|
|
/**
|
|
* hip texture read modes
|
|
*/
|
|
enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 };
|
|
|
|
/**
|
|
* hip texture reference
|
|
*/
|
|
/**
 * hip texture reference. Field order and types are ABI-sensitive; runtime-API
 * and driver-API fields share this one struct.
 */
typedef struct textureReference {
    int normalized;                              ///< Non-zero when texture coordinates are normalized
    enum hipTextureReadMode readMode;            // used only for driver API's
    enum hipTextureFilterMode filterMode;        ///< Point or linear filtering
    enum hipTextureAddressMode addressMode[3];   // Texture address mode for up to 3 dimensions
    struct hipChannelFormatDesc channelDesc;     ///< Format of the channel data
    int sRGB;                                    // Perform sRGB->linear conversion during texture read
    unsigned int maxAnisotropy;                  // Limit to the anisotropy ratio
    enum hipTextureFilterMode mipmapFilterMode;  ///< Filtering applied between mipmap levels
    float mipmapLevelBias;                       ///< Bias added to the computed mipmap level
    float minMipmapLevelClamp;                   ///< Lower clamp for the mipmap level
    float maxMipmapLevelClamp;                   ///< Upper clamp for the mipmap level

    hipTextureObject_t textureObject;            ///< Backing texture object handle
    int numChannels;                             // presumably driver-API channel count -- confirm against runtime
    enum hipArray_Format format;                 // presumably driver-API element format -- confirm against runtime
}textureReference;
|
|
|
|
/**
|
|
* hip texture descriptor
|
|
*/
|
|
/**
 * hip texture descriptor: sampling parameters supplied when creating a
 * texture object.
 */
typedef struct hipTextureDesc {
    enum hipTextureAddressMode addressMode[3];   // Texture address mode for up to 3 dimensions
    enum hipTextureFilterMode filterMode;        ///< Point or linear filtering
    enum hipTextureReadMode readMode;            ///< Element-type or normalized-float reads
    int sRGB;                                    // Perform sRGB->linear conversion during texture read
    float borderColor[4];                        ///< Color returned for border-mode out-of-range accesses
    int normalizedCoords;                        ///< Non-zero when coordinates are normalized
    unsigned int maxAnisotropy;                  ///< Limit on the anisotropy ratio
    enum hipTextureFilterMode mipmapFilterMode;  ///< Filtering applied between mipmap levels
    float mipmapLevelBias;                       ///< Bias added to the computed mipmap level
    float minMipmapLevelClamp;                   ///< Lower clamp for the mipmap level
    float maxMipmapLevelClamp;                   ///< Upper clamp for the mipmap level
}hipTextureDesc;
|
|
|
|
#if __cplusplus
|
|
|
|
/*******************************************************************************
|
|
* *
|
|
* *
|
|
* *
|
|
*******************************************************************************/
|
|
#if __HIP__
|
|
#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
|
|
#else
|
|
#define __HIP_TEXTURE_ATTRIB
|
|
#endif
|
|
|
|
typedef textureReference* hipTexRef;
|
|
|
|
template <class T, int texType = hipTextureType1D,
|
|
enum hipTextureReadMode mode = hipReadModeElementType>
|
|
struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
|
|
texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
|
|
enum hipTextureAddressMode aMode = hipAddressModeClamp) {
|
|
normalized = norm;
|
|
readMode = mode;
|
|
filterMode = fMode;
|
|
addressMode[0] = aMode;
|
|
addressMode[1] = aMode;
|
|
addressMode[2] = aMode;
|
|
channelDesc = hipCreateChannelDesc<T>();
|
|
sRGB = 0;
|
|
textureObject = nullptr;
|
|
maxAnisotropy = 0;
|
|
mipmapLevelBias = 0;
|
|
minMipmapLevelClamp = 0;
|
|
maxMipmapLevelClamp = 0;
|
|
}
|
|
|
|
texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
|
|
struct hipChannelFormatDesc desc) {
|
|
normalized = norm;
|
|
readMode = mode;
|
|
filterMode = fMode;
|
|
addressMode[0] = aMode;
|
|
addressMode[1] = aMode;
|
|
addressMode[2] = aMode;
|
|
channelDesc = desc;
|
|
sRGB = 0;
|
|
textureObject = nullptr;
|
|
maxAnisotropy = 0;
|
|
mipmapLevelBias = 0;
|
|
minMipmapLevelClamp = 0;
|
|
maxMipmapLevelClamp = 0;
|
|
}
|
|
};
|
|
|
|
#endif /* __cplusplus */
|
|
|
|
#else
|
|
#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
|
|
#endif
|
|
|
|
#if defined(__clang__)
|
|
#pragma clang diagnostic pop
|
|
#endif
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/hip_vector_types.h>
|
|
#endif
|
|
|
|
extern "C" {
|
|
|
|
#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4)))
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l);
|
|
|
|
__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l, float4::Native_vec_ p);
|
|
|
|
__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l, float4::Native_vec_ p);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float dx, float dy);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float dx, float dy);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float4::Native_vec_ dx, float4::Native_vec_ dy);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
|
|
|
__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
|
|
|
|
__device__ int __ockl_image_channel_data_type_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_data_type_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_data_type_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_data_type_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_data_type_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_data_type_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_data_type_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_data_type_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_data_type_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_data_type_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_order_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_order_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_order_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_order_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_order_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_order_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_order_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_order_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_order_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
__device__ int __ockl_image_channel_order_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
|
|
|
|
}
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#if defined(__cplusplus)
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/hip_vector_types.h>
|
|
#include <hip/hip_texture_types.h>
|
|
#include <hip/amd_detail/ockl_image.h>
|
|
#include <type_traits>
|
|
#endif // !defined(__HIPCC_RTC__)
|
|
|
|
#define TEXTURE_PARAMETERS_INIT \
|
|
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
|
|
unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
|
|
|
|
// Trait: true when T is one of the scalar channel types supported by HIP
// textures and surfaces (char, unsigned char, short, unsigned short, int,
// unsigned int, float). Used as the building block for the vector-channel
// and normalized-channel traits below.
template<typename T>
struct __hip_is_tex_surf_scalar_channel_type
{
    static constexpr bool value =
        std::is_same<T, char>::value ||
        std::is_same<T, unsigned char>::value ||
        std::is_same<T, short>::value ||
        std::is_same<T, unsigned short>::value ||
        std::is_same<T, int>::value ||
        std::is_same<T, unsigned int>::value ||
        std::is_same<T, float>::value;
};
|
|
|
|
template<typename T>
|
|
struct __hip_is_tex_surf_channel_type
|
|
{
|
|
static constexpr bool value =
|
|
__hip_is_tex_surf_scalar_channel_type<T>::value;
|
|
};
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int rank>
|
|
struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>
|
|
{
|
|
static constexpr bool value =
|
|
__hip_is_tex_surf_scalar_channel_type<T>::value &&
|
|
((rank == 1) ||
|
|
(rank == 2) ||
|
|
(rank == 4));
|
|
};
|
|
|
|
template<typename T>
|
|
struct __hip_is_tex_normalized_channel_type
|
|
{
|
|
static constexpr bool value =
|
|
std::is_same<T, char>::value ||
|
|
std::is_same<T, unsigned char>::value ||
|
|
std::is_same<T, short>::value ||
|
|
std::is_same<T, unsigned short>::value;
|
|
};
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int rank>
|
|
struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>
|
|
{
|
|
static constexpr bool value =
|
|
__hip_is_tex_normalized_channel_type<T>::value &&
|
|
((rank == 1) ||
|
|
(rank == 2) ||
|
|
(rank == 4));
|
|
};
|
|
|
|
template <
|
|
typename T,
|
|
hipTextureReadMode readMode,
|
|
typename Enable = void>
|
|
struct __hip_tex_ret
|
|
{
|
|
static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
|
|
};
|
|
|
|
/*
|
|
* Map from device function return U to scalar texture type T
|
|
*/
|
|
// Scalar overload: reinterpret the raw payload U returned by an
// __ockl_image_* builtin as the user's scalar channel type T.
// NOTE(review): the union punning assumes the sampled texel occupies the
// low-order lane of U — consistent with how callers pass the builtin
// results, but not provable from this block alone.
template<typename T, typename U>
__forceinline__ __device__
typename std::enable_if<
    __hip_is_tex_surf_scalar_channel_type<T>::value, const T>::type
__hipMapFrom(const U &u) {
    if constexpr (sizeof(T) < sizeof(float)) {
        // Sub-32-bit channels (char/short variants): read the first 32-bit
        // lane through the union, then narrow it to T with a value cast.
        union {
            U u;
            int i;
        } d = { u };
        return static_cast<T>(d.i);
    } else { // sizeof(T) == sizeof(float)
        // 32-bit channels (int/unsigned int/float): bitwise reinterpretation
        // through the union; no arithmetic conversion is performed.
        union {
            U u;
            T t;
        } d = { u };
        return d.t;
    }
}
|
|
|
|
/*
|
|
* Map from device function return U to vector texture type T
|
|
*/
|
|
// Vector overload: reinterpret the raw payload U returned by an
// __ockl_image_* builtin as the user-facing vector channel type T
// (a HIP_vector_type specialization; enabled via T::value_type).
template<typename T, typename U>
__forceinline__ __device__
typename std::enable_if<
    __hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const T>::type
__hipMapFrom(const U &u) {
    if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
        // Sub-32-bit lanes: view the payload as int4, then narrow each lane
        // to T::value_type, keeping only the sizeof(T)/sizeof(value_type)
        // lanes that T actually has.
        union {
            U u;
            int4 i4;
        } d = { u };
        return __hipMapVector<typename T::value_type, sizeof(T)/sizeof(typename T::value_type)>(d.i4);
    } else { // sizeof(typename T::value_type) == sizeof(float)
        // 32-bit lanes: T and U share layout width; pure bitwise
        // reinterpretation through the union.
        union {
            U u;
            T t;
        } d = { u };
        return d.t;
    }
}
|
|
|
|
/*
|
|
* Map from scalar texture type T to device function input U
|
|
*/
|
|
// Scalar overload: pack a scalar channel value T into the raw payload type U
// consumed by the __ockl_image_* builtins. Inverse of the scalar
// __hipMapFrom above.
template<typename U, typename T>
__forceinline__ __device__
typename std::enable_if<
    __hip_is_tex_surf_scalar_channel_type<T>::value, const U>::type
__hipMapTo(const T &t) {
    if constexpr (sizeof(T) < sizeof(float)) {
        // Sub-32-bit channels are widened to a 32-bit lane. The union is
        // zero-initialized through its first member (= { 0 }) before the
        // value is written, so no stale payload bytes leak through.
        union {
            U u;
            int i;
        } d = { 0 };
        d.i = static_cast<int>(t);
        return d.u;
    } else { // sizeof(T) == sizeof(float)
        // 32-bit channels: write T through the union and hand back the
        // reinterpreted payload unchanged.
        union {
            U u;
            T t;
        } d = { 0 };
        d.t = t;
        return d.u;
    }
}
|
|
|
|
/*
|
|
* Map from vector texture type T to device function input U
|
|
*/
|
|
template<typename U, typename T>
|
|
__forceinline__ __device__
|
|
typename std::enable_if<
|
|
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const U>::type
|
|
__hipMapTo(const T &t) {
|
|
if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
|
|
union {
|
|
U u;
|
|
int4 i4;
|
|
} d = { 0 };
|
|
d.i4 = __hipMapVector<int, 4>(t);
|
|
return d.u;
|
|
} else { // sizeof(typename T::value_type) == sizeof(float)
|
|
union {
|
|
U u;
|
|
T t;
|
|
} d = { 0 };
|
|
d.t = t;
|
|
return d.u;
|
|
}
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
hipTextureReadMode readMode>
|
|
using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode, bool>::type;
|
|
|
|
template <typename T>
|
|
struct __hip_tex_ret<
|
|
T,
|
|
hipReadModeElementType,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
|
|
{
|
|
using type = T;
|
|
};
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int rank>
|
|
struct __hip_tex_ret<
|
|
HIP_vector_type<T, rank>,
|
|
hipReadModeElementType,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
|
|
{
|
|
using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;
|
|
};
|
|
|
|
template<typename T>
|
|
struct __hip_tex_ret<
|
|
T,
|
|
hipReadModeNormalizedFloat,
|
|
typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
|
|
{
|
|
using type = float;
|
|
};
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int rank>
|
|
struct __hip_tex_ret<
|
|
HIP_vector_type<T, rank>,
|
|
hipReadModeNormalizedFloat,
|
|
typename std::enable_if<__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
|
|
{
|
|
using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;
|
|
};
|
|
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1Dfetch(texture<T, hipTextureType1D, readMode> t, int x)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_load_1Db(i, x);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1D(texture<T, hipTextureType1D, readMode> t, float x)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_1D(i, s, x);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2D(texture<T, hipTextureType2D, readMode> t, float x, float y)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayered(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayered(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3D(texture<T, hipTextureType3D, readMode> t, float x, float y, float z)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemap(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLod(texture<T, hipTextureType1D, readMode> t, float x, float level)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLod(texture<T, hipTextureType2D, readMode> t, float x, float y, float level)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DLod(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLod(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayered(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float level)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapGrad(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
// TODO missing in device libs.
|
|
// auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
|
|
// return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
return {};
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
// TODO missing in device libs.
|
|
// auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
|
|
// return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
return {};
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DGrad(texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DGrad(texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DGrad(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
|
|
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
hipTextureReadMode readMode,
|
|
typename Enable = void>
|
|
struct __hip_tex2dgather_ret
|
|
{
|
|
static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
|
|
};
|
|
|
|
template <
|
|
typename T,
|
|
hipTextureReadMode readMode>
|
|
using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode, bool>::type;
|
|
|
|
template <typename T>
|
|
struct __hip_tex2dgather_ret<
|
|
T,
|
|
hipReadModeElementType,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
|
|
{
|
|
using type = HIP_vector_type<T, 4>;
|
|
};
|
|
|
|
template<
|
|
typename T,
|
|
unsigned int rank>
|
|
struct __hip_tex2dgather_ret<
|
|
HIP_vector_type<T, rank>,
|
|
hipReadModeElementType,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
|
|
{
|
|
using type = HIP_vector_type<T, 4>;
|
|
};
|
|
|
|
template <typename T>
|
|
struct __hip_tex2dgather_ret<
|
|
T,
|
|
hipReadModeNormalizedFloat,
|
|
typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
|
|
{
|
|
using type = float4;
|
|
};
|
|
|
|
template <typename T, hipTextureReadMode readMode>
|
|
static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(texture<T, hipTextureType2D, readMode> t, float x, float y, int comp=0)
|
|
{
|
|
TEXTURE_PARAMETERS_INIT;
|
|
switch (comp) {
|
|
case 1: {
|
|
auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
|
|
return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
|
|
}
|
|
case 2: {
|
|
auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
|
|
return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
|
|
}
|
|
case 3: {
|
|
auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
|
|
return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
|
|
}
|
|
default: {
|
|
auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
|
|
return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
|
|
}
|
|
}
|
|
return {};
|
|
}
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#if defined(__cplusplus)
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/hip_vector_types.h>
|
|
#include <hip/hip_texture_types.h>
|
|
#include <hip/amd_detail/texture_fetch_functions.h>
|
|
#include <hip/amd_detail/ockl_image.h>
|
|
#include <type_traits>
|
|
#endif // !defined(__HIPCC_RTC__)
|
|
|
|
#define TEXTURE_OBJECT_PARAMETERS_INIT \
|
|
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
|
|
unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_load_1Db(i, x);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex1Dfetch(T *ptr, hipTextureObject_t textureObject, int x)
|
|
{
|
|
*ptr = tex1Dfetch<T>(textureObject, x);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_1D(i, s, x);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex1D(T *ptr, hipTextureObject_t textureObject, float x)
|
|
{
|
|
*ptr = tex1D<T>(textureObject, x);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex2D(T *ptr, hipTextureObject_t textureObject, float x, float y)
|
|
{
|
|
*ptr = tex2D<T>(textureObject, x, y);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex3D(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
|
|
{
|
|
*ptr = tex3D<T>(textureObject, x, y, z);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex1DLayered(T *ptr, hipTextureObject_t textureObject, float x, int layer)
|
|
{
|
|
*ptr = tex1DLayered<T>(textureObject, x, layer);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex2DLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer)
|
|
{
|
|
*ptr = tex1DLayered<T>(textureObject, x, y, layer);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y, float z)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void texCubemap(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
|
|
{
|
|
*ptr = texCubemap<T>(textureObject, x, y, z);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void texCubemapLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer)
|
|
{
|
|
*ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
switch (comp) {
|
|
case 1: {
|
|
auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
break;
|
|
}
|
|
case 2: {
|
|
auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
break;
|
|
}
|
|
case 3: {
|
|
auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
break;
|
|
}
|
|
default: {
|
|
auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
break;
|
|
}
|
|
}
|
|
return {};
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex2Dgather(T *ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0)
|
|
{
|
|
*ptr = texCubemapLayered<T>(textureObject, x, y, comp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x, float level)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex1DLod(T *ptr, hipTextureObject_t textureObject, float x, float level)
|
|
{
|
|
*ptr = tex1DLod<T>(textureObject, x, level);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex2DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float level)
|
|
{
|
|
*ptr = tex2DLod<T>(textureObject, x, y, level);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex3DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
|
|
{
|
|
*ptr = tex3DLod<T>(textureObject, x, y, z, level);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex1DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, int layer, float level)
|
|
{
|
|
*ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex2DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level)
|
|
{
|
|
*ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void texCubemapLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
|
|
{
|
|
*ptr = texCubemapLod<T>(textureObject, x, y, z, level);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
// TODO missing in device libs.
|
|
// auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
|
|
// return __hipMapFrom<T>(tmp);
|
|
return {};
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void texCubemapGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
|
|
{
|
|
*ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void texCubemapLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
|
|
{
|
|
*ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex1DGrad(T *ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
|
|
{
|
|
*ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex2DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
|
|
{
|
|
*ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex3DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
|
|
{
|
|
*ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex1DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
|
|
{
|
|
*ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
|
|
{
|
|
TEXTURE_OBJECT_PARAMETERS_INIT
|
|
auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
|
|
return __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void tex2DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
|
|
{
|
|
*ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
|
|
}
|
|
|
|
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
{
    // Layered-cubemap texture fetch with explicit gradients.
    // NOTE: not implemented — the __ockl_image_sample_grad_CMa builtin is
    // missing from the device libs (see TODO below), so the inputs are
    // ignored and a zero-initialized T is returned.
    TEXTURE_OBJECT_PARAMETERS_INIT
    // TODO missing in device libs.
    // auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
    // return __hipMapFrom<T>(tmp);
    return {};
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
|
|
{
|
|
*ptr = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy);
|
|
}
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2018 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
|
|
#define HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
|
|
|
|
#if defined(__cplusplus)
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/surface_types.h>
|
|
#include <hip/hip_vector_types.h>
|
|
#include <hip/amd_detail/texture_fetch_functions.h>
|
|
#include <hip/amd_detail/ockl_image.h>
|
|
#endif
|
|
|
|
#if defined(__HIPCC_RTC__)
|
|
#define __HOST_DEVICE__ __device__
|
|
#else
|
|
#define __HOST_DEVICE__ __host__ __device__
|
|
#endif
|
|
|
|
// Decodes a hipSurfaceObject_t handle into the constant-address-space image
// descriptor pointer `i` consumed by the __ockl_image_* builtins below.
#define __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
    unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)surfObj;
|
|
|
|
// CUDA is using byte address, need map to pixel address for HIP
|
|
// CUDA surface intrinsics address pixels by BYTE offset, while the ROCm image
// builtins address by element; convert the byte offset `x` into an element
// index using the image's channel data type and channel order.
static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format, int order) {
    // Per hsa_ext_image_channel_type_t value (0..15): log2 of the per-channel
    // byte size, except the sentinel 3 for the 3-byte UNORM_INT24 format,
    // which is handled by division instead of a shift.
    static const int FormatLUT[] = { 0, 1, 0, 1, 3, 1, 1, 1, 0, 1, 2, 0, 1, 2, 1, 2 };
    const int fmt = FormatLUT[format];
    x = (fmt == 3) ? x / fmt : x >> fmt;

    // Per hsa_ext_image_channel_order_t value (0..19): log2 of the channel
    // count, with the sentinel 3 for orders whose stride is divided (e.g.
    // RGB/SRGB-style three-element layouts) rather than shifted.
    static const int OrderLUT[] = { 0, 0, 1, 1, 3, 1, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 0, 0, 0, 0 };
    const int ord = OrderLUT[order];
    return (ord == 3) ? x / ord : x >> ord;
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x,
|
|
int boundaryMode = hipBoundaryModeZero) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
|
auto tmp = __ockl_image_load_1D(i, x);
|
|
*data = __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
|
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
|
__ockl_image_store_1D(i, x, tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
|
auto tmp = __ockl_image_load_2D(i, int2(x, y).data);
|
|
*data = __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
|
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
|
__ockl_image_store_2D(i, int2(x, y).data, tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y, int z) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
|
|
auto tmp = __ockl_image_load_3D(i, int4(x, y, z, 0).data);
|
|
*data = __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int z) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
|
|
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
|
__ockl_image_store_3D(i, int4(x, y, z, 0).data, tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int layer) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
|
auto tmp = __ockl_image_load_lod_1D(i, x, layer);
|
|
*data = __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int layer) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
|
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
|
__ockl_image_store_lod_1D(i, x, layer, tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
|
auto tmp = __ockl_image_load_lod_2D(i, int2(x, y).data, layer);
|
|
*data = __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
|
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
|
__ockl_image_store_lod_2D(i, int2(x, y).data, layer, tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
|
auto tmp = __ockl_image_load_CM(i, int2(x, y).data, face);
|
|
*data = __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int face) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
|
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
|
__ockl_image_store_CM(i, int2(x, y).data, face, tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
|
|
int layer) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
|
auto tmp = __ockl_image_load_lod_CM(i, int2(x, y).data, face, layer);
|
|
*data = __hipMapFrom<T>(tmp);
|
|
}
|
|
|
|
template <
|
|
typename T,
|
|
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
|
static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
|
|
int layer) {
|
|
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
|
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
|
|
auto tmp = __hipMapTo<float4::Native_vec_>(data);
|
|
__ockl_image_store_lod_CM(i, int2(x, y).data, face, layer, tmp);
|
|
}
|
|
|
|
#endif
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
|
|
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "hip/amd_detail/amd_hip_vector_types.h"
|
|
#endif
|
|
|
|
#if defined(__HIPCC_RTC__)
|
|
#define __HOST_DEVICE__ __device__
|
|
#else
|
|
#define __HOST_DEVICE__ __host__ __device__
|
|
// TODO: Clang has a bug which allows device functions to call std functions
|
|
// when std functions are introduced into default namespace by using statement.
|
|
// math.h may be included after this bug is fixed.
|
|
#if __cplusplus
|
|
#include <cmath>
|
|
#else
|
|
#include "math.h"
|
|
#endif
|
|
#endif // !defined(__HIPCC_RTC__)
|
|
|
|
#if __cplusplus
|
|
// Macro family that stamps out componentwise / complex-arithmetic operator
// overloads for a two-field (.x = real, .y = imag) complex struct type.

// Unary negation: flips the sign of both components.
#define COMPLEX_NEG_OP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline type operator-(const type& op) { type ret; ret.x = -op.x; ret.y = -op.y; return ret; }

// Equality: exact componentwise comparison (usual floating-point caveats apply).
#define COMPLEX_EQ_OP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline bool operator==(const type& lhs, const type& rhs) { return lhs.x == rhs.x && lhs.y == rhs.y; }

// Inequality: defined as the negation of operator==.
#define COMPLEX_NE_OP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline bool operator!=(const type& lhs, const type& rhs) { return !(lhs == rhs); }

// Componentwise addition.
#define COMPLEX_ADD_OP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline type operator+(const type& lhs, const type& rhs) { type ret; ret.x = lhs.x + rhs.x; ret.y = lhs.y + rhs.y; return ret; }

// Componentwise subtraction.
#define COMPLEX_SUB_OP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline type operator-(const type& lhs, const type& rhs) { type ret; ret.x = lhs.x - rhs.x; ret.y = lhs.y - rhs.y; return ret; }

// Complex multiplication: (a+bi)(c+di) = (ac-bd) + (ad+bc)i.
#define COMPLEX_MUL_OP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline type operator*(const type& lhs, const type& rhs) { type ret; ret.x = lhs.x * rhs.x - lhs.y * rhs.y; ret.y = lhs.x * rhs.y + lhs.y * rhs.x; return ret; }

// Complex division via the textbook formula (numerator products, then divide
// by |rhs|^2). NOTE(review): no scaling is applied, so |rhs|^2 can
// overflow/underflow for extreme magnitudes — confirm acceptable.
#define COMPLEX_DIV_OP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline type operator/(const type& lhs, const type& rhs) { type ret; ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); ret.x = ret.x / (rhs.x * rhs.x + rhs.y * rhs.y); ret.y = ret.y / (rhs.x * rhs.x + rhs.y * rhs.y); return ret; }

// Compound assignment forms of the four arithmetic operators.
#define COMPLEX_ADD_PREOP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline type& operator+=(type& lhs, const type& rhs) { lhs.x += rhs.x; lhs.y += rhs.y; return lhs; }

#define COMPLEX_SUB_PREOP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline type& operator-=(type& lhs, const type& rhs) { lhs.x -= rhs.x; lhs.y -= rhs.y; return lhs; }

// *= copies lhs first so the aliased read-modify-write is well defined.
#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { type temp{lhs}; lhs.x = rhs.x * temp.x - rhs.y * temp.y; lhs.y = rhs.y * temp.x + rhs.x * temp.y; return lhs; }

// /= computes into a temporary before overwriting lhs (lhs is read on both lines).
#define COMPLEX_DIV_PREOP_OVERLOAD(type) \
    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { type temp; temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); lhs = temp; return lhs; }

// Complex-times-scalar product for a given scalar type `type1`.
#define COMPLEX_SCALAR_PRODUCT(type, type1) \
    __HOST_DEVICE__ static inline type operator*(const type& lhs, type1 rhs) { type ret; ret.x = lhs.x * rhs; ret.y = lhs.y * rhs; return ret; }
|
|
|
|
#endif
|
|
|
|
typedef float2 hipFloatComplex;
|
|
|
|
__HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
|
|
|
|
__HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
|
|
|
|
__HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
|
|
hipFloatComplex z;
|
|
z.x = a;
|
|
z.y = b;
|
|
return z;
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
|
|
hipFloatComplex ret;
|
|
ret.x = z.x;
|
|
ret.y = -z.y;
|
|
return ret;
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
|
|
return z.x * z.x + z.y * z.y;
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
|
|
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
|
|
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
|
|
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
|
|
float sqabs = hipCsqabsf(q);
|
|
hipFloatComplex ret;
|
|
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
|
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
|
return ret;
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
|
|
|
|
|
|
typedef double2 hipDoubleComplex;
|
|
|
|
__HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
|
|
|
|
__HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
|
|
|
|
__HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
|
|
hipDoubleComplex z;
|
|
z.x = a;
|
|
z.y = b;
|
|
return z;
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
|
|
hipDoubleComplex ret;
|
|
ret.x = z.x;
|
|
ret.y = -z.y;
|
|
return ret;
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) {
|
|
return z.x * z.x + z.y * z.y;
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
|
|
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
|
|
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
|
|
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
|
|
double sqabs = hipCsqabs(q);
|
|
hipDoubleComplex ret;
|
|
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
|
|
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
|
|
return ret;
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }
|
|
|
|
|
|
#if __cplusplus
|
|
|
|
// Instantiate the full operator set for the single-precision complex type,
// plus scalar products for every common arithmetic scalar type.
COMPLEX_NEG_OP_OVERLOAD(hipFloatComplex)
COMPLEX_EQ_OP_OVERLOAD(hipFloatComplex)
COMPLEX_NE_OP_OVERLOAD(hipFloatComplex)
COMPLEX_ADD_OP_OVERLOAD(hipFloatComplex)
COMPLEX_SUB_OP_OVERLOAD(hipFloatComplex)
COMPLEX_MUL_OP_OVERLOAD(hipFloatComplex)
COMPLEX_DIV_OP_OVERLOAD(hipFloatComplex)
COMPLEX_ADD_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_SUB_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_MUL_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_DIV_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned short)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed short)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned int)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed int)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, float)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, double)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long long)

// Same operator set for the double-precision complex type.
COMPLEX_NEG_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_EQ_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_NE_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_ADD_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_SUB_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_MUL_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_DIV_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_ADD_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_SUB_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_MUL_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_DIV_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned short)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed short)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned int)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed int)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, float)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, double)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
|
|
|
|
#endif
|
|
|
|
|
|
typedef hipFloatComplex hipComplex;
|
|
|
|
__HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) {
|
|
return make_hipFloatComplex(x, y);
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
|
|
return make_hipFloatComplex((float)z.x, (float)z.y);
|
|
}
|
|
|
|
__HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
|
|
return make_hipDoubleComplex((double)z.x, (double)z.y);
|
|
}
|
|
|
|
// Complex multiply-add: returns p*q + r (single precision). The additions
// are split into two steps; note this uses plain * and +, not hardware fmaf.
__HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
    float re = (p.x * q.x) + r.x;
    float im = (q.x * p.y) + r.y;

    re = -(p.y * q.y) + re;
    im = (p.x * q.y) + im;

    return make_hipComplex(re, im);
}
|
|
|
|
// Complex multiply-add: returns p*q + r (double precision). The additions
// are split into two steps; note this uses plain * and +, not hardware fma.
__HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
                                                       hipDoubleComplex r) {
    double re = (p.x * q.x) + r.x;
    double im = (q.x * p.y) + r.y;

    re = -(p.y * q.y) + re;
    im = (p.x * q.y) + im;

    return make_hipDoubleComplex(re, im);
}
|
|
|
|
#endif //HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
#ifndef AMD_HIP_MATH_CONSTANTS_H
|
|
#define AMD_HIP_MATH_CONSTANTS_H
|
|
|
|
// single precision constants
|
|
#define HIP_INF_F __int_as_float(0x7f800000U)
|
|
#define HIP_NAN_F __int_as_float(0x7fffffffU)
|
|
#define HIP_MIN_DENORM_F __int_as_float(0x00000001U)
|
|
#define HIP_MAX_NORMAL_F __int_as_float(0x7f7fffffU)
|
|
#define HIP_NEG_ZERO_F __int_as_float(0x80000000U)
|
|
#define HIP_ZERO_F 0.0F
|
|
#define HIP_ONE_F 1.0F
|
|
#define HIP_SQRT_HALF_F 0.707106781F
|
|
#define HIP_SQRT_HALF_HI_F 0.707106781F
|
|
#define HIP_SQRT_HALF_LO_F 1.210161749e-08F
|
|
#define HIP_SQRT_TWO_F 1.414213562F
|
|
#define HIP_THIRD_F 0.333333333F
|
|
#define HIP_PIO4_F 0.785398163F
|
|
#define HIP_PIO2_F 1.570796327F
|
|
#define HIP_3PIO4_F 2.356194490F
|
|
#define HIP_2_OVER_PI_F 0.636619772F
|
|
#define HIP_SQRT_2_OVER_PI_F 0.797884561F
|
|
#define HIP_PI_F 3.141592654F
|
|
#define HIP_L2E_F 1.442695041F
|
|
#define HIP_L2T_F 3.321928094F
|
|
#define HIP_LG2_F 0.301029996F
|
|
#define HIP_LGE_F 0.434294482F
|
|
#define HIP_LN2_F 0.693147181F
|
|
#define HIP_LNT_F 2.302585093F
|
|
#define HIP_LNPI_F 1.144729886F
|
|
#define HIP_TWO_TO_M126_F 1.175494351e-38F
|
|
#define HIP_TWO_TO_126_F 8.507059173e37F
|
|
#define HIP_NORM_HUGE_F 3.402823466e38F
|
|
#define HIP_TWO_TO_23_F 8388608.0F
|
|
#define HIP_TWO_TO_24_F 16777216.0F
|
|
#define HIP_TWO_TO_31_F 2147483648.0F
|
|
#define HIP_TWO_TO_32_F 4294967296.0F
|
|
#define HIP_REMQUO_BITS_F 3U
|
|
#define HIP_REMQUO_MASK_F (~((~0U)<<HIP_REMQUO_BITS_F))
|
|
#define HIP_TRIG_PLOSS_F 105615.0F
|
|
|
|
// double precision constants
|
|
#define HIP_INF __longlong_as_double(0x7ff0000000000000ULL)
|
|
#define HIP_NAN __longlong_as_double(0xfff8000000000000ULL)
|
|
#define HIP_NEG_ZERO __longlong_as_double(0x8000000000000000ULL)
|
|
#define HIP_MIN_DENORM __longlong_as_double(0x0000000000000001ULL)
|
|
#define HIP_ZERO 0.0
|
|
#define HIP_ONE 1.0
|
|
#define HIP_SQRT_TWO 1.4142135623730951e+0
|
|
#define HIP_SQRT_HALF 7.0710678118654757e-1
|
|
#define HIP_SQRT_HALF_HI 7.0710678118654757e-1
|
|
#define HIP_SQRT_HALF_LO (-4.8336466567264567e-17)
|
|
#define HIP_THIRD 3.3333333333333333e-1
|
|
#define HIP_TWOTHIRD 6.6666666666666667e-1
|
|
#define HIP_PIO4 7.8539816339744828e-1
|
|
#define HIP_PIO4_HI 7.8539816339744828e-1
|
|
#define HIP_PIO4_LO 3.0616169978683830e-17
|
|
#define HIP_PIO2 1.5707963267948966e+0
|
|
#define HIP_PIO2_HI 1.5707963267948966e+0
|
|
#define HIP_PIO2_LO 6.1232339957367660e-17
|
|
#define HIP_3PIO4 2.3561944901923448e+0
|
|
#define HIP_2_OVER_PI 6.3661977236758138e-1
|
|
#define HIP_PI 3.1415926535897931e+0
|
|
#define HIP_PI_HI 3.1415926535897931e+0
|
|
#define HIP_PI_LO 1.2246467991473532e-16
|
|
#define HIP_SQRT_2PI 2.5066282746310007e+0
|
|
#define HIP_SQRT_2PI_HI 2.5066282746310007e+0
|
|
#define HIP_SQRT_2PI_LO (-1.8328579980459167e-16)
|
|
#define HIP_SQRT_PIO2 1.2533141373155003e+0
|
|
#define HIP_SQRT_PIO2_HI 1.2533141373155003e+0
|
|
#define HIP_SQRT_PIO2_LO (-9.1642899902295834e-17)
|
|
#define HIP_SQRT_2OPI 7.9788456080286536e-1
|
|
#define HIP_L2E 1.4426950408889634e+0
|
|
#define HIP_L2E_HI 1.4426950408889634e+0
|
|
#define HIP_L2E_LO 2.0355273740931033e-17
|
|
#define HIP_L2T 3.3219280948873622e+0
|
|
#define HIP_LG2 3.0102999566398120e-1
|
|
#define HIP_LG2_HI 3.0102999566398120e-1
|
|
#define HIP_LG2_LO (-2.8037281277851704e-18)
|
|
#define HIP_LGE 4.3429448190325182e-1
|
|
#define HIP_LGE_HI 4.3429448190325182e-1
|
|
#define HIP_LGE_LO 1.09831965021676510e-17
|
|
#define HIP_LN2 6.9314718055994529e-1
|
|
#define HIP_LN2_HI 6.9314718055994529e-1
|
|
#define HIP_LN2_LO 2.3190468138462996e-17
|
|
#define HIP_LNT 2.3025850929940459e+0
|
|
#define HIP_LNT_HI 2.3025850929940459e+0
|
|
#define HIP_LNT_LO (-2.1707562233822494e-16)
|
|
#define HIP_LNPI 1.1447298858494002e+0
|
|
#define HIP_LN2_X_1024 7.0978271289338397e+2
|
|
#define HIP_LN2_X_1025 7.1047586007394398e+2
|
|
#define HIP_LN2_X_1075 7.4513321910194122e+2
|
|
#define HIP_LG2_X_1024 3.0825471555991675e+2
|
|
#define HIP_LG2_X_1075 3.2360724533877976e+2
|
|
#define HIP_TWO_TO_23 8388608.0
|
|
#define HIP_TWO_TO_52 4503599627370496.0
|
|
#define HIP_TWO_TO_53 9007199254740992.0
|
|
#define HIP_TWO_TO_54 18014398509481984.0
|
|
#define HIP_TWO_TO_M54 5.5511151231257827e-17
|
|
#define HIP_TWO_TO_M1022 2.22507385850720140e-308
|
|
#define HIP_TRIG_PLOSS 2147483648.0
|
|
#define HIP_DBL2INT_CVT 6755399441055744.0
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "host_defines.h"
|
|
#include "amd_hip_vector_types.h" // For Native_vec_
|
|
#endif
|
|
|
|
#if defined(__cplusplus)
|
|
extern "C" {
|
|
#endif
|
|
|
|
// DOT FUNCTIONS
|
|
#if defined(__clang__) && defined(__HIP__)
|
|
__device__
|
|
__attribute__((const))
|
|
int __ockl_sdot2(
|
|
HIP_vector_base<short, 2>::Native_vec_,
|
|
HIP_vector_base<short, 2>::Native_vec_,
|
|
int, bool);
|
|
|
|
__device__
|
|
__attribute__((const))
|
|
unsigned int __ockl_udot2(
|
|
HIP_vector_base<unsigned short, 2>::Native_vec_,
|
|
HIP_vector_base<unsigned short, 2>::Native_vec_,
|
|
unsigned int, bool);
|
|
|
|
__device__
|
|
__attribute__((const))
|
|
int __ockl_sdot4(
|
|
HIP_vector_base<char, 4>::Native_vec_,
|
|
HIP_vector_base<char, 4>::Native_vec_,
|
|
int, bool);
|
|
|
|
__device__
|
|
__attribute__((const))
|
|
unsigned int __ockl_udot4(
|
|
HIP_vector_base<unsigned char, 4>::Native_vec_,
|
|
HIP_vector_base<unsigned char, 4>::Native_vec_,
|
|
unsigned int, bool);
|
|
|
|
__device__
|
|
__attribute__((const))
|
|
int __ockl_sdot8(int, int, int, bool);
|
|
|
|
__device__
|
|
__attribute__((const))
|
|
unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
|
|
#endif
|
|
|
|
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
|
// BEGIN FLOAT
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_acos_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_acosh_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_asin_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_asinh_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_atan2_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_atan_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_atanh_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_cbrt_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_ceil_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
__device__
|
|
float __ocml_copysign_f32(float, float);
|
|
__device__
|
|
float __ocml_cos_f32(float);
|
|
__device__
|
|
float __ocml_native_cos_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
__device__
|
|
float __ocml_cosh_f32(float);
|
|
__device__
|
|
float __ocml_cospi_f32(float);
|
|
__device__
|
|
float __ocml_i0_f32(float);
|
|
__device__
|
|
float __ocml_i1_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_erfc_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_erfcinv_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_erfcx_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_erf_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_erfinv_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_exp10_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_native_exp10_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_exp2_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_exp_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_native_exp_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_expm1_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fabs_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fdim_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_floor_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fma_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fmax_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fmin_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
__device__
|
|
float __ocml_fmod_f32(float, float);
|
|
__device__
|
|
float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_hypot_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_ilogb_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isfinite_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isinf_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isnan_f32(float);
|
|
__device__
|
|
float __ocml_j0_f32(float);
|
|
__device__
|
|
float __ocml_j1_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_ldexp_f32(float, int);
|
|
__device__
|
|
float __ocml_lgamma_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_log10_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_native_log10_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_log1p_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_log2_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_native_log2_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_logb_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_log_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_native_log_f32(float);
|
|
__device__
|
|
float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_nearbyint_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_nextafter_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_len3_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_len4_f32(float, float, float, float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_ncdf_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_ncdfinv_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_pow_f32(float, float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_pown_f32(float, int);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_rcbrt_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_remainder_f32(float, float);
|
|
__device__
|
|
float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_rhypot_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_rint_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_rlen3_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_rlen4_f32(float, float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_round_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_rsqrt_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_scalb_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_scalbn_f32(float, int);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_signbit_f32(float);
|
|
__device__
|
|
float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
|
|
__device__
|
|
float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
|
|
__device__
|
|
float __ocml_sin_f32(float);
|
|
__device__
|
|
float __ocml_native_sin_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_sinh_f32(float);
|
|
__device__
|
|
float __ocml_sinpi_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sqrt_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_native_sqrt_f32(float);
|
|
__device__
|
|
float __ocml_tan_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_tanh_f32(float);
|
|
__device__
|
|
float __ocml_tgamma_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_trunc_f32(float);
|
|
__device__
|
|
float __ocml_y0_f32(float);
|
|
__device__
|
|
float __ocml_y1_f32(float);
|
|
|
|
// BEGIN INTRINSICS
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_add_rte_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_add_rtn_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_add_rtp_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_add_rtz_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sub_rte_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sub_rtn_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sub_rtp_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sub_rtz_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_mul_rte_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_mul_rtn_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_mul_rtp_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_mul_rtz_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_div_rte_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_div_rtn_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_div_rtp_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_div_rtz_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sqrt_rte_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sqrt_rtn_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sqrt_rtp_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sqrt_rtz_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fma_rte_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fma_rtn_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fma_rtp_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fma_rtz_f32(float, float, float);
|
|
// END INTRINSICS
|
|
// END FLOAT
|
|
|
|
// BEGIN DOUBLE
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_acos_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_acosh_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_asin_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_asinh_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_atan2_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_atan_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_atanh_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_cbrt_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_ceil_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_copysign_f64(double, double);
|
|
__device__
|
|
double __ocml_cos_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_cosh_f64(double);
|
|
__device__
|
|
double __ocml_cospi_f64(double);
|
|
__device__
|
|
double __ocml_i0_f64(double);
|
|
__device__
|
|
double __ocml_i1_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_erfc_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_erfcinv_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_erfcx_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_erf_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_erfinv_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_exp10_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_exp2_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_exp_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_expm1_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fabs_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fdim_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_floor_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fma_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fmax_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fmin_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fmod_f64(double, double);
|
|
__device__
|
|
double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_hypot_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_ilogb_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isfinite_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isinf_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isnan_f64(double);
|
|
__device__
|
|
double __ocml_j0_f64(double);
|
|
__device__
|
|
double __ocml_j1_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_ldexp_f64(double, int);
|
|
__device__
|
|
double __ocml_lgamma_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_log10_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_log1p_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_log2_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_logb_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_log_f64(double);
|
|
__device__
|
|
double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_nearbyint_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_nextafter_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_len3_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_len4_f64(double, double, double, double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_ncdf_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_ncdfinv_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_pow_f64(double, double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_pown_f64(double, int);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_rcbrt_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_remainder_f64(double, double);
|
|
__device__
|
|
double __ocml_remquo_f64(
|
|
double, double, __attribute__((address_space(5))) int*);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_rhypot_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_rint_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_rlen3_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_rlen4_f64(double, double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_round_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_rsqrt_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_scalb_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_scalbn_f64(double, int);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_signbit_f64(double);
|
|
__device__
|
|
double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
|
|
__device__
|
|
double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
|
|
__device__
|
|
double __ocml_sin_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_sinh_f64(double);
|
|
__device__
|
|
double __ocml_sinpi_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sqrt_f64(double);
|
|
__device__
|
|
double __ocml_tan_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_tanh_f64(double);
|
|
__device__
|
|
double __ocml_tgamma_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_trunc_f64(double);
|
|
__device__
|
|
double __ocml_y0_f64(double);
|
|
__device__
|
|
double __ocml_y1_f64(double);
|
|
|
|
// BEGIN INTRINSICS
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_add_rte_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_add_rtn_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_add_rtp_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_add_rtz_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sub_rte_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sub_rtn_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sub_rtp_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sub_rtz_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_mul_rte_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_mul_rtn_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_mul_rtp_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_mul_rtz_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_div_rte_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_div_rtn_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_div_rtp_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_div_rtz_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sqrt_rte_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sqrt_rtn_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sqrt_rtp_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sqrt_rtz_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fma_rte_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fma_rtn_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fma_rtp_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fma_rtz_f64(double, double, double);
|
|
// END INTRINSICS
|
|
// END DOUBLE
|
|
|
|
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
|
|
|
#if defined(__cplusplus)
|
|
} // extern "C"
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
/**
|
|
* @file amd_detail/device_library_decls.h
|
|
* @brief Contains declarations for types and functions in device library.
|
|
* Uses int64_t and uint64_t instead of long, long long, unsigned
|
|
* long and unsigned long long types for device library API
|
|
* declarations.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
|
|
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "hip/amd_detail/host_defines.h"
|
|
#endif
|
|
|
|
typedef unsigned char uchar;
|
|
typedef unsigned short ushort;
|
|
typedef unsigned int uint;
|
|
typedef unsigned long ulong;
|
|
typedef unsigned long long ullong;
|
|
|
|
extern "C" __device__ __attribute__((const)) bool __ockl_wfany_i32(int);
|
|
extern "C" __device__ __attribute__((const)) bool __ockl_wfall_i32(int);
|
|
extern "C" __device__ uint __ockl_activelane_u32(void);
|
|
|
|
extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint);
|
|
extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int);
|
|
extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint);
|
|
extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int);
|
|
extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint);
|
|
|
|
extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
|
|
extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
|
|
extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint);
|
|
extern "C" __device__ __attribute__((const)) uint64_t __ockl_clz_u64(uint64_t);
|
|
|
|
extern "C" __device__ __attribute__((const)) float __ocml_floor_f32(float);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_rint_f32(float);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_ceil_f32(float);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_trunc_f32(float);
|
|
|
|
extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
|
|
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_f64(double);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_f64(double);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_f64(double);
|
|
|
|
extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
|
|
extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
|
|
extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
|
|
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s32(int);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s32(int);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s32(int);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(uint32_t);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(uint32_t);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(uint32_t);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(int64_t);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(int64_t);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(int64_t);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(uint64_t);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(uint64_t);
|
|
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(uint64_t);
|
|
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(int64_t);
|
|
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(int64_t);
|
|
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(int64_t);
|
|
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(uint64_t);
|
|
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(uint64_t);
|
|
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(uint64_t);
|
|
|
|
extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
|
|
extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);
|
|
|
|
extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32();
|
|
extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
|
|
extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
|
|
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
|
|
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void);
|
|
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
|
|
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
|
|
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
|
|
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
|
|
|
|
extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
|
|
|
|
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a);
|
|
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
|
|
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
|
|
|
|
extern "C" __device__ uint64_t __ockl_fprintf_stderr_begin();
|
|
extern "C" __device__ uint64_t __ockl_fprintf_append_args(uint64_t msg_desc, uint32_t num_args,
|
|
uint64_t value0, uint64_t value1,
|
|
uint64_t value2, uint64_t value3,
|
|
uint64_t value4, uint64_t value5,
|
|
uint64_t value6, uint32_t is_last);
|
|
extern "C" __device__ uint64_t __ockl_fprintf_append_string_n(uint64_t msg_desc, const char* data,
|
|
uint64_t length, uint32_t is_last);
|
|
|
|
// Introduce local address space
|
|
#define __local __attribute__((address_space(3)))
|
|
|
|
#ifdef __HIP_DEVICE_COMPILE__
|
|
// Reinterpret a 32-bit value as a pointer in the local (LDS, address_space(3)) segment.
// NOTE(review): assumes `x` is a valid offset/address within local memory — confirm with callers.
__device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; }
|
|
#endif //__HIP_DEVICE_COMPILE__
|
|
|
|
// Using hip.amdgcn.bc - sync threads
|
|
#define __CLK_LOCAL_MEM_FENCE 0x01
|
|
typedef unsigned __cl_mem_fence_flags;
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
|
|
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/amd_detail/amd_hip_common.h>
|
|
#include "host_defines.h"
|
|
#include "math_fwd.h"
|
|
#include <hip/hip_runtime_api.h>
|
|
#include <stddef.h>
|
|
#include <hip/hip_vector_types.h>
|
|
#include <hip/amd_detail/device_library_decls.h>
|
|
#endif // !defined(__HIPCC_RTC__)
|
|
|
|
#if defined(__clang__) && defined(__HIP__)
|
|
extern "C" __device__ int printf(const char *fmt, ...);
|
|
#else
|
|
// Fallback for non-HIP-clang compilation paths: a no-op variadic printf stub so
// device code that calls printf still compiles. Arguments are accepted and ignored.
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
|
|
#endif // __HIP_CLANG_ONLY__
|
|
|
|
extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
|
|
|
|
/*
|
|
Integer Intrinsics
|
|
*/
|
|
|
|
// Integer intrinsic functions: __popc, __clz, __ffs, __brev
|
|
// Population count: number of bits set in a 32-bit value.
__device__ static inline unsigned int __popc(unsigned int input) {
    const unsigned int set_bits = __builtin_popcount(input);
    return set_bits;
}
|
|
// Population count: number of bits set in a 64-bit value.
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    const unsigned int set_bits = __builtin_popcountll(input);
    return set_bits;
}
|
|
|
|
// Count leading zero bits of a 32-bit value (delegates to the OCKL device library).
__device__ static inline int __clz(int input) {
    return (int)__ockl_clz_u32((uint)input);
}
|
|
|
|
// Count leading zero bits of a 64-bit value (delegates to the OCKL device library).
__device__ static inline int __clzll(long long int input) {
    const uint64_t bits = (uint64_t)input;
    return (int)__ockl_clz_u64(bits);
}
|
|
|
|
// Find-first-set: 1-based position of the least-significant set bit; 0 if input is 0.
__device__ static inline unsigned int __ffs(unsigned int input) {
    if (input == 0u) {
        return 0u;  // (-1) + 1 in the original ternary form
    }
    return (unsigned int)(__builtin_ctz(input) + 1);
}
|
|
|
|
// Find-first-set (64-bit): 1-based position of the least-significant set bit; 0 if input is 0.
__device__ static inline unsigned int __ffsll(unsigned long long int input) {
    if (input == 0ull) {
        return 0u;  // (-1) + 1 in the original ternary form
    }
    return (unsigned int)(__builtin_ctzll(input) + 1);
}
|
|
|
|
// Find-first-set (signed overload): 1-based position of the least-significant
// set bit; 0 if input is 0.
__device__ static inline unsigned int __ffs(int input) {
    if (input == 0) {
        return 0u;  // (-1) + 1 in the original ternary form
    }
    return (unsigned int)(__builtin_ctz(input) + 1);
}
|
|
|
|
// Find-first-set (signed 64-bit overload): 1-based position of the
// least-significant set bit; 0 if input is 0.
__device__ static inline unsigned int __ffsll(long long int input) {
    if (input == 0) {
        return 0u;  // (-1) + 1 in the original ternary form
    }
    return (unsigned int)(__builtin_ctzll(input) + 1);
}
|
|
|
|
// Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE),
|
|
// find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position.
|
|
// If not found, return -1.
|
|
__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
|
|
uint64_t temp_mask = mask;
|
|
int32_t temp_offset = offset;
|
|
|
|
if (offset == 0) {
|
|
temp_mask &= (1 << base);
|
|
temp_offset = 1;
|
|
}
|
|
else if (offset < 0) {
|
|
temp_mask = __builtin_bitreverse64(mask);
|
|
base = 63 - base;
|
|
temp_offset = -offset;
|
|
}
|
|
|
|
temp_mask = temp_mask & ((~0ULL) << base);
|
|
if (__builtin_popcountll(temp_mask) < temp_offset)
|
|
return -1;
|
|
int32_t total = 0;
|
|
for (int i = 0x20; i > 0; i >>= 1) {
|
|
uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
|
|
int32_t pcnt = __builtin_popcountll(temp_mask_lo);
|
|
if (pcnt < temp_offset) {
|
|
temp_mask = temp_mask >> i;
|
|
temp_offset -= pcnt;
|
|
total += i;
|
|
}
|
|
else {
|
|
temp_mask = temp_mask_lo;
|
|
}
|
|
}
|
|
if (offset < 0)
|
|
return 63 - total;
|
|
else
|
|
return total;
|
|
}
|
|
|
|
// 32-bit variant of __fns64 (signature kept: the mask parameter is still a
// 64-bit value, matching the original interface). Finds the n-th set bit
// from `base`; returns -1 when not found.
__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
  uint64_t temp_mask = mask;
  int32_t temp_offset = offset;
  if (offset == 0) {
    // 1ULL keeps the shift defined for base >= 31 (plain `1 << base` was
    // signed-overflow UB at base == 31).
    temp_mask &= (1ULL << base);
    temp_offset = 1;
  }
  else if (offset < 0) {
    // Downward search: reverse the mask and mirror the base position.
    temp_mask = __builtin_bitreverse64(mask);
    base = 63 - base;
    temp_offset = -offset;
  }
  // Discard all bits below the (possibly mirrored) base.
  temp_mask = temp_mask & ((~0ULL) << base);
  if (__builtin_popcountll(temp_mask) < temp_offset)
    return -1;
  // Binary search for the temp_offset-th set bit.
  int32_t total = 0;
  for (int i = 0x20; i > 0; i >>= 1) {
    uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
    int32_t pcnt = __builtin_popcountll(temp_mask_lo);
    if (pcnt < temp_offset) {
      temp_mask = temp_mask >> i;
      temp_offset -= pcnt;
      total += i;
    }
    else {
      temp_mask = temp_mask_lo;
    }
  }
  if (offset < 0)
    return 63 - total;
  else
    return total;
}
|
|
// Reverse the bit order of a 32-bit value.
__device__ static inline unsigned int __brev(unsigned int input) {
  const unsigned int reversed = __builtin_bitreverse32(input);
  return reversed;
}
|
|
|
|
// Reverse the bit order of a 64-bit value.
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
  const unsigned long long int reversed = __builtin_bitreverse64(input);
  return reversed;
}
|
|
|
|
// Index of the least-significant set bit, or -1 (as unsigned) when input is 0.
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
  if (input == 0)
    return (unsigned int)-1;
  return __builtin_ctzl(input);
}
|
|
|
|
// Extract `width` (src2 mod 32) bits of src0 starting at bit `offset`
// (src1 mod 32): shift the field to the top, then arithmetic-free shift
// back down so only the field remains.
// NOTE(review): when offset + width > 32 the first shift amount goes
// negative (wraps as unsigned) — callers appear expected to keep
// offset + width <= 32; confirm against HSAIL bitextract semantics.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
  uint32_t offset = src1 & 31;
  uint32_t width = src2 & 31;
  // width == 0 must short-circuit: (src0 << 32) would be UB.
  return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}
|
|
|
|
// 64-bit variant of __bitextract_u32: extract `width` (src2 mod 64) bits of
// src0 starting at bit `offset` (src1 mod 64).
// NOTE(review): as with the 32-bit version, offset + width > 64 makes the
// left-shift amount wrap — confirm callers keep offset + width <= 64.
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
  uint64_t offset = src1 & 63;
  uint64_t width = src2 & 63;
  // width == 0 must short-circuit: (src0 << 64) would be UB.
  return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
|
|
|
|
// Insert the low `width` (src3 mod 32) bits of src1 into src0 at bit
// position `offset` (src2 mod 32); all other bits of src0 are preserved.
__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
  const uint32_t offset = src2 & 31;
  const uint32_t width = src3 & 31;
  const uint32_t field_mask = (1 << width) - 1;
  const uint32_t cleared = src0 & ~(field_mask << offset);
  return cleared | ((src1 & field_mask) << offset);
}
|
|
|
|
// Insert the low `width` (src3 mod 64) bits of src1 into src0 at bit
// position `offset` (src2 mod 64); all other bits of src0 are preserved.
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
  const uint64_t offset = src2 & 63;
  const uint64_t width = src3 & 63;
  const uint64_t field_mask = (1ULL << width) - 1;
  const uint64_t cleared = src0 & ~(field_mask << offset);
  return cleared | ((src1 & field_mask) << offset);
}
|
|
|
|
// Funnel shifts: view (hi:lo) as one 64-bit value, shift it, and return a
// 32-bit window. All variants delegate to the amdgcn alignbit builtin, which
// extracts 32 bits of (hi:lo) starting at the given bit offset from the low
// end. "c" variants clamp the shift instead of masking it modulo 32.
__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
{
  // Left funnel: high 32 bits of (hi:lo) << (shift % 32). shift == 0 must
  // bypass alignbit (an offset of 32 would be out of range) and yield hi.
  uint32_t mask_shift = shift & 31;
  return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}

__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
{
  // Left funnel with clamp: shifts >= 32 saturate to 32, which (like 0)
  // returns hi unchanged.
  uint32_t min_shift = shift >= 32 ? 32 : shift;
  return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}

__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
{
  // Right funnel: low 32 bits of (hi:lo) >> shift. alignbit itself applies
  // the offset, so no pre-masking is needed here.
  return __builtin_amdgcn_alignbit(hi, lo, shift);
}

__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
{
  // Right funnel with clamp: shifts >= 32 return hi (the whole low word
  // shifted out).
  return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}
|
|
|
|
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
|
|
__device__ static unsigned int __hadd(int x, int y);
|
|
__device__ static int __mul24(int x, int y);
|
|
__device__ static long long int __mul64hi(long long int x, long long int y);
|
|
__device__ static int __mulhi(int x, int y);
|
|
__device__ static int __rhadd(int x, int y);
|
|
__device__ static unsigned int __sad(int x, int y,unsigned int z);
|
|
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
|
|
__device__ static int __umul24(unsigned int x, unsigned int y);
|
|
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
|
|
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
|
|
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
|
|
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
|
|
|
|
// 4-byte pun: view one 32-bit word either as four bytes or as an unsigned
// int. Used by __byte_perm to index the selector bytes.
struct ucharHolder {
  union {
    unsigned char c[4];
    unsigned int ui;
  };
} __attribute__((aligned(4)));

// 8-byte pun: view two 32-bit words as eight consecutive bytes. Used by
// __byte_perm to index into the concatenated (x, y) value.
struct uchar2Holder {
  union {
    unsigned int ui[2];
    unsigned char c[8];
  };
} __attribute__((aligned(8)));
|
|
|
|
// Byte permute (CUDA __byte_perm equivalent): build a 32-bit result whose
// four bytes are selected from the eight bytes of (x, y) by the four
// selector nibbles in s. Each result byte k is chosen by bits of selector
// byte k/2: the low nibble (& 0x07) picks one of the 8 source bytes for the
// even result byte, the high nibble (& 0x70) for the odd one.
// NOTE(review): byte order within the unions assumes little-endian layout
// (amdgcn targets are little-endian) — c[0] is the low byte of x.
__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
  struct uchar2Holder cHoldVal;
  struct ucharHolder cHoldKey;
  cHoldKey.ui = s;
  cHoldVal.ui[0] = x;   // source bytes 0..3
  cHoldVal.ui[1] = y;   // source bytes 4..7
  unsigned int result;
  result = cHoldVal.c[cHoldKey.c[0] & 0x07];
  result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
  result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
  result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
  return result;
}
|
|
|
|
// Signed halving add (CUDA __hadd): (x + y) >> 1 computed without the
// intermediate sum overflowing, rounding toward negative infinity. Uses the
// carry-free identity (x + y) >> 1 == (x & y) + ((x ^ y) >> 1): the AND term
// contributes the shared bits (each counted once after halving), the XOR
// term the differing bits halved via arithmetic shift.
// Fixes the previous implementation, which (a) overflowed on x + y and
// (b) combined sign and value with logical `||`, so it could only ever
// return 0 or 1.
__device__ static inline unsigned int __hadd(int x, int y) {
  return (unsigned int)((x & y) + ((x ^ y) >> 1));
}
|
|
|
|
// Signed 24-bit multiply, delegated to the OCKL device library.
__device__ static inline int __mul24(int x, int y) {
  const int product = __ockl_mul24_i32(x, y);
  return product;
}
|
|
|
|
// High 64 bits of the signed 64x64 -> 128-bit product, built with four
// 32x32 -> 64-bit partial products (schoolbook multiplication). The high
// halves (x1, y1) stay signed so the sign of the full product is carried
// into the result.
__device__ static inline long long __mul64hi(long long int x, long long int y) {
  ulong x0 = (ulong)x & 0xffffffffUL;  // low 32 bits of x (unsigned)
  long x1 = x >> 32;                   // high 32 bits of x (sign-extended)
  ulong y0 = (ulong)y & 0xffffffffUL;  // low 32 bits of y (unsigned)
  long y1 = y >> 32;                   // high 32 bits of y (sign-extended)
  ulong z0 = x0*y0;                    // low x low partial product
  long t = x1*y0 + (z0 >> 32);         // mid partial plus carry from z0
  long z1 = t & 0xffffffffL;
  long z2 = t >> 32;
  z1 = x0*y1 + z1;                     // second mid partial
  // High word: high x high plus both carries.
  return x1*y1 + z2 + (z1 >> 32);
}
|
|
|
|
// High 32 bits of the signed 32x32 -> 64-bit product, via the OCKL library.
__device__ static inline int __mulhi(int x, int y) {
  const int high_word = __ockl_mul_hi_i32(x, y);
  return high_word;
}
|
|
|
|
// Signed rounded halving add (CUDA __rhadd): (x + y + 1) >> 1 computed
// without overflowing the intermediate sum. Uses the carry-free identity
// (x + y + 1) >> 1 == (x | y) - ((x ^ y) >> 1).
// Fixes the previous implementation, which overflowed on x + y + 1 and
// combined sign/value with logical `||` (returning only 0 or 1).
__device__ static inline int __rhadd(int x, int y) {
  return (x | y) - ((x ^ y) >> 1);
}
|
|
// Sum of absolute difference: |x - y| + z.
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
  if (x > y)
    return x - y + z;
  return y - x + z;
}
|
|
// Unsigned halving add (CUDA __uhadd): (x + y) >> 1 with the carry out of
// bit 31 preserved. The previous `(x + y) >> 1` silently dropped the carry
// when x + y wrapped past 2^32; the identity
// (x & y) + ((x ^ y) >> 1) computes the same average without overflow.
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
  return (x & y) + ((x ^ y) >> 1);
}
|
|
// Unsigned 24-bit multiply, delegated to the OCKL device library.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
  const int product = __ockl_mul24_u32(x, y);
  return product;
}
|
|
|
|
// High 64 bits of the unsigned 64x64 -> 128-bit product, built with four
// 32x32 -> 64-bit partial products (schoolbook multiplication).
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
  ulong x0 = x & 0xffffffffUL;   // low 32 bits of x
  ulong x1 = x >> 32;            // high 32 bits of x
  ulong y0 = y & 0xffffffffUL;   // low 32 bits of y
  ulong y1 = y >> 32;            // high 32 bits of y
  ulong z0 = x0*y0;              // low x low partial product
  ulong t = x1*y0 + (z0 >> 32);  // mid partial plus carry from z0
  ulong z1 = t & 0xffffffffUL;
  ulong z2 = t >> 32;
  z1 = x0*y1 + z1;               // second mid partial
  // High word: high x high plus both carries.
  return x1*y1 + z2 + (z1 >> 32);
}
|
|
|
|
// High 32 bits of the unsigned 32x32 -> 64-bit product, via the OCKL library.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
  const unsigned int high_word = __ockl_mul_hi_u32(x, y);
  return high_word;
}
|
|
// Unsigned rounded halving add (CUDA __urhadd): (x + y + 1) >> 1 with the
// carry out of bit 31 preserved. The previous `(x + y + 1) >> 1` dropped
// the carry when the sum wrapped past 2^32; the identity
// (x | y) - ((x ^ y) >> 1) computes the same rounded average overflow-free.
__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
  return (x | y) - ((x ^ y) >> 1);
}
|
|
// Unsigned sum of absolute difference, delegated to the OCKL device library.
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
  const unsigned int result = __ockl_sadd_u32(x, y, z);
  return result;
}
|
|
|
|
// Lane index of the calling thread within its wavefront: mbcnt with an
// all-ones mask counts the bits below this lane across both 32-bit halves.
__device__ static inline unsigned int __lane_id() {
  return __builtin_amdgcn_mbcnt_hi(
      -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}
|
|
|
|
// Thin wrapper over the amdgcn mbcnt_lo builtin (masked bit count, low half).
__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {
  return __builtin_amdgcn_mbcnt_lo(x, y);
}
|
|
|
|
// Thin wrapper over the amdgcn mbcnt_hi builtin (masked bit count, high half).
__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {
  return __builtin_amdgcn_mbcnt_hi(x, y);
}
|
|
|
|
/*
|
|
HIP specific device functions
|
|
*/
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "amd_warp_functions.h"
|
|
#endif
|
|
|
|
#define MASK1 0x00ff00ff
|
|
#define MASK2 0xff00ff00
|
|
|
|
// Packed byte-wise arithmetic on a 32-bit lane: the even bytes (MASK1) and
// odd bytes (MASK2) are processed separately so per-byte carries cannot
// propagate into the neighbouring byte.
// NOTE(review): only the .w member of char4 is read and written here —
// presumably char4 (or .w) holds the full packed 32-bit value in this
// build's vector-type definition; verify against hip_vector_types.h.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
  char4 out;
  unsigned one1 = in1.w & MASK1;
  unsigned one2 = in2.w & MASK1;
  out.w = (one1 + one2) & MASK1;   // even bytes, carries masked off
  one1 = in1.w & MASK2;
  one2 = in2.w & MASK2;
  out.w = out.w | ((one1 + one2) & MASK2);  // odd bytes merged in
  return out;
}

// Packed byte-wise subtraction; same even/odd split as __hip_hc_add8pk so
// borrows stay within their byte.
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
  char4 out;
  unsigned one1 = in1.w & MASK1;
  unsigned one2 = in2.w & MASK1;
  out.w = (one1 - one2) & MASK1;
  one1 = in1.w & MASK2;
  one2 = in2.w & MASK2;
  out.w = out.w | ((one1 - one2) & MASK2);
  return out;
}

// Packed byte-wise multiplication; same even/odd split, products truncated
// to their byte by the masks.
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
  char4 out;
  unsigned one1 = in1.w & MASK1;
  unsigned one2 = in2.w & MASK1;
  out.w = (one1 * one2) & MASK1;
  one1 = in1.w & MASK2;
  one2 = in2.w & MASK2;
  out.w = out.w | ((one1 * one2) & MASK2);
  return out;
}
|
|
|
|
// double -> float with explicit rounding. Suffixes follow the CUDA device
// intrinsic convention: _rd rounds toward -inf (ocml cvtrtn), _rn to
// nearest even (plain conversion), _ru toward +inf (cvtrtp), _rz toward
// zero (cvtrtz).
__device__ static inline float __double2float_rd(double x) {
  return __ocml_cvtrtn_f32_f64(x);
}
__device__ static inline float __double2float_rn(double x) { return x; }
__device__ static inline float __double2float_ru(double x) {
  return __ocml_cvtrtp_f32_f64(x);
}
__device__ static inline float __double2float_rz(double x) {
  return __ocml_cvtrtz_f32_f64(x);
}

// Reinterpret halves of a double. memcpy (not a pointer cast) avoids
// strict-aliasing UB; tmp[1] is the high word, tmp[0] the low word
// (assumes little-endian layout — amdgcn targets are little-endian).
__device__ static inline int __double2hiint(double x) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");

  int tmp[2];
  __builtin_memcpy(tmp, &x, sizeof(tmp));

  return tmp[1];
}
__device__ static inline int __double2loint(double x) {
  static_assert(sizeof(double) == 2 * sizeof(int), "");

  int tmp[2];
  __builtin_memcpy(tmp, &x, sizeof(tmp));

  return tmp[0];
}

// double -> int: round in floating point first (floor/rint/ceil), then
// truncate-convert; _rz relies on the C++ cast's truncation.
__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

// double -> long long, same rounding scheme.
__device__ static inline long long int __double2ll_rd(double x) {
  return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
  return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
  return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }

// double -> unsigned int, same rounding scheme.
__device__ static inline unsigned int __double2uint_rd(double x) {
  return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
  return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
  return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

// double -> unsigned long long, same rounding scheme.
__device__ static inline unsigned long long int __double2ull_rd(double x) {
  return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
  return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
  return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
  return (unsigned long long int)x;
}
// Bit-exact reinterpretation of a double as a 64-bit integer.
__device__ static inline long long int __double_as_longlong(double x) {
  static_assert(sizeof(long long) == sizeof(double), "");

  long long tmp;
  __builtin_memcpy(&tmp, &x, sizeof(tmp));

  return tmp;
}
|
|
|
|
/*
|
|
__device__ unsigned short __float2half_rn(float x);
|
|
__device__ float __half2float(unsigned short);
|
|
|
|
The above device function are not a valid .
|
|
Use
|
|
__device__ __half __float2half_rn(float x);
|
|
__device__ float __half2float(__half);
|
|
from hip_fp16.h
|
|
|
|
CUDA implements half as unsigned short whereas, HIP doesn't.
|
|
|
|
*/
|
|
|
|
// float -> int: round in floating point first (floor/rint/ceil/trunc), then
// convert. Suffixes: _rd toward -inf, _rn nearest even, _ru toward +inf,
// _rz toward zero.
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

// float -> long long, same rounding scheme (_rz via the cast's truncation).
__device__ static inline long long int __float2ll_rd(float x) {
  return (long long int)__ocml_floor_f32(x);
}
__device__ static inline long long int __float2ll_rn(float x) {
  return (long long int)__ocml_rint_f32(x);
}
__device__ static inline long long int __float2ll_ru(float x) {
  return (long long int)__ocml_ceil_f32(x);
}
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

// float -> unsigned int, same rounding scheme.
__device__ static inline unsigned int __float2uint_rd(float x) {
  return (unsigned int)__ocml_floor_f32(x);
}
__device__ static inline unsigned int __float2uint_rn(float x) {
  return (unsigned int)__ocml_rint_f32(x);
}
__device__ static inline unsigned int __float2uint_ru(float x) {
  return (unsigned int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

// float -> unsigned long long, same rounding scheme.
__device__ static inline unsigned long long int __float2ull_rd(float x) {
  return (unsigned long long int)__ocml_floor_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
  return (unsigned long long int)__ocml_rint_f32(x);
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
  return (unsigned long long int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
  return (unsigned long long int)x;
}

// Bit-exact reinterpretations between float and 32-bit integers; memcpy
// avoids strict-aliasing UB.
__device__ static inline int __float_as_int(float x) {
  static_assert(sizeof(int) == sizeof(float), "");

  int tmp;
  __builtin_memcpy(&tmp, &x, sizeof(tmp));

  return tmp;
}

__device__ static inline unsigned int __float_as_uint(float x) {
  static_assert(sizeof(unsigned int) == sizeof(float), "");

  unsigned int tmp;
  __builtin_memcpy(&tmp, &x, sizeof(tmp));

  return tmp;
}

// Assemble a double from its high and low 32-bit halves (inverse of
// __double2hiint / __double2loint).
__device__ static inline double __hiloint2double(int hi, int lo) {
  static_assert(sizeof(double) == sizeof(uint64_t), "");

  uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
  double tmp1;
  __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

  return tmp1;
}

// int -> double is always exact, so a plain conversion suffices.
__device__ static inline double __int2double_rn(int x) { return (double)x; }

// int -> float with explicit rounding (ocml cvtrtn/rtp/rtz; _rn is the
// default conversion).
__device__ static inline float __int2float_rd(int x) {
  return __ocml_cvtrtn_f32_s32(x);
}
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) {
  return __ocml_cvtrtp_f32_s32(x);
}
__device__ static inline float __int2float_rz(int x) {
  return __ocml_cvtrtz_f32_s32(x);
}

// Bit-exact reinterpretation of a 32-bit integer as a float.
__device__ static inline float __int_as_float(int x) {
  static_assert(sizeof(float) == sizeof(int), "");

  float tmp;
  __builtin_memcpy(&tmp, &x, sizeof(tmp));

  return tmp;
}
|
|
|
|
// long long -> double with explicit rounding (ocml cvtrtn/rtp/rtz; _rn is
// the default conversion).
__device__ static inline double __ll2double_rd(long long int x) {
  return __ocml_cvtrtn_f64_s64(x);
}
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) {
  return __ocml_cvtrtp_f64_s64(x);
}
__device__ static inline double __ll2double_rz(long long int x) {
  return __ocml_cvtrtz_f64_s64(x);
}

// long long -> float with explicit rounding.
__device__ static inline float __ll2float_rd(long long int x) {
  return __ocml_cvtrtn_f32_s64(x);
}
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) {
  return __ocml_cvtrtp_f32_s64(x);
}
__device__ static inline float __ll2float_rz(long long int x) {
  return __ocml_cvtrtz_f32_s64(x);
}

// Bit-exact reinterpretation of a 64-bit integer as a double; memcpy avoids
// strict-aliasing UB.
__device__ static inline double __longlong_as_double(long long int x) {
  static_assert(sizeof(double) == sizeof(long long), "");

  double tmp;
  __builtin_memcpy(&tmp, &x, sizeof(tmp));

  return tmp;
}

// unsigned int -> double is always exact; plain conversion suffices.
__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }

// unsigned int -> float with explicit rounding.
__device__ static inline float __uint2float_rd(unsigned int x) {
  return __ocml_cvtrtn_f32_u32(x);
}
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) {
  return __ocml_cvtrtp_f32_u32(x);
}
__device__ static inline float __uint2float_rz(unsigned int x) {
  return __ocml_cvtrtz_f32_u32(x);
}

// Bit-exact reinterpretation of a 32-bit unsigned integer as a float.
__device__ static inline float __uint_as_float(unsigned int x) {
  static_assert(sizeof(float) == sizeof(unsigned int), "");

  float tmp;
  __builtin_memcpy(&tmp, &x, sizeof(tmp));

  return tmp;
}

// unsigned long long -> double with explicit rounding.
__device__ static inline double __ull2double_rd(unsigned long long int x) {
  return __ocml_cvtrtn_f64_u64(x);
}
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) {
  return __ocml_cvtrtp_f64_u64(x);
}
__device__ static inline double __ull2double_rz(unsigned long long int x) {
  return __ocml_cvtrtz_f64_u64(x);
}

// unsigned long long -> float with explicit rounding.
__device__ static inline float __ull2float_rd(unsigned long long int x) {
  return __ocml_cvtrtn_f32_u64(x);
}
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) {
  return __ocml_cvtrtp_f32_u64(x);
}
__device__ static inline float __ull2float_rz(unsigned long long int x) {
  return __ocml_cvtrtz_f32_u64(x);
}
|
|
|
|
#if defined(__clang__) && defined(__HIP__)
|
|
|
|
// Clock functions
|
|
__device__ long long int __clock64();
|
|
__device__ long long int __clock();
|
|
__device__ long long int clock64();
|
|
__device__ long long int clock();
|
|
__device__ long long int wall_clock64();
|
|
// hip.amdgcn.bc - named sync
|
|
__device__ void __named_sync();
|
|
|
|
#ifdef __HIP_DEVICE_COMPILE__
|
|
|
|
// Clock function to return GPU core cycle count.
|
|
// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
|
|
// through hipDeviceAttributeClockRate attribute.
|
|
// Clock function to return GPU core cycle count.
// GPU can change its core clock frequency at runtime. The maximum frequency can be queried
// through hipDeviceAttributeClockRate attribute.
__device__
inline __attribute((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
  // Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
  return (long long int) __builtin_amdgcn_s_memtime();
#else
  // Subject to change when better solution available
  return (long long int) __builtin_readcyclecounter();
#endif
}

// Alias for __clock64: both report the core cycle counter.
__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

// Clock function to return wall clock count at a constant frequency that can be queried
// through hipDeviceAttributeWallClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int wall_clock64() {
  return (long long int) __ockl_steadyctr_u64();
}

// Public CUDA-compatible names; both forward to the cycle counter above.
__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }

// hip.amdgcn.bc - named sync: currently a plain workgroup barrier.
__device__
inline
void __named_sync() { __builtin_amdgcn_s_barrier(); }
|
|
|
|
#endif // __HIP_DEVICE_COMPILE__
|
|
|
|
// warp vote function __all __any __ballot
|
|
// Warp vote: reductions of a per-lane predicate across the wavefront.
__device__
inline
int __all(int predicate) {
  // Non-zero iff the predicate is non-zero on every active lane.
  return __ockl_wfall_i32(predicate);
}

__device__
inline
int __any(int predicate) {
  // Non-zero iff the predicate is non-zero on at least one active lane.
  return __ockl_wfany_i32(predicate);
}

// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33

__device__
inline
unsigned long long int __ballot(int predicate) {
  // uicmp evaluates (predicate != 0) on every lane and packs the results
  // into a lane mask.
  return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}

// 64-bit-named variant; identical to __ballot.
__device__
inline
unsigned long long int __ballot64(int predicate) {
  return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
|
|
|
|
// hip.amdgcn.bc - lanemask
|
|
// hip.amdgcn.bc - lanemask
// Mask of active lanes strictly above the calling lane. __ballot64(1)
// yields the mask of currently active lanes.
__device__
inline
uint64_t __lanemask_gt()
{
  uint32_t lane = __ockl_lane_u32();
  // Top lane: no lanes above, avoid shifting by 64 (undefined).
  if (lane == 63)
    return 0;
  uint64_t ballot = __ballot64(1);
  uint64_t mask = (~((uint64_t)0)) << (lane + 1);
  return mask & ballot;
}

// Mask of active lanes strictly below the calling lane.
__device__
inline
uint64_t __lanemask_lt()
{
  uint32_t lane = __ockl_lane_u32();
  int64_t ballot = __ballot64(1);
  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
  return mask & ballot;
}

// Mask with only the calling lane's bit set (active or not).
__device__
inline
uint64_t __lanemask_eq()
{
  uint32_t lane = __ockl_lane_u32();
  int64_t mask = ((uint64_t)1 << lane);
  return mask;
}
|
|
|
|
|
|
// Local -> generic address-space conversion; the compiler performs the cast
// implicitly, so this is an identity at the source level.
__device__ inline void* __local_to_generic(void* p) { return p; }

#ifdef __HIP_DEVICE_COMPILE__
__device__
inline
void* __get_dynamicgroupbaseptr()
{
  // Get group segment base pointer: dynamic shared memory starts right
  // after the statically-allocated group segment, whose size the
  // groupstaticsize builtin reports.
  return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
// Host pass: declaration only; never called on the host.
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__

// Back-compat alias used by older generated code.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
  return __get_dynamicgroupbaseptr();
}
|
|
|
|
// Memory Fence Functions
|
|
// Memory Fence Functions: seq_cst fences at increasing scope. The scope
// string selects how far the ordering is visible: "agent" = whole device,
// "workgroup" = the thread block, "" = system-wide (default scope).
__device__
inline
static void __threadfence()
{
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

__device__
inline
static void __threadfence_block()
{
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

__device__
inline
static void __threadfence_system()
{
  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
|
|
|
|
// abort
|
|
// Device-side abort: traps the wavefront. Weak so a user-provided abort
// can override it.
__device__
inline
__attribute__((weak))
void abort() {
  return __builtin_trap();
}
|
|
|
|
// The noinline attribute helps encapsulate the printf expansion,
// which otherwise has a performance impact just by increasing the
// size of the calling function. Additionally, the weak attribute
// allows the function to exist as a global although its definition is
// included in every compilation unit.
#if defined(_WIN32) || defined(_WIN64)
// MSVC runtime assertion hook: message formatting is not implemented yet,
// so just trap.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
  // FIXME: Need `wchar_t` support to generate assertion message.
  __builtin_trap();
}
#else /* defined(_WIN32) || defined(_WIN64) */
// glibc-style assertion hook: emit "<file>:<line>: <function>: Device-side
// assertion `<expr>' failed." via the OCKL hostcall fprintf protocol, then
// trap the wavefront.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assert_fail(const char *assertion,
                   const char *file,
                   unsigned int line,
                   const char *function)
{
  const char fmt[] = "%s:%u: %s: Device-side assertion `%s' failed.\n";

  // strlen is not available as a built-in yet, so we create our own
  // loop in a macro. With a string literal argument, the compiler
  // usually manages to replace the loop with a constant.
  //
  // The macro does not check for null pointer, since all the string
  // arguments are defined to be constant literals when called from
  // the assert() macro.
  //
  // NOTE: The loop below includes the null terminator in the length
  // as required by append_string_n().
#define __hip_get_string_length(LEN, STR) \
  do { \
    const char *tmp = STR; \
    while (*tmp++); \
    LEN = tmp - STR; \
  } while (0)

  // Each append call returns the handle for the next one; the final call
  // (is_last = 1) flushes the message to the host.
  auto msg = __ockl_fprintf_stderr_begin();
  int len = 0;
  __hip_get_string_length(len, fmt);
  msg = __ockl_fprintf_append_string_n(msg, fmt, len, 0);
  __hip_get_string_length(len, file);
  msg = __ockl_fprintf_append_string_n(msg, file, len, 0);
  msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0);
  __hip_get_string_length(len, function);
  msg = __ockl_fprintf_append_string_n(msg, function, len, 0);
  __hip_get_string_length(len, assertion);
  __ockl_fprintf_append_string_n(msg, assertion, len, /* is_last = */ 1);

#undef __hip_get_string_length

  __builtin_trap();
}

// Bare-bones assertion hook used by some toolchains; arguments ignored.
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assertfail()
{
  // ignore all the args for now.
  __builtin_trap();
}
#endif /* defined(_WIN32) || defined(_WIN64) */
|
|
|
|
// OpenCL-style workgroup barrier: when memory-fence flags are given, the
// barrier is bracketed by release/acquire workgroup fences so memory
// operations before the barrier are visible after it.
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
  if (flags) {
    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
    __builtin_amdgcn_s_barrier();
    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
  } else {
    __builtin_amdgcn_s_barrier();
  }
}

// Integer-flag convenience wrapper over __work_group_barrier.
__device__
inline
static void __barrier(int n)
{
  __work_group_barrier((__cl_mem_fence_flags)n);
}

// CUDA-compatible block barrier with local-memory fencing.
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
  __barrier(__CLK_LOCAL_MEM_FENCE);
}

// Barrier + block-wide reduction of the (normalized) predicate: count of
// lanes with a non-zero predicate.
__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
  return __ockl_wgred_add_i32(!!predicate);
}

// Barrier + block-wide AND of the (normalized) predicate.
__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
  return __ockl_wgred_and_i32(!!predicate);
}

// Barrier + block-wide OR of the (normalized) predicate.
__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
  return __ockl_wgred_or_i32(!!predicate);
}
|
|
|
|
// hip.amdgcn.bc - device routine
|
|
/*
|
|
HW_ID Register bit structure for RDNA2 & RDNA3
|
|
WAVE_ID 4:0 Wave id within the SIMD.
|
|
SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
|
|
WGP_ID 13:10 Physical WGP ID.
|
|
SA_ID 16 Shader Array ID
|
|
SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
|
|
SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
|
|
DP_RATE 31:29 Number of double-precision float units per SIMD
|
|
|
|
HW_ID Register bit structure for GCN and CDNA
|
|
WAVE_ID 3:0 Wave buffer slot number. 0-9.
|
|
SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
|
|
PIPE_ID 7:6 Pipeline from which the wave was dispatched.
|
|
CU_ID 11:8 Compute Unit the wave is assigned to.
|
|
SH_ID 12 Shader Array (within an SE) the wave is assigned to.
|
|
SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940-942
|
|
14:13 Shader Engine the wave is assigned to for Vega.
|
|
TG_ID 19:16 Thread-group ID
|
|
VM_ID 23:20 Virtual Memory ID
|
|
QUEUE_ID 26:24 Queue from which this wave was dispatched.
|
|
STATE_ID 29:27 State ID (graphics only, not compute).
|
|
ME_ID 31:30 Micro-engine ID.
|
|
|
|
XCC_ID Register bit structure for gfx940
|
|
XCC_ID 3:0 XCC the wave is assigned to.
|
|
*/
|
|
|
|
#if (defined (__GFX10__) || defined (__GFX11__))
|
|
#define HW_ID 23
|
|
#else
|
|
#define HW_ID 4
|
|
#endif
|
|
|
|
#if (defined(__GFX10__) || defined(__GFX11__))
|
|
#define HW_ID_WGP_ID_SIZE 4
|
|
#define HW_ID_WGP_ID_OFFSET 10
|
|
#else
|
|
#define HW_ID_CU_ID_SIZE 4
|
|
#define HW_ID_CU_ID_OFFSET 8
|
|
#endif
|
|
|
|
#if (defined(__gfx908__) || defined(__gfx90a__) || \
|
|
defined(__GFX11__))
|
|
#define HW_ID_SE_ID_SIZE 3
|
|
#else //4 SEs/XCC for gfx940-942
|
|
#define HW_ID_SE_ID_SIZE 2
|
|
#endif
|
|
#if (defined(__GFX10__) || defined(__GFX11__))
|
|
#define HW_ID_SE_ID_OFFSET 18
|
|
#define HW_ID_SA_ID_OFFSET 16
|
|
#define HW_ID_SA_ID_SIZE 1
|
|
#else
|
|
#define HW_ID_SE_ID_OFFSET 13
|
|
#endif
|
|
|
|
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
|
|
#define XCC_ID 20
|
|
#define XCC_ID_XCC_ID_SIZE 4
|
|
#define XCC_ID_XCC_ID_OFFSET 0
|
|
#endif
|
|
|
|
#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
|
|
(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
|
|
#define __HIP_NO_IMAGE_SUPPORT 1
|
|
#endif
|
|
|
|
/*
|
|
Encoding of parameter bitmask
|
|
HW_ID 5:0 HW_ID
|
|
OFFSET 10:6 Range: 0..31
|
|
SIZE 15:11 Range: 1..32
|
|
*/
|
|
|
|
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
|
|
|
|
/*
|
|
__smid returns the wave's assigned Compute Unit and Shader Engine.
|
|
The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
|
|
Note: the results vary over time.
|
|
SZ minus 1 since SIZE is 1-based.
|
|
*/
|
|
// __smid: reads the HW_ID (and on gfx940+, XCC_ID) hardware registers and
// packs the fields identifying the execution unit into one integer. Field
// layouts per architecture are documented in the comment block above; the
// GETREG_IMMED size argument is SIZE-1 because the register encoding is
// 1-based. Note: the result can change over the wave's lifetime.
__device__
inline
unsigned __smid(void)
{
  unsigned se_id = __builtin_amdgcn_s_getreg(
      GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
#if (defined(__GFX10__) || defined(__GFX11__))
  // RDNA: the unit is a WGP within a shader array.
  unsigned wgp_id = __builtin_amdgcn_s_getreg(
      GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
  unsigned sa_id = __builtin_amdgcn_s_getreg(
      GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
#else
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
  // gfx940-942: also fold in the XCC (chiplet) id.
  unsigned xcc_id = __builtin_amdgcn_s_getreg(
      GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
#endif
  unsigned cu_id = __builtin_amdgcn_s_getreg(
      GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
  // Pack: SE | SA | WGP.
  unsigned temp = se_id;
  temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
  temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
  return temp;
  //TODO : CU Mode impl
#elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
  // Pack: XCC | SE | CU.
  unsigned temp = xcc_id;
  temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
  temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
  return temp;
#else
  // Pack: SE | CU.
  return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
#endif
}
|
|
|
|
/**
|
|
* Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
|
|
* To be removed in a future release.
|
|
*/
|
|
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
|
|
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
|
|
|
|
#endif //defined(__clang__) && defined(__HIP__)
|
|
|
|
|
|
// loop unrolling
|
|
// Byte-wise copy, manually unrolled by four; returns dst like memcpy.
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
  auto out = static_cast<unsigned char*>(dst);
  auto in = static_cast<const unsigned char*>(src);

  // Copy four bytes per iteration while at least four remain.
  for (; size >= 4u; size -= 4u, in += 4u, out += 4u) {
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
    out[3] = in[3];
  }

  // Tail of 0-3 remaining bytes.
  for (size_t i = 0u; i < size; ++i)
    out[i] = in[i];

  return dst;
}
|
|
|
|
// Byte-wise fill, manually unrolled by four; returns dst like memset.
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
  auto out = static_cast<unsigned char*>(dst);

  // Store four bytes per iteration while at least four remain.
  for (; size >= 4u; size -= 4u, out += 4u) {
    out[0] = val;
    out[1] = val;
    out[2] = val;
    out[3] = val;
  }

  // Tail of 0-3 remaining bytes.
  for (size_t i = 0u; i < size; ++i)
    out[i] = val;

  return dst;
}
|
|
#ifndef __OPENMP_AMDGCN__
|
|
// Device-side memcpy entry point; forwards to the HIP helper implementation.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}
|
|
|
|
// Device-side memset entry point. The standard signature takes an int, but
// only the low byte is written; truncate before forwarding to the helper.
static inline __device__ void* memset(void* ptr, int val, size_t size) {
    return __hip_hc_memset(ptr, static_cast<unsigned char>(val), size);
}
|
|
#endif // !__OPENMP_AMDGCN__
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
|
|
#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
|
|
|
|
// Backward lane permute of an unsigned value. The union reinterprets the
// bits as int because the builtin operates on int; no value conversion occurs.
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
    union { int i; unsigned u; float f; } bits;
    bits.u = src;
    bits.i = __builtin_amdgcn_ds_bpermute(index, bits.i);
    return bits.u;
}
|
|
|
|
// Backward lane permute of a float; bit pattern is moved via a union so the
// int-typed builtin can be used without changing the value.
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
    union { int i; unsigned u; float f; } bits;
    bits.f = src;
    bits.i = __builtin_amdgcn_ds_bpermute(index, bits.i);
    return bits.f;
}
|
|
|
|
// Forward lane permute of an unsigned value; bits are punned through a union
// for the int-typed builtin.
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
    union { int i; unsigned u; float f; } bits;
    bits.u = src;
    bits.i = __builtin_amdgcn_ds_permute(index, bits.i);
    return bits.u;
}
|
|
|
|
// Forward lane permute of a float; bits are punned through a union for the
// int-typed builtin.
__device__ static inline float __hip_ds_permutef(int index, float src) {
    union { int i; unsigned u; float f; } bits;
    bits.f = src;
    bits.i = __builtin_amdgcn_ds_permute(index, bits.i);
    return bits.f;
}
|
|
|
|
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
|
|
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
|
|
|
|
template <int pattern>
|
|
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
|
|
union { int i; unsigned u; float f; } tmp; tmp.u = src;
|
|
tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
|
|
return tmp.u;
|
|
}
|
|
|
|
template <int pattern>
|
|
__device__ static inline float __hip_ds_swizzlef_N(float src) {
|
|
union { int i; unsigned u; float f; } tmp; tmp.f = src;
|
|
tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
|
|
return tmp.f;
|
|
}
|
|
|
|
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
|
|
__hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
|
|
|
|
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
|
|
__device__ static inline int __hip_move_dpp_N(int src) {
|
|
return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
|
|
bound_ctrl);
|
|
}
|
|
|
|
// Number of lanes in a wavefront on the compile target (set by the compiler
// via __AMDGCN_WAVEFRONT_SIZE; 32 or 64 depending on the GPU architecture).
static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
|
|
|
|
// Read `var` from lane `src_lane` of the logical sub-warp of `width` lanes
// containing the caller. `width` is expected to be a power of two.
__device__ inline int __shfl(int var, int src_lane, int width = warpSize) {
    const int lane = __lane_id();
    const int subwarp_base = lane & ~(width - 1);
    const int target = subwarp_base + (src_lane & (width - 1));
    // ds_bpermute addresses lanes in bytes, hence the << 2.
    return __builtin_amdgcn_ds_bpermute(target << 2, var);
}
|
|
// Unsigned overload: reinterpret as int for the shuffle; bits are unchanged.
__device__ inline unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } bits;
    bits.u = var;
    bits.i = __shfl(bits.i, src_lane, width);
    return bits.u;
}
|
|
// Float overload: reinterpret as int for the shuffle; bits are unchanged.
__device__ inline float __shfl(float var, int src_lane, int width = warpSize) {
    union { int i; unsigned u; float f; } bits;
    bits.f = var;
    bits.i = __shfl(bits.i, src_lane, width);
    return bits.f;
}
|
|
// 64-bit overload: split into two 32-bit halves, shuffle each, reassemble.
// Bit patterns are moved with __builtin_memcpy to avoid aliasing issues.
__device__ inline double __shfl(double var, int src_lane, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl(halves[0], src_lane, width);
    halves[1] = __shfl(halves[1], src_lane, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    double result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
// long overload. On LP64 targets long is 64-bit and is shuffled as two halves;
// on LLP64 (MSVC) long is 32-bit and a single int shuffle suffices.
__device__ inline long __shfl(long var, int src_lane, int width = warpSize)
{
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl(halves[0], src_lane, width);
    halves[1] = __shfl(halves[1], src_lane, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
#endif
}
|
|
// unsigned long overload; see the long overload for the LP64/LLP64 split.
__device__ inline unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl(halves[0], src_lane, width);
    halves[1] = __shfl(halves[1], src_lane, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
#endif
}
|
|
// long long overload: always 64-bit; shuffle as two 32-bit halves.
__device__ inline long long __shfl(long long var, int src_lane, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl(halves[0], src_lane, width);
    halves[1] = __shfl(halves[1], src_lane, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
// unsigned long long overload: always 64-bit; shuffle as two 32-bit halves.
__device__ inline unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl(halves[0], src_lane, width);
    halves[1] = __shfl(halves[1], src_lane, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
|
|
// Read `var` from the lane `lane_delta` below the caller; lanes whose source
// would fall below the sub-warp's first lane read their own value instead.
__device__ inline int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
    const int lane = __lane_id();
    const int subwarp_base = lane & ~(width - 1);
    int src = lane - lane_delta;
    if (src < subwarp_base) src = lane;
    return __builtin_amdgcn_ds_bpermute(src << 2, var);
}
|
|
// Unsigned overload: reinterpret as int for the shuffle; bits are unchanged.
__device__ inline unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } bits;
    bits.u = var;
    bits.i = __shfl_up(bits.i, lane_delta, width);
    return bits.u;
}
|
|
// Float overload: reinterpret as int for the shuffle; bits are unchanged.
__device__ inline float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } bits;
    bits.f = var;
    bits.i = __shfl_up(bits.i, lane_delta, width);
    return bits.f;
}
|
|
// 64-bit overload: split into two 32-bit halves, shuffle each, reassemble.
__device__ inline double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_up(halves[0], lane_delta, width);
    halves[1] = __shfl_up(halves[1], lane_delta, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    double result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
// long overload; 64-bit (two halves) on LP64, 32-bit passthrough on MSVC.
__device__ inline long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_up(halves[0], lane_delta, width);
    halves[1] = __shfl_up(halves[1], lane_delta, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
#endif
}
|
|
|
|
// unsigned long overload; 64-bit (two halves) on LP64, 32-bit on MSVC.
__device__ inline unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_up(halves[0], lane_delta, width);
    halves[1] = __shfl_up(halves[1], lane_delta, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
#endif
}
|
|
|
|
// long long overload: always 64-bit; shuffle as two 32-bit halves.
__device__ inline long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_up(halves[0], lane_delta, width);
    halves[1] = __shfl_up(halves[1], lane_delta, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
|
|
// unsigned long long overload: always 64-bit; shuffle as two 32-bit halves.
__device__ inline unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_up(halves[0], lane_delta, width);
    halves[1] = __shfl_up(halves[1], lane_delta, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
|
|
// Read `var` from the lane `lane_delta` above the caller; lanes whose source
// would cross the sub-warp's upper bound read their own value instead.
__device__ inline int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
    const int lane = __lane_id();
    int src = lane + lane_delta;
    if ((int)((lane & (width - 1)) + lane_delta) >= width) src = lane;
    return __builtin_amdgcn_ds_bpermute(src << 2, var);
}
|
|
// Unsigned overload: reinterpret as int for the shuffle; bits are unchanged.
__device__ inline unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } bits;
    bits.u = var;
    bits.i = __shfl_down(bits.i, lane_delta, width);
    return bits.u;
}
|
|
// Float overload: reinterpret as int for the shuffle; bits are unchanged.
__device__ inline float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
    union { int i; unsigned u; float f; } bits;
    bits.f = var;
    bits.i = __shfl_down(bits.i, lane_delta, width);
    return bits.f;
}
|
|
// 64-bit overload: split into two 32-bit halves, shuffle each, reassemble.
__device__ inline double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_down(halves[0], lane_delta, width);
    halves[1] = __shfl_down(halves[1], lane_delta, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    double result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
// long overload; 64-bit (two halves) on LP64, 32-bit passthrough on MSVC.
__device__ inline long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_down(halves[0], lane_delta, width);
    halves[1] = __shfl_down(halves[1], lane_delta, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
#endif
}
|
|
// unsigned long overload; 64-bit (two halves) on LP64, 32-bit on MSVC.
__device__ inline unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_down(halves[0], lane_delta, width);
    halves[1] = __shfl_down(halves[1], lane_delta, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
#endif
}
|
|
// long long overload: always 64-bit; shuffle as two 32-bit halves.
__device__ inline long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_down(halves[0], lane_delta, width);
    halves[1] = __shfl_down(halves[1], lane_delta, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
// unsigned long long overload: always 64-bit; shuffle as two 32-bit halves.
__device__ inline unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_down(halves[0], lane_delta, width);
    halves[1] = __shfl_down(halves[1], lane_delta, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
|
|
// Read `var` from the lane whose id is the caller's XOR `lane_mask`; lanes
// whose XOR partner lies beyond the sub-warp read their own value instead.
__device__ inline int __shfl_xor(int var, int lane_mask, int width = warpSize) {
    const int lane = __lane_id();
    int target = lane ^ lane_mask;
    const int next_subwarp_base = (lane + width) & ~(width - 1);
    if (target >= next_subwarp_base) target = lane;
    return __builtin_amdgcn_ds_bpermute(target << 2, var);
}
|
|
// Unsigned overload: reinterpret as int for the shuffle; bits are unchanged.
__device__ inline unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } bits;
    bits.u = var;
    bits.i = __shfl_xor(bits.i, lane_mask, width);
    return bits.u;
}
|
|
// Float overload: reinterpret as int for the shuffle; bits are unchanged.
__device__ inline float __shfl_xor(float var, int lane_mask, int width = warpSize) {
    union { int i; unsigned u; float f; } bits;
    bits.f = var;
    bits.i = __shfl_xor(bits.i, lane_mask, width);
    return bits.f;
}
|
|
// 64-bit overload: split into two 32-bit halves, shuffle each, reassemble.
__device__ inline double __shfl_xor(double var, int lane_mask, int width = warpSize) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_xor(halves[0], lane_mask, width);
    halves[1] = __shfl_xor(halves[1], lane_mask, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    double result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
// long overload; 64-bit (two halves) on LP64, 32-bit passthrough on MSVC.
__device__ inline long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
#ifndef _MSC_VER
    static_assert(sizeof(long) == 2 * sizeof(int), "");
    static_assert(sizeof(long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_xor(halves[0], lane_mask, width);
    halves[1] = __shfl_xor(halves[1], lane_mask, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(long) == sizeof(int), "");
    return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
#endif
}
|
|
// unsigned long overload; 64-bit (two halves) on LP64, 32-bit on MSVC.
__device__ inline unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
{
#ifndef _MSC_VER
    static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_xor(halves[0], lane_mask, width);
    halves[1] = __shfl_xor(halves[1], lane_mask, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    unsigned long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
#else
    static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
    return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
#endif
}
|
|
// long long overload: always 64-bit; shuffle as two 32-bit halves.
__device__ inline long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(long long) == 2 * sizeof(int), "");
    static_assert(sizeof(long long) == sizeof(uint64_t), "");

    int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_xor(halves[0], lane_mask, width);
    halves[1] = __shfl_xor(halves[1], lane_mask, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
// unsigned long long overload: always 64-bit; shuffle as two 32-bit halves.
__device__ inline unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
{
    static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
    static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");

    unsigned int halves[2];
    __builtin_memcpy(halves, &var, sizeof(halves));
    halves[0] = __shfl_xor(halves[0], lane_mask, width);
    halves[1] = __shfl_xor(halves[1], lane_mask, width);

    const uint64_t packed =
        (static_cast<uint64_t>(halves[1]) << 32ull) | static_cast<uint32_t>(halves[0]);
    unsigned long long result;
    __builtin_memcpy(&result, &packed, sizeof(packed));
    return result;
}
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
/**
|
|
* @file amd_detail/hip_cooperative_groups_helper.h
|
|
*
|
|
* @brief Device side implementation of cooperative group feature.
|
|
*
|
|
* Defines helper constructs and APIs which aid the types and device API
|
|
* wrappers defined within `amd_detail/hip_cooperative_groups.h`.
|
|
*/
|
|
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
|
|
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
|
|
|
|
#if __cplusplus
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/amd_detail/amd_hip_runtime.h> // threadId, blockId
|
|
#include <hip/amd_detail/amd_device_functions.h>
|
|
#endif
|
|
#if !defined(__align__)
|
|
#define __align__(x) __attribute__((aligned(x)))
|
|
#endif
|
|
|
|
#if !defined(__CG_QUALIFIER__)
|
|
#define __CG_QUALIFIER__ __device__ __forceinline__
|
|
#endif
|
|
|
|
#if !defined(__CG_STATIC_QUALIFIER__)
|
|
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
|
|
#endif
|
|
|
|
#if !defined(_CG_STATIC_CONST_DECL_)
|
|
#define _CG_STATIC_CONST_DECL_ static constexpr
|
|
#endif
|
|
|
|
// Integer type wide enough to hold one bit per lane of a wavefront:
// 32-bit for wave32 targets, 64-bit for wave64 targets.
#if __AMDGCN_WAVEFRONT_SIZE == 32
using lane_mask = unsigned int;
#else
using lane_mask = unsigned long long int;
#endif
|
|
|
|
namespace cooperative_groups {
|
|
|
|
/* Global scope */
|
|
// True when `size` is a power of two (note: size == 0 also satisfies this test).
template <unsigned int size>
using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;

// True when `size` does not exceed the wavefront size of the compile target.
template <unsigned int size>
using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;

// A valid tile size is a power of two no larger than the wavefront.
template <unsigned int size>
using is_valid_tile_size =
    std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;

// Element types accepted by the group collectives: integral or floating point.
template <typename T>
using is_valid_type =
    std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;
|
|
|
|
namespace internal {
|
|
|
|
/**
|
|
* @brief Enums representing different cooperative group types
|
|
* @note This enum is only applicable on Linux.
|
|
*
|
|
*/
|
|
// Discriminator stored in thread_group::_type identifying which cooperative
// group flavor a given group object represents.
typedef enum {
  cg_invalid,
  cg_multi_grid,
  cg_grid,
  cg_workgroup,
  cg_tiled_group,
  cg_coalesced_group
} group_type;
|
|
/**
|
|
* @ingroup CooperativeG
|
|
* @{
|
|
* This section describes the cooperative groups functions of HIP runtime API.
|
|
*
|
|
* The cooperative groups provides flexible thread parallel programming algorithms, threads
|
|
* cooperate and share data to perform collective computations.
|
|
*
|
|
 * @note Cooperative groups feature is implemented on Linux, under development
|
|
* on Windows.
|
|
*
|
|
*/
|
|
/**
|
|
*
|
|
* @brief Functionalities related to multi-grid cooperative group type
|
|
* @note The following cooperative groups functions are only applicable on Linux.
|
|
*
|
|
*/
|
|
namespace multi_grid {
|
|
|
|
__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
|
|
return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }
|
|
|
|
__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
|
|
return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }
|
|
|
|
__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }
|
|
|
|
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
|
|
return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }
|
|
|
|
__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
|
|
|
|
__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
|
|
|
|
} // namespace multi_grid
|
|
|
|
/**
|
|
* @brief Functionalities related to grid cooperative group type
|
|
* @note The following cooperative groups functions are only applicable on Linux.
|
|
*/
|
|
namespace grid {
|
|
|
|
__CG_STATIC_QUALIFIER__ uint32_t size() {
|
|
return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
|
|
(blockDim.x * gridDim.x));
|
|
}
|
|
|
|
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
|
|
// Compute global id of the workgroup to which the current thread belongs to
|
|
uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
|
|
(blockIdx.y * gridDim.x) + (blockIdx.x));
|
|
|
|
// Compute total number of threads being passed to reach current workgroup
|
|
// within grid
|
|
uint32_t num_threads_till_current_workgroup =
|
|
static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
|
|
|
|
// Compute thread local rank within current workgroup
|
|
uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
|
|
(threadIdx.y * blockDim.x) + (threadIdx.x));
|
|
|
|
return (num_threads_till_current_workgroup + local_thread_rank);
|
|
}
|
|
|
|
__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }
|
|
|
|
__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
|
|
|
|
} // namespace grid
|
|
|
|
/**
|
|
* @brief Functionalities related to `workgroup` (thread_block in CUDA terminology)
|
|
* cooperative group type
|
|
* @note The following cooperative groups functions are only applicable on Linux.
|
|
*/
|
|
namespace workgroup {
|
|
|
|
__CG_STATIC_QUALIFIER__ dim3 group_index() {
|
|
return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
|
|
static_cast<uint32_t>(blockIdx.z)));
|
|
}
|
|
|
|
__CG_STATIC_QUALIFIER__ dim3 thread_index() {
|
|
return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
|
|
static_cast<uint32_t>(threadIdx.z)));
|
|
}
|
|
|
|
__CG_STATIC_QUALIFIER__ uint32_t size() {
|
|
return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
|
|
}
|
|
|
|
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
|
|
return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
|
|
(threadIdx.y * blockDim.x) + (threadIdx.x)));
|
|
}
|
|
|
|
__CG_STATIC_QUALIFIER__ bool is_valid() {
|
|
return true;
|
|
}
|
|
|
|
__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
|
|
|
|
__CG_STATIC_QUALIFIER__ dim3 block_dim() {
|
|
return (dim3(static_cast<uint32_t>(blockDim.x), static_cast<uint32_t>(blockDim.y),
|
|
static_cast<uint32_t>(blockDim.z)));
|
|
}
|
|
|
|
} // namespace workgroup
|
|
|
|
namespace tiled_group {
|
|
|
|
// enforce ordering for memory intructions
|
|
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
|
|
|
|
} // namespace tiled_group
|
|
|
|
namespace coalesced_group {
|
|
|
|
// enforce ordering for memory intructions
|
|
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
|
|
|
|
// Masked bit count
|
|
//
|
|
// For each thread, this function returns the number of active threads which
|
|
// have i-th bit of x set and come before the current thread.
|
|
__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
|
|
unsigned int counter=0;
|
|
#if __AMDGCN_WAVEFRONT_SIZE == 32
|
|
counter = __builtin_amdgcn_mbcnt_lo(x, add);
|
|
#else
|
|
counter = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
|
|
counter = __builtin_amdgcn_mbcnt_hi(static_cast<lane_mask>(x >> 32), counter);
|
|
#endif
|
|
|
|
return counter;
|
|
}
|
|
|
|
} // namespace coalesced_group
|
|
|
|
|
|
} // namespace internal
|
|
|
|
} // namespace cooperative_groups
|
|
/**
|
|
* @}
|
|
*/
|
|
|
|
#endif // __cplusplus
|
|
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
/**
|
|
* @file amd_detail/hip_cooperative_groups.h
|
|
*
|
|
* @brief Device side implementation of `Cooperative Group` feature.
|
|
*
|
|
* Defines new types and device API wrappers related to `Cooperative Group`
|
|
* feature, which the programmer can directly use in his kernel(s) in order to
|
|
* make use of this feature.
|
|
*/
|
|
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
|
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
|
|
|
#if __cplusplus
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/amd_detail/hip_cooperative_groups_helper.h>
|
|
#endif
|
|
|
|
#define __hip_abort() \
  { abort(); }
#if defined(NDEBUG)
#define __hip_assert(COND)
#else
// Abort when COND is false. COND must be parenthesized in the expansion so an
// expression such as `a != b` is negated as a whole (`!(a != b)`) rather than
// having `!` bind only to its first token (`!a != b`).
#define __hip_assert(COND)                                                    \
  {                                                                           \
    if (!(COND)) {                                                            \
      __hip_abort();                                                          \
    }                                                                         \
  }
#endif
|
|
|
|
namespace cooperative_groups {
|
|
|
|
/** @brief The base type of all cooperative group types
|
|
*
|
|
* \details Holds the key properties of a constructed cooperative group types
|
|
* object, like the group type, its size, etc
|
|
*
|
|
 * @note Cooperative groups feature is implemented on Linux, under development
|
|
* on Windows.
|
|
*/
|
|
class thread_group {
|
|
protected:
|
|
uint32_t _type; // thread_group type
|
|
uint32_t _size; // total number of threads in the tread_group
|
|
uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
|
|
// LSB represents lane 0, and MSB represents lane 63
|
|
|
|
// Construct a thread group, and set thread group type and other essential
|
|
// thread group properties. This generic thread group is directly constructed
|
|
// only when the group is supposed to contain only the calling the thread
|
|
// (throurh the API - `this_thread()`), and in all other cases, this thread
|
|
// group object is a sub-object of some other derived thread group object
|
|
__CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
|
|
uint64_t mask = static_cast<uint64_t>(0)) {
|
|
_type = type;
|
|
_size = size;
|
|
_mask = mask;
|
|
}
|
|
|
|
struct _tiled_info {
|
|
bool is_tiled;
|
|
unsigned int size;
|
|
unsigned int meta_group_rank;
|
|
unsigned int meta_group_size;
|
|
};
|
|
|
|
struct _coalesced_info {
|
|
lane_mask member_mask;
|
|
unsigned int size;
|
|
struct _tiled_info tiled_info;
|
|
} coalesced_info;
|
|
|
|
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
|
|
unsigned int tile_size);
|
|
friend class thread_block;
|
|
|
|
public:
|
|
// Total number of threads in the thread group, and this serves the purpose
|
|
// for all derived cooperative group types since their `size` is directly
|
|
// saved during the construction
|
|
__CG_QUALIFIER__ uint32_t size() const { return _size; }
|
|
__CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
|
|
// Rank of the calling thread within [0, size())
|
|
__CG_QUALIFIER__ uint32_t thread_rank() const;
|
|
// Is this cooperative group type valid?
|
|
__CG_QUALIFIER__ bool is_valid() const;
|
|
// synchronize the threads in the thread group
|
|
__CG_QUALIFIER__ void sync() const;
|
|
};
|
|
/**
|
|
*-------------------------------------------------------------------------------------------------
|
|
*-------------------------------------------------------------------------------------------------
|
|
* @defgroup CooperativeG Cooperative Groups
|
|
* @ingroup API
|
|
* @{
|
|
* This section describes the cooperative groups functions of HIP runtime API.
|
|
*
|
|
* The cooperative groups provides flexible thread parallel programming algorithms, threads
|
|
* cooperate and share data to perform collective computations.
|
|
*
|
|
* @note Cooperative groups feature is implemented on Linux, under developement
|
|
* on Windows.
|
|
*
|
|
*/
|
|
/** \brief The multi-grid cooperative group type
 *
 * \details Represents an inter-device cooperative group type where the
 *          participating threads within the group span across multiple
 *          devices, running the (same) kernel on these devices.
 * @note The multi-grid cooperative group type is implemented on Linux, under
 *       development on Windows.
 */
class multi_grid_group : public thread_group {
  // Only these friend functions are allowed to construct an object of this class
  // and access its resources
  friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();

 protected:
  // Construct multi-grid thread group (through the API this_multi_grid())
  explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
      : thread_group(internal::cg_multi_grid, size) {}

 public:
  // Number of invocations participating in this multi-grid group. In other
  // words, the number of GPUs
  __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
  // Rank of this invocation. In other words, an ID number within the range
  // [0, num_grids()) of the GPU, this kernel is running on
  __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
  // Rank of the calling thread across all grids; forwards to internal::multi_grid.
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
  // Forwards the validity check to internal::multi_grid::is_valid().
  __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
  // Barrier over every thread of every participating grid.
  __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};
|
|
|
|
/** @brief User exposed API interface to construct multi-grid cooperative
|
|
* group type object - `multi_grid_group`
|
|
*
|
|
* \details User is not allowed to directly construct an object of type
|
|
* `multi_grid_group`. Instead, he should construct it through this
|
|
* API function
|
|
* @note This multi-grid cooperative API type is implemented on Linux, under developement
|
|
* on Windows.
|
|
*/
|
|
__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
|
|
return multi_grid_group(internal::multi_grid::size());
|
|
}
|
|
|
|
/** @brief The grid cooperative group type
 *
 * \details Represents an inter-workgroup cooperative group type where the
 *          participating threads within the group span across multiple
 *          workgroups running the (same) kernel on the same device.
 * @note This is implemented on Linux, under development on Windows.
 */
class grid_group : public thread_group {
  // Only these friend functions are allowed to construct an object of this class
  // and access its resources
  friend __CG_QUALIFIER__ grid_group this_grid();

 protected:
  // Construct grid thread group (through the API this_grid())
  explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}

 public:
  // Rank of the calling thread within the whole grid; forwards to internal::grid.
  __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
  // Forwards the validity check to internal::grid::is_valid().
  __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
  // Grid-wide barrier; forwards to internal::grid::sync().
  __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
};
|
|
|
|
/** @brief User exposed API interface to construct grid cooperative group type
|
|
* object - `grid_group`
|
|
*
|
|
* \details User is not allowed to directly construct an object of type
|
|
* `multi_grid_group`. Instead, he should construct it through this
|
|
* API function
|
|
* @note This function is implemented on Linux, under developement
|
|
* on Windows.
|
|
*/
|
|
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
|
|
|
|
/** @brief The workgroup (thread-block in CUDA terminology) cooperative group
 *  type
 *
 * \details Represents an intra-workgroup cooperative group type where the
 *          participating threads within the group are exactly the same threads
 *          which participate in the currently executing `workgroup`.
 * @note This is implemented on Linux, under development on Windows.
 */
class thread_block : public thread_group {
  // Only these friend functions are allowed to construct an object of this
  // class and access its resources
  friend __CG_QUALIFIER__ thread_block this_thread_block();
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
                                                       unsigned int tile_size);
 protected:
  // Construct a workgroup thread group (through the API this_thread_block())
  explicit __CG_QUALIFIER__ thread_block(uint32_t size)
      : thread_group(internal::cg_workgroup, size) {}

  // Partition the block into tiles of `tile_size` threads. Asserts when
  // `tile_size` is zero, not a power of two, or wider than the wavefront.
  __CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
    // Invalid tile size, assert
    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size")
    }

    // Record the tile size plus this thread's tile rank and the tile count.
    thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
    tiledGroup.coalesced_info.tiled_info.size = tile_size;
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    tiledGroup.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
    tiledGroup.coalesced_info.tiled_info.meta_group_size = (size() + tile_size - 1) / tile_size;
    return tiledGroup;
  }

 public:
  // 3-dimensional block index within the grid
  __CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
  // 3-dimensional thread index within the block
  __CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
  // Linear rank of the calling thread within the block.
  __CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
  // Total number of threads in the block.
  __CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
  // Forwards the validity check to internal::workgroup::is_valid().
  __CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
  // Block-wide barrier; forwards to internal::workgroup::sync().
  __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
  // 3-dimensional dimensions of the block.
  __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
};
|
|
|
|
/** \brief User exposed API interface to construct workgroup cooperative
|
|
* group type object - `thread_block`.
|
|
*
|
|
* \details User is not allowed to directly construct an object of type
|
|
* `thread_block`. Instead, he should construct it through this API
|
|
* function.
|
|
* @note This function is implemented on Linux, under developement
|
|
* on Windows.
|
|
*/
|
|
__CG_QUALIFIER__ thread_block this_thread_block() {
|
|
return thread_block(internal::workgroup::size());
|
|
}
|
|
|
|
/** \brief The tiled_group cooperative group type
 *
 * \details Represents one tiled thread group in a wavefront.
 *          This group type also supports sub-wave level intrinsics.
 * @note This is implemented on Linux, under development on Windows.
 */

class tiled_group : public thread_group {
 private:
  friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
                                                       unsigned int tile_size);
  friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
                                                      unsigned int tile_size);

  // Re-partition this tile into smaller tiles of `tile_size` threads.
  // Asserts on an invalid size; returns *this unchanged when the requested
  // size is not smaller than the current tile.
  __CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
    const bool pow2 = ((tile_size & (tile_size - 1)) == 0);

    if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
      __hip_assert(false && "invalid tile size")
    }

    if (size() <= tile_size) {
      return *this;
    }

    tiled_group tiledGroup = tiled_group(tile_size);
    tiledGroup.coalesced_info.tiled_info.is_tiled = true;
    return tiledGroup;
  }

 protected:
  // Construct a tile of `tileSize` threads and record it in tiled_info.
  explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
      : thread_group(internal::cg_tiled_group, tileSize) {
    coalesced_info.tiled_info.size = tileSize;
    coalesced_info.tiled_info.is_tiled = true;
  }

 public:
  // Number of threads in this tile.
  __CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }

  // Rank within the tile; the mask works because the tile size is a power of two.
  __CG_QUALIFIER__ unsigned int thread_rank() const {
    return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
  }

  // Synchronize the threads of this tile.
  __CG_QUALIFIER__ void sync() const {
    internal::tiled_group::sync();
  }
};
|
|
|
|
/** \brief The coalesced_group cooperative group type
|
|
*
|
|
* \details Represents a active thread group in a wavefront.
|
|
* This group type also supports sub-wave level intrinsics.
|
|
* @note This is implemented on Linux, under developement
|
|
* on Windows.
|
|
*/
|
|
class coalesced_group : public thread_group {
|
|
private:
|
|
friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
|
|
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
|
|
friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
|
|
|
|
__CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
|
|
const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
|
|
|
|
if (!tile_size || (tile_size > size()) || !pow2) {
|
|
return coalesced_group(0);
|
|
}
|
|
|
|
// If a tiled group is passed to be partitioned further into a coalesced_group.
|
|
// prepare a mask for further partitioning it so that it stays coalesced.
|
|
if (coalesced_info.tiled_info.is_tiled) {
|
|
unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
|
|
unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
|
|
lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
|
|
|
|
member_mask <<= (__lane_id() & ~(tile_size - 1));
|
|
coalesced_group coalesced_tile = coalesced_group(member_mask);
|
|
coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
|
|
coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
|
|
coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
|
|
return coalesced_tile;
|
|
}
|
|
// Here the parent coalesced_group is not partitioned.
|
|
else {
|
|
lane_mask member_mask = 0;
|
|
unsigned int tile_rank = 0;
|
|
int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
|
|
|
|
for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
|
|
lane_mask active = coalesced_info.member_mask & (1 << i);
|
|
// Make sure the lane is active
|
|
if (active) {
|
|
if (lanes_to_skip <= 0 && tile_rank < tile_size) {
|
|
// Prepare a member_mask that is appropriate for a tile
|
|
member_mask |= active;
|
|
tile_rank++;
|
|
}
|
|
lanes_to_skip--;
|
|
}
|
|
}
|
|
coalesced_group coalesced_tile = coalesced_group(member_mask);
|
|
coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
|
|
coalesced_tile.coalesced_info.tiled_info.meta_group_size =
|
|
(size() + tile_size - 1) / tile_size;
|
|
return coalesced_tile;
|
|
}
|
|
return coalesced_group(0);
|
|
}
|
|
|
|
protected:
|
|
// Constructor
|
|
explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
|
|
: thread_group(internal::cg_coalesced_group) {
|
|
coalesced_info.member_mask = member_mask; // Which threads are active
|
|
coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active
|
|
coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
|
|
coalesced_info.tiled_info.meta_group_rank = 0;
|
|
coalesced_info.tiled_info.meta_group_size = 1;
|
|
}
|
|
|
|
public:
|
|
__CG_QUALIFIER__ unsigned int size() const {
|
|
return coalesced_info.size;
|
|
}
|
|
|
|
__CG_QUALIFIER__ unsigned int thread_rank() const {
|
|
return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
|
|
}
|
|
|
|
__CG_QUALIFIER__ void sync() const {
|
|
internal::coalesced_group::sync();
|
|
}
|
|
|
|
__CG_QUALIFIER__ unsigned int meta_group_rank() const {
|
|
return coalesced_info.tiled_info.meta_group_rank;
|
|
}
|
|
|
|
__CG_QUALIFIER__ unsigned int meta_group_size() const {
|
|
return coalesced_info.tiled_info.meta_group_size;
|
|
}
|
|
|
|
template <class T>
|
|
__CG_QUALIFIER__ T shfl(T var, int srcRank) const {
|
|
static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
|
|
|
|
srcRank = srcRank % static_cast<int>(size());
|
|
|
|
int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
|
|
: (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
|
|
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
|
|
|
|
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
|
|
}
|
|
|
|
template <class T>
|
|
__CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
|
|
static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
|
|
|
|
// Note: The cuda implementation appears to use the remainder of lane_delta
|
|
// and WARP_SIZE as the shift value rather than lane_delta itself.
|
|
// This is not described in the documentation and is not done here.
|
|
|
|
if (size() == __AMDGCN_WAVEFRONT_SIZE) {
|
|
return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
|
|
}
|
|
|
|
int lane;
|
|
if (__AMDGCN_WAVEFRONT_SIZE == 64) {
|
|
lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
|
|
}
|
|
else {
|
|
lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
|
|
}
|
|
|
|
if (lane == -1) {
|
|
lane = __lane_id();
|
|
}
|
|
|
|
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
|
|
}
|
|
|
|
template <class T>
|
|
__CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
|
|
static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
|
|
|
|
// Note: The cuda implementation appears to use the remainder of lane_delta
|
|
// and WARP_SIZE as the shift value rather than lane_delta itself.
|
|
// This is not described in the documentation and is not done here.
|
|
|
|
if (size() == __AMDGCN_WAVEFRONT_SIZE) {
|
|
return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
|
|
}
|
|
|
|
int lane;
|
|
if (__AMDGCN_WAVEFRONT_SIZE == 64) {
|
|
lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
|
|
}
|
|
else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
|
|
lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
|
|
}
|
|
|
|
if (lane == -1) {
|
|
lane = __lane_id();
|
|
}
|
|
|
|
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
|
|
}
|
|
};
|
|
|
|
/** \brief User exposed API to create coalesced groups.
|
|
*
|
|
* \details A collective operation that groups all active lanes into a new thread group.
|
|
* @note This function is implemented on Linux, under developement
|
|
* on Windows.
|
|
*/
|
|
|
|
__CG_QUALIFIER__ coalesced_group coalesced_threads() {
|
|
return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
|
|
}
|
|
|
|
/**
 * Implementation of all publicly exposed base class APIs
 * @note This function is implemented on Linux, under development on Windows.
 */
// Dispatch thread_rank() to the concrete group type recorded in _type.
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->thread_rank());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->thread_rank());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->thread_rank());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->thread_rank());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->thread_rank());
    }
    default: {
      // Unknown tag: assert in debug builds, then return an invalid rank.
      __hip_assert(false && "invalid cooperative group type")
      return -1;
    }
  }
}
|
|
/**
 * Implementation of all publicly exposed thread group API
 * @note This function is implemented on Linux, under development on Windows.
 */
// Dispatch is_valid() to the concrete group type recorded in _type.
__CG_QUALIFIER__ bool thread_group::is_valid() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      return (static_cast<const multi_grid_group*>(this)->is_valid());
    }
    case internal::cg_grid: {
      return (static_cast<const grid_group*>(this)->is_valid());
    }
    case internal::cg_workgroup: {
      return (static_cast<const thread_block*>(this)->is_valid());
    }
    case internal::cg_tiled_group: {
      return (static_cast<const tiled_group*>(this)->is_valid());
    }
    case internal::cg_coalesced_group: {
      return (static_cast<const coalesced_group*>(this)->is_valid());
    }
    default: {
      // Unknown tag: assert in debug builds, then report invalid.
      __hip_assert(false && "invalid cooperative group type")
      return false;
    }
  }
}
|
|
/**
 * Implementation of all publicly exposed thread group sync API
 * @note This function is implemented on Linux, under development on Windows.
 */
// Dispatch sync() to the concrete group type recorded in _type.
__CG_QUALIFIER__ void thread_group::sync() const {
  switch (this->_type) {
    case internal::cg_multi_grid: {
      static_cast<const multi_grid_group*>(this)->sync();
      break;
    }
    case internal::cg_grid: {
      static_cast<const grid_group*>(this)->sync();
      break;
    }
    case internal::cg_workgroup: {
      static_cast<const thread_block*>(this)->sync();
      break;
    }
    case internal::cg_tiled_group: {
      static_cast<const tiled_group*>(this)->sync();
      break;
    }
    case internal::cg_coalesced_group: {
      static_cast<const coalesced_group*>(this)->sync();
      break;
    }
    default: {
      // Unknown tag: assert in debug builds; no-op otherwise.
      __hip_assert(false && "invalid cooperative group type")
    }
  }
}
|
|
|
|
/**
|
|
* Implemenation of publicly exposed `wrapper` API on top of basic cooperative
|
|
* group type APIs
|
|
* @note This function is implemented on Linux, under developement
|
|
* on Windows.
|
|
*/
|
|
template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }
|
|
/**
|
|
* Implemenation of publicly exposed `wrapper` API on top of basic cooperative
|
|
* group type APIs
|
|
* @note This function is implemented on Linux, under developement
|
|
* on Windows.
|
|
*/
|
|
template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
|
|
return g.thread_rank();
|
|
}
|
|
/**
|
|
* Implemenation of publicly exposed `wrapper` API on top of basic cooperative
|
|
* group type APIs
|
|
* @note This function is implemented on Linux, under developement
|
|
* on Windows.
|
|
*/
|
|
template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
|
|
/**
|
|
* Implemenation of publicly exposed `wrapper` API on top of basic cooperative
|
|
* group type APIs
|
|
* @note This function is implemented on Linux, under developement
|
|
* on Windows.
|
|
*/
|
|
template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
|
|
/**
 * template class tile_base
 * @note This class is implemented on Linux, under development on Windows.
 */
// Compile-time tile descriptor: fixes the thread count and derives ranks.
template <unsigned int tileSize> class tile_base {
 protected:
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

 public:
  // Rank of the thread within this tile
  _CG_STATIC_CONST_DECL_ unsigned int thread_rank() {
    return (internal::workgroup::thread_rank() & (numThreads - 1));
  }

  // Number of threads within this tile
  __CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
};
|
|
/**
 * template class thread_block_tile_base
 * @note This class is implemented on Linux, under development on Windows.
 */
// Adds tile-scoped sync and shuffle intrinsics on top of tile_base<size>.
template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
  static_assert(is_valid_tile_size<size>::value,
                "Tile size is either not a power of 2 or greater than the wavefront size");
  using tile_base<size>::numThreads;

 public:
  // Synchronize the threads of this tile.
  __CG_STATIC_QUALIFIER__ void sync() {
    internal::tiled_group::sync();
  }

  // Broadcast `var` from tile lane `srcRank` to the caller.
  template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl(var, srcRank, numThreads));
  }

  // Read `var` from the lane `lane_delta` positions after the caller.
  template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_down(var, lane_delta, numThreads));
  }

  // Read `var` from the lane `lane_delta` positions before the caller.
  template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_up(var, lane_delta, numThreads));
  }

  // Exchange `var` with the lane whose id differs by `laneMask` (XOR pattern).
  template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
    return (__shfl_xor(var, laneMask, numThreads));
  }
};
|
|
/** \brief User exposed API that captures the state of the parent group pre-partition
 */
template <unsigned int tileSize, typename ParentCGTy>
class parent_group_info {
 public:
  // Returns the linear rank of the group within the set of tiles partitioned
  // from a parent group (bounded by meta_group_size)
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
    return ParentCGTy::thread_rank() / tileSize;
  }

  // Returns the number of groups created when the parent group was partitioned.
  __CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
    return (ParentCGTy::size() + tileSize - 1) / tileSize;
  }
};
|
|
|
|
/** \brief Group type - thread_block_tile
 *
 * \details Represents one tile of thread group.
 * @note This type is implemented on Linux, under development on Windows.
 */
template <unsigned int tileSize, class ParentCGTy>
class thread_block_tile_type : public thread_block_tile_base<tileSize>,
                               public tiled_group,
                               public parent_group_info<tileSize, ParentCGTy> {
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
 protected:
  // Construct the tile and record its size in the shared tiled_info state.
  __CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
  }
};
|
|
|
|
// Partial template specialization for tiles whose parent group type has been
// erased (ParentCGTy = void); meta-group info is carried at runtime instead
// of being derived from the parent type.
template <unsigned int tileSize>
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
                                               public tiled_group
{
  _CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;

  typedef thread_block_tile_base<numThreads> tbtBase;

 protected:

  // Construct the tile, storing the partition bookkeeping explicitly.
  __CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
      : tiled_group(numThreads) {
    coalesced_info.tiled_info.size = numThreads;
    coalesced_info.tiled_info.is_tiled = true;
    coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
    coalesced_info.tiled_info.meta_group_size = meta_group_size;
  }

 public:
  using tbtBase::size;
  using tbtBase::sync;
  using tbtBase::thread_rank;

  // Rank of this tile among the tiles partitioned from the parent group.
  __CG_QUALIFIER__ unsigned int meta_group_rank() const {
    return coalesced_info.tiled_info.meta_group_rank;
  }

  // Number of tiles partitioned from the parent group.
  __CG_QUALIFIER__ unsigned int meta_group_size() const {
    return coalesced_info.tiled_info.meta_group_size;
  }
  // end of operative group
  /**
   * @}
   */
};
|
|
|
|
|
|
/** \brief User exposed API to partition groups.
|
|
*
|
|
* \details A collective operation that partitions the parent group into a one-dimensional,
|
|
* row-major, tiling of subgroups.
|
|
*/
|
|
|
|
__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
|
|
if (parent.cg_type() == internal::cg_tiled_group) {
|
|
const tiled_group* cg = static_cast<const tiled_group*>(&parent);
|
|
return cg->new_tiled_group(tile_size);
|
|
}
|
|
else if(parent.cg_type() == internal::cg_coalesced_group) {
|
|
const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
|
|
return cg->new_tiled_group(tile_size);
|
|
}
|
|
else {
|
|
const thread_block* tb = static_cast<const thread_block*>(&parent);
|
|
return tb->new_tiled_group(tile_size);
|
|
}
|
|
}
|
|
|
|
// Thread block type overload
|
|
__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
|
|
return (parent.new_tiled_group(tile_size));
|
|
}
|
|
|
|
// Re-partition an existing tile into smaller tiles.
__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
  tiled_group tile = parent.new_tiled_group(tile_size);
  return tile;
}
|
|
|
|
// If a coalesced group is passed to be partitioned, it should remain coalesced
|
|
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
|
|
return (parent.new_tiled_group(tile_size));
|
|
}
|
|
|
|
template <unsigned int size, class ParentCGTy> class thread_block_tile;

namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;

// Internal bridge between the user-facing thread_block_tile and the
// thread_block_tile_type machinery.
template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
 protected:
  // Copy-construct from a tile of any (size, parent) pair, carrying over the
  // runtime meta-group bookkeeping.
  template <unsigned int tbtSize, class tbtParentT>
  __CG_QUALIFIER__ thread_block_tile_internal(
      const thread_block_tile_internal<tbtSize, tbtParentT>& g)
      : thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}

  // Construct directly from the thread block being partitioned.
  __CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
      : thread_block_tile_type<size, ParentCGTy>() {}
};
}  // namespace impl
|
|
|
|
// User-facing tile type parameterized by tile size and parent group type.
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
 protected:
  // Construct from the parent group being partitioned.
  __CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
      : impl::thread_block_tile_internal<size, ParentCGTy>(g) {}

 public:
  // Implicit conversion to the parent-erased tile type.
  __CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
    return thread_block_tile<size, void>(*this);
  }
};
|
|
|
|
|
|
// Specialization with the parent group type erased; constructible from a
// tile of the same size with any parent type.
template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
  template <unsigned int, class ParentCGTy> friend class thread_block_tile;

 protected:
 public:
  // Copy the runtime tile state from a parent-typed tile.
  template <class ParentCGTy>
  __CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
      : impl::thread_block_tile_internal<size, void>(g) {}
};
|
|
|
|
// Redeclaration supplying the default parent type (void) for thread_block_tile.
template <unsigned int size, class ParentCGTy = void> class thread_block_tile;

namespace impl {
// Maps a (size, parent) pair to the concrete tile constructed by tiled_partition.
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;

// thread_block parents: build the tile directly from the block.
template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
  __CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
      : thread_block_tile<size, thread_block>(g) {}
};

}  // namespace impl
|
|
|
|
/** \brief User exposed API to partition groups.
 *
 * \details This constructs a templated class derived from thread_group.
 *          The template defines the tile size of the new thread group at
 *          compile time.
 */
template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
  // Tile size is validated at compile time.
  static_assert(is_valid_tile_size<size>::value,
                "Tiled partition with size > wavefront size. Currently not supported ");
  return impl::tiled_partition_internal<size, ParentCGTy>(g);
}
|
|
} // namespace cooperative_groups
|
|
|
|
#endif // __cplusplus
|
|
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
|
/*
|
|
Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#ifdef __cplusplus
|
|
|
|
/**
 * @brief Unsafe floating point rmw atomic add.
 *
 * Performs a relaxed read-modify-write floating point atomic add with
 * device memory scope. Original value at \p addr is returned and
 * the value of \p addr is updated to have the original value plus \p value
 *
 * @note This operation currently only performs different operations for
 * the gfx90a target. Other devices continue to use safe atomics.
 *
 * It can be used to generate code that uses fast hardware floating point atomic
 * operations which may handle rounding and subnormal values differently than
 * non-atomic floating point operations.
 *
 * The operation is not always safe and can have undefined behavior unless
 * following condition are met:
 *
 * - \p addr is at least 4 bytes aligned
 * - If \p addr is a global segment address, it is in a coarse grain allocation.
 * Passing in global segment addresses in fine grain allocations will result in
 * undefined behavior and is not supported.
 *
 * @param [in,out] addr Pointer to value to be increment by \p value.
 * @param [in] value Value by \p addr is to be incremented.
 * @return Original value contained in \p addr.
 */
__device__ inline float unsafeAtomicAdd(float* addr, float value) {
#if defined(__gfx90a__) && \
    __has_builtin(__builtin_amdgcn_is_shared) && \
    __has_builtin(__builtin_amdgcn_is_private) && \
    __has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \
    __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
  // Shared (LDS) address: use the DS atomic-fadd builtin.
  if (__builtin_amdgcn_is_shared(
          (const __attribute__((address_space(0))) void*)addr))
    return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
  // Private (scratch) address: a plain read-modify-write suffices since the
  // address is only visible to the calling thread.
  else if (__builtin_amdgcn_is_private(
               (const __attribute__((address_space(0))) void*)addr)) {
    float temp = *addr;
    *addr = temp + value;
    return temp;
  }
  // Otherwise treat as global: hardware global atomic fadd (caller must
  // guarantee a coarse-grain allocation, see note above).
  else
    return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  // Non-gfx90a path: relaxed, agent-scoped atomic add.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
|
|
|
|
/**
|
|
* @brief Unsafe floating point rmw atomic max.
|
|
*
|
|
* Performs a relaxed read-modify-write floating point atomic max with
|
|
* device memory scope. The original value at \p addr is returned and
|
|
* the value at \p addr is replaced by \p val if greater.
|
|
*
|
|
* @note This operation is currently identical to that performed by
|
|
* atomicMax and is included for completeness.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be updated
|
|
* @param [in] val Value used to update the value at \p addr.
|
|
* @return Original value contained in \p addr.
|
|
*/
|
|
// Relaxed agent-scope atomic max for float. Returns the value previously
// stored at addr; stores val only while it is strictly greater.
__device__ inline float unsafeAtomicMax(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // On CAS failure `observed` is refreshed with the current contents, so the
  // loop re-checks the ordering condition before retrying.
  for (bool swapped = false; !swapped && observed < val;) {
    swapped = __hip_atomic_compare_exchange_strong(
        addr, &observed, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
        __HIP_MEMORY_SCOPE_AGENT);
  }
  return observed;
#else
  // Fallback: run the same CAS loop on the raw 32-bit pattern with GCC
  // atomics, converting for the float comparison.
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int observed = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  for (bool swapped = false; !swapped && __uint_as_float(observed) < val;) {
    swapped = __atomic_compare_exchange_n(uaddr, &observed, __float_as_uint(val),
                                          false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(observed);
#endif
}
|
|
|
|
/**
|
|
* @brief Unsafe floating point rmw atomic min.
|
|
*
|
|
* Performs a relaxed read-modify-write floating point atomic min with
|
|
* device memory scope. The original value at \p addr is returned and
|
|
* the value at \p addr is replaced by \p val if lesser.
|
|
*
|
|
* @note This operation is currently identical to that performed by
|
|
* atomicMin and is included for completeness.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be updated
|
|
* @param [in] val Value used to update the value at \p addr.
|
|
* @return Original value contained in \p addr.
|
|
*/
|
|
// Relaxed agent-scope atomic min for float. Returns the value previously
// stored at addr; stores val only while it is strictly smaller.
__device__ inline float unsafeAtomicMin(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // On CAS failure `observed` is refreshed with the current contents, so the
  // loop re-checks the ordering condition before retrying.
  for (bool swapped = false; !swapped && observed > val;) {
    swapped = __hip_atomic_compare_exchange_strong(
        addr, &observed, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
        __HIP_MEMORY_SCOPE_AGENT);
  }
  return observed;
#else
  // Fallback: run the same CAS loop on the raw 32-bit pattern with GCC
  // atomics, converting for the float comparison.
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int observed = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  for (bool swapped = false; !swapped && __uint_as_float(observed) > val;) {
    swapped = __atomic_compare_exchange_n(uaddr, &observed, __float_as_uint(val),
                                          false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(observed);
#endif
}
|
|
|
|
/**
|
|
* @brief Unsafe double precision rmw atomic add.
|
|
*
|
|
* Performs a relaxed read-modify-write double precision atomic add with
|
|
* device memory scope. Original value at \p addr is returned and
|
|
* the value of \p addr is updated to have the original value plus \p value
|
|
*
|
|
* @note This operation currently only performs different operations for
|
|
* the gfx90a target. Other devices continue to use safe atomics.
|
|
*
|
|
* It can be used to generate code that uses fast hardware floating point atomic
|
|
* operations which may handle rounding and subnormal values differently than
|
|
* non-atomic floating point operations.
|
|
*
|
|
* The operation is not always safe and can have undefined behavior unless
|
|
* following condition are met:
|
|
*
|
|
* - \p addr is at least 8 byte aligned
|
|
* - If \p addr is a global segment address, it is in a coarse grain allocation.
|
|
* Passing in global segment addresses in fine grain allocations will result in
|
|
* undefined behavior and are not supported.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be updated.
|
|
* @param [in] value Value by \p addr is to be incremented.
|
|
* @return Original value contained in \p addr.
|
|
*/
|
|
// Unsafe double-precision relaxed atomic add; returns the value previously
// stored at addr. See the Doxygen block above for the alignment and
// coarse-grain-allocation preconditions.
__device__ inline double unsafeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
  // gfx90a: hardware FP64 flat atomic add (undefined on fine-grained memory).
  return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
  // FIX: this previously tested `defined(__hip_atomic_fetch_add)`.
  // __hip_atomic_fetch_add is a Clang builtin, not a preprocessor macro, so
  // `defined` was always false and non-gfx90a targets silently fell through
  // to the scope-less __atomic_fetch_add below. Test availability with
  // __has_builtin, exactly as the float overload of unsafeAtomicAdd does.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  // Last resort: GCC-style atomic add with no explicit memory-scope control.
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
|
|
|
|
/**
|
|
* @brief Unsafe double precision rmw atomic max.
|
|
*
|
|
* Performs a relaxed read-modify-write double precision atomic max with
|
|
* device memory scope. Original value at \p addr is returned and
|
|
* the value of \p addr is updated with \p val if greater.
|
|
*
|
|
* @note This operation currently only performs different operations for
|
|
* the gfx90a target. Other devices continue to use safe atomics.
|
|
*
|
|
* It can be used to generate code that uses fast hardware floating point atomic
|
|
* operations which may handle rounding and subnormal values differently than
|
|
* non-atomic floating point operations.
|
|
*
|
|
* The operation is not always safe and can have undefined behavior unless
|
|
* following condition are met:
|
|
*
|
|
* - \p addr is at least 8 byte aligned
|
|
* - If \p addr is a global segment address, it is in a coarse grain allocation.
|
|
* Passing in global segment addresses in fine grain allocations will result in
|
|
* undefined behavior and are not supported.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be updated.
|
|
* @param [in] val Value used to updated the contents at \p addr
|
|
* @return Original value contained at \p addr.
|
|
*/
|
|
__device__ inline double unsafeAtomicMax(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
  // Targets with a hardware FP64 flat atomic max instruction use it directly.
  return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  // CAS-loop fallback: exchange only while the stored value is still smaller
  // than val; a failed compare-exchange refreshes `value` with the current
  // contents before the condition is re-evaluated.
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
#else
  // Last resort: identical CAS loop on the raw 64-bit pattern using GCC
  // atomics (no explicit memory-scope control).
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
#endif
#endif
}
|
|
|
|
/**
|
|
* @brief Unsafe double precision rmw atomic min.
|
|
*
|
|
* Performs a relaxed read-modify-write double precision atomic min with
|
|
* device memory scope. Original value at \p addr is returned and
|
|
* the value of \p addr is updated with \p val if lesser.
|
|
*
|
|
* @note This operation currently only performs different operations for
|
|
* the gfx90a target. Other devices continue to use safe atomics.
|
|
*
|
|
* It can be used to generate code that uses fast hardware floating point atomic
|
|
* operations which may handle rounding and subnormal values differently than
|
|
* non-atomic floating point operations.
|
|
*
|
|
* The operation is not always safe and can have undefined behavior unless
|
|
* following condition are met:
|
|
*
|
|
* - \p addr is at least 8 byte aligned
|
|
* - If \p addr is a global segment address, it is in a coarse grain allocation.
|
|
* Passing in global segment addresses in fine grain allocations will result in
|
|
* undefined behavior and are not supported.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be updated.
|
|
* @param [in] val Value used to updated the contents at \p addr
|
|
* @return Original value contained at \p addr.
|
|
*/
|
|
__device__ inline double unsafeAtomicMin(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
    __has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
  // Targets with a hardware FP64 flat atomic min instruction use it directly.
  return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  // CAS-loop fallback: exchange only while the stored value is still greater
  // than val; a failed compare-exchange refreshes `value` with the current
  // contents before the condition is re-evaluated.
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
#else
  // Last resort: identical CAS loop on the raw 64-bit pattern using GCC
  // atomics (no explicit memory-scope control).
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
#endif
#endif
}
|
|
|
|
/**
|
|
* @brief Safe floating point rmw atomic add.
|
|
*
|
|
* Performs a relaxed read-modify-write floating point atomic add with
|
|
* device memory scope. Original value at \p addr is returned and
|
|
* the value of \p addr is updated to have the original value plus \p value
|
|
*
|
|
* @note This operation ensures that, on all targets, we produce safe atomics.
|
|
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be increment by \p value.
|
|
* @param [in] value Value by \p addr is to be incremented.
|
|
* @return Original value contained in \p addr.
|
|
*/
|
|
__device__ inline float safeAtomicAdd(float* addr, float value) {
#if defined(__gfx908__) || defined(__gfx941__) \
    || ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \
        && !__has_builtin(__hip_atomic_fetch_add))
  // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
  // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
  // On gfx941, we can generate unsafe FP32 atomic add that may not always happen atomically,
  // so we need to force a CAS loop emulation to ensure safety.
  // On gfx90a, gfx940 and gfx942 if we do not have the __hip_atomic_fetch_add builtin, we
  // need to force a CAS loop here.
  float old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  float expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    // On CAS failure `expected` holds the freshly observed value; retry with it.
    old_val = expected;
    // Compare bit patterns rather than float values so the loop terminates
    // even when the stored value is NaN (NaN != NaN would spin forever).
  } while (__float_as_uint(temp) != __float_as_uint(old_val));
  return old_val;
#elif defined(__gfx90a__)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif __has_builtin(__hip_atomic_fetch_add)
  // Other targets: agent-scope atomic add is already safe.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
  // Last resort: GCC-style atomic add (no explicit memory-scope control).
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
|
|
|
|
/**
|
|
* @brief Safe floating point rmw atomic max.
|
|
*
|
|
* Performs a relaxed read-modify-write floating point atomic max with
|
|
* device memory scope. The original value at \p addr is returned and
|
|
* the value at \p addr is replaced by \p val if greater.
|
|
*
|
|
* @note This operation ensures that, on all targets, we produce safe atomics.
|
|
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be updated
|
|
* @param [in] val Value used to update the value at \p addr.
|
|
* @return Original value contained in \p addr.
|
|
*/
|
|
// IEEE-safe relaxed agent-scope atomic max for float; returns the old value.
// Implemented as a CAS loop, so it stays safe even under -munsafe-fp-atomics.
__device__ inline float safeAtomicMax(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // Retry only while the stored value is still smaller; a failed CAS refreshes
  // `observed` with the current contents.
  for (bool swapped = false; !swapped && observed < val;) {
    swapped = __hip_atomic_compare_exchange_strong(
        addr, &observed, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
        __HIP_MEMORY_SCOPE_AGENT);
  }
  return observed;
#else
  // Fallback: same CAS loop on the raw 32-bit pattern with GCC atomics.
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int observed = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  for (bool swapped = false; !swapped && __uint_as_float(observed) < val;) {
    swapped = __atomic_compare_exchange_n(uaddr, &observed, __float_as_uint(val),
                                          false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(observed);
#endif
}
|
|
|
|
/**
|
|
* @brief Safe floating point rmw atomic min.
|
|
*
|
|
* Performs a relaxed read-modify-write floating point atomic min with
|
|
* device memory scope. The original value at \p addr is returned and
|
|
* the value at \p addr is replaced by \p val if lesser.
|
|
*
|
|
* @note This operation ensures that, on all targets, we produce safe atomics.
|
|
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be updated
|
|
* @param [in] val Value used to update the value at \p addr.
|
|
* @return Original value contained in \p addr.
|
|
*/
|
|
// IEEE-safe relaxed agent-scope atomic min for float; returns the old value.
// Implemented as a CAS loop, so it stays safe even under -munsafe-fp-atomics.
__device__ inline float safeAtomicMin(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  float observed = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // Retry only while the stored value is still greater; a failed CAS refreshes
  // `observed` with the current contents.
  for (bool swapped = false; !swapped && observed > val;) {
    swapped = __hip_atomic_compare_exchange_strong(
        addr, &observed, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
        __HIP_MEMORY_SCOPE_AGENT);
  }
  return observed;
#else
  // Fallback: same CAS loop on the raw 32-bit pattern with GCC atomics.
  unsigned int *uaddr = (unsigned int *)addr;
  unsigned int observed = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  for (bool swapped = false; !swapped && __uint_as_float(observed) > val;) {
    swapped = __atomic_compare_exchange_n(uaddr, &observed, __float_as_uint(val),
                                          false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __uint_as_float(observed);
#endif
}
|
|
|
|
/**
|
|
* @brief Safe double precision rmw atomic add.
|
|
*
|
|
* Performs a relaxed read-modify-write double precision atomic add with
|
|
* device memory scope. Original value at \p addr is returned and
|
|
* the value of \p addr is updated to have the original value plus \p value
|
|
*
|
|
* @note This operation ensures that, on all targets, we produce safe atomics.
|
|
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be increment by \p value.
|
|
* @param [in] value Value by \p addr is to be incremented.
|
|
* @return Original value contained in \p addr.
|
|
*/
|
|
__device__ inline double safeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
  // On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
  // atomics will produce safe CAS loops, but are otherwise not different than
  // agent-scope atomics. This logic is only applicable for gfx90a, and should
  // not be assumed on other architectures.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif defined(__gfx90a__)
  // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
  // force a CAS loop here.
  double old_val;
#if __has_builtin(__hip_atomic_load)
  old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
  old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
  double expected, temp;
  do {
    temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
    __hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
                                         __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
    __atomic_compare_exchange_n(addr, &expected, old_val + value, false,
                                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
    // On CAS failure `expected` holds the freshly observed value; retry with it.
    old_val = expected;
    // Compare bit patterns rather than double values so the loop terminates
    // even when the stored value is NaN (NaN != NaN would spin forever).
  } while (__double_as_longlong(temp) != __double_as_longlong(old_val));
  return old_val;
#else // !defined(__gfx90a__)
#if __has_builtin(__hip_atomic_fetch_add)
  // Other targets: agent-scope atomic add is already safe.
  return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_fetch_add)
  // Last resort: GCC-style atomic add (no explicit memory-scope control).
  return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_fetch_add)
#endif
}
|
|
|
|
/**
|
|
* @brief Safe double precision rmw atomic max.
|
|
*
|
|
* Performs a relaxed read-modify-write double precision atomic max with
|
|
* device memory scope. Original value at \p addr is returned and
|
|
* the value of \p addr is updated with \p val if greater.
|
|
*
|
|
* @note This operation ensures that, on all targets, we produce safe atomics.
|
|
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be updated.
|
|
* @param [in] val Value used to updated the contents at \p addr
|
|
* @return Original value contained at \p addr.
|
|
*/
|
|
__device__ inline double safeAtomicMax(double* addr, double val) {
#if __has_builtin(__builtin_amdgcn_is_private)
  // Private (scratch) memory is thread-local: a plain read-modify-write is
  // sufficient there, and avoids issuing an atomic on the private segment.
  if (__builtin_amdgcn_is_private(
          (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmax(old, val);
    return old;
  } else {
#endif
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  // CAS loop: exchange only while the stored value is still smaller than val;
  // a failed compare-exchange refreshes `value` with the current contents.
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value < val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
#else
  // Fallback: same CAS loop on the raw 64-bit pattern with GCC atomics.
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) < val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
#endif
  // Closes the `else {` opened above; the brace pairing intentionally spans
  // the preprocessor conditional.
#if __has_builtin(__builtin_amdgcn_is_private)
  }
#endif
}
|
|
|
|
/**
|
|
* @brief Safe double precision rmw atomic min.
|
|
*
|
|
* Performs a relaxed read-modify-write double precision atomic min with
|
|
* device memory scope. Original value at \p addr is returned and
|
|
* the value of \p addr is updated with \p val if lesser.
|
|
*
|
|
* @note This operation ensures that, on all targets, we produce safe atomics.
|
|
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
|
|
*
|
|
* @param [in,out] addr Pointer to value to be updated.
|
|
* @param [in] val Value used to updated the contents at \p addr
|
|
* @return Original value contained at \p addr.
|
|
*/
|
|
__device__ inline double safeAtomicMin(double* addr, double val) {
#if __has_builtin(__builtin_amdgcn_is_private)
  // Private (scratch) memory is thread-local: a plain read-modify-write is
  // sufficient there, and avoids issuing an atomic on the private segment.
  if (__builtin_amdgcn_is_private(
          (const __attribute__((address_space(0))) void*)addr)) {
    double old = *addr;
    *addr = __builtin_fmin(old, val);
    return old;
  } else {
#endif
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
  // CAS loop: exchange only while the stored value is still greater than val;
  // a failed compare-exchange refreshes `value` with the current contents.
  double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  bool done = false;
  while (!done && value > val) {
    done = __hip_atomic_compare_exchange_strong(addr, &value, val,
               __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
  return value;
#else
  // Fallback: same CAS loop on the raw 64-bit pattern with GCC atomics.
  unsigned long long *uaddr = (unsigned long long *)addr;
  unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
  bool done = false;
  while (!done && __longlong_as_double(value) > val) {
    done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
                                       __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  }
  return __longlong_as_double(value);
#endif
  // Closes the `else {` opened above; the brace pairing intentionally spans
  // the preprocessor conditional.
#if __has_builtin(__builtin_amdgcn_is_private)
  }
#endif
}
|
|
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - Present Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "amd_device_functions.h"
|
|
#endif
|
|
|
|
#if __has_builtin(__hip_atomic_compare_exchange_strong)
|
|
|
|
// Minimal compile-time conditional (std::conditional is not available in this
// freestanding header): Cond_t<B, T, F>::type is T when B is true, F otherwise.
template<bool B, typename T, typename F> struct Cond_t;

template<typename T, typename F> struct Cond_t<true, T, F> { using type = T; };

template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };
|
|
|
|
#if !__HIP_DEVICE_COMPILE__
|
|
//TODO: Remove this after compiler pre-defines the following Macros.
|
|
#define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
|
|
#define __HIP_MEMORY_SCOPE_WAVEFRONT 2
|
|
#define __HIP_MEMORY_SCOPE_WORKGROUP 3
|
|
#define __HIP_MEMORY_SCOPE_AGENT 4
|
|
#define __HIP_MEMORY_SCOPE_SYSTEM 5
|
|
#endif
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "amd_hip_unsafe_atomics.h"
|
|
#endif
|
|
|
|
// Atomic expanders
|
|
// Generic CAS-loop expander used to build read-modify-write atomics that have
// no native instruction. Repeatedly applies `op` to a copy of the current
// value and attempts a compare-exchange until it succeeds; returns the value
// observed before the successful update. `f` is invoked instead when `p`
// points into LDS (shared memory).
template<
    int mem_order = __ATOMIC_SEQ_CST,
    int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
    typename T,
    typename Op,
    typename F>
inline
__attribute__((always_inline, device))
T hip_cas_expander(T* p, T x, Op op, F f) noexcept
{
    using FP = __attribute__((address_space(0))) const void*;

    // Declares the LLVM intrinsic directly by its mangled name; workaround
    // for contexts where __builtin_amdgcn_is_shared is not usable here.
    __device__
    extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");

    if (is_shared_workaround((FP)p))
        return f();

    // Reinterpret T as the same-sized unsigned integer so the CAS compares
    // raw bit patterns (avoids FP comparison pitfalls such as NaN != NaN).
    using U = typename Cond_t<
        sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;

    auto q = reinterpret_cast<U*>(p);

    U tmp0{__hip_atomic_load(q, mem_order, mem_scope)};
    U tmp1;
    do {
        tmp1 = tmp0;

        // Mutate the candidate in place through a T-typed reference.
        op(reinterpret_cast<T&>(tmp1), x);
    } while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order,
                                                   mem_order, mem_scope));

    // tmp0 holds the bit pattern observed by the successful exchange.
    return reinterpret_cast<const T&>(tmp0);
}
|
|
|
|
// CAS-loop expander specialized for extrema (min/max) atomics: stores x only
// while `cmp(x, current)` holds, and returns the last value observed at *p.
// `f` is invoked instead when `p` points into LDS (shared memory).
template<
    int mem_order = __ATOMIC_SEQ_CST,
    int mem_scope= __HIP_MEMORY_SCOPE_SYSTEM,
    typename T,
    typename Cmp,
    typename F>
inline
__attribute__((always_inline, device))
T hip_cas_extrema_expander(T* p, T x, Cmp cmp, F f) noexcept
{
    using FP = __attribute__((address_space(0))) const void*;

    // Declares the LLVM intrinsic directly by its mangled name; workaround
    // for contexts where __builtin_amdgcn_is_shared is not usable here.
    __device__
    extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");

    if (is_shared_workaround((FP)p))
        return f();

    // Reinterpret T as the same-sized unsigned integer so the CAS compares
    // raw bit patterns; `cmp` is still evaluated on T-typed values.
    using U = typename Cond_t<
        sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;

    auto q = reinterpret_cast<U*>(p);

    // Loop while x would still replace the current value; a failed exchange
    // refreshes tmp with the latest contents before cmp is re-evaluated.
    U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
    while (cmp(x, reinterpret_cast<const T&>(tmp)) &&
           !__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order,
                                                 mem_scope));

    return reinterpret_cast<const T&>(tmp);
}
|
|
|
|
// CUDA-style atomicCAS overloads. Each performs a relaxed compare-and-swap:
// *address is replaced with val only if it currently equals compare. The
// value read from *address (the old value) is returned in every case, so
// success is indicated by a return equal to compare. Plain overloads act at
// agent (device) scope; *_system overloads act at system scope.

__device__ inline int atomicCAS(int* address, int compare, int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__ inline int atomicCAS_system(int* address, int compare, int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__ inline unsigned int atomicCAS(unsigned int* address, unsigned int compare,
                                         unsigned int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__ inline unsigned int atomicCAS_system(unsigned int* address, unsigned int compare,
                                                unsigned int val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__ inline unsigned long atomicCAS(unsigned long* address, unsigned long compare,
                                          unsigned long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__ inline unsigned long atomicCAS_system(unsigned long* address, unsigned long compare,
                                                 unsigned long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__ inline unsigned long long atomicCAS(unsigned long long* address,
                                               unsigned long long compare,
                                               unsigned long long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__ inline unsigned long long atomicCAS_system(unsigned long long* address,
                                                      unsigned long long compare,
                                                      unsigned long long val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__ inline float atomicCAS(float* address, float compare, float val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__ inline float atomicCAS_system(float* address, float compare, float val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}

__device__ inline double atomicCAS(double* address, double compare, double val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return compare;
}

__device__ inline double atomicCAS_system(double* address, double compare, double val) {
  __hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED,
                                       __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return compare;
}
|
|
|
|
// Integer atomicAdd overloads: relaxed fetch-and-add returning the old value.
// Plain overloads act at agent (device) scope; *_system at system scope.

__device__ inline int atomicAdd(int* address, int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__ inline int atomicAdd_system(int* address, int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__ inline unsigned int atomicAdd(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__ inline unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__ inline unsigned long atomicAdd(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__ inline unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}

__device__ inline unsigned long long atomicAdd(unsigned long long* address,
                                               unsigned long long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}

__device__ inline unsigned long long atomicAdd_system(unsigned long long* address,
                                                      unsigned long long val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
|
|
|
|
// float atomicAdd: relaxed fetch-and-add returning the old value. The agent
// scope overload honors -munsafe-fp-atomics (which defines
// __AMDGCN_UNSAFE_FP_ATOMICS__) by routing to unsafeAtomicAdd.
__device__ inline float atomicAdd(float* address, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  return unsafeAtomicAdd(address, val);
#else
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}

// System-scope variant; always uses the safe HIP builtin.
__device__ inline float atomicAdd_system(float* address, float val) {
  return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
|
|
|
|
#if !defined(__HIPCC_RTC__)
DEPRECATED("use atomicAdd instead")
#endif // !defined(__HIPCC_RTC__)
// Fire-and-forget float atomic add: the previous value is not returned,
// allowing the no-return OCKL primitive to be used. Deprecated in favor of
// atomicAdd.
__device__
inline
void atomicAddNoRet(float* address, float val)
{
    __ockl_atomic_add_noret_f32(address, val);
}
|
|
|
|
// double atomicAdd: agent-scope overload plus a system-scope variant.
//
// Returns the value at `address` before the addition.

__device__ inline double atomicAdd(double* address, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  // Build opted into unsafe FP atomics: use the fast hardware helper
  // (presumably relaxes strict FP semantics — see unsafeAtomicAdd).
  return unsafeAtomicAdd(address, val);
#else
  double previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif
}

__device__ inline double atomicAdd_system(double* address, double val) {
  double previous =
      __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}
|
|
|
|
// Integer atomicSub overloads.
//
// Subtraction is implemented as an atomic fetch-add of the negated operand;
// for the unsigned types the negation wraps modulo 2^N, which is well-defined
// and yields the same result as a true subtraction. Each function returns the
// value stored at `address` before the subtraction. *_system variants use
// system-wide scope instead of agent (device) scope.

__device__ inline int atomicSub(int* address, int val) {
  int previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}

__device__ inline int atomicSub_system(int* address, int val) {
  int previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

__device__ inline unsigned int atomicSub(unsigned int* address, unsigned int val) {
  unsigned int previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}

__device__ inline unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
  unsigned int previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

__device__ inline unsigned long atomicSub(unsigned long* address, unsigned long val) {
  unsigned long previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}

__device__ inline unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
  unsigned long previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

__device__ inline unsigned long long atomicSub(unsigned long long* address,
                                               unsigned long long val) {
  unsigned long long previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}

__device__ inline unsigned long long atomicSub_system(unsigned long long* address,
                                                      unsigned long long val) {
  unsigned long long previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}
|
|
|
|
// Floating-point atomicSub overloads, expressed as an atomic add of -val.
// Each returns the value stored at `address` before the subtraction.

__device__ inline float atomicSub(float* address, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  // Unsafe-FP-atomics build: fast hardware add of the negated operand.
  return unsafeAtomicAdd(address, -val);
#else
  float previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif
}

__device__ inline float atomicSub_system(float* address, float val) {
  float previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

__device__ inline double atomicSub(double* address, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  // Unsafe-FP-atomics build: fast hardware add of the negated operand.
  return unsafeAtomicAdd(address, -val);
#else
  double previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
#endif
}

__device__ inline double atomicSub_system(double* address, double val) {
  double previous =
      __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}
|
|
|
|
// atomicExch overloads: atomically store `val` at `address` and return the
// value that was there before. Relaxed ordering throughout; *_system variants
// widen the scope from the device (agent) to the whole system.

__device__ inline int atomicExch(int* address, int val) {
  int previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}

__device__ inline int atomicExch_system(int* address, int val) {
  int previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

__device__ inline unsigned int atomicExch(unsigned int* address, unsigned int val) {
  unsigned int previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}

__device__ inline unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
  unsigned int previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

__device__ inline unsigned long atomicExch(unsigned long* address, unsigned long val) {
  unsigned long previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}

__device__ inline unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
  unsigned long previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

__device__ inline unsigned long long atomicExch(unsigned long long* address,
                                                unsigned long long val) {
  unsigned long long previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}

__device__ inline unsigned long long atomicExch_system(unsigned long long* address,
                                                       unsigned long long val) {
  unsigned long long previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

__device__ inline float atomicExch(float* address, float val) {
  float previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}

__device__ inline float atomicExch_system(float* address, float val) {
  float previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}

__device__ inline double atomicExch(double* address, double val) {
  double previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  return previous;
}

__device__ inline double atomicExch_system(double* address, double val) {
  double previous =
      __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
  return previous;
}
|
|
|
|
// Atomic minimum, int / unsigned int overloads (agent and system scope).
// Each returns the value stored at `address` before the operation.
//
// On gfx941 the native fetch_min builtin is not issued directly; instead the
// call is routed through hip_cas_extrema_expander, which takes the comparison
// predicate and a thunk that performs the native operation (presumably a
// gfx941-specific workaround — the expander's definition is outside this
// view).

__device__
inline
int atomicMin(int* address, int val) {
#if defined(__gfx941__)
    // Predicate: candidate `y` replaces current `x` only when x < y is false,
    // i.e. the expander keeps the smaller value.
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](int x, int y) { return x < y; }, [=]() {
            return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicMin_system(int* address, int val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](int x, int y) { return x < y; }, [=]() {
            return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMin(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
            return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
            return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
|
|
|
|
// Atomic minimum for unsigned long (agent scope). Returns the value stored at
// `address` before the operation.
//
// Fix: the return type was declared `unsigned long long`, disagreeing with
// the parameter type and with every sibling overload
// (atomicMin_system(unsigned long*), atomicMax(unsigned long*)) which use
// `unsigned long`. Declared consistently as `unsigned long`; on the LP64
// targets this header compiles for, both types are 64-bit, so callers observe
// identical behavior.
__device__
inline
unsigned long atomicMin(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
    // gfx941: route through the CAS-based expander instead of the native
    // fetch_min builtin (same pattern as the other integer overloads).
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address,
        val,
        [](unsigned long x, unsigned long y) { return x < y; },
        [=]() {
            return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
|
|
|
|
// Atomic minimum: remaining overloads (unsigned long system scope,
// unsigned long long, long long, float, double — agent and system variants).
// Each returns the value stored at `address` before the operation.
//
// Integer overloads: direct __hip_atomic_fetch_min, except on gfx941 where
// the call is routed through hip_cas_extrema_expander (defined outside this
// view) with the comparison predicate and a thunk issuing the native builtin.
//
// Float/double agent overloads: no native FP fetch_min, so a CAS loop is
// used (or unsafeAtomicMin when the build enables unsafe FP atomics).
// Float/double system overloads: snapshot the current value, then loop on
// atomicCAS_system while the candidate is still smaller.

__device__
inline
unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address,
        val,
        [](unsigned long x, unsigned long y) { return x < y; },
        [=]() {
            return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMin(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address,
        val,
        [](unsigned long long x, unsigned long long y) { return x < y; },
        [=]() {
            return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMin_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address,
        val,
        [](unsigned long long x, unsignedned long long y) { return x < y; },
        [=]() {
            return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
long long atomicMin(long long* address, long long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](long long x, long long y) { return x < y; },
        [=]() {
            return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
long long atomicMin_system(long long* address, long long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](long long x, long long y) { return x < y; },
        [=]() {
            return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
float atomicMin(float* addr, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
    return unsafeAtomicMin(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
    // CAS loop: reload happens implicitly — on failure the builtin writes the
    // observed value back into `value`. Exits once the stored value <= val.
    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    bool done = false;
    while (!done && value > val) {
        done = __hip_atomic_compare_exchange_strong(addr, &value, val,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
    return value;
#else
    // Fallback: same loop via GCC __atomic builtins on the bit pattern.
    unsigned int *uaddr = (unsigned int *)addr;
    unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
    bool done = false;
    while (!done && __uint_as_float(value) > val) {
        done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    }
    return __uint_as_float(value);
#endif
#endif
}

__device__
inline
float atomicMin_system(float* address, float val) {
    unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
#if __has_builtin(__hip_atomic_load)
    unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
#else
    unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
#endif
    float value = __uint_as_float(tmp);

    // atomicCAS_system returns the observed value, so the loop converges once
    // the stored value is no longer greater than val.
    while (val < value) {
        value = atomicCAS_system(address, value, val);
    }

    return value;
}

__device__
inline
double atomicMin(double* addr, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
    return unsafeAtomicMin(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
    // CAS loop; on failure the builtin refreshes `value` with the observed
    // contents, so no explicit reload is needed.
    double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    bool done = false;
    while (!done && value > val) {
        done = __hip_atomic_compare_exchange_strong(addr, &value, val,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
    return value;
#else
    // Fallback: same loop via GCC __atomic builtins on the bit pattern.
    unsigned long long *uaddr = (unsigned long long *)addr;
    unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
    bool done = false;
    while (!done && __longlong_as_double(value) > val) {
        done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    }
    return __longlong_as_double(value);
#endif
#endif
}

__device__
inline
double atomicMin_system(double* address, double val) {
    unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
#if __has_builtin(__hip_atomic_load)
    unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
#else
    unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
#endif
    double value = __longlong_as_double(tmp);

    while (val < value) {
        value = atomicCAS_system(address, value, val);
    }

    return value;
}
|
|
|
|
// Atomic maximum overloads (int, unsigned int, unsigned long,
// unsigned long long, long long, float, double — agent and system scope).
// Each returns the value stored at `address` before the operation. These
// mirror the atomicMin overloads exactly, with the comparison direction
// reversed (predicates use `y < x`).
//
// Integer overloads: direct __hip_atomic_fetch_max, except on gfx941 where
// the call is routed through hip_cas_extrema_expander (defined outside this
// view). Float/double agent overloads use a CAS loop (or unsafeAtomicMax
// under unsafe FP atomics); float/double system overloads loop on
// atomicCAS_system.

__device__
inline
int atomicMax(int* address, int val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](int x, int y) { return y < x; }, [=]() {
            return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicMax_system(int* address, int val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](int x, int y) { return y < x; }, [=]() {
            return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMax(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
            return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicMax_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
            return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicMax(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address,
        val,
        [](unsigned long x, unsigned long y) { return y < x; },
        [=]() {
            return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicMax_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address,
        val,
        [](unsigned long x, unsigned long y) { return y < x; },
        [=]() {
            return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMax(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address,
        val,
        [](unsigned long long x, unsigned long long y) { return y < x; },
        [=]() {
            return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicMax_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address,
        val,
        [](unsigned long long x, unsigned long long y) { return y < x; },
        [=]() {
            return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
long long atomicMax(long long* address, long long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](long long x, long long y) { return y < x; },
        [=]() {
            return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
long long atomicMax_system(long long* address, long long val) {
#if defined(__gfx941__)
    return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](long long x, long long y) { return y < x; },
        [=]() {
            return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
float atomicMax(float* addr, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
    return unsafeAtomicMax(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
    // CAS loop: on failure the builtin writes the observed value back into
    // `value`; exits once the stored value >= val.
    float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    bool done = false;
    while (!done && value < val) {
        done = __hip_atomic_compare_exchange_strong(addr, &value, val,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
    return value;
#else
    // Fallback: same loop via GCC __atomic builtins on the bit pattern.
    unsigned int *uaddr = (unsigned int *)addr;
    unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
    bool done = false;
    while (!done && __uint_as_float(value) < val) {
        done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    }
    return __uint_as_float(value);
#endif
#endif
}

__device__
inline
float atomicMax_system(float* address, float val) {
    unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
#if __has_builtin(__hip_atomic_load)
    unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
#else
    unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
#endif
    float value = __uint_as_float(tmp);

    // atomicCAS_system returns the observed value; converges once the stored
    // value is no longer less than val.
    while (value < val) {
        value = atomicCAS_system(address, value, val);
    }

    return value;
}

__device__
inline
double atomicMax(double* addr, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
    return unsafeAtomicMax(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
    __has_builtin(__hip_atomic_compare_exchange_strong)
    double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    bool done = false;
    while (!done && value < val) {
        done = __hip_atomic_compare_exchange_strong(addr, &value, val,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
    }
    return value;
#else
    unsigned long long *uaddr = (unsigned long long *)addr;
    unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
    bool done = false;
    while (!done && __longlong_as_double(value) < val) {
        done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
                __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    }
    return __longlong_as_double(value);
#endif
#endif
}

__device__
inline
double atomicMax_system(double* address, double val) {
    unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
#if __has_builtin(__hip_atomic_load)
    unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
#else
    unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
#endif
    double value = __longlong_as_double(tmp);

    while (value < val) {
        value = atomicCAS_system(address, value, val);
    }

    return value;
}
|
|
|
|
// CUDA-style wrapping atomic increment (agent scope): stores
// ((old >= val) ? 0 : old + 1) at `address` and returns the old value.
//
// On gfx941 the operation is re-declared as an LLVM-intrinsic-bound builtin
// and routed through hip_cas_expander (defined outside this view), whose
// lambda reproduces the wrap-to-zero semantics; other targets issue the
// native __builtin_amdgcn_atomic_inc32 directly.
__device__
inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
#if defined(__gfx941__)
    // Function-scope declaration binding the builtin to the LLVM intrinsic by
    // assembly name — used only as the expander's fallback thunk.
    __device__
    extern
    unsigned int __builtin_amdgcn_atomic_inc(
        unsigned int*,
        unsigned int,
        unsigned int,
        unsigned int,
        bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32");

    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address,
        val,
        [](unsigned int& x, unsigned int y) { x = (x >= y) ? 0 : (x + 1); },
        [=]() {
            return
                __builtin_amdgcn_atomic_inc(address, val, __ATOMIC_RELAXED, 1, false);
        });
#else
    return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
#endif // __gfx941__

}
|
|
|
|
// CUDA-style wrapping atomic decrement (agent scope): stores
// ((old == 0 || old > val) ? val : old - 1) at `address` and returns the old
// value.
//
// Mirrors atomicInc: on gfx941 the intrinsic is re-declared by assembly name
// and routed through hip_cas_expander (defined outside this view); other
// targets issue the native __builtin_amdgcn_atomic_dec32 directly.
__device__
inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
#if defined(__gfx941__)
    // Function-scope declaration binding the builtin to the LLVM intrinsic by
    // assembly name — used only as the expander's fallback thunk.
    __device__
    extern
    unsigned int __builtin_amdgcn_atomic_dec(
        unsigned int*,
        unsigned int,
        unsigned int,
        unsigned int,
        bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32");

    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address,
        val,
        [](unsigned int& x, unsigned int y) { x = (!x || x > y) ? y : (x - 1); },
        [=]() {
            return
                __builtin_amdgcn_atomic_dec(address, val, __ATOMIC_RELAXED, 1, false);
        });
#else
    return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
#endif // __gfx941__

}
|
|
|
|
// Atomic bitwise AND overloads (int, unsigned int, unsigned long,
// unsigned long long — agent and system scope). Each returns the value
// stored at `address` before the operation.
//
// On gfx941 the native fetch_and is routed through hip_cas_expander (defined
// outside this view) with a mutating lambda and a thunk issuing the native
// builtin; all other targets call __hip_atomic_fetch_and directly.

__device__
inline
int atomicAnd(int* address, int val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](int& x, int y) { x &= y; }, [=]() {
            return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicAnd_system(int* address, int val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](int& x, int y) { x &= y; }, [=]() {
            return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicAnd(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
            return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
            return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicAnd(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
            return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
            return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address,
        val,
        [](unsigned long long& x, unsigned long long y) { x &= y; },
        [=]() {
            return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address,
        val,
        [](unsigned long long& x, unsigned long long y) { x &= y; },
        [=]() {
            return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
                                          __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
|
|
|
|
// Atomic bitwise OR overloads (int, unsigned int, unsigned long,
// unsigned long long — agent and system scope). Each returns the value
// stored at `address` before the operation. Same structure as atomicAnd:
// direct __hip_atomic_fetch_or everywhere except gfx941, which goes through
// hip_cas_expander (defined outside this view).

__device__
inline
int atomicOr(int* address, int val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](int& x, int y) { x |= y; }, [=]() {
            return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                         __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
int atomicOr_system(int* address, int val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](int& x, int y) { x |= y; }, [=]() {
            return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                         __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicOr(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
            return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                         __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned int atomicOr_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
            return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                         __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicOr(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
            return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                         __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long atomicOr_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
            return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                         __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
        address,
        val,
        [](unsigned long long& x, unsigned long long y) { x |= y; },
        [=]() {
            return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                         __HIP_MEMORY_SCOPE_AGENT);
        });
#else
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}

__device__
inline
unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
    return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
        address,
        val,
        [](unsigned long long& x, unsigned long long y) { x |= y; },
        [=]() {
            return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
                                         __HIP_MEMORY_SCOPE_SYSTEM);
        });
#else
    return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
|
|
|
|
// Atomically XORs `val` into `*address` at agent (device) scope, relaxed
// ordering. Returns the previous value of `*address`.
__device__
inline
int atomicXor(int* address, int val) {
#if defined(__gfx941__)
  // gfx941-only path: emulate the RMW via a CAS loop (hip_cas_expander).
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](int& x, int y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
|
|
|
|
// Atomically XORs `val` into `*address` at system scope, relaxed ordering.
// Returns the previous value of `*address`.
__device__
inline
int atomicXor_system(int* address, int val) {
#if defined(__gfx941__)
  // gfx941-only path: emulate the RMW via a CAS loop (hip_cas_expander).
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](int& x, int y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
|
|
|
|
// Atomically XORs `val` into `*address` at agent (device) scope, relaxed
// ordering. Returns the previous value of `*address`.
__device__
inline
unsigned int atomicXor(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  // gfx941-only path: emulate the RMW via a CAS loop (hip_cas_expander).
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
|
|
|
|
// Atomically XORs `val` into `*address` at system scope, relaxed ordering.
// Returns the previous value of `*address`.
__device__
inline
unsigned int atomicXor_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
  // gfx941-only path: emulate the RMW via a CAS loop (hip_cas_expander).
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
|
|
|
|
// Atomically XORs `val` into `*address` at agent (device) scope, relaxed
// ordering. Returns the previous value of `*address`.
__device__
inline
unsigned long atomicXor(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  // gfx941-only path: emulate the RMW via a CAS loop (hip_cas_expander).
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
|
|
|
|
// Atomically XORs `val` into `*address` at system scope, relaxed ordering.
// Returns the previous value of `*address`.
__device__
inline
unsigned long atomicXor_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
  // gfx941-only path: emulate the RMW via a CAS loop (hip_cas_expander).
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
|
|
|
|
// Atomically XORs `val` into `*address` at agent (device) scope, relaxed
// ordering. Returns the previous value of `*address`.
__device__
inline
unsigned long long atomicXor(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  // gfx941-only path: emulate the RMW via a CAS loop (hip_cas_expander).
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
      address,
      val,
      [](unsigned long long& x, unsigned long long y) { x ^= y; },
      [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_AGENT);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
|
|
|
|
// Atomically XORs `val` into `*address` at system scope, relaxed ordering.
// Returns the previous value of `*address`.
__device__
inline
unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
  // Consistency fix: every other agent/system RMW overload in this section
  // routes through hip_cas_expander on gfx941; this overload was the only
  // one missing the workaround. Mirror the sibling atomicOr_system(ull).
  return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
      address,
      val,
      [](unsigned long long& x, unsigned long long y) { x ^= y; },
      [=]() {
        return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
                                      __HIP_MEMORY_SCOPE_SYSTEM);
      });
#else
  return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
|
|
|
|
#else // __hip_atomic_compare_exchange_strong
|
|
|
|
// Atomic compare-and-swap (relaxed ordering): if `*address == compare`,
// store `val`; in all cases return the value observed at `*address`.
__device__
inline
int atomicCAS(int* address, int compare, int val)
{
    int expected = compare;  // updated in place with the observed value
    __atomic_compare_exchange_n(
        address, &expected, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    return expected;
}
|
|
// Atomic compare-and-swap (relaxed ordering): if `*address == compare`,
// store `val`; in all cases return the value observed at `*address`.
__device__
inline
unsigned int atomicCAS(
    unsigned int* address, unsigned int compare, unsigned int val)
{
    unsigned int expected = compare;  // updated in place with the observed value
    __atomic_compare_exchange_n(
        address, &expected, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    return expected;
}
|
|
// Atomic compare-and-swap (relaxed ordering): if `*address == compare`,
// store `val`; in all cases return the value observed at `*address`.
__device__
inline
unsigned long long atomicCAS(
    unsigned long long* address,
    unsigned long long compare,
    unsigned long long val)
{
    unsigned long long expected = compare;  // updated with the observed value
    __atomic_compare_exchange_n(
        address, &expected, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    return expected;
}
|
|
|
|
// Atomic add, relaxed ordering; returns the pre-update value of `*address`.
__device__
inline
int atomicAdd(int* address, int val)
{
    const int previous = __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic add, relaxed ordering; returns the pre-update value of `*address`.
__device__
inline
unsigned int atomicAdd(unsigned int* address, unsigned int val)
{
    const unsigned int previous = __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic add, relaxed ordering; returns the pre-update value of `*address`.
__device__
inline
unsigned long long atomicAdd(
    unsigned long long* address, unsigned long long val)
{
    const unsigned long long previous =
        __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic float add, relaxed ordering; returns the pre-update value.
__device__
inline
float atomicAdd(float* address, float val)
{
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  // Opt-in fast path: hardware FP atomics that may not handle all rounding /
  // denormal cases (hence "unsafe"); selected by -munsafe-fp-atomics.
  return unsafeAtomicAdd(address, val);
#else
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#endif
}
|
|
|
|
#if !defined(__HIPCC_RTC__)
// DEPRECATED is only available when compiling with the full toolchain
// headers; hipRTC builds skip the annotation.
DEPRECATED("use atomicAdd instead")
#endif // !defined(__HIPCC_RTC__)
// Deprecated fire-and-forget float add: performs the atomic add without
// returning the previous value (maps to the OCKL no-return intrinsic).
__device__
inline
void atomicAddNoRet(float* address, float val)
{
    __ockl_atomic_add_noret_f32(address, val);
}
|
|
|
|
// Atomic double add, relaxed ordering; returns the pre-update value.
__device__
inline
double atomicAdd(double* address, double val)
{
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
  // Opt-in fast path (-munsafe-fp-atomics): hardware FP atomics.
  return unsafeAtomicAdd(address, val);
#else
  return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#endif
}
|
|
|
|
// Atomic subtract, relaxed ordering; returns the pre-update value.
__device__
inline
int atomicSub(int* address, int val)
{
    const int previous = __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic subtract, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned int atomicSub(unsigned int* address, unsigned int val)
{
    const unsigned int previous = __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
|
|
// Atomic exchange, relaxed ordering; stores `val` and returns the old value.
__device__
inline
int atomicExch(int* address, int val)
{
    const int previous = __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic exchange, relaxed ordering; stores `val` and returns the old value.
__device__
inline
unsigned int atomicExch(unsigned int* address, unsigned int val)
{
    const unsigned int previous = __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic exchange, relaxed ordering; stores `val` and returns the old value.
__device__
inline
unsigned long long atomicExch(unsigned long long* address, unsigned long long val)
{
    const unsigned long long previous =
        __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic float exchange, relaxed ordering. The float is exchanged via its
// 32-bit pattern (no integer atomic exists for float directly); the old bit
// pattern is reinterpreted back to float for the return value.
__device__
inline
float atomicExch(float* address, float val)
{
    unsigned int* as_bits = reinterpret_cast<unsigned int*>(address);
    const unsigned int old_bits =
        __atomic_exchange_n(as_bits, __float_as_uint(val), __ATOMIC_RELAXED);
    return __uint_as_float(old_bits);
}
|
|
|
|
// Atomic minimum, relaxed ordering; returns the pre-update value.
__device__
inline
int atomicMin(int* address, int val)
{
    const int previous = __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic minimum, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned int atomicMin(unsigned int* address, unsigned int val)
{
    const unsigned int previous = __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic 64-bit unsigned minimum via a CAS loop, relaxed ordering.
// Returns the value held at `*address` immediately before this operation
// took effect (the conventional atomicMin contract).
//
// Fix: the previous version looped once more after a successful CAS,
// re-loading `*address` and returning the NEWLY stored value instead of
// the pre-update one. We now return the CAS-observed value on success.
__device__
inline
unsigned long long atomicMin(
    unsigned long long* address, unsigned long long val)
{
    unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
    // Only attempt a store while our candidate is strictly smaller.
    while (val < tmp) {
        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

        // Another thread changed it; re-evaluate against the fresh value.
        if (tmp1 != tmp) { tmp = tmp1; continue; }

        const auto observed = atomicCAS(address, tmp, val);
        if (observed == tmp) {
            return observed;  // swap succeeded: this is the pre-update value
        }
        tmp = observed;  // lost the race: retry with what CAS saw
    }

    // val >= current value: nothing stored; return the value we observed.
    return tmp;
}
|
|
// Atomic 64-bit signed minimum via a CAS loop, relaxed ordering.
// Returns the pre-update value of `*address` (conventional atomicMin
// contract). Fix: the previous version looped once more after a successful
// CAS and returned the newly stored value; we return the CAS-observed
// (pre-update) value instead.
__device__ inline long long atomicMin(long long* address, long long val) {
    long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
    while (val < tmp) {
        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

        // Another thread changed it; re-evaluate against the fresh value.
        if (tmp1 != tmp) {
            tmp = tmp1;
            continue;
        }

        const auto observed = atomicCAS(address, tmp, val);
        if (observed == tmp) {
            return observed;  // swap succeeded: pre-update value
        }
        tmp = observed;  // lost the race: retry
    }
    return tmp;
}
|
|
|
|
// Atomic maximum, relaxed ordering; returns the pre-update value.
__device__
inline
int atomicMax(int* address, int val)
{
    const int previous = __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic maximum, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned int atomicMax(unsigned int* address, unsigned int val)
{
    const unsigned int previous = __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic 64-bit unsigned maximum via a CAS loop, relaxed ordering.
// Returns the pre-update value of `*address` (conventional atomicMax
// contract). Fix: the previous version looped once more after a successful
// CAS and returned the newly stored value; we return the CAS-observed
// (pre-update) value instead.
__device__
inline
unsigned long long atomicMax(
    unsigned long long* address, unsigned long long val)
{
    unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
    // Only attempt a store while our candidate is strictly larger.
    while (tmp < val) {
        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

        // Another thread changed it; re-evaluate against the fresh value.
        if (tmp1 != tmp) { tmp = tmp1; continue; }

        const auto observed = atomicCAS(address, tmp, val);
        if (observed == tmp) {
            return observed;  // swap succeeded: pre-update value
        }
        tmp = observed;  // lost the race: retry
    }

    return tmp;
}
|
|
// Atomic 64-bit signed maximum via a CAS loop, relaxed ordering.
// Returns the pre-update value of `*address` (conventional atomicMax
// contract). Fix: the previous version looped once more after a successful
// CAS and returned the newly stored value; we return the CAS-observed
// (pre-update) value instead.
__device__ inline long long atomicMax(long long* address, long long val) {
    long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
    while (tmp < val) {
        const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);

        // Another thread changed it; re-evaluate against the fresh value.
        if (tmp1 != tmp) {
            tmp = tmp1;
            continue;
        }

        const auto observed = atomicCAS(address, tmp, val);
        if (observed == tmp) {
            return observed;  // swap succeeded: pre-update value
        }
        tmp = observed;  // lost the race: retry
    }
    return tmp;
}
|
|
|
|
// Wrapping atomic increment: delegates to the amdgcn inc32 builtin at
// agent scope, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
    const unsigned int previous =
        __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
    return previous;
}
|
|
|
|
// Wrapping atomic decrement: delegates to the amdgcn dec32 builtin at
// agent scope, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
    const unsigned int previous =
        __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
    return previous;
}
|
|
|
|
// Atomic AND, relaxed ordering; returns the pre-update value.
__device__
inline
int atomicAnd(int* address, int val)
{
    const int previous = __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic AND, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned int atomicAnd(unsigned int* address, unsigned int val)
{
    const unsigned int previous = __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic AND, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned long long atomicAnd(
    unsigned long long* address, unsigned long long val)
{
    const unsigned long long previous =
        __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
|
|
// Atomic OR, relaxed ordering; returns the pre-update value.
__device__
inline
int atomicOr(int* address, int val)
{
    const int previous = __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic OR, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned int atomicOr(unsigned int* address, unsigned int val)
{
    const unsigned int previous = __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic OR, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned long long atomicOr(
    unsigned long long* address, unsigned long long val)
{
    const unsigned long long previous =
        __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
|
|
// Atomic XOR, relaxed ordering; returns the pre-update value.
__device__
inline
int atomicXor(int* address, int val)
{
    const int previous = __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic XOR, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned int atomicXor(unsigned int* address, unsigned int val)
{
    const unsigned int previous = __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
// Atomic XOR, relaxed ordering; returns the pre-update value.
__device__
inline
unsigned long long atomicXor(
    unsigned long long* address, unsigned long long val)
{
    const unsigned long long previous =
        __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
    return previous;
}
|
|
|
|
#endif // __hip_atomic_compare_exchange_strong
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "host_defines.h"
|
|
#include "amd_hip_vector_types.h" // For Native_vec_
|
|
#endif
|
|
|
|
#if defined(__cplusplus)
|
|
extern "C" {
|
|
#endif
|
|
|
|
// DOT FUNCTIONS
|
|
#if defined(__clang__) && defined(__HIP__)
|
|
__device__
|
|
__attribute__((const))
|
|
int __ockl_sdot2(
|
|
HIP_vector_base<short, 2>::Native_vec_,
|
|
HIP_vector_base<short, 2>::Native_vec_,
|
|
int, bool);
|
|
|
|
__device__
|
|
__attribute__((const))
|
|
unsigned int __ockl_udot2(
|
|
HIP_vector_base<unsigned short, 2>::Native_vec_,
|
|
HIP_vector_base<unsigned short, 2>::Native_vec_,
|
|
unsigned int, bool);
|
|
|
|
__device__
|
|
__attribute__((const))
|
|
int __ockl_sdot4(
|
|
HIP_vector_base<char, 4>::Native_vec_,
|
|
HIP_vector_base<char, 4>::Native_vec_,
|
|
int, bool);
|
|
|
|
__device__
|
|
__attribute__((const))
|
|
unsigned int __ockl_udot4(
|
|
HIP_vector_base<unsigned char, 4>::Native_vec_,
|
|
HIP_vector_base<unsigned char, 4>::Native_vec_,
|
|
unsigned int, bool);
|
|
|
|
__device__
|
|
__attribute__((const))
|
|
int __ockl_sdot8(int, int, int, bool);
|
|
|
|
__device__
|
|
__attribute__((const))
|
|
unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
|
|
#endif
|
|
|
|
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
|
// BEGIN FLOAT
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_acos_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_acosh_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_asin_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_asinh_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_atan2_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_atan_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_atanh_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_cbrt_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_ceil_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
__device__
|
|
float __ocml_copysign_f32(float, float);
|
|
__device__
|
|
float __ocml_cos_f32(float);
|
|
__device__
|
|
float __ocml_native_cos_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
__device__
|
|
float __ocml_cosh_f32(float);
|
|
__device__
|
|
float __ocml_cospi_f32(float);
|
|
__device__
|
|
float __ocml_i0_f32(float);
|
|
__device__
|
|
float __ocml_i1_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_erfc_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_erfcinv_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_erfcx_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_erf_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_erfinv_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_exp10_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_native_exp10_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_exp2_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_exp_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_native_exp_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_expm1_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fabs_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fdim_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_floor_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fma_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fmax_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fmin_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
__device__
|
|
float __ocml_fmod_f32(float, float);
|
|
__device__
|
|
float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_hypot_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_ilogb_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isfinite_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isinf_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isnan_f32(float);
|
|
__device__
|
|
float __ocml_j0_f32(float);
|
|
__device__
|
|
float __ocml_j1_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_ldexp_f32(float, int);
|
|
__device__
|
|
float __ocml_lgamma_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_log10_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_native_log10_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_log1p_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_log2_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_native_log2_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_logb_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_log_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_native_log_f32(float);
|
|
__device__
|
|
float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_nearbyint_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_nextafter_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_len3_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_len4_f32(float, float, float, float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_ncdf_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_ncdfinv_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_pow_f32(float, float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_pown_f32(float, int);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_rcbrt_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_remainder_f32(float, float);
|
|
__device__
|
|
float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_rhypot_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_rint_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_rlen3_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_rlen4_f32(float, float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_round_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_rsqrt_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_scalb_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_scalbn_f32(float, int);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_signbit_f32(float);
|
|
__device__
|
|
float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
|
|
__device__
|
|
float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
|
|
__device__
|
|
float __ocml_sin_f32(float);
|
|
__device__
|
|
float __ocml_native_sin_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_sinh_f32(float);
|
|
__device__
|
|
float __ocml_sinpi_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sqrt_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_native_sqrt_f32(float);
|
|
__device__
|
|
float __ocml_tan_f32(float);
|
|
__device__
|
|
__attribute__((pure))
|
|
float __ocml_tanh_f32(float);
|
|
__device__
|
|
float __ocml_tgamma_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_trunc_f32(float);
|
|
__device__
|
|
float __ocml_y0_f32(float);
|
|
__device__
|
|
float __ocml_y1_f32(float);
|
|
|
|
// BEGIN INTRINSICS
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_add_rte_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_add_rtn_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_add_rtp_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_add_rtz_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sub_rte_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sub_rtn_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sub_rtp_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sub_rtz_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_mul_rte_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_mul_rtn_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_mul_rtp_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_mul_rtz_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_div_rte_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_div_rtn_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_div_rtp_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_div_rtz_f32(float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sqrt_rte_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sqrt_rtn_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sqrt_rtp_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_sqrt_rtz_f32(float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fma_rte_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fma_rtn_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fma_rtp_f32(float, float, float);
|
|
__device__
|
|
__attribute__((const))
|
|
float __ocml_fma_rtz_f32(float, float, float);
|
|
// END INTRINSICS
|
|
// END FLOAT
|
|
|
|
// BEGIN DOUBLE
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_acos_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_acosh_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_asin_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_asinh_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_atan2_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_atan_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_atanh_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_cbrt_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_ceil_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_copysign_f64(double, double);
|
|
__device__
|
|
double __ocml_cos_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_cosh_f64(double);
|
|
__device__
|
|
double __ocml_cospi_f64(double);
|
|
__device__
|
|
double __ocml_i0_f64(double);
|
|
__device__
|
|
double __ocml_i1_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_erfc_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_erfcinv_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_erfcx_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_erf_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_erfinv_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_exp10_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_exp2_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_exp_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_expm1_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fabs_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fdim_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_floor_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fma_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fmax_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fmin_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fmod_f64(double, double);
|
|
__device__
|
|
double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_hypot_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_ilogb_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isfinite_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isinf_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_isnan_f64(double);
|
|
__device__
|
|
double __ocml_j0_f64(double);
|
|
__device__
|
|
double __ocml_j1_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_ldexp_f64(double, int);
|
|
__device__
|
|
double __ocml_lgamma_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_log10_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_log1p_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_log2_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_logb_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_log_f64(double);
|
|
__device__
|
|
double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_nearbyint_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_nextafter_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_len3_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_len4_f64(double, double, double, double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_ncdf_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_ncdfinv_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_pow_f64(double, double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_pown_f64(double, int);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_rcbrt_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_remainder_f64(double, double);
|
|
__device__
|
|
double __ocml_remquo_f64(
|
|
double, double, __attribute__((address_space(5))) int*);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_rhypot_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_rint_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_rlen3_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_rlen4_f64(double, double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_round_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_rsqrt_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_scalb_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_scalbn_f64(double, int);
|
|
__device__
|
|
__attribute__((const))
|
|
int __ocml_signbit_f64(double);
|
|
__device__
|
|
double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
|
|
__device__
|
|
double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
|
|
__device__
|
|
double __ocml_sin_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_sinh_f64(double);
|
|
__device__
|
|
double __ocml_sinpi_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sqrt_f64(double);
|
|
__device__
|
|
double __ocml_tan_f64(double);
|
|
__device__
|
|
__attribute__((pure))
|
|
double __ocml_tanh_f64(double);
|
|
__device__
|
|
double __ocml_tgamma_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_trunc_f64(double);
|
|
__device__
|
|
double __ocml_y0_f64(double);
|
|
__device__
|
|
double __ocml_y1_f64(double);
|
|
|
|
// BEGIN INTRINSICS
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_add_rte_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_add_rtn_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_add_rtp_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_add_rtz_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sub_rte_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sub_rtn_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sub_rtp_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sub_rtz_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_mul_rte_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_mul_rtn_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_mul_rtp_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_mul_rtz_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_div_rte_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_div_rtn_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_div_rtp_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_div_rtz_f64(double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sqrt_rte_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sqrt_rtn_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sqrt_rtp_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_sqrt_rtz_f64(double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fma_rte_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fma_rtn_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fma_rtp_f64(double, double, double);
|
|
__device__
|
|
__attribute__((const))
|
|
double __ocml_fma_rtz_f64(double, double, double);
|
|
// END INTRINSICS
|
|
// END DOUBLE
|
|
|
|
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
|
|
|
#if defined(__cplusplus)
|
|
} // extern "C"
|
|
#endif
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
// /*
|
|
// Half Math Functions
|
|
// */
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "host_defines.h"
|
|
#endif
|
|
#ifndef __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
|
extern "C"
|
|
{
|
|
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
|
|
__device__ _Float16 __ocml_cos_f16(_Float16);
|
|
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
|
|
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
|
|
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
|
|
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
|
|
__device__ __attribute__((const))
|
|
_Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
|
|
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
|
|
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
|
|
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
|
|
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
|
|
__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
|
|
__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
|
|
__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
|
|
__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
|
|
__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
|
|
__device__ _Float16 __ocml_sin_f16(_Float16);
|
|
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
|
|
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
|
|
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
|
|
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
|
|
|
|
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
|
|
typedef short __2i16 __attribute__((ext_vector_type(2)));
|
|
|
|
#if defined(__clang__) && defined(__HIP__)
|
|
__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s);
|
|
#endif
|
|
|
|
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
|
|
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
|
|
__device__ __2f16 __ocml_cos_2f16(__2f16);
|
|
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
|
|
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
|
|
__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
|
|
__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
|
|
__device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
|
|
__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
|
|
__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
|
|
__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
|
|
__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
|
|
__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
|
|
__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
|
|
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
|
|
__device__ __2f16 __ocml_sin_2f16(__2f16);
|
|
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
|
|
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
|
|
|
|
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
|
|
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
|
|
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
|
|
|
|
}
|
|
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
|
|
//TODO: remove these after they get into clang header __clang_hip_libdevice_declares.h'
|
|
extern "C" {
|
|
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
|
|
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
|
|
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
|
|
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
|
|
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
|
|
}
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#pragma once
|
|
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
|
|
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
|
|
|
|
#if defined(__HIPCC_RTC__)
|
|
#define __HOST_DEVICE__ __device__
|
|
#else
|
|
#define __HOST_DEVICE__ __host__ __device__
|
|
#include <hip/amd_detail/amd_hip_common.h>
|
|
#include "hip/amd_detail/host_defines.h"
|
|
#include <assert.h>
|
|
#if defined(__cplusplus)
|
|
#include <algorithm>
|
|
#include <type_traits>
|
|
#include <utility>
|
|
#endif
|
|
#endif // !defined(__HIPCC_RTC__)
|
|
|
|
#if defined(__clang__) && defined(__HIP__)
|
|
typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
|
|
|
|
struct __half_raw {
|
|
union {
|
|
static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
|
|
|
|
_Float16 data;
|
|
unsigned short x;
|
|
};
|
|
};
|
|
|
|
struct __half2_raw {
|
|
union {
|
|
static_assert(sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
|
|
|
|
struct {
|
|
__half_raw x;
|
|
__half_raw y;
|
|
};
|
|
_Float16_2 data;
|
|
};
|
|
};
|
|
|
|
#if defined(__cplusplus)
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "hip_fp16_math_fwd.h"
|
|
#include "amd_hip_vector_types.h"
|
|
#include "host_defines.h"
|
|
#include "amd_device_functions.h"
|
|
#include "amd_warp_functions.h"
|
|
#endif
|
|
namespace std
|
|
{
|
|
template<> struct is_floating_point<_Float16> : std::true_type {};
|
|
}
|
|
|
|
template<bool cond, typename T = void>
|
|
using Enable_if_t = typename std::enable_if<cond, T>::type;
|
|
|
|
// BEGIN STRUCT __HALF
|
|
struct __half {
|
|
protected:
|
|
union {
|
|
static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
|
|
|
|
_Float16 data;
|
|
unsigned short __x;
|
|
};
|
|
public:
|
|
// CREATORS
|
|
__HOST_DEVICE__
|
|
__half() = default;
|
|
__HOST_DEVICE__
|
|
__half(const __half_raw& x) : data{x.data} {}
|
|
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
|
|
__HOST_DEVICE__
|
|
__half(decltype(data) x) : data{x} {}
|
|
template<
|
|
typename T,
|
|
Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
|
|
__HOST_DEVICE__
|
|
__half(T x) : data{static_cast<_Float16>(x)} {}
|
|
#endif
|
|
__HOST_DEVICE__
|
|
__half(const __half&) = default;
|
|
__HOST_DEVICE__
|
|
__half(__half&&) = default;
|
|
__HOST_DEVICE__
|
|
~__half() = default;
|
|
|
|
// CREATORS - DEVICE ONLY
|
|
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
|
|
template<
|
|
typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
|
|
__HOST_DEVICE__
|
|
__half(T x) : data{static_cast<_Float16>(x)} {}
|
|
#endif
|
|
|
|
// MANIPULATORS
|
|
__HOST_DEVICE__
|
|
__half& operator=(const __half&) = default;
|
|
__HOST_DEVICE__
|
|
__half& operator=(__half&&) = default;
|
|
__HOST_DEVICE__
|
|
__half& operator=(const __half_raw& x)
|
|
{
|
|
data = x.data;
|
|
return *this;
|
|
}
|
|
__HOST_DEVICE__
|
|
volatile __half& operator=(const __half_raw& x) volatile
|
|
{
|
|
data = x.data;
|
|
return *this;
|
|
}
|
|
volatile __half& operator=(const volatile __half_raw& x) volatile
|
|
{
|
|
data = x.data;
|
|
return *this;
|
|
}
|
|
__half& operator=(__half_raw&& x)
|
|
{
|
|
data = x.data;
|
|
return *this;
|
|
}
|
|
volatile __half& operator=(__half_raw&& x) volatile
|
|
{
|
|
data = x.data;
|
|
return *this;
|
|
}
|
|
volatile __half& operator=(volatile __half_raw&& x) volatile
|
|
{
|
|
data = x.data;
|
|
return *this;
|
|
}
|
|
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
|
|
template<
|
|
typename T,
|
|
Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
|
|
__HOST_DEVICE__
|
|
__half& operator=(T x)
|
|
{
|
|
data = static_cast<_Float16>(x);
|
|
return *this;
|
|
}
|
|
#endif
|
|
|
|
// MANIPULATORS - DEVICE ONLY
|
|
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
|
|
template<
|
|
typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
|
|
__device__
|
|
__half& operator=(T x)
|
|
{
|
|
data = static_cast<_Float16>(x);
|
|
return *this;
|
|
}
|
|
#endif
|
|
|
|
#if !defined(__HIP_NO_HALF_OPERATORS__)
|
|
__device__
|
|
__half& operator+=(const __half& x)
|
|
{
|
|
data += x.data;
|
|
return *this;
|
|
}
|
|
__device__
|
|
__half& operator-=(const __half& x)
|
|
{
|
|
data -= x.data;
|
|
return *this;
|
|
}
|
|
__device__
|
|
__half& operator*=(const __half& x)
|
|
{
|
|
data *= x.data;
|
|
return *this;
|
|
}
|
|
__device__
|
|
__half& operator/=(const __half& x)
|
|
{
|
|
data /= x.data;
|
|
return *this;
|
|
}
|
|
__device__
|
|
__half& operator++() { ++data; return *this; }
|
|
__device__
|
|
__half operator++(int)
|
|
{
|
|
__half tmp{*this};
|
|
++*this;
|
|
return tmp;
|
|
}
|
|
__device__
|
|
__half& operator--() { --data; return *this; }
|
|
__device__
|
|
__half operator--(int)
|
|
{
|
|
__half tmp{*this};
|
|
--*this;
|
|
return tmp;
|
|
}
|
|
#endif
|
|
|
|
// ACCESSORS
|
|
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
|
|
template<
|
|
typename T,
|
|
Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
|
|
__HOST_DEVICE__
|
|
operator T() const { return data; }
|
|
#endif
|
|
__HOST_DEVICE__
|
|
operator __half_raw() const { return __half_raw{data}; }
|
|
__HOST_DEVICE__
|
|
operator __half_raw() const volatile
|
|
{
|
|
return __half_raw{data};
|
|
}
|
|
|
|
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
|
|
template<
|
|
typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
|
|
__HOST_DEVICE__
|
|
operator T() const { return data; }
|
|
#endif
|
|
|
|
#if !defined(__HIP_NO_HALF_OPERATORS__)
|
|
__device__
|
|
__half operator+() const { return *this; }
|
|
__device__
|
|
__half operator-() const
|
|
{
|
|
__half tmp{*this};
|
|
tmp.data = -tmp.data;
|
|
return tmp;
|
|
}
|
|
#endif
|
|
|
|
// FRIENDS
|
|
#if !defined(__HIP_NO_HALF_OPERATORS__)
|
|
friend
|
|
inline
|
|
__device__
|
|
__half operator+(const __half& x, const __half& y)
|
|
{
|
|
return __half{x} += y;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
__half operator-(const __half& x, const __half& y)
|
|
{
|
|
return __half{x} -= y;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
__half operator*(const __half& x, const __half& y)
|
|
{
|
|
return __half{x} *= y;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
__half operator/(const __half& x, const __half& y)
|
|
{
|
|
return __half{x} /= y;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator==(const __half& x, const __half& y)
|
|
{
|
|
return x.data == y.data;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator!=(const __half& x, const __half& y)
|
|
{
|
|
return !(x == y);
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator<(const __half& x, const __half& y)
|
|
{
|
|
return x.data < y.data;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator>(const __half& x, const __half& y)
|
|
{
|
|
return y.data < x.data;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator<=(const __half& x, const __half& y)
|
|
{
|
|
return !(y < x);
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator>=(const __half& x, const __half& y)
|
|
{
|
|
return !(x < y);
|
|
}
|
|
#endif // !defined(__HIP_NO_HALF_OPERATORS__)
|
|
};
|
|
// END STRUCT __HALF
|
|
|
|
// BEGIN STRUCT __HALF2
|
|
struct __half2 {
|
|
public:
|
|
union {
|
|
static_assert(
|
|
sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
|
|
|
|
struct {
|
|
__half x;
|
|
__half y;
|
|
};
|
|
_Float16_2 data;
|
|
};
|
|
|
|
// CREATORS
|
|
__HOST_DEVICE__
|
|
__half2() = default;
|
|
__HOST_DEVICE__
|
|
__half2(const __half2_raw& xx) : data{xx.data} {}
|
|
__HOST_DEVICE__
|
|
__half2(decltype(data) xx) : data{xx} {}
|
|
__HOST_DEVICE__
|
|
__half2(const __half& xx, const __half& yy)
|
|
:
|
|
data{static_cast<__half_raw>(xx).data,
|
|
static_cast<__half_raw>(yy).data}
|
|
{}
|
|
__HOST_DEVICE__
|
|
__half2(const __half2&) = default;
|
|
__HOST_DEVICE__
|
|
__half2(__half2&&) = default;
|
|
__HOST_DEVICE__
|
|
~__half2() = default;
|
|
|
|
// MANIPULATORS
|
|
__HOST_DEVICE__
|
|
__half2& operator=(const __half2&) = default;
|
|
__HOST_DEVICE__
|
|
__half2& operator=(__half2&&) = default;
|
|
__HOST_DEVICE__
|
|
__half2& operator=(const __half2_raw& xx)
|
|
{
|
|
data = xx.data;
|
|
return *this;
|
|
}
|
|
|
|
// MANIPULATORS - DEVICE ONLY
|
|
#if !defined(__HIP_NO_HALF_OPERATORS__)
|
|
__device__
|
|
__half2& operator+=(const __half2& xx)
|
|
{
|
|
data += xx.data;
|
|
return *this;
|
|
}
|
|
__device__
|
|
__half2& operator-=(const __half2& xx)
|
|
{
|
|
data -= xx.data;
|
|
return *this;
|
|
}
|
|
__device__
|
|
__half2& operator*=(const __half2& xx)
|
|
{
|
|
data *= xx.data;
|
|
return *this;
|
|
}
|
|
__device__
|
|
__half2& operator/=(const __half2& xx)
|
|
{
|
|
data /= xx.data;
|
|
return *this;
|
|
}
|
|
__device__
|
|
__half2& operator++() { return *this += _Float16_2{1, 1}; }
|
|
__device__
|
|
__half2 operator++(int)
|
|
{
|
|
__half2 tmp{*this};
|
|
++*this;
|
|
return tmp;
|
|
}
|
|
__device__
|
|
__half2& operator--() { return *this -= _Float16_2{1, 1}; }
|
|
__device__
|
|
__half2 operator--(int)
|
|
{
|
|
__half2 tmp{*this};
|
|
--*this;
|
|
return tmp;
|
|
}
|
|
#endif
|
|
|
|
// ACCESSORS
|
|
__HOST_DEVICE__
|
|
operator decltype(data)() const { return data; }
|
|
__HOST_DEVICE__
|
|
operator __half2_raw() const {
|
|
__half2_raw r;
|
|
r.data = data;
|
|
return r;
|
|
}
|
|
|
|
// ACCESSORS - DEVICE ONLY
|
|
#if !defined(__HIP_NO_HALF_OPERATORS__)
|
|
__device__
|
|
__half2 operator+() const { return *this; }
|
|
__device__
|
|
__half2 operator-() const
|
|
{
|
|
__half2 tmp{*this};
|
|
tmp.data = -tmp.data;
|
|
return tmp;
|
|
}
|
|
#endif
|
|
|
|
// FRIENDS
|
|
#if !defined(__HIP_NO_HALF_OPERATORS__)
|
|
friend
|
|
inline
|
|
__device__
|
|
__half2 operator+(const __half2& xx, const __half2& yy)
|
|
{
|
|
return __half2{xx} += yy;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
__half2 operator-(const __half2& xx, const __half2& yy)
|
|
{
|
|
return __half2{xx} -= yy;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
__half2 operator*(const __half2& xx, const __half2& yy)
|
|
{
|
|
return __half2{xx} *= yy;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
__half2 operator/(const __half2& xx, const __half2& yy)
|
|
{
|
|
return __half2{xx} /= yy;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator==(const __half2& xx, const __half2& yy)
|
|
{
|
|
auto r = xx.data == yy.data;
|
|
return r.x != 0 && r.y != 0;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator!=(const __half2& xx, const __half2& yy)
|
|
{
|
|
return !(xx == yy);
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator<(const __half2& xx, const __half2& yy)
|
|
{
|
|
auto r = xx.data < yy.data;
|
|
return r.x != 0 && r.y != 0;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator>(const __half2& xx, const __half2& yy)
|
|
{
|
|
return yy < xx;
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator<=(const __half2& xx, const __half2& yy)
|
|
{
|
|
return !(yy < xx);
|
|
}
|
|
friend
|
|
inline
|
|
__device__
|
|
bool operator>=(const __half2& xx, const __half2& yy)
|
|
{
|
|
return !(xx < yy);
|
|
}
|
|
#endif // !defined(__HIP_NO_HALF_OPERATORS__)
|
|
};
|
|
// END STRUCT __HALF2
|
|
|
|
namespace
|
|
{
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 make_half2(__half x, __half y)
|
|
{
|
|
return __half2{x, y};
|
|
}
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half __low2half(__half2 x)
|
|
{
|
|
return __half{__half_raw{static_cast<__half2_raw>(x).data.x}};
|
|
}
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half __high2half(__half2 x)
|
|
{
|
|
return __half{__half_raw{static_cast<__half2_raw>(x).data.y}};
|
|
}
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __half2half2(__half x)
|
|
{
|
|
return __half2{x, x};
|
|
}
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __halves2half2(__half x, __half y)
|
|
{
|
|
return __half2{x, y};
|
|
}
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __low2half2(__half2 x)
|
|
{
|
|
return __half2{
|
|
_Float16_2{
|
|
static_cast<__half2_raw>(x).data.x,
|
|
static_cast<__half2_raw>(x).data.x}};
|
|
}
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __high2half2(__half2 x)
|
|
{
|
|
return __half2{
|
|
_Float16_2{
|
|
static_cast<__half2_raw>(x).data.y,
|
|
static_cast<__half2_raw>(x).data.y}};
|
|
}
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __lows2half2(__half2 x, __half2 y)
|
|
{
|
|
return __half2{
|
|
_Float16_2{
|
|
static_cast<__half2_raw>(x).data.x,
|
|
static_cast<__half2_raw>(y).data.x}};
|
|
}
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __highs2half2(__half2 x, __half2 y)
|
|
{
|
|
return __half2{
|
|
_Float16_2{
|
|
static_cast<__half2_raw>(x).data.y,
|
|
static_cast<__half2_raw>(y).data.y}};
|
|
}
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __lowhigh2highlow(__half2 x)
|
|
{
|
|
return __half2{
|
|
_Float16_2{
|
|
static_cast<__half2_raw>(x).data.y,
|
|
static_cast<__half2_raw>(x).data.x}};
|
|
}
|
|
|
|
// Bitcasts
|
|
inline
|
|
__device__
|
|
short __half_as_short(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).x;
|
|
}
|
|
|
|
inline
|
|
__device__
|
|
unsigned short __half_as_ushort(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).x;
|
|
}
|
|
|
|
inline
|
|
__device__
|
|
__half __short_as_half(short x)
|
|
{
|
|
__half_raw r; r.x = x;
|
|
return r;
|
|
}
|
|
|
|
inline
|
|
__device__
|
|
__half __ushort_as_half(unsigned short x)
|
|
{
|
|
__half_raw r; r.x = x;
|
|
return r;
|
|
}
|
|
|
|
// float -> half | half2
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half __float2half(float x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half __float2half_rn(float x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
#if !defined(__HIPCC_RTC__)
|
|
// TODO: rounding behaviour is not correct for host functions.
|
|
inline
|
|
__host__
|
|
__half __float2half_rz(float x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__host__
|
|
__half __float2half_rd(float x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__host__
|
|
__half __float2half_ru(float x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
#endif
|
|
inline
|
|
__device__
|
|
__half __float2half_rz(float x)
|
|
{
|
|
return __half_raw{__ocml_cvtrtz_f16_f32(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __float2half_rd(float x)
|
|
{
|
|
return __half_raw{__ocml_cvtrtn_f16_f32(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __float2half_ru(float x)
|
|
{
|
|
return __half_raw{__ocml_cvtrtp_f16_f32(x)};
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __float2half2_rn(float x)
|
|
{
|
|
return __half2{
|
|
_Float16_2{
|
|
static_cast<_Float16>(x), static_cast<_Float16>(x)}};
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __floats2half2_rn(float x, float y)
|
|
{
|
|
return __half2{_Float16_2{
|
|
static_cast<_Float16>(x), static_cast<_Float16>(y)}};
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __float22half2_rn(float2 x)
|
|
{
|
|
return __floats2half2_rn(x.x, x.y);
|
|
}
|
|
|
|
// half | half2 -> float
|
|
inline
|
|
__HOST_DEVICE__
|
|
float __half2float(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
float __low2float(__half2 x)
|
|
{
|
|
return static_cast<__half2_raw>(x).data.x;
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
float __high2float(__half2 x)
|
|
{
|
|
return static_cast<__half2_raw>(x).data.y;
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
float2 __half22float2(__half2 x)
|
|
{
|
|
return make_float2(
|
|
static_cast<__half2_raw>(x).data.x,
|
|
static_cast<__half2_raw>(x).data.y);
|
|
}
|
|
|
|
// half -> int
|
|
inline
|
|
__device__
|
|
int __half2int_rn(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
int __half2int_rz(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
int __half2int_rd(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
int __half2int_ru(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
|
|
// int -> half
|
|
inline
|
|
__device__
|
|
__half __int2half_rn(int x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __int2half_rz(int x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __int2half_rd(int x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __int2half_ru(int x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
|
|
// half -> short
|
|
inline
|
|
__device__
|
|
short __half2short_rn(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
short __half2short_rz(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
short __half2short_rd(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
short __half2short_ru(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
|
|
// short -> half
|
|
inline
|
|
__device__
|
|
__half __short2half_rn(short x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __short2half_rz(short x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __short2half_rd(short x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __short2half_ru(short x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
|
|
// half -> long long
|
|
inline
|
|
__device__
|
|
long long __half2ll_rn(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
long long __half2ll_rz(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
long long __half2ll_rd(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
long long __half2ll_ru(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
|
|
// long long -> half
|
|
inline
|
|
__device__
|
|
__half __ll2half_rn(long long x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __ll2half_rz(long long x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __ll2half_rd(long long x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __ll2half_ru(long long x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
|
|
// half -> unsigned int
|
|
inline
|
|
__device__
|
|
unsigned int __half2uint_rn(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
unsigned int __half2uint_rz(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
unsigned int __half2uint_rd(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
unsigned int __half2uint_ru(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
|
|
// unsigned int -> half
|
|
inline
|
|
__device__
|
|
__half __uint2half_rn(unsigned int x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __uint2half_rz(unsigned int x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __uint2half_rd(unsigned int x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __uint2half_ru(unsigned int x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
|
|
// half -> unsigned short
|
|
inline
|
|
__device__
|
|
unsigned short __half2ushort_rn(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
unsigned short __half2ushort_rz(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
unsigned short __half2ushort_rd(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
unsigned short __half2ushort_ru(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
|
|
// unsigned short -> half
|
|
inline
|
|
__device__
|
|
__half __ushort2half_rn(unsigned short x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __ushort2half_rz(unsigned short x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __ushort2half_rd(unsigned short x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __ushort2half_ru(unsigned short x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
|
|
// half -> unsigned long long
|
|
inline
|
|
__device__
|
|
unsigned long long __half2ull_rn(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
unsigned long long __half2ull_rz(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
unsigned long long __half2ull_rd(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
inline
|
|
__device__
|
|
unsigned long long __half2ull_ru(__half x)
|
|
{
|
|
return static_cast<__half_raw>(x).data;
|
|
}
|
|
|
|
// unsigned long long -> half
|
|
inline
|
|
__device__
|
|
__half __ull2half_rn(unsigned long long x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __ull2half_rz(unsigned long long x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __ull2half_rd(unsigned long long x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __ull2half_ru(unsigned long long x)
|
|
{
|
|
return __half_raw{static_cast<_Float16>(x)};
|
|
}
|
|
|
|
// Load primitives
|
|
inline
|
|
__device__
|
|
__half __ldg(const __half* ptr) { return *ptr; }
|
|
inline
|
|
__device__
|
|
__half __ldcg(const __half* ptr) { return *ptr; }
|
|
inline
|
|
__device__
|
|
__half __ldca(const __half* ptr) { return *ptr; }
|
|
inline
|
|
__device__
|
|
__half __ldcs(const __half* ptr) { return *ptr; }
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __ldg(const __half2* ptr) { return *ptr; }
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __ldcg(const __half2* ptr) { return *ptr; }
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __ldca(const __half2* ptr) { return *ptr; }
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __ldcs(const __half2* ptr) { return *ptr; }
|
|
|
|
// Relations
|
|
inline
|
|
__device__
|
|
bool __heq(__half x, __half y)
|
|
{
|
|
return static_cast<__half_raw>(x).data ==
|
|
static_cast<__half_raw>(y).data;
|
|
}
|
|
inline
|
|
__device__
|
|
bool __hne(__half x, __half y)
|
|
{
|
|
return static_cast<__half_raw>(x).data !=
|
|
static_cast<__half_raw>(y).data;
|
|
}
|
|
inline
|
|
__device__
|
|
bool __hle(__half x, __half y)
|
|
{
|
|
return static_cast<__half_raw>(x).data <=
|
|
static_cast<__half_raw>(y).data;
|
|
}
|
|
inline
|
|
__device__
|
|
bool __hge(__half x, __half y)
|
|
{
|
|
return static_cast<__half_raw>(x).data >=
|
|
static_cast<__half_raw>(y).data;
|
|
}
|
|
inline
|
|
__device__
|
|
bool __hlt(__half x, __half y)
|
|
{
|
|
return static_cast<__half_raw>(x).data <
|
|
static_cast<__half_raw>(y).data;
|
|
}
|
|
inline
|
|
__device__
|
|
bool __hgt(__half x, __half y)
|
|
{
|
|
return static_cast<__half_raw>(x).data >
|
|
static_cast<__half_raw>(y).data;
|
|
}
|
|
inline __device__
|
|
bool __hequ(__half x, __half y) {
|
|
return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data) &&
|
|
!(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
|
|
}
|
|
inline __device__
|
|
bool __hneu(__half x, __half y) {
|
|
return !(static_cast<__half_raw>(x).data == static_cast<__half_raw>(y).data);
|
|
}
|
|
inline __device__
|
|
bool __hleu(__half x, __half y) {
|
|
return !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
|
|
}
|
|
inline
|
|
__device__
|
|
bool __hgeu(__half x, __half y) {
|
|
return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data);
|
|
}
|
|
inline
|
|
__device__
|
|
bool __hltu(__half x, __half y) {
|
|
return !(static_cast<__half_raw>(x).data >= static_cast<__half_raw>(y).data);
|
|
}
|
|
inline
|
|
__device__
|
|
bool __hgtu(__half x, __half y) {
|
|
return !(static_cast<__half_raw>(x).data <= static_cast<__half_raw>(y).data);
|
|
}
|
|
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __heq2(__half2 x, __half2 y)
|
|
{
|
|
auto r = static_cast<__half2_raw>(x).data ==
|
|
static_cast<__half2_raw>(y).data;
|
|
return __builtin_convertvector(-r, _Float16_2);
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __hne2(__half2 x, __half2 y)
|
|
{
|
|
auto r = static_cast<__half2_raw>(x).data !=
|
|
static_cast<__half2_raw>(y).data;
|
|
return __builtin_convertvector(-r, _Float16_2);
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __hle2(__half2 x, __half2 y)
|
|
{
|
|
auto r = static_cast<__half2_raw>(x).data <=
|
|
static_cast<__half2_raw>(y).data;
|
|
return __builtin_convertvector(-r, _Float16_2);
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __hge2(__half2 x, __half2 y)
|
|
{
|
|
auto r = static_cast<__half2_raw>(x).data >=
|
|
static_cast<__half2_raw>(y).data;
|
|
return __builtin_convertvector(-r, _Float16_2);
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __hlt2(__half2 x, __half2 y)
|
|
{
|
|
auto r = static_cast<__half2_raw>(x).data <
|
|
static_cast<__half2_raw>(y).data;
|
|
return __builtin_convertvector(-r, _Float16_2);
|
|
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
__half2 __hgt2(__half2 x, __half2 y)
|
|
{
|
|
auto r = static_cast<__half2_raw>(x).data >
|
|
static_cast<__half2_raw>(y).data;
|
|
return __builtin_convertvector(-r, _Float16_2);
|
|
}
|
|
// Per-lane *unordered* comparisons for __half2.
//
// Each predicate is expressed as the logical negation of the opposite
// ordered comparison(s). Because every ordered comparison involving a NaN
// lane is false, the negation makes NaN lanes compare TRUE here — the
// defining property of the unordered ("u") variants. Results are 1.0/0.0
// per lane, produced by the same mask-negate-convert idiom as the ordered
// versions.

// Unordered equal: true when neither x < y nor x > y (so NaN lanes -> 1.0).
inline __HOST_DEVICE__
__half2 __hequ2(__half2 x, __half2 y) {
    auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data) &&
        !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
    return __builtin_convertvector(-r, _Float16_2);
}

// Unordered not-equal: !(x == y); NaN lanes -> 1.0.
inline
__HOST_DEVICE__
__half2 __hneu2(__half2 x, __half2 y) {
    auto r = !(static_cast<__half2_raw>(x).data == static_cast<__half2_raw>(y).data);
    return __builtin_convertvector(-r, _Float16_2);
}

// Unordered less-or-equal: !(x > y); NaN lanes -> 1.0.
inline
__HOST_DEVICE__
__half2 __hleu2(__half2 x, __half2 y) {
    auto r = !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
    return __builtin_convertvector(-r, _Float16_2);
}

// Unordered greater-or-equal: !(x < y); NaN lanes -> 1.0.
inline
__HOST_DEVICE__
__half2 __hgeu2(__half2 x, __half2 y) {
    auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data);
    return __builtin_convertvector(-r, _Float16_2);
}

// Unordered less-than: !(x >= y); NaN lanes -> 1.0.
inline
__HOST_DEVICE__
__half2 __hltu2(__half2 x, __half2 y) {
    auto r = !(static_cast<__half2_raw>(x).data >= static_cast<__half2_raw>(y).data);
    return __builtin_convertvector(-r, _Float16_2);
}

// Unordered greater-than: !(x <= y); NaN lanes -> 1.0.
inline
__HOST_DEVICE__
__half2 __hgtu2(__half2 x, __half2 y) {
    auto r = !(static_cast<__half2_raw>(x).data <= static_cast<__half2_raw>(y).data);
    return __builtin_convertvector(-r, _Float16_2);
}
|
|
|
|
// Boolean ("both lanes") reductions of the ordered __half2 comparisons.
//
// Each wrapper runs the corresponding per-lane comparison and returns true
// only when the predicate held in BOTH lanes (each lane's result is a
// half-precision 1.0/0.0, so "!= 0" tests the lane flag).

// True iff x == y in both lanes (ordered: any NaN lane makes this false).
inline
__HOST_DEVICE__
bool __hbeq2(__half2 x, __half2 y)
{
    auto r = static_cast<__half2_raw>(__heq2(x, y));
    return r.data.x != 0 && r.data.y != 0;
}

// True iff x != y in both lanes.
inline
__HOST_DEVICE__
bool __hbne2(__half2 x, __half2 y)
{
    auto r = static_cast<__half2_raw>(__hne2(x, y));
    return r.data.x != 0 && r.data.y != 0;
}

// True iff x <= y in both lanes.
inline
__HOST_DEVICE__
bool __hble2(__half2 x, __half2 y)
{
    auto r = static_cast<__half2_raw>(__hle2(x, y));
    return r.data.x != 0 && r.data.y != 0;
}

// True iff x >= y in both lanes.
inline
__HOST_DEVICE__
bool __hbge2(__half2 x, __half2 y)
{
    auto r = static_cast<__half2_raw>(__hge2(x, y));
    return r.data.x != 0 && r.data.y != 0;
}

// True iff x < y in both lanes.
inline
__HOST_DEVICE__
bool __hblt2(__half2 x, __half2 y)
{
    auto r = static_cast<__half2_raw>(__hlt2(x, y));
    return r.data.x != 0 && r.data.y != 0;
}

// True iff x > y in both lanes.
inline
__HOST_DEVICE__
bool __hbgt2(__half2 x, __half2 y)
{
    auto r = static_cast<__half2_raw>(__hgt2(x, y));
    return r.data.x != 0 && r.data.y != 0;
}
|
|
inline
|
|
__HOST_DEVICE__
|
|
bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); }
|
|
inline
|
|
__HOST_DEVICE__
|
|
bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); }
|
|
inline
|
|
__HOST_DEVICE__
|
|
bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); }
|
|
inline
|
|
__HOST_DEVICE__
|
|
bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); }
|
|
inline
|
|
__HOST_DEVICE__
|
|
bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); }
|
|
inline
|
|
__HOST_DEVICE__
|
|
bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); }
|
|
inline
|
|
__device__
|
|
__half __hmax(const __half x, const __half y) {
|
|
return __half_raw{__ocml_fmax_f16(static_cast<__half_raw>(x).data,
|
|
static_cast<__half_raw>(y).data)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __hmax_nan(const __half x, const __half y) {
|
|
if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) {
|
|
return x;
|
|
} else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) {
|
|
return y;
|
|
}
|
|
return __hmax(x, y);
|
|
}
|
|
inline
|
|
__device__
|
|
__half __hmin(const __half x, const __half y) {
|
|
return __half_raw{__ocml_fmin_f16(static_cast<__half_raw>(x).data,
|
|
static_cast<__half_raw>(y).data)};
|
|
}
|
|
inline
|
|
__device__
|
|
__half __hmin_nan(const __half x, const __half y) {
|
|
if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) {
|
|
return x;
|
|
} else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) {
|
|
return y;
|
|
}
|
|
return __hmin(x, y);
|
|
}
|
|
|
|
// Arithmetic
|
|
inline
|
|
__device__
|
|
__half __clamp_01(__half x)
|
|
{
|
|
auto r = static_cast<__half_raw>(x);
|
|
|
|
if (__hlt(x, __half_raw{0})) return __half_raw{0};
|
|
if (__hlt(__half_raw{1}, x)) return __half_raw{1};
|
|
return r;
|
|
}
|
|
|
|
// Scalar __half arithmetic. Each wrapper unwraps its operands to the raw
// _Float16 payload, applies the native operation, and re-wraps the result
// through __half_raw.

// x + y in half precision.
inline
__device__
__half __hadd(__half x, __half y)
{
    return __half_raw{
        static_cast<__half_raw>(x).data +
        static_cast<__half_raw>(y).data};
}

// |x| via the OCML fabs builtin.
inline
__device__
__half __habs(__half x)
{
    return __half_raw{
        __ocml_fabs_f16(static_cast<__half_raw>(x).data)};
}

// x - y in half precision.
inline
__device__
__half __hsub(__half x, __half y)
{
    return __half_raw{
        static_cast<__half_raw>(x).data -
        static_cast<__half_raw>(y).data};
}

// x * y in half precision.
inline
__device__
__half __hmul(__half x, __half y)
{
    return __half_raw{
        static_cast<__half_raw>(x).data *
        static_cast<__half_raw>(y).data};
}
|
|
// Saturating scalar arithmetic: compute the operation, then clamp the
// result to [0, 1] via __clamp_01 (the "_sat" convention of the CUDA-style
// half API these wrappers mirror).

// Saturating x + y.
inline
__device__
__half __hadd_sat(__half x, __half y)
{
    return __clamp_01(__hadd(x, y));
}

// Saturating x - y.
inline
__device__
__half __hsub_sat(__half x, __half y)
{
    return __clamp_01(__hsub(x, y));
}

// Saturating x * y.
inline
__device__
__half __hmul_sat(__half x, __half y)
{
    return __clamp_01(__hmul(x, y));
}
|
|
// Fused multiply-add (x * y) + z via the OCML fma builtin.
inline
__device__
__half __hfma(__half x, __half y, __half z)
{
    return __half_raw{__ocml_fma_f16(
        static_cast<__half_raw>(x).data,
        static_cast<__half_raw>(y).data,
        static_cast<__half_raw>(z).data)};
}

// Fused multiply-add clamped to [0, 1].
inline
__device__
__half __hfma_sat(__half x, __half y, __half z)
{
    return __clamp_01(__hfma(x, y, z));
}

// x / y in half precision.
inline
__device__
__half __hdiv(__half x, __half y)
{
    return __half_raw{
        static_cast<__half_raw>(x).data /
        static_cast<__half_raw>(y).data};
}
|
|
|
|
// Element-wise __half2 arithmetic, using clang's ext-vector operators on
// the raw _Float16_2 payloads (both lanes computed at once).

// Per-lane x + y.
inline
__HOST_DEVICE__
__half2 __hadd2(__half2 x, __half2 y)
{
    return __half2{
        static_cast<__half2_raw>(x).data +
        static_cast<__half2_raw>(y).data};
}

// Per-lane |x| via the OCML 2xf16 fabs builtin.
inline
__HOST_DEVICE__
__half2 __habs2(__half2 x)
{
    return __half2{
        __ocml_fabs_2f16(static_cast<__half2_raw>(x).data)};
}

// Per-lane x - y.
inline
__HOST_DEVICE__
__half2 __hsub2(__half2 x, __half2 y)
{
    return __half2{
        static_cast<__half2_raw>(x).data -
        static_cast<__half2_raw>(y).data};
}

// Per-lane x * y.
inline
__HOST_DEVICE__
__half2 __hmul2(__half2 x, __half2 y)
{
    return __half2{
        static_cast<__half2_raw>(x).data *
        static_cast<__half2_raw>(y).data};
}
|
|
// Saturating __half2 arithmetic: compute the vector operation, then clamp
// each lane independently to [0, 1] with the scalar __clamp_01 helper.

// Saturating per-lane x + y.
inline
__HOST_DEVICE__
__half2 __hadd2_sat(__half2 x, __half2 y)
{
    auto r = static_cast<__half2_raw>(__hadd2(x, y));
    return __half2{
        __clamp_01(__half_raw{r.data.x}),
        __clamp_01(__half_raw{r.data.y})};
}

// Saturating per-lane x - y.
inline
__HOST_DEVICE__
__half2 __hsub2_sat(__half2 x, __half2 y)
{
    auto r = static_cast<__half2_raw>(__hsub2(x, y));
    return __half2{
        __clamp_01(__half_raw{r.data.x}),
        __clamp_01(__half_raw{r.data.y})};
}

// Saturating per-lane x * y.
inline
__HOST_DEVICE__
__half2 __hmul2_sat(__half2 x, __half2 y)
{
    auto r = static_cast<__half2_raw>(__hmul2(x, y));
    return __half2{
        __clamp_01(__half_raw{r.data.x}),
        __clamp_01(__half_raw{r.data.y})};
}
|
|
// Per-lane fused multiply-add (x * y) + z via the OCML 2xf16 fma builtin.
// Note the operands are passed as __half2 directly — presumably converted
// implicitly to the raw vector type the builtin expects (same pattern as
// the h2* math wrappers below).
inline
__HOST_DEVICE__
__half2 __hfma2(__half2 x, __half2 y, __half2 z)
{
    return __half2{__ocml_fma_2f16(x, y, z)};
}

// Per-lane fused multiply-add clamped lane-wise to [0, 1].
inline
__HOST_DEVICE__
__half2 __hfma2_sat(__half2 x, __half2 y, __half2 z)
{
    auto r = static_cast<__half2_raw>(__hfma2(x, y, z));
    return __half2{
        __clamp_01(__half_raw{r.data.x}),
        __clamp_01(__half_raw{r.data.y})};
}

// Per-lane x / y.
inline
__HOST_DEVICE__
__half2 __h2div(__half2 x, __half2 y)
{
    return __half2{
        static_cast<__half2_raw>(x).data /
        static_cast<__half2_raw>(y).data};
}
|
|
|
|
// Math functions
|
|
#if defined(__clang__) && defined(__HIP__)
|
|
inline
|
|
__device__
|
|
float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) {
|
|
return __ockl_fdot2(static_cast<__half2_raw>(a).data,
|
|
static_cast<__half2_raw>(b).data,
|
|
c, saturate);
|
|
}
|
|
#endif
|
|
// Scalar half-precision math functions. Each is a thin wrapper that
// unwraps the __half to its raw _Float16 payload, calls the matching OCML
// f16 builtin, and re-wraps the result.

// Truncate toward zero.
inline
__device__
__half htrunc(__half x)
{
    return __half_raw{
        __ocml_trunc_f16(static_cast<__half_raw>(x).data)};
}

// Round up (ceiling).
inline
__device__
__half hceil(__half x)
{
    return __half_raw{
        __ocml_ceil_f16(static_cast<__half_raw>(x).data)};
}

// Round down (floor).
inline
__device__
__half hfloor(__half x)
{
    return __half_raw{
        __ocml_floor_f16(static_cast<__half_raw>(x).data)};
}

// Round to nearest integral value (OCML rint).
inline
__device__
__half hrint(__half x)
{
    return __half_raw{
        __ocml_rint_f16(static_cast<__half_raw>(x).data)};
}

// sin(x).
inline
__device__
__half hsin(__half x)
{
    return __half_raw{
        __ocml_sin_f16(static_cast<__half_raw>(x).data)};
}

// cos(x).
inline
__device__
__half hcos(__half x)
{
    return __half_raw{
        __ocml_cos_f16(static_cast<__half_raw>(x).data)};
}

// e^x.
inline
__device__
__half hexp(__half x)
{
    return __half_raw{
        __ocml_exp_f16(static_cast<__half_raw>(x).data)};
}

// 2^x.
inline
__device__
__half hexp2(__half x)
{
    return __half_raw{
        __ocml_exp2_f16(static_cast<__half_raw>(x).data)};
}

// 10^x.
inline
__device__
__half hexp10(__half x)
{
    return __half_raw{
        __ocml_exp10_f16(static_cast<__half_raw>(x).data)};
}

// log2(x).
inline
__device__
__half hlog2(__half x)
{
    return __half_raw{
        __ocml_log2_f16(static_cast<__half_raw>(x).data)};
}

// Natural logarithm.
inline
__device__
__half hlog(__half x)
{
    return __half_raw{
        __ocml_log_f16(static_cast<__half_raw>(x).data)};
}

// log10(x).
inline
__device__
__half hlog10(__half x)
{
    return __half_raw{
        __ocml_log10_f16(static_cast<__half_raw>(x).data)};
}

// Reciprocal 1/x, computed with a plain half-precision divide (no OCML
// rcp builtin is used here).
inline
__device__
__half hrcp(__half x)
{
    return __half_raw{
        static_cast<_Float16>(1.0f) / static_cast<__half_raw>(x).data};
}

// 1/sqrt(x) via OCML rsqrt.
inline
__device__
__half hrsqrt(__half x)
{
    return __half_raw{
        __ocml_rsqrt_f16(static_cast<__half_raw>(x).data)};
}

// sqrt(x).
inline
__device__
__half hsqrt(__half x)
{
    return __half_raw{
        __ocml_sqrt_f16(static_cast<__half_raw>(x).data)};
}
|
|
// Scalar classification and negation helpers.

// True iff x is +/- infinity (OCML isinf on the raw payload).
inline
__device__
bool __hisinf(__half x)
{
    return __ocml_isinf_f16(static_cast<__half_raw>(x).data);
}

// True iff x is NaN.
inline
__device__
bool __hisnan(__half x)
{
    return __ocml_isnan_f16(static_cast<__half_raw>(x).data);
}

// Negate x (sign flip via unary minus on the raw payload).
inline
__device__
__half __hneg(__half x)
{
    return __half_raw{-static_cast<__half_raw>(x).data};
}
|
|
|
|
// Vector (two-lane) half-precision math functions: each forwards both
// lanes at once to the matching OCML 2xf16 builtin. The __half2 argument
// is passed directly — presumably via implicit conversion to the raw
// vector type the builtin takes (same convention throughout this family).

// Per-lane truncate toward zero.
inline
__HOST_DEVICE__
__half2 h2trunc(__half2 x)
{
    return __half2{__ocml_trunc_2f16(x)};
}

// Per-lane ceiling.
inline
__HOST_DEVICE__
__half2 h2ceil(__half2 x)
{
    return __half2{__ocml_ceil_2f16(x)};
}

// Per-lane floor.
inline
__HOST_DEVICE__
__half2 h2floor(__half2 x)
{
    return __half2{__ocml_floor_2f16(x)};
}

// Per-lane round to nearest integral value.
inline
__HOST_DEVICE__
__half2 h2rint(__half2 x)
{
    return __half2{__ocml_rint_2f16(x)};
}

// Per-lane sin.
inline
__HOST_DEVICE__
__half2 h2sin(__half2 x)
{
    return __half2{__ocml_sin_2f16(x)};
}

// Per-lane cos.
inline
__HOST_DEVICE__
__half2 h2cos(__half2 x)
{
    return __half2{__ocml_cos_2f16(x)};
}

// Per-lane e^x.
inline
__HOST_DEVICE__
__half2 h2exp(__half2 x)
{
    return __half2{__ocml_exp_2f16(x)};
}

// Per-lane 2^x.
inline
__HOST_DEVICE__
__half2 h2exp2(__half2 x)
{
    return __half2{__ocml_exp2_2f16(x)};
}

// Per-lane 10^x.
inline
__HOST_DEVICE__
__half2 h2exp10(__half2 x)
{
    return __half2{__ocml_exp10_2f16(x)};
}

// Per-lane log2.
inline
__HOST_DEVICE__
__half2 h2log2(__half2 x)
{
    return __half2{__ocml_log2_2f16(x)};
}

// Per-lane natural logarithm.
inline
__HOST_DEVICE__
__half2 h2log(__half2 x) { return __ocml_log_2f16(x); }

// Per-lane log10.
inline
__HOST_DEVICE__
__half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }

// Per-lane reciprocal: a vector divide of {1, 1} by x's raw payload
// (no OCML rcp builtin is used here).
inline
__HOST_DEVICE__
__half2 h2rcp(__half2 x) {
    return _Float16_2{
        _Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data};
}

// Per-lane 1/sqrt(x).
inline
__HOST_DEVICE__
__half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }

// Per-lane sqrt(x).
inline
__HOST_DEVICE__
__half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); }
|
|
// Vector classification and negation for __half2.

// Per-lane isinf: each lane of the result holds the lane's integer
// predicate value converted to half (0 for false, nonzero for true).
inline
__HOST_DEVICE__
__half2 __hisinf2(__half2 x)
{
    auto r = __ocml_isinf_2f16(x);
    return __half2{_Float16_2{
        static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
}

// Per-lane isnan, same result encoding as __hisinf2.
inline
__HOST_DEVICE__
__half2 __hisnan2(__half2 x)
{
    auto r = __ocml_isnan_2f16(x);
    return __half2{_Float16_2{
        static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
}

// Per-lane negation (vector unary minus on the raw payload).
inline
__HOST_DEVICE__
__half2 __hneg2(__half2 x)
{
    return __half2{-static_cast<__half2_raw>(x).data};
}
|
|
} // Anonymous namespace.
|
|
|
|
#if !defined(HIP_NO_HALF)
|
|
using half = __half;
|
|
using half2 = __half2;
|
|
#endif
|
|
// Warp-shuffle overloads for __half and __half2.
//
// Each overload reinterprets the half payload as an int through a union,
// forwards to the int __shfl* primitive, and reads the shuffled bits back
// as the half type. For the 16-bit __half only the low bits of the
// union's int carry data; the remaining bits are indeterminate but are
// discarded when tmp.h is read back after the shuffle.

// Broadcast/read `var` from lane `src_lane` within each group of `width`
// lanes.
__device__
inline
__half __shfl(__half var, int src_lane, int width = warpSize) {
    union { int i; __half h; } tmp; tmp.h = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.h;
}
__device__
inline
__half2 __shfl(__half2 var, int src_lane, int width = warpSize) {
    union { int i; __half2 h; } tmp; tmp.h = var;
    tmp.i = __shfl(tmp.i, src_lane, width);
    return tmp.h;
}

// Read `var` from the lane `lane_delta` below the caller.
__device__
inline
__half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) {
    union { int i; __half h; } tmp; tmp.h = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.h;
}
__device__
inline
__half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) {
    union { int i; __half2 h; } tmp; tmp.h = var;
    tmp.i = __shfl_up(tmp.i, lane_delta, width);
    return tmp.h;
}

// Read `var` from the lane `lane_delta` above the caller.
__device__
inline
__half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) {
    union { int i; __half h; } tmp; tmp.h = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.h;
}
__device__
inline
__half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) {
    union { int i; __half2 h; } tmp; tmp.h = var;
    tmp.i = __shfl_down(tmp.i, lane_delta, width);
    return tmp.h;
}

// Read `var` from the lane whose id is the caller's XOR `lane_mask`.
__device__
inline
__half __shfl_xor(__half var, int lane_mask, int width = warpSize) {
    union { int i; __half h; } tmp; tmp.h = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.h;
}
__device__
inline
__half2 __shfl_xor(__half2 var, int lane_mask, int width = warpSize) {
    union { int i; __half2 h; } tmp; tmp.h = var;
    tmp.i = __shfl_xor(tmp.i, lane_mask, width);
    return tmp.h;
}
|
|
#endif // defined(__cplusplus)
|
|
#elif defined(__GNUC__)
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "hip_fp16_gcc.h"
|
|
#endif
|
|
#endif // !defined(__clang__) && defined(__GNUC__)
|
|
|
|
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
|
|
/*
|
|
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in
|
|
all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
THE SOFTWARE.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include "hip_fp16_math_fwd.h"
|
|
#include "amd_hip_vector_types.h"
|
|
#include "math_fwd.h"
|
|
|
|
#include <hip/amd_detail/host_defines.h>
|
|
|
|
#include <algorithm>
|
|
// assert.h is only for the host version of assert.
|
|
// The device version of assert is implemented in hip/amd_detail/hip_runtime.h.
|
|
// Users should include hip_runtime.h for the device version of assert.
|
|
#if !__HIP_DEVICE_COMPILE__
|
|
#include <assert.h>
|
|
#endif
|
|
#include <limits.h>
|
|
#include <limits>
|
|
#include <stdint.h>
|
|
#endif // !defined(__HIPCC_RTC__)
|
|
|
|
#if _LIBCPP_VERSION && __HIP__
|
|
namespace std {
|
|
template <>
|
|
struct __numeric_type<_Float16>
|
|
{
|
|
static _Float16 __test(_Float16);
|
|
|
|
typedef _Float16 type;
|
|
static const bool value = true;
|
|
};
|
|
}
|
|
#endif // _LIBCPP_VERSION
|
|
|
|
#pragma push_macro("__DEVICE__")
|
|
#pragma push_macro("__RETURN_TYPE")
|
|
|
|
#define __DEVICE__ static __device__
|
|
#define __RETURN_TYPE bool
|
|
|
|
// DOT FUNCTIONS
|
|
#if __HIP_CLANG_ONLY__
|
|
__DEVICE__
|
|
inline
|
|
int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) {
|
|
return __ockl_sdot2(a.data, b.data, c, saturate);
|
|
}
|
|
__DEVICE__
|
|
inline
|
|
uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) {
|
|
return __ockl_udot2(a.data, b.data, c, saturate);
|
|
}
|
|
__DEVICE__
|
|
inline
|
|
int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) {
|
|
return __ockl_sdot4(a.data, b.data, c, saturate);
|
|
}
|
|
__DEVICE__
|
|
inline
|
|
uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) {
|
|
return __ockl_udot4(a.data, b.data, c, saturate);
|
|
}
|
|
__DEVICE__
|
|
inline
|
|
int amd_mixed_dot(int a, int b, int c, bool saturate) {
|
|
return __ockl_sdot8(a, b, c, saturate);
|
|
}
|
|
__DEVICE__
|
|
inline
|
|
uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) {
|
|
return __ockl_udot8(a, b, c, saturate);
|
|
}
|
|
#endif
|
|
|
|
#pragma pop_macro("__DEVICE__")
|
|
#pragma pop_macro("__RETURN_TYPE")
|
|
// For backward compatibility.
|
|
// There are HIP applications e.g. TensorFlow, expecting __HIP_ARCH_* macros
|
|
// defined after including math_functions.h.
|
|
#if !defined(__HIPCC_RTC__)
|
|
#include <hip/amd_detail/amd_hip_runtime.h>
|
|
#endif
|
|
|