#pragma clang diagnostic ignored "-Weverything"
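// Preprocessed hipRTC device header (hipRTC_header.h) from ROCm 6.0.0 with
// clang 17.0.0. The `# <line> "<file>"` markers throughout are preprocessor
// line directives recording which original header each region came from.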
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h"
# 1 "<built-in>" 1
# 1 "<built-in>" 3
# 845 "<built-in>" 3
# 1 "<command line>" 1
# 1 "<built-in>" 2
# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 1 3
# 33 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
extern "C" {
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__attribute__((device)) void __cxa_pure_virtual(void) {
__builtin_trap();
}
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__attribute__((device)) void __cxa_deleted_virtual(void) {
__builtin_trap();
}
}
# 57 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
typedef long unsigned int size_t;
# 74 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
typedef long unsigned int __hip_size_t;
extern "C" {
extern "C" __attribute__((device)) unsigned long long __ockl_dm_alloc(unsigned long long __size);
extern "C" __attribute__((device)) void __ockl_dm_dealloc(unsigned long long __addr);
# 95 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
__attribute__((weak)) inline __attribute__((device)) void *malloc(__hip_size_t __size) {
return (void *) __ockl_dm_alloc(__size);
}
__attribute__((weak)) inline __attribute__((device)) void free(void *__ptr) {
__ockl_dm_dealloc((unsigned long long)__ptr);
}
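// Illustrative use (not part of the original header): any device function may
// heap-allocate through these, e.g.
//   int *__p = (int *)malloc(4 * sizeof(int));
//   /* ... */
//   free(__p);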
# 124 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 3
}
# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_libdevice_declares.h" 1 3
# 14 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_libdevice_declares.h" 3
extern "C" {
__attribute__((device)) __attribute__((const)) float __ocml_acos_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_acosh_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_asin_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_asinh_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_atan2_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_atan_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_atanh_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_cbrt_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_ceil_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_copysign_f32(float, float);
__attribute__((device)) float __ocml_cos_f32(float);
__attribute__((device)) float __ocml_native_cos_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_cosh_f32(float);
__attribute__((device)) float __ocml_cospi_f32(float);
__attribute__((device)) float __ocml_i0_f32(float);
__attribute__((device)) float __ocml_i1_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_erfc_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_erfcinv_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_erfcx_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_erf_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_erfinv_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_exp10_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_native_exp10_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_exp2_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_exp_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_native_exp_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_expm1_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_fabs_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_fdim_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_floor_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_fma_f32(float, float, float);
__attribute__((device)) __attribute__((const)) float __ocml_fmax_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_fmin_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_fmod_f32(float, float);
__attribute__((device)) float __ocml_frexp_f32(float,
__attribute__((address_space(5))) int *);
__attribute__((device)) __attribute__((const)) float __ocml_hypot_f32(float, float);
__attribute__((device)) __attribute__((const)) int __ocml_ilogb_f32(float);
__attribute__((device)) __attribute__((const)) int __ocml_isfinite_f32(float);
__attribute__((device)) __attribute__((const)) int __ocml_isinf_f32(float);
__attribute__((device)) __attribute__((const)) int __ocml_isnan_f32(float);
__attribute__((device)) float __ocml_j0_f32(float);
__attribute__((device)) float __ocml_j1_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_ldexp_f32(float, int);
__attribute__((device)) float __ocml_lgamma_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_log10_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_native_log10_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_log1p_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_log2_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_native_log2_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_logb_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_log_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_native_log_f32(float);
__attribute__((device)) float __ocml_modf_f32(float,
__attribute__((address_space(5))) float *);
__attribute__((device)) __attribute__((const)) float __ocml_nearbyint_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_nextafter_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_len3_f32(float, float, float);
__attribute__((device)) __attribute__((const)) float __ocml_len4_f32(float, float, float,
float);
__attribute__((device)) __attribute__((pure)) float __ocml_ncdf_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_ncdfinv_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_pow_f32(float, float);
__attribute__((device)) __attribute__((pure)) float __ocml_pown_f32(float, int);
__attribute__((device)) __attribute__((pure)) float __ocml_rcbrt_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_remainder_f32(float, float);
__attribute__((device)) float __ocml_remquo_f32(float, float,
__attribute__((address_space(5))) int *);
__attribute__((device)) __attribute__((const)) float __ocml_rhypot_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_rint_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
__attribute__((device)) __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
float);
__attribute__((device)) __attribute__((const)) float __ocml_round_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_rsqrt_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_scalb_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_scalbn_f32(float, int);
__attribute__((device)) __attribute__((const)) int __ocml_signbit_f32(float);
__attribute__((device)) float __ocml_sincos_f32(float,
__attribute__((address_space(5))) float *);
__attribute__((device)) float __ocml_sincospi_f32(float,
__attribute__((address_space(5))) float *);
__attribute__((device)) float __ocml_sin_f32(float);
__attribute__((device)) float __ocml_native_sin_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_sinh_f32(float);
__attribute__((device)) float __ocml_sinpi_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_native_sqrt_f32(float);
__attribute__((device)) float __ocml_tan_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_tanh_f32(float);
__attribute__((device)) float __ocml_tgamma_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_trunc_f32(float);
__attribute__((device)) float __ocml_y0_f32(float);
__attribute__((device)) float __ocml_y1_f32(float);
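// Explicitly rounded arithmetic: _rte = round to nearest even, _rtn = toward
// negative infinity, _rtp = toward positive infinity, _rtz = toward zero.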
__attribute__((device)) __attribute__((const)) float __ocml_add_rte_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_add_rtn_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_add_rtp_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_add_rtz_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_sub_rte_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_mul_rte_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_div_rte_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_div_rtn_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_div_rtp_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_div_rtz_f32(float, float);
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rte_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
__attribute__((device)) __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
__attribute__((device)) __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
__attribute__((device)) __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
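// Inline wrappers over the fast (approximate) AMDGCN hardware builtins.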
__attribute__((device)) inline __attribute__((const)) float
__llvm_amdgcn_cos_f32(float __x) {
return __builtin_amdgcn_cosf(__x);
}
__attribute__((device)) inline __attribute__((const)) float
__llvm_amdgcn_rcp_f32(float __x) {
return __builtin_amdgcn_rcpf(__x);
}
__attribute__((device)) inline __attribute__((const)) float
__llvm_amdgcn_rsq_f32(float __x) {
return __builtin_amdgcn_rsqf(__x);
}
__attribute__((device)) inline __attribute__((const)) float
__llvm_amdgcn_sin_f32(float __x) {
return __builtin_amdgcn_sinf(__x);
}
__attribute__((device)) __attribute__((const)) double __ocml_acos_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_acosh_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_asin_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_asinh_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_atan2_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_atan_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_atanh_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_cbrt_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_ceil_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_copysign_f64(double, double);
__attribute__((device)) double __ocml_cos_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_cosh_f64(double);
__attribute__((device)) double __ocml_cospi_f64(double);
__attribute__((device)) double __ocml_i0_f64(double);
__attribute__((device)) double __ocml_i1_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_erfc_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_erfcinv_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_erfcx_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_erf_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_erfinv_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_exp10_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_exp2_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_exp_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_expm1_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_fabs_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_fdim_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_floor_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_fma_f64(double, double, double);
__attribute__((device)) __attribute__((const)) double __ocml_fmax_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_fmin_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_fmod_f64(double, double);
__attribute__((device)) double __ocml_frexp_f64(double,
__attribute__((address_space(5))) int *);
__attribute__((device)) __attribute__((const)) double __ocml_hypot_f64(double, double);
__attribute__((device)) __attribute__((const)) int __ocml_ilogb_f64(double);
__attribute__((device)) __attribute__((const)) int __ocml_isfinite_f64(double);
__attribute__((device)) __attribute__((const)) int __ocml_isinf_f64(double);
__attribute__((device)) __attribute__((const)) int __ocml_isnan_f64(double);
__attribute__((device)) double __ocml_j0_f64(double);
__attribute__((device)) double __ocml_j1_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_ldexp_f64(double, int);
__attribute__((device)) double __ocml_lgamma_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_log10_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_log1p_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_log2_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_logb_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_log_f64(double);
__attribute__((device)) double __ocml_modf_f64(double,
__attribute__((address_space(5))) double *);
__attribute__((device)) __attribute__((const)) double __ocml_nearbyint_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_nextafter_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_len3_f64(double, double,
double);
__attribute__((device)) __attribute__((const)) double __ocml_len4_f64(double, double, double,
double);
__attribute__((device)) __attribute__((pure)) double __ocml_ncdf_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_ncdfinv_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_pow_f64(double, double);
__attribute__((device)) __attribute__((pure)) double __ocml_pown_f64(double, int);
__attribute__((device)) __attribute__((pure)) double __ocml_rcbrt_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_remainder_f64(double, double);
__attribute__((device)) double __ocml_remquo_f64(double, double,
__attribute__((address_space(5))) int *);
__attribute__((device)) __attribute__((const)) double __ocml_rhypot_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_rint_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_rlen3_f64(double, double,
double);
__attribute__((device)) __attribute__((const)) double __ocml_rlen4_f64(double, double,
double, double);
__attribute__((device)) __attribute__((const)) double __ocml_round_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_rsqrt_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_scalb_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_scalbn_f64(double, int);
__attribute__((device)) __attribute__((const)) int __ocml_signbit_f64(double);
__attribute__((device)) double __ocml_sincos_f64(double,
__attribute__((address_space(5))) double *);
__attribute__((device)) double
__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *);
__attribute__((device)) double __ocml_sin_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_sinh_f64(double);
__attribute__((device)) double __ocml_sinpi_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_f64(double);
__attribute__((device)) double __ocml_tan_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_tanh_f64(double);
__attribute__((device)) double __ocml_tgamma_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_trunc_f64(double);
__attribute__((device)) double __ocml_y0_f64(double);
__attribute__((device)) double __ocml_y1_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_add_rte_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_add_rtn_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_add_rtp_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_add_rtz_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_sub_rte_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_mul_rte_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_div_rte_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_div_rtn_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_div_rtp_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_div_rtz_f64(double, double);
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rte_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_fma_rte_f64(double, double,
double);
__attribute__((device)) __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
double);
__attribute__((device)) __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
double);
__attribute__((device)) __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
double);
__attribute__((device)) inline __attribute__((const)) double
__llvm_amdgcn_rcp_f64(double __x) {
return __builtin_amdgcn_rcp(__x);
}
__attribute__((device)) inline __attribute__((const)) double
__llvm_amdgcn_rsq_f64(double __x) {
return __builtin_amdgcn_rsq(__x);
}
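// Half-precision (_Float16) declarations, plus packed two-element vector
// (__2f16) forms further below.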
__attribute__((device)) __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__attribute__((device)) _Float16 __ocml_cos_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__attribute__((device)) __attribute__((const)) int __ocml_isinf_f16(_Float16);
__attribute__((device)) __attribute__((const)) int __ocml_isnan_f16(_Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
__attribute__((device)) _Float16 __ocml_sin_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
typedef short __2i16 __attribute__((ext_vector_type(2)));
__attribute__((device)) __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
float c, bool s);
__attribute__((device)) __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
__attribute__((device)) __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
__attribute__((device)) __2f16 __ocml_cos_2f16(__2f16);
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
__attribute__((device)) __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
__attribute__((device)) __attribute__((const))
__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
__attribute__((device)) __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
__attribute__((device)) __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
__attribute__((device)) __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
__attribute__((device)) inline __2f16
__llvm_amdgcn_rcp_2f16(__2f16 __x)
{
return (__2f16)(__llvm_amdgcn_rcp_f16(__x.x), __llvm_amdgcn_rcp_f16(__x.y));
}
__attribute__((device)) __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
__attribute__((device)) __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
__attribute__((device)) __2f16 __ocml_sin_2f16(__2f16);
__attribute__((device)) __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
__attribute__((device)) __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
__attribute__((device)) __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16);
}
# 128 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 1 3
# 94 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
long unsigned int __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) {
long unsigned int __r = 0;
while (*__tagp != '\0') {
char __tmp = *__tagp;
if (__tmp >= '0' && __tmp <= '7')
__r = (__r * 8u) + __tmp - '0';
else
return 0;
++__tagp;
}
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
long unsigned int __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) {
long unsigned int __r = 0;
while (*__tagp != '\0') {
char __tmp = *__tagp;
if (__tmp >= '0' && __tmp <= '9')
__r = (__r * 10u) + __tmp - '0';
else
return 0;
++__tagp;
}
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
long unsigned int __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) {
long unsigned int __r = 0;
while (*__tagp != '\0') {
char __tmp = *__tagp;
if (__tmp >= '0' && __tmp <= '9')
__r = (__r * 16u) + __tmp - '0';
else if (__tmp >= 'a' && __tmp <= 'f')
__r = (__r * 16u) + __tmp - 'a' + 10;
else if (__tmp >= 'A' && __tmp <= 'F')
__r = (__r * 16u) + __tmp - 'A' + 10;
else
return 0;
++__tagp;
}
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
long unsigned int __make_mantissa(const char *__tagp __attribute__((nonnull))) {
if (*__tagp == '0') {
++__tagp;
if (*__tagp == 'x' || *__tagp == 'X')
return __make_mantissa_base16(__tagp);
else
return __make_mantissa_base8(__tagp);
}
return __make_mantissa_base10(__tagp);
}
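// Branchless absolute value: __sgn is all ones when __x is negative, so
// (__x ^ __sgn) - __sgn negates negative inputs and leaves others unchanged.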
static __attribute__((device)) inline __attribute__((always_inline))
int abs(int __x) {
int __sgn = __x >> (sizeof(int) * 8 - 1);
return (__x ^ __sgn) - __sgn;
}
static __attribute__((device)) inline __attribute__((always_inline))
long labs(long __x) {
long __sgn = __x >> (sizeof(long) * 8 - 1);
return (__x ^ __sgn) - __sgn;
}
static __attribute__((device)) inline __attribute__((always_inline))
long long llabs(long long __x) {
long long __sgn = __x >> (sizeof(long long) * 8 - 1);
return (__x ^ __sgn) - __sgn;
}
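// Single-precision libm entry points: thin always_inline forwards to the
// corresponding OCML routines.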
static __attribute__((device)) inline __attribute__((always_inline))
float acosf(float __x) { return __ocml_acos_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float acoshf(float __x) { return __ocml_acosh_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float asinf(float __x) { return __ocml_asin_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float asinhf(float __x) { return __ocml_asinh_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float atanf(float __x) { return __ocml_atan_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float atanhf(float __x) { return __ocml_atanh_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float cbrtf(float __x) { return __ocml_cbrt_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float ceilf(float __x) { return __ocml_ceil_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float copysignf(float __x, float __y) { return __ocml_copysign_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float cosf(float __x) { return __ocml_cos_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float coshf(float __x) { return __ocml_cosh_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float cospif(float __x) { return __ocml_cospi_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float erfcf(float __x) { return __ocml_erfc_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float erfcxf(float __x) { return __ocml_erfcx_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float erff(float __x) { return __ocml_erf_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float erfinvf(float __x) { return __ocml_erfinv_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float exp10f(float __x) { return __ocml_exp10_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float exp2f(float __x) { return __ocml_exp2_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float expf(float __x) { return __ocml_exp_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float expm1f(float __x) { return __ocml_expm1_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float fabsf(float __x) { return __builtin_fabsf(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float fdividef(float __x, float __y) { return __x / __y; }
static __attribute__((device)) inline __attribute__((always_inline))
float floorf(float __x) { return __ocml_floor_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float fmaf(float __x, float __y, float __z) {
return __ocml_fma_f32(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
float fmaxf(float __x, float __y) { return __ocml_fmax_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float fminf(float __x, float __y) { return __ocml_fmin_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float frexpf(float __x, int *__nptr) {
int __tmp;
float __r =
__ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
*__nptr = __tmp;
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
int ilogbf(float __x) { return __ocml_ilogb_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __finitef(float __x) { return __ocml_isfinite_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __isinff(float __x) { return __ocml_isinf_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __isnanf(float __x) { return __ocml_isnan_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float j0f(float __x) { return __ocml_j0_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float j1f(float __x) { return __ocml_j1_f32(__x); }
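// Bessel functions of higher order via the forward recurrence
// J_{n+1}(x) = (2n/x) * J_n(x) - J_{n-1}(x), seeded with j0f and j1f.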
static __attribute__((device)) inline __attribute__((always_inline))
float jnf(int __n, float __x) {
if (__n == 0)
return j0f(__x);
if (__n == 1)
return j1f(__x);
float __x0 = j0f(__x);
float __x1 = j1f(__x);
for (int __i = 1; __i < __n; ++__i) {
float __x2 = (2 * __i) / __x * __x1 - __x0;
__x0 = __x1;
__x1 = __x2;
}
return __x1;
}
static __attribute__((device)) inline __attribute__((always_inline))
float ldexpf(float __x, int __e) { return __ocml_ldexp_f32(__x, __e); }
static __attribute__((device)) inline __attribute__((always_inline))
float lgammaf(float __x) { return __ocml_lgamma_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long long int llrintf(float __x) { return __ocml_rint_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long long int llroundf(float __x) { return __ocml_round_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float log10f(float __x) { return __ocml_log10_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float log1pf(float __x) { return __ocml_log1p_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float log2f(float __x) { return __ocml_log2_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float log2fi(int __x) { return __ocml_log2_f32((float) __x); }
static __attribute__((device)) inline __attribute__((always_inline))
float logbf(float __x) { return __ocml_logb_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float logf(float __x) { return __ocml_log_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long int lrintf(float __x) { return __ocml_rint_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long int lroundf(float __x) { return __ocml_round_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float modff(float __x, float *__iptr) {
float __tmp;
float __r =
__ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
*__iptr = __tmp;
return __r;
}
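// Build a quiet NaN from an IEEE-754 single-precision bit layout: sign 0,
// exponent all ones, quiet bit set, and the parsed tag in the low 22
// mantissa bits.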
static __attribute__((device)) inline __attribute__((always_inline))
float nanf(const char *__tagp __attribute__((nonnull))) {
union {
float val;
struct ieee_float {
unsigned int mantissa : 22;
unsigned int quiet : 1;
unsigned int exponent : 8;
unsigned int sign : 1;
} bits;
} __tmp;
static_assert((sizeof(__tmp.val)) == (sizeof(__tmp.bits)), "");
__tmp.bits.sign = 0u;
__tmp.bits.exponent = ~0u;
__tmp.bits.quiet = 1u;
__tmp.bits.mantissa = __make_mantissa(__tagp);
return __tmp.val;
}
static __attribute__((device)) inline __attribute__((always_inline))
float nearbyintf(float __x) { return __ocml_nearbyint_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float nextafterf(float __x, float __y) {
return __ocml_nextafter_f32(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline))
float norm3df(float __x, float __y, float __z) {
return __ocml_len3_f32(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
float norm4df(float __x, float __y, float __z, float __w) {
return __ocml_len4_f32(__x, __y, __z, __w);
}
static __attribute__((device)) inline __attribute__((always_inline))
float normcdff(float __x) { return __ocml_ncdf_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float normf(int __dim,
const float *__a) {
float __r = 0;
while (__dim--) {
__r += __a[0] * __a[0];
++__a;
}
return __ocml_sqrt_f32(__r);
}
static __attribute__((device)) inline __attribute__((always_inline))
float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); }
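// Integer power by binary exponentiation (square-and-multiply); negative
// exponents are unsupported and yield -1.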
static __attribute__((device)) inline __attribute__((always_inline))
int powii(int __base, int __exp) {
if (__exp < 0 )
return -1;
int __result = 1;
for (;;) {
if (__exp & 1)
__result *= __base;
__exp >>= 1;
if (!__exp)
break;
__base *= __base;
}
return __result;
}
static __attribute__((device)) inline __attribute__((always_inline))
float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float remainderf(float __x, float __y) {
return __ocml_remainder_f32(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline))
float remquof(float __x, float __y, int *__quo) {
int __tmp;
float __r = __ocml_remquo_f32(
__x, __y, (__attribute__((address_space(5))) int *)&__tmp);
*__quo = __tmp;
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float rintf(float __x) { return __ocml_rint_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float rnorm3df(float __x, float __y, float __z) {
return __ocml_rlen3_f32(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
float rnorm4df(float __x, float __y, float __z, float __w) {
return __ocml_rlen4_f32(__x, __y, __z, __w);
}
static __attribute__((device)) inline __attribute__((always_inline))
float rnormf(int __dim,
const float *__a) {
float __r = 0;
while (__dim--) {
__r += __a[0] * __a[0];
++__a;
}
return __ocml_rsqrt_f32(__r);
}
static __attribute__((device)) inline __attribute__((always_inline))
float roundf(float __x) { return __ocml_round_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float scalblnf(float __x, long int __n) {
  // Use the int-exponent variant when __n fits in an int; otherwise fall
  // back to the float-exponent form.
  return (__n < 2147483647L) ? __ocml_scalbn_f32(__x, (int)__n)
                             : __ocml_scalb_f32(__x, (float)__n);
}
static __attribute__((device)) inline __attribute__((always_inline))
float scalbnf(float __x, int __n) { return __ocml_scalbn_f32(__x, __n); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __signbitf(float __x) { return __ocml_signbit_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
void sincosf(float __x, float *__sinptr, float *__cosptr) {
float __tmp;
*__sinptr =
__ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
*__cosptr = __tmp;
}
static __attribute__((device)) inline __attribute__((always_inline))
void sincospif(float __x, float *__sinptr, float *__cosptr) {
float __tmp;
*__sinptr = __ocml_sincospi_f32(
__x, (__attribute__((address_space(5))) float *)&__tmp);
*__cosptr = __tmp;
}
static __attribute__((device)) inline __attribute__((always_inline))
float sinf(float __x) { return __ocml_sin_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float sinhf(float __x) { return __ocml_sinh_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float sinpif(float __x) { return __ocml_sinpi_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float sqrtf(float __x) { return __ocml_sqrt_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float tanf(float __x) { return __ocml_tan_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float tanhf(float __x) { return __ocml_tanh_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float tgammaf(float __x) { return __ocml_tgamma_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float truncf(float __x) { return __ocml_trunc_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float y0f(float __x) { return __ocml_y0_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float y1f(float __x) { return __ocml_y1_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float ynf(int __n, float __x) {
if (__n == 0)
return y0f(__x);
if (__n == 1)
return y1f(__x);
float __x0 = y0f(__x);
float __x1 = y1f(__x);
for (int __i = 1; __i < __n; ++__i) {
float __x2 = (2 * __i) / __x * __x1 - __x0;
__x0 = __x1;
__x1 = __x2;
}
return __x1;
}
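// The __-prefixed intrinsics below trade accuracy for speed, forwarding to
// "native" OCML approximations or plain IEEE arithmetic.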
static __attribute__((device)) inline __attribute__((always_inline))
float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __expf(float __x) { return __ocml_native_exp_f32(__x); }
# 627 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fadd_rn(float __x, float __y) { return __x + __y; }
# 641 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fdiv_rn(float __x, float __y) { return __x / __y; }
static __attribute__((device)) inline __attribute__((always_inline))
float __fdividef(float __x, float __y) { return __x / __y; }
# 666 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fmaf_rn(float __x, float __y, float __z) {
return __ocml_fma_f32(__x, __y, __z);
}
# 682 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fmul_rn(float __x, float __y) { return __x * __y; }
# 696 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __frcp_rn(float __x) { return 1.0f / __x; }
static __attribute__((device)) inline __attribute__((always_inline))
float __frsqrt_rn(float __x) { return __llvm_amdgcn_rsq_f32(__x); }
# 713 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
# 727 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
float __fsub_rn(float __x, float __y) { return __x - __y; }
static __attribute__((device)) inline __attribute__((always_inline))
float __log10f(float __x) { return __ocml_native_log10_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __log2f(float __x) { return __ocml_native_log2_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __logf(float __x) { return __ocml_native_log_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); }
static __attribute__((device)) inline __attribute__((always_inline))
void __sincosf(float __x, float *__sinptr, float *__cosptr) {
*__sinptr = __ocml_native_sin_f32(__x);
*__cosptr = __ocml_native_cos_f32(__x);
}
static __attribute__((device)) inline __attribute__((always_inline))
float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
float __tanf(float __x) { return __ocml_tan_f32(__x); }
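// Double-precision libm entry points, mirroring the single-precision
// wrappers above.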
static __attribute__((device)) inline __attribute__((always_inline))
double acos(double __x) { return __ocml_acos_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double acosh(double __x) { return __ocml_acosh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double asin(double __x) { return __ocml_asin_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double asinh(double __x) { return __ocml_asinh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double atan(double __x) { return __ocml_atan_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double atanh(double __x) { return __ocml_atanh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double cbrt(double __x) { return __ocml_cbrt_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double ceil(double __x) { return __ocml_ceil_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double copysign(double __x, double __y) {
return __ocml_copysign_f64(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline))
double cos(double __x) { return __ocml_cos_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double cosh(double __x) { return __ocml_cosh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double cospi(double __x) { return __ocml_cospi_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double erf(double __x) { return __ocml_erf_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double erfc(double __x) { return __ocml_erfc_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double erfcx(double __x) { return __ocml_erfcx_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double erfinv(double __x) { return __ocml_erfinv_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double exp(double __x) { return __ocml_exp_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double exp10(double __x) { return __ocml_exp10_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double exp2(double __x) { return __ocml_exp2_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double expm1(double __x) { return __ocml_expm1_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double fabs(double __x) { return __builtin_fabs(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double floor(double __x) { return __ocml_floor_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double fma(double __x, double __y, double __z) {
return __ocml_fma_f64(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
double fmax(double __x, double __y) { return __ocml_fmax_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double fmin(double __x, double __y) { return __ocml_fmin_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double frexp(double __x, int *__nptr) {
int __tmp;
double __r =
__ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
*__nptr = __tmp;
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
int ilogb(double __x) { return __ocml_ilogb_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __finite(double __x) { return __ocml_isfinite_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __isinf(double __x) { return __ocml_isinf_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __isnan(double __x) { return __ocml_isnan_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double j0(double __x) { return __ocml_j0_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double j1(double __x) { return __ocml_j1_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double jn(int __n, double __x) {
if (__n == 0)
return j0(__x);
if (__n == 1)
return j1(__x);
double __x0 = j0(__x);
double __x1 = j1(__x);
for (int __i = 1; __i < __n; ++__i) {
double __x2 = (2 * __i) / __x * __x1 - __x0;
__x0 = __x1;
__x1 = __x2;
}
return __x1;
}
static __attribute__((device)) inline __attribute__((always_inline))
double ldexp(double __x, int __e) { return __ocml_ldexp_f64(__x, __e); }
static __attribute__((device)) inline __attribute__((always_inline))
double lgamma(double __x) { return __ocml_lgamma_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long long int llrint(double __x) { return __ocml_rint_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long long int llround(double __x) { return __ocml_round_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double log(double __x) { return __ocml_log_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double log10(double __x) { return __ocml_log10_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double log1p(double __x) { return __ocml_log1p_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double log2(double __x) { return __ocml_log2_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double logb(double __x) { return __ocml_logb_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long int lrint(double __x) { return __ocml_rint_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
long int lround(double __x) { return __ocml_round_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double modf(double __x, double *__iptr) {
double __tmp;
double __r =
__ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
*__iptr = __tmp;
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
double nan(const char *__tagp) {
union {
double val;
struct ieee_double {
long unsigned int mantissa : 51;
unsigned int quiet : 1;
unsigned int exponent : 11;
unsigned int sign : 1;
} bits;
} __tmp;
static_assert((sizeof(__tmp.val)) == (sizeof(__tmp.bits)), "");
__tmp.bits.sign = 0u;
__tmp.bits.exponent = ~0u;
__tmp.bits.quiet = 1u;
__tmp.bits.mantissa = __make_mantissa(__tagp);
return __tmp.val;
}
static __attribute__((device)) inline __attribute__((always_inline))
double nearbyint(double __x) { return __ocml_nearbyint_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double nextafter(double __x, double __y) {
return __ocml_nextafter_f64(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline))
double norm(int __dim,
const double *__a) {
double __r = 0;
while (__dim--) {
__r += __a[0] * __a[0];
++__a;
}
return __ocml_sqrt_f64(__r);
}
static __attribute__((device)) inline __attribute__((always_inline))
double norm3d(double __x, double __y, double __z) {
return __ocml_len3_f64(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
double norm4d(double __x, double __y, double __z, double __w) {
return __ocml_len4_f64(__x, __y, __z, __w);
}
static __attribute__((device)) inline __attribute__((always_inline))
double normcdf(double __x) { return __ocml_ncdf_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double remainder(double __x, double __y) {
return __ocml_remainder_f64(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline))
double remquo(double __x, double __y, int *__quo) {
int __tmp;
double __r = __ocml_remquo_f64(
__x, __y, (__attribute__((address_space(5))) int *)&__tmp);
*__quo = __tmp;
return __r;
}
static __attribute__((device)) inline __attribute__((always_inline))
double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double rint(double __x) { return __ocml_rint_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double rnorm(int __dim,
const double *__a) {
double __r = 0;
while (__dim--) {
__r += __a[0] * __a[0];
++__a;
}
return __ocml_rsqrt_f64(__r);
}
static __attribute__((device)) inline __attribute__((always_inline))
double rnorm3d(double __x, double __y, double __z) {
return __ocml_rlen3_f64(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline))
double rnorm4d(double __x, double __y, double __z, double __w) {
return __ocml_rlen4_f64(__x, __y, __z, __w);
}
static __attribute__((device)) inline __attribute__((always_inline))
double round(double __x) { return __ocml_round_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double scalbln(double __x, long int __n) {
  // Use the int-exponent variant when __n fits in an int; otherwise fall
  // back to the double-exponent form.
  return (__n < 2147483647L) ? __ocml_scalbn_f64(__x, (int)__n)
                             : __ocml_scalb_f64(__x, (double)__n);
}
static __attribute__((device)) inline __attribute__((always_inline))
double scalbn(double __x, int __n) { return __ocml_scalbn_f64(__x, __n); }
static __attribute__((device)) inline __attribute__((always_inline))
bool __signbit(double __x) { return __ocml_signbit_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double sin(double __x) { return __ocml_sin_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
void sincos(double __x, double *__sinptr, double *__cosptr) {
double __tmp;
*__sinptr = __ocml_sincos_f64(
__x, (__attribute__((address_space(5))) double *)&__tmp);
*__cosptr = __tmp;
}
static __attribute__((device)) inline __attribute__((always_inline))
void sincospi(double __x, double *__sinptr, double *__cosptr) {
double __tmp;
*__sinptr = __ocml_sincospi_f64(
__x, (__attribute__((address_space(5))) double *)&__tmp);
*__cosptr = __tmp;
}
static __attribute__((device)) inline __attribute__((always_inline))
double sinh(double __x) { return __ocml_sinh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double sinpi(double __x) { return __ocml_sinpi_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double sqrt(double __x) { return __ocml_sqrt_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double tan(double __x) { return __ocml_tan_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double tanh(double __x) { return __ocml_tanh_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double tgamma(double __x) { return __ocml_tgamma_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double trunc(double __x) { return __ocml_trunc_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double y0(double __x) { return __ocml_y0_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double y1(double __x) { return __ocml_y1_f64(__x); }
static __attribute__((device)) inline __attribute__((always_inline))
double yn(int __n, double __x) {
if (__n == 0)
return y0(__x);
if (__n == 1)
return y1(__x);
double __x0 = y0(__x);
double __x1 = y1(__x);
for (int __i = 1; __i < __n; ++__i) {
double __x2 = (2 * __i) / __x * __x1 - __x0;
__x0 = __x1;
__x1 = __x2;
}
return __x1;
}
# 1190 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
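// The __d*_rn ("round to nearest even") intrinsics lower to the plain IEEE
// operations, whose default rounding mode on AMDGPU is already
// round-to-nearest-even.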
static __attribute__((device)) inline __attribute__((always_inline))
double __dadd_rn(double __x, double __y) { return __x + __y; }
# 1212 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
double __ddiv_rn(double __x, double __y) { return __x / __y; }
# 1234 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
double __dmul_rn(double __x, double __y) { return __x * __y; }
# 1248 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
double __drcp_rn(double __x) { return 1.0 / __x; }
# 1262 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
# 1284 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
double __dsub_rn(double __x, double __y) { return __x - __y; }
# 1306 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
static __attribute__((device)) inline __attribute__((always_inline))
double __fma_rn(double __x, double __y, double __z) {
return __ocml_fma_f64(__x, __y, __z);
}
# 1325 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_math.h" 3
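// Generic min/max plus exact-type overloads; the float/double overloads
// forward to fminf/fmaxf and fmin/fmax so NaN handling follows the C
// library's fmin/fmax semantics.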
template <class T> static __attribute__((device)) inline __attribute__((always_inline)) T min(T __arg1, T __arg2) {
return (__arg1 < __arg2) ? __arg1 : __arg2;
}
template <class T> static __attribute__((device)) inline __attribute__((always_inline)) T max(T __arg1, T __arg2) {
return (__arg1 > __arg2) ? __arg1 : __arg2;
}
static __attribute__((device)) inline __attribute__((always_inline)) int min(int __arg1, int __arg2) {
return (__arg1 < __arg2) ? __arg1 : __arg2;
}
static __attribute__((device)) inline __attribute__((always_inline)) int max(int __arg1, int __arg2) {
return (__arg1 > __arg2) ? __arg1 : __arg2;
}
static __attribute__((device)) inline __attribute__((always_inline))
float max(float __x, float __y) { return fmaxf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double max(double __x, double __y) { return fmax(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
float min(float __x, float __y) { return fminf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline))
double min(double __x, double __y) { return fmin(__x, __y); }
# 129 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_stdlib.h" 1 3
# 130 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
# 1 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 1 3
# 41 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
static __attribute__((device)) inline __attribute__((always_inline)) double abs(double __x) { return ::fabs(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float abs(float __x) { return ::fabsf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) long long abs(long long __n) { return ::llabs(__n); }
static __attribute__((device)) inline __attribute__((always_inline)) long abs(long __n) { return ::labs(__n); }
static __attribute__((device)) inline __attribute__((always_inline)) float fma(float __x, float __y, float __z) {
return ::fmaf(__x, __y, __z);
}
# 61 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
static __attribute__((device)) inline __attribute__((always_inline)) float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
# 93 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
static __attribute__((device)) inline __attribute__((always_inline)) bool isinf(float __x) { return ::__isinff(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) bool isinf(double __x) { return ::__isinf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) bool isfinite(float __x) { return ::__finitef(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) bool isfinite(double __x) { return ::__finite(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) bool isnan(float __x) { return ::__isnanf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) bool isnan(double __x) { return ::__isnan(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool isgreater(double __x, double __y) {
return __builtin_isgreater(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool isgreaterequal(float __x, float __y) {
return __builtin_isgreaterequal(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool isgreaterequal(double __x, double __y) {
return __builtin_isgreaterequal(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool isless(float __x, float __y) {
return __builtin_isless(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool isless(double __x, double __y) {
return __builtin_isless(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool islessequal(float __x, float __y) {
return __builtin_islessequal(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool islessequal(double __x, double __y) {
return __builtin_islessequal(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool islessgreater(float __x, float __y) {
return __builtin_islessgreater(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool islessgreater(double __x, double __y) {
return __builtin_islessgreater(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool isnormal(float __x) {
return __builtin_isnormal(__x);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool isnormal(double __x) {
return __builtin_isnormal(__x);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool isunordered(float __x, float __y) {
return __builtin_isunordered(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool isunordered(double __x, double __y) {
return __builtin_isunordered(__x, __y);
}
static __attribute__((device)) inline __attribute__((always_inline)) float modf(float __x, float *__iptr) {
return ::modff(__x, __iptr);
}
static __attribute__((device)) inline __attribute__((always_inline)) float pow(float __base, int __iexp) {
return ::powif(__base, __iexp);
}
static __attribute__((device)) inline __attribute__((always_inline)) double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
static __attribute__((device)) inline __attribute__((always_inline)) float remquo(float __x, float __y, int *__quo) {
return ::remquof(__x, __y, __quo);
}
static __attribute__((device)) inline __attribute__((always_inline)) float scalbln(float __x, long int __n) {
return ::scalblnf(__x, __n);
}
static __attribute__((device)) inline __attribute__((always_inline)) bool signbit(float __x) { return ::__signbitf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) bool signbit(double __x) { return ::__signbit(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) _Float16 fma(_Float16 __x, _Float16 __y,
_Float16 __z) {
return __ocml_fma_f16(__x, __y, __z);
}
static __attribute__((device)) inline __attribute__((always_inline)) _Float16 pow(_Float16 __base, int __iexp) {
return __ocml_pown_f16(__base, __iexp);
}
# 202 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
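// Float overloads of the double-named <cmath> functions, each forwarding to
// the corresponding *f entry point.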
static __attribute__((device)) inline __attribute__((always_inline)) float acos(float __x) { return acosf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float acosh(float __x) { return acoshf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float asin(float __x) { return asinf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float asinh(float __x) { return asinhf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float atan(float __x) { return atanf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float atan2(float __x, float __y) { return atan2f(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float atanh(float __x) { return atanhf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float cbrt(float __x) { return cbrtf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float ceil(float __x) { return ceilf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float copysign(float __x, float __y) { return copysignf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float cos(float __x) { return cosf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float cosh(float __x) { return coshf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float erf(float __x) { return erff(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float erfc(float __x) { return erfcf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float exp(float __x) { return expf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float exp2(float __x) { return exp2f(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float expm1(float __x) { return expm1f(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float fabs(float __x) { return fabsf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float fdim(float __x, float __y) { return fdimf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float floor(float __x) { return floorf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float fmax(float __x, float __y) { return fmaxf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float fmin(float __x, float __y) { return fminf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float fmod(float __x, float __y) { return fmodf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float hypot(float __x, float __y) { return hypotf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) int ilogb(float __x) { return ilogbf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float ldexp(float __x, int __y) { return ldexpf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float lgamma(float __x) { return lgammaf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float log(float __x) { return logf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float log10(float __x) { return log10f(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float log1p(float __x) { return log1pf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float log2(float __x) { return log2f(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float logb(float __x) { return logbf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) long long llrint(float __x) { return llrintf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) long long llround(float __x) { return llroundf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) long lrint(float __x) { return lrintf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) long lround(float __x) { return lroundf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float nearbyint(float __x) { return nearbyintf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float nextafter(float __x, float __y) { return nextafterf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float pow(float __x, float __y) { return powf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float remainder(float __x, float __y) { return remainderf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float rint(float __x) { return rintf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float round(float __x) { return roundf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float scalbn(float __x, int __y) { return scalbnf(__x, __y); }
static __attribute__((device)) inline __attribute__((always_inline)) float sin(float __x) { return sinf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float sinh(float __x) { return sinhf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float sqrt(float __x) { return sqrtf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float tan(float __x) { return tanf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float tanh(float __x) { return tanhf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float tgamma(float __x) { return tgammaf(__x); }
static __attribute__((device)) inline __attribute__((always_inline)) float trunc(float __x) { return truncf(__x); }
# 265 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
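// Freestanding SFINAE machinery standing in for <type_traits>, which is not
// available under hipRTC: __hip_enable_if plus minimal is_integral /
// is_arithmetic traits and a libc++-style __promote.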
template <bool __B, class __T = void> struct __hip_enable_if {};
template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
namespace __hip {
template <class _Tp> struct is_integral {
enum { value = 0 };
};
template <> struct is_integral<bool> {
enum { value = 1 };
};
template <> struct is_integral<char> {
enum { value = 1 };
};
template <> struct is_integral<signed char> {
enum { value = 1 };
};
template <> struct is_integral<unsigned char> {
enum { value = 1 };
};
template <> struct is_integral<wchar_t> {
enum { value = 1 };
};
template <> struct is_integral<short> {
enum { value = 1 };
};
template <> struct is_integral<unsigned short> {
enum { value = 1 };
};
template <> struct is_integral<int> {
enum { value = 1 };
};
template <> struct is_integral<unsigned int> {
enum { value = 1 };
};
template <> struct is_integral<long> {
enum { value = 1 };
};
template <> struct is_integral<unsigned long> {
enum { value = 1 };
};
template <> struct is_integral<long long> {
enum { value = 1 };
};
template <> struct is_integral<unsigned long long> {
enum { value = 1 };
};
template <class _Tp> struct is_arithmetic {
enum { value = 0 };
};
template <> struct is_arithmetic<bool> {
enum { value = 1 };
};
template <> struct is_arithmetic<char> {
enum { value = 1 };
};
template <> struct is_arithmetic<signed char> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned char> {
enum { value = 1 };
};
template <> struct is_arithmetic<wchar_t> {
enum { value = 1 };
};
template <> struct is_arithmetic<short> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned short> {
enum { value = 1 };
};
template <> struct is_arithmetic<int> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned int> {
enum { value = 1 };
};
template <> struct is_arithmetic<long> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long> {
enum { value = 1 };
};
template <> struct is_arithmetic<long long> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long long> {
enum { value = 1 };
};
template <> struct is_arithmetic<float> {
enum { value = 1 };
};
template <> struct is_arithmetic<double> {
enum { value = 1 };
};
struct true_type {
static const __attribute__((constant)) bool value = true;
};
struct false_type {
static const __attribute__((constant)) bool value = false;
};
template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
template <typename __T> struct add_rvalue_reference { typedef __T &&type; };
template <typename __T> typename add_rvalue_reference<__T>::type declval();
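// __numeric_type: overload resolution on __test yields the type an argument
// promotes to (integral types -> double, float stays float); __promote then
// folds up to three such types with decltype(t1() + t2() + t3()).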
template <class _Tp> struct __numeric_type {
static void __test(...);
static _Float16 __test(_Float16);
static float __test(float);
static double __test(char);
static double __test(int);
static double __test(unsigned);
static double __test(long);
static double __test(unsigned long);
static double __test(long long);
static double __test(unsigned long long);
static double __test(double);
static double __test(long double);
typedef decltype(__test(declval<_Tp>())) type;
static const bool value = !is_same<type, void>::value;
};
template <> struct __numeric_type<void> { static const bool value = true; };
template <class _A1, class _A2 = void, class _A3 = void,
bool = __numeric_type<_A1>::value &&__numeric_type<_A2>::value
&&__numeric_type<_A3>::value>
class __promote_imp {
public:
static const bool value = false;
};
template <class _A1, class _A2, class _A3>
class __promote_imp<_A1, _A2, _A3, true> {
private:
typedef typename __promote_imp<_A1>::type __type1;
typedef typename __promote_imp<_A2>::type __type2;
typedef typename __promote_imp<_A3>::type __type3;
public:
typedef decltype(__type1() + __type2() + __type3()) type;
static const bool value = true;
};
template <class _A1, class _A2> class __promote_imp<_A1, _A2, void, true> {
private:
typedef typename __promote_imp<_A1>::type __type1;
typedef typename __promote_imp<_A2>::type __type2;
public:
typedef decltype(__type1() + __type2()) type;
static const bool value = true;
};
template <class _A1> class __promote_imp<_A1, void, void, true> {
public:
typedef typename __numeric_type<_A1>::type type;
static const bool value = true;
};
template <class _A1, class _A2 = void, class _A3 = void>
class __promote : public __promote_imp<_A1, _A2, _A3> {};
}
# 478 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
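// Macro-expanded overload families: unary math functions accepting any
// integral argument (converted to double), and binary functions accepting
// mixed arithmetic types (evaluated in the promoted type).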
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type acos(__T __x) { return ::acos((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type acosh(__T __x) { return ::acosh((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type asin(__T __x) { return ::asin((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type asinh(__T __x) { return ::asinh((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type atan(__T __x) { return ::atan((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type atan2(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return atan2((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type atanh(__T __x) { return ::atanh((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type cbrt(__T __x) { return ::cbrt((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type ceil(__T __x) { return ::ceil((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type copysign(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return copysign((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type cos(__T __x) { return ::cos((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type cosh(__T __x) { return ::cosh((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type erf(__T __x) { return ::erf((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type erfc(__T __x) { return ::erfc((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type exp(__T __x) { return ::exp((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type exp2(__T __x) { return ::exp2((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type expm1(__T __x) { return ::expm1((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type fabs(__T __x) { return ::fabs((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fdim(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fdim((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type floor(__T __x) { return ::floor((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fmax(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fmax((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fmin(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fmin((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type fmod(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return fmod((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type hypot(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return hypot((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, int>::type ilogb(__T __x) { return ::ilogb((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isfinite(__T __x) { return ::isfinite((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isgreater(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isgreater((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isgreaterequal(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isgreaterequal((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isinf(__T __x) { return ::isinf((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isless(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isless((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type islessequal(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return islessequal((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type islessgreater(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return islessgreater((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isnan(__T __x) { return ::isnan((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type isnormal(__T __x) { return ::isnormal((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type isunordered(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return isunordered((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type lgamma(__T __x) { return ::lgamma((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log(__T __x) { return ::log((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log10(__T __x) { return ::log10((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log1p(__T __x) { return ::log1p((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type log2(__T __x) { return ::log2((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type logb(__T __x) { return ::logb((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long long>::type llrint(__T __x) { return ::llrint((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long long>::type llround(__T __x) { return ::llround((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long>::type lrint(__T __x) { return ::lrint((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, long>::type lround(__T __x) { return ::lround((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type nearbyint(__T __x) { return ::nearbyint((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type nextafter(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return nextafter((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type pow(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return pow((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type remainder(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return remainder((__result_type)__x, (__result_type)__y); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type rint(__T __x) { return ::rint((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type round(__T __x) { return ::round((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, bool>::type signbit(__T __x) { return ::signbit((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type sin(__T __x) { return ::sin((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type sinh(__T __x) { return ::sinh((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type sqrt(__T __x) { return ::sqrt((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type tan(__T __x) { return ::tan((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type tanh(__T __x) { return ::tanh((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type tgamma(__T __x) { return ::tgamma((double)__x); }
template <typename __T> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type trunc(__T __x) { return ::trunc((double)__x); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type max(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return max((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2> static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if< __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, typename __hip::__promote<__T1, __T2>::type>::type min(__T1 __x, __T2 __y) { typedef typename __hip::__promote<__T1, __T2>::type __result_type; return min((__result_type)__x, (__result_type)__y); }
template <typename __T1, typename __T2, typename __T3>
static __attribute__((device)) inline __attribute__((always_inline)) typename __hip_enable_if<
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
__hip::is_arithmetic<__T3>::value,
typename __hip::__promote<__T1, __T2, __T3>::type>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z);
}
# 568 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
template <typename __T>
static __attribute__((device)) inline __attribute__((always_inline))
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
frexp(__T __x, int *__exp) {
return ::frexp((double)__x, __exp);
}
template <typename __T>
static __attribute__((device)) inline __attribute__((always_inline))
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
ldexp(__T __x, int __exp) {
return ::ldexp((double)__x, __exp);
}
template <typename __T>
static __attribute__((device)) inline __attribute__((always_inline))
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
modf(__T __x, double *__exp) {
return ::modf((double)__x, __exp);
}
template <typename __T1, typename __T2>
static __attribute__((device)) inline __attribute__((always_inline))
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value,
typename __hip::__promote<__T1, __T2>::type>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
typedef typename __hip::__promote<__T1, __T2>::type __result_type;
return ::remquo((__result_type)__x, (__result_type)__y, __quo);
}
# 610 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_cmath.h" 3
template <typename __T>
static __attribute__((device)) inline __attribute__((always_inline))
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
scalbln(__T __x, long int __exp) {
return ::scalbln((double)__x, __exp);
}
template <typename __T>
static __attribute__((device)) inline __attribute__((always_inline))
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
scalbn(__T __x, int __exp) {
return ::scalbn((double)__x, __exp);
}
# 133 "/opt/rocm-6.0.0/lib/llvm/lib/clang/17.0.0/include/__clang_hip_runtime_wrapper.h" 2 3
# 2 "<built-in>" 2
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 1 3
# 58 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 3
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/include/hip/hip_version.h" 1 3
# 59 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 1 3
# 27 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 3
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
# 97 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_common.h" 3
#pragma clang diagnostic pop
# 60 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 1 3
# 32 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_common.h" 1 3
# 33 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 2 3
# 43 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
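// Build-identification hooks; these appear to be the symbols queried by the
// ROCm debugger tooling (rocgdb / amd-dbgapi).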
extern "C" {
# 54 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
const char* amd_dbgapi_get_build_name();
# 63 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
const char* amd_dbgapi_get_git_hash();
# 72 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
size_t amd_dbgapi_get_build_id();
}
# 92 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
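// Fixed-width typedefs for hipRTC; note that (u)int64_t is (unsigned) long
// long here regardless of the host's LP64 convention.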
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed int int32_t;
typedef signed long long int64_t;
namespace std {
using ::uint32_t;
using ::uint64_t;
using ::int32_t;
using ::int64_t;
}
# 124 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 1 3
# 27 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 3
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 1 3
# 31 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/host_defines.h" 1 3
# 38 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/host_defines.h" 3
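// Minimal replacements for <cstdint> and <type_traits>, scoped inside
// __hip_internal so they cannot collide with a real standard library.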
namespace __hip_internal {
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef signed long long int64_t;
template <class _Tp, _Tp __v> struct integral_constant {
static constexpr const _Tp value = __v;
typedef _Tp value_type;
typedef integral_constant type;
constexpr operator value_type() const { return value; }
constexpr value_type operator()() const { return value; }
};
template <class _Tp, _Tp __v> constexpr const _Tp integral_constant<_Tp, __v>::value;
typedef integral_constant<bool, true> true_type;
typedef integral_constant<bool, false> false_type;
template <bool B> using bool_constant = integral_constant<bool, B>;
typedef bool_constant<true> true_type;
typedef bool_constant<false> false_type;
template <bool __B, class __T = void> struct enable_if {};
template <class __T> struct enable_if<true, __T> { typedef __T type; };
template<bool _B> struct true_or_false_type : public false_type {};
template<> struct true_or_false_type<true> : public true_type {};
template <class _Tp> struct is_integral : public false_type {};
template <> struct is_integral<bool> : public true_type {};
template <> struct is_integral<char> : public true_type {};
template <> struct is_integral<signed char> : public true_type {};
template <> struct is_integral<unsigned char> : public true_type {};
template <> struct is_integral<wchar_t> : public true_type {};
template <> struct is_integral<short> : public true_type {};
template <> struct is_integral<unsigned short> : public true_type {};
template <> struct is_integral<int> : public true_type {};
template <> struct is_integral<unsigned int> : public true_type {};
template <> struct is_integral<long> : public true_type {};
template <> struct is_integral<unsigned long> : public true_type {};
template <> struct is_integral<long long> : public true_type {};
template <> struct is_integral<unsigned long long> : public true_type {};
template <class _Tp> struct is_arithmetic : public false_type {};
template <> struct is_arithmetic<bool> : public true_type {};
template <> struct is_arithmetic<char> : public true_type {};
template <> struct is_arithmetic<signed char> : public true_type {};
template <> struct is_arithmetic<unsigned char> : public true_type {};
template <> struct is_arithmetic<wchar_t> : public true_type {};
template <> struct is_arithmetic<short> : public true_type {};
template <> struct is_arithmetic<unsigned short> : public true_type {};
template <> struct is_arithmetic<int> : public true_type {};
template <> struct is_arithmetic<unsigned int> : public true_type {};
template <> struct is_arithmetic<long> : public true_type {};
template <> struct is_arithmetic<unsigned long> : public true_type {};
template <> struct is_arithmetic<long long> : public true_type {};
template <> struct is_arithmetic<unsigned long long> : public true_type {};
template <> struct is_arithmetic<float> : public true_type {};
template <> struct is_arithmetic<double> : public true_type {};
template<typename _Tp> struct is_floating_point : public false_type {};
template<> struct is_floating_point<float> : public true_type {};
template<> struct is_floating_point<double> : public true_type {};
template<> struct is_floating_point<long double> : public true_type {};
template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
template<typename _Tp, bool = is_arithmetic<_Tp>::value>
struct is_signed : public false_type {};
template<typename _Tp>
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};
template<typename _CharT> struct char_traits;
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
typedef basic_istream<char> istream;
typedef basic_ostream<char> ostream;
template<typename _Tp>
struct is_standard_layout
: public integral_constant<bool, __is_standard_layout(_Tp)>
{ };
template<typename _Tp>
struct is_trivial
: public integral_constant<bool, __is_trivial(_Tp)>
{ };
}
typedef __hip_internal::uint8_t __hip_uint8_t;
typedef __hip_internal::uint16_t __hip_uint16_t;
typedef __hip_internal::uint32_t __hip_uint32_t;
typedef __hip_internal::uint64_t __hip_uint64_t;
typedef __hip_internal::int8_t __hip_int8_t;
typedef __hip_internal::int16_t __hip_int16_t;
typedef __hip_internal::int32_t __hip_int32_t;
typedef __hip_internal::int64_t __hip_int64_t;
# 32 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 2 3
# 52 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
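// hipRTC compiles without the host standard library, so the subset of std
// that the vector-type machinery needs (integral_constant, enable_if, the
// is_* traits, is_convertible) is declared directly in namespace std.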
namespace std {
using ::size_t;
template <class _Tp, _Tp __v> struct integral_constant {
static constexpr const _Tp value = __v;
typedef _Tp value_type;
typedef integral_constant type;
constexpr operator value_type() const { return value; }
constexpr value_type operator()() const { return value; }
};
template <class _Tp, _Tp __v> constexpr const _Tp integral_constant<_Tp, __v>::value;
typedef integral_constant<bool, true> true_type;
typedef integral_constant<bool, false> false_type;
template <bool B> using bool_constant = integral_constant<bool, B>;
typedef bool_constant<true> true_type;
typedef bool_constant<false> false_type;
template <bool __B, class __T = void> struct enable_if {};
template <class __T> struct enable_if<true, __T> { typedef __T type; };
template<bool _B> struct true_or_false_type : public false_type {};
template<> struct true_or_false_type<true> : public true_type {};
template <class _Tp> struct is_integral : public false_type {};
template <> struct is_integral<bool> : public true_type {};
template <> struct is_integral<char> : public true_type {};
template <> struct is_integral<signed char> : public true_type {};
template <> struct is_integral<unsigned char> : public true_type {};
template <> struct is_integral<wchar_t> : public true_type {};
template <> struct is_integral<short> : public true_type {};
template <> struct is_integral<unsigned short> : public true_type {};
template <> struct is_integral<int> : public true_type {};
template <> struct is_integral<unsigned int> : public true_type {};
template <> struct is_integral<long> : public true_type {};
template <> struct is_integral<unsigned long> : public true_type {};
template <> struct is_integral<long long> : public true_type {};
template <> struct is_integral<unsigned long long> : public true_type {};
template <class _Tp> struct is_arithmetic : public false_type {};
template <> struct is_arithmetic<bool> : public true_type {};
template <> struct is_arithmetic<char> : public true_type {};
template <> struct is_arithmetic<signed char> : public true_type {};
template <> struct is_arithmetic<unsigned char> : public true_type {};
template <> struct is_arithmetic<wchar_t> : public true_type {};
template <> struct is_arithmetic<short> : public true_type {};
template <> struct is_arithmetic<unsigned short> : public true_type {};
template <> struct is_arithmetic<int> : public true_type {};
template <> struct is_arithmetic<unsigned int> : public true_type {};
template <> struct is_arithmetic<long> : public true_type {};
template <> struct is_arithmetic<unsigned long> : public true_type {};
template <> struct is_arithmetic<long long> : public true_type {};
template <> struct is_arithmetic<unsigned long long> : public true_type {};
template <> struct is_arithmetic<float> : public true_type {};
template <> struct is_arithmetic<double> : public true_type {};
template<typename _Tp> struct is_floating_point : public false_type {};
template<> struct is_floating_point<float> : public true_type {};
template<> struct is_floating_point<double> : public true_type {};
template<> struct is_floating_point<long double> : public true_type {};
template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
template<typename _Tp, bool = is_arithmetic<_Tp>::value>
struct is_signed : public false_type {};
template<typename _Tp>
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};
template <class _T1, class _T2> struct is_convertible
: public true_or_false_type<__is_convertible_to(_T1, _T2)> {};
template<typename _CharT> struct char_traits;
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
template<typename _CharT, typename _Traits = char_traits<_CharT>> class basic_ostream;
typedef basic_istream<char> istream;
typedef basic_ostream<char> ostream;
template <typename __T> struct is_scalar : public integral_constant<bool, __is_scalar(__T)> {};
}
namespace hip_impl {
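// next_pot: smallest power of two >= x, via count-leading-zeros. Valid for
// x >= 2; x == 1 would evaluate __builtin_clz(0), which is undefined.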
inline
constexpr
unsigned int next_pot(unsigned int x) {
return 1u << (32u - __builtin_clz(x - 1u));
}
}
template<typename T, unsigned int n> struct HIP_vector_base;
template<typename T>
struct HIP_vector_base<T, 1> {
using Native_vec_ = T __attribute__((ext_vector_type(1)));
union {
Native_vec_ data;
struct {
T x;
};
};
using value_type = T;
__attribute__((device))
HIP_vector_base() = default;
__attribute__((device))
explicit
constexpr
HIP_vector_base(T x_) noexcept : data{x_} {}
__attribute__((device))
constexpr
HIP_vector_base(const HIP_vector_base&) = default;
__attribute__((device))
constexpr
HIP_vector_base(HIP_vector_base&&) = default;
__attribute__((device))
~HIP_vector_base() = default;
__attribute__((device))
HIP_vector_base& operator=(const HIP_vector_base&) = default;
};
template<typename T>
struct HIP_vector_base<T, 2> {
using Native_vec_ = T __attribute__((ext_vector_type(2)));
union
{
Native_vec_ data;
struct {
T x;
T y;
};
};
using value_type = T;
__attribute__((device))
HIP_vector_base() = default;
__attribute__((device))
explicit
constexpr
HIP_vector_base(T x_) noexcept : data{x_, x_} {}
__attribute__((device))
constexpr
HIP_vector_base(T x_, T y_) noexcept : data{x_, y_} {}
__attribute__((device))
constexpr
HIP_vector_base(const HIP_vector_base&) = default;
__attribute__((device))
constexpr
HIP_vector_base(HIP_vector_base&&) = default;
__attribute__((device))
~HIP_vector_base() = default;
__attribute__((device))
HIP_vector_base& operator=(const HIP_vector_base&) = default;
};
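// Rank-3 specialization: a 3-element ext_vector_type would occupy the storage
// of four elements, so this specialization hand-rolls a T d[3] aggregate and
// supplies the element-wise operators the generic wrapper below relies on.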
template<typename T>
struct HIP_vector_base<T, 3> {
struct Native_vec_ {
T d[3];
__attribute__((device))
Native_vec_() = default;
__attribute__((device))
explicit
constexpr
Native_vec_(T x_) noexcept : d{x_, x_, x_} {}
__attribute__((device))
constexpr
Native_vec_(T x_, T y_, T z_) noexcept : d{x_, y_, z_} {}
__attribute__((device))
constexpr
Native_vec_(const Native_vec_&) = default;
__attribute__((device))
constexpr
Native_vec_(Native_vec_&&) = default;
__attribute__((device))
~Native_vec_() = default;
__attribute__((device))
Native_vec_& operator=(const Native_vec_&) = default;
__attribute__((device))
Native_vec_& operator=(Native_vec_&&) = default;
__attribute__((device))
T& operator[](unsigned int idx) noexcept { return d[idx]; }
__attribute__((device))
T operator[](unsigned int idx) const noexcept { return d[idx]; }
__attribute__((device))
Native_vec_& operator+=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] += x_.d[i];
return *this;
}
__attribute__((device))
Native_vec_& operator-=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] -= x_.d[i];
return *this;
}
__attribute__((device))
Native_vec_& operator*=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] *= x_.d[i];
return *this;
}
__attribute__((device))
Native_vec_& operator/=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] /= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_ operator-() const noexcept
{
auto r{*this};
for (auto&& x : r.d) x = -x;
return r;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_ operator~() const noexcept
{
auto r{*this};
for (auto&& x : r.d) x = ~x;
return r;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator%=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] %= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator^=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] ^= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator|=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] |= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator&=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] &= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator>>=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] >>= x_.d[i];
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
Native_vec_& operator<<=(const Native_vec_& x_) noexcept
{
for (auto i = 0u; i != 3u; ++i) d[i] <<= x_.d[i];
return *this;
}
using Vec3_cmp = int __attribute__((vector_size(4 * sizeof(int))));
__attribute__((device))
Vec3_cmp operator==(const Native_vec_& x_) const noexcept
{
return Vec3_cmp{d[0] == x_.d[0], d[1] == x_.d[1], d[2] == x_.d[2]};
}
};
union {
Native_vec_ data;
struct {
T x;
T y;
T z;
};
};
using value_type = T;
__attribute__((device))
HIP_vector_base() = default;
__attribute__((device))
explicit
constexpr
HIP_vector_base(T x_) noexcept : data{x_, x_, x_} {}
__attribute__((device))
constexpr
HIP_vector_base(T x_, T y_, T z_) noexcept : data{x_, y_, z_} {}
__attribute__((device))
constexpr
HIP_vector_base(const HIP_vector_base&) = default;
__attribute__((device))
constexpr
HIP_vector_base(HIP_vector_base&&) = default;
__attribute__((device))
~HIP_vector_base() = default;
__attribute__((device))
HIP_vector_base& operator=(const HIP_vector_base&) = default;
__attribute__((device))
HIP_vector_base& operator=(HIP_vector_base&&) = default;
};
template<typename T>
struct HIP_vector_base<T, 4> {
using Native_vec_ = T __attribute__((ext_vector_type(4)));
union
{
Native_vec_ data;
struct {
T x;
T y;
T z;
T w;
};
};
using value_type = T;
__attribute__((device))
HIP_vector_base() = default;
__attribute__((device))
explicit
constexpr
HIP_vector_base(T x_) noexcept : data{x_, x_, x_, x_} {}
__attribute__((device))
constexpr
HIP_vector_base(T x_, T y_, T z_, T w_) noexcept : data{x_, y_, z_, w_} {}
__attribute__((device))
constexpr
HIP_vector_base(const HIP_vector_base&) = default;
__attribute__((device))
constexpr
HIP_vector_base(HIP_vector_base&&) = default;
__attribute__((device))
~HIP_vector_base() = default;
__attribute__((device))
HIP_vector_base& operator=(const HIP_vector_base&) = default;
};
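// HIP_vector_type: the user-facing wrapper that layers constructors from
// convertible scalars, compound assignment, and increment/decrement on top of
// HIP_vector_base; the familiar float2/int4-style types are instantiations of
// this template.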
template<typename T, unsigned int rank>
struct HIP_vector_type : public HIP_vector_base<T, rank> {
using HIP_vector_base<T, rank>::data;
using typename HIP_vector_base<T, rank>::Native_vec_;
__attribute__((device))
HIP_vector_type() = default;
template<
typename U,
typename std::enable_if<
std::is_convertible<U, T>::value>::type* = nullptr>
__attribute__((device))
explicit
constexpr
HIP_vector_type(U x_) noexcept
: HIP_vector_base<T, rank>{static_cast<T>(x_)}
{}
template<
typename... Us,
typename std::enable_if<
(rank > 1) && sizeof...(Us) == rank>::type* = nullptr>
__attribute__((device))
constexpr
HIP_vector_type(Us... xs) noexcept
: HIP_vector_base<T, rank>{static_cast<T>(xs)...}
{}
__attribute__((device))
constexpr
HIP_vector_type(const HIP_vector_type&) = default;
__attribute__((device))
constexpr
HIP_vector_type(HIP_vector_type&&) = default;
__attribute__((device))
~HIP_vector_type() = default;
__attribute__((device))
HIP_vector_type& operator=(const HIP_vector_type&) = default;
__attribute__((device))
HIP_vector_type& operator=(HIP_vector_type&&) = default;
__attribute__((device))
HIP_vector_type& operator++() noexcept
{
return *this += HIP_vector_type{1};
}
__attribute__((device))
HIP_vector_type operator++(int) noexcept
{
auto tmp(*this);
++*this;
return tmp;
}
__attribute__((device))
HIP_vector_type& operator--() noexcept
{
return *this -= HIP_vector_type{1};
}
__attribute__((device))
HIP_vector_type operator--(int) noexcept
{
auto tmp(*this);
--*this;
return tmp;
}
__attribute__((device))
HIP_vector_type& operator+=(const HIP_vector_type& x) noexcept
{
data += x.data;
return *this;
}
template<
typename U,
typename std::enable_if<
std::is_convertible<U, T>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator+=(U x) noexcept
{
return *this += HIP_vector_type{x};
}
__attribute__((device))
HIP_vector_type& operator-=(const HIP_vector_type& x) noexcept
{
data -= x.data;
return *this;
}
template<
typename U,
typename std::enable_if<
std::is_convertible<U, T>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator-=(U x) noexcept
{
return *this -= HIP_vector_type{x};
}
__attribute__((device))
HIP_vector_type& operator*=(const HIP_vector_type& x) noexcept
{
data *= x.data;
return *this;
}
friend __attribute__((device)) inline constexpr HIP_vector_type operator*(
HIP_vector_type x, const HIP_vector_type& y) noexcept
{
return HIP_vector_type{ x } *= y;
}
template<
typename U,
typename std::enable_if<
std::is_convertible<U, T>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator*=(U x) noexcept
{
return *this *= HIP_vector_type{x};
}
friend __attribute__((device)) inline constexpr HIP_vector_type operator/(
HIP_vector_type x, const HIP_vector_type& y) noexcept
{
return HIP_vector_type{ x } /= y;
}
__attribute__((device))
HIP_vector_type& operator/=(const HIP_vector_type& x) noexcept
{
data /= x.data;
return *this;
}
template<
typename U,
typename std::enable_if<
std::is_convertible<U, T>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator/=(U x) noexcept
{
return *this /= HIP_vector_type{x};
}
template<
typename U = T,
typename std::enable_if<std::is_signed<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type operator-() const noexcept
{
auto tmp(*this);
tmp.data = -tmp.data;
return tmp;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type operator~() const noexcept
{
HIP_vector_type r{*this};
r.data = ~r.data;
return r;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator%=(const HIP_vector_type& x) noexcept
{
data %= x.data;
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator^=(const HIP_vector_type& x) noexcept
{
data ^= x.data;
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator|=(const HIP_vector_type& x) noexcept
{
data |= x.data;
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator&=(const HIP_vector_type& x) noexcept
{
data &= x.data;
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator>>=(const HIP_vector_type& x) noexcept
{
data >>= x.data;
return *this;
}
template<
typename U = T,
typename std::enable_if<std::is_integral<U>{}>::type* = nullptr>
__attribute__((device))
HIP_vector_type& operator<<=(const HIP_vector_type& x) noexcept
{
data <<= x.data;
return *this;
}
};
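// Illustrative usage sketch (hypothetical caller code, not part of this
// header): the wrapper forwards arithmetic to the underlying native vector.
//   float4 a{1.0f, 2.0f, 3.0f, 4.0f};
//   float4 b{0.5f};      // explicit broadcast constructor
//   a += b;              // element-wise, via data += x.data
//   float4 c = a * 2.0f; // scalar wrapped as HIP_vector_type{y} first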
template<typename T, unsigned int n>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator+(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} += y;
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator+(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} += HIP_vector_type<T, n>{y};
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator+(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} += y;
}
template<typename T, unsigned int n>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator-(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} -= y;
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator-(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} -= HIP_vector_type<T, n>{y};
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator-(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} -= y;
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator*(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} *= HIP_vector_type<T, n>{y};
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator*(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} *= y;
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator/(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} /= HIP_vector_type<T, n>{y};
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator/(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} /= y;
}
template<typename V>
__attribute__((device))
inline
constexpr
bool _hip_any_zero(const V& x, int n) noexcept
{
return
(n == -1) ? true : ((x[n] == 0) ? false : _hip_any_zero(x, n - 1));
}
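// Note: despite its name, _hip_any_zero(v, n) is true only when every lane
// v[0..n] is non-zero. Clang vector comparisons yield -1 (all bits set) per
// matching lane and 0 otherwise, so operator== below holds exactly when all
// lanes compare equal.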
template<typename T, unsigned int n>
__attribute__((device))
inline
constexpr
bool operator==(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return _hip_any_zero(x.data == y.data, n - 1);
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
bool operator==(const HIP_vector_type<T, n>& x, U y) noexcept
{
return x == HIP_vector_type<T, n>{y};
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
bool operator==(U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} == y;
}
template<typename T, unsigned int n>
__attribute__((device))
inline
constexpr
bool operator!=(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return !(x == y);
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
bool operator!=(const HIP_vector_type<T, n>& x, U y) noexcept
{
return !(x == y);
}
template<typename T, unsigned int n, typename U>
__attribute__((device))
inline
constexpr
bool operator!=(U x, const HIP_vector_type<T, n>& y) noexcept
{
return !(x == y);
}
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator%(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} %= y;
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator%(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} %= HIP_vector_type<T, n>{y};
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator%(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} %= y;
}
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator^(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} ^= y;
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator^(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} ^= HIP_vector_type<T, n>{y};
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator^(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} ^= y;
}
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator|(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} |= y;
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator|(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} |= HIP_vector_type<T, n>{y};
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator|(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} |= y;
}
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator&(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} &= y;
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator&(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} &= HIP_vector_type<T, n>{y};
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator&(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} &= y;
}
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator>>(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} >>= y;
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator>>(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} >>= HIP_vector_type<T, n>{y};
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator>>(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} >>= y;
}
template<
typename T,
unsigned int n,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator<<(
const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} <<= y;
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator<<(
const HIP_vector_type<T, n>& x, U y) noexcept
{
return HIP_vector_type<T, n>{x} <<= HIP_vector_type<T, n>{y};
}
template<
typename T,
unsigned int n,
typename U,
typename std::enable_if<std::is_arithmetic<U>::value>::type,
typename std::enable_if<std::is_integral<T>{}>* = nullptr>
__attribute__((device))
inline
constexpr
HIP_vector_type<T, n> operator<<(
U x, const HIP_vector_type<T, n>& y) noexcept
{
return HIP_vector_type<T, n>{x} <<= y;
}
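// __hipMapVector converts between vector ranks: widening conversions pad the
// new trailing components with zero, narrowing conversions drop components,
// and every retained component is static_cast to the destination type.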
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 1 && rankU >= 1),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT>(static_cast<T>(u.x));
};
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 2 && rankU == 1),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(0));
};
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 2 && rankU >= 2),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(u.y));
};
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 4 && rankU == 1),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(0),
static_cast<T>(0), static_cast<T>(0));
};
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 4 && rankU == 2),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT>(static_cast<T>(u.x), static_cast<T>(u.y),
static_cast<T>(0), static_cast<T>(0));
};
template <typename T, unsigned int rankT, typename U, unsigned int rankU>
inline __attribute__((always_inline)) __attribute__((device)) typename std::enable_if<(rankT == 4 && rankU == 4),
const HIP_vector_type<T, rankT>>::type
__hipMapVector(const HIP_vector_type<U, rankU>& u) {
return HIP_vector_type<T, rankT> (static_cast<T>(u.x), static_cast<T>(u.y),
static_cast<T>(u.z), static_cast<T>(u.w));
};
# 1135 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
using uchar1 = HIP_vector_type<unsigned char, 1>; using uchar2 = HIP_vector_type<unsigned char, 2>; using uchar3 = HIP_vector_type<unsigned char, 3>; using uchar4 = HIP_vector_type<unsigned char, 4>;;
using char1 = HIP_vector_type<char, 1>; using char2 = HIP_vector_type<char, 2>; using char3 = HIP_vector_type<char, 3>; using char4 = HIP_vector_type<char, 4>;;
using ushort1 = HIP_vector_type<unsigned short, 1>; using ushort2 = HIP_vector_type<unsigned short, 2>; using ushort3 = HIP_vector_type<unsigned short, 3>; using ushort4 = HIP_vector_type<unsigned short, 4>;;
using short1 = HIP_vector_type<short, 1>; using short2 = HIP_vector_type<short, 2>; using short3 = HIP_vector_type<short, 3>; using short4 = HIP_vector_type<short, 4>;;
using uint1 = HIP_vector_type<unsigned int, 1>; using uint2 = HIP_vector_type<unsigned int, 2>; using uint3 = HIP_vector_type<unsigned int, 3>; using uint4 = HIP_vector_type<unsigned int, 4>;;
using int1 = HIP_vector_type<int, 1>; using int2 = HIP_vector_type<int, 2>; using int3 = HIP_vector_type<int, 3>; using int4 = HIP_vector_type<int, 4>;;
using ulong1 = HIP_vector_type<unsigned long, 1>; using ulong2 = HIP_vector_type<unsigned long, 2>; using ulong3 = HIP_vector_type<unsigned long, 3>; using ulong4 = HIP_vector_type<unsigned long, 4>;;
using long1 = HIP_vector_type<long, 1>; using long2 = HIP_vector_type<long, 2>; using long3 = HIP_vector_type<long, 3>; using long4 = HIP_vector_type<long, 4>;;
using ulonglong1 = HIP_vector_type<unsigned long long, 1>; using ulonglong2 = HIP_vector_type<unsigned long long, 2>; using ulonglong3 = HIP_vector_type<unsigned long long, 3>; using ulonglong4 = HIP_vector_type<unsigned long long, 4>;;
using longlong1 = HIP_vector_type<long long, 1>; using longlong2 = HIP_vector_type<long long, 2>; using longlong3 = HIP_vector_type<long long, 3>; using longlong4 = HIP_vector_type<long long, 4>;;
using float1 = HIP_vector_type<float, 1>; using float2 = HIP_vector_type<float, 2>; using float3 = HIP_vector_type<float, 3>; using float4 = HIP_vector_type<float, 4>;;
using double1 = HIP_vector_type<double, 1>; using double2 = HIP_vector_type<double, 2>; using double3 = HIP_vector_type<double, 3>; using double4 = HIP_vector_type<double, 4>;;
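// These aliases give the familiar CUDA-style names (float4, int2, ...) to
// HIP_vector_type instantiations of rank 1 through 4.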
# 2117 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_vector_types.h" 3
static inline __attribute__((device)) uchar1 make_uchar1(unsigned char x) { uchar1 r{x}; return r; };
static inline __attribute__((device)) uchar2 make_uchar2(unsigned char x, unsigned char y) { uchar2 r{x, y}; return r; };
static inline __attribute__((device)) uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z) { uchar3 r{x, y, z}; return r; };
static inline __attribute__((device)) uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) { uchar4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) char1 make_char1(signed char x) { char1 r{x}; return r; };
static inline __attribute__((device)) char2 make_char2(signed char x, signed char y) { char2 r{x, y}; return r; };
static inline __attribute__((device)) char3 make_char3(signed char x, signed char y, signed char z) { char3 r{x, y, z}; return r; };
static inline __attribute__((device)) char4 make_char4(signed char x, signed char y, signed char z, signed char w) { char4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) ushort1 make_ushort1(unsigned short x) { ushort1 r{x}; return r; };
static inline __attribute__((device)) ushort2 make_ushort2(unsigned short x, unsigned short y) { ushort2 r{x, y}; return r; };
static inline __attribute__((device)) ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z) { ushort3 r{x, y, z}; return r; };
static inline __attribute__((device)) ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w) { ushort4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) short1 make_short1(signed short x) { short1 r{x}; return r; };
static inline __attribute__((device)) short2 make_short2(signed short x, signed short y) { short2 r{x, y}; return r; };
static inline __attribute__((device)) short3 make_short3(signed short x, signed short y, signed short z) { short3 r{x, y, z}; return r; };
static inline __attribute__((device)) short4 make_short4(signed short x, signed short y, signed short z, signed short w) { short4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) uint1 make_uint1(unsigned int x) { uint1 r{x}; return r; };
static inline __attribute__((device)) uint2 make_uint2(unsigned int x, unsigned int y) { uint2 r{x, y}; return r; };
static inline __attribute__((device)) uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z) { uint3 r{x, y, z}; return r; };
static inline __attribute__((device)) uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) { uint4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) int1 make_int1(signed int x) { int1 r{x}; return r; };
static inline __attribute__((device)) int2 make_int2(signed int x, signed int y) { int2 r{x, y}; return r; };
static inline __attribute__((device)) int3 make_int3(signed int x, signed int y, signed int z) { int3 r{x, y, z}; return r; };
static inline __attribute__((device)) int4 make_int4(signed int x, signed int y, signed int z, signed int w) { int4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) float1 make_float1(float x) { float1 r{x}; return r; };
static inline __attribute__((device)) float2 make_float2(float x, float y) { float2 r{x, y}; return r; };
static inline __attribute__((device)) float3 make_float3(float x, float y, float z) { float3 r{x, y, z}; return r; };
static inline __attribute__((device)) float4 make_float4(float x, float y, float z, float w) { float4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) double1 make_double1(double x) { double1 r{x}; return r; };
static inline __attribute__((device)) double2 make_double2(double x, double y) { double2 r{x, y}; return r; };
static inline __attribute__((device)) double3 make_double3(double x, double y, double z) { double3 r{x, y, z}; return r; };
static inline __attribute__((device)) double4 make_double4(double x, double y, double z, double w) { double4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) ulong1 make_ulong1(unsigned long x) { ulong1 r{x}; return r; };
static inline __attribute__((device)) ulong2 make_ulong2(unsigned long x, unsigned long y) { ulong2 r{x, y}; return r; };
static inline __attribute__((device)) ulong3 make_ulong3(unsigned long x, unsigned long y, unsigned long z) { ulong3 r{x, y, z}; return r; };
static inline __attribute__((device)) ulong4 make_ulong4(unsigned long x, unsigned long y, unsigned long z, unsigned long w) { ulong4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) long1 make_long1(signed long x) { long1 r{x}; return r; };
static inline __attribute__((device)) long2 make_long2(signed long x, signed long y) { long2 r{x, y}; return r; };
static inline __attribute__((device)) long3 make_long3(signed long x, signed long y, signed long z) { long3 r{x, y, z}; return r; };
static inline __attribute__((device)) long4 make_long4(signed long x, signed long y, signed long z, signed long w) { long4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) ulonglong1 make_ulonglong1(unsigned long long x) { ulonglong1 r{x}; return r; };
static inline __attribute__((device)) ulonglong2 make_ulonglong2(unsigned long long x, unsigned long long y) { ulonglong2 r{x, y}; return r; };
static inline __attribute__((device)) ulonglong3 make_ulonglong3(unsigned long long x, unsigned long long y, unsigned long long z) { ulonglong3 r{x, y, z}; return r; };
static inline __attribute__((device)) ulonglong4 make_ulonglong4(unsigned long long x, unsigned long long y, unsigned long long z, unsigned long long w) { ulonglong4 r{x, y, z, w}; return r; };
static inline __attribute__((device)) longlong1 make_longlong1(signed long long x) { longlong1 r{x}; return r; };
static inline __attribute__((device)) longlong2 make_longlong2(signed long long x, signed long long y) { longlong2 r{x, y}; return r; };
static inline __attribute__((device)) longlong3 make_longlong3(signed long long x, signed long long y, signed long long z) { longlong3 r{x, y, z}; return r; };
static inline __attribute__((device)) longlong4 make_longlong4(signed long long x, signed long long y, signed long long z, signed long long w) { longlong4 r{x, y, z, w}; return r; };
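// Hypothetical usage note: make_float4(0.f, 1.f, 2.f, 3.f) is equivalent to
// float4{0.f, 1.f, 2.f, 3.f}; the make_* helpers exist for source
// compatibility with CUDA code.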
# 28 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/hip_ldg.h" 2 3
__attribute__((device)) inline static char __ldg(const char* ptr) { return *ptr; }
__attribute__((device)) inline static char2 __ldg(const char2* ptr) { return *ptr; }
__attribute__((device)) inline static char4 __ldg(const char4* ptr) { return *ptr; }
__attribute__((device)) inline static signed char __ldg(const signed char* ptr) { return ptr[0]; }
__attribute__((device)) inline static unsigned char __ldg(const unsigned char* ptr) { return ptr[0]; }
__attribute__((device)) inline static short __ldg(const short* ptr) { return ptr[0]; }
__attribute__((device)) inline static short2 __ldg(const short2* ptr) { return ptr[0]; }
__attribute__((device)) inline static short4 __ldg(const short4* ptr) { return ptr[0]; }
__attribute__((device)) inline static unsigned short __ldg(const unsigned short* ptr) { return ptr[0]; }
__attribute__((device)) inline static int __ldg(const int* ptr) { return ptr[0]; }
__attribute__((device)) inline static int2 __ldg(const int2* ptr) { return ptr[0]; }
__attribute__((device)) inline static int4 __ldg(const int4* ptr) { return ptr[0]; }
__attribute__((device)) inline static unsigned int __ldg(const unsigned int* ptr) { return ptr[0]; }
__attribute__((device)) inline static long __ldg(const long* ptr) { return ptr[0]; }
__attribute__((device)) inline static unsigned long __ldg(const unsigned long* ptr) { return ptr[0]; }
__attribute__((device)) inline static long long __ldg(const long long* ptr) { return ptr[0]; }
__attribute__((device)) inline static longlong2 __ldg(const longlong2* ptr) { return ptr[0]; }
__attribute__((device)) inline static unsigned long long __ldg(const unsigned long long* ptr) { return ptr[0]; }
__attribute__((device)) inline static uchar2 __ldg(const uchar2* ptr) { return ptr[0]; }
__attribute__((device)) inline static uchar4 __ldg(const uchar4* ptr) { return ptr[0]; }
__attribute__((device)) inline static ushort2 __ldg(const ushort2* ptr) { return ptr[0]; }
__attribute__((device)) inline static uint2 __ldg(const uint2* ptr) { return ptr[0]; }
__attribute__((device)) inline static uint4 __ldg(const uint4* ptr) { return ptr[0]; }
__attribute__((device)) inline static ulonglong2 __ldg(const ulonglong2* ptr) { return ptr[0]; }
__attribute__((device)) inline static float __ldg(const float* ptr) { return ptr[0]; }
__attribute__((device)) inline static float2 __ldg(const float2* ptr) { return ptr[0]; }
__attribute__((device)) inline static float4 __ldg(const float4* ptr) { return ptr[0]; }
__attribute__((device)) inline static double __ldg(const double* ptr) { return ptr[0]; }
__attribute__((device)) inline static double2 __ldg(const double2* ptr) { return ptr[0]; }
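// On this platform the __ldg overloads simply dereference the pointer; they
// exist for CUDA source compatibility, where __ldg is a hint to load through
// the read-only data cache.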
# 125 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 2 3
# 250 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_runtime.h" 3
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
struct __HIP_BlockIdx {
__attribute__((device))
std::uint32_t operator()(std::uint32_t x) const noexcept { return __ockl_get_group_id(x); }
};
struct __HIP_BlockDim {
__attribute__((device))
std::uint32_t operator()(std::uint32_t x) const noexcept {
return __ockl_get_local_size(x);
}
};
struct __HIP_GridDim {
__attribute__((device))
std::uint32_t operator()(std::uint32_t x) const noexcept {
return __ockl_get_num_groups(x);
}
};
struct __HIP_ThreadIdx {
__attribute__((device))
std::uint32_t operator()(std::uint32_t x) const noexcept {
return __ockl_get_local_id(x);
}
};
typedef struct dim3 {
uint32_t x;
uint32_t y;
uint32_t z;
constexpr __attribute__((device)) dim3(uint32_t _x = 1, uint32_t _y = 1, uint32_t _z = 1) : x(_x), y(_y), z(_z){};
} dim3;
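// dim3 defaults unspecified dimensions to 1, so dim3(256) describes a
// 256x1x1 extent. Hypothetical example:
//   dim3 grid(64, 64);   // 64x64x1 blocks
//   dim3 block(16, 16);  // 16x16x1 threads per block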
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_global_size(unsigned int);
template <typename F> struct __HIP_Coordinates {
using R = decltype(F{}(0));
struct __X {
__attribute__((device)) operator R() const noexcept { return F{}(0); }
__attribute__((device)) R operator+=(const R& rhs) { return F{}(0) + rhs; }
};
struct __Y {
__attribute__((device)) operator R() const noexcept { return F{}(1); }
__attribute__((device)) R operator+=(const R& rhs) { return F{}(1) + rhs; }
};
struct __Z {
__attribute__((device)) operator R() const noexcept { return F{}(2); }
__attribute__((device)) R operator+=(const R& rhs) { return F{}(2) + rhs; }
};
__attribute__((weak))
__attribute__((device)) static constexpr __X x{};
__attribute__((weak))
__attribute__((device)) static constexpr __Y y{};
__attribute__((weak))
__attribute__((device)) static constexpr __Z z{};
__attribute__((device)) operator dim3() const { return dim3(x, y, z); }
};
template <typename F>
constexpr typename __HIP_Coordinates<F>::__X __HIP_Coordinates<F>::x;
template <typename F>
constexpr typename __HIP_Coordinates<F>::__Y __HIP_Coordinates<F>::y;
template <typename F>
constexpr typename __HIP_Coordinates<F>::__Z __HIP_Coordinates<F>::z;
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__X,
__HIP_Coordinates<__HIP_BlockDim>::__X) noexcept {
return __ockl_get_global_size(0);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__X,
__HIP_Coordinates<__HIP_GridDim>::__X) noexcept {
return __ockl_get_global_size(0);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Y,
__HIP_Coordinates<__HIP_BlockDim>::__Y) noexcept {
return __ockl_get_global_size(1);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Y,
__HIP_Coordinates<__HIP_GridDim>::__Y) noexcept {
return __ockl_get_global_size(1);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_GridDim>::__Z,
__HIP_Coordinates<__HIP_BlockDim>::__Z) noexcept {
return __ockl_get_global_size(2);
}
inline
__attribute__((device))
std::uint32_t operator*(__HIP_Coordinates<__HIP_BlockDim>::__Z,
__HIP_Coordinates<__HIP_GridDim>::__Z) noexcept {
return __ockl_get_global_size(2);
}
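// The operator* overloads above exist so the common expressions
// gridDim.{x,y,z} * blockDim.{x,y,z} (in either order) fold into a single
// __ockl_get_global_size call instead of two separate dispatches.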
static constexpr __HIP_Coordinates<__HIP_BlockDim> blockDim{};
static constexpr __HIP_Coordinates<__HIP_BlockIdx> blockIdx{};
static constexpr __HIP_Coordinates<__HIP_GridDim> gridDim{};
static constexpr __HIP_Coordinates<__HIP_ThreadIdx> threadIdx{};
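// Hypothetical kernel-side sketch: each coordinate proxy converts implicitly
// to std::uint32_t, so the usual global-index idiom works unchanged:
//   std::uint32_t gid = blockIdx.x * blockDim.x + threadIdx.x;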
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_num_groups(unsigned int);
# 63 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
# 73 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 3
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_vector_types.h" 1 3
# 74 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_runtime.h" 2 3
# 6 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 1 3
# 37 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 3
# 1 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 1 3
# 55 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshadow"
struct hip_bfloat16
{
__hip_uint16_t data;
enum truncate_t
{
truncate
};
__attribute__((device)) hip_bfloat16() = default;
explicit __attribute__((device)) hip_bfloat16(float f)
: data(float_to_bfloat16(f))
{
}
explicit __attribute__((device)) hip_bfloat16(float f, truncate_t)
: data(truncate_float_to_bfloat16(f))
{
}
__attribute__((device)) operator float() const
{
union
{
uint32_t int32;
float fp32;
} u = {uint32_t(data) << 16};
return u.fp32;
}
__attribute__((device)) hip_bfloat16 &operator=(const float& f)
{
data = float_to_bfloat16(f);
return *this;
}
static __attribute__((device)) hip_bfloat16 round_to_bfloat16(float f)
{
hip_bfloat16 output;
output.data = float_to_bfloat16(f);
return output;
}
static __attribute__((device)) hip_bfloat16 round_to_bfloat16(float f, truncate_t)
{
hip_bfloat16 output;
output.data = truncate_float_to_bfloat16(f);
return output;
}
private:
static __attribute__((device)) __hip_uint16_t float_to_bfloat16(float f)
{
union
{
float fp32;
uint32_t int32;
} u = {f};
if(~u.int32 & 0x7f800000)
{
# 136 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
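        // Finite value: round to nearest even by adding 0x7fff plus the
        // lowest bit that survives truncation to the upper 16 bits.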
u.int32 += 0x7fff + ((u.int32 >> 16) & 1);
}
else if(u.int32 & 0xffff)
{
# 148 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
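        // Exponent all ones and non-zero low bits (a NaN): set a mantissa
        // bit that survives truncation so the result cannot become infinity.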
u.int32 |= 0x10000;
}
return __hip_uint16_t(u.int32 >> 16);
}
static __attribute__((device)) __hip_uint16_t truncate_float_to_bfloat16(float f)
{
union
{
float fp32;
uint32_t int32;
} u = {f};
return __hip_uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
}
};
#pragma clang diagnostic pop
typedef struct
{
__hip_uint16_t data;
} hip_bfloat16_public;
static_assert(__hip_internal::is_standard_layout<hip_bfloat16>{},
"hip_bfloat16 is not a standard layout type, and thus is "
"incompatible with C.");
static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
"hip_bfloat16 is not a trivial type, and thus is "
"incompatible with C.");
# 189 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/clr/hipamd/include/hip/amd_detail/amd_hip_bfloat16.h" 3
inline __attribute__((device)) hip_bfloat16 operator+(hip_bfloat16 a)
{
return a;
}
inline __attribute__((device)) hip_bfloat16 operator-(hip_bfloat16 a)
{
a.data ^= 0x8000;
return a;
}
inline __attribute__((device)) hip_bfloat16 operator+(hip_bfloat16 a, hip_bfloat16 b)
{
return hip_bfloat16(float(a) + float(b));
}
inline __attribute__((device)) hip_bfloat16 operator-(hip_bfloat16 a, hip_bfloat16 b)
{
return hip_bfloat16(float(a) - float(b));
}
inline __attribute__((device)) hip_bfloat16 operator*(hip_bfloat16 a, hip_bfloat16 b)
{
return hip_bfloat16(float(a) * float(b));
}
inline __attribute__((device)) hip_bfloat16 operator/(hip_bfloat16 a, hip_bfloat16 b)
{
return hip_bfloat16(float(a) / float(b));
}
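// All bfloat16 arithmetic round-trips through float: operands widen
// losslessly to float, the operation runs at float precision, and the
// hip_bfloat16 constructor rounds the result back down.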
inline __attribute__((device)) bool operator<(hip_bfloat16 a, hip_bfloat16 b)
{
return float(a) < float(b);
}
inline __attribute__((device)) bool operator==(hip_bfloat16 a, hip_bfloat16 b)
{
return float(a) == float(b);
}
inline __attribute__((device)) bool operator>(hip_bfloat16 a, hip_bfloat16 b)
{
return b < a;
}
inline __attribute__((device)) bool operator<=(hip_bfloat16 a, hip_bfloat16 b)
{
return !(a > b);
}
inline __attribute__((device)) bool operator!=(hip_bfloat16 a, hip_bfloat16 b)
{
return !(a == b);
}
inline __attribute__((device)) bool operator>=(hip_bfloat16 a, hip_bfloat16 b)
{
return !(a < b);
}
inline __attribute__((device)) hip_bfloat16& operator+=(hip_bfloat16& a, hip_bfloat16 b)
{
return a = a + b;
}
inline __attribute__((device)) hip_bfloat16& operator-=(hip_bfloat16& a, hip_bfloat16 b)
{
return a = a - b;
}
inline __attribute__((device)) hip_bfloat16& operator*=(hip_bfloat16& a, hip_bfloat16 b)
{
return a = a * b;
}
inline __attribute__((device)) hip_bfloat16& operator/=(hip_bfloat16& a, hip_bfloat16 b)
{
return a = a / b;
}
inline __attribute__((device)) hip_bfloat16& operator++(hip_bfloat16& a)
{
return a += hip_bfloat16(1.0f);
}
inline __attribute__((device)) hip_bfloat16& operator--(hip_bfloat16& a)
{
return a -= hip_bfloat16(1.0f);
}
inline __attribute__((device)) hip_bfloat16 operator++(hip_bfloat16& a, int)
{
hip_bfloat16 orig = a;
++a;
return orig;
}
inline __attribute__((device)) hip_bfloat16 operator--(hip_bfloat16& a, int)
{
hip_bfloat16 orig = a;
--a;
return orig;
}
namespace std
{
constexpr __attribute__((device)) bool isinf(hip_bfloat16 a)
{
return !(~a.data & 0x7f80) && !(a.data & 0x7f);
}
constexpr __attribute__((device)) bool isnan(hip_bfloat16 a)
{
return !(~a.data & 0x7f80) && +(a.data & 0x7f);
}
constexpr __attribute__((device)) bool iszero(hip_bfloat16 a)
{
return !(a.data & 0x7fff);
}
}
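// bfloat16 keeps float32's 8 exponent bits and truncates the significand to
// 7 bits, which is why the classification helpers above test the raw bits
// with exponent mask 0x7f80 and mantissa mask 0x7f.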
# 38 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/external/hip-on-vdi/include/hip/hip_bfloat16.h" 2 3
# 7 "/long_pathname_so_that_rpms_can_package_the_debug_info/src/out/ubuntu-22.04/22.04/build/hip-on-rocclr/hipamd/src/hiprtc/hip_rtc_gen/hipRTC_header.h" 2
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-id-macro"
#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#pragma clang diagnostic ignored "-Wundef"
#define __device__ __attribute__((device))
#define __host__ __attribute__((host))
#define __global__ __attribute__((global))
#define __constant__ __attribute__((constant))
#define __shared__ __attribute__((shared))
#define __align__(x) __attribute__((aligned(x)))
#if !defined(__has_feature) || !__has_feature(cuda_noinline_keyword)
#define __noinline__ __attribute__((noinline))
#endif
#define __forceinline__ inline __attribute__((always_inline))
#if __HIP_NO_IMAGE_SUPPORT
#define __hip_img_chk__ __attribute__((unavailable("The image/texture API not supported on the device")))
#else
#define __hip_img_chk__
#endif
#define launch_bounds_impl0(requiredMaxThreadsPerBlock) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock)))
#define launch_bounds_impl1(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) \
__attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock), \
amdgpu_waves_per_eu(minBlocksPerMultiprocessor)))
#define select_impl_(_1, _2, impl_, ...) impl_
#define __launch_bounds__(...) \
select_impl_(__VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__)
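// select_impl_ dispatches on argument count: __launch_bounds__(N) picks
// launch_bounds_impl0, while __launch_bounds__(N, M) picks launch_bounds_impl1.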
#define HIP_INCLUDE_HIP_HIP_RUNTIME_H
#define _HIP_BFLOAT16_H_
#define HIP_INCLUDE_HIP_MATH_FUNCTIONS_H
#define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H
#if !__HIP_NO_STD_DEFS__
#if defined(__HIPRTC_PTRDIFF_T_IS_LONG_LONG__) && __HIPRTC_PTRDIFF_T_IS_LONG_LONG__==1
typedef long long ptrdiff_t;
#else
typedef __PTRDIFF_TYPE__ ptrdiff_t;
#endif
typedef long clock_t;
namespace std {
using ::ptrdiff_t;
using ::clock_t;
}
#endif // __HIP_NO_STD_DEFS__
#pragma clang diagnostic pop
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_HIP_COMMON_H
#define HIP_INCLUDE_HIP_HIP_COMMON_H
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#endif
// Common code included at start of every hip file.
// Auto enable __HIP_PLATFORM_AMD__ if compiling on the AMD platform
// Other compilers (GCC, ICC, etc.) need to set one of these macros explicitly
#if defined(__clang__) && defined(__HIP__)
#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__
#endif
#endif // defined(__clang__) && defined(__HIP__)
// Auto enable __HIP_PLATFORM_NVIDIA__ if compiling for the NVIDIA platform
#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__) && !defined(__HIP__))
#ifndef __HIP_PLATFORM_NVIDIA__
#define __HIP_PLATFORM_NVIDIA__
#endif
#ifdef __CUDACC__
#define __HIPCC__
#endif
#endif //__NVCC__
// Auto enable __HIP_DEVICE_COMPILE__ if compiled in HCC or NVCC device path
#if (defined(__HCC_ACCELERATOR__) && __HCC_ACCELERATOR__ != 0) || \
(defined(__CUDA_ARCH__) && __CUDA_ARCH__ != 0)
#define __HIP_DEVICE_COMPILE__ 1
#endif
#ifdef __GNUC__
#define HIP_PUBLIC_API __attribute__ ((visibility ("default")))
#define HIP_INTERNAL_EXPORTED_API __attribute__ ((visibility ("default")))
#else
#define HIP_PUBLIC_API
#define HIP_INTERNAL_EXPORTED_API
#endif
#if __HIP_DEVICE_COMPILE__ == 0
// 32-bit Atomics
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (0)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (0)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (0)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (0)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
// 64-bit Atomics
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (0)
#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (0)
// Doubles
#define __HIP_ARCH_HAS_DOUBLES__ (0)
// Warp cross-lane operations
#define __HIP_ARCH_HAS_WARP_VOTE__ (0)
#define __HIP_ARCH_HAS_WARP_BALLOT__ (0)
#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (0)
#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (0)
// Sync
#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (0)
#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (0)
// Misc
#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (0)
#define __HIP_ARCH_HAS_3DGRID__ (0)
#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (0)
#endif
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#endif
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_LIBRARY_TYPES_H
#define HIP_INCLUDE_HIP_LIBRARY_TYPES_H
#if !defined(__HIPCC_RTC__)
#include <hip/hip_common.h>
#endif
#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
typedef enum hipDataType {
HIP_R_32F = 0,
HIP_R_64F = 1,
HIP_R_16F = 2,
HIP_R_8I = 3,
HIP_C_32F = 4,
HIP_C_64F = 5,
HIP_C_16F = 6,
HIP_C_8I = 7,
HIP_R_8U = 8,
HIP_C_8U = 9,
HIP_R_32I = 10,
HIP_C_32I = 11,
HIP_R_32U = 12,
HIP_C_32U = 13,
HIP_R_16BF = 14,
HIP_C_16BF = 15,
HIP_R_4I = 16,
HIP_C_4I = 17,
HIP_R_4U = 18,
HIP_C_4U = 19,
HIP_R_16I = 20,
HIP_C_16I = 21,
HIP_R_16U = 22,
HIP_C_16U = 23,
HIP_R_64I = 24,
HIP_C_64I = 25,
HIP_R_64U = 26,
HIP_C_64U = 27,
// HIP specific Data Types
HIP_R_8F_E4M3_FNUZ = 1000,
HIP_R_8F_E5M2_FNUZ = 1001
} hipDataType;
typedef enum hipLibraryPropertyType {
HIP_LIBRARY_MAJOR_VERSION,
HIP_LIBRARY_MINOR_VERSION,
HIP_LIBRARY_PATCH_LEVEL
} hipLibraryPropertyType;
#elif !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
#include "library_types.h"
#else
#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
#endif
#endif
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_DRIVER_TYPES_H
#define HIP_INCLUDE_HIP_DRIVER_TYPES_H
#if !defined(__HIPCC_RTC__)
#include <hip/hip_common.h>
#endif
#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
#include "driver_types.h"
#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
#if !defined(__HIPCC_RTC__)
#ifndef __cplusplus
#include <stdbool.h>
#endif
#endif // !defined(__HIPCC_RTC__)
typedef void* hipDeviceptr_t;
typedef enum hipChannelFormatKind {
hipChannelFormatKindSigned = 0,
hipChannelFormatKindUnsigned = 1,
hipChannelFormatKindFloat = 2,
hipChannelFormatKindNone = 3
}hipChannelFormatKind;
typedef struct hipChannelFormatDesc {
int x;
int y;
int z;
int w;
enum hipChannelFormatKind f;
}hipChannelFormatDesc;
#define HIP_TRSA_OVERRIDE_FORMAT 0x01
#define HIP_TRSF_READ_AS_INTEGER 0x01
#define HIP_TRSF_NORMALIZED_COORDINATES 0x02
#define HIP_TRSF_SRGB 0x10
typedef struct hipArray* hipArray_t;
typedef const struct hipArray* hipArray_const_t;
typedef enum hipArray_Format {
HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01,
HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02,
HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03,
HIP_AD_FORMAT_SIGNED_INT8 = 0x08,
HIP_AD_FORMAT_SIGNED_INT16 = 0x09,
HIP_AD_FORMAT_SIGNED_INT32 = 0x0a,
HIP_AD_FORMAT_HALF = 0x10,
HIP_AD_FORMAT_FLOAT = 0x20
}hipArray_Format;
typedef struct HIP_ARRAY_DESCRIPTOR {
size_t Width;
size_t Height;
enum hipArray_Format Format;
unsigned int NumChannels;
}HIP_ARRAY_DESCRIPTOR;
typedef struct HIP_ARRAY3D_DESCRIPTOR {
size_t Width;
size_t Height;
size_t Depth;
enum hipArray_Format Format;
unsigned int NumChannels;
unsigned int Flags;
}HIP_ARRAY3D_DESCRIPTOR;
#if !defined(__HIPCC_RTC__)
typedef struct hip_Memcpy2D {
size_t srcXInBytes;
size_t srcY;
hipMemoryType srcMemoryType;
const void* srcHost;
hipDeviceptr_t srcDevice;
hipArray_t srcArray;
size_t srcPitch;
size_t dstXInBytes;
size_t dstY;
hipMemoryType dstMemoryType;
void* dstHost;
hipDeviceptr_t dstDevice;
hipArray_t dstArray;
size_t dstPitch;
size_t WidthInBytes;
size_t Height;
} hip_Memcpy2D;
#endif // !defined(__HIPCC_RTC__)
typedef struct hipMipmappedArray {
void* data;
struct hipChannelFormatDesc desc;
unsigned int type;
unsigned int width;
unsigned int height;
unsigned int depth;
unsigned int min_mipmap_level;
unsigned int max_mipmap_level;
unsigned int flags;
enum hipArray_Format format;
unsigned int num_channels;
} hipMipmappedArray;
typedef struct hipMipmappedArray* hipMipmappedArray_t;
typedef hipMipmappedArray_t hipmipmappedArray;
typedef const struct hipMipmappedArray* hipMipmappedArray_const_t;
/**
* hip resource types
*/
typedef enum hipResourceType {
hipResourceTypeArray = 0x00,
hipResourceTypeMipmappedArray = 0x01,
hipResourceTypeLinear = 0x02,
hipResourceTypePitch2D = 0x03
}hipResourceType;
typedef enum HIPresourcetype_enum {
HIP_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */
HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
HIP_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */
HIP_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */
} HIPresourcetype, hipResourcetype;
/**
* hip address modes
*/
typedef enum HIPaddress_mode_enum {
HIP_TR_ADDRESS_MODE_WRAP = 0,
HIP_TR_ADDRESS_MODE_CLAMP = 1,
HIP_TR_ADDRESS_MODE_MIRROR = 2,
HIP_TR_ADDRESS_MODE_BORDER = 3
} HIPaddress_mode;
/**
* hip filter modes
*/
typedef enum HIPfilter_mode_enum {
HIP_TR_FILTER_MODE_POINT = 0,
HIP_TR_FILTER_MODE_LINEAR = 1
} HIPfilter_mode;
/**
* Texture descriptor
*/
typedef struct HIP_TEXTURE_DESC_st {
HIPaddress_mode addressMode[3]; /**< Address modes */
HIPfilter_mode filterMode; /**< Filter mode */
unsigned int flags; /**< Flags */
unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */
HIPfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
float mipmapLevelBias; /**< Mipmap level bias */
float minMipmapLevelClamp; /**< Mipmap minimum level clamp */
float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */
float borderColor[4]; /**< Border Color */
int reserved[12];
} HIP_TEXTURE_DESC;
/**
* hip texture resource view formats
*/
typedef enum hipResourceViewFormat {
hipResViewFormatNone = 0x00,
hipResViewFormatUnsignedChar1 = 0x01,
hipResViewFormatUnsignedChar2 = 0x02,
hipResViewFormatUnsignedChar4 = 0x03,
hipResViewFormatSignedChar1 = 0x04,
hipResViewFormatSignedChar2 = 0x05,
hipResViewFormatSignedChar4 = 0x06,
hipResViewFormatUnsignedShort1 = 0x07,
hipResViewFormatUnsignedShort2 = 0x08,
hipResViewFormatUnsignedShort4 = 0x09,
hipResViewFormatSignedShort1 = 0x0a,
hipResViewFormatSignedShort2 = 0x0b,
hipResViewFormatSignedShort4 = 0x0c,
hipResViewFormatUnsignedInt1 = 0x0d,
hipResViewFormatUnsignedInt2 = 0x0e,
hipResViewFormatUnsignedInt4 = 0x0f,
hipResViewFormatSignedInt1 = 0x10,
hipResViewFormatSignedInt2 = 0x11,
hipResViewFormatSignedInt4 = 0x12,
hipResViewFormatHalf1 = 0x13,
hipResViewFormatHalf2 = 0x14,
hipResViewFormatHalf4 = 0x15,
hipResViewFormatFloat1 = 0x16,
hipResViewFormatFloat2 = 0x17,
hipResViewFormatFloat4 = 0x18,
hipResViewFormatUnsignedBlockCompressed1 = 0x19,
hipResViewFormatUnsignedBlockCompressed2 = 0x1a,
hipResViewFormatUnsignedBlockCompressed3 = 0x1b,
hipResViewFormatUnsignedBlockCompressed4 = 0x1c,
hipResViewFormatSignedBlockCompressed4 = 0x1d,
hipResViewFormatUnsignedBlockCompressed5 = 0x1e,
hipResViewFormatSignedBlockCompressed5 = 0x1f,
hipResViewFormatUnsignedBlockCompressed6H = 0x20,
hipResViewFormatSignedBlockCompressed6H = 0x21,
hipResViewFormatUnsignedBlockCompressed7 = 0x22
}hipResourceViewFormat;
typedef enum HIPresourceViewFormat_enum
{
HIP_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */
HIP_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */
HIP_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */
HIP_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */
HIP_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */
HIP_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */
HIP_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */
HIP_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */
HIP_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */
HIP_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */
HIP_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */
HIP_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */
HIP_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */
HIP_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */
HIP_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */
HIP_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */
HIP_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */
HIP_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */
HIP_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */
HIP_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */
HIP_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */
HIP_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */
HIP_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */
HIP_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */
HIP_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */
HIP_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */
HIP_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */
HIP_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */
HIP_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */
HIP_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */
HIP_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */
HIP_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */
HIP_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
HIP_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */
HIP_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */
} HIPresourceViewFormat;
/**
* HIP resource descriptor
*/
typedef struct hipResourceDesc {
enum hipResourceType resType;
union {
struct {
hipArray_t array;
} array;
struct {
hipMipmappedArray_t mipmap;
} mipmap;
struct {
void* devPtr;
struct hipChannelFormatDesc desc;
size_t sizeInBytes;
} linear;
struct {
void* devPtr;
struct hipChannelFormatDesc desc;
size_t width;
size_t height;
size_t pitchInBytes;
} pitch2D;
} res;
}hipResourceDesc;
typedef struct HIP_RESOURCE_DESC_st
{
HIPresourcetype resType; /**< Resource type */
union {
struct {
hipArray_t hArray; /**< HIP array */
} array;
struct {
hipMipmappedArray_t hMipmappedArray; /**< HIP mipmapped array */
} mipmap;
struct {
hipDeviceptr_t devPtr; /**< Device pointer */
hipArray_Format format; /**< Array format */
unsigned int numChannels; /**< Channels per array element */
size_t sizeInBytes; /**< Size in bytes */
} linear;
struct {
hipDeviceptr_t devPtr; /**< Device pointer */
hipArray_Format format; /**< Array format */
unsigned int numChannels; /**< Channels per array element */
size_t width; /**< Width of the array in elements */
size_t height; /**< Height of the array in elements */
size_t pitchInBytes; /**< Pitch between two rows in bytes */
} pitch2D;
struct {
int reserved[32];
} reserved;
} res;
unsigned int flags; /**< Flags (must be zero) */
} HIP_RESOURCE_DESC;
/**
* hip resource view descriptor
*/
struct hipResourceViewDesc {
enum hipResourceViewFormat format;
size_t width;
size_t height;
size_t depth;
unsigned int firstMipmapLevel;
unsigned int lastMipmapLevel;
unsigned int firstLayer;
unsigned int lastLayer;
};
/**
* Resource view descriptor
*/
typedef struct HIP_RESOURCE_VIEW_DESC_st
{
HIPresourceViewFormat format; /**< Resource view format */
size_t width; /**< Width of the resource view */
size_t height; /**< Height of the resource view */
size_t depth; /**< Depth of the resource view */
unsigned int firstMipmapLevel; /**< First defined mipmap level */
unsigned int lastMipmapLevel; /**< Last defined mipmap level */
unsigned int firstLayer; /**< First layer index */
unsigned int lastLayer; /**< Last layer index */
unsigned int reserved[16];
} HIP_RESOURCE_VIEW_DESC;
/**
* Memory copy types
*
*/
#if !defined(__HIPCC_RTC__)
typedef enum hipMemcpyKind {
hipMemcpyHostToHost = 0, ///< Host-to-Host Copy
hipMemcpyHostToDevice = 1, ///< Host-to-Device Copy
hipMemcpyDeviceToHost = 2, ///< Device-to-Host Copy
hipMemcpyDeviceToDevice = 3, ///< Device-to-Device Copy
hipMemcpyDefault =
4 ///< Runtime will automatically determine copy-kind based on virtual addresses.
} hipMemcpyKind;
typedef struct hipPitchedPtr {
void* ptr;
size_t pitch;
size_t xsize;
size_t ysize;
} hipPitchedPtr;
typedef struct hipExtent {
size_t width; // Width in elements when referring to array memory, in bytes when referring to
// linear memory
size_t height;
size_t depth;
} hipExtent;
typedef struct hipPos {
size_t x;
size_t y;
size_t z;
} hipPos;
typedef struct hipMemcpy3DParms {
hipArray_t srcArray;
struct hipPos srcPos;
struct hipPitchedPtr srcPtr;
hipArray_t dstArray;
struct hipPos dstPos;
struct hipPitchedPtr dstPtr;
struct hipExtent extent;
enum hipMemcpyKind kind;
} hipMemcpy3DParms;
typedef struct HIP_MEMCPY3D {
size_t srcXInBytes;
size_t srcY;
size_t srcZ;
size_t srcLOD;
hipMemoryType srcMemoryType;
const void* srcHost;
hipDeviceptr_t srcDevice;
hipArray_t srcArray;
size_t srcPitch;
size_t srcHeight;
size_t dstXInBytes;
size_t dstY;
size_t dstZ;
size_t dstLOD;
hipMemoryType dstMemoryType;
void* dstHost;
hipDeviceptr_t dstDevice;
hipArray_t dstArray;
size_t dstPitch;
size_t dstHeight;
size_t WidthInBytes;
size_t Height;
size_t Depth;
} HIP_MEMCPY3D;
static inline struct hipPitchedPtr make_hipPitchedPtr(void* d, size_t p, size_t xsz,
size_t ysz) {
struct hipPitchedPtr s;
s.ptr = d;
s.pitch = p;
s.xsize = xsz;
s.ysize = ysz;
return s;
}
static inline struct hipPos make_hipPos(size_t x, size_t y, size_t z) {
struct hipPos p;
p.x = x;
p.y = y;
p.z = z;
return p;
}
static inline struct hipExtent make_hipExtent(size_t w, size_t h, size_t d) {
struct hipExtent e;
e.width = w;
e.height = h;
e.depth = d;
return e;
}
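/*
 * Illustrative sketch (not part of this header): the make_* helpers above
 * compose a hipMemcpy3DParms for a 3D copy of linear memory. hipMemcpy3D is
 * declared elsewhere in the HIP runtime; 'devBuf', 'hostBuf', 'w', 'h', 'd'
 * are hypothetical.
 *
 *   hipMemcpy3DParms p = {};
 *   p.srcPtr = make_hipPitchedPtr(devBuf, w * sizeof(float), w, h);
 *   p.srcPos = make_hipPos(0, 0, 0);
 *   p.dstPtr = make_hipPitchedPtr(hostBuf, w * sizeof(float), w, h);
 *   p.dstPos = make_hipPos(0, 0, 0);
 *   p.extent = make_hipExtent(w * sizeof(float), h, d);  // width in bytes for linear memory
 *   p.kind = hipMemcpyDeviceToHost;
 *   hipMemcpy3D(&p);
 */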
typedef enum hipFunction_attribute {
HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES,
HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
HIP_FUNC_ATTRIBUTE_NUM_REGS,
HIP_FUNC_ATTRIBUTE_PTX_VERSION,
HIP_FUNC_ATTRIBUTE_BINARY_VERSION,
HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA,
HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT,
HIP_FUNC_ATTRIBUTE_MAX
} hipFunction_attribute;
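/*
 * Illustrative sketch (not part of this header): querying one of the
 * attributes above for a loaded kernel. hipFuncGetAttribute and hipFunction_t
 * are declared elsewhere in the HIP runtime; 'kernelFunc' is a hypothetical
 * handle obtained from hipModuleGetFunction.
 *
 *   int numRegs = 0;
 *   hipFuncGetAttribute(&numRegs, HIP_FUNC_ATTRIBUTE_NUM_REGS, kernelFunc);
 */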
typedef enum hipPointer_attribute {
HIP_POINTER_ATTRIBUTE_CONTEXT = 1, ///< The context on which a pointer was allocated
///< @warning - not supported in HIP
HIP_POINTER_ATTRIBUTE_MEMORY_TYPE, ///< memory type describing location of a pointer
HIP_POINTER_ATTRIBUTE_DEVICE_POINTER,///< address at which the pointer is allocated on device
HIP_POINTER_ATTRIBUTE_HOST_POINTER, ///< address at which the pointer is allocated on host
HIP_POINTER_ATTRIBUTE_P2P_TOKENS, ///< A pair of tokens for use with linux kernel interface
///< @warning - not supported in HIP
HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS, ///< Synchronize every synchronous memory operation
///< initiated on this region
HIP_POINTER_ATTRIBUTE_BUFFER_ID, ///< Unique ID for an allocated memory region
HIP_POINTER_ATTRIBUTE_IS_MANAGED, ///< Indicates if the pointer points to managed memory
HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL,///< device ordinal of a device on which a pointer
///< was allocated or registered
HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE, ///< if this pointer maps to an allocation
///< that is suitable for hipIpcGetMemHandle
///< @warning - not supported in HIP
HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,///< Starting address for this requested pointer
HIP_POINTER_ATTRIBUTE_RANGE_SIZE, ///< Size of the address range for this requested pointer
HIP_POINTER_ATTRIBUTE_MAPPED, ///< tells if this pointer is in a valid address range
///< that is mapped to a backing allocation
HIP_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES,///< Bitmask of allowed hipmemAllocationHandleType
///< for this allocation @warning - not supported in HIP
HIP_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE, ///< returns if the memory referenced by
///< this pointer can be used with the GPUDirect RDMA API
///< @warning - not supported in HIP
HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS, ///< Returns the access flags the device associated with
///< for the corresponding memory referenced by the ptr
HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE ///< Returns the mempool handle for the allocation if
///< it was allocated from a mempool
///< @warning - not supported in HIP
} hipPointer_attribute;
#endif // !defined(__HIPCC_RTC__)
#else
#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
#endif
#endif
/*
Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file surface_types.h
* @brief Defines surface types for HIP runtime.
*/
#ifndef HIP_INCLUDE_HIP_SURFACE_TYPES_H
#define HIP_INCLUDE_HIP_SURFACE_TYPES_H
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-identifier"
#endif
#if !defined(__HIPCC_RTC__)
#include <hip/driver_types.h>
#endif
/**
* An opaque value that represents a hip surface object
*/
struct __hip_surface;
typedef struct __hip_surface* hipSurfaceObject_t;
/**
* hip surface reference
*/
struct surfaceReference {
hipSurfaceObject_t surfaceObject;
};
/**
* hip surface boundary modes
*/
enum hipSurfaceBoundaryMode {
hipBoundaryModeZero = 0,
hipBoundaryModeTrap = 1,
hipBoundaryModeClamp = 2
};
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#endif /* !HIP_INCLUDE_HIP_SURFACE_TYPES_H */
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H
#if !defined(__HIPCC_RTC__)
#include <hip/hip_common.h>
#include <hip/driver_types.h>
#include <hip/amd_detail/amd_hip_vector_types.h>
#endif
#ifdef __cplusplus
extern "C" HIP_PUBLIC_API
hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f);
static inline hipChannelFormatDesc hipCreateChannelDescHalf() {
int e = (int)sizeof(unsigned short) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
}
static inline hipChannelFormatDesc hipCreateChannelDescHalf1() {
int e = (int)sizeof(unsigned short) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
}
static inline hipChannelFormatDesc hipCreateChannelDescHalf2() {
int e = (int)sizeof(unsigned short) * 8;
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
}
static inline hipChannelFormatDesc hipCreateChannelDescHalf4() {
int e = (int)sizeof(unsigned short) * 8;
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
}
template <typename T>
static inline hipChannelFormatDesc hipCreateChannelDesc() {
return hipCreateChannelDesc(0, 0, 0, 0, hipChannelFormatKindNone);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<char>() {
int e = (int)sizeof(char) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<signed char>() {
int e = (int)sizeof(signed char) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned char>() {
int e = (int)sizeof(unsigned char) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uchar1>() {
int e = (int)sizeof(unsigned char) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<char1>() {
int e = (int)sizeof(signed char) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uchar2>() {
int e = (int)sizeof(unsigned char) * 8;
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<char2>() {
int e = (int)sizeof(signed char) * 8;
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
}
#ifndef __GNUC__ // vector3 is the same as vector4
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uchar3>() {
int e = (int)sizeof(unsigned char) * 8;
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<char3>() {
int e = (int)sizeof(signed char) * 8;
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
}
#endif
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uchar4>() {
int e = (int)sizeof(unsigned char) * 8;
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<char4>() {
int e = (int)sizeof(signed char) * 8;
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned short>() {
int e = (int)sizeof(unsigned short) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<signed short>() {
int e = (int)sizeof(signed short) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ushort1>() {
int e = (int)sizeof(unsigned short) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<short1>() {
int e = (int)sizeof(signed short) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ushort2>() {
int e = (int)sizeof(unsigned short) * 8;
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<short2>() {
int e = (int)sizeof(signed short) * 8;
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
}
#ifndef __GNUC__
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ushort3>() {
int e = (int)sizeof(unsigned short) * 8;
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<short3>() {
int e = (int)sizeof(signed short) * 8;
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
}
#endif
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ushort4>() {
int e = (int)sizeof(unsigned short) * 8;
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<short4>() {
int e = (int)sizeof(signed short) * 8;
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned int>() {
int e = (int)sizeof(unsigned int) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<signed int>() {
int e = (int)sizeof(signed int) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uint1>() {
int e = (int)sizeof(unsigned int) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<int1>() {
int e = (int)sizeof(signed int) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uint2>() {
int e = (int)sizeof(unsigned int) * 8;
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<int2>() {
int e = (int)sizeof(signed int) * 8;
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
}
#ifndef __GNUC__
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uint3>() {
int e = (int)sizeof(unsigned int) * 8;
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<int3>() {
int e = (int)sizeof(signed int) * 8;
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
}
#endif
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<uint4>() {
int e = (int)sizeof(unsigned int) * 8;
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<int4>() {
int e = (int)sizeof(signed int) * 8;
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<float>() {
int e = (int)sizeof(float) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<float1>() {
int e = (int)sizeof(float) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindFloat);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<float2>() {
int e = (int)sizeof(float) * 8;
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindFloat);
}
#ifndef __GNUC__
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<float3>() {
int e = (int)sizeof(float) * 8;
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindFloat);
}
#endif
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<float4>() {
int e = (int)sizeof(float) * 8;
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindFloat);
}
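/*
 * Values produced by the specializations above (each follows directly from
 * the definitions):
 *   hipCreateChannelDesc<float>()  -> { 32,  0,  0,  0, hipChannelFormatKindFloat }
 *   hipCreateChannelDesc<uchar4>() -> {  8,  8,  8,  8, hipChannelFormatKindUnsigned }
 *   hipCreateChannelDesc<short2>() -> { 16, 16,  0,  0, hipChannelFormatKindSigned }
 */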
#if !defined(__LP64__)
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<unsigned long>() {
int e = (int)sizeof(unsigned long) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<signed long>() {
int e = (int)sizeof(signed long) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ulong1>() {
int e = (int)sizeof(unsigned long) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<long1>() {
int e = (int)sizeof(signed long) * 8;
return hipCreateChannelDesc(e, 0, 0, 0, hipChannelFormatKindSigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ulong2>() {
int e = (int)sizeof(unsigned long) * 8;
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<long2>() {
int e = (int)sizeof(signed long) * 8;
return hipCreateChannelDesc(e, e, 0, 0, hipChannelFormatKindSigned);
}
#ifndef __GNUC__
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ulong3>() {
int e = (int)sizeof(unsigned long) * 8;
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<long3>() {
int e = (int)sizeof(signed long) * 8;
return hipCreateChannelDesc(e, e, e, 0, hipChannelFormatKindSigned);
}
#endif
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<ulong4>() {
int e = (int)sizeof(unsigned long) * 8;
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindUnsigned);
}
template <>
inline hipChannelFormatDesc hipCreateChannelDesc<long4>() {
int e = (int)sizeof(signed long) * 8;
return hipCreateChannelDesc(e, e, e, e, hipChannelFormatKindSigned);
}
#endif /* !__LP64__ */
#else
struct hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
enum hipChannelFormatKind f);
#endif /* __cplusplus */
#endif /* !HIP_INCLUDE_HIP_AMD_DETAIL_CHANNEL_DESCRIPTOR_H */
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_TEXTURE_TYPES_H
#define HIP_INCLUDE_HIP_TEXTURE_TYPES_H
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-identifier"
#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
#pragma clang diagnostic ignored "-Wc++98-compat"
#endif
#if !defined(__HIPCC_RTC__)
#include <hip/hip_common.h>
#endif
#if !defined(__HIP_PLATFORM_AMD__) && defined(__HIP_PLATFORM_NVIDIA__)
#include "texture_types.h"
#elif defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__)
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if !defined(__HIPCC_RTC__)
#include <limits.h>
#include <hip/channel_descriptor.h>
#include <hip/driver_types.h>
#endif // !defined(__HIPCC_RTC__)
#define hipTextureType1D 0x01
#define hipTextureType2D 0x02
#define hipTextureType3D 0x03
#define hipTextureTypeCubemap 0x0C
#define hipTextureType1DLayered 0xF1
#define hipTextureType2DLayered 0xF2
#define hipTextureTypeCubemapLayered 0xFC
/**
 * Should be the same as HSA_IMAGE_OBJECT_SIZE_DWORD / HSA_SAMPLER_OBJECT_SIZE_DWORD
*/
#define HIP_IMAGE_OBJECT_SIZE_DWORD 12
#define HIP_SAMPLER_OBJECT_SIZE_DWORD 8
#define HIP_SAMPLER_OBJECT_OFFSET_DWORD HIP_IMAGE_OBJECT_SIZE_DWORD
#define HIP_TEXTURE_OBJECT_SIZE_DWORD (HIP_IMAGE_OBJECT_SIZE_DWORD + HIP_SAMPLER_OBJECT_SIZE_DWORD)
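/*
 * Layout implied by the constants above: a texture object is 12 image dwords
 * immediately followed by 8 sampler dwords, so device code reaches the
 * sampler words at a fixed offset (this is what the TEXTURE_PARAMETERS_INIT
 * macro later in this header does):
 *
 *   unsigned int ADDRESS_SPACE_CONSTANT* img =
 *       (unsigned int ADDRESS_SPACE_CONSTANT*)texObj;
 *   unsigned int ADDRESS_SPACE_CONSTANT* smp = img + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
 */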
/**
* An opaque value that represents a hip texture object
*/
struct __hip_texture;
typedef struct __hip_texture* hipTextureObject_t;
/**
* hip texture address modes
*/
enum hipTextureAddressMode {
hipAddressModeWrap = 0,
hipAddressModeClamp = 1,
hipAddressModeMirror = 2,
hipAddressModeBorder = 3
};
/**
* hip texture filter modes
*/
enum hipTextureFilterMode { hipFilterModePoint = 0, hipFilterModeLinear = 1 };
/**
* hip texture read modes
*/
enum hipTextureReadMode { hipReadModeElementType = 0, hipReadModeNormalizedFloat = 1 };
/**
* hip texture reference
*/
typedef struct textureReference {
int normalized;
    enum hipTextureReadMode readMode; // used only for driver APIs
enum hipTextureFilterMode filterMode;
enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions
struct hipChannelFormatDesc channelDesc;
int sRGB; // Perform sRGB->linear conversion during texture read
unsigned int maxAnisotropy; // Limit to the anisotropy ratio
enum hipTextureFilterMode mipmapFilterMode;
float mipmapLevelBias;
float minMipmapLevelClamp;
float maxMipmapLevelClamp;
hipTextureObject_t textureObject;
int numChannels;
enum hipArray_Format format;
} textureReference;
/**
* hip texture descriptor
*/
typedef struct hipTextureDesc {
enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions
enum hipTextureFilterMode filterMode;
enum hipTextureReadMode readMode;
int sRGB; // Perform sRGB->linear conversion during texture read
float borderColor[4];
int normalizedCoords;
unsigned int maxAnisotropy;
enum hipTextureFilterMode mipmapFilterMode;
float mipmapLevelBias;
float minMipmapLevelClamp;
float maxMipmapLevelClamp;
} hipTextureDesc;
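/*
 * Illustrative sketch (not part of this header): a typical hipTextureDesc for
 * clamped, bilinear 2D sampling with normalized coordinates.
 * hipCreateTextureObject is declared elsewhere in the HIP runtime; 'resDesc'
 * is a hypothetical hipResourceDesc filled as shown earlier.
 *
 *   hipTextureDesc texDesc = {};
 *   texDesc.addressMode[0] = hipAddressModeClamp;
 *   texDesc.addressMode[1] = hipAddressModeClamp;
 *   texDesc.filterMode = hipFilterModeLinear;
 *   texDesc.readMode = hipReadModeNormalizedFloat;  // 8/16-bit channels map to [0,1]
 *   texDesc.normalizedCoords = 1;                   // sample with coordinates in [0,1)
 *   hipTextureObject_t texObj;
 *   hipCreateTextureObject(&texObj, &resDesc, &texDesc, nullptr);
 */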
#if __cplusplus
/*******************************************************************************
* *
* *
* *
*******************************************************************************/
#if __HIP__
#define __HIP_TEXTURE_ATTRIB __attribute__((device_builtin_texture_type))
#else
#define __HIP_TEXTURE_ATTRIB
#endif
typedef textureReference* hipTexRef;
template <class T, int texType = hipTextureType1D,
enum hipTextureReadMode mode = hipReadModeElementType>
struct __HIP_TEXTURE_ATTRIB texture : public textureReference {
texture(int norm = 0, enum hipTextureFilterMode fMode = hipFilterModePoint,
enum hipTextureAddressMode aMode = hipAddressModeClamp) {
normalized = norm;
readMode = mode;
filterMode = fMode;
addressMode[0] = aMode;
addressMode[1] = aMode;
addressMode[2] = aMode;
channelDesc = hipCreateChannelDesc<T>();
sRGB = 0;
textureObject = nullptr;
maxAnisotropy = 0;
mipmapLevelBias = 0;
minMipmapLevelClamp = 0;
maxMipmapLevelClamp = 0;
}
texture(int norm, enum hipTextureFilterMode fMode, enum hipTextureAddressMode aMode,
struct hipChannelFormatDesc desc) {
normalized = norm;
readMode = mode;
filterMode = fMode;
addressMode[0] = aMode;
addressMode[1] = aMode;
addressMode[2] = aMode;
channelDesc = desc;
sRGB = 0;
textureObject = nullptr;
maxAnisotropy = 0;
mipmapLevelBias = 0;
minMipmapLevelClamp = 0;
maxMipmapLevelClamp = 0;
}
};
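/*
 * Illustrative sketch (not part of this header): declaring a texture
 * reference with the template above. The reference is bound to memory on the
 * host side (via the legacy hipBindTexture* entry points, declared elsewhere)
 * and read in device code with the tex* fetch functions defined later in this
 * header.
 *
 *   texture<float, hipTextureType2D, hipReadModeElementType> tex2dRef;
 */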
#endif /* __cplusplus */
#else
#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__");
#endif
#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#endif
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if !defined(__HIPCC_RTC__)
#include <hip/hip_vector_types.h>
#endif
extern "C" {
#define ADDRESS_SPACE_CONSTANT __attribute__((address_space(4)))
__device__ float4::Native_vec_ __ockl_image_load_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
__device__ float4::Native_vec_ __ockl_image_load_1Db(unsigned int ADDRESS_SPACE_CONSTANT*i, int c);
__device__ float4::Native_vec_ __ockl_image_load_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_load_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f);
__device__ float4::Native_vec_ __ockl_image_load_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f);
__device__ float4::Native_vec_ __ockl_image_load_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l);
__device__ float4::Native_vec_ __ockl_image_load_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l);
__device__ void __ockl_image_store_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, float4::Native_vec_ p);
__device__ void __ockl_image_store_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, float4::Native_vec_ p);
__device__ void __ockl_image_store_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, float4::Native_vec_ p);
__device__ void __ockl_image_store_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, int c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, int2::Native_vec_ c, int f, int l, float4::Native_vec_ p);
__device__ void __ockl_image_store_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, int4::Native_vec_ c, int f, int l, float4::Native_vec_ p);
__device__ float4::Native_vec_ __ockl_image_sample_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c);
__device__ float4::Native_vec_ __ockl_image_sample_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_sample_grad_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float dx, float dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float dx, float dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float2::Native_vec_ dx, float2::Native_vec_ dy);
__device__ float4::Native_vec_ __ockl_image_sample_grad_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float4::Native_vec_ dx, float4::Native_vec_ dy);
__device__ float4::Native_vec_ __ockl_image_sample_lod_1D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_1Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_2Da(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_3D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_CM(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_sample_lod_CMa(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float4::Native_vec_ c, float l);
__device__ float4::Native_vec_ __ockl_image_gather4r_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_gather4g_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_gather4b_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ float4::Native_vec_ __ockl_image_gather4a_2D(unsigned int ADDRESS_SPACE_CONSTANT*i, unsigned int ADDRESS_SPACE_CONSTANT*s, float2::Native_vec_ c);
__device__ int __ockl_image_channel_data_type_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_data_type_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_1D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_1Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_1Db(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2Da(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2Dad(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_2Dd(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_3D(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_CM(unsigned int ADDRESS_SPACE_CONSTANT* i);
__device__ int __ockl_image_channel_order_CMa(unsigned int ADDRESS_SPACE_CONSTANT* i);
}
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if defined(__cplusplus)
#if !defined(__HIPCC_RTC__)
#include <hip/hip_vector_types.h>
#include <hip/hip_texture_types.h>
#include <hip/amd_detail/ockl_image.h>
#include <type_traits>
#endif // !defined(__HIPCC_RTC__)
#define TEXTURE_PARAMETERS_INIT \
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)t.textureObject; \
unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
template<typename T>
struct __hip_is_tex_surf_scalar_channel_type
{
static constexpr bool value =
std::is_same<T, char>::value ||
std::is_same<T, unsigned char>::value ||
std::is_same<T, short>::value ||
std::is_same<T, unsigned short>::value ||
std::is_same<T, int>::value ||
std::is_same<T, unsigned int>::value ||
std::is_same<T, float>::value;
};
template<typename T>
struct __hip_is_tex_surf_channel_type
{
static constexpr bool value =
__hip_is_tex_surf_scalar_channel_type<T>::value;
};
template<
typename T,
unsigned int rank>
struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>
{
static constexpr bool value =
__hip_is_tex_surf_scalar_channel_type<T>::value &&
((rank == 1) ||
(rank == 2) ||
(rank == 4));
};
template<typename T>
struct __hip_is_tex_normalized_channel_type
{
static constexpr bool value =
std::is_same<T, char>::value ||
std::is_same<T, unsigned char>::value ||
std::is_same<T, short>::value ||
std::is_same<T, unsigned short>::value;
};
template<
typename T,
unsigned int rank>
struct __hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>
{
static constexpr bool value =
__hip_is_tex_normalized_channel_type<T>::value &&
((rank == 1) ||
(rank == 2) ||
(rank == 4));
};
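/*
 * Consequences of the traits above (each follows from the definitions):
 *   __hip_is_tex_surf_channel_type<float>::value                     -> true
 *   __hip_is_tex_surf_channel_type<float4>::value                    -> true  (rank 4)
 *   __hip_is_tex_surf_channel_type<double>::value                    -> false (not a channel scalar)
 *   __hip_is_tex_surf_channel_type<HIP_vector_type<float, 3>>::value -> false (rank 3 not allowed)
 *   __hip_is_tex_normalized_channel_type<uchar2>::value              -> true  (8-bit integer)
 *   __hip_is_tex_normalized_channel_type<float>::value               -> false (already floating point)
 */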
template <
typename T,
hipTextureReadMode readMode,
typename Enable = void>
struct __hip_tex_ret
{
static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
};
/*
* Map from device function return U to scalar texture type T
*/
template<typename T, typename U>
__forceinline__ __device__
typename std::enable_if<
__hip_is_tex_surf_scalar_channel_type<T>::value, const T>::type
__hipMapFrom(const U &u) {
if constexpr (sizeof(T) < sizeof(float)) {
union {
U u;
int i;
} d = { u };
return static_cast<T>(d.i);
} else { // sizeof(T) == sizeof(float)
union {
U u;
T t;
} d = { u };
return d.t;
}
}
/*
* Map from device function return U to vector texture type T
*/
template<typename T, typename U>
__forceinline__ __device__
typename std::enable_if<
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const T>::type
__hipMapFrom(const U &u) {
if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
union {
U u;
int4 i4;
} d = { u };
return __hipMapVector<typename T::value_type, sizeof(T)/sizeof(typename T::value_type)>(d.i4);
} else { // sizeof(typename T::value_type) == sizeof(float)
union {
U u;
T t;
} d = { u };
return d.t;
}
}
/*
* Map from scalar texture type T to device function input U
*/
template<typename U, typename T>
__forceinline__ __device__
typename std::enable_if<
__hip_is_tex_surf_scalar_channel_type<T>::value, const U>::type
__hipMapTo(const T &t) {
if constexpr (sizeof(T) < sizeof(float)) {
union {
U u;
int i;
} d = { 0 };
d.i = static_cast<int>(t);
return d.u;
} else { // sizeof(T) == sizeof(float)
union {
U u;
T t;
} d = { 0 };
d.t = t;
return d.u;
}
}
/*
* Map from vector texture type T to device function input U
*/
template<typename U, typename T>
__forceinline__ __device__
typename std::enable_if<
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const U>::type
__hipMapTo(const T &t) {
if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
union {
U u;
int4 i4;
} d = { 0 };
d.i4 = __hipMapVector<int, 4>(t);
return d.u;
} else { // sizeof(typename T::value_type) == sizeof(float)
union {
U u;
T t;
} d = { 0 };
d.t = t;
return d.u;
}
}
template <
typename T,
hipTextureReadMode readMode>
using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode, bool>::type;
template <typename T>
struct __hip_tex_ret<
T,
hipReadModeElementType,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
{
using type = T;
};
template<
typename T,
unsigned int rank>
struct __hip_tex_ret<
HIP_vector_type<T, rank>,
hipReadModeElementType,
typename std::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
{
using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeElementType>, rank>;
};
template<typename T>
struct __hip_tex_ret<
T,
hipReadModeNormalizedFloat,
typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
{
using type = float;
};
template<
typename T,
unsigned int rank>
struct __hip_tex_ret<
HIP_vector_type<T, rank>,
hipReadModeNormalizedFloat,
typename std::enable_if<__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
{
using type = HIP_vector_type<__hip_tex_ret_t<T, hipReadModeNormalizedFloat>, rank>;
};
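/*
 * Examples of the return-type mapping above:
 *   __hip_tex_ret_t<int2, hipReadModeElementType>       -> int2   (element type preserved)
 *   __hip_tex_ret_t<uchar4, hipReadModeNormalizedFloat> -> float4 (8-bit channels normalize)
 *   __hip_tex_ret_t<float, hipReadModeNormalizedFloat>  -> ill-formed: float channels are
 *       not normalizable, so the primary template's static_assert fires.
 */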
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1Dfetch(texture<T, hipTextureType1D, readMode> t, int x)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_load_1Db(i, x);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1D(texture<T, hipTextureType1D, readMode> t, float x)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_1D(i, s, x);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2D(texture<T, hipTextureType2D, readMode> t, float x, float y)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
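/*
 * Illustrative sketch (not part of this header): reading a 2D texture
 * reference from a kernel. 'tex2dRef' is a hypothetical file-scope texture
 * reference bound on the host side; coordinates are offset by 0.5f to sample
 * texel centers with unnormalized coordinates.
 *
 *   texture<float, hipTextureType2D, hipReadModeElementType> tex2dRef;
 *   __global__ void readKernel(float* out, int w, int h) {
 *       int ix = blockIdx.x * blockDim.x + threadIdx.x;
 *       int iy = blockIdx.y * blockDim.y + threadIdx.y;
 *       if (ix < w && iy < h)
 *           out[iy * w + ix] = tex2D(tex2dRef, ix + 0.5f, iy + 0.5f);
 *   }
 */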
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayered(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayered(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3D(texture<T, hipTextureType3D, readMode> t, float x, float y, float z)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemap(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLod(texture<T, hipTextureType1D, readMode> t, float x, float level)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLod(texture<T, hipTextureType2D, readMode> t, float x, float y, float level)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredLod(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float level)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredLod(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float level)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DLod(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float level)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLod(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float level)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayered(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredLod(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float level)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapGrad(texture<T, hipTextureTypeCubemap, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
{
TEXTURE_PARAMETERS_INIT;
// TODO missing in device libs.
// auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
// return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
return {};
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> texCubemapLayeredGrad(texture<T, hipTextureTypeCubemapLayered, readMode> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
{
TEXTURE_PARAMETERS_INIT;
// TODO missing in device libs.
// auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
// return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
return {};
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DGrad(texture<T, hipTextureType1D, readMode> t, float x, float dPdx, float dPdy)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DGrad(texture<T, hipTextureType2D, readMode> t, float x, float y, float2 dPdx, float2 dPdy)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex1DLayeredGrad(texture<T, hipTextureType1DLayered, readMode> t, float x, int layer, float dPdx, float dPdy)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex2DLayeredGrad(texture<T, hipTextureType2DLayered, readMode> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex_ret_t<T, readMode> tex3DGrad(texture<T, hipTextureType3D, readMode> t, float x, float y, float z, float4 dPdx, float4 dPdy)
{
TEXTURE_PARAMETERS_INIT;
auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
return __hipMapFrom<__hip_tex_ret_t<T, readMode>>(tmp);
}
template <
typename T,
hipTextureReadMode readMode,
typename Enable = void>
struct __hip_tex2dgather_ret
{
static_assert(std::is_same<Enable, void>::value, "Invalid channel type!");
};
template <
typename T,
hipTextureReadMode readMode>
using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode, bool>::type;
template <typename T>
struct __hip_tex2dgather_ret<
T,
hipReadModeElementType,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type>
{
using type = HIP_vector_type<T, 4>;
};
template<
typename T,
unsigned int rank>
struct __hip_tex2dgather_ret<
HIP_vector_type<T, rank>,
hipReadModeElementType,
typename std::enable_if<__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type>
{
using type = HIP_vector_type<T, 4>;
};
template <typename T>
struct __hip_tex2dgather_ret<
T,
hipReadModeNormalizedFloat,
typename std::enable_if<__hip_is_tex_normalized_channel_type<T>::value, bool>::type>
{
using type = float4;
};
template <typename T, hipTextureReadMode readMode>
static __forceinline__ __device__ __hip_img_chk__ __hip_tex2dgather_ret_t<T, readMode> tex2Dgather(texture<T, hipTextureType2D, readMode> t, float x, float y, int comp=0)
{
TEXTURE_PARAMETERS_INIT;
switch (comp) {
case 1: {
auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
}
case 2: {
auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
}
case 3: {
auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
}
default: {
auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
return __hipMapFrom<__hip_tex2dgather_ret_t<T, readMode>>(tmp);
}
}
return {};
}
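/*
 * Usage note for tex2Dgather above: 'comp' selects which channel of the 2x2
 * texel footprint is returned (0 = r, 1 = g, 2 = b, 3 = a), matching the
 * __ockl_image_gather4{r,g,b,a}_2D calls in the switch. Illustrative sketch
 * ('rgbaTex' is a hypothetical texture reference):
 *
 *   texture<uchar4, hipTextureType2D, hipReadModeNormalizedFloat> rgbaTex;
 *   float4 reds = tex2Dgather(rgbaTex, x, y, 0);  // red channel of the footprint
 */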
#endif
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if defined(__cplusplus)
#if !defined(__HIPCC_RTC__)
#include <hip/hip_vector_types.h>
#include <hip/hip_texture_types.h>
#include <hip/amd_detail/texture_fetch_functions.h>
#include <hip/amd_detail/ockl_image.h>
#include <type_traits>
#endif // !defined(__HIPCC_RTC__)
#define TEXTURE_OBJECT_PARAMETERS_INIT \
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)textureObject; \
unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD;
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_load_1Db(i, x);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1Dfetch(T *ptr, hipTextureObject_t textureObject, int x)
{
*ptr = tex1Dfetch<T>(textureObject, x);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_1D(i, s, x);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1D(T *ptr, hipTextureObject_t textureObject, float x)
{
*ptr = tex1D<T>(textureObject, x);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_2D(i, s, float2(x, y).data);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2D(T *ptr, hipTextureObject_t textureObject, float x, float y)
{
*ptr = tex2D<T>(textureObject, x, y);
}
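/*
 * Illustrative sketch (not part of this header): sampling through a texture
 * object created on the host (see the hipTextureDesc sketch earlier). Both
 * overload styles above are shown.
 *
 *   __global__ void sampleKernel(float* out, hipTextureObject_t texObj, int w, int h) {
 *       int ix = blockIdx.x * blockDim.x + threadIdx.x;
 *       int iy = blockIdx.y * blockDim.y + threadIdx.y;
 *       if (ix < w && iy < h) {
 *           float u = (ix + 0.5f) / w, v = (iy + 0.5f) / h;  // normalized coords
 *           out[iy * w + ix] = tex2D<float>(texObj, u, v);
 *           // equivalently: tex2D(&out[iy * w + ix], texObj, u, v);
 *       }
 *   }
 */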
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y, float z)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_3D(i, s, float4(x, y, z, 0.0f).data);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex3D(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
{
*ptr = tex3D<T>(textureObject, x, y, z);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x, int layer)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_1Da(i, s, float2(x, layer).data);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLayered(T *ptr, hipTextureObject_t textureObject, float x, int layer)
{
*ptr = tex1DLayered<T>(textureObject, x, layer);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y, int layer)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_2Da(i, s, float4(x, y, layer, 0.0f).data);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer)
{
    *ptr = tex2DLayered<T>(textureObject, x, y, layer);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y, float z)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_CM(i, s, float4(x, y, z, 0.0f).data);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemap(T *ptr, hipTextureObject_t textureObject, float x, float y, float z)
{
*ptr = texCubemap<T>(textureObject, x, y, z);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x, float y, float z, int layer)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_CMa(i, s, float4(x, y, z, layer).data);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayered(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer)
{
*ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y, int comp = 0)
{
TEXTURE_OBJECT_PARAMETERS_INIT
    // comp selects which channel to gather (CUDA semantics):
    // 0 = red, 1 = green, 2 = blue, 3 = alpha.
    switch (comp) {
    case 0: {
        auto tmp = __ockl_image_gather4r_2D(i, s, float2(x, y).data);
        return __hipMapFrom<T>(tmp);
    }
    case 1: {
        auto tmp = __ockl_image_gather4g_2D(i, s, float2(x, y).data);
        return __hipMapFrom<T>(tmp);
    }
    case 2: {
        auto tmp = __ockl_image_gather4b_2D(i, s, float2(x, y).data);
        return __hipMapFrom<T>(tmp);
    }
    default: {
        auto tmp = __ockl_image_gather4a_2D(i, s, float2(x, y).data);
        return __hipMapFrom<T>(tmp);
    }
    }
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2Dgather(T *ptr, hipTextureObject_t textureObject, float x, float y, int comp = 0)
{
    *ptr = tex2Dgather<T>(textureObject, x, y, comp);
}
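// Illustrative usage (editor's sketch, not part of the original header):
// tex2Dgather returns one channel from each texel of the 2x2 footprint
// around (x, y) rather than a filtered value. Hypothetical example:
#if 0
__global__ void gatherRed(hipTextureObject_t tex, float4* out, float x, float y) {
    // comp = 0 requests the red channel of all four footprint texels.
    *out = tex2Dgather<float4>(tex, x, y, 0);
}
#endif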
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x, float level)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_lod_1D(i, s, x, level);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLod(T *ptr, hipTextureObject_t textureObject, float x, float level)
{
*ptr = tex1DLod<T>(textureObject, x, level);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y, float level)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_lod_2D(i, s, float2(x, y).data, level);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float level)
{
*ptr = tex2DLod<T>(textureObject, x, y, level);
}
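// Illustrative usage (editor's sketch, not part of the original header): the
// *Lod variants sample a mipmapped texture at an explicit level of detail
// instead of letting the hardware derive one. Hypothetical example:
#if 0
__global__ void sampleMip(hipTextureObject_t mipTex, float4* out, float lod) {
    // level = 0 is the base image; fractional levels blend adjacent mips
    // when the mipmap filter mode is linear.
    *out = tex2DLod<float4>(mipTex, 0.5f, 0.5f, lod);
}
#endif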
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_lod_3D(i, s, float4(x, y, z, 0.0f).data, level);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex3DLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
{
*ptr = tex3DLod<T>(textureObject, x, y, z, level);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x, int layer, float level)
{
TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_lod_1Da(i, s, float2(x, layer).data, level);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, int layer, float level)
{
*ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x, float y, int layer, float level)
{
TEXTURE_OBJECT_PARAMETERS_INIT
    auto tmp = __ockl_image_sample_lod_2Da(i, s, float4(x, y, layer, 0.0f).data, level);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float level)
{
*ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x, float y, float z, float level)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_lod_CM(i, s, float4(x, y, z, 0.0f).data, level);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float level)
{
*ptr = texCubemapLod<T>(textureObject, x, y, z, level);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
TEXTURE_OBJECT_PARAMETERS_INIT
// TODO missing in device libs.
// auto tmp = __ockl_image_sample_grad_CM(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
// return __hipMapFrom<T>(tmp);
return {};
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
*ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_lod_CMa(i, s, float4(x, y, z, layer).data, level);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayeredLod(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float level)
{
*ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_grad_1D(i, s, x, dPdx, dPdy);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DGrad(T *ptr, hipTextureObject_t textureObject, float x, float dPdx, float dPdy)
{
*ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_grad_2D(i, s, float2(x, y).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float2 dPdx, float2 dPdy)
{
*ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
}
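// Illustrative usage (editor's sketch, not part of the original header): the
// *Grad variants pass explicit screen-space derivatives, from which the
// implementation derives the level of detail. Hypothetical example:
#if 0
__global__ void sampleGrad(hipTextureObject_t tex, float4* out) {
    // Derivatives of one texel per pixel for a 256x256 texture select
    // roughly the base mip level.
    float2 dPdx = make_float2(1.0f / 256.0f, 0.0f);
    float2 dPdy = make_float2(0.0f, 1.0f / 256.0f);
    *out = tex2DGrad<float4>(tex, 0.5f, 0.5f, dPdx, dPdy);
}
#endif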
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_grad_3D(i, s, float4(x, y, z, 0.0f).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex3DGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, float4 dPdx, float4 dPdy)
{
*ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_grad_1Da(i, s, float2(x, layer).data, dPdx, dPdy);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex1DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, int layer, float dPdx, float dPdy)
{
*ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
{
TEXTURE_OBJECT_PARAMETERS_INIT
auto tmp = __ockl_image_sample_grad_2Da(i, s, float4(x, y, layer, 0.0f).data, float2(dPdx.x, dPdx.y).data, float2(dPdy.x, dPdy.y).data);
return __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void tex2DLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
{
*ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
{
TEXTURE_OBJECT_PARAMETERS_INIT
// TODO missing in device libs.
// auto tmp = __ockl_image_sample_grad_CMa(i, s, float4(x, y, z, layer).data, float4(dPdx.x, dPdx.y, dPdx.z, 0.0f).data, float4(dPdy.x, dPdy.y, dPdy.z, 0.0f).data);
// return __hipMapFrom<T>(tmp);
return {};
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T *ptr, hipTextureObject_t textureObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
{
*ptr = texCubemapLayeredGrad<T>(textureObject, x, y, z, layer, dPdx, dPdy);
}
#endif
/*
Copyright (c) 2018 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_SURFACE_FUNCTIONS_H
#if defined(__cplusplus)
#if !defined(__HIPCC_RTC__)
#include <hip/surface_types.h>
#include <hip/hip_vector_types.h>
#include <hip/amd_detail/texture_fetch_functions.h>
#include <hip/amd_detail/ockl_image.h>
#endif
#if defined(__HIPCC_RTC__)
#define __HOST_DEVICE__ __device__
#else
#define __HOST_DEVICE__ __host__ __device__
#endif
#define __HIP_SURFACE_OBJECT_PARAMETERS_INIT \
unsigned int ADDRESS_SPACE_CONSTANT* i = (unsigned int ADDRESS_SPACE_CONSTANT*)surfObj;
// CUDA uses byte addresses; they need to be mapped to pixel addresses for HIP.
static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format, int order) {
/*
* use below format index to generate format LUT
typedef enum {
HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT8 = 0,
HSA_EXT_IMAGE_CHANNEL_TYPE_SNORM_INT16 = 1,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT8 = 2,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT16 = 3,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_INT24 = 4,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 = 5,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 = 6,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010 = 7,
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT8 = 8,
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT16 = 9,
HSA_EXT_IMAGE_CHANNEL_TYPE_SIGNED_INT32 = 10,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 = 11,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 = 12,
HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 = 13,
HSA_EXT_IMAGE_CHANNEL_TYPE_HALF_FLOAT = 14,
HSA_EXT_IMAGE_CHANNEL_TYPE_FLOAT = 15
} hsa_ext_image_channel_type_t;
*/
static const int FormatLUT[] = { 0, 1, 0, 1, 3, 1, 1, 1, 0, 1, 2, 0, 1, 2, 1, 2 };
x = FormatLUT[format] == 3 ? x / FormatLUT[format] : x >> FormatLUT[format];
/*
* use below order index to generate order LUT
typedef enum {
HSA_EXT_IMAGE_CHANNEL_ORDER_A = 0,
HSA_EXT_IMAGE_CHANNEL_ORDER_R = 1,
HSA_EXT_IMAGE_CHANNEL_ORDER_RX = 2,
HSA_EXT_IMAGE_CHANNEL_ORDER_RG = 3,
HSA_EXT_IMAGE_CHANNEL_ORDER_RGX = 4,
HSA_EXT_IMAGE_CHANNEL_ORDER_RA = 5,
HSA_EXT_IMAGE_CHANNEL_ORDER_RGB = 6,
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBX = 7,
HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA = 8,
HSA_EXT_IMAGE_CHANNEL_ORDER_BGRA = 9,
HSA_EXT_IMAGE_CHANNEL_ORDER_ARGB = 10,
HSA_EXT_IMAGE_CHANNEL_ORDER_ABGR = 11,
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB = 12,
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX = 13,
HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA = 14,
HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA = 15,
HSA_EXT_IMAGE_CHANNEL_ORDER_INTENSITY = 16,
HSA_EXT_IMAGE_CHANNEL_ORDER_LUMINANCE = 17,
HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH = 18,
HSA_EXT_IMAGE_CHANNEL_ORDER_DEPTH_STENCIL = 19
} hsa_ext_image_channel_order_t;
*/
static const int OrderLUT[] = { 0, 0, 1, 1, 3, 1, 3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 0, 0, 0, 0 };
    return OrderLUT[order] == 3 ? x / OrderLUT[order] : x >> OrderLUT[order];
}
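// Worked example (editor's note): for an RGBA8 surface, the format is
// HSA_EXT_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 (11, FormatLUT entry 0) and the
// order is HSA_EXT_IMAGE_CHANNEL_ORDER_RGBA (8, OrderLUT entry 2), so a byte
// offset x = 16 becomes (16 >> 0) >> 2 = 4, i.e. 16 bytes / (4 channels x
// 1 byte) = pixel 4. The division path (LUT value 3) handles 3-wide cases
// such as UNORM_INT24 and RGB, where the stride is not a power of two.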
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x,
int boundaryMode = hipBoundaryModeZero) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
auto tmp = __ockl_image_load_1D(i, x);
*data = __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
auto tmp = __hipMapTo<float4::Native_vec_>(data);
__ockl_image_store_1D(i, x, tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x, int y) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
auto tmp = __ockl_image_load_2D(i, int2(x, y).data);
*data = __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
auto tmp = __hipMapTo<float4::Native_vec_>(data);
__ockl_image_store_2D(i, int2(x, y).data, tmp);
}
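// Illustrative usage (editor's sketch, not part of the original header): a
// kernel that flips a 2D surface vertically in place would pair surf2Dread
// with surf2Dwrite. Note that x is a *byte* coordinate, as in CUDA; the
// functions above convert it to a pixel address. Hypothetical example:
#if 0
__global__ void flipRows(hipSurfaceObject_t surf, int widthBytes, int height) {
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int x = px * (int)sizeof(float4);              // byte coordinate
    int y = blockIdx.y * blockDim.y + threadIdx.y; // top-half rows only
    if (x >= widthBytes || y >= height / 2) return;
    float4 top, bottom;
    surf2Dread(&top, surf, x, y);
    surf2Dread(&bottom, surf, x, height - 1 - y);
    surf2Dwrite(bottom, surf, x, y);
    surf2Dwrite(top, surf, x, height - 1 - y);
}
#endif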
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y, int z) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
auto tmp = __ockl_image_load_3D(i, int4(x, y, z, 0).data);
*data = __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int z) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i));
auto tmp = __hipMapTo<float4::Native_vec_>(data);
__ockl_image_store_3D(i, int4(x, y, z, 0).data, tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
auto tmp = __ockl_image_load_lod_1D(i, x, layer);
*data = __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
auto tmp = __hipMapTo<float4::Native_vec_>(data);
__ockl_image_store_lod_1D(i, x, layer, tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
auto tmp = __ockl_image_load_lod_2D(i, int2(x, y).data, layer);
*data = __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
auto tmp = __hipMapTo<float4::Native_vec_>(data);
__ockl_image_store_lod_2D(i, int2(x, y).data, layer, tmp);
}
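// Editor's note: the layered surface variants above route the layer index
// through the level argument of the __ockl_image_*_lod_* builtins.
// Illustrative usage (hypothetical kernel, not part of the original header):
#if 0
__global__ void clearSlice(hipSurfaceObject_t surfArray, int widthBytes, int layer) {
    int x = (blockIdx.x * blockDim.x + threadIdx.x) * (int)sizeof(float4);
    if (x >= widthBytes) return;
    surf1DLayeredwrite(make_float4(0.f, 0.f, 0.f, 0.f), surfArray, x, layer);
}
#endif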
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
auto tmp = __ockl_image_load_CM(i, int2(x, y).data, face);
*data = __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int face) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
auto tmp = __hipMapTo<float4::Native_vec_>(data);
__ockl_image_store_CM(i, int2(x, y).data, face, tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj, int x, int y, int face,
int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
auto tmp = __ockl_image_load_lod_CM(i, int2(x, y).data, face, layer);
*data = __hipMapFrom<T>(tmp);
}
template <
typename T,
typename std::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T data, hipSurfaceObject_t surfObj, int x, int y, int face,
int layer) {
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i));
auto tmp = __hipMapTo<float4::Native_vec_>(data);
__ockl_image_store_lod_CM(i, int2(x, y).data, face, layer, tmp);
}
#endif
#endif
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
#if !defined(__HIPCC_RTC__)
#include "hip/amd_detail/amd_hip_vector_types.h"
#endif
#if defined(__HIPCC_RTC__)
#define __HOST_DEVICE__ __device__
#else
#define __HOST_DEVICE__ __host__ __device__
// TODO: Clang has a bug which allows device functions to call std functions
// when std functions are introduced into the default namespace by a using
// statement. math.h may be included once this bug is fixed.
#if __cplusplus
#include <cmath>
#else
#include "math.h"
#endif
#endif // !defined(__HIPCC_RTC__)
#if __cplusplus
#define COMPLEX_NEG_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator-(const type& op) { \
type ret; \
ret.x = -op.x; \
ret.y = -op.y; \
return ret; \
}
#define COMPLEX_EQ_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline bool operator==(const type& lhs, const type& rhs) { \
return lhs.x == rhs.x && lhs.y == rhs.y; \
}
#define COMPLEX_NE_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline bool operator!=(const type& lhs, const type& rhs) { \
return !(lhs == rhs); \
}
#define COMPLEX_ADD_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator+(const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x + rhs.x; \
ret.y = lhs.y + rhs.y; \
return ret; \
}
#define COMPLEX_SUB_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator-(const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x - rhs.x; \
ret.y = lhs.y - rhs.y; \
return ret; \
}
#define COMPLEX_MUL_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator*(const type& lhs, const type& rhs) { \
type ret; \
ret.x = lhs.x * rhs.x - lhs.y * rhs.y; \
ret.y = lhs.x * rhs.y + lhs.y * rhs.x; \
return ret; \
}
#define COMPLEX_DIV_OP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type operator/(const type& lhs, const type& rhs) { \
type ret; \
ret.x = (lhs.x * rhs.x + lhs.y * rhs.y); \
ret.y = (rhs.x * lhs.y - lhs.x * rhs.y); \
ret.x = ret.x / (rhs.x * rhs.x + rhs.y * rhs.y); \
ret.y = ret.y / (rhs.x * rhs.x + rhs.y * rhs.y); \
return ret; \
}
#define COMPLEX_ADD_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator+=(type& lhs, const type& rhs) { \
lhs.x += rhs.x; \
lhs.y += rhs.y; \
return lhs; \
}
#define COMPLEX_SUB_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator-=(type& lhs, const type& rhs) { \
lhs.x -= rhs.x; \
lhs.y -= rhs.y; \
return lhs; \
}
#define COMPLEX_MUL_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) { \
type temp{lhs}; \
lhs.x = rhs.x * temp.x - rhs.y * temp.y; \
lhs.y = rhs.y * temp.x + rhs.x * temp.y; \
return lhs; \
}
#define COMPLEX_DIV_PREOP_OVERLOAD(type) \
__HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) { \
type temp; \
temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y); \
lhs = temp; \
return lhs; \
}
#define COMPLEX_SCALAR_PRODUCT(type, type1) \
__HOST_DEVICE__ static inline type operator*(const type& lhs, type1 rhs) { \
type ret; \
ret.x = lhs.x * rhs; \
ret.y = lhs.y * rhs; \
return ret; \
}
#endif
typedef float2 hipFloatComplex;
__HOST_DEVICE__ static inline float hipCrealf(hipFloatComplex z) { return z.x; }
__HOST_DEVICE__ static inline float hipCimagf(hipFloatComplex z) { return z.y; }
__HOST_DEVICE__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
hipFloatComplex z;
z.x = a;
z.y = b;
return z;
}
__HOST_DEVICE__ static inline hipFloatComplex hipConjf(hipFloatComplex z) {
hipFloatComplex ret;
ret.x = z.x;
ret.y = -z.y;
return ret;
}
__HOST_DEVICE__ static inline float hipCsqabsf(hipFloatComplex z) {
return z.x * z.x + z.y * z.y;
}
__HOST_DEVICE__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
return make_hipFloatComplex(p.x + q.x, p.y + q.y);
}
__HOST_DEVICE__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
return make_hipFloatComplex(p.x - q.x, p.y - q.y);
}
__HOST_DEVICE__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
}
__HOST_DEVICE__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
float sqabs = hipCsqabsf(q);
hipFloatComplex ret;
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
return ret;
}
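// Worked example (editor's note): hipCdivf multiplies by the conjugate and
// divides by |q|^2. For p = 1 + 2i and q = 3 + 4i, sqabs = 25, so the result
// is ((1*3 + 2*4)/25, (2*3 - 1*4)/25) = (0.44, 0.08).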
__HOST_DEVICE__ static inline float hipCabsf(hipFloatComplex z) { return sqrtf(hipCsqabsf(z)); }
typedef double2 hipDoubleComplex;
__HOST_DEVICE__ static inline double hipCreal(hipDoubleComplex z) { return z.x; }
__HOST_DEVICE__ static inline double hipCimag(hipDoubleComplex z) { return z.y; }
__HOST_DEVICE__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
hipDoubleComplex z;
z.x = a;
z.y = b;
return z;
}
__HOST_DEVICE__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) {
hipDoubleComplex ret;
ret.x = z.x;
ret.y = -z.y;
return ret;
}
__HOST_DEVICE__ static inline double hipCsqabs(hipDoubleComplex z) {
return z.x * z.x + z.y * z.y;
}
__HOST_DEVICE__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
return make_hipDoubleComplex(p.x + q.x, p.y + q.y);
}
__HOST_DEVICE__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
return make_hipDoubleComplex(p.x - q.x, p.y - q.y);
}
__HOST_DEVICE__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y);
}
__HOST_DEVICE__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
double sqabs = hipCsqabs(q);
hipDoubleComplex ret;
ret.x = (p.x * q.x + p.y * q.y) / sqabs;
ret.y = (p.y * q.x - p.x * q.y) / sqabs;
return ret;
}
__HOST_DEVICE__ static inline double hipCabs(hipDoubleComplex z) { return sqrt(hipCsqabs(z)); }
#if __cplusplus
COMPLEX_NEG_OP_OVERLOAD(hipFloatComplex)
COMPLEX_EQ_OP_OVERLOAD(hipFloatComplex)
COMPLEX_NE_OP_OVERLOAD(hipFloatComplex)
COMPLEX_ADD_OP_OVERLOAD(hipFloatComplex)
COMPLEX_SUB_OP_OVERLOAD(hipFloatComplex)
COMPLEX_MUL_OP_OVERLOAD(hipFloatComplex)
COMPLEX_DIV_OP_OVERLOAD(hipFloatComplex)
COMPLEX_ADD_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_SUB_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_MUL_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_DIV_PREOP_OVERLOAD(hipFloatComplex)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned short)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed short)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned int)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed int)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, float)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, double)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, signed long long)
COMPLEX_SCALAR_PRODUCT(hipFloatComplex, unsigned long long)
COMPLEX_NEG_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_EQ_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_NE_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_ADD_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_SUB_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_MUL_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_DIV_OP_OVERLOAD(hipDoubleComplex)
COMPLEX_ADD_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_SUB_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_MUL_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_DIV_PREOP_OVERLOAD(hipDoubleComplex)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned short)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed short)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned int)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed int)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, float)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, double)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, signed long long)
COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long)
#endif
typedef hipFloatComplex hipComplex;
__HOST_DEVICE__ static inline hipComplex make_hipComplex(float x, float y) {
return make_hipFloatComplex(x, y);
}
__HOST_DEVICE__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
return make_hipFloatComplex((float)z.x, (float)z.y);
}
__HOST_DEVICE__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
return make_hipDoubleComplex((double)z.x, (double)z.y);
}
__HOST_DEVICE__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
float real = (p.x * q.x) + r.x;
float imag = (q.x * p.y) + r.y;
real = -(p.y * q.y) + real;
imag = (p.x * q.y) + imag;
return make_hipComplex(real, imag);
}
__HOST_DEVICE__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
hipDoubleComplex r) {
double real = (p.x * q.x) + r.x;
double imag = (q.x * p.y) + r.y;
real = -(p.y * q.y) + real;
imag = (p.x * q.y) + imag;
return make_hipDoubleComplex(real, imag);
}
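// Worked example (editor's note): hipCfma{f} computes p * q + r with the
// complex multiply expanded into separate fusable steps. For p = 1 + 2i,
// q = 3 + 4i, r = 5 + 6i: real = (1*3) + 5 - (2*4) = 0 and
// imag = (3*2) + 6 + (1*4) = 16, matching hipCadd(hipCmul(p, q), r) = 16i.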
#endif //HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COMPLEX_H
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef AMD_HIP_MATH_CONSTANTS_H
#define AMD_HIP_MATH_CONSTANTS_H
// single precision constants
#define HIP_INF_F __int_as_float(0x7f800000U)
#define HIP_NAN_F __int_as_float(0x7fffffffU)
#define HIP_MIN_DENORM_F __int_as_float(0x00000001U)
#define HIP_MAX_NORMAL_F __int_as_float(0x7f7fffffU)
#define HIP_NEG_ZERO_F __int_as_float(0x80000000U)
#define HIP_ZERO_F 0.0F
#define HIP_ONE_F 1.0F
#define HIP_SQRT_HALF_F 0.707106781F
#define HIP_SQRT_HALF_HI_F 0.707106781F
#define HIP_SQRT_HALF_LO_F 1.210161749e-08F
#define HIP_SQRT_TWO_F 1.414213562F
#define HIP_THIRD_F 0.333333333F
#define HIP_PIO4_F 0.785398163F
#define HIP_PIO2_F 1.570796327F
#define HIP_3PIO4_F 2.356194490F
#define HIP_2_OVER_PI_F 0.636619772F
#define HIP_SQRT_2_OVER_PI_F 0.797884561F
#define HIP_PI_F 3.141592654F
#define HIP_L2E_F 1.442695041F
#define HIP_L2T_F 3.321928094F
#define HIP_LG2_F 0.301029996F
#define HIP_LGE_F 0.434294482F
#define HIP_LN2_F 0.693147181F
#define HIP_LNT_F 2.302585093F
#define HIP_LNPI_F 1.144729886F
#define HIP_TWO_TO_M126_F 1.175494351e-38F
#define HIP_TWO_TO_126_F 8.507059173e37F
#define HIP_NORM_HUGE_F 3.402823466e38F
#define HIP_TWO_TO_23_F 8388608.0F
#define HIP_TWO_TO_24_F 16777216.0F
#define HIP_TWO_TO_31_F 2147483648.0F
#define HIP_TWO_TO_32_F 4294967296.0F
#define HIP_REMQUO_BITS_F 3U
#define HIP_REMQUO_MASK_F (~((~0U)<<HIP_REMQUO_BITS_F))
#define HIP_TRIG_PLOSS_F 105615.0F
// double precision constants
#define HIP_INF __longlong_as_double(0x7ff0000000000000ULL)
#define HIP_NAN __longlong_as_double(0xfff8000000000000ULL)
#define HIP_NEG_ZERO __longlong_as_double(0x8000000000000000ULL)
#define HIP_MIN_DENORM __longlong_as_double(0x0000000000000001ULL)
#define HIP_ZERO 0.0
#define HIP_ONE 1.0
#define HIP_SQRT_TWO 1.4142135623730951e+0
#define HIP_SQRT_HALF 7.0710678118654757e-1
#define HIP_SQRT_HALF_HI 7.0710678118654757e-1
#define HIP_SQRT_HALF_LO (-4.8336466567264567e-17)
#define HIP_THIRD 3.3333333333333333e-1
#define HIP_TWOTHIRD 6.6666666666666667e-1
#define HIP_PIO4 7.8539816339744828e-1
#define HIP_PIO4_HI 7.8539816339744828e-1
#define HIP_PIO4_LO 3.0616169978683830e-17
#define HIP_PIO2 1.5707963267948966e+0
#define HIP_PIO2_HI 1.5707963267948966e+0
#define HIP_PIO2_LO 6.1232339957367660e-17
#define HIP_3PIO4 2.3561944901923448e+0
#define HIP_2_OVER_PI 6.3661977236758138e-1
#define HIP_PI 3.1415926535897931e+0
#define HIP_PI_HI 3.1415926535897931e+0
#define HIP_PI_LO 1.2246467991473532e-16
#define HIP_SQRT_2PI 2.5066282746310007e+0
#define HIP_SQRT_2PI_HI 2.5066282746310007e+0
#define HIP_SQRT_2PI_LO (-1.8328579980459167e-16)
#define HIP_SQRT_PIO2 1.2533141373155003e+0
#define HIP_SQRT_PIO2_HI 1.2533141373155003e+0
#define HIP_SQRT_PIO2_LO (-9.1642899902295834e-17)
#define HIP_SQRT_2OPI 7.9788456080286536e-1
#define HIP_L2E 1.4426950408889634e+0
#define HIP_L2E_HI 1.4426950408889634e+0
#define HIP_L2E_LO 2.0355273740931033e-17
#define HIP_L2T 3.3219280948873622e+0
#define HIP_LG2 3.0102999566398120e-1
#define HIP_LG2_HI 3.0102999566398120e-1
#define HIP_LG2_LO (-2.8037281277851704e-18)
#define HIP_LGE 4.3429448190325182e-1
#define HIP_LGE_HI 4.3429448190325182e-1
#define HIP_LGE_LO 1.09831965021676510e-17
#define HIP_LN2 6.9314718055994529e-1
#define HIP_LN2_HI 6.9314718055994529e-1
#define HIP_LN2_LO 2.3190468138462996e-17
#define HIP_LNT 2.3025850929940459e+0
#define HIP_LNT_HI 2.3025850929940459e+0
#define HIP_LNT_LO (-2.1707562233822494e-16)
#define HIP_LNPI 1.1447298858494002e+0
#define HIP_LN2_X_1024 7.0978271289338397e+2
#define HIP_LN2_X_1025 7.1047586007394398e+2
#define HIP_LN2_X_1075 7.4513321910194122e+2
#define HIP_LG2_X_1024 3.0825471555991675e+2
#define HIP_LG2_X_1075 3.2360724533877976e+2
#define HIP_TWO_TO_23 8388608.0
#define HIP_TWO_TO_52 4503599627370496.0
#define HIP_TWO_TO_53 9007199254740992.0
#define HIP_TWO_TO_54 18014398509481984.0
#define HIP_TWO_TO_M54 5.5511151231257827e-17
#define HIP_TWO_TO_M1022 2.22507385850720140e-308
#define HIP_TRIG_PLOSS 2147483648.0
#define HIP_DBL2INT_CVT 6755399441055744.0
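// Editor's note: the *_HI / *_LO pairs split a constant into a double-double
// so that HI + LO carries roughly twice the native precision; argument
// reduction can subtract each part separately to limit cancellation error.
// Illustrative pattern (hypothetical, not part of the original header):
#if 0
__device__ double reduce_pio2(double x, int k) {
    // x - k * pi/2 evaluated in two steps with the split constant.
    return (x - k * HIP_PIO2_HI) - k * HIP_PIO2_LO;
}
#endif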
#endif
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if !defined(__HIPCC_RTC__)
#include "host_defines.h"
#include "amd_hip_vector_types.h" // For Native_vec_
#endif
#if defined(__cplusplus)
extern "C" {
#endif
// DOT FUNCTIONS
#if defined(__clang__) && defined(__HIP__)
__device__
__attribute__((const))
int __ockl_sdot2(
HIP_vector_base<short, 2>::Native_vec_,
HIP_vector_base<short, 2>::Native_vec_,
int, bool);
__device__
__attribute__((const))
unsigned int __ockl_udot2(
HIP_vector_base<unsigned short, 2>::Native_vec_,
HIP_vector_base<unsigned short, 2>::Native_vec_,
unsigned int, bool);
__device__
__attribute__((const))
int __ockl_sdot4(
HIP_vector_base<char, 4>::Native_vec_,
HIP_vector_base<char, 4>::Native_vec_,
int, bool);
__device__
__attribute__((const))
unsigned int __ockl_udot4(
HIP_vector_base<unsigned char, 4>::Native_vec_,
HIP_vector_base<unsigned char, 4>::Native_vec_,
unsigned int, bool);
__device__
__attribute__((const))
int __ockl_sdot8(int, int, int, bool);
__device__
__attribute__((const))
unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
#endif
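// Illustrative usage (editor's sketch, not part of the original header): the
// packed-dot builtins accumulate a 2-, 4-, or 8-wide integer dot product into
// a scalar accumulator; the final bool requests saturating accumulation.
// Hypothetical int8 dot-product helper:
#if 0
__device__ int dot4_acc(char4 a, char4 b, int acc) {
    // acc + a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, without clamping.
    return __ockl_sdot4(a.data, b.data, acc, false);
}
#endif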
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
// BEGIN FLOAT
__device__
__attribute__((const))
float __ocml_acos_f32(float);
__device__
__attribute__((pure))
float __ocml_acosh_f32(float);
__device__
__attribute__((const))
float __ocml_asin_f32(float);
__device__
__attribute__((pure))
float __ocml_asinh_f32(float);
__device__
__attribute__((const))
float __ocml_atan2_f32(float, float);
__device__
__attribute__((const))
float __ocml_atan_f32(float);
__device__
__attribute__((pure))
float __ocml_atanh_f32(float);
__device__
__attribute__((pure))
float __ocml_cbrt_f32(float);
__device__
__attribute__((const))
float __ocml_ceil_f32(float);
__device__
__attribute__((const))
__device__
float __ocml_copysign_f32(float, float);
__device__
float __ocml_cos_f32(float);
__device__
float __ocml_native_cos_f32(float);
__device__
__attribute__((pure))
__device__
float __ocml_cosh_f32(float);
__device__
float __ocml_cospi_f32(float);
__device__
float __ocml_i0_f32(float);
__device__
float __ocml_i1_f32(float);
__device__
__attribute__((pure))
float __ocml_erfc_f32(float);
__device__
__attribute__((pure))
float __ocml_erfcinv_f32(float);
__device__
__attribute__((pure))
float __ocml_erfcx_f32(float);
__device__
__attribute__((pure))
float __ocml_erf_f32(float);
__device__
__attribute__((pure))
float __ocml_erfinv_f32(float);
__device__
__attribute__((pure))
float __ocml_exp10_f32(float);
__device__
__attribute__((pure))
float __ocml_native_exp10_f32(float);
__device__
__attribute__((pure))
float __ocml_exp2_f32(float);
__device__
__attribute__((pure))
float __ocml_exp_f32(float);
__device__
__attribute__((pure))
float __ocml_native_exp_f32(float);
__device__
__attribute__((pure))
float __ocml_expm1_f32(float);
__device__
__attribute__((const))
float __ocml_fabs_f32(float);
__device__
__attribute__((const))
float __ocml_fdim_f32(float, float);
__device__
__attribute__((const))
float __ocml_floor_f32(float);
__device__
__attribute__((const))
float __ocml_fma_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_fmax_f32(float, float);
__device__
__attribute__((const))
float __ocml_fmin_f32(float, float);
__device__
__attribute__((const))
__device__
float __ocml_fmod_f32(float, float);
__device__
float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
__device__
__attribute__((const))
float __ocml_hypot_f32(float, float);
__device__
__attribute__((const))
int __ocml_ilogb_f32(float);
__device__
__attribute__((const))
int __ocml_isfinite_f32(float);
__device__
__attribute__((const))
int __ocml_isinf_f32(float);
__device__
__attribute__((const))
int __ocml_isnan_f32(float);
__device__
float __ocml_j0_f32(float);
__device__
float __ocml_j1_f32(float);
__device__
__attribute__((const))
float __ocml_ldexp_f32(float, int);
__device__
float __ocml_lgamma_f32(float);
__device__
__attribute__((pure))
float __ocml_log10_f32(float);
__device__
__attribute__((pure))
float __ocml_native_log10_f32(float);
__device__
__attribute__((pure))
float __ocml_log1p_f32(float);
__device__
__attribute__((pure))
float __ocml_log2_f32(float);
__device__
__attribute__((pure))
float __ocml_native_log2_f32(float);
__device__
__attribute__((const))
float __ocml_logb_f32(float);
__device__
__attribute__((pure))
float __ocml_log_f32(float);
__device__
__attribute__((pure))
float __ocml_native_log_f32(float);
__device__
float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
__device__
__attribute__((const))
float __ocml_nearbyint_f32(float);
__device__
__attribute__((const))
float __ocml_nextafter_f32(float, float);
__device__
__attribute__((const))
float __ocml_len3_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_len4_f32(float, float, float, float);
__device__
__attribute__((pure))
float __ocml_ncdf_f32(float);
__device__
__attribute__((pure))
float __ocml_ncdfinv_f32(float);
__device__
__attribute__((pure))
float __ocml_pow_f32(float, float);
__device__
__attribute__((pure))
float __ocml_pown_f32(float, int);
__device__
__attribute__((pure))
float __ocml_rcbrt_f32(float);
__device__
__attribute__((const))
float __ocml_remainder_f32(float, float);
__device__
float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
__device__
__attribute__((const))
float __ocml_rhypot_f32(float, float);
__device__
__attribute__((const))
float __ocml_rint_f32(float);
__device__
__attribute__((const))
float __ocml_rlen3_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_rlen4_f32(float, float, float, float);
__device__
__attribute__((const))
float __ocml_round_f32(float);
__device__
__attribute__((pure))
float __ocml_rsqrt_f32(float);
__device__
__attribute__((const))
float __ocml_scalb_f32(float, float);
__device__
__attribute__((const))
float __ocml_scalbn_f32(float, int);
__device__
__attribute__((const))
int __ocml_signbit_f32(float);
__device__
float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
__device__
float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
__device__
float __ocml_sin_f32(float);
__device__
float __ocml_native_sin_f32(float);
__device__
__attribute__((pure))
float __ocml_sinh_f32(float);
__device__
float __ocml_sinpi_f32(float);
__device__
__attribute__((const))
float __ocml_sqrt_f32(float);
__device__
__attribute__((const))
float __ocml_native_sqrt_f32(float);
__device__
float __ocml_tan_f32(float);
__device__
__attribute__((pure))
float __ocml_tanh_f32(float);
__device__
float __ocml_tgamma_f32(float);
__device__
__attribute__((const))
float __ocml_trunc_f32(float);
__device__
float __ocml_y0_f32(float);
__device__
float __ocml_y1_f32(float);
// BEGIN INTRINSICS
__device__
__attribute__((const))
float __ocml_add_rte_f32(float, float);
__device__
__attribute__((const))
float __ocml_add_rtn_f32(float, float);
__device__
__attribute__((const))
float __ocml_add_rtp_f32(float, float);
__device__
__attribute__((const))
float __ocml_add_rtz_f32(float, float);
__device__
__attribute__((const))
float __ocml_sub_rte_f32(float, float);
__device__
__attribute__((const))
float __ocml_sub_rtn_f32(float, float);
__device__
__attribute__((const))
float __ocml_sub_rtp_f32(float, float);
__device__
__attribute__((const))
float __ocml_sub_rtz_f32(float, float);
__device__
__attribute__((const))
float __ocml_mul_rte_f32(float, float);
__device__
__attribute__((const))
float __ocml_mul_rtn_f32(float, float);
__device__
__attribute__((const))
float __ocml_mul_rtp_f32(float, float);
__device__
__attribute__((const))
float __ocml_mul_rtz_f32(float, float);
__device__
__attribute__((const))
float __ocml_div_rte_f32(float, float);
__device__
__attribute__((const))
float __ocml_div_rtn_f32(float, float);
__device__
__attribute__((const))
float __ocml_div_rtp_f32(float, float);
__device__
__attribute__((const))
float __ocml_div_rtz_f32(float, float);
__device__
__attribute__((const))
float __ocml_sqrt_rte_f32(float);
__device__
__attribute__((const))
float __ocml_sqrt_rtn_f32(float);
__device__
__attribute__((const))
float __ocml_sqrt_rtp_f32(float);
__device__
__attribute__((const))
float __ocml_sqrt_rtz_f32(float);
__device__
__attribute__((const))
float __ocml_fma_rte_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_fma_rtn_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_fma_rtp_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_fma_rtz_f32(float, float, float);
// END INTRINSICS
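// Illustrative usage (editor's sketch, not part of the original header): the
// _rte/_rtn/_rtp/_rtz suffixes select round-to-nearest-even, toward negative
// infinity, toward positive infinity, and toward zero. Hypothetical interval
// arithmetic step:
#if 0
__device__ void interval_add(float alo, float ahi, float blo, float bhi,
                             float* rlo, float* rhi) {
    *rlo = __ocml_add_rtn_f32(alo, blo);  // round down for the lower bound
    *rhi = __ocml_add_rtp_f32(ahi, bhi);  // round up for the upper bound
}
#endif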
// END FLOAT
// BEGIN DOUBLE
__device__
__attribute__((const))
double __ocml_acos_f64(double);
__device__
__attribute__((pure))
double __ocml_acosh_f64(double);
__device__
__attribute__((const))
double __ocml_asin_f64(double);
__device__
__attribute__((pure))
double __ocml_asinh_f64(double);
__device__
__attribute__((const))
double __ocml_atan2_f64(double, double);
__device__
__attribute__((const))
double __ocml_atan_f64(double);
__device__
__attribute__((pure))
double __ocml_atanh_f64(double);
__device__
__attribute__((pure))
double __ocml_cbrt_f64(double);
__device__
__attribute__((const))
double __ocml_ceil_f64(double);
__device__
__attribute__((const))
double __ocml_copysign_f64(double, double);
__device__
double __ocml_cos_f64(double);
__device__
__attribute__((pure))
double __ocml_cosh_f64(double);
__device__
double __ocml_cospi_f64(double);
__device__
double __ocml_i0_f64(double);
__device__
double __ocml_i1_f64(double);
__device__
__attribute__((pure))
double __ocml_erfc_f64(double);
__device__
__attribute__((pure))
double __ocml_erfcinv_f64(double);
__device__
__attribute__((pure))
double __ocml_erfcx_f64(double);
__device__
__attribute__((pure))
double __ocml_erf_f64(double);
__device__
__attribute__((pure))
double __ocml_erfinv_f64(double);
__device__
__attribute__((pure))
double __ocml_exp10_f64(double);
__device__
__attribute__((pure))
double __ocml_exp2_f64(double);
__device__
__attribute__((pure))
double __ocml_exp_f64(double);
__device__
__attribute__((pure))
double __ocml_expm1_f64(double);
__device__
__attribute__((const))
double __ocml_fabs_f64(double);
__device__
__attribute__((const))
double __ocml_fdim_f64(double, double);
__device__
__attribute__((const))
double __ocml_floor_f64(double);
__device__
__attribute__((const))
double __ocml_fma_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fmax_f64(double, double);
__device__
__attribute__((const))
double __ocml_fmin_f64(double, double);
__device__
__attribute__((const))
double __ocml_fmod_f64(double, double);
__device__
double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
__device__
__attribute__((const))
double __ocml_hypot_f64(double, double);
__device__
__attribute__((const))
int __ocml_ilogb_f64(double);
__device__
__attribute__((const))
int __ocml_isfinite_f64(double);
__device__
__attribute__((const))
int __ocml_isinf_f64(double);
__device__
__attribute__((const))
int __ocml_isnan_f64(double);
__device__
double __ocml_j0_f64(double);
__device__
double __ocml_j1_f64(double);
__device__
__attribute__((const))
double __ocml_ldexp_f64(double, int);
__device__
double __ocml_lgamma_f64(double);
__device__
__attribute__((pure))
double __ocml_log10_f64(double);
__device__
__attribute__((pure))
double __ocml_log1p_f64(double);
__device__
__attribute__((pure))
double __ocml_log2_f64(double);
__device__
__attribute__((const))
double __ocml_logb_f64(double);
__device__
__attribute__((pure))
double __ocml_log_f64(double);
__device__
double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
__device__
__attribute__((const))
double __ocml_nearbyint_f64(double);
__device__
__attribute__((const))
double __ocml_nextafter_f64(double, double);
__device__
__attribute__((const))
double __ocml_len3_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_len4_f64(double, double, double, double);
__device__
__attribute__((pure))
double __ocml_ncdf_f64(double);
__device__
__attribute__((pure))
double __ocml_ncdfinv_f64(double);
__device__
__attribute__((pure))
double __ocml_pow_f64(double, double);
__device__
__attribute__((pure))
double __ocml_pown_f64(double, int);
__device__
__attribute__((pure))
double __ocml_rcbrt_f64(double);
__device__
__attribute__((const))
double __ocml_remainder_f64(double, double);
__device__
double __ocml_remquo_f64(
double, double, __attribute__((address_space(5))) int*);
__device__
__attribute__((const))
double __ocml_rhypot_f64(double, double);
__device__
__attribute__((const))
double __ocml_rint_f64(double);
__device__
__attribute__((const))
double __ocml_rlen3_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_rlen4_f64(double, double, double, double);
__device__
__attribute__((const))
double __ocml_round_f64(double);
__device__
__attribute__((pure))
double __ocml_rsqrt_f64(double);
__device__
__attribute__((const))
double __ocml_scalb_f64(double, double);
__device__
__attribute__((const))
double __ocml_scalbn_f64(double, int);
__device__
__attribute__((const))
int __ocml_signbit_f64(double);
__device__
double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
__device__
double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
__device__
double __ocml_sin_f64(double);
__device__
__attribute__((pure))
double __ocml_sinh_f64(double);
__device__
double __ocml_sinpi_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_f64(double);
__device__
double __ocml_tan_f64(double);
__device__
__attribute__((pure))
double __ocml_tanh_f64(double);
__device__
double __ocml_tgamma_f64(double);
__device__
__attribute__((const))
double __ocml_trunc_f64(double);
__device__
double __ocml_y0_f64(double);
__device__
double __ocml_y1_f64(double);
// BEGIN INTRINSICS
__device__
__attribute__((const))
double __ocml_add_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_add_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_add_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_add_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_sqrt_rte_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_rtn_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_rtp_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_rtz_f64(double);
__device__
__attribute__((const))
double __ocml_fma_rte_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fma_rtn_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fma_rtp_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fma_rtz_f64(double, double, double);
// END INTRINSICS
// END DOUBLE
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#if defined(__cplusplus)
} // extern "C"
#endif
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file amd_detail/device_library_decls.h
* @brief Contains declarations for types and functions in device library.
* Uses int64_t and uint64_t instead of long, long long, unsigned
* long and unsigned long long types for device library API
* declarations.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_LIBRARY_DECLS_H
#if !defined(__HIPCC_RTC__)
#include "hip/amd_detail/host_defines.h"
#endif
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
typedef unsigned long ulong;
typedef unsigned long long ullong;
extern "C" __device__ __attribute__((const)) bool __ockl_wfany_i32(int);
extern "C" __device__ __attribute__((const)) bool __ockl_wfall_i32(int);
extern "C" __device__ uint __ockl_activelane_u32(void);
extern "C" __device__ __attribute__((const)) uint __ockl_mul24_u32(uint, uint);
extern "C" __device__ __attribute__((const)) int __ockl_mul24_i32(int, int);
extern "C" __device__ __attribute__((const)) uint __ockl_mul_hi_u32(uint, uint);
extern "C" __device__ __attribute__((const)) int __ockl_mul_hi_i32(int, int);
extern "C" __device__ __attribute__((const)) uint __ockl_sadd_u32(uint, uint, uint);
extern "C" __device__ __attribute__((const)) uchar __ockl_clz_u8(uchar);
extern "C" __device__ __attribute__((const)) ushort __ockl_clz_u16(ushort);
extern "C" __device__ __attribute__((const)) uint __ockl_clz_u32(uint);
extern "C" __device__ __attribute__((const)) uint64_t __ockl_clz_u64(uint64_t);
extern "C" __device__ __attribute__((const)) float __ocml_floor_f32(float);
extern "C" __device__ __attribute__((const)) float __ocml_rint_f32(float);
extern "C" __device__ __attribute__((const)) float __ocml_ceil_f32(float);
extern "C" __device__ __attribute__((const)) float __ocml_trunc_f32(float);
extern "C" __device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
extern "C" __device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_f64(double);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_f64(double);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_f64(double);
extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
extern "C" __device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s32(int);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s32(int);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s32(int);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u32(uint32_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u32(uint32_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u32(uint32_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_s64(int64_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_s64(int64_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_s64(int64_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtn_f32_u64(uint64_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtp_f32_u64(uint64_t);
extern "C" __device__ __attribute__((const)) float __ocml_cvtrtz_f32_u64(uint64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_s64(int64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_s64(int64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_s64(int64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtn_f64_u64(uint64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtp_f64_u64(uint64_t);
extern "C" __device__ __attribute__((const)) double __ocml_cvtrtz_f64_u64(uint64_t);
extern "C" __device__ __attribute__((convergent)) void __ockl_gws_init(uint nwm1, uint rid);
extern "C" __device__ __attribute__((convergent)) void __ockl_gws_barrier(uint nwm1, uint rid);
extern "C" __device__ __attribute__((const)) uint32_t __ockl_lane_u32();
extern "C" __device__ __attribute__((const)) int __ockl_grid_is_valid(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_grid_sync(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_num_grids(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_grid_rank(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_add_i32(int a);
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_and_i32(int a);
extern "C" __device__ __attribute__((convergent)) int __ockl_wgred_or_i32(int a);
extern "C" __device__ uint64_t __ockl_fprintf_stderr_begin();
extern "C" __device__ uint64_t __ockl_fprintf_append_args(uint64_t msg_desc, uint32_t num_args,
uint64_t value0, uint64_t value1,
uint64_t value2, uint64_t value3,
uint64_t value4, uint64_t value5,
uint64_t value6, uint32_t is_last);
extern "C" __device__ uint64_t __ockl_fprintf_append_string_n(uint64_t msg_desc, const char* data,
uint64_t length, uint32_t is_last);
// Introduce local address space
#define __local __attribute__((address_space(3)))
#ifdef __HIP_DEVICE_COMPILE__
__device__ inline static __local void* __to_local(unsigned x) { return (__local void*)x; }
#endif //__HIP_DEVICE_COMPILE__
// Using hip.amdgcn.bc - sync threads
#define __CLK_LOCAL_MEM_FENCE 0x01
typedef unsigned __cl_mem_fence_flags;
#endif
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_common.h>
#include "host_defines.h"
#include "math_fwd.h"
#include <hip/hip_runtime_api.h>
#include <stddef.h>
#include <hip/hip_vector_types.h>
#include <hip/amd_detail/device_library_decls.h>
#endif // !defined(__HIPCC_RTC__)
#if defined(__clang__) && defined(__HIP__)
extern "C" __device__ int printf(const char *fmt, ...);
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {}
#endif // defined(__clang__) && defined(__HIP__)
extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
/*
Integer Intrinsics
*/
// Integer intrinsic functions: __popc, __clz, __ffs, __brev
__device__ static inline unsigned int __popc(unsigned int input) {
return __builtin_popcount(input);
}
__device__ static inline unsigned int __popcll(unsigned long long int input) {
return __builtin_popcountll(input);
}
__device__ static inline int __clz(int input) {
return __ockl_clz_u32((uint)input);
}
__device__ static inline int __clzll(long long int input) {
return __ockl_clz_u64((uint64_t)input);
}
__device__ static inline unsigned int __ffs(unsigned int input) {
return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}
__device__ static inline unsigned int __ffsll(unsigned long long int input) {
return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
__device__ static inline unsigned int __ffs(int input) {
return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}
__device__ static inline unsigned int __ffsll(long long int input) {
return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
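// Illustrative sketch (hypothetical helper, not part of the HIP API): the
// __ffs family is 1-based, returning 0 when no bit is set and 1 for bit 0.
__device__ static inline void __hip_example_ffs_convention() {
    unsigned int a = __ffs(0u);          // 0: no bit set
    unsigned int b = __ffs(1u);          // 1: bit 0, counted from 1
    unsigned int c = __ffsll(0x100ull);  // 9: bit 8, counted from 1
    (void)a; (void)b; (void)c;
}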
// Given a 32/64-bit exec mask and an integer base bit position (between 0 and
// WAVEFRONT_SIZE), find the n-th (given by offset) set bit in the exec mask,
// counting from the base bit, and return that bit's position.
// If no such bit is found, return -1.
__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
uint64_t temp_mask = mask;
int32_t temp_offset = offset;
if (offset == 0) {
    temp_mask &= (1ULL << base); // 1ULL: base may be up to 63
temp_offset = 1;
}
else if (offset < 0) {
temp_mask = __builtin_bitreverse64(mask);
base = 63 - base;
temp_offset = -offset;
}
temp_mask = temp_mask & ((~0ULL) << base);
if (__builtin_popcountll(temp_mask) < temp_offset)
return -1;
int32_t total = 0;
for (int i = 0x20; i > 0; i >>= 1) {
uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
int32_t pcnt = __builtin_popcountll(temp_mask_lo);
if (pcnt < temp_offset) {
temp_mask = temp_mask >> i;
temp_offset -= pcnt;
total += i;
}
else {
temp_mask = temp_mask_lo;
}
}
if (offset < 0)
return 63 - total;
else
return total;
}
__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
uint64_t temp_mask = mask;
int32_t temp_offset = offset;
if (offset == 0) {
    temp_mask &= (1ULL << base); // 1ULL avoids undefined behavior for base >= 31
temp_offset = 1;
}
else if (offset < 0) {
temp_mask = __builtin_bitreverse64(mask);
base = 63 - base;
temp_offset = -offset;
}
temp_mask = temp_mask & ((~0ULL) << base);
if (__builtin_popcountll(temp_mask) < temp_offset)
return -1;
int32_t total = 0;
for (int i = 0x20; i > 0; i >>= 1) {
uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
int32_t pcnt = __builtin_popcountll(temp_mask_lo);
if (pcnt < temp_offset) {
temp_mask = temp_mask >> i;
temp_offset -= pcnt;
total += i;
}
else {
temp_mask = temp_mask_lo;
}
}
if (offset < 0)
return 63 - total;
else
return total;
}
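// Illustrative sketch (hypothetical helper, not part of the HIP API): with
// offset 1, __fns64 returns the position of the first set bit at or above
// `from`; with offset 0 it returns `from` itself if that bit is set, else -1.
__device__ static inline int32_t __hip_example_next_set_bit(uint64_t mask, uint32_t from) {
    return __fns64(mask, from, 1);
}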
__device__ static inline unsigned int __brev(unsigned int input) {
return __builtin_bitreverse32(input);
}
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
return __builtin_bitreverse64(input);
}
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
return input == 0 ? -1 : __builtin_ctzl(input);
}
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
uint32_t offset = src1 & 31;
uint32_t width = src2 & 31;
return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
uint64_t offset = src1 & 63;
uint64_t width = src2 & 63;
return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
uint32_t offset = src2 & 31;
uint32_t width = src3 & 31;
uint32_t mask = (1 << width) - 1;
return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
uint64_t offset = src2 & 63;
uint64_t width = src3 & 63;
uint64_t mask = (1ULL << width) - 1;
return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
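// Illustrative sketch (hypothetical helper, not part of the HIP API):
// round-tripping a 4-bit field with __bitinsert_u32 and __bitextract_u32.
__device__ static inline unsigned int __hip_example_set_nibble(unsigned int word,
                                                               unsigned int nibble,
                                                               unsigned int index) {
    unsigned int updated = __bitinsert_u32(word, nibble, index * 4u, 4u);
    // Reading the field back yields (nibble & 0xF).
    unsigned int check = __bitextract_u32(updated, index * 4u, 4u);
    (void)check;
    return updated;
}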
__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
{
uint32_t mask_shift = shift & 31;
return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}
__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
{
uint32_t min_shift = shift >= 32 ? 32 : shift;
return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}
__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
{
return __builtin_amdgcn_alignbit(hi, lo, shift);
}
__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
{
return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}
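// Illustrative sketch (hypothetical helper, not part of the HIP API): a 32-bit
// rotate-left expressed as a funnel shift of a value concatenated with itself.
__device__ static inline unsigned int __hip_example_rotl32(unsigned int x, unsigned int r) {
    return __funnelshift_l(x, x, r);
}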
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y, unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
struct ucharHolder {
union {
unsigned char c[4];
unsigned int ui;
};
} __attribute__((aligned(4)));
struct uchar2Holder {
union {
unsigned int ui[2];
unsigned char c[8];
};
} __attribute__((aligned(8)));
__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
struct uchar2Holder cHoldVal;
struct ucharHolder cHoldKey;
cHoldKey.ui = s;
cHoldVal.ui[0] = x;
cHoldVal.ui[1] = y;
unsigned int result;
result = cHoldVal.c[cHoldKey.c[0] & 0x07];
result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
return result;
}
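// Illustrative sketch (hypothetical helper, not part of the HIP API): each
// selector nibble of s picks one source byte, so 0x0123 reverses the bytes of x.
__device__ static inline unsigned int __hip_example_bswap32(unsigned int x) {
    return __byte_perm(x, 0u, 0x0123);
}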
__device__ static inline unsigned int __hadd(int x, int y) {
    // Halving add, computed without overflowing the intermediate sum:
    // (x + y) >> 1 == (x >> 1) + (y >> 1) + (x & y & 1).
    return (x >> 1) + (y >> 1) + (x & y & 1);
}
__device__ static inline int __mul24(int x, int y) {
return __ockl_mul24_i32(x, y);
}
__device__ static inline long long __mul64hi(long long int x, long long int y) {
ulong x0 = (ulong)x & 0xffffffffUL;
long x1 = x >> 32;
ulong y0 = (ulong)y & 0xffffffffUL;
long y1 = y >> 32;
ulong z0 = x0*y0;
long t = x1*y0 + (z0 >> 32);
long z1 = t & 0xffffffffL;
long z2 = t >> 32;
z1 = x0*y1 + z1;
return x1*y1 + z2 + (z1 >> 32);
}
__device__ static inline int __mulhi(int x, int y) {
return __ockl_mul_hi_i32(x, y);
}
__device__ static inline int __rhadd(int x, int y) {
    // Rounded halving add, computed without overflowing the intermediate sum:
    // (x + y + 1) >> 1 == (x >> 1) + (y >> 1) + ((x | y) & 1).
    return (x >> 1) + (y >> 1) + ((x | y) & 1);
}
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
return x > y ? x - y + z : y - x + z;
}
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
return (x + y) >> 1;
}
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
return __ockl_mul24_u32(x, y);
}
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
ulong x0 = x & 0xffffffffUL;
ulong x1 = x >> 32;
ulong y0 = y & 0xffffffffUL;
ulong y1 = y >> 32;
ulong z0 = x0*y0;
ulong t = x1*y0 + (z0 >> 32);
ulong z1 = t & 0xffffffffUL;
ulong z2 = t >> 32;
z1 = x0*y1 + z1;
return x1*y1 + z2 + (z1 >> 32);
}
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
return __ockl_mul_hi_u32(x, y);
}
__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
return (x + y + 1) >> 1;
}
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
return __ockl_sadd_u32(x, y, z);
}
__device__ static inline unsigned int __lane_id() {
return __builtin_amdgcn_mbcnt_hi(
-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}
__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) { return __builtin_amdgcn_mbcnt_lo(x, y); }
__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) { return __builtin_amdgcn_mbcnt_hi(x, y); }
/*
HIP specific device functions
*/
#if !defined(__HIPCC_RTC__)
#include "amd_warp_functions.h"
#endif
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
char4 out;
unsigned one1 = in1.w & MASK1;
unsigned one2 = in2.w & MASK1;
out.w = (one1 + one2) & MASK1;
one1 = in1.w & MASK2;
one2 = in2.w & MASK2;
out.w = out.w | ((one1 + one2) & MASK2);
return out;
}
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
char4 out;
unsigned one1 = in1.w & MASK1;
unsigned one2 = in2.w & MASK1;
out.w = (one1 - one2) & MASK1;
one1 = in1.w & MASK2;
one2 = in2.w & MASK2;
out.w = out.w | ((one1 - one2) & MASK2);
return out;
}
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
char4 out;
unsigned one1 = in1.w & MASK1;
unsigned one2 = in2.w & MASK1;
out.w = (one1 * one2) & MASK1;
one1 = in1.w & MASK2;
one2 = in2.w & MASK2;
out.w = out.w | ((one1 * one2) & MASK2);
return out;
}
__device__ static inline float __double2float_rd(double x) {
return __ocml_cvtrtn_f32_f64(x);
}
__device__ static inline float __double2float_rn(double x) { return x; }
__device__ static inline float __double2float_ru(double x) {
return __ocml_cvtrtp_f32_f64(x);
}
__device__ static inline float __double2float_rz(double x) {
return __ocml_cvtrtz_f32_f64(x);
}
__device__ static inline int __double2hiint(double x) {
static_assert(sizeof(double) == 2 * sizeof(int), "");
int tmp[2];
__builtin_memcpy(tmp, &x, sizeof(tmp));
return tmp[1];
}
__device__ static inline int __double2loint(double x) {
static_assert(sizeof(double) == 2 * sizeof(int), "");
int tmp[2];
__builtin_memcpy(tmp, &x, sizeof(tmp));
return tmp[0];
}
__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }
__device__ static inline long long int __double2ll_rd(double x) {
return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }
__device__ static inline unsigned int __double2uint_rd(double x) {
return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }
__device__ static inline unsigned long long int __double2ull_rd(double x) {
return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
return (unsigned long long int)x;
}
__device__ static inline long long int __double_as_longlong(double x) {
static_assert(sizeof(long long) == sizeof(double), "");
long long tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
/*
__device__ unsigned short __float2half_rn(float x);
__device__ float __half2float(unsigned short);
The above device functions are not valid here. Use
__device__ __half __float2half_rn(float x);
__device__ float __half2float(__half);
from hip_fp16.h instead.
CUDA implements half as unsigned short, whereas HIP does not.
*/
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }
__device__ static inline long long int __float2ll_rd(float x) {
return (long long int)__ocml_floor_f32(x);
}
__device__ static inline long long int __float2ll_rn(float x) {
return (long long int)__ocml_rint_f32(x);
}
__device__ static inline long long int __float2ll_ru(float x) {
return (long long int)__ocml_ceil_f32(x);
}
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }
__device__ static inline unsigned int __float2uint_rd(float x) {
return (unsigned int)__ocml_floor_f32(x);
}
__device__ static inline unsigned int __float2uint_rn(float x) {
return (unsigned int)__ocml_rint_f32(x);
}
__device__ static inline unsigned int __float2uint_ru(float x) {
return (unsigned int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }
__device__ static inline unsigned long long int __float2ull_rd(float x) {
return (unsigned long long int)__ocml_floor_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
return (unsigned long long int)__ocml_rint_f32(x);
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
return (unsigned long long int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
return (unsigned long long int)x;
}
__device__ static inline int __float_as_int(float x) {
static_assert(sizeof(int) == sizeof(float), "");
int tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ static inline unsigned int __float_as_uint(float x) {
static_assert(sizeof(unsigned int) == sizeof(float), "");
unsigned int tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ static inline double __hiloint2double(int hi, int lo) {
static_assert(sizeof(double) == sizeof(uint64_t), "");
uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
double tmp1;
__builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
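// Illustrative sketch (hypothetical helper, not part of the HIP API):
// __double2hiint/__double2loint and __hiloint2double are exact inverses, so
// this round trip returns x bit-for-bit.
__device__ static inline double __hip_example_double_roundtrip(double x) {
    return __hiloint2double(__double2hiint(x), __double2loint(x));
}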
__device__ static inline double __int2double_rn(int x) { return (double)x; }
__device__ static inline float __int2float_rd(int x) {
return __ocml_cvtrtn_f32_s32(x);
}
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) {
return __ocml_cvtrtp_f32_s32(x);
}
__device__ static inline float __int2float_rz(int x) {
return __ocml_cvtrtz_f32_s32(x);
}
__device__ static inline float __int_as_float(int x) {
static_assert(sizeof(float) == sizeof(int), "");
float tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ static inline double __ll2double_rd(long long int x) {
return __ocml_cvtrtn_f64_s64(x);
}
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) {
return __ocml_cvtrtp_f64_s64(x);
}
__device__ static inline double __ll2double_rz(long long int x) {
return __ocml_cvtrtz_f64_s64(x);
}
__device__ static inline float __ll2float_rd(long long int x) {
return __ocml_cvtrtn_f32_s64(x);
}
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) {
return __ocml_cvtrtp_f32_s64(x);
}
__device__ static inline float __ll2float_rz(long long int x) {
return __ocml_cvtrtz_f32_s64(x);
}
__device__ static inline double __longlong_as_double(long long int x) {
static_assert(sizeof(double) == sizeof(long long), "");
double tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }
__device__ static inline float __uint2float_rd(unsigned int x) {
return __ocml_cvtrtn_f32_u32(x);
}
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) {
return __ocml_cvtrtp_f32_u32(x);
}
__device__ static inline float __uint2float_rz(unsigned int x) {
return __ocml_cvtrtz_f32_u32(x);
}
__device__ static inline float __uint_as_float(unsigned int x) {
static_assert(sizeof(float) == sizeof(unsigned int), "");
float tmp;
__builtin_memcpy(&tmp, &x, sizeof(tmp));
return tmp;
}
__device__ static inline double __ull2double_rd(unsigned long long int x) {
return __ocml_cvtrtn_f64_u64(x);
}
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) {
return __ocml_cvtrtp_f64_u64(x);
}
__device__ static inline double __ull2double_rz(unsigned long long int x) {
return __ocml_cvtrtz_f64_u64(x);
}
__device__ static inline float __ull2float_rd(unsigned long long int x) {
return __ocml_cvtrtn_f32_u64(x);
}
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) {
return __ocml_cvtrtp_f32_u64(x);
}
__device__ static inline float __ull2float_rz(unsigned long long int x) {
return __ocml_cvtrtz_f32_u64(x);
}
#if defined(__clang__) && defined(__HIP__)
// Clock functions
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ long long int wall_clock64();
// hip.amdgcn.bc - named sync
__device__ void __named_sync();
#ifdef __HIP_DEVICE_COMPILE__
// Clock function returning the GPU core cycle count.
// The GPU can change its core clock frequency at runtime; the maximum frequency
// can be queried through the hipDeviceAttributeClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
// Exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3
return (long long int) __builtin_amdgcn_s_memtime();
#else
// Subject to change when a better solution is available
return (long long int) __builtin_readcyclecounter();
#endif
}
__device__
inline __attribute__((always_inline))
long long int __clock() { return __clock64(); }
// Clock function to return wall clock count at a constant frequency that can be queried
// through hipDeviceAttributeWallClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int wall_clock64() {
return (long long int) __ockl_steadyctr_u64();
}
__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }
__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }
// hip.amdgcn.bc - named sync
__device__
inline
void __named_sync() { __builtin_amdgcn_s_barrier(); }
#endif // __HIP_DEVICE_COMPILE__
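// Illustrative sketch (hypothetical helper, not part of the HIP API): timing a
// device-side region in core cycles; convert to seconds with the (variable)
// rate from hipDeviceAttributeClockRate, or use wall_clock64() for a fixed rate.
__device__ static inline long long int __hip_example_elapsed_cycles() {
    long long int start = __clock64();
    // ... region to be measured ...
    return __clock64() - start;
}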
// warp vote function __all __any __ballot
__device__
inline
int __all(int predicate) {
return __ockl_wfall_i32(predicate);
}
__device__
inline
int __any(int predicate) {
return __ockl_wfany_i32(predicate);
}
// XXX from llvm/include/llvm/IR/InstrTypes.h
#define ICMP_NE 33
__device__
inline
unsigned long long int __ballot(int predicate) {
return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
__device__
inline
unsigned long long int __ballot64(int predicate) {
return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
}
// hip.amdgcn.bc - lanemask
__device__
inline
uint64_t __lanemask_gt()
{
uint32_t lane = __ockl_lane_u32();
if (lane == 63)
return 0;
uint64_t ballot = __ballot64(1);
uint64_t mask = (~((uint64_t)0)) << (lane + 1);
return mask & ballot;
}
__device__
inline
uint64_t __lanemask_lt()
{
uint32_t lane = __ockl_lane_u32();
int64_t ballot = __ballot64(1);
uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
return mask & ballot;
}
__device__
inline
uint64_t __lanemask_eq()
{
uint32_t lane = __ockl_lane_u32();
int64_t mask = ((uint64_t)1 << lane);
return mask;
}
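// Illustrative sketch (hypothetical helper, not part of the HIP API): the rank
// of the current lane among the active lanes of the wave, via __lanemask_lt.
__device__ static inline unsigned int __hip_example_active_rank() {
    return __popcll(__lanemask_lt());
}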
__device__ inline void* __local_to_generic(void* p) { return p; }
#ifdef __HIP_DEVICE_COMPILE__
__device__
inline
void* __get_dynamicgroupbaseptr()
{
// Get group segment base pointer.
return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
return __get_dynamicgroupbaseptr();
}
// Memory Fence Functions
__device__
inline
static void __threadfence()
{
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}
__device__
inline
static void __threadfence_block()
{
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}
__device__
inline
static void __threadfence_system()
{
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
// abort
__device__
inline
__attribute__((weak))
void abort() {
return __builtin_trap();
}
// The noinline attribute helps encapsulate the printf expansion,
// which otherwise has a performance impact just by increasing the
// size of the calling function. Additionally, the weak attribute
// allows the function to exist as a global although its definition is
// included in every compilation unit.
#if defined(_WIN32) || defined(_WIN64)
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void _wassert(const wchar_t *_msg, const wchar_t *_file, unsigned _line) {
// FIXME: Need `wchar_t` support to generate assertion message.
__builtin_trap();
}
#else /* defined(_WIN32) || defined(_WIN64) */
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assert_fail(const char *assertion,
const char *file,
unsigned int line,
const char *function)
{
const char fmt[] = "%s:%u: %s: Device-side assertion `%s' failed.\n";
// strlen is not available as a built-in yet, so we create our own
// loop in a macro. With a string literal argument, the compiler
// usually manages to replace the loop with a constant.
//
// The macro does not check for null pointer, since all the string
// arguments are defined to be constant literals when called from
// the assert() macro.
//
// NOTE: The loop below includes the null terminator in the length
// as required by append_string_n().
#define __hip_get_string_length(LEN, STR) \
do { \
const char *tmp = STR; \
while (*tmp++); \
LEN = tmp - STR; \
} while (0)
auto msg = __ockl_fprintf_stderr_begin();
int len = 0;
__hip_get_string_length(len, fmt);
msg = __ockl_fprintf_append_string_n(msg, fmt, len, 0);
__hip_get_string_length(len, file);
msg = __ockl_fprintf_append_string_n(msg, file, len, 0);
msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0);
__hip_get_string_length(len, function);
msg = __ockl_fprintf_append_string_n(msg, function, len, 0);
__hip_get_string_length(len, assertion);
__ockl_fprintf_append_string_n(msg, assertion, len, /* is_last = */ 1);
#undef __hip_get_string_length
__builtin_trap();
}
extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
void __assertfail()
{
// ignore all the args for now.
__builtin_trap();
}
#endif /* defined(_WIN32) || defined(_WIN64) */
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
if (flags) {
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
__builtin_amdgcn_s_barrier();
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
} else {
__builtin_amdgcn_s_barrier();
}
}
__device__
inline
static void __barrier(int n)
{
__work_group_barrier((__cl_mem_fence_flags)n);
}
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
__barrier(__CLK_LOCAL_MEM_FENCE);
}
__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
return __ockl_wgred_add_i32(!!predicate);
}
__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
return __ockl_wgred_and_i32(!!predicate);
}
__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
return __ockl_wgred_or_i32(!!predicate);
}
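// Illustrative sketch (hypothetical helper, not part of the HIP API):
// __syncthreads_count doubles as a one-line block-wide vote tally; every
// thread in the block must reach the call, and all receive the same count.
__device__ static inline int __hip_example_block_count(int predicate) {
    return __syncthreads_count(predicate);
}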
// hip.amdgcn.bc - device routine
/*
HW_ID Register bit structure for RDNA2 & RDNA3
WAVE_ID 4:0 Wave id within the SIMD.
SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
WGP_ID 13:10 Physical WGP ID.
SA_ID 16 Shader Array ID
SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
DP_RATE 31:29 Number of double-precision float units per SIMD
HW_ID Register bit structure for GCN and CDNA
WAVE_ID 3:0 Wave buffer slot number. 0-9.
SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
PIPE_ID 7:6 Pipeline from which the wave was dispatched.
CU_ID 11:8 Compute Unit the wave is assigned to.
SH_ID 12 Shader Array (within an SE) the wave is assigned to.
SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940-942
14:13 Shader Engine the wave is assigned to for Vega.
TG_ID 19:16 Thread-group ID
VM_ID 23:20 Virtual Memory ID
QUEUE_ID 26:24 Queue from which this wave was dispatched.
STATE_ID 29:27 State ID (graphics only, not compute).
ME_ID 31:30 Micro-engine ID.
XCC_ID Register bit structure for gfx940
XCC_ID 3:0 XCC the wave is assigned to.
*/
#if (defined (__GFX10__) || defined (__GFX11__))
#define HW_ID 23
#else
#define HW_ID 4
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
#define HW_ID_WGP_ID_SIZE 4
#define HW_ID_WGP_ID_OFFSET 10
#else
#define HW_ID_CU_ID_SIZE 4
#define HW_ID_CU_ID_OFFSET 8
#endif
#if (defined(__gfx908__) || defined(__gfx90a__) || \
defined(__GFX11__))
#define HW_ID_SE_ID_SIZE 3
#else //4 SEs/XCC for gfx940-942
#define HW_ID_SE_ID_SIZE 2
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
#define HW_ID_SE_ID_OFFSET 18
#define HW_ID_SA_ID_OFFSET 16
#define HW_ID_SA_ID_SIZE 1
#else
#define HW_ID_SE_ID_OFFSET 13
#endif
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
#define XCC_ID 20
#define XCC_ID_XCC_ID_SIZE 4
#define XCC_ID_XCC_ID_OFFSET 0
#endif
#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
#define __HIP_NO_IMAGE_SUPPORT 1
#endif
/*
Encoding of parameter bitmask
HW_ID 5:0 HW_ID
OFFSET 10:6 Range: 0..31
SIZE 15:11 Range: 1..32
*/
#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
/*
__smid returns the wave's assigned Compute Unit and Shader Engine.
The Compute Unit, CU_ID, is returned in bits 3:0, and the Shader Engine,
SE_ID, in bits 5:4.
Note: the results vary over time.
The size is passed as SZ minus 1 to GETREG_IMMED since the SIZE field is 1-based.
*/
__device__
inline
unsigned __smid(void)
{
unsigned se_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
#if (defined(__GFX10__) || defined(__GFX11__))
unsigned wgp_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
unsigned sa_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
#else
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
unsigned xcc_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
#endif
unsigned cu_id = __builtin_amdgcn_s_getreg(
GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
#endif
#if (defined(__GFX10__) || defined(__GFX11__))
unsigned temp = se_id;
temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
return temp;
// TODO: CU mode implementation
#elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
unsigned temp = xcc_id;
temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
return temp;
#else
return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
#endif
}
/**
* Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
* To be removed in a future release.
*/
#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
#endif //defined(__clang__) && defined(__HIP__)
// Manually unrolled (4 bytes per iteration) device memcpy/memset helpers
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
auto dstPtr = static_cast<unsigned char*>(dst);
auto srcPtr = static_cast<const unsigned char*>(src);
while (size >= 4u) {
dstPtr[0] = srcPtr[0];
dstPtr[1] = srcPtr[1];
dstPtr[2] = srcPtr[2];
dstPtr[3] = srcPtr[3];
size -= 4u;
srcPtr += 4u;
dstPtr += 4u;
}
switch (size) {
case 3:
dstPtr[2] = srcPtr[2];
case 2:
dstPtr[1] = srcPtr[1];
case 1:
dstPtr[0] = srcPtr[0];
}
return dst;
}
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
auto dstPtr = static_cast<unsigned char*>(dst);
while (size >= 4u) {
dstPtr[0] = val;
dstPtr[1] = val;
dstPtr[2] = val;
dstPtr[3] = val;
size -= 4u;
dstPtr += 4u;
}
switch (size) {
case 3:
dstPtr[2] = val;
case 2:
dstPtr[1] = val;
case 1:
dstPtr[0] = val;
}
return dst;
}
#ifndef __OPENMP_AMDGCN__
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
return __hip_hc_memcpy(dst, src, size);
}
static inline __device__ void* memset(void* ptr, int val, size_t size) {
unsigned char val8 = static_cast<unsigned char>(val);
return __hip_hc_memset(ptr, val8, size);
}
#endif // !__OPENMP_AMDGCN__
#endif
/*
Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
__device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
return tmp.u;
}
__device__ static inline float __hip_ds_bpermutef(int index, float src) {
union { int i; unsigned u; float f; } tmp; tmp.f = src;
tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
return tmp.f;
}
__device__ static inline unsigned __hip_ds_permute(int index, unsigned src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
return tmp.u;
}
__device__ static inline float __hip_ds_permutef(int index, float src) {
union { int i; unsigned u; float f; } tmp; tmp.f = src;
tmp.i = __builtin_amdgcn_ds_permute(index, tmp.i);
return tmp.f;
}
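// Illustrative sketch (hypothetical helper, not part of the HIP API):
// broadcasting one lane's value across the wave with __hip_ds_bpermute, whose
// index argument is a byte address (lane id times 4).
__device__ static inline unsigned __hip_example_broadcast(unsigned var, int src_lane) {
    return __hip_ds_bpermute(src_lane << 2, var);
}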
#define __hip_ds_swizzle(src, pattern) __hip_ds_swizzle_N<(pattern)>((src))
#define __hip_ds_swizzlef(src, pattern) __hip_ds_swizzlef_N<(pattern)>((src))
template <int pattern>
__device__ static inline unsigned __hip_ds_swizzle_N(unsigned int src) {
union { int i; unsigned u; float f; } tmp; tmp.u = src;
tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
return tmp.u;
}
template <int pattern>
__device__ static inline float __hip_ds_swizzlef_N(float src) {
union { int i; unsigned u; float f; } tmp; tmp.f = src;
tmp.i = __builtin_amdgcn_ds_swizzle(tmp.i, pattern);
return tmp.f;
}
#define __hip_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl) \
__hip_move_dpp_N<(dpp_ctrl), (row_mask), (bank_mask), (bound_ctrl)>((src))
template <int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl>
__device__ static inline int __hip_move_dpp_N(int src) {
return __builtin_amdgcn_mov_dpp(src, dpp_ctrl, row_mask, bank_mask,
bound_ctrl);
}
static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE;
__device__
inline
int __shfl(int var, int src_lane, int width = warpSize) {
int self = __lane_id();
int index = (src_lane & (width - 1)) + (self & ~(width-1));
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl(unsigned int var, int src_lane, int width = warpSize) {
union { int i; unsigned u; float f; } tmp; tmp.u = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.u;
}
__device__
inline
float __shfl(float var, int src_lane, int width = warpSize) {
union { int i; unsigned u; float f; } tmp; tmp.f = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.f;
}
__device__
inline
double __shfl(double var, int src_lane, int width = warpSize) {
static_assert(sizeof(double) == 2 * sizeof(int), "");
static_assert(sizeof(double) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
long __shfl(long var, int src_lane, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl(static_cast<int>(var), src_lane, width));
#endif
}
__device__
inline
unsigned long __shfl(unsigned long var, int src_lane, int width = warpSize) {
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl(static_cast<unsigned int>(var), src_lane, width));
#endif
}
__device__
inline
long long __shfl(long long var, int src_lane, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
static_assert(sizeof(long long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
unsigned long long __shfl(unsigned long long var, int src_lane, int width = warpSize) {
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl(tmp[0], src_lane, width);
tmp[1] = __shfl(tmp[1], src_lane, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
int __shfl_up(int var, unsigned int lane_delta, int width = warpSize) {
int self = __lane_id();
int index = self - lane_delta;
index = (index < (self & ~(width-1)))?self:index;
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_up(unsigned int var, unsigned int lane_delta, int width = warpSize) {
union { int i; unsigned u; float f; } tmp; tmp.u = var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return tmp.u;
}
__device__
inline
float __shfl_up(float var, unsigned int lane_delta, int width = warpSize) {
union { int i; unsigned u; float f; } tmp; tmp.f = var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return tmp.f;
}
__device__
inline
double __shfl_up(double var, unsigned int lane_delta, int width = warpSize) {
static_assert(sizeof(double) == 2 * sizeof(int), "");
static_assert(sizeof(double) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
long __shfl_up(long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl_up(static_cast<int>(var), lane_delta, width));
#endif
}
__device__
inline
unsigned long __shfl_up(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl_up(static_cast<unsigned int>(var), lane_delta, width));
#endif
}
__device__
inline
long long __shfl_up(long long var, unsigned int lane_delta, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
static_assert(sizeof(long long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
unsigned long long __shfl_up(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_up(tmp[0], lane_delta, width);
tmp[1] = __shfl_up(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
int __shfl_down(int var, unsigned int lane_delta, int width = warpSize) {
int self = __lane_id();
int index = self + lane_delta;
index = (int)((self&(width-1))+lane_delta) >= width?self:index;
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_down(unsigned int var, unsigned int lane_delta, int width = warpSize) {
union { int i; unsigned u; float f; } tmp; tmp.u = var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return tmp.u;
}
__device__
inline
float __shfl_down(float var, unsigned int lane_delta, int width = warpSize) {
union { int i; unsigned u; float f; } tmp; tmp.f = var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return tmp.f;
}
__device__
inline
double __shfl_down(double var, unsigned int lane_delta, int width = warpSize) {
static_assert(sizeof(double) == 2 * sizeof(int), "");
static_assert(sizeof(double) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
long __shfl_down(long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl_down(static_cast<int>(var), lane_delta, width));
#endif
}
__device__
inline
unsigned long __shfl_down(unsigned long var, unsigned int lane_delta, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl_down(static_cast<unsigned int>(var), lane_delta, width));
#endif
}
__device__
inline
long long __shfl_down(long long var, unsigned int lane_delta, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
static_assert(sizeof(long long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
unsigned long long __shfl_down(unsigned long long var, unsigned int lane_delta, int width = warpSize)
{
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_down(tmp[0], lane_delta, width);
tmp[1] = __shfl_down(tmp[1], lane_delta, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
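// Illustrative sketch (hypothetical helper, not part of the HIP API): the
// classic wave-level sum reduction built on __shfl_down; after the loop,
// lane 0 holds the total for the wave.
__device__ static inline float __hip_example_wave_reduce_sum(float val) {
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        val += __shfl_down(val, offset, warpSize);
    return val;
}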
__device__
inline
int __shfl_xor(int var, int lane_mask, int width = warpSize) {
int self = __lane_id();
int index = self^lane_mask;
index = index >= ((self+width)&~(width-1))?self:index;
return __builtin_amdgcn_ds_bpermute(index<<2, var);
}
__device__
inline
unsigned int __shfl_xor(unsigned int var, int lane_mask, int width = warpSize) {
union { int i; unsigned u; float f; } tmp; tmp.u = var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return tmp.u;
}
__device__
inline
float __shfl_xor(float var, int lane_mask, int width = warpSize) {
union { int i; unsigned u; float f; } tmp; tmp.f = var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return tmp.f;
}
__device__
inline
double __shfl_xor(double var, int lane_mask, int width = warpSize) {
static_assert(sizeof(double) == 2 * sizeof(int), "");
static_assert(sizeof(double) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
double tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
long __shfl_xor(long var, int lane_mask, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(long) == 2 * sizeof(int), "");
static_assert(sizeof(long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(long) == sizeof(int), "");
return static_cast<long>(__shfl_xor(static_cast<int>(var), lane_mask, width));
#endif
}
__device__
inline
unsigned long __shfl_xor(unsigned long var, int lane_mask, int width = warpSize)
{
#ifndef _MSC_VER
static_assert(sizeof(unsigned long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
#else
static_assert(sizeof(unsigned long) == sizeof(unsigned int), "");
return static_cast<unsigned long>(__shfl_xor(static_cast<unsigned int>(var), lane_mask, width));
#endif
}
__device__
inline
long long __shfl_xor(long long var, int lane_mask, int width = warpSize)
{
static_assert(sizeof(long long) == 2 * sizeof(int), "");
static_assert(sizeof(long long) == sizeof(uint64_t), "");
int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
__device__
inline
unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
{
static_assert(sizeof(unsigned long long) == 2 * sizeof(unsigned int), "");
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
unsigned int tmp[2]; __builtin_memcpy(tmp, &var, sizeof(tmp));
tmp[0] = __shfl_xor(tmp[0], lane_mask, width);
tmp[1] = __shfl_xor(tmp[1], lane_mask, width);
uint64_t tmp0 = (static_cast<uint64_t>(tmp[1]) << 32ull) | static_cast<uint32_t>(tmp[0]);
unsigned long long tmp1; __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));
return tmp1;
}
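// Illustrative sketch (hypothetical helper, not part of the HIP API): the
// butterfly variant of the reduction, built on __shfl_xor; every lane ends up
// holding the wave-wide total.
__device__ static inline float __hip_example_wave_allreduce_sum(float val) {
    for (int mask = warpSize / 2; mask > 0; mask /= 2)
        val += __shfl_xor(val, mask, warpSize);
    return val;
}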
#endif
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file amd_detail/hip_cooperative_groups_helper.h
*
* @brief Device side implementation of cooperative group feature.
*
* Defines helper constructs and APIs which aid the types and device API
* wrappers defined within `amd_detail/hip_cooperative_groups.h`.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_runtime.h> // threadId, blockId
#include <hip/amd_detail/amd_device_functions.h>
#endif
#if !defined(__align__)
#define __align__(x) __attribute__((aligned(x)))
#endif
#if !defined(__CG_QUALIFIER__)
#define __CG_QUALIFIER__ __device__ __forceinline__
#endif
#if !defined(__CG_STATIC_QUALIFIER__)
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
#endif
#if !defined(_CG_STATIC_CONST_DECL_)
#define _CG_STATIC_CONST_DECL_ static constexpr
#endif
#if __AMDGCN_WAVEFRONT_SIZE == 32
using lane_mask = unsigned int;
#else
using lane_mask = unsigned long long int;
#endif
namespace cooperative_groups {
/* Global scope */
template <unsigned int size>
using is_power_of_2 = std::integral_constant<bool, (size & (size - 1)) == 0>;
template <unsigned int size>
using is_valid_wavefront = std::integral_constant<bool, (size <= __AMDGCN_WAVEFRONT_SIZE)>;
template <unsigned int size>
using is_valid_tile_size =
std::integral_constant<bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
template <typename T>
using is_valid_type =
std::integral_constant<bool, std::is_integral<T>::value || std::is_floating_point<T>::value>;
namespace internal {
/**
* @brief Enums representing different cooperative group types
* @note This enum is only applicable on Linux.
*
*/
typedef enum {
cg_invalid,
cg_multi_grid,
cg_grid,
cg_workgroup,
cg_tiled_group,
cg_coalesced_group
} group_type;
/**
* @ingroup CooperativeG
* @{
 * This section describes the cooperative groups functions of the HIP runtime
 * API.
 *
 * Cooperative groups provide flexible thread-parallel programming algorithms:
 * threads cooperate and share data to perform collective computations.
 *
 * @note The cooperative groups feature is implemented on Linux; it is under
 * development on Windows.
*
*/
/**
*
* @brief Functionalities related to multi-grid cooperative group type
* @note The following cooperative groups functions are only applicable on Linux.
*
*/
namespace multi_grid {
__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }
__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }
__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }
__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
} // namespace multi_grid
/**
* @brief Functionalities related to grid cooperative group type
* @note The following cooperative groups functions are only applicable on Linux.
*/
namespace grid {
__CG_STATIC_QUALIFIER__ uint32_t size() {
return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
(blockDim.x * gridDim.x));
}
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
  // Compute the global id of the workgroup to which the current thread belongs
uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
(blockIdx.y * gridDim.x) + (blockIdx.x));
  // Compute the total number of threads that precede the current workgroup
  // within the grid
uint32_t num_threads_till_current_workgroup =
static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
// Compute thread local rank within current workgroup
uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
(threadIdx.y * blockDim.x) + (threadIdx.x));
return (num_threads_till_current_workgroup + local_thread_rank);
}
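/*
 * Worked example (illustrative arithmetic only): for gridDim = (2, 2, 1),
 * blockDim = (64, 1, 1), blockIdx = (1, 1, 0) and threadIdx = (5, 0, 0),
 * blkIdx = 0*2*2 + 1*2 + 1 = 3, the preceding workgroups contribute
 * 3 * 64 = 192 threads, and the local rank is 5, so thread_rank() returns 197.
 */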
__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }
__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
} // namespace grid
/**
* @brief Functionalities related to `workgroup` (thread_block in CUDA terminology)
* cooperative group type
* @note The following cooperative groups functions are only applicable on Linux.
*/
namespace workgroup {
__CG_STATIC_QUALIFIER__ dim3 group_index() {
return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
static_cast<uint32_t>(blockIdx.z)));
}
__CG_STATIC_QUALIFIER__ dim3 thread_index() {
return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
static_cast<uint32_t>(threadIdx.z)));
}
__CG_STATIC_QUALIFIER__ uint32_t size() {
return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
}
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
(threadIdx.y * blockDim.x) + (threadIdx.x)));
}
__CG_STATIC_QUALIFIER__ bool is_valid() {
return true;
}
__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
__CG_STATIC_QUALIFIER__ dim3 block_dim() {
return (dim3(static_cast<uint32_t>(blockDim.x), static_cast<uint32_t>(blockDim.y),
static_cast<uint32_t>(blockDim.z)));
}
} // namespace workgroup
namespace tiled_group {
// enforce ordering for memory instructions
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
} // namespace tiled_group
namespace coalesced_group {
// enforce ordering for memory instructions
__CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); }
// Masked bit count
//
// For each thread, this function returns the number of active threads which
// have i-th bit of x set and come before the current thread.
__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
unsigned int counter=0;
#if __AMDGCN_WAVEFRONT_SIZE == 32
counter = __builtin_amdgcn_mbcnt_lo(x, add);
#else
counter = __builtin_amdgcn_mbcnt_lo(static_cast<lane_mask>(x), add);
counter = __builtin_amdgcn_mbcnt_hi(static_cast<lane_mask>(x >> 32), counter);
#endif
return counter;
}
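/*
 * Worked example (illustrative only): with x = 0b1011 and the calling thread
 * in lane 3, masked_bit_count(x) counts the set bits of x strictly below
 * lane 3 (lanes 0 and 1) and returns 2. A coalesced group applies this to
 * its member mask to derive the calling thread's rank within the group.
 */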
} // namespace coalesced_group
} // namespace internal
} // namespace cooperative_groups
/**
* @}
*/
#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file amd_detail/hip_cooperative_groups.h
*
 * @brief Device-side implementation of the `Cooperative Groups` feature.
 *
 * Defines new types and device API wrappers related to the `Cooperative Groups`
 * feature, which the programmer can use directly in kernels in order to
 * make use of this feature.
*/
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
#if __cplusplus
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/hip_cooperative_groups_helper.h>
#endif
#define __hip_abort() \
{ abort(); }
#if defined(NDEBUG)
#define __hip_assert(COND)
#else
#define __hip_assert(COND) \
{ \
    if (!(COND)) { \
__hip_abort(); \
} \
}
#endif
namespace cooperative_groups {
/** @brief The base type of all cooperative group types
*
 * \details Holds the key properties of a constructed cooperative group
 * object, such as the group type and its size.
 *
 * @note The cooperative groups feature is implemented on Linux and under development
 * on Windows.
*/
class thread_group {
protected:
uint32_t _type; // thread_group type
  uint32_t _size; // total number of threads in the thread_group
uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
// LSB represents lane 0, and MSB represents lane 63
  // Construct a thread group, and set the thread group type and other essential
  // thread group properties. This generic thread group is directly constructed
  // only when the group is supposed to contain only the calling thread
  // (through the API `this_thread()`); in all other cases, this thread
  // group object is a sub-object of some other derived thread group object
  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint32_t>(0),
uint64_t mask = static_cast<uint64_t>(0)) {
_type = type;
_size = size;
_mask = mask;
}
struct _tiled_info {
bool is_tiled;
unsigned int size;
unsigned int meta_group_rank;
unsigned int meta_group_size;
};
struct _coalesced_info {
lane_mask member_mask;
unsigned int size;
struct _tiled_info tiled_info;
} coalesced_info;
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
unsigned int tile_size);
friend class thread_block;
public:
  // Total number of threads in the thread group; this serves all derived
  // cooperative group types since their `size` is saved directly during
  // construction
__CG_QUALIFIER__ uint32_t size() const { return _size; }
__CG_QUALIFIER__ unsigned int cg_type() const { return _type; }
// Rank of the calling thread within [0, size())
__CG_QUALIFIER__ uint32_t thread_rank() const;
// Is this cooperative group type valid?
__CG_QUALIFIER__ bool is_valid() const;
// synchronize the threads in the thread group
__CG_QUALIFIER__ void sync() const;
};
/**
*-------------------------------------------------------------------------------------------------
*-------------------------------------------------------------------------------------------------
* @defgroup CooperativeG Cooperative Groups
* @ingroup API
* @{
 * This section describes the cooperative groups functions of the HIP runtime API.
 *
 * Cooperative groups provide flexible thread-parallel programming: threads
 * cooperate and share data to perform collective computations.
 *
 * @note The cooperative groups feature is implemented on Linux and under development
 * on Windows.
*
*/
/** \brief The multi-grid cooperative group type
*
 * \details Represents an inter-device cooperative group type where the
 * participating threads within the group span across multiple
 * devices, running the (same) kernel on these devices
 * @note The multi-grid cooperative group type is implemented on Linux and under development
 * on Windows.
*/
class multi_grid_group : public thread_group {
// Only these friend functions are allowed to construct an object of this class
// and access its resources
friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
protected:
  // Construct multi-grid thread group (through the API this_multi_grid())
explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
: thread_group(internal::cg_multi_grid, size) {}
public:
// Number of invocations participating in this multi-grid group. In other
// words, the number of GPUs
__CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
  // Rank of this invocation. In other words, an ID number within the range
  // [0, num_grids()) of the GPU this kernel is running on
__CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
__CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
__CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
};
/** @brief User exposed API interface to construct multi-grid cooperative
* group type object - `multi_grid_group`
*
 * \details Users are not allowed to construct an object of type
 * `multi_grid_group` directly. Instead, it should be constructed through this
 * API function
 * @note This multi-grid cooperative API type is implemented on Linux and under development
 * on Windows.
*/
__CG_QUALIFIER__ multi_grid_group this_multi_grid() {
return multi_grid_group(internal::multi_grid::size());
}
/** @brief The grid cooperative group type
*
 * \details Represents an inter-workgroup cooperative group type where the
 * participating threads within the group span across multiple
 * workgroups running the (same) kernel on the same device
 * @note This is implemented on Linux and under development
 * on Windows.
*/
class grid_group : public thread_group {
// Only these friend functions are allowed to construct an object of this class
// and access its resources
friend __CG_QUALIFIER__ grid_group this_grid();
protected:
// Construct grid thread group (through the API this_grid())
explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}
public:
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
__CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
__CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
};
/** @brief User exposed API interface to construct grid cooperative group type
* object - `grid_group`
*
 * \details Users are not allowed to construct an object of type
 * `grid_group` directly. Instead, it should be constructed through this
 * API function
 * @note This function is implemented on Linux and under development
 * on Windows.
*/
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
/** @brief The workgroup (thread-block in CUDA terminology) cooperative group
* type
*
* \details Represents an intra-workgroup cooperative group type where the
 * participating threads within the group are exactly the same threads
 * which participate in the currently executing `workgroup`
 * @note This is implemented on Linux and under development
 * on Windows.
*/
class thread_block : public thread_group {
  // Only these friend functions are allowed to construct an object of this
  // class and access its resources
friend __CG_QUALIFIER__ thread_block this_thread_block();
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
unsigned int tile_size);
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent,
unsigned int tile_size);
protected:
// Construct a workgroup thread group (through the API this_thread_block())
explicit __CG_QUALIFIER__ thread_block(uint32_t size)
: thread_group(internal::cg_workgroup, size) {}
__CG_QUALIFIER__ thread_group new_tiled_group(unsigned int tile_size) const {
const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
// Invalid tile size, assert
if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
__hip_assert(false && "invalid tile size")
}
thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
tiledGroup.coalesced_info.tiled_info.size = tile_size;
tiledGroup.coalesced_info.tiled_info.is_tiled = true;
tiledGroup.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
tiledGroup.coalesced_info.tiled_info.meta_group_size = (size() + tile_size - 1) / tile_size;
return tiledGroup;
}
public:
// 3-dimensional block index within the grid
__CG_STATIC_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
// 3-dimensional thread index within the block
__CG_STATIC_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return internal::workgroup::thread_rank(); }
__CG_STATIC_QUALIFIER__ uint32_t size() { return internal::workgroup::size(); }
__CG_STATIC_QUALIFIER__ bool is_valid() { return internal::workgroup::is_valid(); }
__CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
__CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
};
/** \brief User exposed API interface to construct workgroup cooperative
* group type object - `thread_block`.
*
 * \details Users are not allowed to construct an object of type
 * `thread_block` directly. Instead, it should be constructed through this API
 * function.
 * @note This function is implemented on Linux and under development
 * on Windows.
*/
__CG_QUALIFIER__ thread_block this_thread_block() {
return thread_block(internal::workgroup::size());
}
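/**
 * Example: a minimal usage sketch of `this_thread_block()` (illustrative
 * only, not part of this header). The kernel name `reverse_block`, the
 * buffer `data` and the fixed 256-thread, 1-D block size are assumptions.
 *
 * @code
 * __global__ void reverse_block(int* data) {
 *   cooperative_groups::thread_block tb = cooperative_groups::this_thread_block();
 *   __shared__ int tmp[256];
 *   unsigned int r = tb.thread_rank();
 *   tmp[r] = data[blockIdx.x * tb.size() + r];
 *   tb.sync();  // every thread in the workgroup has written its slot
 *   data[blockIdx.x * tb.size() + r] = tmp[tb.size() - 1 - r];
 * }
 * @endcode
 */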
/** \brief The tiled_group cooperative group type
*
* \details Represents one tiled thread group in a wavefront.
* This group type also supports sub-wave level intrinsics.
 * @note This is implemented on Linux and under development
 * on Windows.
*/
class tiled_group : public thread_group {
private:
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent,
unsigned int tile_size);
friend __CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent,
unsigned int tile_size);
__CG_QUALIFIER__ tiled_group new_tiled_group(unsigned int tile_size) const {
const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
__hip_assert(false && "invalid tile size")
}
if (size() <= tile_size) {
return *this;
}
tiled_group tiledGroup = tiled_group(tile_size);
tiledGroup.coalesced_info.tiled_info.is_tiled = true;
return tiledGroup;
}
protected:
explicit __CG_QUALIFIER__ tiled_group(unsigned int tileSize)
: thread_group(internal::cg_tiled_group, tileSize) {
coalesced_info.tiled_info.size = tileSize;
coalesced_info.tiled_info.is_tiled = true;
}
public:
__CG_QUALIFIER__ unsigned int size() const { return (coalesced_info.tiled_info.size); }
__CG_QUALIFIER__ unsigned int thread_rank() const {
return (internal::workgroup::thread_rank() & (coalesced_info.tiled_info.size - 1));
}
__CG_QUALIFIER__ void sync() const {
internal::tiled_group::sync();
}
};
/** \brief The coalesced_group cooperative group type
*
 * \details Represents an active thread group in a wavefront.
 * This group type also supports sub-wave level intrinsics.
 * @note This is implemented on Linux and under development
 * on Windows.
*/
class coalesced_group : public thread_group {
private:
friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
friend __CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size);
friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size);
__CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
if (!tile_size || (tile_size > size()) || !pow2) {
return coalesced_group(0);
}
    // If a tiled group is passed to be partitioned further into a coalesced_group,
    // prepare a mask for further partitioning it so that it stays coalesced.
if (coalesced_info.tiled_info.is_tiled) {
unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
member_mask <<= (__lane_id() & ~(tile_size - 1));
coalesced_group coalesced_tile = coalesced_group(member_mask);
coalesced_tile.coalesced_info.tiled_info.is_tiled = true;
coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
coalesced_tile.coalesced_info.tiled_info.meta_group_size = size() / tile_size;
return coalesced_tile;
}
// Here the parent coalesced_group is not partitioned.
else {
lane_mask member_mask = 0;
unsigned int tile_rank = 0;
int lanes_to_skip = ((thread_rank()) / tile_size) * tile_size;
for (unsigned int i = 0; i < __AMDGCN_WAVEFRONT_SIZE; i++) {
        lane_mask active = coalesced_info.member_mask & (static_cast<lane_mask>(1) << i);
// Make sure the lane is active
if (active) {
if (lanes_to_skip <= 0 && tile_rank < tile_size) {
// Prepare a member_mask that is appropriate for a tile
member_mask |= active;
tile_rank++;
}
lanes_to_skip--;
}
}
coalesced_group coalesced_tile = coalesced_group(member_mask);
coalesced_tile.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size;
coalesced_tile.coalesced_info.tiled_info.meta_group_size =
(size() + tile_size - 1) / tile_size;
return coalesced_tile;
}
return coalesced_group(0);
}
protected:
// Constructor
explicit __CG_QUALIFIER__ coalesced_group(lane_mask member_mask)
: thread_group(internal::cg_coalesced_group) {
coalesced_info.member_mask = member_mask; // Which threads are active
coalesced_info.size = __popcll(coalesced_info.member_mask); // How many threads are active
coalesced_info.tiled_info.is_tiled = false; // Not a partitioned group
coalesced_info.tiled_info.meta_group_rank = 0;
coalesced_info.tiled_info.meta_group_size = 1;
}
public:
__CG_QUALIFIER__ unsigned int size() const {
return coalesced_info.size;
}
__CG_QUALIFIER__ unsigned int thread_rank() const {
return internal::coalesced_group::masked_bit_count(coalesced_info.member_mask);
}
__CG_QUALIFIER__ void sync() const {
internal::coalesced_group::sync();
}
__CG_QUALIFIER__ unsigned int meta_group_rank() const {
return coalesced_info.tiled_info.meta_group_rank;
}
__CG_QUALIFIER__ unsigned int meta_group_size() const {
return coalesced_info.tiled_info.meta_group_size;
}
template <class T>
__CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
srcRank = srcRank % static_cast<int>(size());
int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
: (__AMDGCN_WAVEFRONT_SIZE == 64) ? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
}
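  /*
   * Worked example (illustrative only): in a coalesced group whose
   * member_mask is 0b1100 (hardware lanes 2 and 3 active), size() is 2 and
   * shfl(var, 1) reads `var` from the second active lane, i.e. lane 3.
   */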
template <class T>
__CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
    // Note: The CUDA implementation appears to use the remainder of lane_delta
// and WARP_SIZE as the shift value rather than lane_delta itself.
// This is not described in the documentation and is not done here.
if (size() == __AMDGCN_WAVEFRONT_SIZE) {
return __shfl_down(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
}
int lane;
if (__AMDGCN_WAVEFRONT_SIZE == 64) {
lane = __fns64(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
}
else {
lane = __fns32(coalesced_info.member_mask, __lane_id(), lane_delta + 1);
}
if (lane == -1) {
lane = __lane_id();
}
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
}
template <class T>
__CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
    // Note: The CUDA implementation appears to use the remainder of lane_delta
// and WARP_SIZE as the shift value rather than lane_delta itself.
// This is not described in the documentation and is not done here.
if (size() == __AMDGCN_WAVEFRONT_SIZE) {
return __shfl_up(var, lane_delta, __AMDGCN_WAVEFRONT_SIZE);
}
int lane;
if (__AMDGCN_WAVEFRONT_SIZE == 64) {
lane = __fns64(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
}
else if (__AMDGCN_WAVEFRONT_SIZE == 32) {
lane = __fns32(coalesced_info.member_mask, __lane_id(), -(lane_delta + 1));
}
if (lane == -1) {
lane = __lane_id();
}
return __shfl(var, lane, __AMDGCN_WAVEFRONT_SIZE);
}
};
/** \brief User exposed API to create coalesced groups.
*
* \details A collective operation that groups all active lanes into a new thread group.
 * @note This function is implemented on Linux and under development
 * on Windows.
*/
__CG_QUALIFIER__ coalesced_group coalesced_threads() {
return cooperative_groups::coalesced_group(__builtin_amdgcn_read_exec());
}
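/**
 * Example: a sketch of `coalesced_threads()` inside a divergent branch
 * (illustrative only, not part of this header). One lane per coalesced
 * group issues a single atomic on behalf of the whole group; the kernel
 * name and the parameters `in`, `counter` and `n` are assumptions.
 *
 * @code
 * __global__ void count_positive(const float* in, unsigned int* counter, int n) {
 *   int i = blockIdx.x * blockDim.x + threadIdx.x;
 *   if (i < n && in[i] > 0.0f) {
 *     cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();
 *     if (active.thread_rank() == 0) {
 *       atomicAdd(counter, active.size());  // one atomic per coalesced group
 *     }
 *   }
 * }
 * @endcode
 */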
/**
 * Implementation of all publicly exposed base class APIs
 * @note This function is implemented on Linux and under development
 * on Windows.
*/
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
switch (this->_type) {
case internal::cg_multi_grid: {
return (static_cast<const multi_grid_group*>(this)->thread_rank());
}
case internal::cg_grid: {
return (static_cast<const grid_group*>(this)->thread_rank());
}
case internal::cg_workgroup: {
return (static_cast<const thread_block*>(this)->thread_rank());
}
case internal::cg_tiled_group: {
return (static_cast<const tiled_group*>(this)->thread_rank());
}
case internal::cg_coalesced_group: {
return (static_cast<const coalesced_group*>(this)->thread_rank());
}
default: {
__hip_assert(false && "invalid cooperative group type")
return -1;
}
}
}
/**
 * Implementation of the publicly exposed thread group API
 * @note This function is implemented on Linux and under development
 * on Windows.
*/
__CG_QUALIFIER__ bool thread_group::is_valid() const {
switch (this->_type) {
case internal::cg_multi_grid: {
return (static_cast<const multi_grid_group*>(this)->is_valid());
}
case internal::cg_grid: {
return (static_cast<const grid_group*>(this)->is_valid());
}
case internal::cg_workgroup: {
return (static_cast<const thread_block*>(this)->is_valid());
}
case internal::cg_tiled_group: {
return (static_cast<const tiled_group*>(this)->is_valid());
}
case internal::cg_coalesced_group: {
return (static_cast<const coalesced_group*>(this)->is_valid());
}
default: {
__hip_assert(false && "invalid cooperative group type")
return false;
}
}
}
/**
 * Implementation of the publicly exposed thread group sync API
 * @note This function is implemented on Linux and under development
 * on Windows.
*/
__CG_QUALIFIER__ void thread_group::sync() const {
switch (this->_type) {
case internal::cg_multi_grid: {
static_cast<const multi_grid_group*>(this)->sync();
break;
}
case internal::cg_grid: {
static_cast<const grid_group*>(this)->sync();
break;
}
case internal::cg_workgroup: {
static_cast<const thread_block*>(this)->sync();
break;
}
case internal::cg_tiled_group: {
static_cast<const tiled_group*>(this)->sync();
break;
}
case internal::cg_coalesced_group: {
static_cast<const coalesced_group*>(this)->sync();
break;
}
default: {
__hip_assert(false && "invalid cooperative group type")
}
}
}
/**
 * Implementation of the publicly exposed `wrapper` API on top of basic cooperative
 * group type APIs
 * @note This function is implemented on Linux and under development
 * on Windows.
*/
template <class CGTy> __CG_QUALIFIER__ uint32_t group_size(CGTy const& g) { return g.size(); }
/**
 * Implementation of the publicly exposed `wrapper` API on top of basic cooperative
 * group type APIs
 * @note This function is implemented on Linux and under development
 * on Windows.
*/
template <class CGTy> __CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g) {
return g.thread_rank();
}
/**
 * Implementation of the publicly exposed `wrapper` API on top of basic cooperative
 * group type APIs
 * @note This function is implemented on Linux and under development
 * on Windows.
*/
template <class CGTy> __CG_QUALIFIER__ bool is_valid(CGTy const& g) { return g.is_valid(); }
/**
 * Implementation of the publicly exposed `wrapper` API on top of basic cooperative
 * group type APIs
 * @note This function is implemented on Linux and under development
 * on Windows.
*/
template <class CGTy> __CG_QUALIFIER__ void sync(CGTy const& g) { g.sync(); }
/**
* template class tile_base
 * @note This class is implemented on Linux and under development
 * on Windows.
*/
template <unsigned int tileSize> class tile_base {
protected:
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
public:
// Rank of the thread within this tile
  __CG_STATIC_QUALIFIER__ unsigned int thread_rank() {
return (internal::workgroup::thread_rank() & (numThreads - 1));
}
// Number of threads within this tile
__CG_STATIC_QUALIFIER__ unsigned int size() { return numThreads; }
};
/**
* template class thread_block_tile_base
 * @note This class is implemented on Linux and under development
 * on Windows.
*/
template <unsigned int size> class thread_block_tile_base : public tile_base<size> {
static_assert(is_valid_tile_size<size>::value,
"Tile size is either not a power of 2 or greater than the wavefront size");
using tile_base<size>::numThreads;
public:
__CG_STATIC_QUALIFIER__ void sync() {
internal::tiled_group::sync();
}
template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
return (__shfl(var, srcRank, numThreads));
}
template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
return (__shfl_down(var, lane_delta, numThreads));
}
template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
return (__shfl_up(var, lane_delta, numThreads));
}
template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
    static_assert(is_valid_type<T>::value, "Neither an integer nor a float type.");
return (__shfl_xor(var, laneMask, numThreads));
}
};
/** \brief User exposed API that captures the state of the parent group pre-partition
*/
template <unsigned int tileSize, typename ParentCGTy>
class parent_group_info {
public:
// Returns the linear rank of the group within the set of tiles partitioned
// from a parent group (bounded by meta_group_size)
__CG_STATIC_QUALIFIER__ unsigned int meta_group_rank() {
return ParentCGTy::thread_rank() / tileSize;
}
// Returns the number of groups created when the parent group was partitioned.
__CG_STATIC_QUALIFIER__ unsigned int meta_group_size() {
return (ParentCGTy::size() + tileSize - 1) / tileSize;
}
};
/** \brief Group type - thread_block_tile
*
 * \details Represents one tile of a thread group.
 * @note This type is implemented on Linux and under development
 * on Windows.
*/
template <unsigned int tileSize, class ParentCGTy>
class thread_block_tile_type : public thread_block_tile_base<tileSize>,
public tiled_group,
public parent_group_info<tileSize, ParentCGTy> {
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
protected:
__CG_QUALIFIER__ thread_block_tile_type() : tiled_group(numThreads) {
coalesced_info.tiled_info.size = numThreads;
coalesced_info.tiled_info.is_tiled = true;
}
};
// Partial template specialization
template <unsigned int tileSize>
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
public tiled_group
{
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
typedef thread_block_tile_base<numThreads> tbtBase;
protected:
__CG_QUALIFIER__ thread_block_tile_type(unsigned int meta_group_rank, unsigned int meta_group_size)
: tiled_group(numThreads) {
coalesced_info.tiled_info.size = numThreads;
coalesced_info.tiled_info.is_tiled = true;
coalesced_info.tiled_info.meta_group_rank = meta_group_rank;
coalesced_info.tiled_info.meta_group_size = meta_group_size;
}
public:
using tbtBase::size;
using tbtBase::sync;
using tbtBase::thread_rank;
__CG_QUALIFIER__ unsigned int meta_group_rank() const {
return coalesced_info.tiled_info.meta_group_rank;
}
__CG_QUALIFIER__ unsigned int meta_group_size() const {
return coalesced_info.tiled_info.meta_group_size;
}
  // end of the cooperative groups Doxygen group
/**
* @}
*/
};
/** \brief User exposed API to partition groups.
*
 * \details A collective operation that partitions the parent group into a one-dimensional,
 * row-major tiling of subgroups.
*/
__CG_QUALIFIER__ thread_group tiled_partition(const thread_group& parent, unsigned int tile_size) {
if (parent.cg_type() == internal::cg_tiled_group) {
const tiled_group* cg = static_cast<const tiled_group*>(&parent);
return cg->new_tiled_group(tile_size);
}
else if(parent.cg_type() == internal::cg_coalesced_group) {
const coalesced_group* cg = static_cast<const coalesced_group*>(&parent);
return cg->new_tiled_group(tile_size);
}
else {
const thread_block* tb = static_cast<const thread_block*>(&parent);
return tb->new_tiled_group(tile_size);
}
}
// Thread block type overload
__CG_QUALIFIER__ thread_group tiled_partition(const thread_block& parent, unsigned int tile_size) {
return (parent.new_tiled_group(tile_size));
}
__CG_QUALIFIER__ tiled_group tiled_partition(const tiled_group& parent, unsigned int tile_size) {
return (parent.new_tiled_group(tile_size));
}
// If a coalesced group is passed to be partitioned, it should remain coalesced
__CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tile_size) {
return (parent.new_tiled_group(tile_size));
}
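/**
 * Example: partitioning the current workgroup into 4-wide tiles at run time
 * (illustrative sketch, not part of this header). `tile_size` must be a
 * power of two no larger than the wavefront size, per the checks above; the
 * kernel name and `out` are assumptions, and a 1-D launch is assumed.
 *
 * @code
 * __global__ void tile_demo(unsigned int* out) {
 *   namespace cg = cooperative_groups;
 *   cg::thread_block tb = cg::this_thread_block();
 *   cg::thread_group tile4 = cg::tiled_partition(tb, 4);
 *   // Each thread writes its rank within its 4-wide tile, in [0, 4).
 *   out[blockIdx.x * blockDim.x + tb.thread_rank()] = tile4.thread_rank();
 * }
 * @endcode
 */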
template <unsigned int size, class ParentCGTy> class thread_block_tile;
namespace impl {
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
template <unsigned int size, class ParentCGTy>
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
protected:
template <unsigned int tbtSize, class tbtParentT>
__CG_QUALIFIER__ thread_block_tile_internal(
const thread_block_tile_internal<tbtSize, tbtParentT>& g)
: thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}
__CG_QUALIFIER__ thread_block_tile_internal(const thread_block& g)
: thread_block_tile_type<size, ParentCGTy>() {}
};
} // namespace impl
template <unsigned int size, class ParentCGTy>
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
protected:
__CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
: impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
public:
__CG_QUALIFIER__ operator thread_block_tile<size, void>() const {
return thread_block_tile<size, void>(*this);
}
};
template <unsigned int size>
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
template <unsigned int, class ParentCGTy> friend class thread_block_tile;
protected:
public:
template <class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile(const thread_block_tile<size, ParentCGTy>& g)
: impl::thread_block_tile_internal<size, void>(g) {}
};
template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
namespace impl {
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
template <unsigned int size>
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
__CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
: thread_block_tile<size, thread_block>(g) {}
};
} // namespace impl
/** \brief User exposed API to partition groups.
*
 * \details This constructs a templated class derived from thread_group.
 * The template defines the tile size of the new thread group at compile time.
*/
template <unsigned int size, class ParentCGTy>
__CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
static_assert(is_valid_tile_size<size>::value,
"Tiled partition with size > wavefront size. Currently not supported ");
return impl::tiled_partition_internal<size, ParentCGTy>(g);
}
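/**
 * Example: the compile-time variant returns a `thread_block_tile`, which
 * additionally exposes the shuffle members defined above (illustrative
 * sketch, not part of this header; the kernel name and the parameters `in`
 * and `out` are assumptions, and a 1-D launch is assumed).
 *
 * @code
 * __global__ void tile_reduce(const float* in, float* out) {
 *   namespace cg = cooperative_groups;
 *   cg::thread_block_tile<8> tile = cg::tiled_partition<8>(cg::this_thread_block());
 *   float v = in[blockIdx.x * blockDim.x + threadIdx.x];
 *   // Tree reduction across the 8-wide tile using shfl_down.
 *   for (unsigned int d = tile.size() / 2; d > 0; d /= 2) v += tile.shfl_down(v, d);
 *   if (tile.thread_rank() == 0) atomicAdd(out, v);  // tile leader accumulates
 * }
 * @endcode
 */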
} // namespace cooperative_groups
#endif // __cplusplus
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
/*
Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#ifdef __cplusplus
/**
* @brief Unsafe floating point rmw atomic add.
*
* Performs a relaxed read-modify-write floating point atomic add with
* device memory scope. Original value at \p addr is returned and
* the value of \p addr is updated to have the original value plus \p value
*
 * @note This operation currently differs from the safe atomics only on
 * the gfx90a target. Other devices continue to use safe atomics.
*
* It can be used to generate code that uses fast hardware floating point atomic
* operations which may handle rounding and subnormal values differently than
* non-atomic floating point operations.
*
 * The operation is not always safe and can have undefined behavior unless
 * the following conditions are met:
*
* - \p addr is at least 4 bytes aligned
* - If \p addr is a global segment address, it is in a coarse grain allocation.
* Passing in global segment addresses in fine grain allocations will result in
* undefined behavior and is not supported.
*
 * @param [in,out] addr Pointer to the value to be incremented by \p value.
 * @param [in] value Value by which \p addr is to be incremented.
* @return Original value contained in \p addr.
*/
__device__ inline float unsafeAtomicAdd(float* addr, float value) {
#if defined(__gfx90a__) && \
__has_builtin(__builtin_amdgcn_is_shared) && \
__has_builtin(__builtin_amdgcn_is_private) && \
__has_builtin(__builtin_amdgcn_ds_atomic_fadd_f32) && \
__has_builtin(__builtin_amdgcn_global_atomic_fadd_f32)
if (__builtin_amdgcn_is_shared(
(const __attribute__((address_space(0))) void*)addr))
return __builtin_amdgcn_ds_atomic_fadd_f32(addr, value);
else if (__builtin_amdgcn_is_private(
(const __attribute__((address_space(0))) void*)addr)) {
float temp = *addr;
*addr = temp + value;
return temp;
}
else
return __builtin_amdgcn_global_atomic_fadd_f32(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
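/**
 * Example: accumulating per-thread contributions into a single float with
 * the fast-path add (illustrative sketch, not part of this header). `sum`
 * must be 4-byte aligned and, if it is a global segment address, must lie
 * in a coarse grain allocation, per the conditions above; the kernel name
 * and the parameters `in` and `n` are assumptions.
 *
 * @code
 * __global__ void accumulate(const float* in, float* sum, int n) {
 *   int i = blockIdx.x * blockDim.x + threadIdx.x;
 *   if (i < n) unsafeAtomicAdd(sum, in[i]);  // HW atomic on gfx90a, safe path elsewhere
 * }
 * @endcode
 */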
/**
* @brief Unsafe floating point rmw atomic max.
*
* Performs a relaxed read-modify-write floating point atomic max with
* device memory scope. The original value at \p addr is returned and
* the value at \p addr is replaced by \p val if greater.
*
* @note This operation is currently identical to that performed by
* atomicMax and is included for completeness.
*
* @param [in,out] addr Pointer to value to be updated
* @param [in] val Value used to update the value at \p addr.
* @return Original value contained in \p addr.
*/
__device__ inline float unsafeAtomicMax(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value < val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned int *uaddr = (unsigned int *)addr;
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __uint_as_float(value) < val) {
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __uint_as_float(value);
#endif
}
/**
* @brief Unsafe floating point rmw atomic min.
*
* Performs a relaxed read-modify-write floating point atomic min with
* device memory scope. The original value at \p addr is returned and
* the value at \p addr is replaced by \p val if lesser.
*
* @note This operation is currently identical to that performed by
* atomicMin and is included for completeness.
*
* @param [in,out] addr Pointer to value to be updated
* @param [in] val Value used to update the value at \p addr.
* @return Original value contained in \p addr.
*/
__device__ inline float unsafeAtomicMin(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value > val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned int *uaddr = (unsigned int *)addr;
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __uint_as_float(value) > val) {
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __uint_as_float(value);
#endif
}
/**
* @brief Unsafe double precision rmw atomic add.
*
* Performs a relaxed read-modify-write double precision atomic add with
* device memory scope. Original value at \p addr is returned and
* the value of \p addr is updated to have the original value plus \p value
*
 * @note This operation currently differs from the safe atomics only on
 * the gfx90a target. Other devices continue to use safe atomics.
*
* It can be used to generate code that uses fast hardware floating point atomic
* operations which may handle rounding and subnormal values differently than
* non-atomic floating point operations.
*
 * The operation is not always safe and can have undefined behavior unless
 * the following conditions are met:
*
* - \p addr is at least 8 byte aligned
* - If \p addr is a global segment address, it is in a coarse grain allocation.
 * Passing in global segment addresses in fine grain allocations will result in
 * undefined behavior and is not supported.
*
* @param [in,out] addr Pointer to value to be updated.
 * @param [in] value Value by which \p addr is to be incremented.
* @return Original value contained in \p addr.
*/
__device__ inline double unsafeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
#elif __has_builtin(__hip_atomic_fetch_add)
return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
/**
* @brief Unsafe double precision rmw atomic max.
*
* Performs a relaxed read-modify-write double precision atomic max with
* device memory scope. Original value at \p addr is returned and
* the value of \p addr is updated with \p val if greater.
*
 * @note This operation currently differs from the safe atomics only on the
 * gfx90a, gfx940, gfx941 and gfx942 targets. Other devices continue to use safe atomics.
*
* It can be used to generate code that uses fast hardware floating point atomic
* operations which may handle rounding and subnormal values differently than
* non-atomic floating point operations.
*
 * The operation is not always safe and can have undefined behavior unless
 * the following conditions are met:
*
* - \p addr is at least 8 byte aligned
* - If \p addr is a global segment address, it is in a coarse grain allocation.
 * Passing in global segment addresses in fine grain allocations will result in
 * undefined behavior and is not supported.
*
* @param [in,out] addr Pointer to value to be updated.
 * @param [in] val Value used to update the contents at \p addr.
* @return Original value contained at \p addr.
*/
__device__ inline double unsafeAtomicMax(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
__has_builtin(__builtin_amdgcn_flat_atomic_fmax_f64)
return __builtin_amdgcn_flat_atomic_fmax_f64(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value < val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned long long *uaddr = (unsigned long long *)addr;
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __longlong_as_double(value) < val) {
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __longlong_as_double(value);
#endif
#endif
}
/**
* @brief Unsafe double precision rmw atomic min.
*
* Performs a relaxed read-modify-write double precision atomic min with
* device memory scope. Original value at \p addr is returned and
* the value of \p addr is updated with \p val if lesser.
*
 * @note This operation currently differs from the safe atomics only on the
 * gfx90a, gfx940, gfx941 and gfx942 targets. Other devices continue to use safe atomics.
*
* It can be used to generate code that uses fast hardware floating point atomic
* operations which may handle rounding and subnormal values differently than
* non-atomic floating point operations.
*
 * The operation is not always safe and can have undefined behavior unless
 * the following conditions are met:
*
* - \p addr is at least 8 byte aligned
* - If \p addr is a global segment address, it is in a coarse grain allocation.
 * Passing in global segment addresses in fine grain allocations will result in
 * undefined behavior and is not supported.
*
* @param [in,out] addr Pointer to value to be updated.
 * @param [in] val Value used to update the contents at \p addr.
* @return Original value contained at \p addr.
*/
__device__ inline double unsafeAtomicMin(double* addr, double val) {
#if (defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \
__has_builtin(__builtin_amdgcn_flat_atomic_fmin_f64)
return __builtin_amdgcn_flat_atomic_fmin_f64(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value > val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned long long *uaddr = (unsigned long long *)addr;
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __longlong_as_double(value) > val) {
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __longlong_as_double(value);
#endif
#endif
}
/**
* @brief Safe floating point rmw atomic add.
*
* Performs a relaxed read-modify-write floating point atomic add with
* device memory scope. Original value at \p addr is returned and
* the value of \p addr is updated to have the original value plus \p value
*
* @note This operation ensures that, on all targets, we produce safe atomics.
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
*
 * @param [in,out] addr Pointer to the value to be incremented by \p value.
 * @param [in] value Value by which \p addr is to be incremented.
* @return Original value contained in \p addr.
*/
__device__ inline float safeAtomicAdd(float* addr, float value) {
#if defined(__gfx908__) || defined(__gfx941__) \
|| ((defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx942__)) \
&& !__has_builtin(__hip_atomic_fetch_add))
// On gfx908, we can generate unsafe FP32 atomic add that does not follow all
// IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
// On gfx941, we can generate unsafe FP32 atomic add that may not always happen atomically,
// so we need to force a CAS loop emulation to ensure safety.
// On gfx90a, gfx940 and gfx942 if we do not have the __hip_atomic_fetch_add builtin, we
// need to force a CAS loop here.
float old_val;
#if __has_builtin(__hip_atomic_load)
old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
old_val = __uint_as_float(__atomic_load_n(reinterpret_cast<unsigned int*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
float expected, temp;
do {
temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
__hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
__atomic_compare_exchange_n(addr, &expected, old_val + value, false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
old_val = expected;
} while (__float_as_uint(temp) != __float_as_uint(old_val));
return old_val;
#elif defined(__gfx90a__)
// On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
// atomics will produce safe CAS loops, but are otherwise not different than
// agent-scope atomics. This logic is only applicable for gfx90a, and should
// not be assumed on other architectures.
return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif __has_builtin(__hip_atomic_fetch_add)
return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else
return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif
}
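/**
 * Example: the same accumulation with the safe variant, which preserves
 * IEEE behavior even when the translation unit is built with
 * -munsafe-fp-atomics (illustrative sketch; the kernel name and the
 * parameters `in`, `sum` and `n` are assumptions).
 *
 * @code
 * __global__ void accumulate_safe(const float* in, float* sum, int n) {
 *   int i = blockIdx.x * blockDim.x + threadIdx.x;
 *   if (i < n) safeAtomicAdd(sum, in[i]);
 * }
 * @endcode
 */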
/**
* @brief Safe floating point rmw atomic max.
*
* Performs a relaxed read-modify-write floating point atomic max with
* device memory scope. The original value at \p addr is returned and
* the value at \p addr is replaced by \p val if greater.
*
* @note This operation ensures that, on all targets, we produce safe atomics.
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
*
* @param [in,out] addr Pointer to value to be updated
* @param [in] val Value used to update the value at \p addr.
* @return Original value contained in \p addr.
*/
__device__ inline float safeAtomicMax(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value < val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned int *uaddr = (unsigned int *)addr;
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __uint_as_float(value) < val) {
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __uint_as_float(value);
#endif
}
/**
* @brief Safe floating point rmw atomic min.
*
* Performs a relaxed read-modify-write floating point atomic min with
* device memory scope. The original value at \p addr is returned and
* the value at \p addr is replaced by \p val if lesser.
*
* @note This operation ensures that, on all targets, we produce safe atomics.
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
*
* @param [in,out] addr Pointer to value to be updated
* @param [in] val Value used to update the value at \p addr.
* @return Original value contained in \p addr.
*/
__device__ inline float safeAtomicMin(float* addr, float val) {
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value > val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned int *uaddr = (unsigned int *)addr;
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __uint_as_float(value) > val) {
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __uint_as_float(value);
#endif
}
/**
* @brief Safe double precision rmw atomic add.
*
* Performs a relaxed read-modify-write double precision atomic add with
* device memory scope. Original value at \p addr is returned and
* the value of \p addr is updated to have the original value plus \p value
*
* @note This operation ensures that, on all targets, we produce safe atomics.
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
*
 * @param [in,out] addr Pointer to the value to be incremented by \p value.
 * @param [in] value Value by which \p addr is to be incremented.
* @return Original value contained in \p addr.
*/
__device__ inline double safeAtomicAdd(double* addr, double value) {
#if defined(__gfx90a__) && __has_builtin(__hip_atomic_fetch_add)
// On gfx90a, with the __hip_atomic_fetch_add builtin, relaxed system-scope
// atomics will produce safe CAS loops, but are otherwise not different than
// agent-scope atomics. This logic is only applicable for gfx90a, and should
// not be assumed on other architectures.
return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#elif defined(__gfx90a__)
// On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
// force a CAS loop here.
double old_val;
#if __has_builtin(__hip_atomic_load)
old_val = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_load)
old_val = __longlong_as_double(__atomic_load_n(reinterpret_cast<unsigned long long*>(addr), __ATOMIC_RELAXED));
#endif // __has_builtin(__hip_atomic_load)
double expected, temp;
do {
temp = expected = old_val;
#if __has_builtin(__hip_atomic_compare_exchange_strong)
__hip_atomic_compare_exchange_strong(addr, &expected, old_val + value, __ATOMIC_RELAXED,
__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_compare_exchange_strong)
__atomic_compare_exchange_n(addr, &expected, old_val + value, false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_compare_exchange_strong)
old_val = expected;
} while (__double_as_longlong(temp) != __double_as_longlong(old_val));
return old_val;
#else // !defined(__gfx90a__)
#if __has_builtin(__hip_atomic_fetch_add)
return __hip_atomic_fetch_add(addr, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#else // !__has_builtin(__hip_atomic_fetch_add)
return __atomic_fetch_add(addr, value, __ATOMIC_RELAXED);
#endif // __has_builtin(__hip_atomic_fetch_add)
#endif
}
/**
* @brief Safe double precision rmw atomic max.
*
* Performs a relaxed read-modify-write double precision atomic max with
* device memory scope. Original value at \p addr is returned and
* the value of \p addr is updated with \p val if greater.
*
* @note This operation ensures that, on all targets, we produce safe atomics.
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
*
* @param [in,out] addr Pointer to value to be updated.
 * @param [in] val Value used to update the contents at \p addr.
* @return Original value contained at \p addr.
*/
__device__ inline double safeAtomicMax(double* addr, double val) {
#if __has_builtin(__builtin_amdgcn_is_private)
if (__builtin_amdgcn_is_private(
(const __attribute__((address_space(0))) void*)addr)) {
double old = *addr;
*addr = __builtin_fmax(old, val);
return old;
} else {
#endif
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value < val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned long long *uaddr = (unsigned long long *)addr;
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __longlong_as_double(value) < val) {
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __longlong_as_double(value);
#endif
#if __has_builtin(__builtin_amdgcn_is_private)
}
#endif
}
/**
* @brief Safe double precision rmw atomic min.
*
* Performs a relaxed read-modify-write double precision atomic min with
* device memory scope. Original value at \p addr is returned and
* the value of \p addr is updated with \p val if lesser.
*
* @note This operation ensures that, on all targets, we produce safe atomics.
* This will be the case even when -munsafe-fp-atomics is passed into the compiler.
*
* @param [in,out] addr Pointer to value to be updated.
 * @param [in] val Value used to update the contents at \p addr.
* @return Original value contained at \p addr.
*/
__device__ inline double safeAtomicMin(double* addr, double val) {
#if __has_builtin(__builtin_amdgcn_is_private)
if (__builtin_amdgcn_is_private(
(const __attribute__((address_space(0))) void*)addr)) {
double old = *addr;
*addr = __builtin_fmin(old, val);
return old;
} else {
#endif
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value > val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned long long *uaddr = (unsigned long long *)addr;
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __longlong_as_double(value) > val) {
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __longlong_as_double(value);
#endif
#if __has_builtin(__builtin_amdgcn_is_private)
}
#endif
}
#endif
/*
Copyright (c) 2015 - Present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if !defined(__HIPCC_RTC__)
#include "amd_device_functions.h"
#endif
#if __has_builtin(__hip_atomic_compare_exchange_strong)
template<bool B, typename T, typename F> struct Cond_t;
template<typename T, typename F> struct Cond_t<true, T, F> { using type = T; };
template<typename T, typename F> struct Cond_t<false, T, F> { using type = F; };
#if !__HIP_DEVICE_COMPILE__
//TODO: Remove this after the compiler pre-defines the following macros.
#define __HIP_MEMORY_SCOPE_SINGLETHREAD 1
#define __HIP_MEMORY_SCOPE_WAVEFRONT 2
#define __HIP_MEMORY_SCOPE_WORKGROUP 3
#define __HIP_MEMORY_SCOPE_AGENT 4
#define __HIP_MEMORY_SCOPE_SYSTEM 5
#endif
#if !defined(__HIPCC_RTC__)
#include "amd_hip_unsafe_atomics.h"
#endif
// Atomic expanders
template<
int mem_order = __ATOMIC_SEQ_CST,
  int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM,
typename T,
typename Op,
typename F>
inline
__attribute__((always_inline, device))
T hip_cas_expander(T* p, T x, Op op, F f) noexcept
{
using FP = __attribute__((address_space(0))) const void*;
__device__
extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
if (is_shared_workaround((FP)p))
return f();
using U = typename Cond_t<
sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
auto q = reinterpret_cast<U*>(p);
U tmp0{__hip_atomic_load(q, mem_order, mem_scope)};
U tmp1;
do {
tmp1 = tmp0;
op(reinterpret_cast<T&>(tmp1), x);
} while (!__hip_atomic_compare_exchange_strong(q, &tmp0, tmp1, mem_order,
mem_order, mem_scope));
return reinterpret_cast<const T&>(tmp0);
}
template<
int mem_order = __ATOMIC_SEQ_CST,
  int mem_scope = __HIP_MEMORY_SCOPE_SYSTEM,
typename T,
typename Cmp,
typename F>
inline
__attribute__((always_inline, device))
T hip_cas_extrema_expander(T* p, T x, Cmp cmp, F f) noexcept
{
using FP = __attribute__((address_space(0))) const void*;
__device__
extern bool is_shared_workaround(FP) asm("llvm.amdgcn.is.shared");
if (is_shared_workaround((FP)p))
return f();
using U = typename Cond_t<
sizeof(T) == sizeof(unsigned int), unsigned int, unsigned long long>::type;
auto q = reinterpret_cast<U*>(p);
U tmp{__hip_atomic_load(q, mem_order, mem_scope)};
while (cmp(x, reinterpret_cast<const T&>(tmp)) &&
!__hip_atomic_compare_exchange_strong(q, &tmp, x, mem_order, mem_order,
mem_scope));
return reinterpret_cast<const T&>(tmp);
}
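// The expanders above turn an arbitrary read-modify-write into a
// compare-exchange loop on a same-sized integer view of *p, falling back to
// the native operation `f` when the address is in shared (LDS) memory. An
// illustrative instantiation (a sketch; `atomicAddPopc` is a hypothetical
// helper, not part of this header):
//
//   __device__ unsigned int atomicAddPopc(unsigned int* p, unsigned int v) {
//     return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
//         p, v, [](unsigned int& x, unsigned int y) { x += __popc(y); },
//         [=]() {
//           return __hip_atomic_fetch_add(p, (unsigned int)__popc(v),
//                                         __ATOMIC_RELAXED,
//                                         __HIP_MEMORY_SCOPE_AGENT);
//         });
//   }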
__device__
inline
int atomicCAS(int* address, int compare, int val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
return compare;
}
__device__
inline
int atomicCAS_system(int* address, int compare, int val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
return compare;
}
__device__
inline
unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
return compare;
}
__device__
inline
unsigned int atomicCAS_system(unsigned int* address, unsigned int compare, unsigned int val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
return compare;
}
__device__
inline
unsigned long atomicCAS(unsigned long* address, unsigned long compare, unsigned long val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
return compare;
}
__device__
inline
unsigned long atomicCAS_system(unsigned long* address, unsigned long compare, unsigned long val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
return compare;
}
__device__
inline
unsigned long long atomicCAS(unsigned long long* address, unsigned long long compare,
unsigned long long val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
return compare;
}
__device__
inline
unsigned long long atomicCAS_system(unsigned long long* address, unsigned long long compare,
unsigned long long val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
return compare;
}
__device__
inline
float atomicCAS(float* address, float compare, float val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
return compare;
}
__device__
inline
float atomicCAS_system(float* address, float compare, float val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
return compare;
}
__device__
inline
double atomicCAS(double* address, double compare, double val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
return compare;
}
__device__
inline
double atomicCAS_system(double* address, double compare, double val) {
__hip_atomic_compare_exchange_strong(address, &compare, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
return compare;
}
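// atomicCAS returns the value it observed at *address, which enables the
// classic retry loop for operations without a dedicated atomic. A minimal
// sketch (the helper `atomicMulDouble` is hypothetical, not part of this
// header); bit patterns are compared to avoid a NaN-induced infinite loop:
//
//   __device__ double atomicMulDouble(double* address, double factor) {
//     double observed = *address;
//     double assumed;
//     do {
//       assumed = observed;
//       observed = atomicCAS(address, assumed, assumed * factor);
//     } while (__double_as_longlong(observed) != __double_as_longlong(assumed));
//     return observed;  // value before the final multiply
//   }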
__device__
inline
int atomicAdd(int* address, int val) {
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
int atomicAdd_system(int* address, int val) {
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned int atomicAdd(unsigned int* address, unsigned int val) {
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned int atomicAdd_system(unsigned int* address, unsigned int val) {
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned long atomicAdd(unsigned long* address, unsigned long val) {
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned long atomicAdd_system(unsigned long* address, unsigned long val) {
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned long long atomicAdd(unsigned long long* address, unsigned long long val) {
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned long long atomicAdd_system(unsigned long long* address, unsigned long long val) {
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
float atomicAdd(float* address, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicAdd(address, val);
#else
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}
__device__
inline
float atomicAdd_system(float* address, float val) {
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
#if !defined(__HIPCC_RTC__)
DEPRECATED("use atomicAdd instead")
#endif // !defined(__HIPCC_RTC__)
__device__
inline
void atomicAddNoRet(float* address, float val)
{
__ockl_atomic_add_noret_f32(address, val);
}
__device__
inline
double atomicAdd(double* address, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicAdd(address, val);
#else
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}
__device__
inline
double atomicAdd_system(double* address, double val) {
return __hip_atomic_fetch_add(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
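// Illustrative usage (a sketch, not part of this header): a byte histogram
// accumulated with the unsigned int atomicAdd overload; `histogram256` is a
// hypothetical kernel name.
//
//   __global__ void histogram256(unsigned int* bins /* 256 entries */,
//                                const unsigned char* in, size_t n) {
//     size_t i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) atomicAdd(&bins[in[i]], 1u);
//   }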
__device__
inline
int atomicSub(int* address, int val) {
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
int atomicSub_system(int* address, int val) {
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned int atomicSub(unsigned int* address, unsigned int val) {
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned int atomicSub_system(unsigned int* address, unsigned int val) {
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned long atomicSub(unsigned long* address, unsigned long val) {
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned long atomicSub_system(unsigned long* address, unsigned long val) {
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned long long atomicSub(unsigned long long* address, unsigned long long val) {
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned long long atomicSub_system(unsigned long long* address, unsigned long long val) {
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
float atomicSub(float* address, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicAdd(address, -val);
#else
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}
__device__
inline
float atomicSub_system(float* address, float val) {
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
double atomicSub(double* address, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicAdd(address, -val);
#else
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif
}
__device__
inline
double atomicSub_system(double* address, double val) {
return __hip_atomic_fetch_add(address, -val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
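// Note: the unsigned atomicSub overloads are implemented as a fetch-add of
// -val, which is well defined for unsigned types (modular arithmetic), so
//
//   atomicSub(&x, 5u);        // is equivalent to
//   atomicAdd(&x, 0u - 5u);   // i.e. adding 2^32 - 5 modulo 2^32
//
// matching the usual CUDA-style semantics.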
__device__
inline
int atomicExch(int* address, int val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
int atomicExch_system(int* address, int val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned int atomicExch(unsigned int* address, unsigned int val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned int atomicExch_system(unsigned int* address, unsigned int val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned long atomicExch(unsigned long* address, unsigned long val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned long atomicExch_system(unsigned long* address, unsigned long val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
unsigned long long atomicExch(unsigned long long* address, unsigned long long val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
unsigned long long atomicExch_system(unsigned long long* address, unsigned long long val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
float atomicExch(float* address, float val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
float atomicExch_system(float* address, float val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__
inline
double atomicExch(double* address, double val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__
inline
double atomicExch_system(double* address, double val) {
return __hip_atomic_exchange(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
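// Illustrative usage (a sketch, not part of this header): a test-and-set
// lock built on atomicExch; `spin_lock`/`spin_unlock` are hypothetical
// helpers. Naive spinlocks can deadlock under wavefront divergence, so this
// pattern is normally used with one acquisition per wavefront or block.
//
//   __device__ void spin_lock(unsigned int* lock) {
//     while (atomicExch(lock, 1u) != 0u) {}  // 0 = free, 1 = held
//   }
//   __device__ void spin_unlock(unsigned int* lock) {
//     atomicExch(lock, 0u);                  // release by storing 0
//   }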
__device__
inline
int atomicMin(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](int x, int y) { return x < y; }, [=]() {
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
int atomicMin_system(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](int x, int y) { return x < y; }, [=]() {
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned int atomicMin(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned int atomicMin_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](unsigned int x, unsigned int y) { return x < y; }, [=]() {
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned long atomicMin(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address,
val,
[](unsigned long x, unsigned long y) { return x < y; },
[=]() {
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned long atomicMin_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address,
val,
[](unsigned long x, unsigned long y) { return x < y; },
[=]() {
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned long long atomicMin(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address,
val,
[](unsigned long long x, unsigned long long y) { return x < y; },
[=]() {
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned long long atomicMin_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address,
val,
[](unsigned long long x, unsigned long long y) { return x < y; },
[=]() {
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
long long atomicMin(long long* address, long long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](long long x, long long y) { return x < y; },
[=]() {
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
long long atomicMin_system(long long* address, long long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](long long x, long long y) { return x < y; },
[=]() {
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_min(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
float atomicMin(float* addr, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicMin(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value > val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned int *uaddr = (unsigned int *)addr;
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __uint_as_float(value) > val) {
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __uint_as_float(value);
#endif
#endif
}
__device__
inline
float atomicMin_system(float* address, float val) {
unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
#if __has_builtin(__hip_atomic_load)
unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
#else
unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
#endif
float value = __uint_as_float(tmp);
while (val < value) {
value = atomicCAS_system(address, value, val);
}
return value;
}
__device__
inline
double atomicMin(double* addr, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicMin(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value > val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned long long *uaddr = (unsigned long long *)addr;
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __longlong_as_double(value) > val) {
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __longlong_as_double(value);
#endif
#endif
}
__device__
inline
double atomicMin_system(double* address, double val) {
unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
#if __has_builtin(__hip_atomic_load)
unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
#else
unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
#endif
double value = __longlong_as_double(tmp);
while (val < value) {
value = atomicCAS_system(address, value, val);
}
return value;
}
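// The floating-point min/max in this file are emulated with compare-exchange
// retry loops rather than a fetch_min/fetch_max builtin: each failed CAS
// refreshes the locally observed value, and the loop exits as soon as the
// stored value is already no worse than val, so the original value is always
// returned and forward progress is guaranteed under contention.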
__device__
inline
int atomicMax(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](int x, int y) { return y < x; }, [=]() {
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
int atomicMax_system(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](int x, int y) { return y < x; }, [=]() {
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned int atomicMax(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned int atomicMax_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](unsigned int x, unsigned int y) { return y < x; }, [=]() {
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned long atomicMax(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address,
val,
[](unsigned long x, unsigned long y) { return y < x; },
[=]() {
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned long atomicMax_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address,
val,
[](unsigned long x, unsigned long y) { return y < x; },
[=]() {
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned long long atomicMax(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address,
val,
[](unsigned long long x, unsigned long long y) { return y < x; },
[=]() {
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned long long atomicMax_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address,
val,
[](unsigned long long x, unsigned long long y) { return y < x; },
[=]() {
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
long long atomicMax(long long* address, long long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](long long x, long long y) { return y < x; },
[=]() {
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
long long atomicMax_system(long long* address, long long val) {
#if defined(__gfx941__)
return hip_cas_extrema_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](long long x, long long y) { return y < x; },
[=]() {
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_max(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
float atomicMax(float* addr, float val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicMax(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value < val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned int *uaddr = (unsigned int *)addr;
unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __uint_as_float(value) < val) {
done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __uint_as_float(value);
#endif
#endif
}
__device__
inline
float atomicMax_system(float* address, float val) {
unsigned int* uaddr { reinterpret_cast<unsigned int*>(address) };
#if __has_builtin(__hip_atomic_load)
unsigned int tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
#else
unsigned int tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
#endif
float value = __uint_as_float(tmp);
while (value < val) {
value = atomicCAS_system(address, value, val);
}
return value;
}
__device__
inline
double atomicMax(double* addr, double val) {
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicMax(addr, val);
#else
#if __has_builtin(__hip_atomic_load) && \
__has_builtin(__hip_atomic_compare_exchange_strong)
double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
bool done = false;
while (!done && value < val) {
done = __hip_atomic_compare_exchange_strong(addr, &value, val,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
return value;
#else
unsigned long long *uaddr = (unsigned long long *)addr;
unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED);
bool done = false;
while (!done && __longlong_as_double(value) < val) {
done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false,
__ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
return __longlong_as_double(value);
#endif
#endif
}
__device__
inline
double atomicMax_system(double* address, double val) {
unsigned long long* uaddr { reinterpret_cast<unsigned long long*>(address) };
#if __has_builtin(__hip_atomic_load)
unsigned long long tmp {__hip_atomic_load(uaddr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM)};
#else
unsigned long long tmp {__atomic_load_n(uaddr, __ATOMIC_RELAXED)};
#endif
double value = __longlong_as_double(tmp);
while (value < val) {
value = atomicCAS_system(address, value, val);
}
return value;
}
__device__
inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
#if defined(__gfx941__)
__device__
extern
unsigned int __builtin_amdgcn_atomic_inc(
unsigned int*,
unsigned int,
unsigned int,
unsigned int,
bool) __asm("llvm.amdgcn.atomic.inc.i32.p0i32");
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address,
val,
[](unsigned int& x, unsigned int y) { x = (x >= y) ? 0 : (x + 1); },
[=]() {
return
__builtin_amdgcn_atomic_inc(address, val, __ATOMIC_RELAXED, 1, false);
});
#else
return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
#endif // __gfx941__
}
__device__
inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
#if defined(__gfx941__)
__device__
extern
unsigned int __builtin_amdgcn_atomic_dec(
unsigned int*,
unsigned int,
unsigned int,
unsigned int,
bool) __asm("llvm.amdgcn.atomic.dec.i32.p0i32");
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address,
val,
[](unsigned int& x, unsigned int y) { x = (!x || x > y) ? y : (x - 1); },
[=]() {
return
__builtin_amdgcn_atomic_dec(address, val, __ATOMIC_RELAXED, 1, false);
});
#else
return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
#endif // __gfx941__
}
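// atomicInc/atomicDec follow the CUDA-style wrapping semantics spelled out in
// the lambdas above: atomicInc stores ((old >= val) ? 0 : old + 1) and
// atomicDec stores ((old == 0 || old > val) ? val : old - 1), returning old.
// A sketch of the common ring-buffer-index use (`next_slot` is hypothetical):
//
//   __device__ unsigned int next_slot(unsigned int* head, unsigned int capacity) {
//     return atomicInc(head, capacity - 1);  // cycles 0, 1, ..., capacity-1, 0, ...
//   }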
__device__
inline
int atomicAnd(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](int& x, int y) { x &= y; }, [=]() {
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
int atomicAnd_system(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](int& x, int y) { x &= y; }, [=]() {
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned int atomicAnd(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned int atomicAnd_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](unsigned int& x, unsigned int y) { x &= y; }, [=]() {
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned long atomicAnd(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned long atomicAnd_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](unsigned long& x, unsigned long y) { x &= y; }, [=]() {
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned long long atomicAnd(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address,
val,
[](unsigned long long& x, unsigned long long y) { x &= y; },
[=]() {
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned long long atomicAnd_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address,
val,
[](unsigned long long& x, unsigned long long y) { x &= y; },
[=]() {
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_and(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
int atomicOr(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](int& x, int y) { x |= y; }, [=]() {
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
int atomicOr_system(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](int& x, int y) { x |= y; }, [=]() {
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned int atomicOr(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned int atomicOr_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](unsigned int& x, unsigned int y) { x |= y; }, [=]() {
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned long atomicOr(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned long atomicOr_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](unsigned long& x, unsigned long y) { x |= y; }, [=]() {
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned long long atomicOr(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address,
val,
[](unsigned long long& x, unsigned long long y) { x |= y; },
[=]() {
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned long long atomicOr_system(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address,
val,
[](unsigned long long& x, unsigned long long y) { x |= y; },
[=]() {
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_or(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
int atomicXor(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](int& x, int y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
int atomicXor_system(int* address, int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](int& x, int y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned int atomicXor(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned int atomicXor_system(unsigned int* address, unsigned int val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](unsigned int& x, unsigned int y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned long atomicXor(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned long atomicXor_system(unsigned long* address, unsigned long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM>(
address, val, [](unsigned long& x, unsigned long y) { x ^= y; }, [=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_SYSTEM);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
#endif // __gfx941__
}
__device__
inline
unsigned long long atomicXor(unsigned long long* address, unsigned long long val) {
#if defined(__gfx941__)
return hip_cas_expander<__ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT>(
address,
val,
[](unsigned long long& x, unsigned long long y) { x ^= y; },
[=]() {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED,
__HIP_MEMORY_SCOPE_AGENT);
});
#else
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
#endif // __gfx941__
}
__device__
inline
unsigned long long atomicXor_system(unsigned long long* address, unsigned long long val) {
return __hip_atomic_fetch_xor(address, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
}
#else // __hip_atomic_compare_exchange_strong
__device__
inline
int atomicCAS(int* address, int compare, int val)
{
__atomic_compare_exchange_n(
address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
return compare;
}
__device__
inline
unsigned int atomicCAS(
unsigned int* address, unsigned int compare, unsigned int val)
{
__atomic_compare_exchange_n(
address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
return compare;
}
__device__
inline
unsigned long long atomicCAS(
unsigned long long* address,
unsigned long long compare,
unsigned long long val)
{
__atomic_compare_exchange_n(
address, &compare, val, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
return compare;
}
__device__
inline
int atomicAdd(int* address, int val)
{
return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicAdd(unsigned int* address, unsigned int val)
{
return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicAdd(
unsigned long long* address, unsigned long long val)
{
return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
}
__device__
inline
float atomicAdd(float* address, float val)
{
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicAdd(address, val);
#else
return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#endif
}
#if !defined(__HIPCC_RTC__)
DEPRECATED("use atomicAdd instead")
#endif // !defined(__HIPCC_RTC__)
__device__
inline
void atomicAddNoRet(float* address, float val)
{
__ockl_atomic_add_noret_f32(address, val);
}
__device__
inline
double atomicAdd(double* address, double val)
{
#if defined(__AMDGCN_UNSAFE_FP_ATOMICS__)
return unsafeAtomicAdd(address, val);
#else
return __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#endif
}
__device__
inline
int atomicSub(int* address, int val)
{
return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicSub(unsigned int* address, unsigned int val)
{
return __atomic_fetch_sub(address, val, __ATOMIC_RELAXED);
}
__device__
inline
int atomicExch(int* address, int val)
{
return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicExch(unsigned int* address, unsigned int val)
{
return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicExch(unsigned long long* address, unsigned long long val)
{
return __atomic_exchange_n(address, val, __ATOMIC_RELAXED);
}
__device__
inline
float atomicExch(float* address, float val)
{
return __uint_as_float(__atomic_exchange_n(
reinterpret_cast<unsigned int*>(address),
__float_as_uint(val),
__ATOMIC_RELAXED));
}
__device__
inline
int atomicMin(int* address, int val)
{
return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicMin(unsigned int* address, unsigned int val)
{
return __atomic_fetch_min(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicMin(
unsigned long long* address, unsigned long long val)
{
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
while (val < tmp) {
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
if (tmp1 != tmp) { tmp = tmp1; continue; }
tmp = atomicCAS(address, tmp, val);
}
return tmp;
}
__device__ inline long long atomicMin(long long* address, long long val) {
long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
while (val < tmp) {
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
if (tmp1 != tmp) {
tmp = tmp1;
continue;
}
tmp = atomicCAS(address, tmp, val);
}
return tmp;
}
__device__
inline
int atomicMax(int* address, int val)
{
return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicMax(unsigned int* address, unsigned int val)
{
return __atomic_fetch_max(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicMax(
unsigned long long* address, unsigned long long val)
{
unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
while (tmp < val) {
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
if (tmp1 != tmp) { tmp = tmp1; continue; }
tmp = atomicCAS(address, tmp, val);
}
return tmp;
}
__device__ inline long long atomicMax(long long* address, long long val) {
long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)};
while (tmp < val) {
const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED);
if (tmp1 != tmp) {
tmp = tmp1;
continue;
}
tmp = atomicCAS(address, tmp, val);
}
return tmp;
}
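// In the fallback 64-bit min/max loops above, the value is re-read until the
// snapshot is stable before attempting the CAS; a failed atomicCAS returns
// the freshly observed contents, so each iteration either installs val or
// learns that val is no longer an improvement and exits.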
__device__
inline
unsigned int atomicInc(unsigned int* address, unsigned int val)
{
return __builtin_amdgcn_atomic_inc32(address, val, __ATOMIC_RELAXED, "agent");
}
__device__
inline
unsigned int atomicDec(unsigned int* address, unsigned int val)
{
return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent");
}
__device__
inline
int atomicAnd(int* address, int val)
{
return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicAnd(unsigned int* address, unsigned int val)
{
return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicAnd(
unsigned long long* address, unsigned long long val)
{
return __atomic_fetch_and(address, val, __ATOMIC_RELAXED);
}
__device__
inline
int atomicOr(int* address, int val)
{
return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicOr(unsigned int* address, unsigned int val)
{
return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicOr(
unsigned long long* address, unsigned long long val)
{
return __atomic_fetch_or(address, val, __ATOMIC_RELAXED);
}
__device__
inline
int atomicXor(int* address, int val)
{
return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned int atomicXor(unsigned int* address, unsigned int val)
{
return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
}
__device__
inline
unsigned long long atomicXor(
unsigned long long* address, unsigned long long val)
{
return __atomic_fetch_xor(address, val, __ATOMIC_RELAXED);
}
#endif // __hip_atomic_compare_exchange_strong
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if !defined(__HIPCC_RTC__)
#include "host_defines.h"
#include "amd_hip_vector_types.h" // For Native_vec_
#endif
#if defined(__cplusplus)
extern "C" {
#endif
// DOT FUNCTIONS
#if defined(__clang__) && defined(__HIP__)
__device__
__attribute__((const))
int __ockl_sdot2(
HIP_vector_base<short, 2>::Native_vec_,
HIP_vector_base<short, 2>::Native_vec_,
int, bool);
__device__
__attribute__((const))
unsigned int __ockl_udot2(
HIP_vector_base<unsigned short, 2>::Native_vec_,
HIP_vector_base<unsigned short, 2>::Native_vec_,
unsigned int, bool);
__device__
__attribute__((const))
int __ockl_sdot4(
HIP_vector_base<char, 4>::Native_vec_,
HIP_vector_base<char, 4>::Native_vec_,
int, bool);
__device__
__attribute__((const))
unsigned int __ockl_udot4(
HIP_vector_base<unsigned char, 4>::Native_vec_,
HIP_vector_base<unsigned char, 4>::Native_vec_,
unsigned int, bool);
__device__
__attribute__((const))
int __ockl_sdot8(int, int, int, bool);
__device__
__attribute__((const))
unsigned int __ockl_udot8(unsigned int, unsigned int, unsigned int, bool);
#endif
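// Illustrative usage (a sketch, not part of this header): a 4-way int8 dot
// product with accumulate via __ockl_sdot4. The helper name `dot4` is
// hypothetical; this assumes the HIP vector type exposes its native vector
// through the `.data` member, and that the trailing bool selects saturating
// (clamped) accumulation, per the OCKL dot-product convention.
//
//   __device__ int dot4(char4 a, char4 b, int acc) {
//     return __ockl_sdot4(a.data, b.data, acc, false);  // false: no clamping
//   }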
#if !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
// BEGIN FLOAT
__device__
__attribute__((const))
float __ocml_acos_f32(float);
__device__
__attribute__((pure))
float __ocml_acosh_f32(float);
__device__
__attribute__((const))
float __ocml_asin_f32(float);
__device__
__attribute__((pure))
float __ocml_asinh_f32(float);
__device__
__attribute__((const))
float __ocml_atan2_f32(float, float);
__device__
__attribute__((const))
float __ocml_atan_f32(float);
__device__
__attribute__((pure))
float __ocml_atanh_f32(float);
__device__
__attribute__((pure))
float __ocml_cbrt_f32(float);
__device__
__attribute__((const))
float __ocml_ceil_f32(float);
__device__
__attribute__((const))
float __ocml_copysign_f32(float, float);
__device__
float __ocml_cos_f32(float);
__device__
float __ocml_native_cos_f32(float);
__device__
__attribute__((pure))
float __ocml_cosh_f32(float);
__device__
float __ocml_cospi_f32(float);
__device__
float __ocml_i0_f32(float);
__device__
float __ocml_i1_f32(float);
__device__
__attribute__((pure))
float __ocml_erfc_f32(float);
__device__
__attribute__((pure))
float __ocml_erfcinv_f32(float);
__device__
__attribute__((pure))
float __ocml_erfcx_f32(float);
__device__
__attribute__((pure))
float __ocml_erf_f32(float);
__device__
__attribute__((pure))
float __ocml_erfinv_f32(float);
__device__
__attribute__((pure))
float __ocml_exp10_f32(float);
__device__
__attribute__((pure))
float __ocml_native_exp10_f32(float);
__device__
__attribute__((pure))
float __ocml_exp2_f32(float);
__device__
__attribute__((pure))
float __ocml_exp_f32(float);
__device__
__attribute__((pure))
float __ocml_native_exp_f32(float);
__device__
__attribute__((pure))
float __ocml_expm1_f32(float);
__device__
__attribute__((const))
float __ocml_fabs_f32(float);
__device__
__attribute__((const))
float __ocml_fdim_f32(float, float);
__device__
__attribute__((const))
float __ocml_floor_f32(float);
__device__
__attribute__((const))
float __ocml_fma_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_fmax_f32(float, float);
__device__
__attribute__((const))
float __ocml_fmin_f32(float, float);
__device__
__attribute__((const))
float __ocml_fmod_f32(float, float);
__device__
float __ocml_frexp_f32(float, __attribute__((address_space(5))) int*);
__device__
__attribute__((const))
float __ocml_hypot_f32(float, float);
__device__
__attribute__((const))
int __ocml_ilogb_f32(float);
__device__
__attribute__((const))
int __ocml_isfinite_f32(float);
__device__
__attribute__((const))
int __ocml_isinf_f32(float);
__device__
__attribute__((const))
int __ocml_isnan_f32(float);
__device__
float __ocml_j0_f32(float);
__device__
float __ocml_j1_f32(float);
__device__
__attribute__((const))
float __ocml_ldexp_f32(float, int);
__device__
float __ocml_lgamma_f32(float);
__device__
__attribute__((pure))
float __ocml_log10_f32(float);
__device__
__attribute__((pure))
float __ocml_native_log10_f32(float);
__device__
__attribute__((pure))
float __ocml_log1p_f32(float);
__device__
__attribute__((pure))
float __ocml_log2_f32(float);
__device__
__attribute__((pure))
float __ocml_native_log2_f32(float);
__device__
__attribute__((const))
float __ocml_logb_f32(float);
__device__
__attribute__((pure))
float __ocml_log_f32(float);
__device__
__attribute__((pure))
float __ocml_native_log_f32(float);
__device__
float __ocml_modf_f32(float, __attribute__((address_space(5))) float*);
__device__
__attribute__((const))
float __ocml_nearbyint_f32(float);
__device__
__attribute__((const))
float __ocml_nextafter_f32(float, float);
__device__
__attribute__((const))
float __ocml_len3_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_len4_f32(float, float, float, float);
__device__
__attribute__((pure))
float __ocml_ncdf_f32(float);
__device__
__attribute__((pure))
float __ocml_ncdfinv_f32(float);
__device__
__attribute__((pure))
float __ocml_pow_f32(float, float);
__device__
__attribute__((pure))
float __ocml_pown_f32(float, int);
__device__
__attribute__((pure))
float __ocml_rcbrt_f32(float);
__device__
__attribute__((const))
float __ocml_remainder_f32(float, float);
__device__
float __ocml_remquo_f32(float, float, __attribute__((address_space(5))) int*);
__device__
__attribute__((const))
float __ocml_rhypot_f32(float, float);
__device__
__attribute__((const))
float __ocml_rint_f32(float);
__device__
__attribute__((const))
float __ocml_rlen3_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_rlen4_f32(float, float, float, float);
__device__
__attribute__((const))
float __ocml_round_f32(float);
__device__
__attribute__((pure))
float __ocml_rsqrt_f32(float);
__device__
__attribute__((const))
float __ocml_scalb_f32(float, float);
__device__
__attribute__((const))
float __ocml_scalbn_f32(float, int);
__device__
__attribute__((const))
int __ocml_signbit_f32(float);
__device__
float __ocml_sincos_f32(float, __attribute__((address_space(5))) float*);
__device__
float __ocml_sincospi_f32(float, __attribute__((address_space(5))) float*);
__device__
float __ocml_sin_f32(float);
__device__
float __ocml_native_sin_f32(float);
__device__
__attribute__((pure))
float __ocml_sinh_f32(float);
__device__
float __ocml_sinpi_f32(float);
__device__
__attribute__((const))
float __ocml_sqrt_f32(float);
__device__
__attribute__((const))
float __ocml_native_sqrt_f32(float);
__device__
float __ocml_tan_f32(float);
__device__
__attribute__((pure))
float __ocml_tanh_f32(float);
__device__
float __ocml_tgamma_f32(float);
__device__
__attribute__((const))
float __ocml_trunc_f32(float);
__device__
float __ocml_y0_f32(float);
__device__
float __ocml_y1_f32(float);
// BEGIN INTRINSICS
__device__
__attribute__((const))
float __ocml_add_rte_f32(float, float);
__device__
__attribute__((const))
float __ocml_add_rtn_f32(float, float);
__device__
__attribute__((const))
float __ocml_add_rtp_f32(float, float);
__device__
__attribute__((const))
float __ocml_add_rtz_f32(float, float);
__device__
__attribute__((const))
float __ocml_sub_rte_f32(float, float);
__device__
__attribute__((const))
float __ocml_sub_rtn_f32(float, float);
__device__
__attribute__((const))
float __ocml_sub_rtp_f32(float, float);
__device__
__attribute__((const))
float __ocml_sub_rtz_f32(float, float);
__device__
__attribute__((const))
float __ocml_mul_rte_f32(float, float);
__device__
__attribute__((const))
float __ocml_mul_rtn_f32(float, float);
__device__
__attribute__((const))
float __ocml_mul_rtp_f32(float, float);
__device__
__attribute__((const))
float __ocml_mul_rtz_f32(float, float);
__device__
__attribute__((const))
float __ocml_div_rte_f32(float, float);
__device__
__attribute__((const))
float __ocml_div_rtn_f32(float, float);
__device__
__attribute__((const))
float __ocml_div_rtp_f32(float, float);
__device__
__attribute__((const))
float __ocml_div_rtz_f32(float, float);
__device__
__attribute__((const))
float __ocml_sqrt_rte_f32(float);
__device__
__attribute__((const))
float __ocml_sqrt_rtn_f32(float);
__device__
__attribute__((const))
float __ocml_sqrt_rtp_f32(float);
__device__
__attribute__((const))
float __ocml_sqrt_rtz_f32(float);
__device__
__attribute__((const))
float __ocml_fma_rte_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_fma_rtn_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_fma_rtp_f32(float, float, float);
__device__
__attribute__((const))
float __ocml_fma_rtz_f32(float, float, float);
// END INTRINSICS
// END FLOAT
// BEGIN DOUBLE
__device__
__attribute__((const))
double __ocml_acos_f64(double);
__device__
__attribute__((pure))
double __ocml_acosh_f64(double);
__device__
__attribute__((const))
double __ocml_asin_f64(double);
__device__
__attribute__((pure))
double __ocml_asinh_f64(double);
__device__
__attribute__((const))
double __ocml_atan2_f64(double, double);
__device__
__attribute__((const))
double __ocml_atan_f64(double);
__device__
__attribute__((pure))
double __ocml_atanh_f64(double);
__device__
__attribute__((pure))
double __ocml_cbrt_f64(double);
__device__
__attribute__((const))
double __ocml_ceil_f64(double);
__device__
__attribute__((const))
double __ocml_copysign_f64(double, double);
__device__
double __ocml_cos_f64(double);
__device__
__attribute__((pure))
double __ocml_cosh_f64(double);
__device__
double __ocml_cospi_f64(double);
__device__
double __ocml_i0_f64(double);
__device__
double __ocml_i1_f64(double);
__device__
__attribute__((pure))
double __ocml_erfc_f64(double);
__device__
__attribute__((pure))
double __ocml_erfcinv_f64(double);
__device__
__attribute__((pure))
double __ocml_erfcx_f64(double);
__device__
__attribute__((pure))
double __ocml_erf_f64(double);
__device__
__attribute__((pure))
double __ocml_erfinv_f64(double);
__device__
__attribute__((pure))
double __ocml_exp10_f64(double);
__device__
__attribute__((pure))
double __ocml_exp2_f64(double);
__device__
__attribute__((pure))
double __ocml_exp_f64(double);
__device__
__attribute__((pure))
double __ocml_expm1_f64(double);
__device__
__attribute__((const))
double __ocml_fabs_f64(double);
__device__
__attribute__((const))
double __ocml_fdim_f64(double, double);
__device__
__attribute__((const))
double __ocml_floor_f64(double);
__device__
__attribute__((const))
double __ocml_fma_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fmax_f64(double, double);
__device__
__attribute__((const))
double __ocml_fmin_f64(double, double);
__device__
__attribute__((const))
double __ocml_fmod_f64(double, double);
__device__
double __ocml_frexp_f64(double, __attribute__((address_space(5))) int*);
__device__
__attribute__((const))
double __ocml_hypot_f64(double, double);
__device__
__attribute__((const))
int __ocml_ilogb_f64(double);
__device__
__attribute__((const))
int __ocml_isfinite_f64(double);
__device__
__attribute__((const))
int __ocml_isinf_f64(double);
__device__
__attribute__((const))
int __ocml_isnan_f64(double);
__device__
double __ocml_j0_f64(double);
__device__
double __ocml_j1_f64(double);
__device__
__attribute__((const))
double __ocml_ldexp_f64(double, int);
__device__
double __ocml_lgamma_f64(double);
__device__
__attribute__((pure))
double __ocml_log10_f64(double);
__device__
__attribute__((pure))
double __ocml_log1p_f64(double);
__device__
__attribute__((pure))
double __ocml_log2_f64(double);
__device__
__attribute__((const))
double __ocml_logb_f64(double);
__device__
__attribute__((pure))
double __ocml_log_f64(double);
__device__
double __ocml_modf_f64(double, __attribute__((address_space(5))) double*);
__device__
__attribute__((const))
double __ocml_nearbyint_f64(double);
__device__
__attribute__((const))
double __ocml_nextafter_f64(double, double);
__device__
__attribute__((const))
double __ocml_len3_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_len4_f64(double, double, double, double);
__device__
__attribute__((pure))
double __ocml_ncdf_f64(double);
__device__
__attribute__((pure))
double __ocml_ncdfinv_f64(double);
__device__
__attribute__((pure))
double __ocml_pow_f64(double, double);
__device__
__attribute__((pure))
double __ocml_pown_f64(double, int);
__device__
__attribute__((pure))
double __ocml_rcbrt_f64(double);
__device__
__attribute__((const))
double __ocml_remainder_f64(double, double);
__device__
double __ocml_remquo_f64(
double, double, __attribute__((address_space(5))) int*);
__device__
__attribute__((const))
double __ocml_rhypot_f64(double, double);
__device__
__attribute__((const))
double __ocml_rint_f64(double);
__device__
__attribute__((const))
double __ocml_rlen3_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_rlen4_f64(double, double, double, double);
__device__
__attribute__((const))
double __ocml_round_f64(double);
__device__
__attribute__((pure))
double __ocml_rsqrt_f64(double);
__device__
__attribute__((const))
double __ocml_scalb_f64(double, double);
__device__
__attribute__((const))
double __ocml_scalbn_f64(double, int);
__device__
__attribute__((const))
int __ocml_signbit_f64(double);
__device__
double __ocml_sincos_f64(double, __attribute__((address_space(5))) double*);
__device__
double __ocml_sincospi_f64(double, __attribute__((address_space(5))) double*);
__device__
double __ocml_sin_f64(double);
__device__
__attribute__((pure))
double __ocml_sinh_f64(double);
__device__
double __ocml_sinpi_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_f64(double);
__device__
double __ocml_tan_f64(double);
__device__
__attribute__((pure))
double __ocml_tanh_f64(double);
__device__
double __ocml_tgamma_f64(double);
__device__
__attribute__((const))
double __ocml_trunc_f64(double);
__device__
double __ocml_y0_f64(double);
__device__
double __ocml_y1_f64(double);
// BEGIN INTRINSICS
__device__
__attribute__((const))
double __ocml_add_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_add_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_add_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_add_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_sub_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_mul_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rte_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rtn_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rtp_f64(double, double);
__device__
__attribute__((const))
double __ocml_div_rtz_f64(double, double);
__device__
__attribute__((const))
double __ocml_sqrt_rte_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_rtn_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_rtp_f64(double);
__device__
__attribute__((const))
double __ocml_sqrt_rtz_f64(double);
__device__
__attribute__((const))
double __ocml_fma_rte_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fma_rtn_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fma_rtp_f64(double, double, double);
__device__
__attribute__((const))
double __ocml_fma_rtz_f64(double, double, double);
// END INTRINSICS
// END DOUBLE
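// Illustrative sketch (not part of the original header, hence disabled): the
// explicitly rounded intrinsics above support interval arithmetic, since
// rounding the same sum toward -inf and toward +inf brackets the exact
// result. The function name interval_add_f64 is hypothetical.
#if 0
__device__
void interval_add_f64(double a, double b, double* lo, double* hi) {
  *lo = __ocml_add_rtn_f64(a, b); // round toward negative infinity
  *hi = __ocml_add_rtp_f64(a, b); // round toward positive infinity
}
#endif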
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
#if defined(__cplusplus)
} // extern "C"
#endif
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
/*
Half Math Functions
*/
#if !defined(__HIPCC_RTC__)
#include "host_defines.h"
#endif
#ifndef __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
extern "C"
{
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const))
_Float16 __ocml_fma_f16(_Float16, _Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
__device__ _Float16 __ocml_sin_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
typedef short __2i16 __attribute__((ext_vector_type(2)));
#if defined(__clang__) && defined(__HIP__)
__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b, float c, bool s);
#endif
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
__device__ __2f16 __ocml_cos_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
__device__ __2f16 __ocml_sin_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
}
#endif // !__CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__
// TODO: remove these after they get into the clang header __clang_hip_libdevice_declares.h
extern "C" {
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
}
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
#define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
#if defined(__HIPCC_RTC__)
#define __HOST_DEVICE__ __device__
#else
#define __HOST_DEVICE__ __host__ __device__
#include <hip/amd_detail/amd_hip_common.h>
#include "hip/amd_detail/host_defines.h"
#include <assert.h>
#if defined(__cplusplus)
#include <algorithm>
#include <type_traits>
#include <utility>
#endif
#endif // !defined(__HIPCC_RTC__)
#if defined(__clang__) && defined(__HIP__)
typedef _Float16 _Float16_2 __attribute__((ext_vector_type(2)));
struct __half_raw {
union {
static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
_Float16 data;
unsigned short x;
};
};
struct __half2_raw {
union {
static_assert(sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
struct {
__half_raw x;
__half_raw y;
};
_Float16_2 data;
};
};
#if defined(__cplusplus)
#if !defined(__HIPCC_RTC__)
#include "hip_fp16_math_fwd.h"
#include "amd_hip_vector_types.h"
#include "host_defines.h"
#include "amd_device_functions.h"
#include "amd_warp_functions.h"
#endif
namespace std
{
template<> struct is_floating_point<_Float16> : std::true_type {};
}
template<bool cond, typename T = void>
using Enable_if_t = typename std::enable_if<cond, T>::type;
// BEGIN STRUCT __HALF
struct __half {
protected:
union {
static_assert(sizeof(_Float16) == sizeof(unsigned short), "");
_Float16 data;
unsigned short __x;
};
public:
// CREATORS
__HOST_DEVICE__
__half() = default;
__HOST_DEVICE__
__half(const __half_raw& x) : data{x.data} {}
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
__HOST_DEVICE__
__half(decltype(data) x) : data{x} {}
template<
typename T,
Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
__HOST_DEVICE__
__half(T x) : data{static_cast<_Float16>(x)} {}
#endif
__HOST_DEVICE__
__half(const __half&) = default;
__HOST_DEVICE__
__half(__half&&) = default;
__HOST_DEVICE__
~__half() = default;
// CREATORS - DEVICE ONLY
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
template<
typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
__HOST_DEVICE__
__half(T x) : data{static_cast<_Float16>(x)} {}
#endif
// MANIPULATORS
__HOST_DEVICE__
__half& operator=(const __half&) = default;
__HOST_DEVICE__
__half& operator=(__half&&) = default;
__HOST_DEVICE__
__half& operator=(const __half_raw& x)
{
data = x.data;
return *this;
}
__HOST_DEVICE__
volatile __half& operator=(const __half_raw& x) volatile
{
data = x.data;
return *this;
}
volatile __half& operator=(const volatile __half_raw& x) volatile
{
data = x.data;
return *this;
}
__half& operator=(__half_raw&& x)
{
data = x.data;
return *this;
}
volatile __half& operator=(__half_raw&& x) volatile
{
data = x.data;
return *this;
}
volatile __half& operator=(volatile __half_raw&& x) volatile
{
data = x.data;
return *this;
}
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
template<
typename T,
Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
__HOST_DEVICE__
__half& operator=(T x)
{
data = static_cast<_Float16>(x);
return *this;
}
#endif
// MANIPULATORS - DEVICE ONLY
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
template<
typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
__device__
__half& operator=(T x)
{
data = static_cast<_Float16>(x);
return *this;
}
#endif
#if !defined(__HIP_NO_HALF_OPERATORS__)
__device__
__half& operator+=(const __half& x)
{
data += x.data;
return *this;
}
__device__
__half& operator-=(const __half& x)
{
data -= x.data;
return *this;
}
__device__
__half& operator*=(const __half& x)
{
data *= x.data;
return *this;
}
__device__
__half& operator/=(const __half& x)
{
data /= x.data;
return *this;
}
__device__
__half& operator++() { ++data; return *this; }
__device__
__half operator++(int)
{
__half tmp{*this};
++*this;
return tmp;
}
__device__
__half& operator--() { --data; return *this; }
__device__
__half operator--(int)
{
__half tmp{*this};
--*this;
return tmp;
}
#endif
// ACCESSORS
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
template<
typename T,
Enable_if_t<std::is_floating_point<T>{}>* = nullptr>
__HOST_DEVICE__
operator T() const { return data; }
#endif
__HOST_DEVICE__
operator __half_raw() const { return __half_raw{data}; }
__HOST_DEVICE__
operator __half_raw() const volatile
{
return __half_raw{data};
}
#if !defined(__HIP_NO_HALF_CONVERSIONS__)
template<
typename T, Enable_if_t<std::is_integral<T>{}>* = nullptr>
__HOST_DEVICE__
operator T() const { return data; }
#endif
#if !defined(__HIP_NO_HALF_OPERATORS__)
__device__
__half operator+() const { return *this; }
__device__
__half operator-() const
{
__half tmp{*this};
tmp.data = -tmp.data;
return tmp;
}
#endif
// FRIENDS
#if !defined(__HIP_NO_HALF_OPERATORS__)
friend
inline
__device__
__half operator+(const __half& x, const __half& y)
{
return __half{x} += y;
}
friend
inline
__device__
__half operator-(const __half& x, const __half& y)
{
return __half{x} -= y;
}
friend
inline
__device__
__half operator*(const __half& x, const __half& y)
{
return __half{x} *= y;
}
friend
inline
__device__
__half operator/(const __half& x, const __half& y)
{
return __half{x} /= y;
}
friend
inline
__device__
bool operator==(const __half& x, const __half& y)
{
return x.data == y.data;
}
friend
inline
__device__
bool operator!=(const __half& x, const __half& y)
{
return !(x == y);
}
friend
inline
__device__
bool operator<(const __half& x, const __half& y)
{
return x.data < y.data;
}
friend
inline
__device__
bool operator>(const __half& x, const __half& y)
{
return y.data < x.data;
}
friend
inline
__device__
bool operator<=(const __half& x, const __half& y)
{
return !(y < x);
}
friend
inline
__device__
bool operator>=(const __half& x, const __half& y)
{
return !(x < y);
}
#endif // !defined(__HIP_NO_HALF_OPERATORS__)
};
// END STRUCT __HALF
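// Illustrative sketch (not part of the original header, hence disabled): with
// __HIP_NO_HALF_OPERATORS__ and __HIP_NO_HALF_CONVERSIONS__ left undefined,
// __half supports natural arithmetic through the operators above. The name
// half_fma_demo is hypothetical.
#if 0
__device__
__half half_fma_demo(__half a, __half x, __half y) {
  __half r = a * x + y; // friend operator* and operator+ from above
  r += __half{1.0f};    // operator+= plus the floating-point constructor
  return r;
}
#endif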
// BEGIN STRUCT __HALF2
struct __half2 {
public:
union {
static_assert(
sizeof(_Float16_2) == sizeof(unsigned short[2]), "");
struct {
__half x;
__half y;
};
_Float16_2 data;
};
// CREATORS
__HOST_DEVICE__
__half2() = default;
__HOST_DEVICE__
__half2(const __half2_raw& xx) : data{xx.data} {}
__HOST_DEVICE__
__half2(decltype(data) xx) : data{xx} {}
__HOST_DEVICE__
__half2(const __half& xx, const __half& yy)
:
data{static_cast<__half_raw>(xx).data,
static_cast<__half_raw>(yy).data}
{}
__HOST_DEVICE__
__half2(const __half2&) = default;
__HOST_DEVICE__
__half2(__half2&&) = default;
__HOST_DEVICE__
~__half2() = default;
// MANIPULATORS
__HOST_DEVICE__
__half2& operator=(const __half2&) = default;
__HOST_DEVICE__
__half2& operator=(__half2&&) = default;
__HOST_DEVICE__
__half2& operator=(const __half2_raw& xx)
{
data = xx.data;
return *this;
}
// MANIPULATORS - DEVICE ONLY
#if !defined(__HIP_NO_HALF_OPERATORS__)
__device__
__half2& operator+=(const __half2& xx)
{
data += xx.data;
return *this;
}
__device__
__half2& operator-=(const __half2& xx)
{
data -= xx.data;
return *this;
}
__device__
__half2& operator*=(const __half2& xx)
{
data *= xx.data;
return *this;
}
__device__
__half2& operator/=(const __half2& xx)
{
data /= xx.data;
return *this;
}
__device__
__half2& operator++() { return *this += _Float16_2{1, 1}; }
__device__
__half2 operator++(int)
{
__half2 tmp{*this};
++*this;
return tmp;
}
__device__
__half2& operator--() { return *this -= _Float16_2{1, 1}; }
__device__
__half2 operator--(int)
{
__half2 tmp{*this};
--*this;
return tmp;
}
#endif
// ACCESSORS
__HOST_DEVICE__
operator decltype(data)() const { return data; }
__HOST_DEVICE__
operator __half2_raw() const {
__half2_raw r;
r.data = data;
return r;
}
// ACCESSORS - DEVICE ONLY
#if !defined(__HIP_NO_HALF_OPERATORS__)
__device__
__half2 operator+() const { return *this; }
__device__
__half2 operator-() const
{
__half2 tmp{*this};
tmp.data = -tmp.data;
return tmp;
}
#endif
// FRIENDS
#if !defined(__HIP_NO_HALF_OPERATORS__)
friend
inline
__device__
__half2 operator+(const __half2& xx, const __half2& yy)
{
return __half2{xx} += yy;
}
friend
inline
__device__
__half2 operator-(const __half2& xx, const __half2& yy)
{
return __half2{xx} -= yy;
}
friend
inline
__device__
__half2 operator*(const __half2& xx, const __half2& yy)
{
return __half2{xx} *= yy;
}
friend
inline
__device__
__half2 operator/(const __half2& xx, const __half2& yy)
{
return __half2{xx} /= yy;
}
friend
inline
__device__
bool operator==(const __half2& xx, const __half2& yy)
{
auto r = xx.data == yy.data;
return r.x != 0 && r.y != 0;
}
friend
inline
__device__
bool operator!=(const __half2& xx, const __half2& yy)
{
return !(xx == yy);
}
friend
inline
__device__
bool operator<(const __half2& xx, const __half2& yy)
{
auto r = xx.data < yy.data;
return r.x != 0 && r.y != 0;
}
friend
inline
__device__
bool operator>(const __half2& xx, const __half2& yy)
{
return yy < xx;
}
friend
inline
__device__
bool operator<=(const __half2& xx, const __half2& yy)
{
return !(yy < xx);
}
friend
inline
__device__
bool operator>=(const __half2& xx, const __half2& yy)
{
return !(xx < yy);
}
#endif // !defined(__HIP_NO_HALF_OPERATORS__)
};
// END STRUCT __HALF2
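// Illustrative sketch (not part of the original header, hence disabled): the
// __half2 operators above act lane-wise on the packed _Float16_2 payload, so
// two half values are processed per operation. axpy2_demo is hypothetical.
#if 0
__device__
__half2 axpy2_demo(__half2 a, __half2 x, __half2 y) {
  return a * x + y; // both lanes computed at once
}
#endif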
namespace
{
inline
__HOST_DEVICE__
__half2 make_half2(__half x, __half y)
{
return __half2{x, y};
}
inline
__HOST_DEVICE__
__half __low2half(__half2 x)
{
return __half{__half_raw{static_cast<__half2_raw>(x).data.x}};
}
inline
__HOST_DEVICE__
__half __high2half(__half2 x)
{
return __half{__half_raw{static_cast<__half2_raw>(x).data.y}};
}
inline
__HOST_DEVICE__
__half2 __half2half2(__half x)
{
return __half2{x, x};
}
inline
__HOST_DEVICE__
__half2 __halves2half2(__half x, __half y)
{
return __half2{x, y};
}
inline
__HOST_DEVICE__
__half2 __low2half2(__half2 x)
{
return __half2{
_Float16_2{
static_cast<__half2_raw>(x).data.x,
static_cast<__half2_raw>(x).data.x}};
}
inline
__HOST_DEVICE__
__half2 __high2half2(__half2 x)
{
return __half2{
_Float16_2{
static_cast<__half2_raw>(x).data.y,
static_cast<__half2_raw>(x).data.y}};
}
inline
__HOST_DEVICE__
__half2 __lows2half2(__half2 x, __half2 y)
{
return __half2{
_Float16_2{
static_cast<__half2_raw>(x).data.x,
static_cast<__half2_raw>(y).data.x}};
}
inline
__HOST_DEVICE__
__half2 __highs2half2(__half2 x, __half2 y)
{
return __half2{
_Float16_2{
static_cast<__half2_raw>(x).data.y,
static_cast<__half2_raw>(y).data.y}};
}
inline
__HOST_DEVICE__
__half2 __lowhigh2highlow(__half2 x)
{
return __half2{
_Float16_2{
static_cast<__half2_raw>(x).data.y,
static_cast<__half2_raw>(x).data.x}};
}
// Bitcasts
inline
__device__
short __half_as_short(__half x)
{
return static_cast<__half_raw>(x).x;
}
inline
__device__
unsigned short __half_as_ushort(__half x)
{
return static_cast<__half_raw>(x).x;
}
inline
__device__
__half __short_as_half(short x)
{
__half_raw r; r.x = x;
return r;
}
inline
__device__
__half __ushort_as_half(unsigned short x)
{
__half_raw r; r.x = x;
return r;
}
// float -> half | half2
inline
__HOST_DEVICE__
__half __float2half(float x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__HOST_DEVICE__
__half __float2half_rn(float x)
{
return __half_raw{static_cast<_Float16>(x)};
}
#if !defined(__HIPCC_RTC__)
// TODO: rounding behaviour is not correct for host functions.
inline
__host__
__half __float2half_rz(float x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__host__
__half __float2half_rd(float x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__host__
__half __float2half_ru(float x)
{
return __half_raw{static_cast<_Float16>(x)};
}
#endif
inline
__device__
__half __float2half_rz(float x)
{
return __half_raw{__ocml_cvtrtz_f16_f32(x)};
}
inline
__device__
__half __float2half_rd(float x)
{
return __half_raw{__ocml_cvtrtn_f16_f32(x)};
}
inline
__device__
__half __float2half_ru(float x)
{
return __half_raw{__ocml_cvtrtp_f16_f32(x)};
}
inline
__HOST_DEVICE__
__half2 __float2half2_rn(float x)
{
return __half2{
_Float16_2{
static_cast<_Float16>(x), static_cast<_Float16>(x)}};
}
inline
__HOST_DEVICE__
__half2 __floats2half2_rn(float x, float y)
{
return __half2{_Float16_2{
static_cast<_Float16>(x), static_cast<_Float16>(y)}};
}
inline
__HOST_DEVICE__
__half2 __float22half2_rn(float2 x)
{
return __floats2half2_rn(x.x, x.y);
}
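// Illustrative sketch (not part of the original header, hence disabled): on
// the device, the directed-rounding conversions above yield the two half
// values that bracket a float, which is useful when a conservative bound is
// needed. bracket_demo is hypothetical.
#if 0
__device__
void bracket_demo(float x, __half* lo, __half* hi) {
  *lo = __float2half_rd(x); // round toward -inf
  *hi = __float2half_ru(x); // round toward +inf
}
#endif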
// half | half2 -> float
inline
__HOST_DEVICE__
float __half2float(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__HOST_DEVICE__
float __low2float(__half2 x)
{
return static_cast<__half2_raw>(x).data.x;
}
inline
__HOST_DEVICE__
float __high2float(__half2 x)
{
return static_cast<__half2_raw>(x).data.y;
}
inline
__HOST_DEVICE__
float2 __half22float2(__half2 x)
{
return make_float2(
static_cast<__half2_raw>(x).data.x,
static_cast<__half2_raw>(x).data.y);
}
// half -> int
inline
__device__
int __half2int_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
int __half2int_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
int __half2int_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
int __half2int_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// int -> half
inline
__device__
__half __int2half_rn(int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __int2half_rz(int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __int2half_rd(int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __int2half_ru(int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// half -> short
inline
__device__
short __half2short_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
short __half2short_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
short __half2short_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
short __half2short_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// short -> half
inline
__device__
__half __short2half_rn(short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __short2half_rz(short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __short2half_rd(short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __short2half_ru(short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// half -> long long
inline
__device__
long long __half2ll_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
long long __half2ll_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
long long __half2ll_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
long long __half2ll_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// long long -> half
inline
__device__
__half __ll2half_rn(long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ll2half_rz(long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ll2half_rd(long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ll2half_ru(long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// half -> unsigned int
inline
__device__
unsigned int __half2uint_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned int __half2uint_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned int __half2uint_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned int __half2uint_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// unsigned int -> half
inline
__device__
__half __uint2half_rn(unsigned int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __uint2half_rz(unsigned int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __uint2half_rd(unsigned int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __uint2half_ru(unsigned int x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// half -> unsigned short
inline
__device__
unsigned short __half2ushort_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned short __half2ushort_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned short __half2ushort_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned short __half2ushort_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// unsigned short -> half
inline
__device__
__half __ushort2half_rn(unsigned short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ushort2half_rz(unsigned short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ushort2half_rd(unsigned short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ushort2half_ru(unsigned short x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// half -> unsigned long long
inline
__device__
unsigned long long __half2ull_rn(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned long long __half2ull_rz(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned long long __half2ull_rd(__half x)
{
return static_cast<__half_raw>(x).data;
}
inline
__device__
unsigned long long __half2ull_ru(__half x)
{
return static_cast<__half_raw>(x).data;
}
// unsigned long long -> half
inline
__device__
__half __ull2half_rn(unsigned long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ull2half_rz(unsigned long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ull2half_rd(unsigned long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
inline
__device__
__half __ull2half_ru(unsigned long long x)
{
return __half_raw{static_cast<_Float16>(x)};
}
// Load primitives
inline
__device__
__half __ldg(const __half* ptr) { return *ptr; }
inline
__device__
__half __ldcg(const __half* ptr) { return *ptr; }
inline
__device__
__half __ldca(const __half* ptr) { return *ptr; }
inline
__device__
__half __ldcs(const __half* ptr) { return *ptr; }
inline
__HOST_DEVICE__
__half2 __ldg(const __half2* ptr) { return *ptr; }
inline
__HOST_DEVICE__
__half2 __ldcg(const __half2* ptr) { return *ptr; }
inline
__HOST_DEVICE__
__half2 __ldca(const __half2* ptr) { return *ptr; }
inline
__HOST_DEVICE__
__half2 __ldcs(const __half2* ptr) { return *ptr; }
// Relations
inline
__device__
bool __heq(__half x, __half y)
{
return static_cast<__half_raw>(x).data ==
static_cast<__half_raw>(y).data;
}
inline
__device__
bool __hne(__half x, __half y)
{
return static_cast<__half_raw>(x).data !=
static_cast<__half_raw>(y).data;
}
inline
__device__
bool __hle(__half x, __half y)
{
return static_cast<__half_raw>(x).data <=
static_cast<__half_raw>(y).data;
}
inline
__device__
bool __hge(__half x, __half y)
{
return static_cast<__half_raw>(x).data >=
static_cast<__half_raw>(y).data;
}
inline
__device__
bool __hlt(__half x, __half y)
{
return static_cast<__half_raw>(x).data <
static_cast<__half_raw>(y).data;
}
inline
__device__
bool __hgt(__half x, __half y)
{
return static_cast<__half_raw>(x).data >
static_cast<__half_raw>(y).data;
}
inline __device__
bool __hequ(__half x, __half y) {
return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data) &&
!(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
}
inline __device__
bool __hneu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data == static_cast<__half_raw>(y).data);
}
inline __device__
bool __hleu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
}
inline
__device__
bool __hgeu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data);
}
inline
__device__
bool __hltu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data >= static_cast<__half_raw>(y).data);
}
inline
__device__
bool __hgtu(__half x, __half y) {
return !(static_cast<__half_raw>(x).data <= static_cast<__half_raw>(y).data);
}
inline
__HOST_DEVICE__
__half2 __heq2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data ==
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hne2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data !=
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hle2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data <=
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hge2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data >=
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hlt2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data <
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hgt2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(x).data >
static_cast<__half2_raw>(y).data;
return __builtin_convertvector(-r, _Float16_2);
}
inline __HOST_DEVICE__
__half2 __hequ2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data) &&
!(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hneu2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data == static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hleu2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hgeu2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hltu2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data >= static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
inline
__HOST_DEVICE__
__half2 __hgtu2(__half2 x, __half2 y) {
auto r = !(static_cast<__half2_raw>(x).data <= static_cast<__half2_raw>(y).data);
return __builtin_convertvector(-r, _Float16_2);
}
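// Illustrative sketch (not part of the original header, hence disabled): the
// vector comparisons above yield a 0.0/1.0 mask per lane (clang vector
// comparisons produce 0/-1, which the negated __builtin_convertvector maps to
// half), so they compose into branch-free selection. select_max_demo is
// hypothetical.
#if 0
__device__
__half2 select_max_demo(__half2 x, __half2 y) {
  __half2 m = __hge2(x, y); // 1.0 where x >= y, else 0.0
  return m * x + (__float2half2_rn(1.0f) - m) * y; // lane-wise max via mask
}
#endif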
inline
__HOST_DEVICE__
bool __hbeq2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__heq2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hbne2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hne2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hble2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hle2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hbge2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hge2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hblt2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hlt2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hbgt2(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hgt2(x, y));
return r.data.x != 0 && r.data.y != 0;
}
inline
__HOST_DEVICE__
bool __hbequ2(__half2 x, __half2 y) { return __hbeq2(x, y); }
inline
__HOST_DEVICE__
bool __hbneu2(__half2 x, __half2 y) { return __hbne2(x, y); }
inline
__HOST_DEVICE__
bool __hbleu2(__half2 x, __half2 y) { return __hble2(x, y); }
inline
__HOST_DEVICE__
bool __hbgeu2(__half2 x, __half2 y) { return __hbge2(x, y); }
inline
__HOST_DEVICE__
bool __hbltu2(__half2 x, __half2 y) { return __hblt2(x, y); }
inline
__HOST_DEVICE__
bool __hbgtu2(__half2 x, __half2 y) { return __hbgt2(x, y); }
inline
__device__
__half __hmax(const __half x, const __half y) {
return __half_raw{__ocml_fmax_f16(static_cast<__half_raw>(x).data,
static_cast<__half_raw>(y).data)};
}
inline
__device__
__half __hmax_nan(const __half x, const __half y) {
if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) {
return x;
} else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) {
return y;
}
return __hmax(x, y);
}
inline
__device__
__half __hmin(const __half x, const __half y) {
return __half_raw{__ocml_fmin_f16(static_cast<__half_raw>(x).data,
static_cast<__half_raw>(y).data)};
}
inline
__device__
__half __hmin_nan(const __half x, const __half y) {
if(__ocml_isnan_f16(static_cast<__half_raw>(x).data)) {
return x;
} else if (__ocml_isnan_f16(static_cast<__half_raw>(y).data)) {
return y;
}
return __hmin(x, y);
}
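// Illustrative note (not part of the original header, hence disabled):
// __hmax/__hmin follow fmax/fmin and ignore a single NaN operand, while the
// _nan variants above propagate it. nan_demo is hypothetical.
#if 0
__device__
void nan_demo(__half nan_h, __half one_h) {
  __half a = __hmax(nan_h, one_h);     // one_h: the NaN operand is ignored
  __half b = __hmax_nan(nan_h, one_h); // NaN: the _nan variant propagates it
  (void)a; (void)b;
}
#endif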
// Arithmetic
inline
__device__
__half __clamp_01(__half x)
{
auto r = static_cast<__half_raw>(x);
if (__hlt(x, __half_raw{0})) return __half_raw{0};
if (__hlt(__half_raw{1}, x)) return __half_raw{1};
return r;
}
inline
__device__
__half __hadd(__half x, __half y)
{
return __half_raw{
static_cast<__half_raw>(x).data +
static_cast<__half_raw>(y).data};
}
inline
__device__
__half __habs(__half x)
{
return __half_raw{
__ocml_fabs_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half __hsub(__half x, __half y)
{
return __half_raw{
static_cast<__half_raw>(x).data -
static_cast<__half_raw>(y).data};
}
inline
__device__
__half __hmul(__half x, __half y)
{
return __half_raw{
static_cast<__half_raw>(x).data *
static_cast<__half_raw>(y).data};
}
inline
__device__
__half __hadd_sat(__half x, __half y)
{
return __clamp_01(__hadd(x, y));
}
inline
__device__
__half __hsub_sat(__half x, __half y)
{
return __clamp_01(__hsub(x, y));
}
inline
__device__
__half __hmul_sat(__half x, __half y)
{
return __clamp_01(__hmul(x, y));
}
inline
__device__
__half __hfma(__half x, __half y, __half z)
{
return __half_raw{__ocml_fma_f16(
static_cast<__half_raw>(x).data,
static_cast<__half_raw>(y).data,
static_cast<__half_raw>(z).data)};
}
inline
__device__
__half __hfma_sat(__half x, __half y, __half z)
{
return __clamp_01(__hfma(x, y, z));
}
inline
__device__
__half __hdiv(__half x, __half y)
{
return __half_raw{
static_cast<__half_raw>(x).data /
static_cast<__half_raw>(y).data};
}
inline
__HOST_DEVICE__
__half2 __hadd2(__half2 x, __half2 y)
{
return __half2{
static_cast<__half2_raw>(x).data +
static_cast<__half2_raw>(y).data};
}
inline
__HOST_DEVICE__
__half2 __habs2(__half2 x)
{
return __half2{
__ocml_fabs_2f16(static_cast<__half2_raw>(x).data)};
}
inline
__HOST_DEVICE__
__half2 __hsub2(__half2 x, __half2 y)
{
return __half2{
static_cast<__half2_raw>(x).data -
static_cast<__half2_raw>(y).data};
}
inline
__HOST_DEVICE__
__half2 __hmul2(__half2 x, __half2 y)
{
return __half2{
static_cast<__half2_raw>(x).data *
static_cast<__half2_raw>(y).data};
}
inline
__HOST_DEVICE__
__half2 __hadd2_sat(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hadd2(x, y));
return __half2{
__clamp_01(__half_raw{r.data.x}),
__clamp_01(__half_raw{r.data.y})};
}
inline
__HOST_DEVICE__
__half2 __hsub2_sat(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hsub2(x, y));
return __half2{
__clamp_01(__half_raw{r.data.x}),
__clamp_01(__half_raw{r.data.y})};
}
inline
__HOST_DEVICE__
__half2 __hmul2_sat(__half2 x, __half2 y)
{
auto r = static_cast<__half2_raw>(__hmul2(x, y));
return __half2{
__clamp_01(__half_raw{r.data.x}),
__clamp_01(__half_raw{r.data.y})};
}
inline
__HOST_DEVICE__
__half2 __hfma2(__half2 x, __half2 y, __half2 z)
{
return __half2{__ocml_fma_2f16(x, y, z)};
}
inline
__HOST_DEVICE__
__half2 __hfma2_sat(__half2 x, __half2 y, __half2 z)
{
auto r = static_cast<__half2_raw>(__hfma2(x, y, z));
return __half2{
__clamp_01(__half_raw{r.data.x}),
__clamp_01(__half_raw{r.data.y})};
}
inline
__HOST_DEVICE__
__half2 __h2div(__half2 x, __half2 y)
{
return __half2{
static_cast<__half2_raw>(x).data /
static_cast<__half2_raw>(y).data};
}
// Math functions
#if defined(__clang__) && defined(__HIP__)
inline
__device__
float amd_mixed_dot(__half2 a, __half2 b, float c, bool saturate) {
return __ockl_fdot2(static_cast<__half2_raw>(a).data,
static_cast<__half2_raw>(b).data,
c, saturate);
}
#endif
inline
__device__
__half htrunc(__half x)
{
return __half_raw{
__ocml_trunc_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hceil(__half x)
{
return __half_raw{
__ocml_ceil_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hfloor(__half x)
{
return __half_raw{
__ocml_floor_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hrint(__half x)
{
return __half_raw{
__ocml_rint_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hsin(__half x)
{
return __half_raw{
__ocml_sin_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hcos(__half x)
{
return __half_raw{
__ocml_cos_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hexp(__half x)
{
return __half_raw{
__ocml_exp_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hexp2(__half x)
{
return __half_raw{
__ocml_exp2_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hexp10(__half x)
{
return __half_raw{
__ocml_exp10_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hlog2(__half x)
{
return __half_raw{
__ocml_log2_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hlog(__half x)
{
return __half_raw{
__ocml_log_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hlog10(__half x)
{
return __half_raw{
__ocml_log10_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hrcp(__half x)
{
return __half_raw{
static_cast<_Float16>(1.0f) /static_cast<__half_raw>(x).data};
}
inline
__device__
__half hrsqrt(__half x)
{
return __half_raw{
__ocml_rsqrt_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
__half hsqrt(__half x)
{
return __half_raw{
__ocml_sqrt_f16(static_cast<__half_raw>(x).data)};
}
inline
__device__
bool __hisinf(__half x)
{
return __ocml_isinf_f16(static_cast<__half_raw>(x).data);
}
inline
__device__
bool __hisnan(__half x)
{
return __ocml_isnan_f16(static_cast<__half_raw>(x).data);
}
inline
__device__
__half __hneg(__half x)
{
return __half_raw{-static_cast<__half_raw>(x).data};
}
inline
__HOST_DEVICE__
__half2 h2trunc(__half2 x)
{
return __half2{__ocml_trunc_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2ceil(__half2 x)
{
return __half2{__ocml_ceil_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2floor(__half2 x)
{
return __half2{__ocml_floor_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2rint(__half2 x)
{
return __half2{__ocml_rint_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2sin(__half2 x)
{
return __half2{__ocml_sin_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2cos(__half2 x)
{
return __half2{__ocml_cos_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2exp(__half2 x)
{
return __half2{__ocml_exp_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2exp2(__half2 x)
{
return __half2{__ocml_exp2_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2exp10(__half2 x)
{
return __half2{__ocml_exp10_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2log2(__half2 x)
{
return __half2{__ocml_log2_2f16(x)};
}
inline
__HOST_DEVICE__
__half2 h2log(__half2 x) { return __ocml_log_2f16(x); }
inline
__HOST_DEVICE__
__half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
inline
__HOST_DEVICE__
__half2 h2rcp(__half2 x) {
return _Float16_2{
_Float16_2{static_cast<_Float16>(1.0f), static_cast<_Float16>(1.0f)} / x.data};
}
inline
__HOST_DEVICE__
__half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
inline
__HOST_DEVICE__
__half2 h2sqrt(__half2 x) { return __ocml_sqrt_2f16(x); }
inline
__HOST_DEVICE__
__half2 __hisinf2(__half2 x)
{
auto r = __ocml_isinf_2f16(x);
return __half2{_Float16_2{
static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
}
inline
__HOST_DEVICE__
__half2 __hisnan2(__half2 x)
{
auto r = __ocml_isnan_2f16(x);
return __half2{_Float16_2{
static_cast<_Float16>(r.x), static_cast<_Float16>(r.y)}};
}
inline
__HOST_DEVICE__
__half2 __hneg2(__half2 x)
{
return __half2{-static_cast<__half2_raw>(x).data};
}
} // Anonymous namespace.
#if !defined(HIP_NO_HALF)
using half = __half;
using half2 = __half2;
#endif
__device__
inline
__half __shfl(__half var, int src_lane, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.h;
}
__device__
inline
__half2 __shfl(__half2 var, int src_lane, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl(tmp.i, src_lane, width);
return tmp.h;
}
__device__
inline
__half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl_up(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl_down(tmp.i, lane_delta, width);
return tmp.h;
}
__device__
inline
__half __shfl_xor(__half var, int lane_mask, int width = warpSize) {
union { int i; __half h; } tmp; tmp.h = var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return tmp.h;
}
__device__
inline
__half2 __shfl_xor(__half2 var, int lane_mask, int width = warpSize) {
union { int i; __half2 h; } tmp; tmp.h = var;
tmp.i = __shfl_xor(tmp.i, lane_mask, width);
return tmp.h;
}
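// Illustrative sketch (not part of the original header, hence disabled): the
// shuffle overloads above enable warp-level reductions on half data.
// warp_sum_demo is hypothetical.
#if 0
__device__
__half warp_sum_demo(__half v) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    v = __hadd(v, __shfl_down(v, offset)); // fold in the upper lanes
  return v;
}
#endif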
#endif // defined(__cplusplus)
#elif defined(__GNUC__)
#if !defined(__HIPCC_RTC__)
#include "hip_fp16_gcc.h"
#endif
#endif // !defined(__clang__) && defined(__GNUC__)
#endif // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP16_H
/*
Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#if !defined(__HIPCC_RTC__)
#include "hip_fp16_math_fwd.h"
#include "amd_hip_vector_types.h"
#include "math_fwd.h"
#include <hip/amd_detail/host_defines.h>
#include <algorithm>
// assert.h is only for the host version of assert.
// The device version of assert is implemented in hip/amd_detail/hip_runtime.h.
// Users should include hip_runtime.h for the device version of assert.
#if !__HIP_DEVICE_COMPILE__
#include <assert.h>
#endif
#include <limits.h>
#include <limits>
#include <stdint.h>
#endif // !defined(__HIPCC_RTC__)
#if _LIBCPP_VERSION && __HIP__
namespace std {
template <>
struct __numeric_type<_Float16>
{
static _Float16 __test(_Float16);
typedef _Float16 type;
static const bool value = true;
};
}
#endif // _LIBCPP_VERSION
#pragma push_macro("__DEVICE__")
#pragma push_macro("__RETURN_TYPE")
#define __DEVICE__ static __device__
#define __RETURN_TYPE bool
// DOT FUNCTIONS
#if __HIP_CLANG_ONLY__
__DEVICE__
inline
int amd_mixed_dot(short2 a, short2 b, int c, bool saturate) {
return __ockl_sdot2(a.data, b.data, c, saturate);
}
__DEVICE__
inline
uint amd_mixed_dot(ushort2 a, ushort2 b, uint c, bool saturate) {
return __ockl_udot2(a.data, b.data, c, saturate);
}
__DEVICE__
inline
int amd_mixed_dot(char4 a, char4 b, int c, bool saturate) {
return __ockl_sdot4(a.data, b.data, c, saturate);
}
__DEVICE__
inline
uint amd_mixed_dot(uchar4 a, uchar4 b, uint c, bool saturate) {
return __ockl_udot4(a.data, b.data, c, saturate);
}
__DEVICE__
inline
int amd_mixed_dot(int a, int b, int c, bool saturate) {
return __ockl_sdot8(a, b, c, saturate);
}
__DEVICE__
inline
uint amd_mixed_dot(uint a, uint b, uint c, bool saturate) {
return __ockl_udot8(a, b, c, saturate);
}
#endif
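// Illustrative sketch (not part of the original header, hence disabled; it
// assumes __HIP_CLANG_ONLY__): the amd_mixed_dot overloads above map to the
// packed dot-product instructions, e.g. a 4-wide int8 dot with 32-bit
// accumulate. dot_i8x4_demo is hypothetical.
#if 0
__device__
int dot_i8x4_demo(char4 a, char4 b, int acc) {
  return amd_mixed_dot(a, b, acc, /*saturate=*/false);
}
#endif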
#pragma pop_macro("__DEVICE__")
#pragma pop_macro("__RETURN_TYPE")
// For backward compatibility.
// There are HIP applications, e.g. TensorFlow, that expect the __HIP_ARCH_*
// macros to be defined after including math_functions.h.
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_runtime.h>
#endif