From 337f53ea082669539c6347e5efd236b70932dc09 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 15 Jan 2025 16:51:39 +0100 Subject: [PATCH] [HIPIFY][#1769][fp16][feature] Support for `fp16` math - Part 4 - final + Updated synthetic tests, the regenerated `hipify-perl`, and `Device` `CUDA2HIP` docs accordingly --- bin/hipify-perl | 13 +++-- .../CUDA_Device_API_supported_by_HIP.md | 10 ++-- src/CUDA2HIP_Device_functions.cpp | 12 +++-- src/CUDA2HIP_Device_types.cpp | 3 +- .../synthetic/libraries/cudevice2hipdevice.cu | 48 ++++++++++++++----- 5 files changed, 60 insertions(+), 26 deletions(-) diff --git a/bin/hipify-perl b/bin/hipify-perl index 703e1530..0d60569a 100755 --- a/bin/hipify-perl +++ b/bin/hipify-perl @@ -6237,6 +6237,7 @@ sub simpleSubstitutions { subst("__bfloat162bfloat162", "__bfloat162bfloat162", "device_function"); subst("__bfloat162float", "__bfloat162float", "device_function"); subst("__bfloat16_as_short", "__bfloat16_as_short", "device_function"); + subst("__bfloat16_as_ushort", "__bfloat16_as_ushort", "device_function"); subst("__brev", "__brev", "device_function"); subst("__brevll", "__brevll", "device_function"); subst("__byte_perm", "__byte_perm", "device_function"); @@ -6280,6 +6281,7 @@ sub simpleSubstitutions { subst("__fdividef", "__fdividef", "device_function"); subst("__ffs", "__ffs", "device_function"); subst("__ffsll", "__ffsll", "device_function"); + subst("__float22bfloat162_rn", "__float22bfloat162_rn", "device_function"); subst("__float22half2_rn", "__float22half2_rn", "device_function"); subst("__float2bfloat16", "__float2bfloat16", "device_function"); subst("__float2half", "__float2half", "device_function"); @@ -6495,6 +6497,7 @@ sub simpleSubstitutions { subst("__short2half_rn", "__short2half_rn", "device_function"); subst("__short2half_ru", "__short2half_ru", "device_function"); subst("__short2half_rz", "__short2half_rz", "device_function"); + subst("__short_as_bfloat16", "__short_as_bfloat16", "device_function"); subst("__short_as_half", "__short_as_half", "device_function"); subst("__sincosf", "__sincosf", "device_function"); subst("__sinf", "__sinf", "device_function"); @@ -6538,6 +6541,7 @@ sub simpleSubstitutions { subst("__ushort2half_rn", "__ushort2half_rn", "device_function"); subst("__ushort2half_ru", "__ushort2half_ru", "device_function"); subst("__ushort2half_rz", "__ushort2half_rz", "device_function"); + subst("__ushort_as_bfloat16", "__ushort_as_bfloat16", "device_function"); subst("__ushort_as_half", "__ushort_as_half", "device_function"); subst("abs", "abs", "device_function"); subst("acos", "acos", "device_function"); @@ -6786,6 +6790,7 @@ sub simpleSubstitutions { subst("__nv_fp8x4_e4m3", "__hip_fp8x4_e4m3_fnuz", "device_type"); subst("__nv_fp8x4_storage_t", "__hip_fp8x4_storage_t", "device_type"); subst("__nv_saturation_t", "__hip_saturation_t", "device_type"); + subst("nv_bfloat16", "hip_bfloat16", "device_type"); subst("caffe2\/core\/common_cudnn.h", "caffe2\/core\/hip\/common_miopen.h", "include"); subst("caffe2\/operators\/spatial_batch_norm_op.h", "caffe2\/operators\/hip\/spatial_batch_norm_op_miopen.hip", "include"); subst("channel_descriptor.h", "hip\/channel_descriptor.h", "include"); @@ -9279,6 +9284,7 @@ sub transformHostFunctions { "acos", "abs", "__ushort_as_half", + "__ushort_as_bfloat16", "__ushort2half_rz", "__ushort2half_ru", "__ushort2half_rn", @@ -9322,6 +9328,7 @@ sub transformHostFunctions { "__sinf", "__sincosf", "__short_as_half", + "__short_as_bfloat16", "__short2half_rz", "__short2half_ru", "__short2half_rn", @@ -9537,6 +9544,7 @@ sub transformHostFunctions { "__float2half", "__float2bfloat16", "__float22half2_rn", + "__float22bfloat162_rn", "__ffsll", "__ffs", "__fdividef", @@ -9580,6 +9588,7 @@ sub transformHostFunctions { "__byte_perm", "__brevll", "__brev", + "__bfloat16_as_ushort", "__bfloat16_as_short", "__bfloat162float", "__bfloat162bfloat162", @@ -9713,7 +9722,6 @@ sub countSupportedDeviceFunctions { "__vabsdiffs2", "__vabs4", "__vabs2", - "__ushort_as_bfloat16", "__ushort2bfloat16_rz", "__ushort2bfloat16_ru", "__ushort2bfloat16_rn", @@ -9734,7 +9742,6 @@ sub countSupportedDeviceFunctions { "__signbitl", "__signbitf", "__signbit", - "__short_as_bfloat16", "__short2bfloat16_rz", "__short2bfloat16_ru", "__short2bfloat16_rn", @@ -9809,7 +9816,6 @@ sub countSupportedDeviceFunctions { "__float2bfloat16_rn", "__float2bfloat16_rd", "__float2bfloat162_rn", - "__float22bfloat162_rn", "__finitel", "__finitef", "__finite", @@ -9839,7 +9845,6 @@ sub countSupportedDeviceFunctions { "__dadd_ru", "__dadd_rd", "__brkpt", - "__bfloat16_as_ushort", "__bfloat162ushort_rz", "__bfloat162ushort_ru", "__bfloat162ushort_rn", diff --git a/docs/tables/CUDA_Device_API_supported_by_HIP.md b/docs/tables/CUDA_Device_API_supported_by_HIP.md index 5e70cb82..27aa483b 100644 --- a/docs/tables/CUDA_Device_API_supported_by_HIP.md +++ b/docs/tables/CUDA_Device_API_supported_by_HIP.md @@ -48,7 +48,7 @@ |`__bfloat162ushort_ru`|11.0| | | | | | | | | | |`__bfloat162ushort_rz`|11.0| | | | | | | | | | |`__bfloat16_as_short`|11.0| | | |`__bfloat16_as_short`|5.7.0| | | | | -|`__bfloat16_as_ushort`|11.0| | | | | | | | | | +|`__bfloat16_as_ushort`|11.0| | | |`__bfloat16_as_ushort`|5.7.0| | | | | |`__brev`| | | | |`__brev`|1.6.0| | | | | |`__brevll`| | | | |`__brevll`|1.6.0| | | | | |`__brkpt`| | | | | | | | | | | @@ -121,7 +121,7 @@ |`__finite`| | | | | | | | | | | |`__finitef`| | | | | | | | | | | |`__finitel`| | | | | | | | | | | -|`__float22bfloat162_rn`|11.0| | | | | | | | | | +|`__float22bfloat162_rn`|11.0| | | |`__float22bfloat162_rn`|5.7.0| | | | | |`__float22half2_rn`| | | | |`__float22half2_rn`|1.6.0| | | | | |`__float2bfloat16`|11.0| | | |`__float2bfloat16`|5.7.0| | | | | |`__float2bfloat162_rn`|11.0| | | | | | | | | | @@ -411,7 +411,7 @@ |`__short2half_rn`| | | | |`__short2half_rn`|1.6.0| | | | | |`__short2half_ru`| | | | |`__short2half_ru`|1.6.0| | | | | |`__short2half_rz`| | | | |`__short2half_rz`|1.6.0| | | | | -|`__short_as_bfloat16`|11.0| | | | | | | | | | +|`__short_as_bfloat16`|11.0| | | |`__short_as_bfloat16`|5.7.0| | | | | |`__short_as_half`| | | | |`__short_as_half`|1.9.0| | | | | |`__signbit`| | | | | | | | | | | |`__signbitf`| | | | | | | | | | | @@ -475,7 +475,7 @@ |`__ushort2half_rn`| | | | |`__ushort2half_rn`|1.6.0| | | | | |`__ushort2half_ru`| | | | |`__ushort2half_ru`|1.6.0| | | | | |`__ushort2half_rz`| | | | |`__ushort2half_rz`|1.6.0| | | | | -|`__ushort_as_bfloat16`|11.0| | | | | | | | | | +|`__ushort_as_bfloat16`|11.0| | | |`__ushort_as_bfloat16`|5.7.0| | | | | |`__ushort_as_half`| | | | |`__ushort_as_half`|1.6.0| | | | | |`__vabs2`| | | | | | | | | | | |`__vabs4`| | | | | | | | | | | @@ -836,6 +836,6 @@ |`__nv_fp8x4_e5m2`|11.8| | | |`__hip_fp8x4_e5m2_fnuz`|6.2.0| | | | | |`__nv_fp8x4_storage_t`|11.8| | | |`__hip_fp8x4_storage_t`|6.2.0| | | | | |`__nv_saturation_t`|11.8| | | |`__hip_saturation_t`|6.2.0| | | | | -|`nv_bfloat16`|11.0| | | | | | | | | | +|`nv_bfloat16`|11.0| | | |`hip_bfloat16`|3.5.0| | | | | |`nv_bfloat162`|11.0| | | | | | | | | | diff --git a/src/CUDA2HIP_Device_functions.cpp b/src/CUDA2HIP_Device_functions.cpp index 5ed4739c..2742f405 100644 --- a/src/CUDA2HIP_Device_functions.cpp +++ b/src/CUDA2HIP_Device_functions.cpp @@ -781,10 +781,10 @@ const std::map CUDA_DEVICE_FUNCTION_MAP { {"__low2bfloat162", {"__low2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__high2bfloat162", {"__high2bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__bfloat16_as_short", {"__bfloat16_as_short", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, - {"__bfloat16_as_ushort", {"__bfloat16_as_ushort", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__short_as_bfloat16", {"__short_as_bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__ushort_as_bfloat16", {"__ushort_as_bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, - {"__float22bfloat162_rn", {"__float22bfloat162_rn", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, + {"__bfloat16_as_ushort", {"__bfloat16_as_ushort", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__short_as_bfloat16", {"__short_as_bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__ushort_as_bfloat16", {"__ushort_as_bfloat16", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, + {"__float22bfloat162_rn", {"__float22bfloat162_rn", "", CONV_DEVICE_FUNC, API_RUNTIME, 1}}, {"__bfloat162char_rz", {"__bfloat162char_rz", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"__bfloat162uchar_rz", {"__bfloat162uchar_rz", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, {"make_bfloat162", {"make_bfloat162", "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}}, @@ -1524,6 +1524,10 @@ const std::map HIP_DEVICE_FUNCTION_VER_MAP { {"__low2bfloat162", {HIP_5070, HIP_0, HIP_0 }}, {"__high2bfloat162", {HIP_5070, HIP_0, HIP_0 }}, {"__bfloat16_as_short", {HIP_5070, HIP_0, HIP_0 }}, + {"__bfloat16_as_ushort", {HIP_5070, HIP_0, HIP_0 }}, + {"__short_as_bfloat16", {HIP_5070, HIP_0, HIP_0 }}, + {"__ushort_as_bfloat16", {HIP_5070, HIP_0, HIP_0 }}, + {"__float22bfloat162_rn", {HIP_5070, HIP_0, HIP_0 }}, }; const std::map CUDA_DEVICE_FUNCTION_API_SECTION_MAP { diff --git a/src/CUDA2HIP_Device_types.cpp b/src/CUDA2HIP_Device_types.cpp index 27ebd93c..ed05493d 100644 --- a/src/CUDA2HIP_Device_types.cpp +++ b/src/CUDA2HIP_Device_types.cpp @@ -31,7 +31,7 @@ const std::map CUDA_DEVICE_TYPE_NAME_MAP { {"__half2_raw", {"__half2_raw", "", CONV_DEVICE_TYPE, API_RUNTIME, 2}}, // Bfloat16 Precision Device types {"__nv_bfloat16", {"__hip_bfloat16", "rocblas_bfloat16", CONV_DEVICE_TYPE, API_RUNTIME, 2}}, - {"nv_bfloat16", {"hip_bfloat16", "", CONV_DEVICE_TYPE, API_RUNTIME, 2, UNSUPPORTED}}, + {"nv_bfloat16", {"hip_bfloat16", "", CONV_DEVICE_TYPE, API_RUNTIME, 2}}, {"__nv_bfloat16_raw", {"__hip_bfloat16_raw", "", CONV_DEVICE_TYPE, API_RUNTIME, 2}}, {"__nv_bfloat162", {"__hip_bfloat162", "", CONV_DEVICE_TYPE, API_RUNTIME, 2}}, {"nv_bfloat162", {"hip_bfloat162", "", CONV_DEVICE_TYPE, API_RUNTIME, 2, UNSUPPORTED}}, @@ -102,6 +102,7 @@ const std::map HIP_DEVICE_TYPE_NAME_VER_MAP { {"__hip_bfloat16_raw", {HIP_6020, HIP_0, HIP_0 }}, {"__hip_bfloat162_raw", {HIP_6020, HIP_0, HIP_0 }}, {"__hip_bfloat162", {HIP_5070, HIP_0, HIP_0 }}, + {"hip_bfloat16", {HIP_3050, HIP_0, HIP_0 }}, {"rocblas_half", {HIP_1050, HIP_0, HIP_0 }}, {"rocblas_bfloat16", {HIP_3050, HIP_0, HIP_0 }}, diff --git a/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu b/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu index f6fcc756..12afdd7b 100644 --- a/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu +++ b/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu @@ -16,6 +16,7 @@ int main() { float fa = 0.0f; float fx = 0.0f; short int shi = 0; + unsigned short int ushi = 0; double2 d2 = { 0.0f, 0.0f }; float2 f2 = { 0.0f, 0.0f }; __half hx = { 0.0f }; @@ -25,13 +26,16 @@ int main() { __half2_raw h2rx = { 0, 0 }; #if CUDA_VERSION >= 11000 - // CHECK: __hip_bfloat16 bf16 = { 0.0f }; + // CHECK: __hip_bfloat16 _bf16 = { 0.0f }; // CHECK-NEXT: __hip_bfloat16 bf16a = { 0.0f }; // CHECK-NEXT: __hip_bfloat16 bf16b = { 0.0f }; - __nv_bfloat16 bf16 = { 0.0f }; + __nv_bfloat16 _bf16 = { 0.0f }; __nv_bfloat16 bf16a = { 0.0f }; __nv_bfloat16 bf16b = { 0.0f }; + // CHECK: hip_bfloat16 bf16 = { 0 }; + nv_bfloat16 bf16 = { 0 }; + // CHECK: __hip_bfloat16_raw bf16r = { 0 }; __nv_bfloat16_raw bf16r = { 0 }; @@ -47,18 +51,18 @@ int main() { // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a); // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a) - // CHECK: bf16 = __double2bfloat16(da); - bf16 = __double2bfloat16(da); + // CHECK: _bf16 = __double2bfloat16(da); + _bf16 = __double2bfloat16(da); // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a); // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f); - // CHECK: bf16 = __float2bfloat16(fa); - bf16 = __float2bfloat16(fa); + // CHECK: _bf16 = __float2bfloat16(fa); + _bf16 = __float2bfloat16(fa); // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a); // HIP: __BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a); - // CHECK: bf16 = __bfloat162float(fa); - bf16 = __bfloat162float(fa); + // CHECK: _bf16 = __bfloat162float(fa); + _bf16 = __bfloat162float(fa); // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a); // HIP: __BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a); @@ -77,8 +81,8 @@ int main() { // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a); // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a); - // CHECK: bf16 = __low2bfloat16(bf162a); - bf16 = __low2bfloat16(bf162a); + // CHECK: _bf16 = __low2bfloat16(bf162a); + _bf16 = __low2bfloat16(bf162a); // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b); // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a, const __hip_bfloat16 b); @@ -97,8 +101,28 @@ int main() { // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h); // HIP: __BF16_HOST_DEVICE_STATIC__ short int __bfloat16_as_short(const __hip_bfloat16 h); - // CHECK: shi = __bfloat16_as_short(bf16); - shi = __bfloat16_as_short(bf16); + // CHECK: shi = __bfloat16_as_short(_bf16); + shi = __bfloat16_as_short(_bf16); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h); + // HIP: __BF16_HOST_DEVICE_STATIC__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h); + // CHECK: ushi = __bfloat16_as_ushort(_bf16); + ushi = __bfloat16_as_ushort(_bf16); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __short_as_bfloat16(const short int a); + // CHECK: _bf16 = __short_as_bfloat16(shi); + _bf16 = __short_as_bfloat16(shi); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a); + // CHECK: _bf16 = __ushort_as_bfloat16(ushi); + _bf16 = __ushort_as_bfloat16(ushi); + + // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a); + // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __float22bfloat162_rn(const float2 a); + // CHECK: bf162 = __float22bfloat162_rn(f2); + bf162 = __float22bfloat162_rn(f2); #endif #if CUDA_VERSION >= 11080