From 337f53ea082669539c6347e5efd236b70932dc09 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <evgeny.mankov@gmail.com>
Date: Wed, 15 Jan 2025 16:51:39 +0100
Subject: [PATCH] [HIPIFY][#1769][fp16][feature] Support for `fp16` math - Part
 4 - final

+ Updated synthetic tests, the regenerated `hipify-perl`, and `Device` `CUDA2HIP` docs accordingly
---
 bin/hipify-perl                               | 13 +++--
 .../CUDA_Device_API_supported_by_HIP.md       | 10 ++--
 src/CUDA2HIP_Device_functions.cpp             | 12 +++--
 src/CUDA2HIP_Device_types.cpp                 |  3 +-
 .../synthetic/libraries/cudevice2hipdevice.cu | 48 ++++++++++++++-----
 5 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/bin/hipify-perl b/bin/hipify-perl
index 703e1530..0d60569a 100755
--- a/bin/hipify-perl
+++ b/bin/hipify-perl
@@ -6237,6 +6237,7 @@ sub simpleSubstitutions {
     subst("__bfloat162bfloat162", "__bfloat162bfloat162", "device_function");
     subst("__bfloat162float", "__bfloat162float", "device_function");
     subst("__bfloat16_as_short", "__bfloat16_as_short", "device_function");
+    subst("__bfloat16_as_ushort", "__bfloat16_as_ushort", "device_function");
     subst("__brev", "__brev", "device_function");
     subst("__brevll", "__brevll", "device_function");
     subst("__byte_perm", "__byte_perm", "device_function");
@@ -6280,6 +6281,7 @@ sub simpleSubstitutions {
     subst("__fdividef", "__fdividef", "device_function");
     subst("__ffs", "__ffs", "device_function");
     subst("__ffsll", "__ffsll", "device_function");
+    subst("__float22bfloat162_rn", "__float22bfloat162_rn", "device_function");
     subst("__float22half2_rn", "__float22half2_rn", "device_function");
     subst("__float2bfloat16", "__float2bfloat16", "device_function");
     subst("__float2half", "__float2half", "device_function");
@@ -6495,6 +6497,7 @@ sub simpleSubstitutions {
     subst("__short2half_rn", "__short2half_rn", "device_function");
     subst("__short2half_ru", "__short2half_ru", "device_function");
     subst("__short2half_rz", "__short2half_rz", "device_function");
+    subst("__short_as_bfloat16", "__short_as_bfloat16", "device_function");
     subst("__short_as_half", "__short_as_half", "device_function");
     subst("__sincosf", "__sincosf", "device_function");
     subst("__sinf", "__sinf", "device_function");
@@ -6538,6 +6541,7 @@ sub simpleSubstitutions {
     subst("__ushort2half_rn", "__ushort2half_rn", "device_function");
     subst("__ushort2half_ru", "__ushort2half_ru", "device_function");
     subst("__ushort2half_rz", "__ushort2half_rz", "device_function");
+    subst("__ushort_as_bfloat16", "__ushort_as_bfloat16", "device_function");
     subst("__ushort_as_half", "__ushort_as_half", "device_function");
     subst("abs", "abs", "device_function");
     subst("acos", "acos", "device_function");
@@ -6786,6 +6790,7 @@ sub simpleSubstitutions {
     subst("__nv_fp8x4_e4m3", "__hip_fp8x4_e4m3_fnuz", "device_type");
     subst("__nv_fp8x4_storage_t", "__hip_fp8x4_storage_t", "device_type");
     subst("__nv_saturation_t", "__hip_saturation_t", "device_type");
+    subst("nv_bfloat16", "hip_bfloat16", "device_type");
     subst("caffe2\/core\/common_cudnn.h", "caffe2\/core\/hip\/common_miopen.h", "include");
     subst("caffe2\/operators\/spatial_batch_norm_op.h", "caffe2\/operators\/hip\/spatial_batch_norm_op_miopen.hip", "include");
     subst("channel_descriptor.h", "hip\/channel_descriptor.h", "include");
@@ -9279,6 +9284,7 @@ sub transformHostFunctions {
     "acos",
     "abs",
     "__ushort_as_half",
+    "__ushort_as_bfloat16",
     "__ushort2half_rz",
     "__ushort2half_ru",
     "__ushort2half_rn",
@@ -9322,6 +9328,7 @@ sub transformHostFunctions {
     "__sinf",
     "__sincosf",
     "__short_as_half",
+    "__short_as_bfloat16",
     "__short2half_rz",
     "__short2half_ru",
     "__short2half_rn",
@@ -9537,6 +9544,7 @@ sub transformHostFunctions {
     "__float2half",
     "__float2bfloat16",
     "__float22half2_rn",
+    "__float22bfloat162_rn",
     "__ffsll",
     "__ffs",
     "__fdividef",
@@ -9580,6 +9588,7 @@ sub transformHostFunctions {
     "__byte_perm",
     "__brevll",
     "__brev",
+    "__bfloat16_as_ushort",
     "__bfloat16_as_short",
     "__bfloat162float",
     "__bfloat162bfloat162",
@@ -9713,7 +9722,6 @@ sub countSupportedDeviceFunctions {
     "__vabsdiffs2",
     "__vabs4",
     "__vabs2",
-    "__ushort_as_bfloat16",
     "__ushort2bfloat16_rz",
     "__ushort2bfloat16_ru",
     "__ushort2bfloat16_rn",
@@ -9734,7 +9742,6 @@ sub countSupportedDeviceFunctions {
     "__signbitl",
     "__signbitf",
     "__signbit",
-    "__short_as_bfloat16",
     "__short2bfloat16_rz",
     "__short2bfloat16_ru",
     "__short2bfloat16_rn",
@@ -9809,7 +9816,6 @@ sub countSupportedDeviceFunctions {
     "__float2bfloat16_rn",
     "__float2bfloat16_rd",
     "__float2bfloat162_rn",
-    "__float22bfloat162_rn",
     "__finitel",
     "__finitef",
     "__finite",
@@ -9839,7 +9845,6 @@ sub countSupportedDeviceFunctions {
     "__dadd_ru",
     "__dadd_rd",
     "__brkpt",
-    "__bfloat16_as_ushort",
     "__bfloat162ushort_rz",
     "__bfloat162ushort_ru",
     "__bfloat162ushort_rn",
diff --git a/docs/tables/CUDA_Device_API_supported_by_HIP.md b/docs/tables/CUDA_Device_API_supported_by_HIP.md
index 5e70cb82..27aa483b 100644
--- a/docs/tables/CUDA_Device_API_supported_by_HIP.md
+++ b/docs/tables/CUDA_Device_API_supported_by_HIP.md
@@ -48,7 +48,7 @@
 |`__bfloat162ushort_ru`|11.0| | | | | | | | | |
 |`__bfloat162ushort_rz`|11.0| | | | | | | | | |
 |`__bfloat16_as_short`|11.0| | | |`__bfloat16_as_short`|5.7.0| | | | |
-|`__bfloat16_as_ushort`|11.0| | | | | | | | | |
+|`__bfloat16_as_ushort`|11.0| | | |`__bfloat16_as_ushort`|5.7.0| | | | |
 |`__brev`| | | | |`__brev`|1.6.0| | | | |
 |`__brevll`| | | | |`__brevll`|1.6.0| | | | |
 |`__brkpt`| | | | | | | | | | |
@@ -121,7 +121,7 @@
 |`__finite`| | | | | | | | | | |
 |`__finitef`| | | | | | | | | | |
 |`__finitel`| | | | | | | | | | |
-|`__float22bfloat162_rn`|11.0| | | | | | | | | |
+|`__float22bfloat162_rn`|11.0| | | |`__float22bfloat162_rn`|5.7.0| | | | |
 |`__float22half2_rn`| | | | |`__float22half2_rn`|1.6.0| | | | |
 |`__float2bfloat16`|11.0| | | |`__float2bfloat16`|5.7.0| | | | |
 |`__float2bfloat162_rn`|11.0| | | | | | | | | |
@@ -411,7 +411,7 @@
 |`__short2half_rn`| | | | |`__short2half_rn`|1.6.0| | | | |
 |`__short2half_ru`| | | | |`__short2half_ru`|1.6.0| | | | |
 |`__short2half_rz`| | | | |`__short2half_rz`|1.6.0| | | | |
-|`__short_as_bfloat16`|11.0| | | | | | | | | |
+|`__short_as_bfloat16`|11.0| | | |`__short_as_bfloat16`|5.7.0| | | | |
 |`__short_as_half`| | | | |`__short_as_half`|1.9.0| | | | |
 |`__signbit`| | | | | | | | | | |
 |`__signbitf`| | | | | | | | | | |
@@ -475,7 +475,7 @@
 |`__ushort2half_rn`| | | | |`__ushort2half_rn`|1.6.0| | | | |
 |`__ushort2half_ru`| | | | |`__ushort2half_ru`|1.6.0| | | | |
 |`__ushort2half_rz`| | | | |`__ushort2half_rz`|1.6.0| | | | |
-|`__ushort_as_bfloat16`|11.0| | | | | | | | | |
+|`__ushort_as_bfloat16`|11.0| | | |`__ushort_as_bfloat16`|5.7.0| | | | |
 |`__ushort_as_half`| | | | |`__ushort_as_half`|1.6.0| | | | |
 |`__vabs2`| | | | | | | | | | |
 |`__vabs4`| | | | | | | | | | |
@@ -836,6 +836,6 @@
 |`__nv_fp8x4_e5m2`|11.8| | | |`__hip_fp8x4_e5m2_fnuz`|6.2.0| | | | |
 |`__nv_fp8x4_storage_t`|11.8| | | |`__hip_fp8x4_storage_t`|6.2.0| | | | |
 |`__nv_saturation_t`|11.8| | | |`__hip_saturation_t`|6.2.0| | | | |
-|`nv_bfloat16`|11.0| | | | | | | | | |
+|`nv_bfloat16`|11.0| | | |`hip_bfloat16`|3.5.0| | | | |
 |`nv_bfloat162`|11.0| | | | | | | | | |
 
diff --git a/src/CUDA2HIP_Device_functions.cpp b/src/CUDA2HIP_Device_functions.cpp
index 5ed4739c..2742f405 100644
--- a/src/CUDA2HIP_Device_functions.cpp
+++ b/src/CUDA2HIP_Device_functions.cpp
@@ -781,10 +781,10 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DEVICE_FUNCTION_MAP {
   {"__low2bfloat162",                   {"__low2bfloat162",                    "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
   {"__high2bfloat162",                  {"__high2bfloat162",                   "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
   {"__bfloat16_as_short",               {"__bfloat16_as_short",                "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
-  {"__bfloat16_as_ushort",              {"__bfloat16_as_ushort",               "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
-  {"__short_as_bfloat16",               {"__short_as_bfloat16",                "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
-  {"__ushort_as_bfloat16",              {"__ushort_as_bfloat16",               "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
-  {"__float22bfloat162_rn",             {"__float22bfloat162_rn",              "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
+  {"__bfloat16_as_ushort",              {"__bfloat16_as_ushort",               "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
+  {"__short_as_bfloat16",               {"__short_as_bfloat16",                "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
+  {"__ushort_as_bfloat16",              {"__ushort_as_bfloat16",               "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
+  {"__float22bfloat162_rn",             {"__float22bfloat162_rn",              "", CONV_DEVICE_FUNC, API_RUNTIME, 1}},
   {"__bfloat162char_rz",                {"__bfloat162char_rz",                 "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"__bfloat162uchar_rz",               {"__bfloat162uchar_rz",                "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
   {"make_bfloat162",                    {"make_bfloat162",                     "", CONV_DEVICE_FUNC, API_RUNTIME, 1, UNSUPPORTED}},
@@ -1524,6 +1524,10 @@ const std::map<llvm::StringRef, hipAPIversions> HIP_DEVICE_FUNCTION_VER_MAP {
   {"__low2bfloat162",                   {HIP_5070, HIP_0,    HIP_0   }},
   {"__high2bfloat162",                  {HIP_5070, HIP_0,    HIP_0   }},
   {"__bfloat16_as_short",               {HIP_5070, HIP_0,    HIP_0   }},
+  {"__bfloat16_as_ushort",              {HIP_5070, HIP_0,    HIP_0   }},
+  {"__short_as_bfloat16",               {HIP_5070, HIP_0,    HIP_0   }},
+  {"__ushort_as_bfloat16",              {HIP_5070, HIP_0,    HIP_0   }},
+  {"__float22bfloat162_rn",             {HIP_5070, HIP_0,    HIP_0   }},
 };
 
 const std::map<unsigned int, llvm::StringRef> CUDA_DEVICE_FUNCTION_API_SECTION_MAP {
diff --git a/src/CUDA2HIP_Device_types.cpp b/src/CUDA2HIP_Device_types.cpp
index 27ebd93c..ed05493d 100644
--- a/src/CUDA2HIP_Device_types.cpp
+++ b/src/CUDA2HIP_Device_types.cpp
@@ -31,7 +31,7 @@ const std::map<llvm::StringRef, hipCounter> CUDA_DEVICE_TYPE_NAME_MAP {
   {"__half2_raw",                          {"__half2_raw",                           "",                                        CONV_DEVICE_TYPE, API_RUNTIME, 2}},
   // Bfloat16 Precision Device types
   {"__nv_bfloat16",                        {"__hip_bfloat16",                        "rocblas_bfloat16",                        CONV_DEVICE_TYPE, API_RUNTIME, 2}},
-  {"nv_bfloat16",                          {"hip_bfloat16",                          "",                                        CONV_DEVICE_TYPE, API_RUNTIME, 2, UNSUPPORTED}},
+  {"nv_bfloat16",                          {"hip_bfloat16",                          "",                                        CONV_DEVICE_TYPE, API_RUNTIME, 2}},
   {"__nv_bfloat16_raw",                    {"__hip_bfloat16_raw",                    "",                                        CONV_DEVICE_TYPE, API_RUNTIME, 2}},
   {"__nv_bfloat162",                       {"__hip_bfloat162",                       "",                                        CONV_DEVICE_TYPE, API_RUNTIME, 2}},
   {"nv_bfloat162",                         {"hip_bfloat162",                         "",                                        CONV_DEVICE_TYPE, API_RUNTIME, 2, UNSUPPORTED}},
@@ -102,6 +102,7 @@ const std::map<llvm::StringRef, hipAPIversions> HIP_DEVICE_TYPE_NAME_VER_MAP {
   {"__hip_bfloat16_raw",                   {HIP_6020, HIP_0,    HIP_0   }},
   {"__hip_bfloat162_raw",                  {HIP_6020, HIP_0,    HIP_0   }},
   {"__hip_bfloat162",                      {HIP_5070, HIP_0,    HIP_0   }},
+  {"hip_bfloat16",                         {HIP_3050, HIP_0,    HIP_0   }},
 
   {"rocblas_half",                         {HIP_1050, HIP_0,    HIP_0   }},
   {"rocblas_bfloat16",                     {HIP_3050, HIP_0,    HIP_0   }},
diff --git a/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu b/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu
index f6fcc756..12afdd7b 100644
--- a/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu
+++ b/tests/unit_tests/synthetic/libraries/cudevice2hipdevice.cu
@@ -16,6 +16,7 @@ int main() {
   float fa = 0.0f;
   float fx = 0.0f;
   short int shi = 0;
+  unsigned short int ushi = 0;
   double2 d2 = { 0.0f, 0.0f };
   float2 f2 = { 0.0f, 0.0f };
   __half hx = { 0.0f };
@@ -25,13 +26,16 @@ int main() {
   __half2_raw h2rx = { 0, 0 };
 
 #if CUDA_VERSION >= 11000
-  // CHECK: __hip_bfloat16 bf16 = { 0.0f };
+  // CHECK: __hip_bfloat16 _bf16 = { 0.0f };
   // CHECK-NEXT: __hip_bfloat16 bf16a = { 0.0f };
   // CHECK-NEXT: __hip_bfloat16 bf16b = { 0.0f };
-  __nv_bfloat16 bf16 = { 0.0f };
+  __nv_bfloat16 _bf16 = { 0.0f };
   __nv_bfloat16 bf16a = { 0.0f };
   __nv_bfloat16 bf16b = { 0.0f };
 
+  // CHECK: hip_bfloat16 bf16 = { 0 };
+  nv_bfloat16 bf16 = { 0 };
+
   // CHECK: __hip_bfloat16_raw bf16r = { 0 };
   __nv_bfloat16_raw bf16r = { 0 };
 
@@ -47,18 +51,18 @@ int main() {
 
   // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a);
   // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a)
-  // CHECK: bf16 = __double2bfloat16(da);
-  bf16 = __double2bfloat16(da);
+  // CHECK: _bf16 = __double2bfloat16(da);
+  _bf16 = __double2bfloat16(da);
 
   // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a);
   // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f);
-  // CHECK: bf16 = __float2bfloat16(fa);
-  bf16 = __float2bfloat16(fa);
+  // CHECK: _bf16 = __float2bfloat16(fa);
+  _bf16 = __float2bfloat16(fa);
 
   // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a);
   // HIP: __BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a);
-  // CHECK: bf16 = __bfloat162float(fa);
-  bf16 = __bfloat162float(fa);
+  // CHECK: _bf16 = __bfloat162float(fa);
+  _bf16 = __bfloat162float(fa);
 
   // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a);
   // HIP: __BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a);
@@ -77,8 +81,8 @@ int main() {
 
   // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a);
   // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a);
-  // CHECK: bf16 = __low2bfloat16(bf162a);
-  bf16 = __low2bfloat16(bf162a);
+  // CHECK: _bf16 = __low2bfloat16(bf162a);
+  _bf16 = __low2bfloat16(bf162a);
 
   // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b);
   // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a, const __hip_bfloat16 b);
@@ -97,8 +101,28 @@ int main() {
 
   // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h);
   // HIP: __BF16_HOST_DEVICE_STATIC__ short int __bfloat16_as_short(const __hip_bfloat16 h);
-  // CHECK: shi = __bfloat16_as_short(bf16);
-  shi = __bfloat16_as_short(bf16);
+  // CHECK: shi = __bfloat16_as_short(_bf16);
+  shi = __bfloat16_as_short(_bf16);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h);
+  // CHECK: ushi = __bfloat16_as_ushort(_bf16);
+  ushi = __bfloat16_as_ushort(_bf16);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __short_as_bfloat16(const short int a);
+  // CHECK: _bf16 = __short_as_bfloat16(shi);
+  _bf16 = __short_as_bfloat16(shi);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a);
+  // CHECK: _bf16 = __ushort_as_bfloat16(ushi);
+  _bf16 = __ushort_as_bfloat16(ushi);
+
+  // CUDA: __CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a);
+  // HIP: __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __float22bfloat162_rn(const float2 a);
+  // CHECK: bf162 = __float22bfloat162_rn(f2);
+  bf162 = __float22bfloat162_rn(f2);
 #endif
 
 #if CUDA_VERSION >= 11080