From e501e42938c0f3d6176adc7f6915f1ecec3bcdfd Mon Sep 17 00:00:00 2001 From: "Tang, Jiajun" Date: Tue, 26 Sep 2023 11:07:11 +0800 Subject: [PATCH] [SYCLomatic #1351] Add 67 bf16 precision conversion tests. Signed-off-by: Tang, Jiajun jiajun.tang@intel.com --- features/feature_case/math/math-bf16-conv.cu | 968 +++++++++- features/feature_case/math/math-emu-double.cu | 52 + .../feature_case/math/math-ext-bf16-conv.cu | 1550 +++++++++++++++++ features/feature_case/math/math-ext-double.cu | 52 + features/features.xml | 1 + features/test_feature.py | 8 +- 6 files changed, 2601 insertions(+), 30 deletions(-) create mode 100644 features/feature_case/math/math-ext-bf16-conv.cu diff --git a/features/feature_case/math/math-bf16-conv.cu b/features/feature_case/math/math-bf16-conv.cu index 9612d5ba3..e9cdb6bd9 100644 --- a/features/feature_case/math/math-bf16-conv.cu +++ b/features/feature_case/math/math-bf16-conv.cu @@ -47,6 +47,23 @@ void checkResult(const string &FuncName, const vector &Inputs, check(abs(Result - Expect) < pow(10, -precision)); } +void checkResult(const string &FuncName, const vector &Inputs, + const float2 &Expect, const float2 &Result, + const int precision) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << fixed << setprecision(precision) << "{" << Result.x << ", " + << Result.y << "} (expect {" << Expect.x - pow(10, -precision) << " ~ " + << Expect.x + pow(10, -precision) << ", " + << Expect.y - pow(10, -precision) << " ~ " + << Expect.y + pow(10, -precision) << ")"; + cout.unsetf(ios::fixed); + check(abs(Result.x - Expect.x) < pow(10, -precision) && + abs(Result.y - Expect.y) < pow(10, -precision)); +} + void checkResult(const string &FuncName, const vector &Inputs, const float2 &Expect, const float2 &Result, const int precision) { @@ -64,15 +81,42 @@ void checkResult(const string &FuncName, const vector &Inputs, abs(Result.y - Expect.y) < pow(10, -precision)); } -void checkResult(const string &FuncName, const vector<__nv_bfloat16> &Inputs, - const __nv_bfloat16 &Expect, const float &Result, +void checkResult(const string &FuncName, const vector &Inputs, + const float &Expect, const float &Result, const int precision) { - vector FInputs; - for (const auto &it : Inputs) { - FInputs.push_back(__bfloat162float(it)); + cout << FuncName << "({" << Inputs[0].x << ", " << Inputs[0].y << "}"; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", {" << Inputs[i].x << ", " << Inputs[i].y << "}"; } + cout << ") = " << fixed << setprecision(precision) << Result << " (expect " + << Expect - pow(10, -precision) << " ~ " << Expect + pow(10, -precision) + << ")"; + cout.unsetf(ios::fixed); + check(abs(Result - Expect) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const int &Expect, const int &Result) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << Result << " (expect " << Expect << ")"; + check(Result == Expect); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const __nv_bfloat16 &Expect, const float &Result, + const int precision) { float FExpect = __bfloat162float(Expect); - checkResult(FuncName, FInputs, FExpect, Result, precision); + checkResult(FuncName, Inputs, FExpect, Result, precision); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const __nv_bfloat162 &Expect, const float2 &Result, + 
const int precision) { + float2 FExpect{__bfloat162float(Expect.x), __bfloat162float(Expect.y)}; + checkResult(FuncName, Inputs, FExpect, Result, precision); } void checkResult(const string &FuncName, const vector<__nv_bfloat162> &Inputs, @@ -102,6 +146,16 @@ void checkResult(const string &FuncName, const vector<__nv_bfloat162> &Inputs, checkResult(FuncName, FInputs, Expect, Result, precision); } +void checkResult(const string &FuncName, const vector<__nv_bfloat162> &Inputs, + const float &Expect, const float &Result, + const int precision) { + vector FInputs; + for (const auto &it : Inputs) { + FInputs.push_back({__bfloat162float(it.x), __bfloat162float(it.y)}); + } + checkResult(FuncName, FInputs, Expect, Result, precision); +} + __global__ void setValue(__nv_bfloat16 *Input1, const __nv_bfloat16 Input2) { *Input1 = Input2; } @@ -110,18 +164,18 @@ __global__ void setValue(__nv_bfloat162 *Input1, const __nv_bfloat162 Input2) { *Input1 = Input2; } -__global__ void bFloat1622float2(float *const Result, __nv_bfloat162 Input1) { +__global__ void bfloat1622float2(float *const Result, __nv_bfloat162 Input1) { auto ret = __bfloat1622float2(Input1); Result[0] = ret.x; Result[1] = ret.y; } -void testBFloat1622float2Cases( +void testBfloat1622float2Cases( const vector> &TestCases) { float *Result; cudaMallocManaged(&Result, 2 * sizeof(*Result)); for (const auto &TestCase : TestCases) { - bFloat1622float2<<<1, 1>>>(Result, TestCase.first); + bfloat1622float2<<<1, 1>>>(Result, TestCase.first); cudaDeviceSynchronize(); checkResult("__bfloat1622float2", {TestCase.first}, TestCase.second.first, {Result[0], Result[1]}, TestCase.second.second); @@ -134,37 +188,444 @@ void testBFloat1622float2Cases( } } -__global__ void bFloat162float(float *const Result, __nv_bfloat16 Input1) { +__global__ void bfloat162bfloat162(float *const Result, __nv_bfloat16 Input1) { + auto ret = __bfloat162bfloat162(Input1); + Result[0] = ret.x; + Result[1] = ret.y; +} + +void testBfloat162bfloat162Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162bfloat162<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162bfloat162", vector{TestCase.first}, + TestCase.second.first, {Result[0], Result[1]}, + TestCase.second.second); + } +} + +__global__ void bfloat162float(float *const Result, __nv_bfloat16 Input1) { *Result = __bfloat162float(Input1); } -void testBFloat162floatCases( +void testBfloat162floatCases( const vector> &TestCases) { float *Result; cudaMallocManaged(&Result, sizeof(*Result)); for (const auto &TestCase : TestCases) { - bFloat162float<<<1, 1>>>(Result, TestCase.first); + bfloat162float<<<1, 1>>>(Result, TestCase.first); cudaDeviceSynchronize(); - checkResult("__bfloat162float", {TestCase.first}, TestCase.second.first, - *Result, TestCase.second.second); + checkResult("__bfloat162float", std::vector{TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); *Result = __bfloat162float(TestCase.first); - checkResult("(host)__bfloat162float", {TestCase.first}, + checkResult("(host)__bfloat162float", std::vector{TestCase.first}, TestCase.second.first, *Result, TestCase.second.second); } } -__global__ void float22bFloat162_rn(float *const Result, float2 Input1) { +__global__ void bfloat162int_rd(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rd(Input1); +} + +void testBfloat162int_rdCases( + const vector> &TestCases) { + int *Result; + 
cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_rn(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rn(Input1); +} + +void testBfloat162int_rnCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_ru(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_ru(Input1); +} + +void testBfloat162int_ruCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_rz(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rz(Input1); +} + +void testBfloat162int_rzCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ll_rd(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rd(Input1); +} + +void testBfloat162ll_rdCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rd", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_rn(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rn(Input1); +} + +void testBfloat162ll_rnCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rn", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_ru(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_ru(Input1); +} + +void testBfloat162ll_ruCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_ru", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_rz(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rz(Input1); +} + +void testBfloat162ll_rzCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rz", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162short_rd(short *const Result, __nv_bfloat16 
Input1) { + *Result = __bfloat162short_rd(Input1); +} + +void testBfloat162short_rdCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_rn(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_rn(Input1); +} + +void testBfloat162short_rnCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_ru(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_ru(Input1); +} + +void testBfloat162short_ruCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_rz(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_rz(Input1); +} + +void testBfloat162short_rzCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rd(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rd(Input1); +} + +void testBfloat162uint_rdCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rn(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rn(Input1); +} + +void testBfloat162uint_rnCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_ru(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_ru(Input1); +} + +void testBfloat162uint_ruCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rz(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rz(Input1); +} + +void testBfloat162uint_rzCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + 
bfloat162uint_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rd(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_rd(Input1); +} + +void testBfloat162ull_rdCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rn(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_rn(Input1); +} + +void testBfloat162ull_rnCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_ru(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_ru(Input1); +} + +void testBfloat162ull_ruCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rz(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_rz(Input1); +} + +void testBfloat162ull_rzCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rd(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rd(Input1); +} + +void testBfloat162ushort_rdCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rn(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rn(Input1); +} + +void testBfloat162ushort_rnCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_ru(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_ru(Input1); +} + +void testBfloat162ushort_ruCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + 
checkResult("__bfloat162ushort_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rz(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rz(Input1); +} + +void testBfloat162ushort_rzCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void float22bfloat162_rn(float *const Result, float2 Input1) { auto ret = __float22bfloat162_rn(Input1); Result[0] = __bfloat162float(ret.x); Result[1] = __bfloat162float(ret.y); } -void testFloat22bFloat162_rnCases( +void testFloat22bfloat162_rnCases( const vector> &TestCases) { float *Result; cudaMallocManaged(&Result, 2 * sizeof(*Result)); for (const auto &TestCase : TestCases) { - float22bFloat162_rn<<<1, 1>>>(Result, TestCase.first); + float22bfloat162_rn<<<1, 1>>>(Result, TestCase.first); cudaDeviceSynchronize(); checkResult("__float22bfloat162_rn", {TestCase.first}, TestCase.second.first, {Result[0], Result[1]}, @@ -178,15 +639,15 @@ void testFloat22bFloat162_rnCases( } } -__global__ void float2bFloat16(float *const Result, float Input1) { +__global__ void float2bfloat16(float *const Result, float Input1) { *Result = __bfloat162float(__float2bfloat16(Input1)); } -void testFloat2bFloat16Cases(const vector> &TestCases) { +void testFloat2bfloat16Cases(const vector> &TestCases) { float *Result; cudaMallocManaged(&Result, sizeof(*Result)); for (const auto &TestCase : TestCases) { - float2bFloat16<<<1, 1>>>(Result, TestCase.first); + float2bfloat16<<<1, 1>>>(Result, TestCase.first); cudaDeviceSynchronize(); checkResult("__float2bfloat16", {TestCase.first}, TestCase.second.first, *Result, TestCase.second.second); @@ -196,6 +657,139 @@ void testFloat2bFloat16Cases(const vector> &TestCases) { } } +__global__ void float2bfloat162_rn(float *const Result, float Input1) { + auto ret = __float2bfloat162_rn(Input1); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testFloat2bfloat162_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat162_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat162_rn", vector{TestCase.first}, + TestCase.second.first, {Result[0], Result[1]}, + TestCase.second.second); + auto ret = __float2bfloat162_rn(TestCase.first); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); + checkResult("(host)__float2bfloat162_rn", vector{TestCase.first}, + TestCase.second.first, {Result[0], Result[1]}, + TestCase.second.second); + } +} + +__global__ void floats2bfloat162_rn(float *const Result, float Input1, + float Input2) { + auto ret = __floats2bfloat162_rn(Input1, Input2); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testFloats2bfloat162_rnCases( + const vector, bf162i_pair>> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + floats2bfloat162_rn<<<1, 1>>>(Result, TestCase.first.first, + TestCase.first.second); + cudaDeviceSynchronize(); + checkResult("__floats2bfloat162_rn", + vector{TestCase.first.first, TestCase.first.second}, + TestCase.second.first, 
{Result[0], Result[1]}, + TestCase.second.second); + auto ret = + __floats2bfloat162_rn(TestCase.first.first, TestCase.first.second); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); + checkResult("(host)__floats2bfloat162_rn", + vector{TestCase.first.first, TestCase.first.second}, + TestCase.second.first, {Result[0], Result[1]}, + TestCase.second.second); + } +} + +__global__ void halves2bfloat162(float *const Result, __nv_bfloat16 Input1, + __nv_bfloat16 Input2) { + auto ret = __halves2bfloat162(Input1, Input2); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testHalves2bfloat162Cases( + const vector, bf162i_pair>> + &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + halves2bfloat162<<<1, 1>>>(Result, TestCase.first.first, + TestCase.first.second); + cudaDeviceSynchronize(); + checkResult( + "__halves2bfloat162", {TestCase.first.first, TestCase.first.second}, + TestCase.second.first, {Result[0], Result[1]}, TestCase.second.second); + } +} + +__global__ void high2bfloat16(float *const Result, __nv_bfloat162 Input1) { + *Result = __bfloat162float(__high2bfloat16(Input1)); +} + +void testHigh2bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + high2bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__high2bfloat16", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void high2bfloat162(float *const Result, __nv_bfloat162 Input1) { + auto ret = __high2bfloat162(Input1); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testHigh2bfloat162Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + high2bfloat162<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__high2bfloat162", {TestCase.first}, TestCase.second.first, + {Result[0], Result[1]}, TestCase.second.second); + } +} + +__global__ void highs2bfloat162(float *const Result, __nv_bfloat162 Input1, + __nv_bfloat162 Input2) { + auto ret = __highs2bfloat162(Input1, Input2); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testHighs2bfloat162Cases( + const vector, bf162i_pair>> + &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + highs2bfloat162<<<1, 1>>>(Result, TestCase.first.first, + TestCase.first.second); + cudaDeviceSynchronize(); + checkResult( + "__highs2bfloat162", {TestCase.first.first, TestCase.first.second}, + TestCase.second.first, {Result[0], Result[1]}, TestCase.second.second); + } +} + __global__ void ldca(float *const Result, __nv_bfloat16 *Input1) { *Result = __ldca(Input1); } @@ -436,6 +1030,62 @@ void testLdluCases(const vector> &TestCases) { } } +__global__ void low2bfloat16(float *const Result, __nv_bfloat162 Input1) { + *Result = __bfloat162float(__low2bfloat16(Input1)); +} + +void testLow2bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + low2bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__low2bfloat16", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} 
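Reviewer note (illustrative aside, not part of the patch): the bf16 expectations used throughout these cases, e.g. 0.30078125 for an input of 0.3, follow from rounding a binary32 value to its top 16 bits with round-to-nearest-even. A minimal standalone sketch of that rounding, using a hypothetical helper name ref_float2bfloat16_rn (not an API from this patch or from cuda_bf16.h), is:

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical reference helper: emulate float -> bfloat16 -> float with
// round-to-nearest-even, entirely on the host and without CUDA headers.
static float ref_float2bfloat16_rn(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));   // view the float as raw bits
  uint32_t keep_lsb = (bits >> 16) & 1u;  // lowest bit that bf16 keeps
  bits += 0x7FFFu + keep_lsb;             // add rounding bias (ties to even)
  bits &= 0xFFFF0000u;                    // drop the 16 discarded mantissa bits
  float r;
  std::memcpy(&r, &bits, sizeof(r));
  return r;
}

int main() {
  // 0.3f is 0x3E99999A; rounding to bf16 gives 0x3E9A0000 == 0.30078125f,
  // which is the tolerance-checked expectation used by these tests.
  assert(ref_float2bfloat16_rn(0.3f) == 0.30078125f);
  return 0;
}

The sketch does not treat NaN payloads specially; it is only meant to show where the expected constants in main() below come from.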
+ +__global__ void low2bfloat162(float *const Result, __nv_bfloat162 Input1) { + auto ret = __low2bfloat162(Input1); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testLow2bfloat162Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + low2bfloat162<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__low2bfloat162", {TestCase.first}, TestCase.second.first, + {Result[0], Result[1]}, TestCase.second.second); + } +} + +__global__ void lows2bfloat162(float *const Result, __nv_bfloat162 Input1, + __nv_bfloat162 Input2) { + auto ret = __lows2bfloat162(Input1, Input2); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testLows2bfloat162Cases( + const vector, bf162i_pair>> + &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + lows2bfloat162<<<1, 1>>>(Result, TestCase.first.first, + TestCase.first.second); + cudaDeviceSynchronize(); + checkResult( + "__lows2bfloat162", {TestCase.first.first, TestCase.first.second}, + TestCase.second.first, {Result[0], Result[1]}, TestCase.second.second); + } +} + __global__ void stcg(float *const Result, __nv_bfloat16 Input1, __nv_bfloat16 *const Temp) { __stcg(Temp, Input1); @@ -593,30 +1243,274 @@ void testStwtCases(const vector> &TestCases) { } int main() { - testBFloat1622float2Cases({ + testBfloat1622float2Cases({ {{-0.3, -0.5}, {{-0.30078125, -0.5}, 16}}, {{0.3, 0.5}, {{0.30078125, 0.5}, 16}}, {{30, 50}, {{30, 50}, 14}}, {{0.432643, 0.23654}, {{0.43359375, 0.236328125}, 16}}, }); - testBFloat162floatCases({ + testBfloat162bfloat162Cases({ + {-0.3, {{-0.30078125, -0.30078125}, 16}}, + {0.5, {{0.5, 0.5}, 16}}, + {30, {{30, 30}, 14}}, + {0.432643, {{0.43359375, 0.43359375}, 16}}, + {1, {{1, 1}, 15}}, + {100.6, {{100.5, 100.5}, 14}}, + }); + testBfloat162floatCases({ {-0.3, {-0.30078125, 16}}, {0.3, {0.30078125, 16}}, {30, {30, 14}}, {0.432643, {0.43359375, 16}}, }); - testFloat22bFloat162_rnCases({ + testBfloat162int_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162int_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162int_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162int_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ll_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ll_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ll_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ll_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162short_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162short_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162short_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162short_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162uint_rdCases({ + // {-0.3, 0}, + {0.3, 
0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162uint_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162uint_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162uint_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ull_rdCases({ + // {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ull_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ull_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ull_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ushort_rdCases({ + // {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ushort_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ushort_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ushort_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testFloat22bfloat162_rnCases({ {{-0.3, -0.5}, {{-0.30078125, -0.5}, 16}}, {{0.3, 0.5}, {{0.30078125, 0.5}, 16}}, {{30, 50}, {{30, 50}, 14}}, {{0.432643, 0.23654}, {{0.43359375, 0.236328125}, 16}}, }); - testFloat2bFloat16Cases({ + testFloat2bfloat16Cases({ {-0.3, {-0.30078125, 16}}, {0.3, {0.30078125, 16}}, {30, {30, 14}}, {0.432643, {0.43359375, 16}}, }); + testFloat2bfloat162_rnCases({ + {-0.3, {{-0.30078125, -0.30078125}, 16}}, + {0.5, {{0.5, 0.5}, 16}}, + {30, {{30, 30}, 14}}, + {0.432643, {{0.43359375, 0.43359375}, 16}}, + }); + testFloats2bfloat162_rnCases({ + {{-0.3, -0.5}, {{-0.30078125, -0.5}, 16}}, + {{0.3, 0.5}, {{0.30078125, 0.5}, 16}}, + {{30, 50}, {{30, 50}, 14}}, + {{0.432643, 0.23654}, {{0.43359375, 0.236328125}, 16}}, + }); + testHalves2bfloat162Cases({ + {{-0.3, -0.5}, {{-0.30078125, -0.5}, 16}}, + {{0.3, 0.5}, {{0.30078125, 0.5}, 16}}, + {{30, 50}, {{30, 50}, 14}}, + {{0.432643, 0.23654}, {{0.43359375, 0.236328125}, 16}}, + {{1, 5000}, {{1, 4992}, 12}}, + {{10.7, 3000000}, {{10.6875, 2998272}, 9}}, + }); + testHigh2bfloat16Cases({ + {{-0.3, -0.5}, {-0.5, 16}}, + {{0.3, 0.5}, {0.5, 16}}, + {{30, 50}, {50, 14}}, + {{0.432643, 0.23654}, {0.236328125, 16}}, + {{1, 5000}, {4992, 12}}, + {{10.7, 3000000}, {2998272, 9}}, + }); + testHigh2bfloat162Cases({ + {{-0.3, -0.5}, {{-0.5, -0.5}, 16}}, + {{0.3, 0.5}, {{0.5, 0.5}, 16}}, + {{30, 50}, {{50, 50}, 14}}, + {{0.432643, 0.23654}, {{0.236328125, 0.236328125}, 16}}, + {{1, 5000}, {{4992, 4992}, 12}}, + {{10.7, 3000000}, {{2998272, 2998272}, 9}}, + }); + testHighs2bfloat162Cases({ + {{{-0.3, -0.5}, {10.7, 3000000}}, {{-0.5, 2998272}, 9}}, + {{{0.3, 0.5}, {-0.3, -0.5}}, {{0.5, -0.5}, 16}}, + {{{30, 50}, {0.3, 0.5}}, {{50, 0.5}, 14}}, + {{{0.432643, 0.23654}, {30, 50}}, {{0.236328125, 50}, 14}}, + {{{1, 5000}, {0.432643, 0.23654}}, {{4992, 0.236328125000}, 12}}, + {{{10.7, 3000000}, {1, 5000}}, {{2998272, 4992}, 9}}, + }); testLdcaCases({ {-0.3, 16}, {-0.4, 16}, @@ -701,6 +1595,30 @@ int main() { {{1, 100.6}, 14}, {{100.6, 1}, 14}, }); + testLow2bfloat16Cases({ + {{-0.3, -0.5}, {-0.30078125, 16}}, + {{0.3, 0.5}, {0.30078125, 16}}, + {{30, 50}, {30, 14}}, + {{0.432643, 0.23654}, {0.43359375, 16}}, + {{1, 5000}, {1, 15}}, + {{10.7, 
3000000}, {10.6875, 15}}, + }); + testLow2bfloat162Cases({ + {{-0.3, -0.5}, {{-0.30078125, -0.30078125}, 16}}, + {{0.3, 0.5}, {{0.30078125, 0.30078125}, 16}}, + {{30, 50}, {{30, 30}, 14}}, + {{0.432643, 0.23654}, {{0.43359375, 0.43359375}, 16}}, + {{1, 5000}, {{1, 1}, 15}}, + {{10.7, 3000000}, {{10.6875, 10.6875}, 15}}, + }); + testLows2bfloat162Cases({ + {{{-0.3, -0.5}, {10.7, 3000000}}, {{-0.30078125, 10.6875}, 15}}, + {{{0.3, 0.5}, {-0.3, -0.5}}, {{0.30078125, -0.30078125}, 16}}, + {{{30, 50}, {0.3, 0.5}}, {{30, 0.30078125}, 14}}, + {{{0.432643, 0.23654}, {30, 50}}, {{0.43359375, 30}, 14}}, + {{{1, 5000}, {0.432643, 0.23654}}, {{1, 0.43359375}, 15}}, + {{{10.7, 3000000}, {1, 5000}}, {{10.6875, 1}, 15}}, + }); testStcgCases({ {-0.3, 16}, {-0.4, 16}, diff --git a/features/feature_case/math/math-emu-double.cu b/features/feature_case/math/math-emu-double.cu index a36e971e6..4109f338c 100644 --- a/features/feature_case/math/math-emu-double.cu +++ b/features/feature_case/math/math-emu-double.cu @@ -11,8 +11,11 @@ #include #include +#include "cuda_bf16.h" + using namespace std; +typedef pair<__nv_bfloat16, int> bf16i_pair; typedef vector d_vector; typedef tuple d_tuple3; typedef tuple d_tuple4; @@ -31,6 +34,27 @@ void check(bool IsPassed) { } } +void checkResult(const string &FuncName, const vector &Inputs, + const float &Expect, const float &Result, + const int precision) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << fixed << setprecision(precision) << Result << " (expect " + << Expect - pow(10, -precision) << " ~ " << Expect + pow(10, -precision) + << ")"; + cout.unsetf(ios::fixed); + check(abs(Result - Expect) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const __nv_bfloat16 &Expect, const float &Result, + const int precision) { + float FExpect = __bfloat162float(Expect); + checkResult(FuncName, Inputs, FExpect, Result, precision); +} + template void checkResult(const string &FuncName, const vector &Inputs, const double &Expect, const double &DeviceResult, @@ -50,6 +74,26 @@ __global__ void setVecValue(double *Input1, const double Input2) { *Input1 = Input2; } +// Bfloat16 Precision Conversion and Data Movement + +__global__ void double2bfloat16(float *const Result, double Input1) { + *Result = __double2bfloat16(Input1); +} + +void testDouble2bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + double2bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__double2bfloat16", {(float)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +// Double Precision Mathematical Functions + __global__ void _norm(double *const DeviceResult, int Input1, const double *Input2) { *DeviceResult = norm(Input1, Input2); @@ -257,6 +301,14 @@ void testRnorm4dCases(const vector> &TestCases) { } int main() { + testDouble2bfloat16Cases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); testNormCases({ {{-0.3, -0.34, -0.98}, {1.079814798935447, 15}}, {{0.3, 0.34, 0.98}, {1.079814798935447, 15}}, diff --git a/features/feature_case/math/math-ext-bf16-conv.cu b/features/feature_case/math/math-ext-bf16-conv.cu new file mode 100644 index 000000000..a3a1e6a45 --- /dev/null +++ 
b/features/feature_case/math/math-ext-bf16-conv.cu @@ -0,0 +1,1550 @@ +// ====---------- math-ext-bf16-conv.cu---------- *- CUDA -* --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===---------------------------------------------------------------------===// + +#include +#include +#include + +#include "cuda_bf16.h" + +using namespace std; + +typedef pair f2i_pair; +typedef pair fi_pair; +typedef pair<__nv_bfloat16, int> bf16i_pair; + +int passed = 0; +int failed = 0; + +void check(bool IsPassed) { + if (IsPassed) { + cout << " ---- passed" << endl; + passed++; + } else { + cout << " ---- failed" << endl; + failed++; + } +} + +void checkResult(const string &FuncName, const vector &Inputs, + const float &Expect, const float &Result, + const int precision) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << fixed << setprecision(precision < 0 ? 0 : precision) + << Result << " (expect " << Expect - pow(10, -precision) << " ~ " + << Expect + pow(10, -precision) << ")"; + cout.unsetf(ios::fixed); + check(abs(Result - Expect) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const float2 &Expect, const float2 &Result, + const int precision) { + cout << FuncName << "({" << Inputs[0].x << ", " << Inputs[0].y << "}"; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", {" << Inputs[i].x << ", " << Inputs[i].y << "}"; + } + cout << ") = " << fixed << setprecision(precision) << "{" << Result.x << ", " + << Result.y << "} (expect {" << Expect.x - pow(10, -precision) << " ~ " + << Expect.x + pow(10, -precision) << ", " + << Expect.y - pow(10, -precision) << " ~ " + << Expect.y + pow(10, -precision) << ")"; + cout.unsetf(ios::fixed); + check(abs(Result.x - Expect.x) < pow(10, -precision) && + abs(Result.y - Expect.y) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const int &Expect, const int &Result) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << Result << " (expect " << Expect << ")"; + check(Result == Expect); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const float &Expect, const float &Result, + const int precision) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << fixed << setprecision(precision < 0 ? 
0 : precision) + << Result << " (expect " << Expect - pow(10, -precision) << " ~ " + << Expect + pow(10, -precision) << ")"; + cout.unsetf(ios::fixed); + check(abs(Result - Expect) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const __nv_bfloat16 &Expect, const float &Result, + const int precision) { + float FExpect = __bfloat162float(Expect); + checkResult(FuncName, Inputs, FExpect, Result, precision); +} + +void checkResult(const string &FuncName, const vector<__nv_bfloat16> &Inputs, + const float &Expect, const float &Result, + const int precision) { + vector FInputs; + for (const auto &it : Inputs) { + FInputs.push_back(__bfloat162float(it)); + } + checkResult(FuncName, FInputs, Expect, Result, precision); +} + +void checkResult(const string &FuncName, const vector<__nv_bfloat162> &Inputs, + const float2 &Expect, const float2 &Result, + const int precision) { + vector FInputs; + for (const auto &it : Inputs) { + FInputs.push_back({__bfloat162float(it.x), __bfloat162float(it.y)}); + } + checkResult(FuncName, FInputs, Expect, Result, precision); +} + +__global__ void bfloat1622float2(float *const Result, __nv_bfloat162 Input1) { + auto ret = __bfloat1622float2(Input1); + Result[0] = ret.x; + Result[1] = ret.y; +} + +void testBfloat1622float2Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat1622float2<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat1622float2", {TestCase.first}, TestCase.second.first, + {Result[0], Result[1]}, TestCase.second.second); + auto ret = __bfloat1622float2(TestCase.first); + Result[0] = ret.x; + Result[1] = ret.y; + checkResult("(host)__bfloat1622float2", {TestCase.first}, + TestCase.second.first, {Result[0], Result[1]}, + TestCase.second.second); + } +} + +__global__ void bfloat162float(float *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162float(Input1); +} + +void testBfloat162floatCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162float<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162float", vector<__nv_bfloat16>{TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + *Result = __bfloat162float(TestCase.first); + checkResult("(host)__bfloat162float", vector<__nv_bfloat16>{TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void bfloat162int_rd(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rd(Input1); +} + +void testBfloat162int_rdCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_rn(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rn(Input1); +} + +void testBfloat162int_rnCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_ru(int 
*const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_ru(Input1); +} + +void testBfloat162int_ruCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_rz(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rz(Input1); +} + +void testBfloat162int_rzCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ll_rd(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rd(Input1); +} + +void testBfloat162ll_rdCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rd", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_rn(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rn(Input1); +} + +void testBfloat162ll_rnCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rn", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_ru(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_ru(Input1); +} + +void testBfloat162ll_ruCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_ru", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_rz(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rz(Input1); +} + +void testBfloat162ll_rzCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rz", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162short_rd(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_rd(Input1); +} + +void testBfloat162short_rdCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_rn(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_rn(Input1); +} + +void testBfloat162short_rnCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rn<<<1, 1>>>(Result, TestCase.first); 
+ cudaDeviceSynchronize(); + checkResult("__bfloat162short_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_ru(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_ru(Input1); +} + +void testBfloat162short_ruCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_rz(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_rz(Input1); +} + +void testBfloat162short_rzCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rd(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rd(Input1); +} + +void testBfloat162uint_rdCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rn(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rn(Input1); +} + +void testBfloat162uint_rnCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_ru(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_ru(Input1); +} + +void testBfloat162uint_ruCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rz(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rz(Input1); +} + +void testBfloat162uint_rzCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rd(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_rd(Input1); +} + +void testBfloat162ull_rdCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rn(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = 
__bfloat162ull_rn(Input1); +} + +void testBfloat162ull_rnCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_ru(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_ru(Input1); +} + +void testBfloat162ull_ruCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rz(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_rz(Input1); +} + +void testBfloat162ull_rzCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rd(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rd(Input1); +} + +void testBfloat162ushort_rdCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rn(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rn(Input1); +} + +void testBfloat162ushort_rnCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_ru(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_ru(Input1); +} + +void testBfloat162ushort_ruCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rz(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rz(Input1); +} + +void testBfloat162ushort_rzCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat16_as_short(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat16_as_short(Input1); +} + +void testBfloat16_as_shortCases( + const vector> &TestCases) { + short 
*Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat16_as_short<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat16_as_short", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat16_as_ushort(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat16_as_ushort(Input1); +} + +void testBfloat16_as_ushortCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat16_as_ushort<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat16_as_ushort", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void float2bfloat16(float *const Result, float Input1) { + *Result = __float2bfloat16(Input1); +} + +void testFloat2bfloat16Cases(const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat16", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void float2bfloat16_rd(float *const Result, float Input1) { + *Result = __float2bfloat16_rd(Input1); +} + +void testFloat2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat16_rd", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void float2bfloat16_rn(float *const Result, float Input1) { + *Result = __float2bfloat16_rn(Input1); +} + +void testFloat2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat16_rn", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void float2bfloat16_ru(float *const Result, float Input1) { + *Result = __float2bfloat16_ru(Input1); +} + +void testFloat2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat16_ru", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void float2bfloat16_rz(float *const Result, float Input1) { + *Result = __float2bfloat16_rz(Input1); +} + +void testFloat2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat16_rz", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void int2bfloat16_rd(float *const Result, int Input1) { + *Result = __int2bfloat16_rd(Input1); +} + +void testInt2bfloat16_rdCases(const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + int2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); 
+ cudaDeviceSynchronize(); + checkResult("__int2bfloat16_rd", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void int2bfloat16_rn(float *const Result, int Input1) { + *Result = __int2bfloat16_rn(Input1); +} + +void testInt2bfloat16_rnCases(const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + int2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__int2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void int2bfloat16_ru(float *const Result, int Input1) { + *Result = __int2bfloat16_ru(Input1); +} + +void testInt2bfloat16_ruCases(const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + int2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__int2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void int2bfloat16_rz(float *const Result, int Input1) { + *Result = __int2bfloat16_rz(Input1); +} + +void testInt2bfloat16_rzCases(const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + int2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__int2bfloat16_rz", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ll2bfloat16_rd(float *const Result, long long Input1) { + *Result = __ll2bfloat16_rd(Input1); +} + +void testLl2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ll2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ll2bfloat16_rd", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ll2bfloat16_rn(float *const Result, long long Input1) { + *Result = __ll2bfloat16_rn(Input1); +} + +void testLl2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ll2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ll2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ll2bfloat16_ru(float *const Result, long long Input1) { + *Result = __ll2bfloat16_ru(Input1); +} + +void testLl2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ll2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ll2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ll2bfloat16_rz(float *const Result, long long Input1) { + *Result = __ll2bfloat16_rz(Input1); +} + +void testLl2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ll2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ll2bfloat16_rz", vector{(int)TestCase.first}, + 
TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void short2bfloat16_rd(float *const Result, short Input1) { + *Result = __short2bfloat16_rd(Input1); +} + +void testShort2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + short2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__short2bfloat16_rd", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void short2bfloat16_rn(float *const Result, short Input1) { + *Result = __short2bfloat16_rn(Input1); +} + +void testShort2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + short2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__short2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void short2bfloat16_ru(float *const Result, short Input1) { + *Result = __short2bfloat16_ru(Input1); +} + +void testShort2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + short2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__short2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void short2bfloat16_rz(float *const Result, short Input1) { + *Result = __short2bfloat16_rz(Input1); +} + +void testShort2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + short2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__short2bfloat16_rz", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void short_as_bfloat16(float *const Result, short Input1) { + *Result = __short_as_bfloat16(Input1); +} + +void testShort_as_bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + short_as_bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__short_as_bfloat16", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void uint2bfloat16_rd(float *const Result, unsigned Input1) { + *Result = __uint2bfloat16_rd(Input1); +} + +void testUint2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + uint2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__uint2bfloat16_rd", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void uint2bfloat16_rn(float *const Result, unsigned Input1) { + *Result = __uint2bfloat16_rn(Input1); +} + +void testUint2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + uint2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__uint2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, 
*Result, TestCase.second.second); + } +} + +__global__ void uint2bfloat16_ru(float *const Result, unsigned Input1) { + *Result = __uint2bfloat16_ru(Input1); +} + +void testUint2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + uint2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__uint2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void uint2bfloat16_rz(float *const Result, unsigned Input1) { + *Result = __uint2bfloat16_rz(Input1); +} + +void testUint2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + uint2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__uint2bfloat16_rz", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ull2bfloat16_rd(float *const Result, + unsigned long long Input1) { + *Result = __ull2bfloat16_rd(Input1); +} + +void testUll2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ull2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ull2bfloat16_rd", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ull2bfloat16_rn(float *const Result, + unsigned long long Input1) { + *Result = __ull2bfloat16_rn(Input1); +} + +void testUll2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ull2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ull2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ull2bfloat16_ru(float *const Result, + unsigned long long Input1) { + *Result = __ull2bfloat16_ru(Input1); +} + +void testUll2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ull2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ull2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ull2bfloat16_rz(float *const Result, + unsigned long long Input1) { + *Result = __ull2bfloat16_rz(Input1); +} + +void testUll2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ull2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ull2bfloat16_rz", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ushort2bfloat16_rd(float *const Result, unsigned short Input1) { + *Result = __ushort2bfloat16_rd(Input1); +} + +void testUshort2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ushort2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ushort2bfloat16_rd", vector{(int)TestCase.first}, + 
TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ushort2bfloat16_rn(float *const Result, unsigned short Input1) { + *Result = __ushort2bfloat16_rn(Input1); +} + +void testUshort2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ushort2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ushort2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ushort2bfloat16_ru(float *const Result, unsigned short Input1) { + *Result = __ushort2bfloat16_ru(Input1); +} + +void testUshort2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ushort2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ushort2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ushort2bfloat16_rz(float *const Result, unsigned short Input1) { + *Result = __ushort2bfloat16_rz(Input1); +} + +void testUshort2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ushort2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ushort2bfloat16_rz", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ushort_as_bfloat16(float *const Result, unsigned short Input1) { + *Result = __ushort_as_bfloat16(Input1); +} + +void testUshort_as_bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ushort_as_bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ushort_as_bfloat16", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +int main() { + testBfloat1622float2Cases({ + {{-0.3, -0.5}, {{-0.30078125, -0.5}, 16}}, + {{0.3, 0.5}, {{0.30078125, 0.5}, 16}}, + {{30, 50}, {{30, 50}, 14}}, + {{0.432643, 0.23654}, {{0.43359375, 0.236328125}, 16}}, + }); + testBfloat162floatCases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + }); + testBfloat162int_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162int_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162int_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162int_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ll_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ll_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ll_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ll_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162short_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162short_rnCases({ 
+ {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162short_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162short_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162uint_rdCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162uint_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162uint_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162uint_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ull_rdCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ull_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ull_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ull_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ushort_rdCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ushort_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ushort_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ushort_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat16_as_shortCases({ + {-0.3, -16742}, + {0.3, 16026}, + {30, 16880}, + {0.432643, 16094}, + {1, 16256}, + {10.7, 16683}, + }); + testBfloat16_as_ushortCases({ + {-0.3, 48794}, + {0.3, 16026}, + {30, 16880}, + {0.432643, 16094}, + {1, 16256}, + {10.7, 16683}, + }); + testFloat2bfloat16Cases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); + testFloat2bfloat16_rdCases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.298828125, 16}}, + {30, {30, 14}}, + {0.432643, {0.431640625, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); + testFloat2bfloat16_rnCases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); + testFloat2bfloat16_ruCases({ + {-0.3, {-0.298828125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + {1, {1, 15}}, + {10.7, {10.75, 15}}, + }); + testFloat2bfloat16_rzCases({ + {-0.3, {-0.298828125, 16}}, + {0.3, {0.298828125, 16}}, + {30, {30, 14}}, + {0.432643, {0.431640625, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); + testInt2bfloat16_rdCases({ + {-10000, {-10048, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testInt2bfloat16_rnCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testInt2bfloat16_ruCases({ + {-10000, {-9984, 12}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-2992, 12}}, + }); + testInt2bfloat16_rzCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, 
+ {-3000, {-2992, 12}}, + }); + testLl2bfloat16_rdCases({ + {-10000, {-10048, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testLl2bfloat16_rnCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testLl2bfloat16_ruCases({ + {-10000, {-9984, 12}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-2992, 12}}, + }); + testLl2bfloat16_rzCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {-2992, 12}}, + }); + testShort2bfloat16_rdCases({ + {-10000, {-10048, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testShort2bfloat16_rnCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testShort2bfloat16_ruCases({ + {-10000, {-9984, 12}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-2992, 12}}, + }); + testShort2bfloat16_rzCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {-2992, 12}}, + }); + testShort_as_bfloat16Cases({ + {-10000, {-2111062325329920.0, 0}}, + {10000, {0.000000000000001998401444325282, 30}}, + {30000, {223106505640168374663419764146176.0, -17}}, + {3000, {0.00000000000000000000000000000007087422195345028, 47}}, + {1000, {0.0000000000000000000000000000000000013635734469538535, 52}}, + {-3000, {-63382530011411470074835160268800.0, -16}}, + }); + testUint2bfloat16_rdCases({ + {-10000, {4278190080.0, 6}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {4278190080.0, 6}}, + }); + testUint2bfloat16_rnCases({ + {-10000, {4294967296.0, 6}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {4294967296.0, 6}}, + }); + testUint2bfloat16_ruCases({ + {-10000, {4294967296.0, 6}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {4294967296.0, 6}}, + }); + testUint2bfloat16_rzCases({ + {-10000, {4278190080.0, 6}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {4278190080.0, 6}}, + }); + testUll2bfloat16_rdCases({ + {-10000, {18374686479671623680.0, -4}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {18374686479671623680.0, -4}}, + }); + testUll2bfloat16_rnCases({ + {-10000, {18446744073709551616.0, -4}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {18446744073709551616.0, -4}}, + }); + testUll2bfloat16_ruCases({ + {-10000, {18446744073709551616.0, -4}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {18446744073709551616.0, -4}}, + }); + testUll2bfloat16_rzCases({ + {-10000, {18374686479671623680.0, -4}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {18374686479671623680.0, -4}}, + }); + testUshort2bfloat16_rdCases({ + {-10000, {55296.0, 11}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + 
{-3000, {62464.0, 11}}, + }); + testUshort2bfloat16_rnCases({ + {-10000, {55552.0, 11}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {62464.0, 11}}, + }); + testUshort2bfloat16_ruCases({ + {-10000, {55552.0, 11}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {62720.0, 11}}, + }); + testUshort2bfloat16_rzCases({ + {-10000, {55296.0, 11}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {62464.0, 11}}, + }); + testUshort_as_bfloat16Cases({ + {-10000, {-2111062325329920.0, 0}}, + {10000, {0.0000000000000019984014443252817727625, 30}}, + {30000, {223106505640168374663419764146176.0, -17}}, + {3000, {0.00000000000000000000000000000007087422195345028, 47}}, + {1000, {0.0000000000000000000000000000000000013635734469538535, 52}}, + {-3000, {-63382530011411470074835160268800.0, -16}}, + }); + cout << "passed " << passed << "/" << passed + failed << " cases!" << endl; + if (failed) { + cout << "failed!" << endl; + } + return failed; +} diff --git a/features/feature_case/math/math-ext-double.cu b/features/feature_case/math/math-ext-double.cu index f9ca2cbb7..73074a5c8 100644 --- a/features/feature_case/math/math-ext-double.cu +++ b/features/feature_case/math/math-ext-double.cu @@ -11,8 +11,11 @@ #include #include +#include "cuda_bf16.h" + using namespace std; +typedef pair<__nv_bfloat16, int> bf16i_pair; typedef vector d_vector; typedef pair di_pair; @@ -29,6 +32,27 @@ void check(bool IsPassed) { } } +void checkResult(const string &FuncName, const vector &Inputs, + const float &Expect, const float &Result, + const int precision) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << fixed << setprecision(precision < 0 ? 
0 : precision) + << Result << " (expect " << Expect - pow(10, -precision) << " ~ " + << Expect + pow(10, -precision) << ")"; + cout.unsetf(ios::fixed); + check(abs(Result - Expect) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const __nv_bfloat16 &Expect, const float &Result, + const int precision) { + float FExpect = __bfloat162float(Expect); + checkResult(FuncName, Inputs, FExpect, Result, precision); +} + template void checkResult(const string &FuncName, const vector &Inputs, const double &Expect, const double &DeviceResult, @@ -44,6 +68,26 @@ void checkResult(const string &FuncName, const vector &Inputs, check(abs(DeviceResult - Expect) < pow(10, -precision)); } +// Bfloat16 Precision Conversion and Data Movement + +__global__ void double2bfloat16(float *const Result, double Input1) { + *Result = __double2bfloat16(Input1); +} + +void testDouble2bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + double2bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__double2bfloat16", {(float)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +// Double Precision Mathematical Functions + __global__ void cylBesselI0(double *const Result, double Input1) { *Result = cyl_bessel_i0(Input1); } @@ -311,6 +355,14 @@ void testY1Cases(const vector> &TestCases) { } int main() { + testDouble2bfloat16Cases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); testCylBesselI0Cases({ {0.3, {1.022626879351597, 15}}, {0.5, {1.063483370741324, 15}}, diff --git a/features/features.xml b/features/features.xml index bc7cdc62d..871c61544 100644 --- a/features/features.xml +++ b/features/features.xml @@ -123,6 +123,7 @@ + diff --git a/features/test_feature.py b/features/test_feature.py index ecde1e279..15df20162 100644 --- a/features/test_feature.py +++ b/features/test_feature.py @@ -31,8 +31,8 @@ 'cub_device_reduce_arg', 'cub_device_seg_sort_pairs', 'cub_intrinsic', 'cub_device_seg_sort_keys', 'thrust-math1', 'thrust-math2', 'cub_transform_iterator', 'activemask', 'complex', 'thrust-math', 'libcu_array', 'libcu_complex', 'libcu_tuple', 'user_defined_rules', 'math-exec', 'math-habs', 'math-emu-double', 'math-emu-float', 'math-emu-half', 'math-emu-half-after11', 'math-emu-half2', 'math-emu-half2-after11', 'math-emu-half2-after12', 'math-emu-simd', - 'math-emu-bf16', 'math-emu-bf16-after12', 'math-emu-bf162', 'math-experimental-bf16', 'math-experimental-bf162', "math-half-raw", - 'math-ext-double', 'math-ext-float', 'math-ext-half', 'math-ext-half-after11', 'math-ext-half2', 'math-ext-half2-after11', 'math-ext-simd', 'cudnn-activation', + 'math-emu-bf16', 'math-emu-bf162-after12', 'math-emu-bf162', 'math-experimental-bf16', 'math-experimental-bf162', "math-half-raw", + 'math-ext-bf16-conv', 'math-ext-double', 'math-ext-float', 'math-ext-half', 'math-ext-half-after11', 'math-ext-half2', 'math-ext-half2-after11', 'math-ext-simd', 'cudnn-activation', 'cudnn-fill', 'cudnn-lrn', 'cudnn-memory', 'cudnn-pooling', 'cudnn-reorder', 'cudnn-scale', 'cudnn-softmax', 'cudnn-sum', 'math-funnelshift', 'thrust-sort_by_key', 'thrust-find', 'thrust-inner_product', 'thrust-reduce_by_key', 'math-bf16-conv', 'math-half-conv', @@ -79,8 +79,6 @@ def migrate_test(): logical_group_exper = ['cooperative_groups', 
'cooperative_groups_thread_group'] experimental_bfloat16_tests = ['math-experimental-bf16', 'math-experimental-bf162'] - math_extension_tests = ['math-ext-double', 'math-ext-float', 'math-ext-half', 'math-ext-half-after11', 'math-ext-half2', 'math-ext-half2-after11', 'math-ext-simd'] - if test_config.current_test in nd_range_bar_exper: src.append(' --use-experimental-features=nd_range_barrier ') if test_config.current_test == "user_defined_rules": @@ -89,7 +87,7 @@ def migrate_test(): src.append(' --use-experimental-features=logical-group ') if test_config.current_test == 'math_intel_specific': src.append(' --rule-file=./math_intel_specific/intel_specific_math.yaml') - if test_config.current_test in math_extension_tests: + if test_config.current_test.startswith('math-ext-'): src.append(' --use-dpcpp-extensions=intel_device_math') if test_config.current_test in occupancy_calculation_exper: src.append(' --use-experimental-features=occupancy-calculation ')
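
For reference, the bf16 expectations in the test tables above can be reproduced on the host with a minimal sketch of the float-to-bf16 rounding (assuming IEEE-754 binary32, round-to-nearest-even for the _rn flavor, truncation for _rz, and ignoring NaN/overflow handling; bf16_rn and bf16_rz are hypothetical helpers used only for illustration, not part of this patch):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Round a binary32 float to bf16 precision (8 significand bits), nearest-even.
static float bf16_rn(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));   // reinterpret the float as raw bits
  uint32_t lsb = (bits >> 16) & 1u;       // last bit that bf16 keeps (ties-to-even)
  bits += 0x7FFFu + lsb;                  // round to nearest, ties to even
  bits &= 0xFFFF0000u;                    // keep only the top 16 bits (bf16)
  float y;
  std::memcpy(&y, &bits, sizeof(y));
  return y;
}

// Round toward zero: simply drop the low 16 bits.
static float bf16_rz(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0xFFFF0000u;
  float y;
  std::memcpy(&y, &bits, sizeof(y));
  return y;
}

int main() {
  std::printf("%.9f\n", bf16_rn(0.3f));   // 0.300781250, cf. __float2bfloat16_rn
  std::printf("%.9f\n", bf16_rz(0.3f));   // 0.298828125, cf. __float2bfloat16_rz
  std::printf("%.9f\n", bf16_rn(10.7f));  // 10.687500000
  float r = bf16_rn(0.3f);
  uint32_t rb;
  std::memcpy(&rb, &r, sizeof(rb));
  std::printf("%u\n", rb >> 16);          // 16026 (0x3E9A), cf. __bfloat16_as_ushort
  return 0;
}

The same 8-significand-bit rounding explains the integer conversion cases: 10000 lies in [8192, 16384), where bf16 can only represent multiples of 64, so _rd/_rn/_rz give 9984 and _ru gives 10048, matching the expectations above.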