From e501e42938c0f3d6176adc7f6915f1ecec3bcdfd Mon Sep 17 00:00:00 2001 From: "Tang, Jiajun" Date: Tue, 26 Sep 2023 11:07:11 +0800 Subject: [PATCH] [SYCLomatic #1351] Add 67 bf16 precision conversion tests. Signed-off-by: Tang, Jiajun jiajun.tang@intel.com --- features/feature_case/math/math-bf16-conv.cu | 968 +++++++++- features/feature_case/math/math-emu-double.cu | 52 + .../feature_case/math/math-ext-bf16-conv.cu | 1550 +++++++++++++++++ features/feature_case/math/math-ext-double.cu | 52 + features/features.xml | 1 + features/test_feature.py | 8 +- 6 files changed, 2601 insertions(+), 30 deletions(-) create mode 100644 features/feature_case/math/math-ext-bf16-conv.cu diff --git a/features/feature_case/math/math-bf16-conv.cu b/features/feature_case/math/math-bf16-conv.cu index 9612d5ba3..e9cdb6bd9 100644 --- a/features/feature_case/math/math-bf16-conv.cu +++ b/features/feature_case/math/math-bf16-conv.cu @@ -47,6 +47,23 @@ void checkResult(const string &FuncName, const vector &Inputs, check(abs(Result - Expect) < pow(10, -precision)); } +void checkResult(const string &FuncName, const vector &Inputs, + const float2 &Expect, const float2 &Result, + const int precision) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << fixed << setprecision(precision) << "{" << Result.x << ", " + << Result.y << "} (expect {" << Expect.x - pow(10, -precision) << " ~ " + << Expect.x + pow(10, -precision) << ", " + << Expect.y - pow(10, -precision) << " ~ " + << Expect.y + pow(10, -precision) << ")"; + cout.unsetf(ios::fixed); + check(abs(Result.x - Expect.x) < pow(10, -precision) && + abs(Result.y - Expect.y) < pow(10, -precision)); +} + void checkResult(const string &FuncName, const vector &Inputs, const float2 &Expect, const float2 &Result, const int precision) { @@ -64,15 +81,42 @@ void checkResult(const string &FuncName, const vector &Inputs, abs(Result.y - Expect.y) < pow(10, -precision)); } -void checkResult(const string &FuncName, const vector<__nv_bfloat16> &Inputs, - const __nv_bfloat16 &Expect, const float &Result, +void checkResult(const string &FuncName, const vector &Inputs, + const float &Expect, const float &Result, const int precision) { - vector FInputs; - for (const auto &it : Inputs) { - FInputs.push_back(__bfloat162float(it)); + cout << FuncName << "({" << Inputs[0].x << ", " << Inputs[0].y << "}"; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", {" << Inputs[i].x << ", " << Inputs[i].y << "}"; } + cout << ") = " << fixed << setprecision(precision) << Result << " (expect " + << Expect - pow(10, -precision) << " ~ " << Expect + pow(10, -precision) + << ")"; + cout.unsetf(ios::fixed); + check(abs(Result - Expect) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const int &Expect, const int &Result) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << Result << " (expect " << Expect << ")"; + check(Result == Expect); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const __nv_bfloat16 &Expect, const float &Result, + const int precision) { float FExpect = __bfloat162float(Expect); - checkResult(FuncName, FInputs, FExpect, Result, precision); + checkResult(FuncName, Inputs, FExpect, Result, precision); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const __nv_bfloat162 &Expect, const float2 &Result, + 
const int precision) { + float2 FExpect{__bfloat162float(Expect.x), __bfloat162float(Expect.y)}; + checkResult(FuncName, Inputs, FExpect, Result, precision); } void checkResult(const string &FuncName, const vector<__nv_bfloat162> &Inputs, @@ -102,6 +146,16 @@ void checkResult(const string &FuncName, const vector<__nv_bfloat162> &Inputs, checkResult(FuncName, FInputs, Expect, Result, precision); } +void checkResult(const string &FuncName, const vector<__nv_bfloat162> &Inputs, + const float &Expect, const float &Result, + const int precision) { + vector FInputs; + for (const auto &it : Inputs) { + FInputs.push_back({__bfloat162float(it.x), __bfloat162float(it.y)}); + } + checkResult(FuncName, FInputs, Expect, Result, precision); +} + __global__ void setValue(__nv_bfloat16 *Input1, const __nv_bfloat16 Input2) { *Input1 = Input2; } @@ -110,18 +164,18 @@ __global__ void setValue(__nv_bfloat162 *Input1, const __nv_bfloat162 Input2) { *Input1 = Input2; } -__global__ void bFloat1622float2(float *const Result, __nv_bfloat162 Input1) { +__global__ void bfloat1622float2(float *const Result, __nv_bfloat162 Input1) { auto ret = __bfloat1622float2(Input1); Result[0] = ret.x; Result[1] = ret.y; } -void testBFloat1622float2Cases( +void testBfloat1622float2Cases( const vector> &TestCases) { float *Result; cudaMallocManaged(&Result, 2 * sizeof(*Result)); for (const auto &TestCase : TestCases) { - bFloat1622float2<<<1, 1>>>(Result, TestCase.first); + bfloat1622float2<<<1, 1>>>(Result, TestCase.first); cudaDeviceSynchronize(); checkResult("__bfloat1622float2", {TestCase.first}, TestCase.second.first, {Result[0], Result[1]}, TestCase.second.second); @@ -134,37 +188,444 @@ void testBFloat1622float2Cases( } } -__global__ void bFloat162float(float *const Result, __nv_bfloat16 Input1) { +__global__ void bfloat162bfloat162(float *const Result, __nv_bfloat16 Input1) { + auto ret = __bfloat162bfloat162(Input1); + Result[0] = ret.x; + Result[1] = ret.y; +} + +void testBfloat162bfloat162Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162bfloat162<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162bfloat162", vector{TestCase.first}, + TestCase.second.first, {Result[0], Result[1]}, + TestCase.second.second); + } +} + +__global__ void bfloat162float(float *const Result, __nv_bfloat16 Input1) { *Result = __bfloat162float(Input1); } -void testBFloat162floatCases( +void testBfloat162floatCases( const vector> &TestCases) { float *Result; cudaMallocManaged(&Result, sizeof(*Result)); for (const auto &TestCase : TestCases) { - bFloat162float<<<1, 1>>>(Result, TestCase.first); + bfloat162float<<<1, 1>>>(Result, TestCase.first); cudaDeviceSynchronize(); - checkResult("__bfloat162float", {TestCase.first}, TestCase.second.first, - *Result, TestCase.second.second); + checkResult("__bfloat162float", std::vector{TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); *Result = __bfloat162float(TestCase.first); - checkResult("(host)__bfloat162float", {TestCase.first}, + checkResult("(host)__bfloat162float", std::vector{TestCase.first}, TestCase.second.first, *Result, TestCase.second.second); } } -__global__ void float22bFloat162_rn(float *const Result, float2 Input1) { +__global__ void bfloat162int_rd(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rd(Input1); +} + +void testBfloat162int_rdCases( + const vector> &TestCases) { + int *Result; + 
cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_rn(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rn(Input1); +} + +void testBfloat162int_rnCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_ru(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_ru(Input1); +} + +void testBfloat162int_ruCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_rz(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rz(Input1); +} + +void testBfloat162int_rzCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ll_rd(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rd(Input1); +} + +void testBfloat162ll_rdCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rd", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_rn(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rn(Input1); +} + +void testBfloat162ll_rnCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rn", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_ru(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_ru(Input1); +} + +void testBfloat162ll_ruCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_ru", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_rz(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rz(Input1); +} + +void testBfloat162ll_rzCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rz", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162short_rd(short *const Result, __nv_bfloat16 
Input1) { + *Result = __bfloat162short_rd(Input1); +} + +void testBfloat162short_rdCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_rn(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_rn(Input1); +} + +void testBfloat162short_rnCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_ru(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_ru(Input1); +} + +void testBfloat162short_ruCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_rz(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_rz(Input1); +} + +void testBfloat162short_rzCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rd(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rd(Input1); +} + +void testBfloat162uint_rdCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rn(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rn(Input1); +} + +void testBfloat162uint_rnCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_ru(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_ru(Input1); +} + +void testBfloat162uint_ruCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rz(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rz(Input1); +} + +void testBfloat162uint_rzCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + 
bfloat162uint_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rd(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_rd(Input1); +} + +void testBfloat162ull_rdCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rn(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_rn(Input1); +} + +void testBfloat162ull_rnCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_ru(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_ru(Input1); +} + +void testBfloat162ull_ruCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rz(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_rz(Input1); +} + +void testBfloat162ull_rzCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rd(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rd(Input1); +} + +void testBfloat162ushort_rdCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rn(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rn(Input1); +} + +void testBfloat162ushort_rnCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_ru(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_ru(Input1); +} + +void testBfloat162ushort_ruCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + 
checkResult("__bfloat162ushort_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rz(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rz(Input1); +} + +void testBfloat162ushort_rzCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void float22bfloat162_rn(float *const Result, float2 Input1) { auto ret = __float22bfloat162_rn(Input1); Result[0] = __bfloat162float(ret.x); Result[1] = __bfloat162float(ret.y); } -void testFloat22bFloat162_rnCases( +void testFloat22bfloat162_rnCases( const vector> &TestCases) { float *Result; cudaMallocManaged(&Result, 2 * sizeof(*Result)); for (const auto &TestCase : TestCases) { - float22bFloat162_rn<<<1, 1>>>(Result, TestCase.first); + float22bfloat162_rn<<<1, 1>>>(Result, TestCase.first); cudaDeviceSynchronize(); checkResult("__float22bfloat162_rn", {TestCase.first}, TestCase.second.first, {Result[0], Result[1]}, @@ -178,15 +639,15 @@ void testFloat22bFloat162_rnCases( } } -__global__ void float2bFloat16(float *const Result, float Input1) { +__global__ void float2bfloat16(float *const Result, float Input1) { *Result = __bfloat162float(__float2bfloat16(Input1)); } -void testFloat2bFloat16Cases(const vector> &TestCases) { +void testFloat2bfloat16Cases(const vector> &TestCases) { float *Result; cudaMallocManaged(&Result, sizeof(*Result)); for (const auto &TestCase : TestCases) { - float2bFloat16<<<1, 1>>>(Result, TestCase.first); + float2bfloat16<<<1, 1>>>(Result, TestCase.first); cudaDeviceSynchronize(); checkResult("__float2bfloat16", {TestCase.first}, TestCase.second.first, *Result, TestCase.second.second); @@ -196,6 +657,139 @@ void testFloat2bFloat16Cases(const vector> &TestCases) { } } +__global__ void float2bfloat162_rn(float *const Result, float Input1) { + auto ret = __float2bfloat162_rn(Input1); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testFloat2bfloat162_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat162_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat162_rn", vector{TestCase.first}, + TestCase.second.first, {Result[0], Result[1]}, + TestCase.second.second); + auto ret = __float2bfloat162_rn(TestCase.first); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); + checkResult("(host)__float2bfloat162_rn", vector{TestCase.first}, + TestCase.second.first, {Result[0], Result[1]}, + TestCase.second.second); + } +} + +__global__ void floats2bfloat162_rn(float *const Result, float Input1, + float Input2) { + auto ret = __floats2bfloat162_rn(Input1, Input2); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testFloats2bfloat162_rnCases( + const vector, bf162i_pair>> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + floats2bfloat162_rn<<<1, 1>>>(Result, TestCase.first.first, + TestCase.first.second); + cudaDeviceSynchronize(); + checkResult("__floats2bfloat162_rn", + vector{TestCase.first.first, TestCase.first.second}, + TestCase.second.first, 
{Result[0], Result[1]}, + TestCase.second.second); + auto ret = + __floats2bfloat162_rn(TestCase.first.first, TestCase.first.second); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); + checkResult("(host)__floats2bfloat162_rn", + vector{TestCase.first.first, TestCase.first.second}, + TestCase.second.first, {Result[0], Result[1]}, + TestCase.second.second); + } +} + +__global__ void halves2bfloat162(float *const Result, __nv_bfloat16 Input1, + __nv_bfloat16 Input2) { + auto ret = __halves2bfloat162(Input1, Input2); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testHalves2bfloat162Cases( + const vector, bf162i_pair>> + &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + halves2bfloat162<<<1, 1>>>(Result, TestCase.first.first, + TestCase.first.second); + cudaDeviceSynchronize(); + checkResult( + "__halves2bfloat162", {TestCase.first.first, TestCase.first.second}, + TestCase.second.first, {Result[0], Result[1]}, TestCase.second.second); + } +} + +__global__ void high2bfloat16(float *const Result, __nv_bfloat162 Input1) { + *Result = __bfloat162float(__high2bfloat16(Input1)); +} + +void testHigh2bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + high2bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__high2bfloat16", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void high2bfloat162(float *const Result, __nv_bfloat162 Input1) { + auto ret = __high2bfloat162(Input1); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testHigh2bfloat162Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + high2bfloat162<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__high2bfloat162", {TestCase.first}, TestCase.second.first, + {Result[0], Result[1]}, TestCase.second.second); + } +} + +__global__ void highs2bfloat162(float *const Result, __nv_bfloat162 Input1, + __nv_bfloat162 Input2) { + auto ret = __highs2bfloat162(Input1, Input2); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testHighs2bfloat162Cases( + const vector, bf162i_pair>> + &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + highs2bfloat162<<<1, 1>>>(Result, TestCase.first.first, + TestCase.first.second); + cudaDeviceSynchronize(); + checkResult( + "__highs2bfloat162", {TestCase.first.first, TestCase.first.second}, + TestCase.second.first, {Result[0], Result[1]}, TestCase.second.second); + } +} + __global__ void ldca(float *const Result, __nv_bfloat16 *Input1) { *Result = __ldca(Input1); } @@ -436,6 +1030,62 @@ void testLdluCases(const vector> &TestCases) { } } +__global__ void low2bfloat16(float *const Result, __nv_bfloat162 Input1) { + *Result = __bfloat162float(__low2bfloat16(Input1)); +} + +void testLow2bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + low2bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__low2bfloat16", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} 
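Reviewer note (illustrative aside, not part of the patch): the bf16 expectations used throughout these cases, e.g. 0.30078125 for an input of 0.3, follow from rounding a binary32 value to its top 16 bits with round-to-nearest-even. A minimal standalone sketch of that rounding, using a hypothetical helper name ref_float2bfloat16_rn (not an API from this patch or from cuda_bf16.h), is:

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical reference helper: emulate float -> bfloat16 -> float with
// round-to-nearest-even, entirely on the host and without CUDA headers.
static float ref_float2bfloat16_rn(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));   // view the float as raw bits
  uint32_t keep_lsb = (bits >> 16) & 1u;  // lowest bit that bf16 keeps
  bits += 0x7FFFu + keep_lsb;             // add rounding bias (ties to even)
  bits &= 0xFFFF0000u;                    // drop the 16 discarded mantissa bits
  float r;
  std::memcpy(&r, &bits, sizeof(r));
  return r;
}

int main() {
  // 0.3f is 0x3E99999A; rounding to bf16 gives 0x3E9A0000 == 0.30078125f,
  // which is the tolerance-checked expectation used by these tests.
  assert(ref_float2bfloat16_rn(0.3f) == 0.30078125f);
  return 0;
}

The sketch does not treat NaN payloads specially; it is only meant to show where the expected constants in main() below come from.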
+ +__global__ void low2bfloat162(float *const Result, __nv_bfloat162 Input1) { + auto ret = __low2bfloat162(Input1); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testLow2bfloat162Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + low2bfloat162<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__low2bfloat162", {TestCase.first}, TestCase.second.first, + {Result[0], Result[1]}, TestCase.second.second); + } +} + +__global__ void lows2bfloat162(float *const Result, __nv_bfloat162 Input1, + __nv_bfloat162 Input2) { + auto ret = __lows2bfloat162(Input1, Input2); + Result[0] = __bfloat162float(ret.x); + Result[1] = __bfloat162float(ret.y); +} + +void testLows2bfloat162Cases( + const vector, bf162i_pair>> + &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + lows2bfloat162<<<1, 1>>>(Result, TestCase.first.first, + TestCase.first.second); + cudaDeviceSynchronize(); + checkResult( + "__lows2bfloat162", {TestCase.first.first, TestCase.first.second}, + TestCase.second.first, {Result[0], Result[1]}, TestCase.second.second); + } +} + __global__ void stcg(float *const Result, __nv_bfloat16 Input1, __nv_bfloat16 *const Temp) { __stcg(Temp, Input1); @@ -593,30 +1243,274 @@ void testStwtCases(const vector> &TestCases) { } int main() { - testBFloat1622float2Cases({ + testBfloat1622float2Cases({ {{-0.3, -0.5}, {{-0.30078125, -0.5}, 16}}, {{0.3, 0.5}, {{0.30078125, 0.5}, 16}}, {{30, 50}, {{30, 50}, 14}}, {{0.432643, 0.23654}, {{0.43359375, 0.236328125}, 16}}, }); - testBFloat162floatCases({ + testBfloat162bfloat162Cases({ + {-0.3, {{-0.30078125, -0.30078125}, 16}}, + {0.5, {{0.5, 0.5}, 16}}, + {30, {{30, 30}, 14}}, + {0.432643, {{0.43359375, 0.43359375}, 16}}, + {1, {{1, 1}, 15}}, + {100.6, {{100.5, 100.5}, 14}}, + }); + testBfloat162floatCases({ {-0.3, {-0.30078125, 16}}, {0.3, {0.30078125, 16}}, {30, {30, 14}}, {0.432643, {0.43359375, 16}}, }); - testFloat22bFloat162_rnCases({ + testBfloat162int_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162int_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162int_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162int_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ll_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ll_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ll_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ll_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162short_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162short_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162short_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162short_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162uint_rdCases({ + // {-0.3, 0}, + {0.3, 
0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162uint_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162uint_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162uint_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ull_rdCases({ + // {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ull_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ull_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ull_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ushort_rdCases({ + // {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ushort_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ushort_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ushort_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testFloat22bfloat162_rnCases({ {{-0.3, -0.5}, {{-0.30078125, -0.5}, 16}}, {{0.3, 0.5}, {{0.30078125, 0.5}, 16}}, {{30, 50}, {{30, 50}, 14}}, {{0.432643, 0.23654}, {{0.43359375, 0.236328125}, 16}}, }); - testFloat2bFloat16Cases({ + testFloat2bfloat16Cases({ {-0.3, {-0.30078125, 16}}, {0.3, {0.30078125, 16}}, {30, {30, 14}}, {0.432643, {0.43359375, 16}}, }); + testFloat2bfloat162_rnCases({ + {-0.3, {{-0.30078125, -0.30078125}, 16}}, + {0.5, {{0.5, 0.5}, 16}}, + {30, {{30, 30}, 14}}, + {0.432643, {{0.43359375, 0.43359375}, 16}}, + }); + testFloats2bfloat162_rnCases({ + {{-0.3, -0.5}, {{-0.30078125, -0.5}, 16}}, + {{0.3, 0.5}, {{0.30078125, 0.5}, 16}}, + {{30, 50}, {{30, 50}, 14}}, + {{0.432643, 0.23654}, {{0.43359375, 0.236328125}, 16}}, + }); + testHalves2bfloat162Cases({ + {{-0.3, -0.5}, {{-0.30078125, -0.5}, 16}}, + {{0.3, 0.5}, {{0.30078125, 0.5}, 16}}, + {{30, 50}, {{30, 50}, 14}}, + {{0.432643, 0.23654}, {{0.43359375, 0.236328125}, 16}}, + {{1, 5000}, {{1, 4992}, 12}}, + {{10.7, 3000000}, {{10.6875, 2998272}, 9}}, + }); + testHigh2bfloat16Cases({ + {{-0.3, -0.5}, {-0.5, 16}}, + {{0.3, 0.5}, {0.5, 16}}, + {{30, 50}, {50, 14}}, + {{0.432643, 0.23654}, {0.236328125, 16}}, + {{1, 5000}, {4992, 12}}, + {{10.7, 3000000}, {2998272, 9}}, + }); + testHigh2bfloat162Cases({ + {{-0.3, -0.5}, {{-0.5, -0.5}, 16}}, + {{0.3, 0.5}, {{0.5, 0.5}, 16}}, + {{30, 50}, {{50, 50}, 14}}, + {{0.432643, 0.23654}, {{0.236328125, 0.236328125}, 16}}, + {{1, 5000}, {{4992, 4992}, 12}}, + {{10.7, 3000000}, {{2998272, 2998272}, 9}}, + }); + testHighs2bfloat162Cases({ + {{{-0.3, -0.5}, {10.7, 3000000}}, {{-0.5, 2998272}, 9}}, + {{{0.3, 0.5}, {-0.3, -0.5}}, {{0.5, -0.5}, 16}}, + {{{30, 50}, {0.3, 0.5}}, {{50, 0.5}, 14}}, + {{{0.432643, 0.23654}, {30, 50}}, {{0.236328125, 50}, 14}}, + {{{1, 5000}, {0.432643, 0.23654}}, {{4992, 0.236328125000}, 12}}, + {{{10.7, 3000000}, {1, 5000}}, {{2998272, 4992}, 9}}, + }); testLdcaCases({ {-0.3, 16}, {-0.4, 16}, @@ -701,6 +1595,30 @@ int main() { {{1, 100.6}, 14}, {{100.6, 1}, 14}, }); + testLow2bfloat16Cases({ + {{-0.3, -0.5}, {-0.30078125, 16}}, + {{0.3, 0.5}, {0.30078125, 16}}, + {{30, 50}, {30, 14}}, + {{0.432643, 0.23654}, {0.43359375, 16}}, + {{1, 5000}, {1, 15}}, + {{10.7, 
3000000}, {10.6875, 15}}, + }); + testLow2bfloat162Cases({ + {{-0.3, -0.5}, {{-0.30078125, -0.30078125}, 16}}, + {{0.3, 0.5}, {{0.30078125, 0.30078125}, 16}}, + {{30, 50}, {{30, 30}, 14}}, + {{0.432643, 0.23654}, {{0.43359375, 0.43359375}, 16}}, + {{1, 5000}, {{1, 1}, 15}}, + {{10.7, 3000000}, {{10.6875, 10.6875}, 15}}, + }); + testLows2bfloat162Cases({ + {{{-0.3, -0.5}, {10.7, 3000000}}, {{-0.30078125, 10.6875}, 15}}, + {{{0.3, 0.5}, {-0.3, -0.5}}, {{0.30078125, -0.30078125}, 16}}, + {{{30, 50}, {0.3, 0.5}}, {{30, 0.30078125}, 14}}, + {{{0.432643, 0.23654}, {30, 50}}, {{0.43359375, 30}, 14}}, + {{{1, 5000}, {0.432643, 0.23654}}, {{1, 0.43359375}, 15}}, + {{{10.7, 3000000}, {1, 5000}}, {{10.6875, 1}, 15}}, + }); testStcgCases({ {-0.3, 16}, {-0.4, 16}, diff --git a/features/feature_case/math/math-emu-double.cu b/features/feature_case/math/math-emu-double.cu index a36e971e6..4109f338c 100644 --- a/features/feature_case/math/math-emu-double.cu +++ b/features/feature_case/math/math-emu-double.cu @@ -11,8 +11,11 @@ #include #include +#include "cuda_bf16.h" + using namespace std; +typedef pair<__nv_bfloat16, int> bf16i_pair; typedef vector d_vector; typedef tuple d_tuple3; typedef tuple d_tuple4; @@ -31,6 +34,27 @@ void check(bool IsPassed) { } } +void checkResult(const string &FuncName, const vector &Inputs, + const float &Expect, const float &Result, + const int precision) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << fixed << setprecision(precision) << Result << " (expect " + << Expect - pow(10, -precision) << " ~ " << Expect + pow(10, -precision) + << ")"; + cout.unsetf(ios::fixed); + check(abs(Result - Expect) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const __nv_bfloat16 &Expect, const float &Result, + const int precision) { + float FExpect = __bfloat162float(Expect); + checkResult(FuncName, Inputs, FExpect, Result, precision); +} + template void checkResult(const string &FuncName, const vector &Inputs, const double &Expect, const double &DeviceResult, @@ -50,6 +74,26 @@ __global__ void setVecValue(double *Input1, const double Input2) { *Input1 = Input2; } +// Bfloat16 Precision Conversion and Data Movement + +__global__ void double2bfloat16(float *const Result, double Input1) { + *Result = __double2bfloat16(Input1); +} + +void testDouble2bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + double2bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__double2bfloat16", {(float)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +// Double Precision Mathematical Functions + __global__ void _norm(double *const DeviceResult, int Input1, const double *Input2) { *DeviceResult = norm(Input1, Input2); @@ -257,6 +301,14 @@ void testRnorm4dCases(const vector> &TestCases) { } int main() { + testDouble2bfloat16Cases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); testNormCases({ {{-0.3, -0.34, -0.98}, {1.079814798935447, 15}}, {{0.3, 0.34, 0.98}, {1.079814798935447, 15}}, diff --git a/features/feature_case/math/math-ext-bf16-conv.cu b/features/feature_case/math/math-ext-bf16-conv.cu new file mode 100644 index 000000000..a3a1e6a45 --- /dev/null +++ 
b/features/feature_case/math/math-ext-bf16-conv.cu @@ -0,0 +1,1550 @@ +// ====---------- math-ext-bf16-conv.cu---------- *- CUDA -* --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +// ===---------------------------------------------------------------------===// + +#include +#include +#include + +#include "cuda_bf16.h" + +using namespace std; + +typedef pair f2i_pair; +typedef pair fi_pair; +typedef pair<__nv_bfloat16, int> bf16i_pair; + +int passed = 0; +int failed = 0; + +void check(bool IsPassed) { + if (IsPassed) { + cout << " ---- passed" << endl; + passed++; + } else { + cout << " ---- failed" << endl; + failed++; + } +} + +void checkResult(const string &FuncName, const vector &Inputs, + const float &Expect, const float &Result, + const int precision) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << fixed << setprecision(precision < 0 ? 0 : precision) + << Result << " (expect " << Expect - pow(10, -precision) << " ~ " + << Expect + pow(10, -precision) << ")"; + cout.unsetf(ios::fixed); + check(abs(Result - Expect) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const float2 &Expect, const float2 &Result, + const int precision) { + cout << FuncName << "({" << Inputs[0].x << ", " << Inputs[0].y << "}"; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", {" << Inputs[i].x << ", " << Inputs[i].y << "}"; + } + cout << ") = " << fixed << setprecision(precision) << "{" << Result.x << ", " + << Result.y << "} (expect {" << Expect.x - pow(10, -precision) << " ~ " + << Expect.x + pow(10, -precision) << ", " + << Expect.y - pow(10, -precision) << " ~ " + << Expect.y + pow(10, -precision) << ")"; + cout.unsetf(ios::fixed); + check(abs(Result.x - Expect.x) < pow(10, -precision) && + abs(Result.y - Expect.y) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const int &Expect, const int &Result) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << Result << " (expect " << Expect << ")"; + check(Result == Expect); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const float &Expect, const float &Result, + const int precision) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << fixed << setprecision(precision < 0 ? 
0 : precision) + << Result << " (expect " << Expect - pow(10, -precision) << " ~ " + << Expect + pow(10, -precision) << ")"; + cout.unsetf(ios::fixed); + check(abs(Result - Expect) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const __nv_bfloat16 &Expect, const float &Result, + const int precision) { + float FExpect = __bfloat162float(Expect); + checkResult(FuncName, Inputs, FExpect, Result, precision); +} + +void checkResult(const string &FuncName, const vector<__nv_bfloat16> &Inputs, + const float &Expect, const float &Result, + const int precision) { + vector FInputs; + for (const auto &it : Inputs) { + FInputs.push_back(__bfloat162float(it)); + } + checkResult(FuncName, FInputs, Expect, Result, precision); +} + +void checkResult(const string &FuncName, const vector<__nv_bfloat162> &Inputs, + const float2 &Expect, const float2 &Result, + const int precision) { + vector FInputs; + for (const auto &it : Inputs) { + FInputs.push_back({__bfloat162float(it.x), __bfloat162float(it.y)}); + } + checkResult(FuncName, FInputs, Expect, Result, precision); +} + +__global__ void bfloat1622float2(float *const Result, __nv_bfloat162 Input1) { + auto ret = __bfloat1622float2(Input1); + Result[0] = ret.x; + Result[1] = ret.y; +} + +void testBfloat1622float2Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, 2 * sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat1622float2<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat1622float2", {TestCase.first}, TestCase.second.first, + {Result[0], Result[1]}, TestCase.second.second); + auto ret = __bfloat1622float2(TestCase.first); + Result[0] = ret.x; + Result[1] = ret.y; + checkResult("(host)__bfloat1622float2", {TestCase.first}, + TestCase.second.first, {Result[0], Result[1]}, + TestCase.second.second); + } +} + +__global__ void bfloat162float(float *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162float(Input1); +} + +void testBfloat162floatCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162float<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162float", vector<__nv_bfloat16>{TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + *Result = __bfloat162float(TestCase.first); + checkResult("(host)__bfloat162float", vector<__nv_bfloat16>{TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void bfloat162int_rd(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rd(Input1); +} + +void testBfloat162int_rdCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_rn(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rn(Input1); +} + +void testBfloat162int_rnCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_ru(int 
*const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_ru(Input1); +} + +void testBfloat162int_ruCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162int_rz(int *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162int_rz(Input1); +} + +void testBfloat162int_rzCases( + const vector> &TestCases) { + int *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162int_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162int_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ll_rd(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rd(Input1); +} + +void testBfloat162ll_rdCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rd", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_rn(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rn(Input1); +} + +void testBfloat162ll_rnCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rn", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_ru(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_ru(Input1); +} + +void testBfloat162ll_ruCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_ru", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162ll_rz(long long *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162ll_rz(Input1); +} + +void testBfloat162ll_rzCases( + const vector> &TestCases) { + long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ll_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ll_rz", {TestCase.first}, TestCase.second, *Result); + } +} + +__global__ void bfloat162short_rd(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_rd(Input1); +} + +void testBfloat162short_rdCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_rn(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_rn(Input1); +} + +void testBfloat162short_rnCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rn<<<1, 1>>>(Result, TestCase.first); 
+ cudaDeviceSynchronize(); + checkResult("__bfloat162short_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_ru(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_ru(Input1); +} + +void testBfloat162short_ruCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162short_rz(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162short_rz(Input1); +} + +void testBfloat162short_rzCases( + const vector> &TestCases) { + short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162short_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162short_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rd(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rd(Input1); +} + +void testBfloat162uint_rdCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rn(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rn(Input1); +} + +void testBfloat162uint_rnCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_ru(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_ru(Input1); +} + +void testBfloat162uint_ruCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162uint_rz(unsigned *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat162uint_rz(Input1); +} + +void testBfloat162uint_rzCases( + const vector> &TestCases) { + unsigned *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162uint_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162uint_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rd(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_rd(Input1); +} + +void testBfloat162ull_rdCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rn(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = 
__bfloat162ull_rn(Input1); +} + +void testBfloat162ull_rnCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_ru(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_ru(Input1); +} + +void testBfloat162ull_ruCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ull_rz(unsigned long long *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ull_rz(Input1); +} + +void testBfloat162ull_rzCases( + const vector> &TestCases) { + unsigned long long *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ull_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ull_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rd(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rd(Input1); +} + +void testBfloat162ushort_rdCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rd", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rn(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rn(Input1); +} + +void testBfloat162ushort_rnCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rn", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_ru(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_ru(Input1); +} + +void testBfloat162ushort_ruCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_ru", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat162ushort_rz(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat162ushort_rz(Input1); +} + +void testBfloat162ushort_rzCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat162ushort_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat162ushort_rz", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat16_as_short(short *const Result, __nv_bfloat16 Input1) { + *Result = __bfloat16_as_short(Input1); +} + +void testBfloat16_as_shortCases( + const vector> &TestCases) { + short 
*Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat16_as_short<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat16_as_short", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void bfloat16_as_ushort(unsigned short *const Result, + __nv_bfloat16 Input1) { + *Result = __bfloat16_as_ushort(Input1); +} + +void testBfloat16_as_ushortCases( + const vector> &TestCases) { + unsigned short *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + bfloat16_as_ushort<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__bfloat16_as_ushort", {TestCase.first}, TestCase.second, + *Result); + } +} + +__global__ void float2bfloat16(float *const Result, float Input1) { + *Result = __float2bfloat16(Input1); +} + +void testFloat2bfloat16Cases(const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat16", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void float2bfloat16_rd(float *const Result, float Input1) { + *Result = __float2bfloat16_rd(Input1); +} + +void testFloat2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat16_rd", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void float2bfloat16_rn(float *const Result, float Input1) { + *Result = __float2bfloat16_rn(Input1); +} + +void testFloat2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat16_rn", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void float2bfloat16_ru(float *const Result, float Input1) { + *Result = __float2bfloat16_ru(Input1); +} + +void testFloat2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat16_ru", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void float2bfloat16_rz(float *const Result, float Input1) { + *Result = __float2bfloat16_rz(Input1); +} + +void testFloat2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + float2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__float2bfloat16_rz", {TestCase.first}, TestCase.second.first, + *Result, TestCase.second.second); + } +} + +__global__ void int2bfloat16_rd(float *const Result, int Input1) { + *Result = __int2bfloat16_rd(Input1); +} + +void testInt2bfloat16_rdCases(const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + int2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); 
+ cudaDeviceSynchronize(); + checkResult("__int2bfloat16_rd", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void int2bfloat16_rn(float *const Result, int Input1) { + *Result = __int2bfloat16_rn(Input1); +} + +void testInt2bfloat16_rnCases(const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + int2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__int2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void int2bfloat16_ru(float *const Result, int Input1) { + *Result = __int2bfloat16_ru(Input1); +} + +void testInt2bfloat16_ruCases(const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + int2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__int2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void int2bfloat16_rz(float *const Result, int Input1) { + *Result = __int2bfloat16_rz(Input1); +} + +void testInt2bfloat16_rzCases(const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + int2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__int2bfloat16_rz", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ll2bfloat16_rd(float *const Result, long long Input1) { + *Result = __ll2bfloat16_rd(Input1); +} + +void testLl2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ll2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ll2bfloat16_rd", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ll2bfloat16_rn(float *const Result, long long Input1) { + *Result = __ll2bfloat16_rn(Input1); +} + +void testLl2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ll2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ll2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ll2bfloat16_ru(float *const Result, long long Input1) { + *Result = __ll2bfloat16_ru(Input1); +} + +void testLl2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ll2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ll2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ll2bfloat16_rz(float *const Result, long long Input1) { + *Result = __ll2bfloat16_rz(Input1); +} + +void testLl2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ll2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ll2bfloat16_rz", vector{(int)TestCase.first}, + 
TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void short2bfloat16_rd(float *const Result, short Input1) { + *Result = __short2bfloat16_rd(Input1); +} + +void testShort2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + short2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__short2bfloat16_rd", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void short2bfloat16_rn(float *const Result, short Input1) { + *Result = __short2bfloat16_rn(Input1); +} + +void testShort2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + short2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__short2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void short2bfloat16_ru(float *const Result, short Input1) { + *Result = __short2bfloat16_ru(Input1); +} + +void testShort2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + short2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__short2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void short2bfloat16_rz(float *const Result, short Input1) { + *Result = __short2bfloat16_rz(Input1); +} + +void testShort2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + short2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__short2bfloat16_rz", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void short_as_bfloat16(float *const Result, short Input1) { + *Result = __short_as_bfloat16(Input1); +} + +void testShort_as_bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + short_as_bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__short_as_bfloat16", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void uint2bfloat16_rd(float *const Result, unsigned Input1) { + *Result = __uint2bfloat16_rd(Input1); +} + +void testUint2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + uint2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__uint2bfloat16_rd", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void uint2bfloat16_rn(float *const Result, unsigned Input1) { + *Result = __uint2bfloat16_rn(Input1); +} + +void testUint2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + uint2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__uint2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, 
*Result, TestCase.second.second); + } +} + +__global__ void uint2bfloat16_ru(float *const Result, unsigned Input1) { + *Result = __uint2bfloat16_ru(Input1); +} + +void testUint2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + uint2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__uint2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void uint2bfloat16_rz(float *const Result, unsigned Input1) { + *Result = __uint2bfloat16_rz(Input1); +} + +void testUint2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + uint2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__uint2bfloat16_rz", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ull2bfloat16_rd(float *const Result, + unsigned long long Input1) { + *Result = __ull2bfloat16_rd(Input1); +} + +void testUll2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ull2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ull2bfloat16_rd", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ull2bfloat16_rn(float *const Result, + unsigned long long Input1) { + *Result = __ull2bfloat16_rn(Input1); +} + +void testUll2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ull2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ull2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ull2bfloat16_ru(float *const Result, + unsigned long long Input1) { + *Result = __ull2bfloat16_ru(Input1); +} + +void testUll2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ull2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ull2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ull2bfloat16_rz(float *const Result, + unsigned long long Input1) { + *Result = __ull2bfloat16_rz(Input1); +} + +void testUll2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ull2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ull2bfloat16_rz", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ushort2bfloat16_rd(float *const Result, unsigned short Input1) { + *Result = __ushort2bfloat16_rd(Input1); +} + +void testUshort2bfloat16_rdCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ushort2bfloat16_rd<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ushort2bfloat16_rd", vector{(int)TestCase.first}, + 
TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ushort2bfloat16_rn(float *const Result, unsigned short Input1) { + *Result = __ushort2bfloat16_rn(Input1); +} + +void testUshort2bfloat16_rnCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ushort2bfloat16_rn<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ushort2bfloat16_rn", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ushort2bfloat16_ru(float *const Result, unsigned short Input1) { + *Result = __ushort2bfloat16_ru(Input1); +} + +void testUshort2bfloat16_ruCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ushort2bfloat16_ru<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ushort2bfloat16_ru", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ushort2bfloat16_rz(float *const Result, unsigned short Input1) { + *Result = __ushort2bfloat16_rz(Input1); +} + +void testUshort2bfloat16_rzCases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ushort2bfloat16_rz<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ushort2bfloat16_rz", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +__global__ void ushort_as_bfloat16(float *const Result, unsigned short Input1) { + *Result = __ushort_as_bfloat16(Input1); +} + +void testUshort_as_bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + ushort_as_bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__ushort_as_bfloat16", vector{(int)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +int main() { + testBfloat1622float2Cases({ + {{-0.3, -0.5}, {{-0.30078125, -0.5}, 16}}, + {{0.3, 0.5}, {{0.30078125, 0.5}, 16}}, + {{30, 50}, {{30, 50}, 14}}, + {{0.432643, 0.23654}, {{0.43359375, 0.236328125}, 16}}, + }); + testBfloat162floatCases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + }); + testBfloat162int_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162int_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162int_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162int_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ll_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ll_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ll_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ll_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162short_rdCases({ + {-0.3, -1}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162short_rnCases({ 
+ {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162short_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162short_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162uint_rdCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162uint_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162uint_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162uint_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ull_rdCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ull_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ull_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ull_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ushort_rdCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat162ushort_rnCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ushort_ruCases({ + {-0.3, 0}, + {0.3, 1}, + {30, 30}, + {0.432643, 1}, + {1, 1}, + {10.7, 11}, + }); + testBfloat162ushort_rzCases({ + {-0.3, 0}, + {0.3, 0}, + {30, 30}, + {0.432643, 0}, + {1, 1}, + {10.7, 10}, + }); + testBfloat16_as_shortCases({ + {-0.3, -16742}, + {0.3, 16026}, + {30, 16880}, + {0.432643, 16094}, + {1, 16256}, + {10.7, 16683}, + }); + testBfloat16_as_ushortCases({ + {-0.3, 48794}, + {0.3, 16026}, + {30, 16880}, + {0.432643, 16094}, + {1, 16256}, + {10.7, 16683}, + }); + testFloat2bfloat16Cases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); + testFloat2bfloat16_rdCases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.298828125, 16}}, + {30, {30, 14}}, + {0.432643, {0.431640625, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); + testFloat2bfloat16_rnCases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); + testFloat2bfloat16_ruCases({ + {-0.3, {-0.298828125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + {1, {1, 15}}, + {10.7, {10.75, 15}}, + }); + testFloat2bfloat16_rzCases({ + {-0.3, {-0.298828125, 16}}, + {0.3, {0.298828125, 16}}, + {30, {30, 14}}, + {0.432643, {0.431640625, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); + testInt2bfloat16_rdCases({ + {-10000, {-10048, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testInt2bfloat16_rnCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testInt2bfloat16_ruCases({ + {-10000, {-9984, 12}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-2992, 12}}, + }); + testInt2bfloat16_rzCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, 
+ {-3000, {-2992, 12}}, + }); + testLl2bfloat16_rdCases({ + {-10000, {-10048, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testLl2bfloat16_rnCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testLl2bfloat16_ruCases({ + {-10000, {-9984, 12}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-2992, 12}}, + }); + testLl2bfloat16_rzCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {-2992, 12}}, + }); + testShort2bfloat16_rdCases({ + {-10000, {-10048, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testShort2bfloat16_rnCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-3008, 12}}, + }); + testShort2bfloat16_ruCases({ + {-10000, {-9984, 12}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {-2992, 12}}, + }); + testShort2bfloat16_rzCases({ + {-10000, {-9984, 12}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {-2992, 12}}, + }); + testShort_as_bfloat16Cases({ + {-10000, {-2111062325329920.0, 0}}, + {10000, {0.000000000000001998401444325282, 30}}, + {30000, {223106505640168374663419764146176.0, -17}}, + {3000, {0.00000000000000000000000000000007087422195345028, 47}}, + {1000, {0.0000000000000000000000000000000000013635734469538535, 52}}, + {-3000, {-63382530011411470074835160268800.0, -16}}, + }); + testUint2bfloat16_rdCases({ + {-10000, {4278190080.0, 6}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {4278190080.0, 6}}, + }); + testUint2bfloat16_rnCases({ + {-10000, {4294967296.0, 6}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {4294967296.0, 6}}, + }); + testUint2bfloat16_ruCases({ + {-10000, {4294967296.0, 6}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {4294967296.0, 6}}, + }); + testUint2bfloat16_rzCases({ + {-10000, {4278190080.0, 6}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {4278190080.0, 6}}, + }); + testUll2bfloat16_rdCases({ + {-10000, {18374686479671623680.0, -4}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {18374686479671623680.0, -4}}, + }); + testUll2bfloat16_rnCases({ + {-10000, {18446744073709551616.0, -4}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {18446744073709551616.0, -4}}, + }); + testUll2bfloat16_ruCases({ + {-10000, {18446744073709551616.0, -4}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {18446744073709551616.0, -4}}, + }); + testUll2bfloat16_rzCases({ + {-10000, {18374686479671623680.0, -4}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {18374686479671623680.0, -4}}, + }); + testUshort2bfloat16_rdCases({ + {-10000, {55296.0, 11}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + 
{-3000, {62464.0, 11}}, + }); + testUshort2bfloat16_rnCases({ + {-10000, {55552.0, 11}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {62464.0, 11}}, + }); + testUshort2bfloat16_ruCases({ + {-10000, {55552.0, 11}}, + {10000, {10048, 12}}, + {30000, {30080, 11}}, + {3000, {3008, 12}}, + {1000, {1000, 13}}, + {-3000, {62720.0, 11}}, + }); + testUshort2bfloat16_rzCases({ + {-10000, {55296.0, 11}}, + {10000, {9984, 12}}, + {30000, {29952, 11}}, + {3000, {2992, 12}}, + {1000, {1000, 13}}, + {-3000, {62464.0, 11}}, + }); + testUshort_as_bfloat16Cases({ + {-10000, {-2111062325329920.0, 0}}, + {10000, {0.0000000000000019984014443252817727625, 30}}, + {30000, {223106505640168374663419764146176.0, -17}}, + {3000, {0.00000000000000000000000000000007087422195345028, 47}}, + {1000, {0.0000000000000000000000000000000000013635734469538535, 52}}, + {-3000, {-63382530011411470074835160268800.0, -16}}, + }); + cout << "passed " << passed << "/" << passed + failed << " cases!" << endl; + if (failed) { + cout << "failed!" << endl; + } + return failed; +} diff --git a/features/feature_case/math/math-ext-double.cu b/features/feature_case/math/math-ext-double.cu index f9ca2cbb7..73074a5c8 100644 --- a/features/feature_case/math/math-ext-double.cu +++ b/features/feature_case/math/math-ext-double.cu @@ -11,8 +11,11 @@ #include #include +#include "cuda_bf16.h" + using namespace std; +typedef pair<__nv_bfloat16, int> bf16i_pair; typedef vector d_vector; typedef pair di_pair; @@ -29,6 +32,27 @@ void check(bool IsPassed) { } } +void checkResult(const string &FuncName, const vector &Inputs, + const float &Expect, const float &Result, + const int precision) { + cout << FuncName << "(" << Inputs[0] << ""; + for (size_t i = 1; i < Inputs.size(); ++i) { + cout << ", " << Inputs[i]; + } + cout << ") = " << fixed << setprecision(precision < 0 ? 
0 : precision) + << Result << " (expect " << Expect - pow(10, -precision) << " ~ " + << Expect + pow(10, -precision) << ")"; + cout.unsetf(ios::fixed); + check(abs(Result - Expect) < pow(10, -precision)); +} + +void checkResult(const string &FuncName, const vector &Inputs, + const __nv_bfloat16 &Expect, const float &Result, + const int precision) { + float FExpect = __bfloat162float(Expect); + checkResult(FuncName, Inputs, FExpect, Result, precision); +} + template void checkResult(const string &FuncName, const vector &Inputs, const double &Expect, const double &DeviceResult, @@ -44,6 +68,26 @@ void checkResult(const string &FuncName, const vector &Inputs, check(abs(DeviceResult - Expect) < pow(10, -precision)); } +// Bfloat16 Precision Conversion and Data Movement + +__global__ void double2bfloat16(float *const Result, double Input1) { + *Result = __double2bfloat16(Input1); +} + +void testDouble2bfloat16Cases( + const vector> &TestCases) { + float *Result; + cudaMallocManaged(&Result, sizeof(*Result)); + for (const auto &TestCase : TestCases) { + double2bfloat16<<<1, 1>>>(Result, TestCase.first); + cudaDeviceSynchronize(); + checkResult("__double2bfloat16", {(float)TestCase.first}, + TestCase.second.first, *Result, TestCase.second.second); + } +} + +// Double Precision Mathematical Functions + __global__ void cylBesselI0(double *const Result, double Input1) { *Result = cyl_bessel_i0(Input1); } @@ -311,6 +355,14 @@ void testY1Cases(const vector> &TestCases) { } int main() { + testDouble2bfloat16Cases({ + {-0.3, {-0.30078125, 16}}, + {0.3, {0.30078125, 16}}, + {30, {30, 14}}, + {0.432643, {0.43359375, 16}}, + {1, {1, 15}}, + {10.7, {10.6875, 15}}, + }); testCylBesselI0Cases({ {0.3, {1.022626879351597, 15}}, {0.5, {1.063483370741324, 15}}, diff --git a/features/features.xml b/features/features.xml index bc7cdc62d..871c61544 100644 --- a/features/features.xml +++ b/features/features.xml @@ -123,6 +123,7 @@ + diff --git a/features/test_feature.py b/features/test_feature.py index ecde1e279..15df20162 100644 --- a/features/test_feature.py +++ b/features/test_feature.py @@ -31,8 +31,8 @@ 'cub_device_reduce_arg', 'cub_device_seg_sort_pairs', 'cub_intrinsic', 'cub_device_seg_sort_keys', 'thrust-math1', 'thrust-math2', 'cub_transform_iterator', 'activemask', 'complex', 'thrust-math', 'libcu_array', 'libcu_complex', 'libcu_tuple', 'user_defined_rules', 'math-exec', 'math-habs', 'math-emu-double', 'math-emu-float', 'math-emu-half', 'math-emu-half-after11', 'math-emu-half2', 'math-emu-half2-after11', 'math-emu-half2-after12', 'math-emu-simd', - 'math-emu-bf16', 'math-emu-bf16-after12', 'math-emu-bf162', 'math-experimental-bf16', 'math-experimental-bf162', "math-half-raw", - 'math-ext-double', 'math-ext-float', 'math-ext-half', 'math-ext-half-after11', 'math-ext-half2', 'math-ext-half2-after11', 'math-ext-simd', 'cudnn-activation', + 'math-emu-bf16', 'math-emu-bf162-after12', 'math-emu-bf162', 'math-experimental-bf16', 'math-experimental-bf162', "math-half-raw", + 'math-ext-bf16-conv', 'math-ext-double', 'math-ext-float', 'math-ext-half', 'math-ext-half-after11', 'math-ext-half2', 'math-ext-half2-after11', 'math-ext-simd', 'cudnn-activation', 'cudnn-fill', 'cudnn-lrn', 'cudnn-memory', 'cudnn-pooling', 'cudnn-reorder', 'cudnn-scale', 'cudnn-softmax', 'cudnn-sum', 'math-funnelshift', 'thrust-sort_by_key', 'thrust-find', 'thrust-inner_product', 'thrust-reduce_by_key', 'math-bf16-conv', 'math-half-conv', @@ -79,8 +79,6 @@ def migrate_test(): logical_group_exper = ['cooperative_groups', 
'cooperative_groups_thread_group'] experimental_bfloat16_tests = ['math-experimental-bf16', 'math-experimental-bf162'] - math_extension_tests = ['math-ext-double', 'math-ext-float', 'math-ext-half', 'math-ext-half-after11', 'math-ext-half2', 'math-ext-half2-after11', 'math-ext-simd'] - if test_config.current_test in nd_range_bar_exper: src.append(' --use-experimental-features=nd_range_barrier ') if test_config.current_test == "user_defined_rules": @@ -89,7 +87,7 @@ def migrate_test(): src.append(' --use-experimental-features=logical-group ') if test_config.current_test == 'math_intel_specific': src.append(' --rule-file=./math_intel_specific/intel_specific_math.yaml') - if test_config.current_test in math_extension_tests: + if test_config.current_test.startswith('math-ext-'): src.append(' --use-dpcpp-extensions=intel_device_math') if test_config.current_test in occupancy_calculation_exper: src.append(' --use-experimental-features=occupancy-calculation ')
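
For reference, the bf16 expectations in the test tables above can be reproduced on the host with a minimal sketch of the float-to-bf16 rounding (assuming IEEE-754 binary32, round-to-nearest-even for the _rn flavor, truncation for _rz, and ignoring NaN/overflow handling; bf16_rn and bf16_rz are hypothetical helpers used only for illustration, not part of this patch):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Round a binary32 float to bf16 precision (8 significand bits), nearest-even.
static float bf16_rn(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));   // reinterpret the float as raw bits
  uint32_t lsb = (bits >> 16) & 1u;       // last bit that bf16 keeps (ties-to-even)
  bits += 0x7FFFu + lsb;                  // round to nearest, ties to even
  bits &= 0xFFFF0000u;                    // keep only the top 16 bits (bf16)
  float y;
  std::memcpy(&y, &bits, sizeof(y));
  return y;
}

// Round toward zero: simply drop the low 16 bits.
static float bf16_rz(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0xFFFF0000u;
  float y;
  std::memcpy(&y, &bits, sizeof(y));
  return y;
}

int main() {
  std::printf("%.9f\n", bf16_rn(0.3f));   // 0.300781250, cf. __float2bfloat16_rn
  std::printf("%.9f\n", bf16_rz(0.3f));   // 0.298828125, cf. __float2bfloat16_rz
  std::printf("%.9f\n", bf16_rn(10.7f));  // 10.687500000
  float r = bf16_rn(0.3f);
  uint32_t rb;
  std::memcpy(&rb, &r, sizeof(rb));
  std::printf("%u\n", rb >> 16);          // 16026 (0x3E9A), cf. __bfloat16_as_ushort
  return 0;
}

The same 8-significand-bit rounding explains the integer conversion cases: 10000 lies in [8192, 16384), where bf16 can only represent multiples of 64, so _rd/_rn/_rz give 9984 and _ru gives 10048, matching the expectations above.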