Ensure cuda::std::bit_cast works with vector types
We want to make sure we can use `bit_cast` with suitable vector or aggregate types such as:
* `cuda::std::array`
* c-arrays
* cuda vector types like float2

Unfortunately, we cannot use it with the extended floating point vector types (`__half2`, `__nv_bfloat162`), because they are neither trivially copyable nor trivially copy assignable.
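
For context, a minimal usage sketch of what this commit enables (not part of the commit; `example_usage` is a hypothetical name, and the snippet assumes a CUDA translation unit where `float2` is available):

```cpp
#include <cuda/std/array>
#include <cuda/std/bit>

__host__ __device__ void example_usage()
{
  // cuda::std::array -> CUDA vector type (same size, both trivially copyable)
  cuda::std::array<float, 2> arr{1.0f, 2.0f};
  float2 vec = cuda::std::bit_cast<float2>(arr);

  // c-array as the source type
  float carr[2] = {3.0f, 4.0f};
  float2 vec2 = cuda::std::bit_cast<float2>(carr);

  // and back again
  auto round_trip = cuda::std::bit_cast<cuda::std::array<float, 2>>(vec);

  (void) vec2;
  (void) round_trip;

  // __half2 / __nv_bfloat162 are rejected: they are not trivially copyable,
  // so they do not satisfy the bit_cast constraint.
}
```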
miscco committed Dec 17, 2024
1 parent 19f91c8 commit 0f52938
Showing 2 changed files with 118 additions and 181 deletions.
56 changes: 6 additions & 50 deletions libcudacxx/include/cuda/std/__bit/bit_cast.h
@@ -22,8 +22,6 @@
#endif // no system header

#include <cuda/std/__fwd/array.h>
#include <cuda/std/__fwd/pair.h>
#include <cuda/std/__fwd/tuple.h>
#include <cuda/std/__tuple_dir/sfinae_helpers.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_extended_floating_point.h>
@@ -53,57 +51,15 @@ struct __is_bit_castable
_CCCL_TRAIT(is_trivially_copyable, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp);
};

template <class _Tp, size_t _Np>
struct __is_bit_castable<_Tp[_Np]>
{
static constexpr bool value = __is_bit_castable<remove_extent_t<_Tp>>::value;
};

template <class _Tp, size_t _Np>
struct __is_bit_castable<array<_Tp, _Np>>
{
static constexpr bool value = __is_bit_castable<_Tp>::value;
};

template <class _Tp, class _Up>
struct __is_bit_castable<pair<_Tp, _Up>>
{
static constexpr bool value = __is_bit_castable<_Tp>::value && __is_bit_castable<_Up>::value;
};

template <class... _Tp>
struct __is_bit_castable<tuple<_Tp...>>
{
static constexpr bool value = __all<__is_bit_castable<_Tp>::value...>::value;
};

#if !defined(_CCCL_NO_INLINE_VARIABLES)
#if !defined(_CCCL_NO_VARIABLE_TEMPLATES)
template <class _Tp>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v =
_CCCL_INLINE_VAR constexpr bool __is_bit_castable_v =
_CCCL_TRAIT(is_trivially_copyable, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp);
#endif // !_CCCL_NO_VARIABLE_TEMPLATES

template <class _Tp, size_t _Np>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v<_Tp[_Np]> = __is_bit_castable_v<remove_extent_t<_Tp>>;

template <class _Tp, size_t _Np>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v<array<_Tp, _Np>> = __is_bit_castable_v<_Tp>;

template <class _Tp, class _Up>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v<pair<_Tp, _Up>> =
__is_bit_castable_v<_Tp> && __is_bit_castable_v<_Up>;

template <class... _Tp>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v<tuple<_Tp...>> = __all<__is_bit_castable_v<_Tp>...>::value;
#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES)
template <class _Tp>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v = __is_bit_castable<_Tp>::value;
#endif // _CCCL_STD_VER >= 2014

template <class _To,
class _From,
enable_if_t<(sizeof(_To) == sizeof(_From)), int> = 0,
enable_if_t<_CCCL_TRAIT(__is_bit_castable, _To), int> = 0,
enable_if_t<_CCCL_TRAIT(__is_bit_castable, _From), int> = 0>
_CCCL_TEMPLATE(class _To, class _From)
_CCCL_REQUIRES((sizeof(_To) == sizeof(_From)) //
_CCCL_AND _CCCL_TRAIT(__is_bit_castable, _To) _CCCL_AND _CCCL_TRAIT(__is_bit_castable, _From))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_CONSTEXPR_BIT_CAST _To bit_cast(const _From& __from) noexcept
{
#if defined(_CCCL_BUILTIN_BIT_CAST)
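As an aside, here is a standalone sketch (plain C++20, hypothetical names, not the actual CCCL implementation) of what the constrained `bit_cast` above effectively reduces to once the per-type specializations are gone:

```cpp
#include <cstring>
#include <type_traits>

// A single trait gates bit_cast. Trivially copyable types already cover
// c-arrays, cuda::std::array and the CUDA vector types, so no per-type
// specializations are needed; CCCL additionally whitelists the extended
// floating point scalars (__half, __nv_bfloat16), which is omitted here.
template <class T>
inline constexpr bool is_bit_castable_v = std::is_trivially_copyable_v<T>;

// The requires clause stands in for the _CCCL_TEMPLATE/_CCCL_REQUIRES macros.
template <class To, class From>
  requires (sizeof(To) == sizeof(From)) && is_bit_castable_v<To> && is_bit_castable_v<From>
To bit_cast_sketch(const From& from) noexcept
{
  To to; // the sketch assumes To is default constructible; the real code does not
  std::memcpy(&to, &from, sizeof(To)); // stand-in for the builtin bit_cast path
  return to;
}
```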
@@ -39,6 +39,12 @@ __host__ __device__ cuda::std::size_t test_memcmp(void* lhs, void* rhs, size_t b
return 0;
}

template <class T>
__host__ __device__ bool equal(T& lhs, T& rhs)
{
return test_memcmp(&lhs, &rhs, sizeof(T)) == 0;
}

// cuda::std::bit_cast does not preserve padding bits, so if T has padding bits,
// the results might not memcmp cleanly.
template <bool HasUniqueObjectRepresentations = true, typename T>
@@ -52,7 +58,7 @@ __host__ __device__ void test_roundtrip_through_buffer(T from)
T to = cuda::std::bit_cast<T>(middle);
Buffer middle2 = cuda::std::bit_cast<Buffer>(to);

assert((from == to) == (from == from)); // because NaN
assert(equal(from, to) == equal(from, from)); // because NaN

_CCCL_IF_CONSTEXPR (HasUniqueObjectRepresentations)
{
@@ -77,6 +83,32 @@ __host__ __device__ _LIBCUDACXX_CONSTEXPR_BIT_CAST cuda::std::array<T, 10> gener
cuda::std::numeric_limits<T>::max()};
}

#define REPEAT_1(base_type, index) base_type(input[index][0])
#define REPEAT_2(base_type, index) REPEAT_1(base_type, index), base_type(input[index][1])
#define REPEAT_3(base_type, index) REPEAT_2(base_type, index), base_type(input[index][2])
#define REPEAT_4(base_type, index) REPEAT_3(base_type, index), base_type(input[index][3])

#define TEST_CUDA_VECTOR_TYPE(base_type, size) \
{ \
for (base_type##size i : \
{base_type##size{REPEAT_##size(base_type, 0)}, \
base_type##size{REPEAT_##size(base_type, 1)}, \
base_type##size{REPEAT_##size(base_type, 2)}, \
base_type##size{REPEAT_##size(base_type, 3)}, \
base_type##size{REPEAT_##size(base_type, 4)}, \
base_type##size{REPEAT_##size(base_type, 5)}, \
base_type##size{REPEAT_##size(base_type, 6)}}) \
{ \
test_roundtrip_through_buffer(i); \
} \
}

#define TEST_CUDA_VECTOR_TYPES(base_type) \
TEST_CUDA_VECTOR_TYPE(base_type, 1) \
TEST_CUDA_VECTOR_TYPE(base_type, 2) \
TEST_CUDA_VECTOR_TYPE(base_type, 3) \
TEST_CUDA_VECTOR_TYPE(base_type, 4)

template <typename T>
__host__ __device__ _LIBCUDACXX_CONSTEXPR_BIT_CAST cuda::std::array<T, 6> generate_unsigned_integral_values()
{
@@ -90,135 +122,84 @@ __host__ __device__ _LIBCUDACXX_CONSTEXPR_BIT_CAST cuda::std::array<T, 6> genera

__host__ __device__ bool tests()
{
for (bool b : {false, true})
{
test_roundtrip_through_buffer(b);
}

for (char c : {'\0', 'a', 'b', 'c', 'd'})
{
test_roundtrip_through_buffer(c);
}

// Fundamental signed integer types
for (signed char i : generate_signed_integral_values<signed char>())
{
test_roundtrip_through_buffer(i);
}

for (short i : generate_signed_integral_values<short>())
{
test_roundtrip_through_buffer(i);
}

for (int i : generate_signed_integral_values<int>())
{
test_roundtrip_through_buffer(i);
}

for (long i : generate_signed_integral_values<long>())
{
test_roundtrip_through_buffer(i);
}

for (long long i : generate_signed_integral_values<long long>())
{
test_roundtrip_through_buffer(i);
}

// Fundamental unsigned integer types
for (unsigned char i : generate_unsigned_integral_values<unsigned char>())
{
test_roundtrip_through_buffer(i);
}

for (unsigned short i : generate_unsigned_integral_values<unsigned short>())
{
test_roundtrip_through_buffer(i);
}

for (unsigned int i : generate_unsigned_integral_values<unsigned int>())
{
test_roundtrip_through_buffer(i);
}

for (unsigned long i : generate_unsigned_integral_values<unsigned long>())
{
test_roundtrip_through_buffer(i);
}

for (unsigned long long i : generate_unsigned_integral_values<unsigned long long>())
{
test_roundtrip_through_buffer(i);
}

// Fixed width signed integer types
for (cuda::std::int32_t i : generate_signed_integral_values<cuda::std::int32_t>())
{
test_roundtrip_through_buffer(i);
}

for (cuda::std::int64_t i : generate_signed_integral_values<cuda::std::int64_t>())
{
test_roundtrip_through_buffer(i);
}

// Fixed width unsigned integer types
for (cuda::std::uint32_t i : generate_unsigned_integral_values<cuda::std::uint32_t>())
{
test_roundtrip_through_buffer(i);
}

for (cuda::std::uint64_t i : generate_unsigned_integral_values<cuda::std::uint64_t>())
{
test_roundtrip_through_buffer(i);
}

// Floating point types
for (float i :
{0.0f,
1.0f,
-1.0f,
10.0f,
-10.0f,
1e10f,
1e-10f,
1e20f,
1e-20f,
2.71828f,
3.14159f,
#if !defined(TEST_COMPILER_NVRTC) && !defined(TEST_COMPILER_CLANG_CUDA)
cuda::std::nanf(""),
#endif // !TEST_COMPILER_NVRTC && !TEST_COMPILER_CLANG_CUDA
__builtin_nanf("0x55550001"), // NaN with a payload
cuda::std::numeric_limits<float>::signaling_NaN(),
cuda::std::numeric_limits<float>::quiet_NaN(),
cuda::std::numeric_limits<float>::infinity()})
{
test_roundtrip_through_buffer(i);
}

for (double i :
{0.0,
1.0,
-1.0,
10.0,
-10.0,
1e10,
1e-10,
1e100,
1e-100,
2.718281828459045,
3.141592653589793238462643383279502884197169399375105820974944,
#if !defined(TEST_COMPILER_NVRTC) && !defined(TEST_COMPILER_CLANG_CUDA)
cuda::std::nan(""),
#endif // !TEST_COMPILER_NVRTC && !TEST_COMPILER_CLANG_CUDA
cuda::std::numeric_limits<double>::signaling_NaN(),
cuda::std::numeric_limits<double>::quiet_NaN(),
cuda::std::numeric_limits<double>::infinity()})
{
test_roundtrip_through_buffer(i);
}
using pair = cuda::std::pair<float, int>;
for (pair i :
{pair{0.0f, 1},
pair{1.0f, 2},
pair{-1.0f, 3},
pair{10.0f, 4},
pair{-10.0f, 5},
pair{2.71828f, 6},
pair{3.14159f, 7}})
{
test_roundtrip_through_buffer(i);
}

#if defined(_CCCL_BUILTIN_BIT_CAST) // tuple is not trivially default constructible
using tuple = cuda::std::tuple<float, int, short>;
for (tuple i :
{tuple{0.0f, 1, -1},
tuple{1.0f, 2, -2},
tuple{-1.0f, 3, -3},
tuple{10.0f, 4, -4},
tuple{-10.0f, 5, -5},
tuple{2.71828f, 6, -6},
tuple{3.14159f, 7, -7}})
{
test_roundtrip_through_buffer(i);
}
#endif // _CCCL_BUILTIN_BIT_CAST

using array = cuda::std::array<float, 2>;
for (array i :
{array{0.0f, 1.0f},
array{1.0f, 2.0f},
array{-1.0f, 3.0f},
array{10.0f, 4.0f},
array{-10.0f, 5.0f},
array{2.71828f, 6.0f},
array{3.14159f, 7.0f}})
{
test_roundtrip_through_buffer(i);
}

float carray[2] = {0.0f, 1.0f};
test_roundtrip_through_buffer(carray);

// test cuda vector types except __half2 and __nv_bfloat162 because they are cursed
constexpr double input[7][4] = {
{0.0, 1.0, -7.0, -0.0},
{1.0, 2.0, -7.0, -1.0},
{-1.0, 3.0, -7.0, 1.0},
{10.0, 4.0, -7.0, -10.0},
{-10.0, 5.0, -7.0, 10.0},
{2.71828, 6.0, -7.0, -2.71828},
{3.14159, 7.0, -7.0, -3.14159}};

#if !_CCCL_CUDA_COMPILER(CLANG)
using uchar = unsigned char;
using ushort = unsigned short;
using uint = unsigned int;
using ulong = unsigned long;
#endif // !_CCCL_CUDA_COMPILER(CLANG)
using longlong = long long;
using ulonglong = unsigned long long;

TEST_CUDA_VECTOR_TYPES(char)
TEST_CUDA_VECTOR_TYPES(uchar)
TEST_CUDA_VECTOR_TYPES(short)
TEST_CUDA_VECTOR_TYPES(ushort)
TEST_CUDA_VECTOR_TYPES(int)
TEST_CUDA_VECTOR_TYPES(uint)
TEST_CUDA_VECTOR_TYPES(long)
TEST_CUDA_VECTOR_TYPES(ulong)
TEST_CUDA_VECTOR_TYPES(longlong)
TEST_CUDA_VECTOR_TYPES(ulonglong)
TEST_CUDA_VECTOR_TYPES(float)
TEST_CUDA_VECTOR_TYPES(double)

using dim = unsigned int;
TEST_CUDA_VECTOR_TYPE(dim, 3)

#ifdef _LIBCUDACXX_HAS_NVFP16
// Extended floating point type __half
@@ -236,7 +217,7 @@ __host__ __device__ bool tests()
#endif // _LIBCUDACXX_HAS_NVFP16

#ifdef _LIBCUDACXX_HAS_NVBF16
// Extended floating point type __half
// Extended floating point type __nv_bfloat16
for (__nv_bfloat16 i :
{__float2bfloat16(0.0f),
__float2bfloat16(1.0f),
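To make the macro machinery in the test above easier to follow, this is roughly what a single instantiation such as `TEST_CUDA_VECTOR_TYPE(float, 2)` expands to (a sketch with most rows of `input` elided):

```cpp
// Each row of the shared `input` table is narrowed to the base type and
// wrapped in the matching CUDA vector type before being round-tripped.
{
  for (float2 i : {float2{float(input[0][0]), float(input[0][1])},   // {0.0f, 1.0f}
                   float2{float(input[1][0]), float(input[1][1])},   // {1.0f, 2.0f}
                   // ... one initializer per row of input ...
                   float2{float(input[6][0]), float(input[6][1])}})  // {3.14159f, 7.0f}
  {
    test_roundtrip_through_buffer(i);
  }
}
```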
