Ensure cuda::std::bit_cast works with vector types
We want to make sure we can use `bit_cast` with suitable vector or aggregate types such as:
* `cuda::std::array`
* c-arrays
* cuda vector types like float2

Unfortunately, we cannot use it with the extended floating point vector types (`__half2`, `__nv_bfloat162`), because they are neither trivially copyable nor trivially copy assignable.
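
For context, a minimal usage sketch of what this commit enables (not part of the commit; `example_usage` is a hypothetical name, and the snippet assumes a CUDA translation unit where `float2` is available):

```cpp
#include <cuda/std/array>
#include <cuda/std/bit>

__host__ __device__ void example_usage()
{
  // cuda::std::array -> CUDA vector type (same size, both trivially copyable)
  cuda::std::array<float, 2> arr{1.0f, 2.0f};
  float2 vec = cuda::std::bit_cast<float2>(arr);

  // c-array as the source type
  float carr[2] = {3.0f, 4.0f};
  float2 vec2 = cuda::std::bit_cast<float2>(carr);

  // and back again
  auto round_trip = cuda::std::bit_cast<cuda::std::array<float, 2>>(vec);

  (void) vec2;
  (void) round_trip;

  // __half2 / __nv_bfloat162 are rejected: they are not trivially copyable,
  // so they do not satisfy the bit_cast constraint.
}
```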
miscco committed Dec 17, 2024
1 parent 19f91c8 commit 0f52938
Showing 2 changed files with 118 additions and 181 deletions.
56 changes: 6 additions & 50 deletions libcudacxx/include/cuda/std/__bit/bit_cast.h
@@ -22,8 +22,6 @@
#endif // no system header

#include <cuda/std/__fwd/array.h>
#include <cuda/std/__fwd/pair.h>
#include <cuda/std/__fwd/tuple.h>
#include <cuda/std/__tuple_dir/sfinae_helpers.h>
#include <cuda/std/__type_traits/enable_if.h>
#include <cuda/std/__type_traits/is_extended_floating_point.h>
@@ -53,57 +51,15 @@ struct __is_bit_castable
_CCCL_TRAIT(is_trivially_copyable, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp);
};

template <class _Tp, size_t _Np>
struct __is_bit_castable<_Tp[_Np]>
{
static constexpr bool value = __is_bit_castable<remove_extent_t<_Tp>>::value;
};

template <class _Tp, size_t _Np>
struct __is_bit_castable<array<_Tp, _Np>>
{
static constexpr bool value = __is_bit_castable<_Tp>::value;
};

template <class _Tp, class _Up>
struct __is_bit_castable<pair<_Tp, _Up>>
{
static constexpr bool value = __is_bit_castable<_Tp>::value && __is_bit_castable<_Up>::value;
};

template <class... _Tp>
struct __is_bit_castable<tuple<_Tp...>>
{
static constexpr bool value = __all<__is_bit_castable<_Tp>::value...>::value;
};

#if !defined(_CCCL_NO_INLINE_VARIABLES)
#if !defined(_CCCL_NO_VARIABLE_TEMPLATES)
template <class _Tp>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v =
_CCCL_INLINE_VAR constexpr bool __is_bit_castable_v =
_CCCL_TRAIT(is_trivially_copyable, _Tp) || _CCCL_TRAIT(__is_extended_floating_point, _Tp);
#endif // !_CCCL_NO_VARIABLE_TEMPLATES

template <class _Tp, size_t _Np>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v<_Tp[_Np]> = __is_bit_castable_v<remove_extent_t<_Tp>>;

template <class _Tp, size_t _Np>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v<array<_Tp, _Np>> = __is_bit_castable_v<_Tp>;

template <class _Tp, class _Up>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v<pair<_Tp, _Up>> =
__is_bit_castable_v<_Tp> && __is_bit_castable_v<_Up>;

template <class... _Tp>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v<tuple<_Tp...>> = __all<__is_bit_castable_v<_Tp>...>::value;
#elif !defined(_CCCL_NO_VARIABLE_TEMPLATES)
template <class _Tp>
_LIBCUDACXX_INLINE_VAR constexpr bool __is_bit_castable_v = __is_bit_castable<_Tp>::value;
#endif // _CCCL_STD_VER >= 2014

template <class _To,
class _From,
enable_if_t<(sizeof(_To) == sizeof(_From)), int> = 0,
enable_if_t<_CCCL_TRAIT(__is_bit_castable, _To), int> = 0,
enable_if_t<_CCCL_TRAIT(__is_bit_castable, _From), int> = 0>
_CCCL_TEMPLATE(class _To, class _From)
_CCCL_REQUIRES((sizeof(_To) == sizeof(_From)) //
_CCCL_AND _CCCL_TRAIT(__is_bit_castable, _To) _CCCL_AND _CCCL_TRAIT(__is_bit_castable, _From))
_CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI _LIBCUDACXX_CONSTEXPR_BIT_CAST _To bit_cast(const _From& __from) noexcept
{
#if defined(_CCCL_BUILTIN_BIT_CAST)
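As an aside, here is a standalone sketch (plain C++20, hypothetical names, not the actual CCCL implementation) of what the constrained `bit_cast` above effectively reduces to once the per-type specializations are gone:

```cpp
#include <cstring>
#include <type_traits>

// A single trait gates bit_cast. Trivially copyable types already cover
// c-arrays, cuda::std::array and the CUDA vector types, so no per-type
// specializations are needed; CCCL additionally whitelists the extended
// floating point scalars (__half, __nv_bfloat16), which is omitted here.
template <class T>
inline constexpr bool is_bit_castable_v = std::is_trivially_copyable_v<T>;

// The requires clause stands in for the _CCCL_TEMPLATE/_CCCL_REQUIRES macros.
template <class To, class From>
  requires (sizeof(To) == sizeof(From)) && is_bit_castable_v<To> && is_bit_castable_v<From>
To bit_cast_sketch(const From& from) noexcept
{
  To to; // the sketch assumes To is default constructible; the real code does not
  std::memcpy(&to, &from, sizeof(To)); // stand-in for the builtin bit_cast path
  return to;
}
```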
@@ -39,6 +39,12 @@ __host__ __device__ cuda::std::size_t test_memcmp(void* lhs, void* rhs, size_t b
return 0;
}

template <class T>
__host__ __device__ bool equal(T& lhs, T& rhs)
{
return test_memcmp(&lhs, &rhs, sizeof(T)) == 0;
}

// cuda::std::bit_cast does not preserve padding bits, so if T has padding bits,
// the results might not memcmp cleanly.
template <bool HasUniqueObjectRepresentations = true, typename T>
@@ -52,7 +58,7 @@ __host__ __device__ void test_roundtrip_through_buffer(T from)
T to = cuda::std::bit_cast<T>(middle);
Buffer middle2 = cuda::std::bit_cast<Buffer>(to);

assert((from == to) == (from == from)); // because NaN
assert(equal(from, to) == equal(from, from)); // because NaN

_CCCL_IF_CONSTEXPR (HasUniqueObjectRepresentations)
{
@@ -77,6 +83,32 @@ __host__ __device__ _LIBCUDACXX_CONSTEXPR_BIT_CAST cuda::std::array<T, 10> gener
cuda::std::numeric_limits<T>::max()};
}

#define REPEAT_1(base_type, index) base_type(input[index][0])
#define REPEAT_2(base_type, index) REPEAT_1(base_type, index), base_type(input[index][1])
#define REPEAT_3(base_type, index) REPEAT_2(base_type, index), base_type(input[index][2])
#define REPEAT_4(base_type, index) REPEAT_3(base_type, index), base_type(input[index][3])

#define TEST_CUDA_VECTOR_TYPE(base_type, size) \
{ \
for (base_type##size i : \
{base_type##size{REPEAT_##size(base_type, 0)}, \
base_type##size{REPEAT_##size(base_type, 1)}, \
base_type##size{REPEAT_##size(base_type, 2)}, \
base_type##size{REPEAT_##size(base_type, 3)}, \
base_type##size{REPEAT_##size(base_type, 4)}, \
base_type##size{REPEAT_##size(base_type, 5)}, \
base_type##size{REPEAT_##size(base_type, 6)}}) \
{ \
test_roundtrip_through_buffer(i); \
} \
}

#define TEST_CUDA_VECTOR_TYPES(base_type) \
TEST_CUDA_VECTOR_TYPE(base_type, 1) \
TEST_CUDA_VECTOR_TYPE(base_type, 2) \
TEST_CUDA_VECTOR_TYPE(base_type, 3) \
TEST_CUDA_VECTOR_TYPE(base_type, 4)

template <typename T>
__host__ __device__ _LIBCUDACXX_CONSTEXPR_BIT_CAST cuda::std::array<T, 6> generate_unsigned_integral_values()
{
@@ -90,135 +122,84 @@ __host__ __device__ _LIBCUDACXX_CONSTEXPR_BIT_CAST cuda::std::array<T, 6> genera

__host__ __device__ bool tests()
{
for (bool b : {false, true})
{
test_roundtrip_through_buffer(b);
}

for (char c : {'\0', 'a', 'b', 'c', 'd'})
{
test_roundtrip_through_buffer(c);
}

// Fundamental signed integer types
for (signed char i : generate_signed_integral_values<signed char>())
{
test_roundtrip_through_buffer(i);
}

for (short i : generate_signed_integral_values<short>())
{
test_roundtrip_through_buffer(i);
}

for (int i : generate_signed_integral_values<int>())
{
test_roundtrip_through_buffer(i);
}

for (long i : generate_signed_integral_values<long>())
{
test_roundtrip_through_buffer(i);
}

for (long long i : generate_signed_integral_values<long long>())
{
test_roundtrip_through_buffer(i);
}

// Fundamental unsigned integer types
for (unsigned char i : generate_unsigned_integral_values<unsigned char>())
{
test_roundtrip_through_buffer(i);
}

for (unsigned short i : generate_unsigned_integral_values<unsigned short>())
{
test_roundtrip_through_buffer(i);
}

for (unsigned int i : generate_unsigned_integral_values<unsigned int>())
{
test_roundtrip_through_buffer(i);
}

for (unsigned long i : generate_unsigned_integral_values<unsigned long>())
{
test_roundtrip_through_buffer(i);
}

for (unsigned long long i : generate_unsigned_integral_values<unsigned long long>())
{
test_roundtrip_through_buffer(i);
}

// Fixed width signed integer types
for (cuda::std::int32_t i : generate_signed_integral_values<cuda::std::int32_t>())
{
test_roundtrip_through_buffer(i);
}

for (cuda::std::int64_t i : generate_signed_integral_values<cuda::std::int64_t>())
{
test_roundtrip_through_buffer(i);
}

// Fixed width unsigned integer types
for (cuda::std::uint32_t i : generate_unsigned_integral_values<cuda::std::uint32_t>())
{
test_roundtrip_through_buffer(i);
}

for (cuda::std::uint64_t i : generate_unsigned_integral_values<cuda::std::uint64_t>())
{
test_roundtrip_through_buffer(i);
}

// Floating point types
for (float i :
{0.0f,
1.0f,
-1.0f,
10.0f,
-10.0f,
1e10f,
1e-10f,
1e20f,
1e-20f,
2.71828f,
3.14159f,
#if !defined(TEST_COMPILER_NVRTC) && !defined(TEST_COMPILER_CLANG_CUDA)
cuda::std::nanf(""),
#endif // !TEST_COMPILER_NVRTC && !TEST_COMPILER_CLANG_CUDA
__builtin_nanf("0x55550001"), // NaN with a payload
cuda::std::numeric_limits<float>::signaling_NaN(),
cuda::std::numeric_limits<float>::quiet_NaN(),
cuda::std::numeric_limits<float>::infinity()})
{
test_roundtrip_through_buffer(i);
}

for (double i :
{0.0,
1.0,
-1.0,
10.0,
-10.0,
1e10,
1e-10,
1e100,
1e-100,
2.718281828459045,
3.141592653589793238462643383279502884197169399375105820974944,
#if !defined(TEST_COMPILER_NVRTC) && !defined(TEST_COMPILER_CLANG_CUDA)
cuda::std::nan(""),
#endif // !TEST_COMPILER_NVRTC && !TEST_COMPILER_CLANG_CUDA
cuda::std::numeric_limits<double>::signaling_NaN(),
cuda::std::numeric_limits<double>::quiet_NaN(),
cuda::std::numeric_limits<double>::infinity()})
{
test_roundtrip_through_buffer(i);
}
using pair = cuda::std::pair<float, int>;
for (pair i :
{pair{0.0f, 1},
pair{1.0f, 2},
pair{-1.0f, 3},
pair{10.0f, 4},
pair{-10.0f, 5},
pair{2.71828f, 6},
pair{3.14159f, 7}})
{
test_roundtrip_through_buffer(i);
}

#if defined(_CCCL_BUILTIN_BIT_CAST) // tuple is not trivially default constructible
using tuple = cuda::std::tuple<float, int, short>;
for (tuple i :
{tuple{0.0f, 1, -1},
tuple{1.0f, 2, -2},
tuple{-1.0f, 3, -3},
tuple{10.0f, 4, -4},
tuple{-10.0f, 5, -5},
tuple{2.71828f, 6, -6},
tuple{3.14159f, 7, -7}})
{
test_roundtrip_through_buffer(i);
}
#endif // _CCCL_BUILTIN_BIT_CAST

using array = cuda::std::array<float, 2>;
for (array i :
{array{0.0f, 1.0f},
array{1.0f, 2.0f},
array{-1.0f, 3.0f},
array{10.0f, 4.0f},
array{-10.0f, 5.0f},
array{2.71828f, 6.0f},
array{3.14159f, 7.0f}})
{
test_roundtrip_through_buffer(i);
}

float carray[2] = {0.0f, 1.0f};
test_roundtrip_through_buffer(carray);

// test cuda vector types except __half2 and __nv_bfloat162 because they are cursed
constexpr double input[7][4] = {
{0.0, 1.0, -7.0, -0.0},
{1.0, 2.0, -7.0, -1.0},
{-1.0, 3.0, -7.0, 1.0},
{10.0, 4.0, -7.0, -10.0},
{-10.0, 5.0, -7.0, 10.0},
{2.71828, 6.0, -7.0, -2.71828},
{3.14159, 7.0, -7.0, -3.14159}};

#if !_CCCL_CUDA_COMPILER(CLANG)
using uchar = unsigned char;
using ushort = unsigned short;
using uint = unsigned int;
using ulong = unsigned long;
#endif // !_CCCL_CUDA_COMPILER(CLANG)
using longlong = long long;
using ulonglong = unsigned long long;

TEST_CUDA_VECTOR_TYPES(char)
TEST_CUDA_VECTOR_TYPES(uchar)
TEST_CUDA_VECTOR_TYPES(short)
TEST_CUDA_VECTOR_TYPES(ushort)
TEST_CUDA_VECTOR_TYPES(int)
TEST_CUDA_VECTOR_TYPES(uint)
TEST_CUDA_VECTOR_TYPES(long)
TEST_CUDA_VECTOR_TYPES(ulong)
TEST_CUDA_VECTOR_TYPES(longlong)
TEST_CUDA_VECTOR_TYPES(ulonglong)
TEST_CUDA_VECTOR_TYPES(float)
TEST_CUDA_VECTOR_TYPES(double)

using dim = unsigned int;
TEST_CUDA_VECTOR_TYPE(dim, 3)

#ifdef _LIBCUDACXX_HAS_NVFP16
// Extended floating point type __half
@@ -236,7 +217,7 @@ __host__ __device__ bool tests()
#endif // _LIBCUDACXX_HAS_NVFP16

#ifdef _LIBCUDACXX_HAS_NVBF16
// Extended floating point type __half
// Extended floating point type __nv_bfloat16
for (__nv_bfloat16 i :
{__float2bfloat16(0.0f),
__float2bfloat16(1.0f),
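To make the macro machinery in the test above easier to follow, this is roughly what a single instantiation such as `TEST_CUDA_VECTOR_TYPE(float, 2)` expands to (a sketch with most rows of `input` elided):

```cpp
// Each row of the shared `input` table is narrowed to the base type and
// wrapped in the matching CUDA vector type before being round-tripped.
{
  for (float2 i : {float2{float(input[0][0]), float(input[0][1])},   // {0.0f, 1.0f}
                   float2{float(input[1][0]), float(input[1][1])},   // {1.0f, 2.0f}
                   // ... one initializer per row of input ...
                   float2{float(input[6][0]), float(input[6][1])}})  // {3.14159f, 7.0f}
  {
    test_roundtrip_through_buffer(i);
  }
}
```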
