From ce702b4a282e2df7ab0dbe058ea1ff5c5deb87d2 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 5 Nov 2024 21:48:01 +0000 Subject: [PATCH 1/2] Fix NumericLimits --- .../cuda/transformers/beam_search_topk.cu | 2 +- .../transformers/greedy_search_top_one.cu | 2 +- .../core/providers/cuda/math/topk_impl.cuh | 10 ++--- .../providers/cuda/shared_inc/cuda_utils.h | 42 ++++--------------- 4 files changed, 16 insertions(+), 40 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu index 5ac10f6321e63..44be2ef2375ee 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu @@ -60,7 +60,7 @@ struct TopK { __device__ __forceinline__ void Init() { for (int i = 0; i < max_k; i++) { key[i] = -1; - value[i] = NumericLimits::Min(); + value[i] = NumericLimits::Lowest(); } } }; diff --git a/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu b/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu index 68a2e16482af9..9e901d24b1a5a 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu @@ -19,7 +19,7 @@ struct TopOne { int32_t key; T value; - __device__ __host__ __forceinline__ TopOne(int32_t key = -1, T value = NumericLimits::Min()) : key(key), value(value) { + __device__ __host__ __forceinline__ TopOne(int32_t key = -1, T value = NumericLimits::Lowest()) : key(key), value(value) { } __device__ __forceinline__ void Reduce(int32_t k, T v) { diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.cuh b/onnxruntime/core/providers/cuda/math/topk_impl.cuh index cbde6da457fdb..112566e54bbba 100644 --- a/onnxruntime/core/providers/cuda/math/topk_impl.cuh +++ b/onnxruntime/core/providers/cuda/math/topk_impl.cuh @@ -412,7 +412,7 @@ Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute, if (aligned_dimension <= GridDim::maxThreadsPerBlock) { BitonicTopK<<), stream>>>( input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, - aligned_dimension, NumericLimits::Min(), NumericLimits::Max()); + aligned_dimension, NumericLimits::Lowest(), NumericLimits::Max()); } else if (K <= BT * 16 || 0 == sorted) { if (use_deterministic_compute) { static std::once_flag log_warning; @@ -425,19 +425,19 @@ Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute, if (BT * 2 >= K || 0 == sorted) { RadixTopK<<>>( input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, - NumericLimits::Min(), NumericLimits::Max()); + NumericLimits::Lowest(), NumericLimits::Max()); } else if (BT * 4 >= K) { RadixTopK<<>>( input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, - NumericLimits::Min(), NumericLimits::Max()); + NumericLimits::Lowest(), NumericLimits::Max()); } else if (BT * 8 >= K) { RadixTopK<<>>( input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, - NumericLimits::Min(), NumericLimits::Max()); + NumericLimits::Lowest(), NumericLimits::Max()); } else { RadixTopK<<>>( input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, - NumericLimits::Min(), NumericLimits::Max()); + NumericLimits::Lowest(), NumericLimits::Max()); } } else { auto input_key_buffer = kernel->GetScratchBuffer(dimension, ort_stream); diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index ed642754af3ba..ab0c543279fd0 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "core/framework/float16.h" @@ -120,7 +121,7 @@ constexpr int kNumBitsPerBitmaskElement = std::numeric_limits struct NumericLimits { - __inline__ __host__ __device__ static T Min() { + __inline__ __host__ __device__ static T Lowest() { return std::numeric_limits::lowest(); } __inline__ __host__ __device__ static T Max() { @@ -128,43 +129,18 @@ struct NumericLimits { } }; -template <> -struct NumericLimits { - __inline__ __host__ __device__ static half Min() { - return -65504.0; - } - __inline__ __host__ __device__ static half Max() { - return 65504.0; - } -}; +#ifndef CUDART_MAX_NORMAL_FP16 // CUDA 12.3 or later has this macro +#define CUDART_MAX_NORMAL_FP16 __ushort_as_half((unsigned short)0x7BFFU) +#endif template <> struct NumericLimits { - __inline__ __host__ __device__ static half Min() { - return -65504.0; - } - __inline__ __host__ __device__ static half Max() { - return 65504.0; + __inline__ __host__ __device__ static half Lowest() { + return -CUDART_MAX_NORMAL_FP16; } -}; -template <> -struct NumericLimits { - __inline__ __host__ __device__ static float Min() { - return -INFINITY; - } - __inline__ __host__ __device__ static float Max() { - return INFINITY; - } -}; - -template <> -struct NumericLimits { - __inline__ __host__ __device__ static double Min() { - return -HUGE_VAL; - } - __inline__ __host__ __device__ static double Max() { - return HUGE_VAL; + __inline__ __host__ __device__ static half Max() { + return CUDART_MAX_NORMAL_FP16; } }; From d27100f3e5738060fd33f06fb37c5efc03a46f4e Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 5 Nov 2024 23:01:49 +0000 Subject: [PATCH 2/2] refine --- .../cuda/transformers/greedy_search_top_one.cu | 6 +++++- .../core/providers/cuda/shared_inc/cuda_utils.h | 10 +++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu b/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu index 9e901d24b1a5a..b2969194ff400 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu @@ -5,6 +5,7 @@ #include + #include "core/providers/cuda/shared_inc/cuda_utils.h" #include "core/providers/cuda/cu_inc/common.cuh" @@ -19,7 +20,10 @@ struct TopOne { int32_t key; T value; - __device__ __host__ __forceinline__ TopOne(int32_t key = -1, T value = NumericLimits::Lowest()) : key(key), value(value) { + __device__ __host__ __forceinline__ TopOne() : key(-1), value(NumericLimits::Lowest()) { + } + + __device__ __host__ __forceinline__ TopOne(int32_t key, T value) : key(key), value(value) { } __device__ __forceinline__ void Reduce(int32_t k, T v) { diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index ab0c543279fd0..f9433642f0857 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -129,18 +129,18 @@ struct NumericLimits { } }; -#ifndef CUDART_MAX_NORMAL_FP16 // CUDA 12.3 or later has this macro -#define CUDART_MAX_NORMAL_FP16 __ushort_as_half((unsigned short)0x7BFFU) -#endif - template <> struct NumericLimits { __inline__ __host__ __device__ static half Lowest() { - return -CUDART_MAX_NORMAL_FP16; + return -65504.0f; } __inline__ __host__ __device__ static half Max() { +#ifdef CUDART_MAX_NORMAL_FP16 // defined in cuda 12.3 or later return CUDART_MAX_NORMAL_FP16; +#else + return 65504.0f; +#endif } };