From ce702b4a282e2df7ab0dbe058ea1ff5c5deb87d2 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 5 Nov 2024 21:48:01 +0000
Subject: [PATCH 1/2] Fix NumericLimits

---
 .../cuda/transformers/beam_search_topk.cu     |  2 +-
 .../transformers/greedy_search_top_one.cu     |  2 +-
 .../core/providers/cuda/math/topk_impl.cuh    | 10 ++---
 .../providers/cuda/shared_inc/cuda_utils.h    | 42 ++++---------------
 4 files changed, 16 insertions(+), 40 deletions(-)
diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu
index 5ac10f6321e63..44be2ef2375ee 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu
+++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu
@@ -60,7 +60,7 @@ struct TopK {
   __device__ __forceinline__ void Init() {
     for (int i = 0; i < max_k; i++) {
       key[i] = -1;
-      value[i] = NumericLimits<T>::Min();
+      value[i] = NumericLimits<T>::Lowest();
     }
   }
 };
diff --git a/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu b/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu
index 68a2e16482af9..9e901d24b1a5a 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu
+++ b/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu
@@ -19,7 +19,7 @@ struct TopOne {
   int32_t key;
   T value;
 
-  __device__ __host__ __forceinline__ TopOne(int32_t key = -1, T value = NumericLimits<T>::Min()) : key(key), value(value) {
+  __device__ __host__ __forceinline__ TopOne(int32_t key = -1, T value = NumericLimits<T>::Lowest()) : key(key), value(value) {
   }
 
   __device__ __forceinline__ void Reduce(int32_t k, T v) {
diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.cuh b/onnxruntime/core/providers/cuda/math/topk_impl.cuh
index cbde6da457fdb..112566e54bbba 100644
--- a/onnxruntime/core/providers/cuda/math/topk_impl.cuh
+++ b/onnxruntime/core/providers/cuda/math/topk_impl.cuh
@@ -412,7 +412,7 @@ Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute,
   if (aligned_dimension <= GridDim::maxThreadsPerBlock) {
     BitonicTopK<CudaT><<<N, GridDim::maxThreadsPerBlock, aligned_dimension * sizeof(KV<CudaT>), stream>>>(
         input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension,
-        aligned_dimension, NumericLimits<T>::Min(), NumericLimits<T>::Max());
+        aligned_dimension, NumericLimits<CudaT>::Lowest(), NumericLimits<CudaT>::Max());
   } else if (K <= BT * 16 || 0 == sorted) {
     if (use_deterministic_compute) {
       static std::once_flag log_warning;
@@ -425,19 +425,19 @@ Status TopKImpl(const CudaKernel* kernel, bool use_deterministic_compute,
     if (BT * 2 >= K || 0 == sorted) {
       RadixTopK<CudaT, BT, 2><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
           input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
-          NumericLimits<T>::Min(), NumericLimits<T>::Max());
+          NumericLimits<CudaT>::Lowest(), NumericLimits<CudaT>::Max());
     } else if (BT * 4 >= K) {
       RadixTopK<CudaT, BT, 4><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
           input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
-          NumericLimits<T>::Min(), NumericLimits<T>::Max());
+          NumericLimits<CudaT>::Lowest(), NumericLimits<CudaT>::Max());
     } else if (BT * 8 >= K) {
       RadixTopK<CudaT, BT, 8><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
           input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
-          NumericLimits<T>::Min(), NumericLimits<T>::Max());
+          NumericLimits<CudaT>::Lowest(), NumericLimits<CudaT>::Max());
     } else {
       RadixTopK<CudaT, BT, 16><<<N, BT, 256 * sizeof(uint32_t), stream>>>(
           input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT,
-          NumericLimits<T>::Min(), NumericLimits<T>::Max());
+          NumericLimits<CudaT>::Lowest(), NumericLimits<CudaT>::Max());
     }
   } else {
     auto input_key_buffer = kernel->GetScratchBuffer<CudaT>(dimension, ort_stream);
diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h
index ed642754af3ba..ab0c543279fd0 100644
--- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h
+++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h
@@ -10,6 +10,7 @@
 #include <memory>
 #include <type_traits>
 #include <vector>
+#include <limits>
 
 #include <gsl/gsl>
 #include "core/framework/float16.h"
@@ -120,7 +121,7 @@ constexpr int kNumBitsPerBitmaskElement = std::numeric_limits<BitmaskElementType
 
 template <typename T>
 struct NumericLimits {
-  __inline__ __host__ __device__ static T Min() {
+  __inline__ __host__ __device__ static T Lowest() {
     return std::numeric_limits<T>::lowest();
   }
   __inline__ __host__ __device__ static T Max() {
@@ -128,43 +129,18 @@ struct NumericLimits {
   }
 };
 
-template <>
-struct NumericLimits<MLFloat16> {
-  __inline__ __host__ __device__ static half Min() {
-    return -65504.0;
-  }
-  __inline__ __host__ __device__ static half Max() {
-    return 65504.0;
-  }
-};
+#ifndef CUDART_MAX_NORMAL_FP16  // CUDA 12.3 or later has this macro
+#define CUDART_MAX_NORMAL_FP16 __ushort_as_half((unsigned short)0x7BFFU)
+#endif
 
 template <>
 struct NumericLimits<half> {
-  __inline__ __host__ __device__ static half Min() {
-    return -65504.0;
-  }
-  __inline__ __host__ __device__ static half Max() {
-    return 65504.0;
+  __inline__ __host__ __device__ static half Lowest() {
+    return -CUDART_MAX_NORMAL_FP16;
   }
-};
 
-template <>
-struct NumericLimits<float> {
-  __inline__ __host__ __device__ static float Min() {
-    return -INFINITY;
-  }
-  __inline__ __host__ __device__ static float Max() {
-    return INFINITY;
-  }
-};
-
-template <>
-struct NumericLimits<double> {
-  __inline__ __host__ __device__ static double Min() {
-    return -HUGE_VAL;
-  }
-  __inline__ __host__ __device__ static double Max() {
-    return HUGE_VAL;
+  __inline__ __host__ __device__ static half Max() {
+    return CUDART_MAX_NORMAL_FP16;
   }
 };
 

From d27100f3e5738060fd33f06fb37c5efc03a46f4e Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 5 Nov 2024 23:01:49 +0000
Subject: [PATCH 2/2] refine

---
 .../cuda/transformers/greedy_search_top_one.cu         |  6 +++++-
 .../core/providers/cuda/shared_inc/cuda_utils.h        | 10 +++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu b/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu
index 9e901d24b1a5a..b2969194ff400 100644
--- a/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu
+++ b/onnxruntime/contrib_ops/cuda/transformers/greedy_search_top_one.cu
@@ -5,6 +5,7 @@
 
 #include <cub/cub.cuh>
 
+
 #include "core/providers/cuda/shared_inc/cuda_utils.h"
 #include "core/providers/cuda/cu_inc/common.cuh"
 
@@ -19,7 +20,10 @@ struct TopOne {
   int32_t key;
   T value;
 
-  __device__ __host__ __forceinline__ TopOne(int32_t key = -1, T value = NumericLimits<T>::Lowest()) : key(key), value(value) {
+  __device__ __host__ __forceinline__ TopOne() : key(-1), value(NumericLimits<T>::Lowest()) {
+  }
+
+  __device__ __host__ __forceinline__ TopOne(int32_t key, T value) : key(key), value(value) {
   }
 
   __device__ __forceinline__ void Reduce(int32_t k, T v) {
diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h
index ab0c543279fd0..f9433642f0857 100644
--- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h
+++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h
@@ -129,18 +129,18 @@ struct NumericLimits {
   }
 };
 
-#ifndef CUDART_MAX_NORMAL_FP16  // CUDA 12.3 or later has this macro
-#define CUDART_MAX_NORMAL_FP16 __ushort_as_half((unsigned short)0x7BFFU)
-#endif
-
 template <>
 struct NumericLimits<half> {
   __inline__ __host__ __device__ static half Lowest() {
-    return -CUDART_MAX_NORMAL_FP16;
+    return -65504.0f;
   }
 
   __inline__ __host__ __device__ static half Max() {
+#ifdef CUDART_MAX_NORMAL_FP16  // defined in cuda 12.3 or later
     return CUDART_MAX_NORMAL_FP16;
+#else
+    return 65504.0f;
+#endif
   }
 };