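Revert changes to onnxruntime/contrib_ops/cpu/transformers: restore the CUDA-only `#ifdef USE_CUDA` guards in place of `#if defined(USE_CUDA) || defined(USE_ROCM)`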

ROCm · Oct 9, 2023 · a227096
1 parent f3e251b
commit a227096
Show file tree

Hide file tree

Showing 11 changed files with 32 additions and 32 deletions.
diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc
@@ -217,7 +217,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           device_copy_int32_func_ ? device_copy_int32_func_ : GenerationCpuDeviceHelper::DeviceCopy<int32_t>,
           update_gpt_feeds_func_ ? update_gpt_feeds_func_ : GenerationCpuDeviceHelper::UpdateGptFeeds<float>,
           create_beam_scorer_func_};
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -240,7 +240,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           device_copy_int32_func_,
           update_gpt_feeds_fp16_func_,
           create_beam_scorer_func_};
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -271,7 +271,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           expand_buffer_float_func_ ? expand_buffer_float_func_ : GenerationCpuDeviceHelper::ExpandBuffer<float>,
           expand_buffer_float16_func_ ? expand_buffer_float16_func_ : GenerationCpuDeviceHelper::ExpandBuffer<MLFloat16>,
           create_beam_scorer_func_};
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -293,7 +293,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           expand_buffer_float_func_,
           expand_buffer_float16_func_,
           create_beam_scorer_func_};
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -320,7 +320,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           expand_buffer_float_func_ ? expand_buffer_float_func_ : GenerationCpuDeviceHelper::ExpandBuffer<float>,
           expand_buffer_float16_func_ ? expand_buffer_float16_func_ : GenerationCpuDeviceHelper::ExpandBuffer<MLFloat16>,
           create_beam_scorer_func_};
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -341,7 +341,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           expand_buffer_float_func_,
           expand_buffer_float16_func_,
           create_beam_scorer_func_};
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search.h
@@ -66,7 +66,7 @@ class BeamSearch : public IControlFlowKernel {
     create_beam_scorer_func_ = create_beam_scorer_func;
   }
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   void SetDeviceHelpers_Cuda(
       const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
       const GenerationDeviceHelper::InitCacheIndirFunc& init_cache_indir_func) {
@@ -96,7 +96,7 @@ class BeamSearch : public IControlFlowKernel {
     expand_buffer_float16_func_ = expand_buffer_float16_func;
   }
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   const void* cuda_device_prop_ = nullptr;
   int cuda_device_arch_ = 0;
 #endif
@@ -115,7 +115,7 @@ class BeamSearch : public IControlFlowKernel {
   GenerationDeviceHelper::InitBeamStateFunc<MLFloat16> init_beam_state_fp16_func_;
   GenerationDeviceHelper::CreateBeamScorer create_beam_scorer_func_;
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
   GenerationDeviceHelper::InitCacheIndirFunc init_cache_indir_func_;
 #endif

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
@@ -46,7 +46,7 @@ class BeamSearchGpt : public BeamSearchBase<T> {
         update_feeds_func_(update_feeds_func),
         create_beam_scorer_func_(create_beam_scorer_func) {}
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   Status InitializeCuda(
       const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
       const void* cuda_device_prop,
@@ -100,7 +100,7 @@ class BeamSearchGpt : public BeamSearchBase<T> {
   GenerationDeviceHelper::CreateGptInputsFunc create_inputs_func_;
   GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
   GenerationDeviceHelper::InitBeamStateFunc<T> init_beam_state_func_;
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
 #endif
   GenerationDeviceHelper::UpdateGptFeedsFunc<T> update_feeds_func_;
@@ -336,7 +336,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager* init_run_feeds_fetch
     // Increase sequence length after a new token is generated.
     ++current_length;
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
     // Reorder past state after first run if the GPT subgraph (the one used after the first iteration)
     // contains DecoderMaskedSelfAttention nodes
     if (iteration_counter == 1 && gpt_subgraph_.has_decoder_masked_attention_) {

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
@@ -53,7 +53,7 @@ class BeamSearchT5 : public BeamSearchBase<T> {
         expand_buffer_float16_func_(expand_buffer_float16_func),
         create_beam_scorer_func_(create_beam_scorer_func) {}
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   Status InitializeCuda(
       const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
       const GenerationDeviceHelper::InitCacheIndirFunc& init_cache_indir_func,
@@ -87,7 +87,7 @@ class BeamSearchT5 : public BeamSearchBase<T> {
   // Device specific functions
   GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
   GenerationDeviceHelper::InitBeamStateFunc<T> init_beam_state_func_;
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
   GenerationDeviceHelper::InitCacheIndirFunc init_cache_indir_func_;
 #endif
@@ -280,7 +280,7 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
       auto cross_attention_past_key_sz = first_cross_attention_key->Shape().Size();
       beam_state.EnsurePastStateReorderStagingBuffer(this->temp_space_allocator_, cross_attention_past_key_sz);
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       // Here we only need to reorder the past key for self-attention and cross-attention.
       for (size_t i = 0; i < 2 * static_cast<size_t>(decoder_subgraph_.num_layers); ++i) {
         ORT_RETURN_IF_ERROR(reorder_past_state_func_(cuda_device_prop_,

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h
@@ -51,7 +51,7 @@ class BeamSearchWhisper : public BeamSearchBase<T> {
         expand_buffer_float16_func_(expand_buffer_float16_func),
         create_beam_scorer_func_(create_beam_scorer_func) {}
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   Status InitializeCuda(
       const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
       const GenerationDeviceHelper::InitCacheIndirFunc& init_cache_indir_func,
@@ -85,7 +85,7 @@ class BeamSearchWhisper : public BeamSearchBase<T> {
   // Device specific functions
   GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
   GenerationDeviceHelper::InitBeamStateFunc<T> init_beam_state_func_;
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
   GenerationDeviceHelper::InitCacheIndirFunc init_cache_indir_func_;
 #endif
@@ -272,7 +272,7 @@ Status BeamSearchWhisper<T>::Execute(const FeedsFetchesManager& encoder_feeds_fe
       auto cross_attention_past_key_sz = first_cross_attention_key->Shape().Size();
       beam_state.EnsurePastStateReorderStagingBuffer(this->temp_space_allocator_, cross_attention_past_key_sz);
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       // Here we only need to reorder the past key for self-attention and cross-attention.
       for (size_t i = 0; i < 2 * static_cast<size_t>(decoder_subgraph_.num_layers); ++i) {
         ORT_RETURN_IF_ERROR(reorder_past_state_func_(cuda_device_prop_,

diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h
@@ -33,7 +33,7 @@ enum DeviceCopyDirection {
 
 namespace GenerationDeviceHelper {
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
 using ReorderPastStateFunc = std::function<Status(
     const void* cuda_device_prop,  // cudaDeviceProp
     Tensor& past_state,

diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc b/onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc
@@ -203,7 +203,7 @@ Status GreedySearch::Compute(OpKernelContext* ctx) const {
           init_greedy_state_func_ ? init_greedy_state_func_ : GenerationCpuDeviceHelper::InitGreedyState<float>,
           device_copy_func_ ? device_copy_func_ : GenerationCpuDeviceHelper::DeviceCopy<float>,
           update_gpt_feeds_func_ ? update_gpt_feeds_func_ : GenerationCpuDeviceHelper::UpdateGptFeeds<float>};
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -227,7 +227,7 @@ Status GreedySearch::Compute(OpKernelContext* ctx) const {
           init_greedy_state_fp16_func_,
           device_copy_func_,
           update_gpt_feeds_fp16_func_};
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());

diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search.h b/onnxruntime/contrib_ops/cpu/transformers/greedy_search.h
@@ -60,7 +60,7 @@ class GreedySearch : public IControlFlowKernel {
     init_greedy_state_fp16_func_ = init_greedy_state_fp16_func;
   }
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   void SetDeviceHelpers_Cuda(const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func) {
     reorder_past_state_func_ = reorder_past_state_func;
   }
@@ -73,7 +73,7 @@ class GreedySearch : public IControlFlowKernel {
     update_gpt_feeds_fp16_func_ = update_gpt_feeds_fp16_func;
   }
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   const void* cuda_device_prop_ = nullptr;
   int cuda_device_arch_ = 0;
 #endif
@@ -90,7 +90,7 @@ class GreedySearch : public IControlFlowKernel {
   GenerationDeviceHelper::InitGreedyStateFunc<float> init_greedy_state_func_;
   GenerationDeviceHelper::InitGreedyStateFunc<MLFloat16> init_greedy_state_fp16_func_;
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
 #endif
 

diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
@@ -60,7 +60,7 @@ class GreedySearchGpt : public GreedySearchBase<T, ParametersT> {
         init_greedy_state_func_(init_greedy_state_func),
         update_feeds_func_(update_feeds_func) {}
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   Status InitializeCuda(
       const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
       const void* cuda_device_prop,
@@ -109,7 +109,7 @@ class GreedySearchGpt : public GreedySearchBase<T, ParametersT> {
   GenerationDeviceHelper::CreateGptInputsFunc create_inputs_func_;
   GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
   GenerationDeviceHelper::InitGreedyStateFunc<T> init_greedy_state_func_;
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
 #endif
   GenerationDeviceHelper::UpdateGptFeedsFunc<T> update_feeds_func_;
@@ -336,7 +336,7 @@ Status GreedySearchGpt<T, ParametersT>::Execute(const FeedsFetchesManager* init_
     // Increase sequence length after a new token is generated.
     ++current_length;
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
     // Reorder past state after first run if the GPT subgraph (the one used after the first iteration)
     // contains DecoderMaskedSelfAttention nodes
     if (iteration_counter == 1 && gpt_subgraph_.has_decoder_masked_attention_) {

diff --git a/onnxruntime/contrib_ops/cpu/transformers/sampling.cc b/onnxruntime/contrib_ops/cpu/transformers/sampling.cc
@@ -139,7 +139,7 @@ Status Sampling::Compute(OpKernelContext* ctx) const {
           init_greedy_state_func_ ? init_greedy_state_func_ : GenerationCpuDeviceHelper::InitGreedyState<float>,
           device_copy_func_ ? device_copy_func_ : GenerationCpuDeviceHelper::DeviceCopy<float>,
           update_gpt_feeds_func_ ? update_gpt_feeds_func_ : GenerationCpuDeviceHelper::UpdateGptFeeds<float>};
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, gpu_device_prop_, gpu_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -163,7 +163,7 @@ Status Sampling::Compute(OpKernelContext* ctx) const {
           init_greedy_state_fp16_func_,
           device_copy_func_,
           update_gpt_feeds_fp16_func_};
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, gpu_device_prop_, gpu_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());

diff --git a/onnxruntime/contrib_ops/cpu/transformers/sampling.h b/onnxruntime/contrib_ops/cpu/transformers/sampling.h
@@ -57,7 +57,7 @@ class Sampling : public IControlFlowKernel {
     init_greedy_state_fp16_func_ = init_greedy_state_fp16_func;
   }
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   void SetDeviceHelpers_Cuda(const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func) {
     reorder_past_state_func_ = reorder_past_state_func;
   }
@@ -70,7 +70,7 @@ class Sampling : public IControlFlowKernel {
     update_gpt_feeds_fp16_func_ = update_gpt_feeds_fp16_func;
   }
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   const void* gpu_device_prop_ = nullptr;
   int gpu_device_arch_ = 0;
 #endif
@@ -87,7 +87,7 @@ class Sampling : public IControlFlowKernel {
   GenerationDeviceHelper::InitGreedyStateFunc<float> init_greedy_state_func_;
   GenerationDeviceHelper::InitGreedyStateFunc<MLFloat16> init_greedy_state_fp16_func_;
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
+#ifdef USE_CUDA
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
 #endif