Skip to content

Commit

Permalink
revert changes to onnxruntime/contrib_ops/cpu/transformers
Browse files: browse the repository at this point in the history
  • Loading branch information
jeffdaily committed Oct 9, 2023
1 parent f3e251b commit a227096
Show file tree
Hide file tree
Showing 11 changed files with 32 additions and 32 deletions.
12 changes: 6 additions & 6 deletions onnxruntime/contrib_ops/cpu/transformers/beam_search.cc
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
device_copy_int32_func_ ? device_copy_int32_func_ : GenerationCpuDeviceHelper::DeviceCopy<int32_t>,
update_gpt_feeds_func_ ? update_gpt_feeds_func_ : GenerationCpuDeviceHelper::UpdateGptFeeds<float>,
create_beam_scorer_func_};
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
#endif
ORT_RETURN_IF_ERROR(impl.Initialize());
Expand All @@ -240,7 +240,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
device_copy_int32_func_,
update_gpt_feeds_fp16_func_,
create_beam_scorer_func_};
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
#endif
ORT_RETURN_IF_ERROR(impl.Initialize());
Expand Down Expand Up @@ -271,7 +271,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
expand_buffer_float_func_ ? expand_buffer_float_func_ : GenerationCpuDeviceHelper::ExpandBuffer<float>,
expand_buffer_float16_func_ ? expand_buffer_float16_func_ : GenerationCpuDeviceHelper::ExpandBuffer<MLFloat16>,
create_beam_scorer_func_};
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
#endif
ORT_RETURN_IF_ERROR(impl.Initialize());
Expand All @@ -293,7 +293,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
expand_buffer_float_func_,
expand_buffer_float16_func_,
create_beam_scorer_func_};
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
#endif
ORT_RETURN_IF_ERROR(impl.Initialize());
Expand All @@ -320,7 +320,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
expand_buffer_float_func_ ? expand_buffer_float_func_ : GenerationCpuDeviceHelper::ExpandBuffer<float>,
expand_buffer_float16_func_ ? expand_buffer_float16_func_ : GenerationCpuDeviceHelper::ExpandBuffer<MLFloat16>,
create_beam_scorer_func_};
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
#endif
ORT_RETURN_IF_ERROR(impl.Initialize());
Expand All @@ -341,7 +341,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
expand_buffer_float_func_,
expand_buffer_float16_func_,
create_beam_scorer_func_};
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
#endif
ORT_RETURN_IF_ERROR(impl.Initialize());
Expand Down
6 changes: 3 additions & 3 deletions onnxruntime/contrib_ops/cpu/transformers/beam_search.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class BeamSearch : public IControlFlowKernel {
create_beam_scorer_func_ = create_beam_scorer_func;
}

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
void SetDeviceHelpers_Cuda(
const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
const GenerationDeviceHelper::InitCacheIndirFunc& init_cache_indir_func) {
Expand Down Expand Up @@ -96,7 +96,7 @@ class BeamSearch : public IControlFlowKernel {
expand_buffer_float16_func_ = expand_buffer_float16_func;
}

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
const void* cuda_device_prop_ = nullptr;
int cuda_device_arch_ = 0;
#endif
Expand All @@ -115,7 +115,7 @@ class BeamSearch : public IControlFlowKernel {
GenerationDeviceHelper::InitBeamStateFunc<MLFloat16> init_beam_state_fp16_func_;
GenerationDeviceHelper::CreateBeamScorer create_beam_scorer_func_;

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
GenerationDeviceHelper::InitCacheIndirFunc init_cache_indir_func_;
#endif
Expand Down
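6 changes: 3 additions & 3 deletions onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
Original file line number Diff line number Diff line change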
Expand Up @@ -46,7 +46,7 @@ class BeamSearchGpt : public BeamSearchBase<T> {
update_feeds_func_(update_feeds_func),
create_beam_scorer_func_(create_beam_scorer_func) {}

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
Status InitializeCuda(
const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
const void* cuda_device_prop,
Expand Down Expand Up @@ -100,7 +100,7 @@ class BeamSearchGpt : public BeamSearchBase<T> {
GenerationDeviceHelper::CreateGptInputsFunc create_inputs_func_;
GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
GenerationDeviceHelper::InitBeamStateFunc<T> init_beam_state_func_;
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
#endif
GenerationDeviceHelper::UpdateGptFeedsFunc<T> update_feeds_func_;
Expand Down Expand Up @@ -336,7 +336,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager* init_run_feeds_fetch
// Increase sequence length after a new token is generated.
++current_length;

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
// Reorder past state after first run if the GPT subgraph (the one used after the first iteration)
// contains DecoderMaskedSelfAttention nodes
if (iteration_counter == 1 && gpt_subgraph_.has_decoder_masked_attention_) {
Expand Down
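6 changes: 3 additions & 3 deletions onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
Original file line number Diff line number Diff line change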
Expand Up @@ -53,7 +53,7 @@ class BeamSearchT5 : public BeamSearchBase<T> {
expand_buffer_float16_func_(expand_buffer_float16_func),
create_beam_scorer_func_(create_beam_scorer_func) {}

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
Status InitializeCuda(
const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
const GenerationDeviceHelper::InitCacheIndirFunc& init_cache_indir_func,
Expand Down Expand Up @@ -87,7 +87,7 @@ class BeamSearchT5 : public BeamSearchBase<T> {
// Device specific functions
GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
GenerationDeviceHelper::InitBeamStateFunc<T> init_beam_state_func_;
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
GenerationDeviceHelper::InitCacheIndirFunc init_cache_indir_func_;
#endif
Expand Down Expand Up @@ -280,7 +280,7 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
auto cross_attention_past_key_sz = first_cross_attention_key->Shape().Size();
beam_state.EnsurePastStateReorderStagingBuffer(this->temp_space_allocator_, cross_attention_past_key_sz);

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
// Here we only need to reorder the past key for self-attention and cross-attention.
for (size_t i = 0; i < 2 * static_cast<size_t>(decoder_subgraph_.num_layers); ++i) {
ORT_RETURN_IF_ERROR(reorder_past_state_func_(cuda_device_prop_,
Expand Down
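6 changes: 3 additions & 3 deletions onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h
Original file line number Diff line number Diff line change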
Expand Up @@ -51,7 +51,7 @@ class BeamSearchWhisper : public BeamSearchBase<T> {
expand_buffer_float16_func_(expand_buffer_float16_func),
create_beam_scorer_func_(create_beam_scorer_func) {}

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
Status InitializeCuda(
const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
const GenerationDeviceHelper::InitCacheIndirFunc& init_cache_indir_func,
Expand Down Expand Up @@ -85,7 +85,7 @@ class BeamSearchWhisper : public BeamSearchBase<T> {
// Device specific functions
GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
GenerationDeviceHelper::InitBeamStateFunc<T> init_beam_state_func_;
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
GenerationDeviceHelper::InitCacheIndirFunc init_cache_indir_func_;
#endif
Expand Down Expand Up @@ -272,7 +272,7 @@ Status BeamSearchWhisper<T>::Execute(const FeedsFetchesManager& encoder_feeds_fe
auto cross_attention_past_key_sz = first_cross_attention_key->Shape().Size();
beam_state.EnsurePastStateReorderStagingBuffer(this->temp_space_allocator_, cross_attention_past_key_sz);

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
// Here we only need to reorder the past key for self-attention and cross-attention.
for (size_t i = 0; i < 2 * static_cast<size_t>(decoder_subgraph_.num_layers); ++i) {
ORT_RETURN_IF_ERROR(reorder_past_state_func_(cuda_device_prop_,
Expand Down
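2 changes: 1 addition & 1 deletion onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h
Original file line number Diff line number Diff line change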
Expand Up @@ -33,7 +33,7 @@ enum DeviceCopyDirection {

namespace GenerationDeviceHelper {

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
using ReorderPastStateFunc = std::function<Status(
const void* cuda_device_prop, // cudaDeviceProp
Tensor& past_state,
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ Status GreedySearch::Compute(OpKernelContext* ctx) const {
init_greedy_state_func_ ? init_greedy_state_func_ : GenerationCpuDeviceHelper::InitGreedyState<float>,
device_copy_func_ ? device_copy_func_ : GenerationCpuDeviceHelper::DeviceCopy<float>,
update_gpt_feeds_func_ ? update_gpt_feeds_func_ : GenerationCpuDeviceHelper::UpdateGptFeeds<float>};
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
#endif
ORT_RETURN_IF_ERROR(impl.Initialize());
Expand All @@ -227,7 +227,7 @@ Status GreedySearch::Compute(OpKernelContext* ctx) const {
init_greedy_state_fp16_func_,
device_copy_func_,
update_gpt_feeds_fp16_func_};
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
#endif
ORT_RETURN_IF_ERROR(impl.Initialize());
Expand Down
6 changes: 3 additions & 3 deletions onnxruntime/contrib_ops/cpu/transformers/greedy_search.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class GreedySearch : public IControlFlowKernel {
init_greedy_state_fp16_func_ = init_greedy_state_fp16_func;
}

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
// Stores the given past-state reordering callback in reorder_past_state_func_.
void SetDeviceHelpers_Cuda(const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func) {
reorder_past_state_func_ = reorder_past_state_func;
}
Expand All @@ -73,7 +73,7 @@ class GreedySearch : public IControlFlowKernel {
update_gpt_feeds_fp16_func_ = update_gpt_feeds_fp16_func;
}

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
const void* cuda_device_prop_ = nullptr;
int cuda_device_arch_ = 0;
#endif
Expand All @@ -90,7 +90,7 @@ class GreedySearch : public IControlFlowKernel {
GenerationDeviceHelper::InitGreedyStateFunc<float> init_greedy_state_func_;
GenerationDeviceHelper::InitGreedyStateFunc<MLFloat16> init_greedy_state_fp16_func_;

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
#endif

Expand Down
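6 changes: 3 additions & 3 deletions onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
Original file line number Diff line number Diff line change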
Expand Up @@ -60,7 +60,7 @@ class GreedySearchGpt : public GreedySearchBase<T, ParametersT> {
init_greedy_state_func_(init_greedy_state_func),
update_feeds_func_(update_feeds_func) {}

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
Status InitializeCuda(
const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
const void* cuda_device_prop,
Expand Down Expand Up @@ -109,7 +109,7 @@ class GreedySearchGpt : public GreedySearchBase<T, ParametersT> {
GenerationDeviceHelper::CreateGptInputsFunc create_inputs_func_;
GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
GenerationDeviceHelper::InitGreedyStateFunc<T> init_greedy_state_func_;
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
#endif
GenerationDeviceHelper::UpdateGptFeedsFunc<T> update_feeds_func_;
Expand Down Expand Up @@ -336,7 +336,7 @@ Status GreedySearchGpt<T, ParametersT>::Execute(const FeedsFetchesManager* init_
// Increase sequence length after a new token is generated.
++current_length;

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
// Reorder past state after first run if the GPT subgraph (the one used after the first iteration)
// contains DecoderMaskedSelfAttention nodes
if (iteration_counter == 1 && gpt_subgraph_.has_decoder_masked_attention_) {
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/contrib_ops/cpu/transformers/sampling.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ Status Sampling::Compute(OpKernelContext* ctx) const {
init_greedy_state_func_ ? init_greedy_state_func_ : GenerationCpuDeviceHelper::InitGreedyState<float>,
device_copy_func_ ? device_copy_func_ : GenerationCpuDeviceHelper::DeviceCopy<float>,
update_gpt_feeds_func_ ? update_gpt_feeds_func_ : GenerationCpuDeviceHelper::UpdateGptFeeds<float>};
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, gpu_device_prop_, gpu_device_arch_));
#endif
ORT_RETURN_IF_ERROR(impl.Initialize());
Expand All @@ -163,7 +163,7 @@ Status Sampling::Compute(OpKernelContext* ctx) const {
init_greedy_state_fp16_func_,
device_copy_func_,
update_gpt_feeds_fp16_func_};
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, gpu_device_prop_, gpu_device_arch_));
#endif
ORT_RETURN_IF_ERROR(impl.Initialize());
Expand Down
6 changes: 3 additions & 3 deletions onnxruntime/contrib_ops/cpu/transformers/sampling.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class Sampling : public IControlFlowKernel {
init_greedy_state_fp16_func_ = init_greedy_state_fp16_func;
}

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
// Stores the given past-state reordering callback in reorder_past_state_func_.
void SetDeviceHelpers_Cuda(const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func) {
reorder_past_state_func_ = reorder_past_state_func;
}
Expand All @@ -70,7 +70,7 @@ class Sampling : public IControlFlowKernel {
update_gpt_feeds_fp16_func_ = update_gpt_feeds_fp16_func;
}

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
const void* gpu_device_prop_ = nullptr;
int gpu_device_arch_ = 0;
#endif
Expand All @@ -87,7 +87,7 @@ class Sampling : public IControlFlowKernel {
GenerationDeviceHelper::InitGreedyStateFunc<float> init_greedy_state_func_;
GenerationDeviceHelper::InitGreedyStateFunc<MLFloat16> init_greedy_state_fp16_func_;

#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
#endif

Expand Down

0 comments on commit a227096

Please sign in to comment.