CUDA EP vs ROCM EP hipify audit

Migrate all CUDA EP improvements and changes to ROCM EP. The process involves using hipify against all CUDA EP files (i.e. do not exclude any files from onnxruntime_rocm_hipify.cmake) then vimdiff compare them against the ROCM EP files that are under source control and pull in most changes. These changes include functional as well as formatting and makes comparing CUDA EP and ROCM EP easier, though it makes the PR diff somewhat less obvious due to formatting changes. - hipify audit of onnxruntime/core/providers/rocm, enable ops - Loop - Scan - hipify audit of onnxruntime/contrib_ops/rocm - fix contrib ops search implementation - enable more contrib ops - Affine - ComplexMul - ConvTransposeWithDynamicPads - Crop - DynamicSlice - FFT [Rfft, Irfft] - GreedySearch - ImageScaler - ParametricSoftplus - ScaledTanh - ThresholdRelu
ROCm · Oct 3, 2023 · b1f3135 · b1f3135
1 parent d11e053
commit b1f3135
Show file tree

Hide file tree

Showing 46 changed files with 1,501 additions and 1,504 deletions.
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
@@ -1516,6 +1516,7 @@ if (onnxruntime_USE_ROCM)
   find_package(hiprand REQUIRED)
   find_package(rocblas REQUIRED)
   find_package(MIOpen REQUIRED)
+  find_package(hipfft REQUIRED)
 
   # MIOpen version
   if(NOT DEFINED ENV{MIOPEN_PATH})
@@ -1554,7 +1555,7 @@ if (onnxruntime_USE_ROCM)
 
   find_library(RCCL_LIB rccl REQUIRED)
   find_library(ROCTRACER_LIB roctracer64 REQUIRED)
-  set(ONNXRUNTIME_ROCM_LIBS roc::rocblas MIOpen ${RCCL_LIB} ${ROCTRACER_LIB})
+  set(ONNXRUNTIME_ROCM_LIBS roc::rocblas MIOpen hip::hipfft ${RCCL_LIB} ${ROCTRACER_LIB})
 
   file(GLOB_RECURSE onnxruntime_providers_rocm_cc_srcs CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/core/providers/rocm/*.h"

diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake
@@ -48,15 +48,6 @@ set(contrib_ops_excluded_files
   "diffusion/group_norm_impl.cu"
   "diffusion/group_norm_impl.h"
   "diffusion/nhwc_conv.cc"
-  "math/complex_mul.cc"
-  "math/complex_mul.h"
-  "math/complex_mul_impl.cu"
-  "math/complex_mul_impl.h"
-  "math/cufft_plan_cache.h"
-  "math/fft_ops.cc"
-  "math/fft_ops.h"
-  "math/fft_ops_impl.cu"
-  "math/fft_ops_impl.h"
   "quantization/attention_quantization.cc"
   "quantization/attention_quantization.h"
   "quantization/attention_quantization_impl.cu"
@@ -86,19 +77,6 @@ set(contrib_ops_excluded_files
   "quantization/qordered_ops/qordered_unary_ops.cc"
   "quantization/qordered_ops/qordered_unary_ops_impl.h"
   "quantization/qordered_ops/qordered_unary_ops_impl.cu"
-  "tensor/crop.cc"
-  "tensor/crop.h"
-  "tensor/crop_impl.cu"
-  "tensor/crop_impl.h"
-  "tensor/dynamicslice.cc"
-  "tensor/image_scaler.cc"
-  "tensor/image_scaler.h"
-  "tensor/image_scaler_impl.cu"
-  "tensor/image_scaler_impl.h"
-  "transformers/greedy_search.cc"
-  "transformers/greedy_search.h"
-  "conv_transpose_with_dynamic_pads.cc"
-  "conv_transpose_with_dynamic_pads.h"
   "cuda_contrib_kernels.cc"
   "cuda_contrib_kernels.h"
   "inverse.cc"
@@ -114,10 +92,6 @@ endif()
 
 set(provider_excluded_files
   "atomic/common.cuh"
-  "controlflow/loop.cc"
-  "controlflow/loop.h"
-  "controlflow/scan.cc"
-  "controlflow/scan.h"
   "cu_inc/common.cuh"
   "math/einsum_utils/einsum_auxiliary_ops.cc"
   "math/einsum_utils/einsum_auxiliary_ops.h"
@@ -165,7 +139,6 @@ set(provider_excluded_files
   "cuda_memory_check.h"
   "cuda_fence.cc"
   "cuda_fence.h"
-  "cuda_fwd.h"
   "cuda_kernel.h"
   "cuda_pch.cc"
   "cuda_pch.h"

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search.cc
@@ -217,7 +217,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           device_copy_int32_func_ ? device_copy_int32_func_ : GenerationCpuDeviceHelper::DeviceCopy<int32_t>,
           update_gpt_feeds_func_ ? update_gpt_feeds_func_ : GenerationCpuDeviceHelper::UpdateGptFeeds<float>,
           create_beam_scorer_func_};
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -240,7 +240,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           device_copy_int32_func_,
           update_gpt_feeds_fp16_func_,
           create_beam_scorer_func_};
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -271,7 +271,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           expand_buffer_float_func_ ? expand_buffer_float_func_ : GenerationCpuDeviceHelper::ExpandBuffer<float>,
           expand_buffer_float16_func_ ? expand_buffer_float16_func_ : GenerationCpuDeviceHelper::ExpandBuffer<MLFloat16>,
           create_beam_scorer_func_};
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -293,7 +293,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           expand_buffer_float_func_,
           expand_buffer_float16_func_,
           create_beam_scorer_func_};
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -320,7 +320,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           expand_buffer_float_func_ ? expand_buffer_float_func_ : GenerationCpuDeviceHelper::ExpandBuffer<float>,
           expand_buffer_float16_func_ ? expand_buffer_float16_func_ : GenerationCpuDeviceHelper::ExpandBuffer<MLFloat16>,
           create_beam_scorer_func_};
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -341,7 +341,7 @@ Status BeamSearch::Compute(OpKernelContext* ctx) const {
           expand_buffer_float_func_,
           expand_buffer_float16_func_,
           create_beam_scorer_func_};
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, init_cache_indir_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search.h
@@ -66,7 +66,7 @@ class BeamSearch : public IControlFlowKernel {
     create_beam_scorer_func_ = create_beam_scorer_func;
   }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   void SetDeviceHelpers_Cuda(
       const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
       const GenerationDeviceHelper::InitCacheIndirFunc& init_cache_indir_func) {
@@ -96,7 +96,7 @@ class BeamSearch : public IControlFlowKernel {
     expand_buffer_float16_func_ = expand_buffer_float16_func;
   }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   const void* cuda_device_prop_ = nullptr;
   int cuda_device_arch_ = 0;
 #endif
@@ -115,7 +115,7 @@ class BeamSearch : public IControlFlowKernel {
   GenerationDeviceHelper::InitBeamStateFunc<MLFloat16> init_beam_state_fp16_func_;
   GenerationDeviceHelper::CreateBeamScorer create_beam_scorer_func_;
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
   GenerationDeviceHelper::InitCacheIndirFunc init_cache_indir_func_;
 #endif

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h
@@ -46,7 +46,7 @@ class BeamSearchGpt : public BeamSearchBase<T> {
         update_feeds_func_(update_feeds_func),
         create_beam_scorer_func_(create_beam_scorer_func) {}
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   Status InitializeCuda(
       const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
       const void* cuda_device_prop,
@@ -100,7 +100,7 @@ class BeamSearchGpt : public BeamSearchBase<T> {
   GenerationDeviceHelper::CreateGptInputsFunc create_inputs_func_;
   GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
   GenerationDeviceHelper::InitBeamStateFunc<T> init_beam_state_func_;
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
 #endif
   GenerationDeviceHelper::UpdateGptFeedsFunc<T> update_feeds_func_;
@@ -336,7 +336,7 @@ Status BeamSearchGpt<T>::Execute(const FeedsFetchesManager* init_run_feeds_fetch
     // Increase sequence length after a new token is generated.
     ++current_length;
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
     // Reorder past state after first run if the GPT subgraph (the one used after the first iteration)
     // contains DecoderMaskedSelfAttention nodes
     if (iteration_counter == 1 && gpt_subgraph_.has_decoder_masked_attention_) {

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_t5.h
@@ -53,7 +53,7 @@ class BeamSearchT5 : public BeamSearchBase<T> {
         expand_buffer_float16_func_(expand_buffer_float16_func),
         create_beam_scorer_func_(create_beam_scorer_func) {}
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   Status InitializeCuda(
       const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
       const GenerationDeviceHelper::InitCacheIndirFunc& init_cache_indir_func,
@@ -87,7 +87,7 @@ class BeamSearchT5 : public BeamSearchBase<T> {
   // Device specific functions
   GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
   GenerationDeviceHelper::InitBeamStateFunc<T> init_beam_state_func_;
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
   GenerationDeviceHelper::InitCacheIndirFunc init_cache_indir_func_;
 #endif
@@ -280,7 +280,7 @@ Status BeamSearchT5<T>::Execute(const FeedsFetchesManager& encoder_feeds_fetches
       auto cross_attention_past_key_sz = first_cross_attention_key->Shape().Size();
       beam_state.EnsurePastStateReorderStagingBuffer(this->temp_space_allocator_, cross_attention_past_key_sz);
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       // Here we only need to reorder the past key for self-attention and cross-attention.
       for (size_t i = 0; i < 2 * static_cast<size_t>(decoder_subgraph_.num_layers); ++i) {
         ORT_RETURN_IF_ERROR(reorder_past_state_func_(cuda_device_prop_,

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h
@@ -51,7 +51,7 @@ class BeamSearchWhisper : public BeamSearchBase<T> {
         expand_buffer_float16_func_(expand_buffer_float16_func),
         create_beam_scorer_func_(create_beam_scorer_func) {}
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   Status InitializeCuda(
       const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
       const GenerationDeviceHelper::InitCacheIndirFunc& init_cache_indir_func,
@@ -85,7 +85,7 @@ class BeamSearchWhisper : public BeamSearchBase<T> {
   // Device specific functions
   GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
   GenerationDeviceHelper::InitBeamStateFunc<T> init_beam_state_func_;
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
   GenerationDeviceHelper::InitCacheIndirFunc init_cache_indir_func_;
 #endif
@@ -272,7 +272,7 @@ Status BeamSearchWhisper<T>::Execute(const FeedsFetchesManager& encoder_feeds_fe
       auto cross_attention_past_key_sz = first_cross_attention_key->Shape().Size();
       beam_state.EnsurePastStateReorderStagingBuffer(this->temp_space_allocator_, cross_attention_past_key_sz);
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       // Here we only need to reorder the past key for self-attention and cross-attention.
       for (size_t i = 0; i < 2 * static_cast<size_t>(decoder_subgraph_.num_layers); ++i) {
         ORT_RETURN_IF_ERROR(reorder_past_state_func_(cuda_device_prop_,

diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h
@@ -33,7 +33,7 @@ enum DeviceCopyDirection {
 
 namespace GenerationDeviceHelper {
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
 using ReorderPastStateFunc = std::function<Status(
     const void* cuda_device_prop,  // cudaDeviceProp
     Tensor& past_state,

diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc b/onnxruntime/contrib_ops/cpu/transformers/greedy_search.cc
@@ -203,7 +203,7 @@ Status GreedySearch::Compute(OpKernelContext* ctx) const {
           init_greedy_state_func_ ? init_greedy_state_func_ : GenerationCpuDeviceHelper::InitGreedyState<float>,
           device_copy_func_ ? device_copy_func_ : GenerationCpuDeviceHelper::DeviceCopy<float>,
           update_gpt_feeds_func_ ? update_gpt_feeds_func_ : GenerationCpuDeviceHelper::UpdateGptFeeds<float>};
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -227,7 +227,7 @@ Status GreedySearch::Compute(OpKernelContext* ctx) const {
           init_greedy_state_fp16_func_,
           device_copy_func_,
           update_gpt_feeds_fp16_func_};
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, cuda_device_prop_, cuda_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());

diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search.h b/onnxruntime/contrib_ops/cpu/transformers/greedy_search.h
@@ -60,7 +60,7 @@ class GreedySearch : public IControlFlowKernel {
     init_greedy_state_fp16_func_ = init_greedy_state_fp16_func;
   }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   void SetDeviceHelpers_Cuda(const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func) {
     reorder_past_state_func_ = reorder_past_state_func;
   }
@@ -73,7 +73,7 @@ class GreedySearch : public IControlFlowKernel {
     update_gpt_feeds_fp16_func_ = update_gpt_feeds_fp16_func;
   }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   const void* cuda_device_prop_ = nullptr;
   int cuda_device_arch_ = 0;
 #endif
@@ -90,7 +90,7 @@ class GreedySearch : public IControlFlowKernel {
   GenerationDeviceHelper::InitGreedyStateFunc<float> init_greedy_state_func_;
   GenerationDeviceHelper::InitGreedyStateFunc<MLFloat16> init_greedy_state_fp16_func_;
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
 #endif
 

diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h
@@ -60,7 +60,7 @@ class GreedySearchGpt : public GreedySearchBase<T, ParametersT> {
         init_greedy_state_func_(init_greedy_state_func),
         update_feeds_func_(update_feeds_func) {}
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   Status InitializeCuda(
       const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func,
       const void* cuda_device_prop,
@@ -109,7 +109,7 @@ class GreedySearchGpt : public GreedySearchBase<T, ParametersT> {
   GenerationDeviceHelper::CreateGptInputsFunc create_inputs_func_;
   GenerationDeviceHelper::AddToFeedsFunc add_to_feeds_func_;
   GenerationDeviceHelper::InitGreedyStateFunc<T> init_greedy_state_func_;
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
 #endif
   GenerationDeviceHelper::UpdateGptFeedsFunc<T> update_feeds_func_;
@@ -336,7 +336,7 @@ Status GreedySearchGpt<T, ParametersT>::Execute(const FeedsFetchesManager* init_
     // Increase sequence length after a new token is generated.
     ++current_length;
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
     // Reorder past state after first run if the GPT subgraph (the one used after the first iteration)
     // contains DecoderMaskedSelfAttention nodes
     if (iteration_counter == 1 && gpt_subgraph_.has_decoder_masked_attention_) {

diff --git a/onnxruntime/contrib_ops/cpu/transformers/sampling.cc b/onnxruntime/contrib_ops/cpu/transformers/sampling.cc
@@ -139,7 +139,7 @@ Status Sampling::Compute(OpKernelContext* ctx) const {
           init_greedy_state_func_ ? init_greedy_state_func_ : GenerationCpuDeviceHelper::InitGreedyState<float>,
           device_copy_func_ ? device_copy_func_ : GenerationCpuDeviceHelper::DeviceCopy<float>,
           update_gpt_feeds_func_ ? update_gpt_feeds_func_ : GenerationCpuDeviceHelper::UpdateGptFeeds<float>};
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, gpu_device_prop_, gpu_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());
@@ -163,7 +163,7 @@ Status Sampling::Compute(OpKernelContext* ctx) const {
           init_greedy_state_fp16_func_,
           device_copy_func_,
           update_gpt_feeds_fp16_func_};
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
       ORT_RETURN_IF_ERROR(impl.InitializeCuda(reorder_past_state_func_, gpu_device_prop_, gpu_device_arch_));
 #endif
       ORT_RETURN_IF_ERROR(impl.Initialize());

diff --git a/onnxruntime/contrib_ops/cpu/transformers/sampling.h b/onnxruntime/contrib_ops/cpu/transformers/sampling.h
@@ -57,7 +57,7 @@ class Sampling : public IControlFlowKernel {
     init_greedy_state_fp16_func_ = init_greedy_state_fp16_func;
   }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   void SetDeviceHelpers_Cuda(const GenerationDeviceHelper::ReorderPastStateFunc& reorder_past_state_func) {
     reorder_past_state_func_ = reorder_past_state_func;
   }
@@ -70,7 +70,7 @@ class Sampling : public IControlFlowKernel {
     update_gpt_feeds_fp16_func_ = update_gpt_feeds_fp16_func;
   }
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   const void* gpu_device_prop_ = nullptr;
   int gpu_device_arch_ = 0;
 #endif
@@ -87,7 +87,7 @@ class Sampling : public IControlFlowKernel {
   GenerationDeviceHelper::InitGreedyStateFunc<float> init_greedy_state_func_;
   GenerationDeviceHelper::InitGreedyStateFunc<MLFloat16> init_greedy_state_fp16_func_;
 
-#ifdef USE_CUDA
+#if defined(USE_CUDA) || defined(USE_ROCM)
   GenerationDeviceHelper::ReorderPastStateFunc reorder_past_state_func_;
 #endif