From 0586d45be780c51cdfec1dc969fde9c161a52004 Mon Sep 17 00:00:00 2001 From: milakov Date: Mon, 23 Nov 2015 22:06:05 +0300 Subject: [PATCH] Allocating chunk of mem for fixed working buffers --- nnforge/cuda/backward_propagation_cuda.cpp | 30 ++++++++++--- .../cuda/convolution_layer_tester_cuda.cpp | 4 +- nnforge/cuda/convolution_layer_tester_cuda.h | 2 +- .../cuda/convolution_layer_updater_cuda.cpp | 42 +++++++++++-------- nnforge/cuda/convolution_layer_updater_cuda.h | 2 +- nnforge/cuda/cuda_running_configuration.cpp | 12 +++++- nnforge/cuda/cuda_running_configuration.h | 6 ++- nnforge/cuda/factory_generator_cuda.cpp | 12 ++++-- nnforge/cuda/factory_generator_cuda.h | 6 ++- nnforge/cuda/forward_propagation_cuda.cpp | 31 +++++++++++--- nnforge/cuda/layer_tester_cuda.cpp | 4 +- nnforge/cuda/layer_tester_cuda.h | 3 +- nnforge/cuda/layer_updater_cuda.cpp | 4 +- nnforge/cuda/layer_updater_cuda.h | 3 +- .../max_subsampling_layer_updater_cuda.cuh | 2 +- nnforge/cuda/maxout_layer_updater_cuda.cu | 10 ++--- 16 files changed, 121 insertions(+), 52 deletions(-) diff --git a/nnforge/cuda/backward_propagation_cuda.cpp b/nnforge/cuda/backward_propagation_cuda.cpp index 847d02c..bef4169 100644 --- a/nnforge/cuda/backward_propagation_cuda.cpp +++ b/nnforge/cuda/backward_propagation_cuda.cpp @@ -1031,12 +1031,17 @@ namespace nnforge void backward_propagation_cuda::setup_temporary_working_fixed_buffer_sizes() { + size_t max_fixed_working_buffers_size = cuda_config->get_max_fixed_working_buffers_size(); + std::vector > > temporary_working_fixed_buffer_set_list; { std::map > > buffers; for(std::vector::const_iterator it = actions_in_execution_order.begin(); it != actions_in_execution_order.end(); ++it) { - size_t temporary_working_fixed_buffer_size = updaters[it->get_name()]->get_temporary_working_fixed_buffer_size(it->get_action()); + std::pair temporary_working_fixed_buffer_size_and_flag = updaters[it->get_name()]->get_temporary_working_fixed_buffer_size(it->get_action()); + size_t temporary_working_fixed_buffer_size = temporary_working_fixed_buffer_size_and_flag.first; + if (temporary_working_fixed_buffer_size_and_flag.second) + temporary_working_fixed_buffer_size = std::max(temporary_working_fixed_buffer_size, max_fixed_working_buffers_size); if (temporary_working_fixed_buffer_size > 0) buffers.insert(std::make_pair(*it, std::vector >())).first->second.push_back(std::make_pair(buffer_lifetime(buffer_lifetime::working_buffer), static_cast(temporary_working_fixed_buffer_size))); } @@ -1062,15 +1067,30 @@ namespace nnforge temporary_working_fixed_set_size_list.clear(); temporary_working_fixed_data_action_to_set_map.clear(); + + std::set set_ids_with_hungry_working_buffers; for(unsigned int set_id = 0; set_id < temporary_working_fixed_buffer_set_list.size(); ++set_id) { const std::vector >& action_list = temporary_working_fixed_buffer_set_list[set_id]; - size_t max_buffer_size = 0; + for(std::vector >::const_iterator it = action_list.begin(); it != action_list.end(); ++it) + { + std::string layer_name = it->first.get_name(); + if (updaters[layer_name]->get_temporary_working_fixed_buffer_size(it->first.get_action()).second) + set_ids_with_hungry_working_buffers.insert(set_id); + } + } + if (set_ids_with_hungry_working_buffers.size() > 1) + max_fixed_working_buffers_size /= set_ids_with_hungry_working_buffers.size(); + + for(unsigned int set_id = 0; set_id < temporary_working_fixed_buffer_set_list.size(); ++set_id) + { + const std::vector >& action_list = temporary_working_fixed_buffer_set_list[set_id]; + size_t max_buffer_size = (set_ids_with_hungry_working_buffers.find(set_id) != set_ids_with_hungry_working_buffers.end()) ? max_fixed_working_buffers_size : 1; for(std::vector >::const_iterator it = action_list.begin(); it != action_list.end(); ++it) { std::string layer_name = it->first.get_name(); temporary_working_fixed_data_action_to_set_map.insert(std::make_pair(it->first, set_id)); - size_t buffer_size = updaters[layer_name]->get_temporary_working_fixed_buffer_size(it->first.get_action()); + size_t buffer_size = updaters[layer_name]->get_temporary_working_fixed_buffer_size(it->first.get_action()).first; max_buffer_size = std::max(max_buffer_size, buffer_size); } temporary_working_fixed_set_size_list.push_back(max_buffer_size); @@ -1087,10 +1107,10 @@ namespace nnforge { if (it != temporary_working_fixed_set_size_list.begin()) debug_str << ", "; - debug_str << ((*it + 1024 - 1) / 1024) << " KB"; + debug_str << ((*it + (1024 * 1024) - 1) / (1024 * 1024)) << " MB"; total_buffer_size += *it; } - debug_str << "), total " << ((total_buffer_size + 1024 - 1) / 1024) << " KB"; + debug_str << "), total " << ((total_buffer_size + (1024 * 1024) - 1) / (1024 * 1024)) << " MB"; } debug->output_message(debug_str.str().c_str()); boost::filesystem::ofstream out(debug->get_path_to_unique_file("backward_prop_cuda_temporary_fixed_buffers", "gv"), std::ios_base::out | std::ios_base::trunc); diff --git a/nnforge/cuda/convolution_layer_tester_cuda.cpp b/nnforge/cuda/convolution_layer_tester_cuda.cpp index e73773c..1c41175 100644 --- a/nnforge/cuda/convolution_layer_tester_cuda.cpp +++ b/nnforge/cuda/convolution_layer_tester_cuda.cpp @@ -151,12 +151,12 @@ namespace nnforge zero_padding); } - size_t convolution_layer_tester_cuda::get_temporary_working_fixed_buffer_size() const + std::pair convolution_layer_tester_cuda::get_temporary_working_fixed_buffer_size() const { unsigned int working_buffer_elem_count = input_configuration_specific_list[0].feature_map_count; for(int i = 0; i < window_sizes.size(); ++i) working_buffer_elem_count *= window_sizes[i]; - return working_buffer_elem_count * sizeof(int); + return std::make_pair(working_buffer_elem_count * sizeof(int), true); } } } diff --git a/nnforge/cuda/convolution_layer_tester_cuda.h b/nnforge/cuda/convolution_layer_tester_cuda.h index 8f2346b..fa596cb 100644 --- a/nnforge/cuda/convolution_layer_tester_cuda.h +++ b/nnforge/cuda/convolution_layer_tester_cuda.h @@ -43,7 +43,7 @@ namespace nnforge cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer, unsigned int entry_count); - virtual size_t get_temporary_working_fixed_buffer_size() const; + virtual std::pair get_temporary_working_fixed_buffer_size() const; protected: virtual void tester_configured(); diff --git a/nnforge/cuda/convolution_layer_updater_cuda.cpp b/nnforge/cuda/convolution_layer_updater_cuda.cpp index 7ae0cb9..49bb07f 100644 --- a/nnforge/cuda/convolution_layer_updater_cuda.cpp +++ b/nnforge/cuda/convolution_layer_updater_cuda.cpp @@ -299,26 +299,34 @@ namespace nnforge zero_padding); } - size_t convolution_layer_updater_cuda::get_temporary_working_fixed_buffer_size(const layer_action& action) const + std::pair convolution_layer_updater_cuda::get_temporary_working_fixed_buffer_size(const layer_action& action) const { - if (action.get_action_type() == layer_action::forward) + switch (action.get_action_type()) { - unsigned int working_buffer_elem_count = input_configuration_specific_list[0].feature_map_count; - for(int i = 0; i < window_sizes.size(); ++i) - working_buffer_elem_count *= window_sizes[i]; - - return working_buffer_elem_count * sizeof(int); - } - else if (action.get_action_type() == layer_action::backward_weights) - { - unsigned int working_buffer_elem_count = std::max(input_configuration_specific_list[0].feature_map_count, output_configuration_specific.feature_map_count); - for(int i = 0; i < window_sizes.size(); ++i) - working_buffer_elem_count *= window_sizes[i]; - - return working_buffer_elem_count * sizeof(int); + case layer_action::forward: + { + unsigned int working_buffer_elem_count = input_configuration_specific_list[0].feature_map_count; + for(int i = 0; i < window_sizes.size(); ++i) + working_buffer_elem_count *= window_sizes[i]; + return std::make_pair(working_buffer_elem_count * sizeof(int), true); + } + case layer_action::backward_data: + { + unsigned int working_buffer_elem_count = std::max(input_configuration_specific_list[0].feature_map_count, output_configuration_specific.feature_map_count); + for(int i = 0; i < window_sizes.size(); ++i) + working_buffer_elem_count *= window_sizes[i]; + return std::make_pair(working_buffer_elem_count * sizeof(int), true); + } + case layer_action::backward_weights: + { + unsigned int working_buffer_elem_count = std::max(input_configuration_specific_list[0].feature_map_count, output_configuration_specific.feature_map_count); + for(int i = 0; i < window_sizes.size(); ++i) + working_buffer_elem_count *= window_sizes[i]; + return std::make_pair(working_buffer_elem_count * sizeof(int), true); + } + default: + return std::make_pair(0, false); } - else - return layer_updater_cuda::get_temporary_working_fixed_buffer_size(action); } bool convolution_layer_updater_cuda::is_backward_data_dependent_on_input_buffer(unsigned int action_input_index, unsigned int data_input_index) const diff --git a/nnforge/cuda/convolution_layer_updater_cuda.h b/nnforge/cuda/convolution_layer_updater_cuda.h index 22195e2..5e017c5 100644 --- a/nnforge/cuda/convolution_layer_updater_cuda.h +++ b/nnforge/cuda/convolution_layer_updater_cuda.h @@ -74,7 +74,7 @@ namespace nnforge cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer, unsigned int entry_count); - virtual size_t get_temporary_working_fixed_buffer_size(const layer_action& action) const; + virtual std::pair get_temporary_working_fixed_buffer_size(const layer_action& action) const; virtual bool is_backward_data_dependent_on_input_buffer(unsigned int action_input_index, unsigned int data_input_index) const; diff --git a/nnforge/cuda/cuda_running_configuration.cpp b/nnforge/cuda/cuda_running_configuration.cpp index 477974f..58451c7 100644 --- a/nnforge/cuda/cuda_running_configuration.cpp +++ b/nnforge/cuda/cuda_running_configuration.cpp @@ -42,13 +42,15 @@ namespace nnforge unsigned int reserved_thread_count, bool dont_share_buffers, bool single_command_stream, - unsigned int optimize_action_graph_assumed_chunk_size) + unsigned int optimize_action_graph_assumed_chunk_size, + float cuda_fixed_working_buffers_ratio) : device_id(device_id) , max_global_memory_usage_ratio(max_global_memory_usage_ratio) , reserved_thread_count(reserved_thread_count) , dont_share_buffers(dont_share_buffers) , single_command_stream(single_command_stream) , optimize_action_graph_assumed_chunk_size(optimize_action_graph_assumed_chunk_size) + , cuda_fixed_working_buffers_ratio(cuda_fixed_working_buffers_ratio) , cublas_handle(0) , cusparse_handle(0) , cudnn_handle(0) @@ -180,11 +182,12 @@ namespace nnforge #ifdef _WIN32 out << "Driver mode = " << (running_configuration.tcc_mode ? "TCC" : "WDDM") << std::endl; #endif - out << "Estimated GFLOPS = " << static_cast(running_configuration.get_flops() / 1.0e+12F) << std::endl; + out << "Estimated GFLOPS = " << static_cast(running_configuration.get_flops() / 1.0e+9F) << std::endl; out << "--- Settings ---" << std::endl; out << "Max global memory usage ratio = " << running_configuration.max_global_memory_usage_ratio << std::endl; + out << "Fixed working buffers ratio = " << running_configuration.cuda_fixed_working_buffers_ratio << std::endl; out << "Threads reserved for CUDA sync (others will be used for on-the-fly data processing by job runner) = " << running_configuration.reserved_thread_count << std::endl; out << "Don't share buffers = " << running_configuration.dont_share_buffers << std::endl; out << "Use single command stream = " << running_configuration.single_command_stream << std::endl; @@ -220,6 +223,11 @@ namespace nnforge return entry_count; } + size_t cuda_running_configuration::get_max_fixed_working_buffers_size() const + { + return static_cast(static_cast(global_memory_size) * max_global_memory_usage_ratio * cuda_fixed_working_buffers_ratio); + } + cublasHandle_t cuda_running_configuration::get_cublas_handle() const { return cublas_handle; diff --git a/nnforge/cuda/cuda_running_configuration.h b/nnforge/cuda/cuda_running_configuration.h index 0c13f9d..29166ab 100644 --- a/nnforge/cuda/cuda_running_configuration.h +++ b/nnforge/cuda/cuda_running_configuration.h @@ -43,7 +43,8 @@ namespace nnforge unsigned int reserved_thread_count, bool dont_share_buffers, bool single_command_stream, - unsigned int optimize_action_graph_assumed_chunk_size); + unsigned int optimize_action_graph_assumed_chunk_size, + float cuda_fixed_working_buffers_ratio); ~cuda_running_configuration(); @@ -51,6 +52,8 @@ namespace nnforge const buffer_cuda_size_configuration& buffers_config, float ratio = 1.0F) const; + size_t get_max_fixed_working_buffers_size() const; + cublasHandle_t get_cublas_handle() const; cusparseHandle_t get_cusparse_handle() const; @@ -85,6 +88,7 @@ namespace nnforge bool dont_share_buffers; bool single_command_stream; unsigned int optimize_action_graph_assumed_chunk_size; + float cuda_fixed_working_buffers_ratio; int driver_version; int runtime_version; diff --git a/nnforge/cuda/factory_generator_cuda.cpp b/nnforge/cuda/factory_generator_cuda.cpp index 428be9e..e8c108d 100644 --- a/nnforge/cuda/factory_generator_cuda.cpp +++ b/nnforge/cuda/factory_generator_cuda.cpp @@ -31,13 +31,15 @@ namespace nnforge unsigned int cuda_reserved_thread_count, bool cuda_dont_share_buffers, bool cuda_single_command_stream, - unsigned int optimize_action_graph_assumed_chunk_size) + unsigned int cuda_optimize_action_graph_assumed_chunk_size, + float cuda_fixed_working_buffers_ratio) : cuda_device_id(cuda_device_id) , cuda_max_global_memory_usage_ratio(cuda_max_global_memory_usage_ratio) , cuda_reserved_thread_count(cuda_reserved_thread_count) , cuda_dont_share_buffers(cuda_dont_share_buffers) , cuda_single_command_stream(cuda_single_command_stream) - , optimize_action_graph_assumed_chunk_size(optimize_action_graph_assumed_chunk_size) + , cuda_optimize_action_graph_assumed_chunk_size(cuda_optimize_action_graph_assumed_chunk_size) + , cuda_fixed_working_buffers_ratio(cuda_fixed_working_buffers_ratio) { } @@ -57,7 +59,8 @@ namespace nnforge cuda_reserved_thread_count, cuda_dont_share_buffers, cuda_single_command_stream, - optimize_action_graph_assumed_chunk_size)); + cuda_optimize_action_graph_assumed_chunk_size, + cuda_fixed_working_buffers_ratio)); } forward_propagation_factory::ptr factory_generator_cuda::create_forward_propagation_factory() const @@ -75,6 +78,7 @@ namespace nnforge std::vector res; res.push_back(float_option("cuda_max_global_memory_usage_ratio,G", &cuda_max_global_memory_usage_ratio, 0.9F, "Part of the global memory to be used by a single CUDA configuration. Set to smaller value if the device is used for graphics as well")); + res.push_back(float_option("cuda_fixed_working_buffers_ratio", &cuda_fixed_working_buffers_ratio, 0.1F, "Part of memory use dby app, which is allocated to working buffers (independent of batch size)")); return res; } @@ -85,7 +89,7 @@ namespace nnforge res.push_back(int_option("cuda_device_id,D", &cuda_device_id, 0, "CUDA device ID")); res.push_back(int_option("cuda_reserved_thread_count", &cuda_reserved_thread_count, 1, "The number of hw threads not used for input data processing")); - res.push_back(int_option("optimize_action_graph_assumed_chunk_size", &optimize_action_graph_assumed_chunk_size, 32, "Assumed chunk size when optimizing action graph")); + res.push_back(int_option("cuda_optimize_action_graph_assumed_chunk_size", &cuda_optimize_action_graph_assumed_chunk_size, 32, "Assumed chunk size when optimizing action graph")); return res; } diff --git a/nnforge/cuda/factory_generator_cuda.h b/nnforge/cuda/factory_generator_cuda.h index 8fb380b..dcc86a6 100644 --- a/nnforge/cuda/factory_generator_cuda.h +++ b/nnforge/cuda/factory_generator_cuda.h @@ -32,7 +32,8 @@ namespace nnforge unsigned int cuda_reserved_thread_count, bool cuda_dont_share_buffers, bool cuda_single_command_stream, - unsigned int optimize_action_graph_assumed_chunk_size); + unsigned int cuda_optimize_action_graph_assumed_chunk_size, + float cuda_fixed_working_buffers_ratio); factory_generator_cuda(); @@ -58,7 +59,8 @@ namespace nnforge int cuda_reserved_thread_count; bool cuda_dont_share_buffers; bool cuda_single_command_stream; - int optimize_action_graph_assumed_chunk_size; + int cuda_optimize_action_graph_assumed_chunk_size; + float cuda_fixed_working_buffers_ratio; cuda_running_configuration::const_ptr cuda_config; }; diff --git a/nnforge/cuda/forward_propagation_cuda.cpp b/nnforge/cuda/forward_propagation_cuda.cpp index 350ce4e..1a86145 100644 --- a/nnforge/cuda/forward_propagation_cuda.cpp +++ b/nnforge/cuda/forward_propagation_cuda.cpp @@ -744,12 +744,17 @@ namespace nnforge void forward_propagation_cuda::setup_temporary_working_fixed_buffer_sizes() { + size_t max_fixed_working_buffers_size = cuda_config->get_max_fixed_working_buffers_size(); + std::vector > > temporary_working_fixed_buffer_set_list; { std::map > > buffers; for(std::map::const_iterator it = testers.begin(); it != testers.end(); ++it) { - size_t temporary_working_fixed_buffer_size = it->second->get_temporary_working_fixed_buffer_size(); + std::pair temporary_working_fixed_buffer_size_and_flag = it->second->get_temporary_working_fixed_buffer_size(); + size_t temporary_working_fixed_buffer_size = temporary_working_fixed_buffer_size_and_flag.first; + if (temporary_working_fixed_buffer_size_and_flag.second) + temporary_working_fixed_buffer_size = std::max(temporary_working_fixed_buffer_size, max_fixed_working_buffers_size); if (temporary_working_fixed_buffer_size > 0) buffers.insert(std::make_pair(layer_name_with_action(it->first, layer_action::forward), std::vector >())).first->second.push_back(std::make_pair(buffer_lifetime(buffer_lifetime::working_buffer), static_cast(temporary_working_fixed_buffer_size))); } @@ -775,15 +780,31 @@ namespace nnforge temporary_working_fixed_set_size_list.clear(); temporary_working_fixed_data_action_to_set_map.clear(); + + std::set set_ids_with_hungry_working_buffers; for(unsigned int set_id = 0; set_id < temporary_working_fixed_buffer_set_list.size(); ++set_id) { const std::vector >& action_list = temporary_working_fixed_buffer_set_list[set_id]; - size_t max_buffer_size = 0; + for(std::vector >::const_iterator it = action_list.begin(); it != action_list.end(); ++it) + { + std::string layer_name = it->first.get_name(); + if (testers.find(layer_name)->second->get_temporary_working_fixed_buffer_size().second) + set_ids_with_hungry_working_buffers.insert(set_id); + } + } + if (set_ids_with_hungry_working_buffers.size() > 1) + max_fixed_working_buffers_size /= set_ids_with_hungry_working_buffers.size(); + + for(unsigned int set_id = 0; set_id < temporary_working_fixed_buffer_set_list.size(); ++set_id) + { + const std::vector >& action_list = temporary_working_fixed_buffer_set_list[set_id]; + size_t max_buffer_size = (set_ids_with_hungry_working_buffers.find(set_id) != set_ids_with_hungry_working_buffers.end()) ? max_fixed_working_buffers_size : 1; + for(std::vector >::const_iterator it = action_list.begin(); it != action_list.end(); ++it) { std::string layer_name = it->first.get_name(); temporary_working_fixed_data_action_to_set_map.insert(std::make_pair(it->first, set_id)); - size_t buffer_size = testers.find(layer_name)->second->get_temporary_working_fixed_buffer_size(); + size_t buffer_size = testers.find(layer_name)->second->get_temporary_working_fixed_buffer_size().first; max_buffer_size = std::max(max_buffer_size, buffer_size); } temporary_working_fixed_set_size_list.push_back(max_buffer_size); @@ -800,10 +821,10 @@ namespace nnforge { if (it != temporary_working_fixed_set_size_list.begin()) debug_str << ", "; - debug_str << ((*it + 1024 - 1) / 1024) << " KB"; + debug_str << ((*it + (1024 * 1024) - 1) / (1024 * 1024)) << " MB"; total_buffer_size += *it; } - debug_str << "), total " << ((total_buffer_size + 1024 - 1) / 1024) << " KB"; + debug_str << "), total " << ((total_buffer_size + (1024 * 1024) - 1) / (1024 * 1024)) << " MB"; } debug->output_message(debug_str.str().c_str()); boost::filesystem::ofstream out(debug->get_path_to_unique_file("forward_prop_cuda_temporary_fixed_buffers", "gv"), std::ios_base::out | std::ios_base::trunc); diff --git a/nnforge/cuda/layer_tester_cuda.cpp b/nnforge/cuda/layer_tester_cuda.cpp index 31217a2..99cbf20 100644 --- a/nnforge/cuda/layer_tester_cuda.cpp +++ b/nnforge/cuda/layer_tester_cuda.cpp @@ -107,9 +107,9 @@ namespace nnforge return std::vector(); } - size_t layer_tester_cuda::get_temporary_working_fixed_buffer_size() const + std::pair layer_tester_cuda::get_temporary_working_fixed_buffer_size() const { - return 0; + return std::make_pair(0, false); } size_t layer_tester_cuda::get_temporary_working_per_entry_buffer_size() const diff --git a/nnforge/cuda/layer_tester_cuda.h b/nnforge/cuda/layer_tester_cuda.h index cb1ab0e..858ff63 100644 --- a/nnforge/cuda/layer_tester_cuda.h +++ b/nnforge/cuda/layer_tester_cuda.h @@ -64,7 +64,8 @@ namespace nnforge virtual int get_input_index_layer_can_write() const; - virtual size_t get_temporary_working_fixed_buffer_size() const; + // The function should return the minimum size and the flag indicating whether the tester would be happy to have larger working buffer + virtual std::pair get_temporary_working_fixed_buffer_size() const; virtual size_t get_temporary_working_per_entry_buffer_size() const; diff --git a/nnforge/cuda/layer_updater_cuda.cpp b/nnforge/cuda/layer_updater_cuda.cpp index b341f0c..f350d53 100644 --- a/nnforge/cuda/layer_updater_cuda.cpp +++ b/nnforge/cuda/layer_updater_cuda.cpp @@ -118,12 +118,12 @@ namespace nnforge { } - size_t layer_updater_cuda::get_temporary_working_fixed_buffer_size(const layer_action& action) const + std::pair layer_updater_cuda::get_temporary_working_fixed_buffer_size(const layer_action& action) const { if (actions.find(action) == actions.end()) throw neural_network_exception((boost::format("get_temporary_working_fixed_buffer_size called for layer %1% for action %2% while it is not configured to run such an action") % layer_schema->instance_name % action.str()).str()); - return 0; + return std::make_pair(0, false); } size_t layer_updater_cuda::get_temporary_working_per_entry_buffer_size(const layer_action& action) const diff --git a/nnforge/cuda/layer_updater_cuda.h b/nnforge/cuda/layer_updater_cuda.h index 997c85a..e671d09 100644 --- a/nnforge/cuda/layer_updater_cuda.h +++ b/nnforge/cuda/layer_updater_cuda.h @@ -100,7 +100,8 @@ namespace nnforge virtual std::vector get_linear_addressing_through_texture_per_entry() const; - virtual size_t get_temporary_working_fixed_buffer_size(const layer_action& action) const; + // The function should return the minimum size and the flag indicating whether the tester would be happy to have larger working buffer + virtual std::pair get_temporary_working_fixed_buffer_size(const layer_action& action) const; virtual size_t get_temporary_working_per_entry_buffer_size(const layer_action& action) const; diff --git a/nnforge/cuda/max_subsampling_layer_updater_cuda.cuh b/nnforge/cuda/max_subsampling_layer_updater_cuda.cuh index 18e546d..377ba62 100644 --- a/nnforge/cuda/max_subsampling_layer_updater_cuda.cuh +++ b/nnforge/cuda/max_subsampling_layer_updater_cuda.cuh @@ -172,7 +172,7 @@ namespace nnforge for(int i = DIMENSION_COUNT - 1; i >= 0; --i) offset = offset * output_sizes[i] + xyzw[i]; output[offset] = res[0]; - max_positions[offset] = max_pos[0]; + max_positions[offset] = static_cast(max_pos[0]); #pragma unroll for(int i = 1; i < FEATURE_MAP_BLOCK_SIZE; ++i) { diff --git a/nnforge/cuda/maxout_layer_updater_cuda.cu b/nnforge/cuda/maxout_layer_updater_cuda.cu index 896086a..271983a 100644 --- a/nnforge/cuda/maxout_layer_updater_cuda.cu +++ b/nnforge/cuda/maxout_layer_updater_cuda.cu @@ -89,7 +89,7 @@ __global__ void maxout_forward_only_upd_kernel( } template -__global__ void maxout_deriviative_upd_kernel( +__global__ void maxout_backprop_upd_kernel( float * __restrict input_errors, const position_type * __restrict max_feature_map_positions, const float * __restrict output_errors, @@ -209,7 +209,7 @@ namespace nnforge if (feature_map_subsampling_size <= 256) { if (add_update_to_destination) - maxout_deriviative_upd_kernel<<>>( + maxout_backprop_upd_kernel<<>>( *input_errors_buffer, *temporary_per_entry_buffer, *output_errors_buffer, @@ -219,7 +219,7 @@ namespace nnforge feature_map_subsampling_size, entry_count); else - maxout_deriviative_upd_kernel<<>>( + maxout_backprop_upd_kernel<<>>( *input_errors_buffer, *temporary_per_entry_buffer, *output_errors_buffer, @@ -232,7 +232,7 @@ namespace nnforge else { if (add_update_to_destination) - maxout_deriviative_upd_kernel<<>>( + maxout_backprop_upd_kernel<<>>( *input_errors_buffer, *temporary_per_entry_buffer, *output_errors_buffer, @@ -242,7 +242,7 @@ namespace nnforge feature_map_subsampling_size, entry_count); else - maxout_deriviative_upd_kernel<<>>( + maxout_backprop_upd_kernel<<>>( *input_errors_buffer, *temporary_per_entry_buffer, *output_errors_buffer,