From def551dfd50bfff4e9d50108dc4e8027b553b8ec Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 29 Sep 2023 10:30:20 +0530 Subject: [PATCH] [CLI TOOLS][RTVM] Improve rtvm tool with new options to measure native performance (#15818) * [RTVM] Improve rtvm tool with new options to measure native performance Few fixes and enhancements that affects model loading times New options to measure performance. * * review comments * * review comments --- apps/cpp_rtvm/README.md | 22 ++++ apps/cpp_rtvm/main.cc | 199 +++++++++++++++++++++++++++++++----- apps/cpp_rtvm/tvm_runner.cc | 129 +++++++++++++++++------ apps/cpp_rtvm/tvm_runner.h | 24 ++++- 4 files changed, 316 insertions(+), 58 deletions(-) diff --git a/apps/cpp_rtvm/README.md b/apps/cpp_rtvm/README.md index c60a7b0e12f5..652d46eb5821 100644 --- a/apps/cpp_rtvm/README.md +++ b/apps/cpp_rtvm/README.md @@ -122,6 +122,11 @@ Command line usage --input - Numpy file for the model input (optional and we use random of not given) --output - Numpy file name to dump the model output as numpy --dump-meta - Dump model meta information +--pre-compiled - The file name of a file where pre-compiled programs should be stored +--profile - Profile over all execution +--dry-run - Profile after given dry runs, default 10 +--run-count - Profile for given runs, default 50 +--zero-copy - Profile with zero copy api Example ./rtvm --model=keras-resnet50 --device="opencl" --dump-meta @@ -366,3 +371,20 @@ stored. If the pre-compiled file name was passed to the `rtvm` then After method `Load`, method `UsePreCompiledProgram` is called. This method loads pre-compiled programs if the file exists. In opposite case the file will be created and pre-compiled programs will be saved to this file. + +# Performnace Profiling Options +The tool has added few options to measure wall clock performance of the given model on Target natively. +--profile : Can turn on the profiling +--dry-run : The number of times dry run the model before mearuring the performance. Default value os 10 +--run-count : The number times to run the model and take an average. Default value is 50. +--zero-copy: This option enables graph runtime zero copy to be used for input and output than byte copy to DLTensor. + +Performance profile options dumps information summary as given below. + Module Load :27 ms + Graph Runtime Create :11 ms + Params Read :15 ms + Params Set :41 ms + Pre Compiled Progs Load :24 ms +Total Load Time :118 ms +Average ExecTime :27 ms +Unload Time :35.9236 ms diff --git a/apps/cpp_rtvm/main.cc b/apps/cpp_rtvm/main.cc index c38a5f62bd9a..dc3cf1c41499 100644 --- a/apps/cpp_rtvm/main.cc +++ b/apps/cpp_rtvm/main.cc @@ -29,6 +29,7 @@ #endif #include +#include #include #include #include @@ -54,7 +55,11 @@ static const string kUsage = "--input - Numpy file for the model input (optional and we use random of not given)\n" "--output - Numpy file name to dump the model output as numpy\n" "--dump-meta - Dump model meta information\n" - "--pre-compiled - The file name of a file where pre-compiled programs should be stored" + "--pre-compiled - The file name of a file where pre-compiled programs should be stored\n" + "--profile - Profile over all execution\n" + "--dry-run - Profile after given dry runs, default 10\n" + "--run-count - Profile for given runs, default 50\n" + "--zero-copy - Profile with zero copy api\n" "\n" " Example\n" " ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n" @@ -68,6 +73,7 @@ static const string kUsage = * \arg input Numpy file for the model input * \arg output Numpy file name to dump the model output as numpy * \arg pre_compiled File name where pre-compiled programs should be stored + * \arg profile Do we profile overall execution */ struct ToolArgs { string model; @@ -75,7 +81,11 @@ struct ToolArgs { string input; string output; string pre_compiled; - bool dump_meta = false; + bool dump_meta{false}; + bool profile{false}; + int dry_run{10}; + int run_count{50}; + bool zero_copy{false}; }; /*! @@ -89,6 +99,10 @@ void PrintArgs(const ToolArgs& args) { LOG(INFO) << "Output = " << args.output; LOG(INFO) << "Pre-compiled = " << args.pre_compiled; LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False")); + LOG(INFO) << "Profile = " << ((args.profile) ? ("True") : ("False")); + LOG(INFO) << "Dry Run = " << args.dry_run; + LOG(INFO) << "Run Count = " << args.run_count; + LOG(INFO) << "Zero Copy = " << ((args.zero_copy) ? ("True") : ("False")); } #if defined(__linux__) || defined(__ANDROID__) @@ -178,6 +192,26 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) { } args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled="); + + const string pprofile = GetCmdOption(argc, argv, "--profile", true); + if (!pprofile.empty()) { + args.profile = true; + } + + const string pdry_run = GetCmdOption(argc, argv, "--dry-run="); + if (!pdry_run.empty()) { + args.dry_run = stoi(pdry_run); + } + + const string prun = GetCmdOption(argc, argv, "--run-count="); + if (!prun.empty()) { + args.run_count = stoi(prun); + } + + const string pzcopy = GetCmdOption(argc, argv, "--zero-copy", true); + if (!pzcopy.empty()) { + args.zero_copy = true; + } } /*! @@ -192,59 +226,174 @@ int ExecuteModel(ToolArgs& args) { #endif // Initialize TVM Runner - TVMRunner runner = TVMRunner(args.model, args.device); + auto runner = new TVMRunner(args.model, args.device); // Load the model - runner.Load(); + runner->Load(); if (!args.pre_compiled.empty()) { - runner.UsePreCompiledPrograms(args.pre_compiled); + runner->UsePreCompiledPrograms(args.pre_compiled); } // Query Model meta Information - TVMMetaInfo mInfo = runner.GetMetaInfo(); + TVMMetaInfo mInfo = runner->GetMetaInfo(); // Print Meta Information - if (args.dump_meta) runner.PrintMetaInfo(); + if (args.dump_meta) runner->PrintMetaInfo(); + + int total_exec_time = 0; + + if (args.profile) { + if (args.dry_run) { + for (int ii = 0; ii < args.dry_run; ++ii) { + runner->Run(); + } + TVMSynchronize(GetTVMDevice(args.device), 0, nullptr); + } + int total_time = 0; + std::map input_data_even, input_data_odd; + std::map output_data_even, output_data_odd; + + std::map input_data; + std::map output_data; + + // Alloc / populate and keep input data ready + for (auto& elem : mInfo.input_info) { + if (args.zero_copy) { + auto ndarr = + NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second), + DLDevice{GetTVMDevice(args.device), 0}); + input_data_even.insert({elem.first, ndarr}); + + ndarr = + NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second), + DLDevice{GetTVMDevice(args.device), 0}); + input_data_odd.insert({elem.first, ndarr}); + } else { + char* data = (char*)malloc(runner->GetInputMemSize(elem.first)); + input_data.insert({elem.first, data}); + } + } + + // Alloc and keep output bufers ready + for (auto& elem : mInfo.output_info) { + if (args.zero_copy) { + auto ndarr = + NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second), + DLDevice{GetTVMDevice(args.device), 0}); + output_data_even.insert({elem.first, ndarr}); + + ndarr = + NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second), + DLDevice{GetTVMDevice(args.device), 0}); + output_data_odd.insert({elem.first, ndarr}); + } else { + char* data = (char*)malloc(runner->GetOutputMemSize(elem.first)); + output_data.insert({elem.first, data}); + } + } + + for (int ii = 0; ii < args.run_count; ++ii) { + // Timer start + auto tstart = std::chrono::high_resolution_clock::now(); + // Set random input for all input + for (auto& elem : mInfo.input_info) { + if (args.zero_copy) { + if (ii % 2) { + runner->SetInput(elem.first, input_data_even[elem.first]); + } else { + runner->SetInput(elem.first, input_data_odd[elem.first]); + } + } else { + runner->SetInput(elem.first, input_data[elem.first]); + } + } + + if (args.zero_copy) { + // With zero copy set the result NDArray up front + for (auto& elem : mInfo.output_info) { + if (ii % 2) { + runner->SetOutput(elem.first, output_data_even[elem.first]); + } else { + runner->SetOutput(elem.first, output_data_odd[elem.first]); + } + } + } - if (args.input.empty() || args.output.empty()) { + // Run the model + runner->Run(); + + if (!args.zero_copy) { + // W/o zero copy we need to invoke explicite data copy + for (auto& elem : mInfo.output_info) { + runner->GetOutput(elem.first, output_data[elem.first]); + } + } else { + // Just wait for the run to complete. + TVMSynchronize(GetTVMDevice(args.device), 0, nullptr); + } + + // Timer end + auto tend = std::chrono::high_resolution_clock::now(); + LOG(INFO) << "Exec Time:" << static_cast((tend - tstart).count()) / 1e6; + total_exec_time += static_cast((tend - tstart).count()) / 1e6; + } + + // Free input bufers + for (auto& elem : mInfo.input_info) { + free(input_data[elem.first]); + } + + // Free output bufers + for (auto& elem : mInfo.output_info) { + free(output_data[elem.first]); + } + } else if (!args.input.empty() && !args.output.empty()) { + LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output; + // Set Input from Numpy Input + runner->SetInput(args.input); + // Run the model + runner->Run(); + // Get Output as Numpy dump + runner->GetOutput(args.output); + } else { LOG(INFO) << "Executing dry run ... "; // Set random input for all inputs for (auto& elem : mInfo.input_info) { LOG(INFO) << "Set Random Input for :" << elem.first; auto shape = elem.second.first; - size_t ssize = runner.GetInputMemSize(elem.first); + size_t ssize = runner->GetInputMemSize(elem.first); char* data = (char*)malloc(ssize); LOG(INFO) << "Random Input Size:" << ssize << " bytes"; - runner.SetInput(elem.first, data); + runner->SetInput(elem.first, data); free(data); } - // Run the model - runner.Run(); - + runner->Run(); // Get Output and dump few values for (auto& elem : mInfo.output_info) { LOG(INFO) << "Get Output for :" << elem.first; auto shape = elem.second.first; - size_t ssize = runner.GetOutputMemSize(elem.first); + size_t ssize = runner->GetOutputMemSize(elem.first); char* data = (char*)malloc(ssize); - runner.GetOutput(elem.first, data); + runner->GetOutput(elem.first, data); LOG(INFO) << "Output Size:" << ssize << " bytes"; free(data); } - } else { - LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output; - - // Set Input from Numpy Input - runner.SetInput(args.input); - - // Run the model - runner.Run(); + } - // Get Output as Numpy dump - runner.GetOutput(args.output); + if (args.profile) { + // Print Stats + runner->PrintStats(); } + auto tstart = std::chrono::high_resolution_clock::now(); + delete runner; + auto tend = std::chrono::high_resolution_clock::now(); + if (args.profile) { + LOG(INFO) << "Average ExecTime :" << total_exec_time / args.run_count << " ms"; + LOG(INFO) << "Unload Time :" << static_cast((tend - tstart).count()) / 1e6 + << " ms"; + } return 0; } diff --git a/apps/cpp_rtvm/tvm_runner.cc b/apps/cpp_rtvm/tvm_runner.cc index 2fd4f2281e01..7d6dbc23ee76 100644 --- a/apps/cpp_rtvm/tvm_runner.cc +++ b/apps/cpp_rtvm/tvm_runner.cc @@ -26,10 +26,12 @@ #include +#include #include #include #include #include +#include namespace tvm { namespace runtime { @@ -39,25 +41,25 @@ namespace runtime { * \param device the target device in string format. * \return dl_device corresponding to the device string. */ -int GetTVMDevice(std::string device) { +DLDeviceType GetTVMDevice(std::string device) { if (!device.compare("cpu")) { - return static_cast(kDLCPU); + return kDLCPU; } else if (!device.compare("llvm")) { - return static_cast(kDLCPU); + return kDLCPU; } else if (!device.compare("cuda")) { - return static_cast(kDLCUDA); + return kDLCUDA; } else if (!device.compare("opencl")) { - return static_cast(kDLOpenCL); + return kDLOpenCL; } else if (!device.compare("vulkan")) { - return static_cast(kDLVulkan); + return kDLVulkan; } else if (!device.compare("metal")) { - return static_cast(kDLMetal); + return kDLMetal; } else if (!device.compare("vpi")) { - return static_cast(kDLVPI); + return kDLVPI; } else if (!device.compare("rocm")) { - return static_cast(kDLROCM); + return kDLROCM; } else if (!device.compare("oneapi")) { - return static_cast(kDLOneAPI); + return kDLOneAPI; } else { LOG(FATAL) << "TVMRunner : Unsupported device :" << device; } @@ -80,34 +82,59 @@ TVMRunner::TVMRunner(std::string path, std::string device) int TVMRunner::Load(void) { LOG(INFO) << "TVMRunner Load:" << r_model_path; // Load the lib file + auto tstart = std::chrono::high_resolution_clock::now(); + r_mod_handle = Module::LoadFromFile((r_model_path + "/mod.so").c_str(), "so"); + auto tend = std::chrono::high_resolution_clock::now(); + r_module_load_ms = static_cast((tend - tstart).count()) / 1e6; + tstart = std::chrono::high_resolution_clock::now(); // Read model json file std::ifstream json_reader((r_model_path + "/mod.json").c_str()); CHECK(!json_reader.fail()) << "Failed to open json file:" << (r_model_path + "/mod.json").c_str(); - std::string json_str((std::istreambuf_iterator(json_reader)), - std::istreambuf_iterator()); + json_reader.seekg(0, std::ios_base::end); + std::size_t json_size = json_reader.tellg(); + json_reader.seekg(0, std::ios_base::beg); + std::string json_data; + json_data.reserve(json_size); + json_reader.read((char*)json_data.c_str(), json_size); json_reader.close(); // Get ref to graph exeutor auto f_handle = tvm::runtime::Registry::Get("tvm.graph_executor.create"); // Greate graph runtime - r_graph_handle = (*f_handle)(json_str, r_mod_handle, GetTVMDevice(r_device), 0); + r_graph_handle = + (*f_handle)(json_data, r_mod_handle, static_cast(GetTVMDevice(r_device)), 0); + + tend = std::chrono::high_resolution_clock::now(); + r_graph_load_ms = static_cast((tend - tstart).count()) / 1e6; // Read params binary file + tstart = std::chrono::high_resolution_clock::now(); std::ifstream params_reader((r_model_path + "/mod.params").c_str(), std::ios::binary); CHECK(!params_reader.fail()) << "Failed to open json file:" << (r_model_path + "/mod.params").c_str(); - const std::string params_str((std::istreambuf_iterator(params_reader)), - std::istreambuf_iterator()); + + params_reader.seekg(0, std::ios_base::end); + std::size_t param_size = params_reader.tellg(); + params_reader.seekg(0, std::ios_base::beg); + std::vector param_data(param_size / sizeof(char)); + params_reader.read((char*)¶m_data[0], param_size); params_reader.close(); + TVMByteArray params_arr; - params_arr.data = params_str.c_str(); - params_arr.size = params_str.length(); + params_arr.data = (char*)¶m_data[0]; + params_arr.size = param_size; + + tend = std::chrono::high_resolution_clock::now(); + r_param_read_ms = static_cast((tend - tstart).count()) / 1e6; // Load parameters + tstart = std::chrono::high_resolution_clock::now(); r_graph_handle.GetFunction("load_params")(params_arr); + tend = std::chrono::high_resolution_clock::now(); + r_param_load_ms = static_cast((tend - tstart).count()) / 1e6; return 0; } @@ -117,6 +144,7 @@ int TVMRunner::Load(void) { * \param file_name File name where pre-compiled programs should be stored. */ void TVMRunner::UsePreCompiledPrograms(std::string file_name) { + auto tstart = std::chrono::high_resolution_clock::now(); if (r_run_was_called) { LOG(INFO) << "TVMRunner UsePreCompiledPrograms: should be called before first run"; return; @@ -130,10 +158,19 @@ void TVMRunner::UsePreCompiledPrograms(std::string file_name) { std::ofstream fs(file_name, std::ofstream::binary); fs.write(bytes.c_str(), bytes.size()); } else { - std::string bytes((std::istreambuf_iterator(ifs)), std::istreambuf_iterator()); - f_set(String(bytes)); + ifs.seekg(0, std::ios_base::end); + std::size_t blob_size = ifs.tellg(); + ifs.seekg(0, std::ios_base::beg); + std::string blob_data; + blob_data.reserve(blob_size); + blob_data.resize(blob_size); + ifs.read((char*)blob_data.c_str(), blob_size); + ifs.close(); + f_set(String(blob_data)); } } + auto tend = std::chrono::high_resolution_clock::now(); + r_pre_compiled_load_ms = static_cast((tend - tstart).count()) / 1e6; } /*! @@ -156,8 +193,6 @@ inline size_t GetMemSize(NDArray& narr) { * \return The memory size. */ size_t TVMRunner::GetInputMemSize(std::string input_id) { - LOG(INFO) << "TVMRunner::GetInputMemSize:" << input_id; - NDArray in_arr = r_graph_handle.GetFunction("get_input")(input_id); auto ssize = GetMemSize(in_arr); @@ -170,8 +205,6 @@ size_t TVMRunner::GetInputMemSize(std::string input_id) { * \return The memory size. */ size_t TVMRunner::GetOutputMemSize(std::string output_id) { - LOG(INFO) << "TVMRunner::GetOutputMemSize:" << output_id; - NDArray out_arr = r_graph_handle.GetFunction("get_output")(output_id); auto ssize = GetMemSize(out_arr); @@ -209,13 +242,23 @@ int TVMRunner::SetInput(std::string inputfile) { * \param 0 on success else error code. */ int TVMRunner::SetInput(std::string input_id, char* raw_input) { - LOG(INFO) << "TVMRunner::SetInput (Raw)"; NDArray in_arr = r_graph_handle.GetFunction("get_input")(input_id); auto ssize = GetMemSize(in_arr); in_arr.CopyFromBytes(raw_input, ssize); return 0; } +/*! + * \brief Set the model input from given NDArray with zero copy. + * \param input_id input node name. + * \param ndarr NDArray. + * \param 0 on success else error code. + */ +int TVMRunner::SetInput(std::string input_id, NDArray& ndarr) { + r_graph_handle.GetFunction("set_input_zero_copy")(input_id, ndarr); + return 0; +} + /*! * \brief Get the model outputs and dump them to npz file. * \param outputfile the npz file to where we dump the output data. @@ -255,21 +298,29 @@ int TVMRunner::GetOutput(std::string outputfile) { * \param 0 on success else error code. */ int TVMRunner::GetOutput(std::string output_id, char* raw_output) { - LOG(INFO) << "TVMRunner::GetOutput (Raw)"; NDArray out_arr = r_graph_handle.GetFunction("get_output")(output_id); auto ssize = GetMemSize(out_arr); out_arr.CopyToBytes(raw_output, ssize); return 0; } +/*! + * \brief Set the model output from given NDArray with zero copy. + * \param output_id output node name. + * \param ndarr NDArray. + * \param 0 on success else error code. + */ +int TVMRunner::SetOutput(std::string output_id, NDArray& ndarr) { + r_graph_handle.GetFunction("set_output_zero_copy")(output_id, ndarr); + return 0; +} + /*! * \brief Call one cycle of execution for the model. * \param 0 on success else error code. */ int TVMRunner::Run(void) { - LOG(INFO) << "TVMRunner::Run"; r_run_was_called = true; - r_graph_handle.GetFunction("run")(); return 0; } @@ -289,10 +340,10 @@ TVMMetaInfo TVMRunner::GetMetaInfo(void) { auto dtype_info = GetRef>(tvm_input_info["dtype"].as()); for (const auto& kv : shape_info) { auto stuple = GetRef(kv.second.as()); - std::vector vshape; + std::vector vshape; vshape.assign(stuple.begin(), stuple.end()); auto dtype = GetRef(dtype_info[kv.first].as()); - std::pair, std::string> value = std::make_pair(vshape, dtype); + std::pair, std::string> value = std::make_pair(vshape, dtype); mInfo.input_info.insert({kv.first, value}); } @@ -301,10 +352,10 @@ TVMMetaInfo TVMRunner::GetMetaInfo(void) { dtype_info = GetRef>(tvm_input_info["dtype"].as()); for (const auto& kv : shape_info) { auto stuple = GetRef(kv.second.as()); - std::vector vshape; + std::vector vshape; vshape.assign(stuple.begin(), stuple.end()); auto dtype = GetRef(dtype_info[kv.first].as()); - std::pair, std::string> value = std::make_pair(vshape, dtype); + std::pair, std::string> value = std::make_pair(vshape, dtype); mInfo.output_info.insert({kv.first, value}); } @@ -343,5 +394,21 @@ void TVMRunner::PrintMetaInfo(void) { } } +/*! + * \brief Print stats information. + */ +void TVMRunner::PrintStats(void) { + LOG(INFO) << "Performance Stats:" << r_model_path; + LOG(INFO) << " Module Load :" << r_module_load_ms << " ms"; + LOG(INFO) << " Graph Runtime Create :" << r_graph_load_ms << " ms"; + LOG(INFO) << " Params Read :" << r_param_read_ms << " ms"; + LOG(INFO) << " Params Set :" << r_param_load_ms << " ms"; + LOG(INFO) << " Pre Compiled Progs Load :" << r_pre_compiled_load_ms << " ms"; + LOG(INFO) << "Total Load Time :" + << r_module_load_ms + r_graph_load_ms + r_param_read_ms + r_param_load_ms + + r_pre_compiled_load_ms + << " ms"; +} + } // namespace runtime } // namespace tvm diff --git a/apps/cpp_rtvm/tvm_runner.h b/apps/cpp_rtvm/tvm_runner.h index 47717c3ecfd2..e93b63ae85a8 100644 --- a/apps/cpp_rtvm/tvm_runner.h +++ b/apps/cpp_rtvm/tvm_runner.h @@ -41,8 +41,8 @@ namespace runtime { typedef struct _TVMMetaInfo { int n_inputs; int n_outputs; - std::map, std::string>> input_info; - std::map, std::string>> output_info; + std::map, std::string>> input_info; + std::map, std::string>> output_info; } TVMMetaInfo; /*! @@ -63,10 +63,14 @@ class TVMRunner { int SetInput(std::string); /*! \brief To set the input from binary data */ int SetInput(std::string, char*); + /*! \brief To set the input from NDArray */ + int SetInput(std::string, NDArray& ndarr); /*! \brief Save the model output into given npz file */ int GetOutput(std::string); /*! \brief Get the model output in binary format */ int GetOutput(std::string, char*); + /*! \brief Swap output NDArray with given one */ + int SetOutput(std::string, NDArray& ndarr); /*! \brief To get the input mem size */ size_t GetInputMemSize(std::string); /*! \brief To get the output mem size */ @@ -76,6 +80,21 @@ class TVMRunner { /*! \brief Print function to show all meta information */ void PrintMetaInfo(void); + /*! \brief Print function to show all stats information */ + void PrintStats(void); + + // Public profiling information + /*! Module load time */ + int r_module_load_ms{0}; + /*! Graph runtime creatint time */ + int r_graph_load_ms{0}; + /*! Params read time */ + int r_param_read_ms{0}; + /*! Params load time */ + int r_param_load_ms{0}; + /*! Pre compiled programs load time */ + int r_pre_compiled_load_ms{0}; + private: /*! \brief Module handle for the shared object */ Module r_mod_handle; @@ -91,6 +110,7 @@ class TVMRunner { bool r_run_was_called; }; +DLDeviceType GetTVMDevice(std::string device); } // namespace runtime } // namespace tvm #endif // TVM_APPS_CPP_RTVM_RUNNER_H_