From 6bc7b45c4e9bbde23b7155380c1e483fb420cda2 Mon Sep 17 00:00:00 2001 From: yanwei-gr <64010848+yanwei-gr@users.noreply.github.com> Date: Fri, 12 Nov 2021 15:06:37 +0800 Subject: [PATCH] catch potential exception and handle (#677) * close floating point check in popart * create pipeline resource when computation created. not do it at the time of set the cache computation item fix pixel bert detect application runtime error * add weiming build script fix * add inplementation of CreateExecutable move complie_and_run to createExecute, and more error handle logic * add potential exception handle & open session_option to optimize * add more exception handling * update odla_computation::init() to return odla_status value * fix popart_config load error * add bool to make sure a compuation can't be executed when used to compile executable * improve load config logic * updat noticee strings * update load_config logic bug Co-authored-by: yanwei --- ODLA/platforms/odla_popart/odla_compute.cc | 54 +++++++--- ODLA/platforms/odla_popart/odla_popart.cc | 104 ++++++++++++++------ ODLA/platforms/odla_popart/odla_popart.h | 26 +++-- ODLA/platforms/odla_popart/popart_config.cc | 79 +++++++++++---- ODLA/platforms/odla_popart/popart_config.h | 22 ++--- 5 files changed, 204 insertions(+), 81 deletions(-) diff --git a/ODLA/platforms/odla_popart/odla_compute.cc b/ODLA/platforms/odla_popart/odla_compute.cc index 23fb117e0..f9886ceda 100644 --- a/ODLA/platforms/odla_popart/odla_compute.cc +++ b/ODLA/platforms/odla_popart/odla_compute.cc @@ -62,8 +62,9 @@ odla_status odla_SetComputationItem(odla_computation comp, odla_item_type type, comp->opts.cache_dir = (reinterpret_cast(value)); break; case 1001: // load cache directly, need set path of cache file - PopartConfig::instance()->set_load_cache(true); - PopartConfig::instance()->set_cache_path(reinterpret_cast(value)); + PopartConfig::instance()->set_load_or_save_cache(true); + PopartConfig::instance()->set_cache_path( + (std::string) reinterpret_cast(value)); break; default: std::cerr << "Unsupported property type: " << type << std::endl; @@ -81,8 +82,10 @@ odla_status odla_CreateExecutable(odla_executable* executable, return ODLA_FAILURE; } else { if (comp->session) { + popart::logging::info("Create cache file from exist session"); return comp->compile_and_export(); } else { + popart::logging::info("Computation is not initialized. init it first"); _odla_computation::instance()->init(true); // set is_compile to true // this comp init will create // executable @@ -107,6 +110,7 @@ odla_status odla_LoadExecutable(const odla_char* file_name, odla_status odla_CreateComputation(odla_computation* comp) { static void* custom_op_handle = nullptr; *comp = _odla_computation::instance(); + popart::logging::info("computation created"); if (custom_op_handle == nullptr) { custom_op_handle = dlopen("libcustom_ops.so", RTLD_NOW | RTLD_GLOBAL); if (custom_op_handle == nullptr) { @@ -116,16 +120,18 @@ odla_status odla_CreateComputation(odla_computation* comp) { } // Read the config file if (!PopartConfig::instance()->inited()) { - if (PopartConfig::instance()->load_cache()) { - odla_status ret = PopartConfig::instance()->extract_config_from_cache(); - if (ret == ODLA_FAILURE) { - popart::logging::err("load config from cache failed"); - return ret; - } + auto ret = PopartConfig::instance()->load_config( + std::getenv("ODLA_POPART_CONFIG")); + if (ret != ODLA_SUCCESS) { + popart::logging::err("error load config"); + return ret; } - PopartConfig::instance()->load_config(std::getenv("ODLA_POPART_CONFIG")); } - _odla_computation::instance()->set_executor(); + odla_status status = _odla_computation::instance()->set_executor(); + if (status != ODLA_SUCCESS) { + popart::logging::err("set_executor failed"); + return ODLA_FAILURE; + } if (PopartConfig::instance()->execution_mode() == PARALLEL || PopartConfig::instance()->execution_mode() == PIPELINE) { QManager::instance()->createQ(PopartConfig::instance()->queue_type()); @@ -137,8 +143,14 @@ odla_status odla_CreateComputation(odla_computation* comp) { } odla_status odla_CreateContext(odla_context* context) { - _odla_computation::instance(false) - ->init(); // Place the init here to avoid long execution problem + odla_status status = + _odla_computation::instance(false) + ->init(); // Place the init here to avoid long execution problem + if (status != ODLA_SUCCESS && + _odla_computation::instance()->session == nullptr) { + popart::logging::err("init computation item in CreateContext failed."); + return ODLA_FAILURE; + } *context = new _odla_pipeline_context(_odla_computation::instance()); return ODLA_SUCCESS; } @@ -149,15 +161,27 @@ odla_status odla_DestroyContext(odla_context ctx) { } odla_status odla_DestroyComputation(odla_computation comp) { - comp->mark_done(); - _odla_computation::destruct(); - QManager::instance()->deleteQ(); // delete current queue + if (comp != nullptr) { + if (!comp->is_compile_only()) { + comp->mark_done(); + QManager::instance()->deleteQ(); // delete current queue + } + comp->release_session(); + _odla_computation::destruct(); // release the real computation + } + return ODLA_SUCCESS; } odla_status odla_ExecuteComputation(odla_computation comp, odla_context context, odla_compute_mode mode, odla_device device) { + if (_odla_computation::instance()->is_compile_only()) { + popart::logging::err( + "This computation is created for compile executable, please re-create " + "another computation for computing"); + return ODLA_FAILURE; + } if (!context->hold("odla_ExecuteComputation")) return ODLA_FAILURE; return comp->executor()->compute(comp, context, mode, device); } diff --git a/ODLA/platforms/odla_popart/odla_popart.cc b/ODLA/platforms/odla_popart/odla_popart.cc index 715b01979..b434daa42 100644 --- a/ODLA/platforms/odla_popart/odla_popart.cc +++ b/ODLA/platforms/odla_popart/odla_popart.cc @@ -90,6 +90,7 @@ void compute_loop(odla_computation comp) { } odla_status _odla_computation::compile_and_export() { + odla_status ret_value = ODLA_SUCCESS; popart::logging::warn("Start compile and export"); const std::string& cache_file_name = PopartConfig::instance()->get_cache_path(); @@ -97,7 +98,8 @@ odla_status _odla_computation::compile_and_export() { int file_prefix = cache_file_name.rfind(file_suffix); if (file_prefix == std::string::npos || file_prefix + file_suffix.size() < cache_file_name.size()) { - popart::logging::err("Bad cache file name"); + popart::logging::err( + "Bad cache file name. File name should end with '.popart'"); return ODLA_FAILURE; } if (file_prefix == std::string::npos) { @@ -117,7 +119,7 @@ odla_status _odla_computation::compile_and_export() { config_fs.open(config_file_name, std::ios_base::in | std::ifstream::binary); if (!config_fs.is_open()) { popart::logging::warn( - "invalid config file name:[ {} ] will use default config", + "Open config file failed:[ {} ] will use default config", config_file_name); PopartConfig::instance()->use_default(); config_string = PopartConfig::instance()->get_default_config_string(); @@ -134,18 +136,28 @@ odla_status _odla_computation::compile_and_export() { cache_fs.write((char*)&config_size, sizeof(config_size)); cache_fs.write(config_string.c_str(), config_string.size()); - _odla_computation::instance()->session->compileAndExport(cache_fs.flush()); - + try { + _odla_computation::instance()->session->compileAndExport(cache_fs.flush()); + } catch (std::exception& e) { + popart::logging::err("compileAndExport Falied: {}", e.what()); + ret_value = ODLA_FAILURE; + } cache_fs.flush(); cache_fs.close(); config_fs.close(); + + return ret_value; } -void _odla_computation::init(bool is_compile) { +odla_status _odla_computation::init(bool is_compile) { if (!session) { std::lock_guard guard(init_mutex_); if (!session) { - set_opts(); + odla_status status = set_opts(); + if (status != ODLA_SUCCESS) { + popart::logging::err("set computation option failed"); + return status; + } // Cretate the dataflow std::vector ids; for (const auto& output : outputs_map) @@ -168,8 +180,14 @@ void _odla_computation::init(bool is_compile) { // Create and config SessionOptions set_session_opts(); - if (use_pipeline()) - builder = popart::Builder::createFromOnnxModel(set_pipeline_stage()); + if (use_pipeline()) { + try { + builder = popart::Builder::createFromOnnxModel(set_pipeline_stage()); + } catch (std::exception& e) { + popart::logging::err("create builder from onnx model failed."); + return ODLA_FAILURE; + } + } auto proto = builder->getModelProto(); // So, the init must be called at // odla_ExecuteCompute @@ -185,12 +203,19 @@ void _odla_computation::init(bool is_compile) { PopartConfig::instance()->save_model_path()); } - // Create InferenceSession - auto new_session = popart::InferenceSession::createFromOnnxModel( - proto, data_flow, device, popart::InputShapeInfo(), session_opts_); + std::unique_ptr new_session; + try { + // Create InferenceSession + new_session = std::move(popart::InferenceSession::createFromOnnxModel( + proto, data_flow, device, popart::InputShapeInfo(), session_opts_)); + } catch (std::exception& e) { + popart::logging::err("Session::createFromOnnxModel failed:{}", + e.what()); + return ODLA_FAILURE; + } if (!is_compile) { - if (PopartConfig::instance()->load_cache()) { + if (PopartConfig::instance()->load_or_save_cache()) { popart::logging::info("Load cachefile from existing stream"); auto cache_fs = PopartConfig::instance()->get_cache_fs(); if (cache_fs->is_open()) { @@ -202,10 +227,14 @@ void _odla_computation::init(bool is_compile) { } } - new_session->prepareDevice(); - new_session->setRandomSeed(0); // Init seed - new_session->weightsFromHost(); // Copy weights from host to IPU - + try { + new_session->prepareDevice(); + new_session->setRandomSeed(0); // Init seed + new_session->weightsFromHost(); // Copy weights from host to IPU + } catch (std::exception& e) { + popart::logging::err("session init failed: {}", e.what()); + return ODLA_FAILURE; + } // If in parallel mode, start the thread ExecutionMode mode = PopartConfig::instance()->execution_mode(); if (PIPELINE == mode || PARALLEL == mode) { @@ -214,7 +243,10 @@ void _odla_computation::init(bool is_compile) { popart::logging::warn("Parallel loop has been started"); parallel_thread.detach(); } + } else { + is_compile_only_ = true; } + session = std::move(new_session); // set session after all initialization done. } @@ -222,25 +254,31 @@ void _odla_computation::init(bool is_compile) { } // Now we set this by config file, should set by the caller? -void _odla_computation::set_opts() { +odla_status _odla_computation::set_opts() { if (PopartConfig::instance()->debug()) { opts.ipu_num = PopartConfig::instance()->ipu_num(); opts.batches_per_step = PopartConfig::instance()->batches_per_step(); } else if (use_pipeline()) { // Only check when use pipeline - if (opts.ipu_num != PopartConfig::instance()->ipu_num()) - throw std::invalid_argument( + if (opts.ipu_num != PopartConfig::instance()->ipu_num()) { + popart::logging::err( "number of ipus in pipeline configuration:" + std::to_string(PopartConfig::instance()->ipu_num()) + " must same with options: " + std::to_string(opts.ipu_num)); - if (opts.batches_per_step != PopartConfig::instance()->batches_per_step()) - throw std::invalid_argument( + return ODLA_FAILURE; + } + if (opts.batches_per_step != PopartConfig::instance()->batches_per_step()) { + popart::logging::err( "batches per step in pipeline configuration:" + std::to_string(PopartConfig::instance()->batches_per_step()) + " must same with options: " + std::to_string(opts.batches_per_step)); + return ODLA_FAILURE; + } } + return ODLA_SUCCESS; } -void _odla_computation::set_executor() { +odla_status _odla_computation::set_executor() { + odla_status ret_value = ODLA_SUCCESS; ExecutionMode mode = PopartConfig::instance()->execution_mode(); if (PIPELINE == mode || PARALLEL == mode) { popart::logging::info("set the executor as parallel"); @@ -249,10 +287,13 @@ void _odla_computation::set_executor() { popart::logging::info("set the executor as sequence"); executor_ = new Sequence(); } else { - throw std::invalid_argument( - "*** FATAL *** unknown execution mode: {}" + std::to_string(mode) + - ". Should be one of pipeline, parallel or sequence"); + popart::logging::err( + "unknown excution mode: {}, Should be one of pipeline, parallel or " + "sequence", + std::to_string(mode)); + ret_value = ODLA_FAILURE; } + return ret_value; } void _odla_computation::set_session_opts() { @@ -270,9 +311,9 @@ void _odla_computation::set_session_opts() { session_opts_.cachePath = opts.enable_engine_cache ? opts.cache_dir : envEngineCachePath; } - // session_opts_.matmulOptions["use128BitConvUnitLoad"] = "true"; - // session_opts_.matmulOptions["enableMultiStageReduce"] = "false"; - // session_opts_.matmulOptions["enableFastReduce"] = "true"; + session_opts_.matmulOptions["use128BitConvUnitLoad"] = "true"; + session_opts_.matmulOptions["enableMultiStageReduce"] = "false"; + session_opts_.matmulOptions["enableFastReduce"] = "true"; session_opts_.enableFloatingPointChecks = false; session_opts_.enableStochasticRounding = false; session_opts_.enablePrefetchDatastreams = false; // true; @@ -392,9 +433,12 @@ bool _odla_context::hold(const std::string& function_name) { ss_holder << thread_id_of_holder; popart::logging::err( "[{}] odla_context {} has been held by thread: {}" - ", when try to hold it in function {}.", + ", when try to hold it in function {}. multi threads try to hold the " + "same context.", this_thread_id, this, thread_id_of_holder, function_name); - throw std::runtime_error("Multiple threads try to hold the same context"); + return false; + // throw std::runtime_error("Multiple threads try to hold the same + // context"); } return false; } diff --git a/ODLA/platforms/odla_popart/odla_popart.h b/ODLA/platforms/odla_popart/odla_popart.h index 3feea52a0..a3d744bed 100644 --- a/ODLA/platforms/odla_popart/odla_popart.h +++ b/ODLA/platforms/odla_popart/odla_popart.h @@ -115,6 +115,7 @@ struct _odla_computation { } } } + bool is_compile_only_; bool done_; bool thread_complete_; std::mutex init_mutex_; @@ -127,21 +128,34 @@ struct _odla_computation { device(nullptr), opts({false, 1, 1}), done_(false), + is_compile_only_(false), executor_(nullptr), thread_state_(DONE) { builder->setAttribute(popart::sVirtualGraphAttribute, 0); } - void init(bool is_compile = false); std::string set_pipeline_stage(); void set_session_opts(); - void set_executor(); - void set_opts(); + bool use_pipeline(); bool hold(); + + odla_status init_working_thread(); + odla_status init(bool is_compile = false); + odla_status set_executor(); + odla_status set_opts(); odla_status compile_and_export(); inline Execution* executor() { return executor_; } inline bool is_done() { return thread_state_ != RUNNING; } + inline bool is_compile_only() { return is_compile_only_; } + inline void release_session() { + if (session != nullptr) { + session->getDevice().getDeviceInfo()->detach(); + session.reset(); + assert(session == nullptr); + } + } + inline void mark_done() { while (thread_state_ != DONE) { std::unique_lock lock(thread_done_mutex_); @@ -150,11 +164,7 @@ struct _odla_computation { } // Once get notified, only detach the device once std::lock_guard guard(init_mutex_); - if (session != nullptr) { - session->getDevice().getDeviceInfo()->detach(); - session.reset(); - assert(session == nullptr); - } + release_session(); } inline void thread_done() { std::unique_lock lock(thread_done_mutex_); diff --git a/ODLA/platforms/odla_popart/popart_config.cc b/ODLA/platforms/odla_popart/popart_config.cc index 4311aa03f..9c20fd0a2 100644 --- a/ODLA/platforms/odla_popart/popart_config.cc +++ b/ODLA/platforms/odla_popart/popart_config.cc @@ -28,6 +28,19 @@ PopartConfig* PopartConfig::instance_ = new PopartConfig(); +const std::string& get_config_path_from_cache_file( + const std::string& cache_path) { + std::string file_suffix(".popart"); + int file_prefix = cache_path.rfind(file_suffix); + if (file_prefix == std::string::npos || + file_prefix + file_suffix.size() < cache_path.size()) { + popart::logging::err( + "Bad cache file name. File name should end with '.popart'"); + return std::move(std::string("")); + } + return std::move(std::string(cache_path.substr(0, file_prefix) + ".json")); +} + void PopartConfig::use_default() { amp_ = 0.6; version_ = "1.0.0"; @@ -56,14 +69,36 @@ void PopartConfig::use_default() { }\n"; } -void PopartConfig::load_config(const char* file_path) { +odla_status PopartConfig::load_config(const char* env_file_path) { if (inited_) { popart::logging::info("config already inited"); - return; + return ODLA_SUCCESS; + } + odla_status ret = ODLA_FAILURE; + if (load_or_save_cache()) { + ret = extract_config_from_cache(); + if (ret != ODLA_SUCCESS) { + popart::logging::warn("load config from cache failed"); + std::string config_file_path = + get_config_path_from_cache_file(std::string(cache_path_)); + if (!config_file_path.empty()) { + popart::logging::info("try load from file: {}", config_file_path); + ret = load_from_file(config_file_path); + } + } + } + if (ret != ODLA_SUCCESS) { + use_default(); + if (env_file_path != nullptr) { + ret = load_from_file(env_file_path); + if (ret != ODLA_SUCCESS) { + popart::logging::info("use default config"); + } + } else { + popart::logging::info("use default config"); + } } - popart::logging::info("use default config"); - use_default(); - if (file_path != nullptr) load_from_file(file_path); + return ODLA_SUCCESS; } void PopartConfig::parse_from_json(const json& jf) { @@ -122,25 +157,34 @@ void PopartConfig::parse_from_json(const json& jf) { inited_ = true; } -void PopartConfig::load_from_string(const std::string& config_string) { +odla_status PopartConfig::load_from_string(const std::string& config_string) { if (inited_) { - return; + return ODLA_SUCCESS; + } + json jf; + try { + jf = json::parse(config_string); + } catch (std::exception& e) { + popart::logging::err("parse config falied:{}", e.what()); + return ODLA_FAILURE; } - json jf = json::parse(config_string); parse_from_json(jf); + return ODLA_SUCCESS; } -void PopartConfig::load_from_file(const std::string& file_path) { +odla_status PopartConfig::load_from_file(const std::string& file_path) { if (inited_) { - return; + return ODLA_SUCCESS; } using json = nlohmann::json; - std::ifstream ifs(file_path); - if (!ifs.good()) - throw std::invalid_argument(std::string("Configuraton file [") + file_path + - "] was not found."); + std::ifstream ifs(file_path, std::ios_base::in); + if (!ifs.good()) { + popart::logging::err("config file {} not found", file_path); + return ODLA_FAILURE; + } json jf = json::parse(ifs); parse_from_json(jf); + return ODLA_SUCCESS; } void PopartConfig::print() { @@ -200,6 +244,7 @@ bool PopartConfig::get_pipeline_setting(const std::string& node_name, pipeline_stage = default_setting_iter->second[1]; return true; } + throw std::runtime_error( "Node: " + node_name + " was not configured to any ipu or stage for pipeline"); @@ -221,9 +266,9 @@ odla_status PopartConfig::extract_config_from_cache() { if (cache_fs->read(config_data_buffer.data(), config_len)) { std::string config_string(config_data_buffer.begin(), config_data_buffer.end()); - try { - load_from_string(config_string); - } catch (std::exception& e) { + + odla_status ret = load_from_string(config_string); + if (ret != ODLA_SUCCESS) { popart::logging::err("load from cached config string failed."); return ODLA_FAILURE; } diff --git a/ODLA/platforms/odla_popart/popart_config.h b/ODLA/platforms/odla_popart/popart_config.h index 30f9784d0..fdd1682e8 100644 --- a/ODLA/platforms/odla_popart/popart_config.h +++ b/ODLA/platforms/odla_popart/popart_config.h @@ -62,9 +62,9 @@ class PopartConfig { execution_mode_; // The execution mode {PIPELINE, PARALLEL, SEQUENCE} bool load_onnx_; // Whether load onnx model to run instead of the model // constructed. Use for test - bool load_cache_; // If the session will load graph from cache - std::string cache_path_; // the path of cache file, for load cache - // directly + bool load_or_save_cache_; // If the session will load graph from cache + std::string cache_path_; // the path of cache file, for load cache + // directly std::string load_onnx_path_; // The path of onnx model file to load if // load_onnx set to be true @@ -86,7 +86,7 @@ class PopartConfig { std::shared_ptr cache_fs; static PopartConfig* instance_; - void load_from_file(const std::string& file_path); + odla_status load_from_file(const std::string& file_path); public: PopartConfig() @@ -94,7 +94,7 @@ class PopartConfig { batches_per_step_(1), execution_mode_(UNKNOWN), load_onnx_(false), - load_cache_(false), + load_or_save_cache_(false), save_model_(false), inited_(false), ipu_num_(1) {} @@ -123,18 +123,18 @@ class PopartConfig { inline std::shared_ptr get_cache_fs() { return cache_fs; } inline void set_cache_fs(std::shared_ptr fs) { cache_fs = fs; } - inline bool load_cache() { return load_cache_; } + inline bool load_or_save_cache() { return load_or_save_cache_; } inline const std::string& get_cache_path() { return cache_path_; } - inline void set_load_cache(bool is_load_cache) { - load_cache_ = is_load_cache; + inline void set_load_or_save_cache(bool is_load_or_save_cache) { + load_or_save_cache_ = is_load_or_save_cache; } - inline void set_cache_path(std::string catch_path) { + inline void set_cache_path(const std::string& catch_path) { cache_path_ = catch_path; } void parse_from_json(const json&); - void load_from_string(const std::string& config_string); - void load_config(const char* file_path); + odla_status load_from_string(const std::string& config_string); + odla_status load_config(const char* file_path); bool get_pipeline_setting(const std::string& node_name, int64_t& ipu_idx, int64_t& pipeline_stage); odla_status extract_config_from_cache();