catch potential exception and handle (#677)

* close floating point check in popart * create pipeline resource when computation created. not do it at the time of set the cache computation item fix pixel bert detect application runtime error * add weiming build script fix * add implementation of CreateExecutable move compile_and_run to createExecute, and more error handle logic * add potential exception handle & open session_option to optimize * add more exception handling * update odla_computation::init() to return odla_status value * fix popart_config load error * add bool to make sure a computation can't be executed when used to compile executable * improve load config logic * update notice strings * update load_config logic bug Co-authored-by: yanwei <[email protected]>
alibaba · Nov 12, 2021 · 6bc7b45 · 6bc7b45
1 parent e0f07fa
commit 6bc7b45
Show file tree

Hide file tree

Showing 5 changed files with 204 additions and 81 deletions.
diff --git a/ODLA/platforms/odla_popart/odla_compute.cc b/ODLA/platforms/odla_popart/odla_compute.cc
@@ -62,8 +62,9 @@ odla_status odla_SetComputationItem(odla_computation comp, odla_item_type type,
       comp->opts.cache_dir = (reinterpret_cast<char*>(value));
       break;
     case 1001: // load cache directly, need set path of cache file
-      PopartConfig::instance()->set_load_cache(true);
-      PopartConfig::instance()->set_cache_path(reinterpret_cast<char*>(value));
+      PopartConfig::instance()->set_load_or_save_cache(true);
+      PopartConfig::instance()->set_cache_path(
+          (std::string) reinterpret_cast<char*>(value));
       break;
     default:
       std::cerr << "Unsupported property type: " << type << std::endl;
@@ -81,8 +82,10 @@ odla_status odla_CreateExecutable(odla_executable* executable,
     return ODLA_FAILURE;
   } else {
     if (comp->session) {
+      popart::logging::info("Create cache file from exist session");
       return comp->compile_and_export();
     } else {
+      popart::logging::info("Computation is not initialized. init it first");
       _odla_computation::instance()->init(true); // set is_compile to true
                                                  // this comp init will create
                                                  // executable
@@ -107,6 +110,7 @@ odla_status odla_LoadExecutable(const odla_char* file_name,
 odla_status odla_CreateComputation(odla_computation* comp) {
   static void* custom_op_handle = nullptr;
   *comp = _odla_computation::instance();
+  popart::logging::info("computation created");
   if (custom_op_handle == nullptr) {
     custom_op_handle = dlopen("libcustom_ops.so", RTLD_NOW | RTLD_GLOBAL);
     if (custom_op_handle == nullptr) {
@@ -116,16 +120,18 @@ odla_status odla_CreateComputation(odla_computation* comp) {
   }
   // Read the config file
   if (!PopartConfig::instance()->inited()) {
-    if (PopartConfig::instance()->load_cache()) {
-      odla_status ret = PopartConfig::instance()->extract_config_from_cache();
-      if (ret == ODLA_FAILURE) {
-        popart::logging::err("load config from cache failed");
-        return ret;
-      }
+    auto ret = PopartConfig::instance()->load_config(
+        std::getenv("ODLA_POPART_CONFIG"));
+    if (ret != ODLA_SUCCESS) {
+      popart::logging::err("error load config");
+      return ret;
     }
-    PopartConfig::instance()->load_config(std::getenv("ODLA_POPART_CONFIG"));
   }
-  _odla_computation::instance()->set_executor();
+  odla_status status = _odla_computation::instance()->set_executor();
+  if (status != ODLA_SUCCESS) {
+    popart::logging::err("set_executor failed");
+    return ODLA_FAILURE;
+  }
   if (PopartConfig::instance()->execution_mode() == PARALLEL ||
       PopartConfig::instance()->execution_mode() == PIPELINE) {
     QManager::instance()->createQ(PopartConfig::instance()->queue_type());
@@ -137,8 +143,14 @@ odla_status odla_CreateComputation(odla_computation* comp) {
 }
 
 odla_status odla_CreateContext(odla_context* context) {
-  _odla_computation::instance(false)
-      ->init(); // Place the init here to avoid long execution problem
+  odla_status status =
+      _odla_computation::instance(false)
+          ->init(); // Place the init here to avoid long execution problem
+  if (status != ODLA_SUCCESS &&
+      _odla_computation::instance()->session == nullptr) {
+    popart::logging::err("init computation item in CreateContext failed.");
+    return ODLA_FAILURE;
+  }
   *context = new _odla_pipeline_context(_odla_computation::instance());
   return ODLA_SUCCESS;
 }
@@ -149,15 +161,27 @@ odla_status odla_DestroyContext(odla_context ctx) {
 }
 
 odla_status odla_DestroyComputation(odla_computation comp) {
-  comp->mark_done();
-  _odla_computation::destruct();
-  QManager::instance()->deleteQ(); // delete current queue
+  if (comp != nullptr) {
+    if (!comp->is_compile_only()) {
+      comp->mark_done();
+      QManager::instance()->deleteQ(); // delete current queue
+    }
+    comp->release_session();
+    _odla_computation::destruct(); // release the real computation
+  }
+
   return ODLA_SUCCESS;
 }
 
 odla_status odla_ExecuteComputation(odla_computation comp, odla_context context,
                                     odla_compute_mode mode,
                                     odla_device device) {
+  if (_odla_computation::instance()->is_compile_only()) {
+    popart::logging::err(
+        "This computation is created for compile executable, please re-create "
+        "another computation for computing");
+    return ODLA_FAILURE;
+  }
   if (!context->hold("odla_ExecuteComputation")) return ODLA_FAILURE;
   return comp->executor()->compute(comp, context, mode, device);
 }

diff --git a/ODLA/platforms/odla_popart/odla_popart.cc b/ODLA/platforms/odla_popart/odla_popart.cc
@@ -90,14 +90,16 @@ void compute_loop(odla_computation comp) {
 }
 
 odla_status _odla_computation::compile_and_export() {
+  odla_status ret_value = ODLA_SUCCESS;
   popart::logging::warn("Start compile and export");
   const std::string& cache_file_name =
       PopartConfig::instance()->get_cache_path();
   std::string file_suffix(".popart");
   int file_prefix = cache_file_name.rfind(file_suffix);
   if (file_prefix == std::string::npos ||
       file_prefix + file_suffix.size() < cache_file_name.size()) {
-    popart::logging::err("Bad cache file name");
+    popart::logging::err(
+        "Bad cache file name. File name should end with '.popart'");
     return ODLA_FAILURE;
   }
   if (file_prefix == std::string::npos) {
@@ -117,7 +119,7 @@ odla_status _odla_computation::compile_and_export() {
     config_fs.open(config_file_name, std::ios_base::in | std::ifstream::binary);
     if (!config_fs.is_open()) {
       popart::logging::warn(
-          "invalid config file name:[ {} ] will use default config",
+          "Open config file failed:[ {} ] will use default config",
           config_file_name);
       PopartConfig::instance()->use_default();
       config_string = PopartConfig::instance()->get_default_config_string();
@@ -134,18 +136,28 @@ odla_status _odla_computation::compile_and_export() {
   cache_fs.write((char*)&config_size, sizeof(config_size));
   cache_fs.write(config_string.c_str(), config_string.size());
 
-  _odla_computation::instance()->session->compileAndExport(cache_fs.flush());
-
+  try {
+    _odla_computation::instance()->session->compileAndExport(cache_fs.flush());
+  } catch (std::exception& e) {
+    popart::logging::err("compileAndExport Falied: {}", e.what());
+    ret_value = ODLA_FAILURE;
+  }
   cache_fs.flush();
   cache_fs.close();
   config_fs.close();
+
+  return ret_value;
 }
 
-void _odla_computation::init(bool is_compile) {
+odla_status _odla_computation::init(bool is_compile) {
   if (!session) {
     std::lock_guard<std::mutex> guard(init_mutex_);
     if (!session) {
-      set_opts();
+      odla_status status = set_opts();
+      if (status != ODLA_SUCCESS) {
+        popart::logging::err("set computation option failed");
+        return status;
+      }
       // Cretate the dataflow
       std::vector<popart::TensorId> ids;
       for (const auto& output : outputs_map)
@@ -168,8 +180,14 @@ void _odla_computation::init(bool is_compile) {
 
       // Create and config SessionOptions
       set_session_opts();
-      if (use_pipeline())
-        builder = popart::Builder::createFromOnnxModel(set_pipeline_stage());
+      if (use_pipeline()) {
+        try {
+          builder = popart::Builder::createFromOnnxModel(set_pipeline_stage());
+        } catch (std::exception& e) {
+          popart::logging::err("create builder from onnx model failed.");
+          return ODLA_FAILURE;
+        }
+      }
       auto proto = builder->getModelProto(); // So, the init must be called at
                                              // odla_ExecuteCompute
 
@@ -185,12 +203,19 @@ void _odla_computation::init(bool is_compile) {
                               PopartConfig::instance()->save_model_path());
       }
 
-      // Create InferenceSession
-      auto new_session = popart::InferenceSession::createFromOnnxModel(
-          proto, data_flow, device, popart::InputShapeInfo(), session_opts_);
+      std::unique_ptr<popart::InferenceSession> new_session;
+      try {
+        // Create InferenceSession
+        new_session = std::move(popart::InferenceSession::createFromOnnxModel(
+            proto, data_flow, device, popart::InputShapeInfo(), session_opts_));
+      } catch (std::exception& e) {
+        popart::logging::err("Session::createFromOnnxModel failed:{}",
+                             e.what());
+        return ODLA_FAILURE;
+      }
 
       if (!is_compile) {
-        if (PopartConfig::instance()->load_cache()) {
+        if (PopartConfig::instance()->load_or_save_cache()) {
           popart::logging::info("Load cachefile from existing stream");
           auto cache_fs = PopartConfig::instance()->get_cache_fs();
           if (cache_fs->is_open()) {
@@ -202,10 +227,14 @@ void _odla_computation::init(bool is_compile) {
           }
         }
 
-        new_session->prepareDevice();
-        new_session->setRandomSeed(0);  // Init seed
-        new_session->weightsFromHost(); // Copy weights from host to IPU
-
+        try {
+          new_session->prepareDevice();
+          new_session->setRandomSeed(0);  // Init seed
+          new_session->weightsFromHost(); // Copy weights from host to IPU
+        } catch (std::exception& e) {
+          popart::logging::err("session init failed: {}", e.what());
+          return ODLA_FAILURE;
+        }
         // If in parallel mode, start the thread
         ExecutionMode mode = PopartConfig::instance()->execution_mode();
         if (PIPELINE == mode || PARALLEL == mode) {
@@ -214,33 +243,42 @@ void _odla_computation::init(bool is_compile) {
           popart::logging::warn("Parallel loop has been started");
           parallel_thread.detach();
         }
+      } else {
+        is_compile_only_ = true;
       }
+
       session =
           std::move(new_session); // set session after all initialization done.
     }
   }
 }
 
 // Now we set this by config file, should set by the caller?
-void _odla_computation::set_opts() {
+odla_status _odla_computation::set_opts() {
   if (PopartConfig::instance()->debug()) {
     opts.ipu_num = PopartConfig::instance()->ipu_num();
     opts.batches_per_step = PopartConfig::instance()->batches_per_step();
   } else if (use_pipeline()) { // Only check when use pipeline
-    if (opts.ipu_num != PopartConfig::instance()->ipu_num())
-      throw std::invalid_argument(
+    if (opts.ipu_num != PopartConfig::instance()->ipu_num()) {
+      popart::logging::err(
           "number of ipus in pipeline configuration:" +
           std::to_string(PopartConfig::instance()->ipu_num()) +
           " must same with options: " + std::to_string(opts.ipu_num));
-    if (opts.batches_per_step != PopartConfig::instance()->batches_per_step())
-      throw std::invalid_argument(
+      return ODLA_FAILURE;
+    }
+    if (opts.batches_per_step != PopartConfig::instance()->batches_per_step()) {
+      popart::logging::err(
           "batches per step in pipeline configuration:" +
           std::to_string(PopartConfig::instance()->batches_per_step()) +
           " must same with options: " + std::to_string(opts.batches_per_step));
+      return ODLA_FAILURE;
+    }
   }
+  return ODLA_SUCCESS;
 }
 
-void _odla_computation::set_executor() {
+odla_status _odla_computation::set_executor() {
+  odla_status ret_value = ODLA_SUCCESS;
   ExecutionMode mode = PopartConfig::instance()->execution_mode();
   if (PIPELINE == mode || PARALLEL == mode) {
     popart::logging::info("set the executor as parallel");
@@ -249,10 +287,13 @@ void _odla_computation::set_executor() {
     popart::logging::info("set the executor as sequence");
     executor_ = new Sequence();
   } else {
-    throw std::invalid_argument(
-        "*** FATAL *** unknown execution mode: {}" + std::to_string(mode) +
-        ". Should be one of pipeline, parallel or sequence");
+    popart::logging::err(
+        "unknown excution mode: {}, Should be one of pipeline, parallel or "
+        "sequence",
+        std::to_string(mode));
+    ret_value = ODLA_FAILURE;
   }
+  return ret_value;
 }
 
 void _odla_computation::set_session_opts() {
@@ -270,9 +311,9 @@ void _odla_computation::set_session_opts() {
     session_opts_.cachePath =
         opts.enable_engine_cache ? opts.cache_dir : envEngineCachePath;
   }
-  // session_opts_.matmulOptions["use128BitConvUnitLoad"] = "true";
-  // session_opts_.matmulOptions["enableMultiStageReduce"] = "false";
-  // session_opts_.matmulOptions["enableFastReduce"] = "true";
+  session_opts_.matmulOptions["use128BitConvUnitLoad"] = "true";
+  session_opts_.matmulOptions["enableMultiStageReduce"] = "false";
+  session_opts_.matmulOptions["enableFastReduce"] = "true";
   session_opts_.enableFloatingPointChecks = false;
   session_opts_.enableStochasticRounding = false;
   session_opts_.enablePrefetchDatastreams = false; // true;
@@ -392,9 +433,12 @@ bool _odla_context::hold(const std::string& function_name) {
     ss_holder << thread_id_of_holder;
     popart::logging::err(
         "[{}] odla_context {} has been held by thread: {}"
-        ", when try to hold it in function {}.",
+        ", when try to hold it in function {}. multi threads try to hold the "
+        "same context.",
         this_thread_id, this, thread_id_of_holder, function_name);
-    throw std::runtime_error("Multiple threads try to hold the same context");
+    return false;
+    //    throw std::runtime_error("Multiple threads try to hold the same
+    //    context");
   }
   return false;
 }

diff --git a/ODLA/platforms/odla_popart/odla_popart.h b/ODLA/platforms/odla_popart/odla_popart.h
@@ -115,6 +115,7 @@ struct _odla_computation {
       }
     }
   }
+  bool is_compile_only_;
   bool done_;
   bool thread_complete_;
   std::mutex init_mutex_;
@@ -127,21 +128,34 @@ struct _odla_computation {
         device(nullptr),
         opts({false, 1, 1}),
         done_(false),
+        is_compile_only_(false),
         executor_(nullptr),
         thread_state_(DONE) {
     builder->setAttribute(popart::sVirtualGraphAttribute, 0);
   }
-  void init(bool is_compile = false);
   std::string set_pipeline_stage();
   void set_session_opts();
-  void set_executor();
-  void set_opts();
+
   bool use_pipeline();
   bool hold();
+
+  odla_status init_working_thread();
+  odla_status init(bool is_compile = false);
+  odla_status set_executor();
+  odla_status set_opts();
   odla_status compile_and_export();
 
   inline Execution* executor() { return executor_; }
   inline bool is_done() { return thread_state_ != RUNNING; }
+  inline bool is_compile_only() { return is_compile_only_; }
+  inline void release_session() {
+    if (session != nullptr) {
+      session->getDevice().getDeviceInfo()->detach();
+      session.reset();
+      assert(session == nullptr);
+    }
+  }
+
   inline void mark_done() {
     while (thread_state_ != DONE) {
       std::unique_lock<std::mutex> lock(thread_done_mutex_);
@@ -150,11 +164,7 @@ struct _odla_computation {
     }
     // Once get notified, only detach the device once
     std::lock_guard<std::mutex> guard(init_mutex_);
-    if (session != nullptr) {
-      session->getDevice().getDeviceInfo()->detach();
-      session.reset();
-      assert(session == nullptr);
-    }
+    release_session();
   }
   inline void thread_done() {
     std::unique_lock<std::mutex> lock(thread_done_mutex_);