Skip to content

Commit

Permalink
catch potential exception and handle (#677)
Browse files Browse the repository at this point in the history
* close floating point check in popart

* create the pipeline resource when the computation is created, not at the time the cache computation item is set

fix pixel bert detect application runtime error

* add weiming build script fix

* add implementation of CreateExecutable

move compile_and_run to createExecute, and add more error handling logic

* add handling for potential exceptions & open session_option to optimize

* add more exception handling

* update odla_computation::init() to return odla_status value

* fix popart_config load error

* add bool to make sure a computation can't be executed when it is used to compile an executable

* improve load config logic

* update notice strings

* update load_config logic bug

Co-authored-by: yanwei <[email protected]>
  • Loading branch information
yanwei-gr and yanwei authored Nov 12, 2021
1 parent e0f07fa commit 6bc7b45
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 81 deletions.
54 changes: 39 additions & 15 deletions ODLA/platforms/odla_popart/odla_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ odla_status odla_SetComputationItem(odla_computation comp, odla_item_type type,
comp->opts.cache_dir = (reinterpret_cast<char*>(value));
break;
case 1001: // load cache directly, need set path of cache file
PopartConfig::instance()->set_load_cache(true);
PopartConfig::instance()->set_cache_path(reinterpret_cast<char*>(value));
PopartConfig::instance()->set_load_or_save_cache(true);
PopartConfig::instance()->set_cache_path(
(std::string) reinterpret_cast<char*>(value));
break;
default:
std::cerr << "Unsupported property type: " << type << std::endl;
Expand All @@ -81,8 +82,10 @@ odla_status odla_CreateExecutable(odla_executable* executable,
return ODLA_FAILURE;
} else {
if (comp->session) {
popart::logging::info("Create cache file from exist session");
return comp->compile_and_export();
} else {
popart::logging::info("Computation is not initialized. init it first");
_odla_computation::instance()->init(true); // set is_compile to true
// this comp init will create
// executable
Expand All @@ -107,6 +110,7 @@ odla_status odla_LoadExecutable(const odla_char* file_name,
odla_status odla_CreateComputation(odla_computation* comp) {
static void* custom_op_handle = nullptr;
*comp = _odla_computation::instance();
popart::logging::info("computation created");
if (custom_op_handle == nullptr) {
custom_op_handle = dlopen("libcustom_ops.so", RTLD_NOW | RTLD_GLOBAL);
if (custom_op_handle == nullptr) {
Expand All @@ -116,16 +120,18 @@ odla_status odla_CreateComputation(odla_computation* comp) {
}
// Read the config file
if (!PopartConfig::instance()->inited()) {
if (PopartConfig::instance()->load_cache()) {
odla_status ret = PopartConfig::instance()->extract_config_from_cache();
if (ret == ODLA_FAILURE) {
popart::logging::err("load config from cache failed");
return ret;
}
auto ret = PopartConfig::instance()->load_config(
std::getenv("ODLA_POPART_CONFIG"));
if (ret != ODLA_SUCCESS) {
popart::logging::err("error load config");
return ret;
}
PopartConfig::instance()->load_config(std::getenv("ODLA_POPART_CONFIG"));
}
_odla_computation::instance()->set_executor();
odla_status status = _odla_computation::instance()->set_executor();
if (status != ODLA_SUCCESS) {
popart::logging::err("set_executor failed");
return ODLA_FAILURE;
}
if (PopartConfig::instance()->execution_mode() == PARALLEL ||
PopartConfig::instance()->execution_mode() == PIPELINE) {
QManager::instance()->createQ(PopartConfig::instance()->queue_type());
Expand All @@ -137,8 +143,14 @@ odla_status odla_CreateComputation(odla_computation* comp) {
}

odla_status odla_CreateContext(odla_context* context) {
_odla_computation::instance(false)
->init(); // Place the init here to avoid long execution problem
odla_status status =
_odla_computation::instance(false)
->init(); // Place the init here to avoid long execution problem
if (status != ODLA_SUCCESS &&
_odla_computation::instance()->session == nullptr) {
popart::logging::err("init computation item in CreateContext failed.");
return ODLA_FAILURE;
}
*context = new _odla_pipeline_context(_odla_computation::instance());
return ODLA_SUCCESS;
}
Expand All @@ -149,15 +161,27 @@ odla_status odla_DestroyContext(odla_context ctx) {
}

odla_status odla_DestroyComputation(odla_computation comp) {
comp->mark_done();
_odla_computation::destruct();
QManager::instance()->deleteQ(); // delete current queue
if (comp != nullptr) {
if (!comp->is_compile_only()) {
comp->mark_done();
QManager::instance()->deleteQ(); // delete current queue
}
comp->release_session();
_odla_computation::destruct(); // release the real computation
}

return ODLA_SUCCESS;
}

// Runs `comp` against `context` through the executor attached to the
// computation.
//
// Returns ODLA_FAILURE without executing when:
//   - `comp` or `context` is null,
//   - the computation singleton reports is_compile_only() (see the error
//     message below),
//   - no executor is attached to `comp`, or
//   - `context` cannot be held for this call (hold() returns false).
// Otherwise returns whatever the executor's compute() returns.
odla_status odla_ExecuteComputation(odla_computation comp, odla_context context,
                                    odla_compute_mode mode,
                                    odla_device device) {
  // Both handles are dereferenced below; reject null ones up front instead of
  // crashing, matching the null guard in odla_DestroyComputation.
  if (comp == nullptr || context == nullptr) {
    popart::logging::err(
        "odla_ExecuteComputation got a null computation or context");
    return ODLA_FAILURE;
  }
  if (_odla_computation::instance()->is_compile_only()) {
    popart::logging::err(
        "This computation is created for compile executable, please re-create "
        "another computation for computing");
    return ODLA_FAILURE;
  }
  // Check the executor before taking hold of the context so that a failure
  // here does not leave the context held.
  auto* executor = comp->executor();
  if (executor == nullptr) {
    popart::logging::err("No executor is set for this computation");
    return ODLA_FAILURE;
  }
  if (!context->hold("odla_ExecuteComputation")) return ODLA_FAILURE;
  return executor->compute(comp, context, mode, device);
}
Expand Down
104 changes: 74 additions & 30 deletions ODLA/platforms/odla_popart/odla_popart.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,16 @@ void compute_loop(odla_computation comp) {
}

odla_status _odla_computation::compile_and_export() {
odla_status ret_value = ODLA_SUCCESS;
popart::logging::warn("Start compile and export");
const std::string& cache_file_name =
PopartConfig::instance()->get_cache_path();
std::string file_suffix(".popart");
int file_prefix = cache_file_name.rfind(file_suffix);
if (file_prefix == std::string::npos ||
file_prefix + file_suffix.size() < cache_file_name.size()) {
popart::logging::err("Bad cache file name");
popart::logging::err(
"Bad cache file name. File name should end with '.popart'");
return ODLA_FAILURE;
}
if (file_prefix == std::string::npos) {
Expand All @@ -117,7 +119,7 @@ odla_status _odla_computation::compile_and_export() {
config_fs.open(config_file_name, std::ios_base::in | std::ifstream::binary);
if (!config_fs.is_open()) {
popart::logging::warn(
"invalid config file name:[ {} ] will use default config",
"Open config file failed:[ {} ] will use default config",
config_file_name);
PopartConfig::instance()->use_default();
config_string = PopartConfig::instance()->get_default_config_string();
Expand All @@ -134,18 +136,28 @@ odla_status _odla_computation::compile_and_export() {
cache_fs.write((char*)&config_size, sizeof(config_size));
cache_fs.write(config_string.c_str(), config_string.size());

_odla_computation::instance()->session->compileAndExport(cache_fs.flush());

try {
_odla_computation::instance()->session->compileAndExport(cache_fs.flush());
} catch (std::exception& e) {
popart::logging::err("compileAndExport Falied: {}", e.what());
ret_value = ODLA_FAILURE;
}
cache_fs.flush();
cache_fs.close();
config_fs.close();

return ret_value;
}

void _odla_computation::init(bool is_compile) {
odla_status _odla_computation::init(bool is_compile) {
if (!session) {
std::lock_guard<std::mutex> guard(init_mutex_);
if (!session) {
set_opts();
odla_status status = set_opts();
if (status != ODLA_SUCCESS) {
popart::logging::err("set computation option failed");
return status;
}
// Cretate the dataflow
std::vector<popart::TensorId> ids;
for (const auto& output : outputs_map)
Expand All @@ -168,8 +180,14 @@ void _odla_computation::init(bool is_compile) {

// Create and config SessionOptions
set_session_opts();
if (use_pipeline())
builder = popart::Builder::createFromOnnxModel(set_pipeline_stage());
if (use_pipeline()) {
try {
builder = popart::Builder::createFromOnnxModel(set_pipeline_stage());
} catch (std::exception& e) {
popart::logging::err("create builder from onnx model failed.");
return ODLA_FAILURE;
}
}
auto proto = builder->getModelProto(); // So, the init must be called at
// odla_ExecuteCompute

Expand All @@ -185,12 +203,19 @@ void _odla_computation::init(bool is_compile) {
PopartConfig::instance()->save_model_path());
}

// Create InferenceSession
auto new_session = popart::InferenceSession::createFromOnnxModel(
proto, data_flow, device, popart::InputShapeInfo(), session_opts_);
std::unique_ptr<popart::InferenceSession> new_session;
try {
// Create InferenceSession
new_session = std::move(popart::InferenceSession::createFromOnnxModel(
proto, data_flow, device, popart::InputShapeInfo(), session_opts_));
} catch (std::exception& e) {
popart::logging::err("Session::createFromOnnxModel failed:{}",
e.what());
return ODLA_FAILURE;
}

if (!is_compile) {
if (PopartConfig::instance()->load_cache()) {
if (PopartConfig::instance()->load_or_save_cache()) {
popart::logging::info("Load cachefile from existing stream");
auto cache_fs = PopartConfig::instance()->get_cache_fs();
if (cache_fs->is_open()) {
Expand All @@ -202,10 +227,14 @@ void _odla_computation::init(bool is_compile) {
}
}

new_session->prepareDevice();
new_session->setRandomSeed(0); // Init seed
new_session->weightsFromHost(); // Copy weights from host to IPU

try {
new_session->prepareDevice();
new_session->setRandomSeed(0); // Init seed
new_session->weightsFromHost(); // Copy weights from host to IPU
} catch (std::exception& e) {
popart::logging::err("session init failed: {}", e.what());
return ODLA_FAILURE;
}
// If in parallel mode, start the thread
ExecutionMode mode = PopartConfig::instance()->execution_mode();
if (PIPELINE == mode || PARALLEL == mode) {
Expand All @@ -214,33 +243,42 @@ void _odla_computation::init(bool is_compile) {
popart::logging::warn("Parallel loop has been started");
parallel_thread.detach();
}
} else {
is_compile_only_ = true;
}

session =
std::move(new_session); // set session after all initialization done.
}
}
}

// Now we set this by config file, should set by the caller?
void _odla_computation::set_opts() {
odla_status _odla_computation::set_opts() {
if (PopartConfig::instance()->debug()) {
opts.ipu_num = PopartConfig::instance()->ipu_num();
opts.batches_per_step = PopartConfig::instance()->batches_per_step();
} else if (use_pipeline()) { // Only check when use pipeline
if (opts.ipu_num != PopartConfig::instance()->ipu_num())
throw std::invalid_argument(
if (opts.ipu_num != PopartConfig::instance()->ipu_num()) {
popart::logging::err(
"number of ipus in pipeline configuration:" +
std::to_string(PopartConfig::instance()->ipu_num()) +
" must same with options: " + std::to_string(opts.ipu_num));
if (opts.batches_per_step != PopartConfig::instance()->batches_per_step())
throw std::invalid_argument(
return ODLA_FAILURE;
}
if (opts.batches_per_step != PopartConfig::instance()->batches_per_step()) {
popart::logging::err(
"batches per step in pipeline configuration:" +
std::to_string(PopartConfig::instance()->batches_per_step()) +
" must same with options: " + std::to_string(opts.batches_per_step));
return ODLA_FAILURE;
}
}
return ODLA_SUCCESS;
}

void _odla_computation::set_executor() {
odla_status _odla_computation::set_executor() {
odla_status ret_value = ODLA_SUCCESS;
ExecutionMode mode = PopartConfig::instance()->execution_mode();
if (PIPELINE == mode || PARALLEL == mode) {
popart::logging::info("set the executor as parallel");
Expand All @@ -249,10 +287,13 @@ void _odla_computation::set_executor() {
popart::logging::info("set the executor as sequence");
executor_ = new Sequence();
} else {
throw std::invalid_argument(
"*** FATAL *** unknown execution mode: {}" + std::to_string(mode) +
". Should be one of pipeline, parallel or sequence");
popart::logging::err(
"unknown excution mode: {}, Should be one of pipeline, parallel or "
"sequence",
std::to_string(mode));
ret_value = ODLA_FAILURE;
}
return ret_value;
}

void _odla_computation::set_session_opts() {
Expand All @@ -270,9 +311,9 @@ void _odla_computation::set_session_opts() {
session_opts_.cachePath =
opts.enable_engine_cache ? opts.cache_dir : envEngineCachePath;
}
// session_opts_.matmulOptions["use128BitConvUnitLoad"] = "true";
// session_opts_.matmulOptions["enableMultiStageReduce"] = "false";
// session_opts_.matmulOptions["enableFastReduce"] = "true";
session_opts_.matmulOptions["use128BitConvUnitLoad"] = "true";
session_opts_.matmulOptions["enableMultiStageReduce"] = "false";
session_opts_.matmulOptions["enableFastReduce"] = "true";
session_opts_.enableFloatingPointChecks = false;
session_opts_.enableStochasticRounding = false;
session_opts_.enablePrefetchDatastreams = false; // true;
Expand Down Expand Up @@ -392,9 +433,12 @@ bool _odla_context::hold(const std::string& function_name) {
ss_holder << thread_id_of_holder;
popart::logging::err(
"[{}] odla_context {} has been held by thread: {}"
", when try to hold it in function {}.",
", when try to hold it in function {}. multi threads try to hold the "
"same context.",
this_thread_id, this, thread_id_of_holder, function_name);
throw std::runtime_error("Multiple threads try to hold the same context");
return false;
// throw std::runtime_error("Multiple threads try to hold the same
// context");
}
return false;
}
Expand Down
26 changes: 18 additions & 8 deletions ODLA/platforms/odla_popart/odla_popart.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ struct _odla_computation {
}
}
}
bool is_compile_only_;
bool done_;
bool thread_complete_;
std::mutex init_mutex_;
Expand All @@ -127,21 +128,34 @@ struct _odla_computation {
device(nullptr),
opts({false, 1, 1}),
done_(false),
is_compile_only_(false),
executor_(nullptr),
thread_state_(DONE) {
builder->setAttribute(popart::sVirtualGraphAttribute, 0);
}
void init(bool is_compile = false);
std::string set_pipeline_stage();
void set_session_opts();
void set_executor();
void set_opts();

bool use_pipeline();
bool hold();

odla_status init_working_thread();
odla_status init(bool is_compile = false);
odla_status set_executor();
odla_status set_opts();
odla_status compile_and_export();

inline Execution* executor() { return executor_; }
inline bool is_done() { return thread_state_ != RUNNING; }
inline bool is_compile_only() { return is_compile_only_; }
inline void release_session() {
if (session != nullptr) {
session->getDevice().getDeviceInfo()->detach();
session.reset();
assert(session == nullptr);
}
}

inline void mark_done() {
while (thread_state_ != DONE) {
std::unique_lock<std::mutex> lock(thread_done_mutex_);
Expand All @@ -150,11 +164,7 @@ struct _odla_computation {
}
// Once get notified, only detach the device once
std::lock_guard<std::mutex> guard(init_mutex_);
if (session != nullptr) {
session->getDevice().getDeviceInfo()->detach();
session.reset();
assert(session == nullptr);
}
release_session();
}
inline void thread_done() {
std::unique_lock<std::mutex> lock(thread_done_mutex_);
Expand Down
Loading

0 comments on commit 6bc7b45

Please sign in to comment.