Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more debug information to check the coredump when destroyComputation #899

Open
wants to merge 5 commits into
base: ipu_stable_sdk2.3.1
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ODLA/platforms/odla_popart/odla_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ odla_status odla_DestroyComputation(odla_computation comp) {
}
popart::logging::warn("reset config state, comp: {}", comp);
PopartConfig::instance()->reset_init_state();
popart::logging::warn("reset config state DONE, comp: {}", comp);

return ODLA_SUCCESS;
}
Expand Down
31 changes: 29 additions & 2 deletions ODLA/platforms/odla_popart/odla_popart.cc
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,27 @@ void compute_loop(odla_computation comp) {
#undef RETURN_ERROR
#define RETURN_ERROR(ERR_CODE) return ERR_CODE;

// Tear down the session owned by this computation.
//
// When a session exists and its device reports a non-null DeviceInfo, the
// device is detached first; afterwards the session pointer is reset. Every
// step (including the "nothing to do" cases) is logged at warn level so the
// teardown sequence can be followed in the logs.
void _odla_computation::release_session() {
  if (session != nullptr) {
    if (session->getDevice().getDeviceInfo() != nullptr) {
      // Record the queue-manager status right before detaching.
      popart::logging::warn(
          "Calling session->getDevice().getDeviceInfo()->detach() to detach the "
          "device when QManager Status is {}",
          QManager::instance()->get_status());
      session->getDevice().getDeviceInfo()->detach();
      popart::logging::warn(
          "The computation:{} session:{} detached from device", this,
          session.get());
    } else {
      popart::logging::warn(
          "session->getDevice().getDeviceInfo() is nullptr when try to release "
          "it");
    }
    // The session is dropped whether or not a detach happened.
    session.reset();
  } else {
    popart::logging::warn("session is nullptr when try to release it");
  }
  assert(session == nullptr);
  popart::logging::warn("The computation:{} session has been reset", this);
}

odla_status _odla_computation::compile_and_export() {
odla_status ret_value = ODLA_SUCCESS;
POPLAR_TRY
Expand Down Expand Up @@ -213,7 +234,7 @@ odla_status _odla_computation::init(bool is_compile) {
popart::AnchorReturnType("All"));
// Acquire IPU
if (opts.use_ipu_model) {
popart::logging::info("Using IPU Model to run.");
popart::logging::warn("Using IPU Model to run.");
std::map<std::string, std::string> deviceOpts{
{"numIPUs", std::to_string(opts.ipu_num)}, {"tilesPerIPU", "1216"}};
device =
Expand All @@ -230,6 +251,7 @@ odla_status _odla_computation::init(bool is_compile) {
throw std::runtime_error(
"Failed to get a device when initializing odla_computation");
}
popart::logging::warn("Device acquired to run model");

// Create and config SessionOptions
set_session_opts();
Expand All @@ -255,6 +277,9 @@ odla_status _odla_computation::init(bool is_compile) {
// Create InferenceSession
new_session = std::move(popart::InferenceSession::createFromOnnxModel(
proto, data_flow, device, popart::InputShapeInfo(), session_opts_));
popart::logging::warn(
"New session: {} has been created for computation: {}",
new_session.get(), this);

if (!is_compile) {
if (PopartConfig::instance()->load_or_save_cache()) {
Expand Down Expand Up @@ -297,7 +322,9 @@ odla_status _odla_computation::init(bool is_compile) {
is_compile_only_ = true;
}
// set session after all initialization done.
popart::logging::warn("Moving new_session to session: {}", session.get());
session = std::move(new_session);
popart::logging::warn("Moved new_session to session: {}", session.get());
// Thread must be started after all initialization done
if (!is_compile) {
ExecutionMode mode = PopartConfig::instance()->execution_mode();
Expand Down Expand Up @@ -404,7 +431,7 @@ bool _odla_computation::hold() {
} else {
std::stringstream ss_holder;
ss_holder << thread_id_of_holder;
popart::logging::warn(
popart::logging::info(
"The odla_computation {} has been held by thread: {}"
", when thread {} try to hold it.",
this, thread_id_of_holder, this_thread_id);
Expand Down
12 changes: 1 addition & 11 deletions ODLA/platforms/odla_popart/odla_popart.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,17 +151,7 @@ struct _odla_computation {
// Returns the stored executor_ pointer.
inline Execution* executor() { return executor_; }
// True whenever thread_state_ is anything other than RUNNING.
inline bool is_done() { return thread_state_ != RUNNING; }
// Returns the is_compile_only_ flag.
inline bool is_compile_only() { return is_compile_only_; }
// Detaches the device behind `session` and then resets the session pointer,
// logging both steps at warn level. Does nothing when no session is held.
inline void release_session() {
  if (nullptr == session) {
    return;
  }
  session->getDevice().getDeviceInfo()->detach();
  popart::logging::warn("The computation:{} session:{} detached from device",
                        this, session.get());
  session.reset();
  assert(session == nullptr);
  popart::logging::warn("The computation:{} session has been reset", this);
}
void release_session();
inline void set_thread_run() {
std::unique_lock<std::mutex> lock(thread_done_mutex_);
thread_state_ = RUNNING;
Expand Down
2 changes: 1 addition & 1 deletion ODLA/platforms/odla_popart/popart_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ class PopartConfig {
std::lock_guard<std::mutex> guard(config_mutex_);
if (inited_) {
inited_ = false;
if (cache_fs->is_open()) {
if (cache_fs && cache_fs->is_open()) {
cache_fs->close();
cache_fs->clear();
}
Expand Down