Skip to content

Commit

Permalink
Merge branch 'fix-create_dir_error_during_mpi-jershi' into 'main'
Browse files Browse the repository at this point in the history
Resolve the directory-creation error that occurs when dumping in MPI mode

See merge request dl/hugectr/hugectr!1005
  • Loading branch information
minseokl committed Oct 21, 2022
2 parents 4f0cef9 + 8a94147 commit f262e9d
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "HugeCTR/include/data_simulator.hpp"
#include "HugeCTR/include/embeddings/distributed_slot_sparse_embedding_hash.hpp"
#include "HugeCTR/include/io/filesystem.hpp"
#include "HugeCTR/include/io/io_utils.hpp"
#include "HugeCTR/include/utils.cuh"

namespace HugeCTR {
Expand Down Expand Up @@ -805,10 +806,14 @@ void DistributedSlotSparseEmbeddingHash<TypeHashKey, TypeEmbeddingComp>::dump_pa
CudaDeviceContext context;
size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count();

auto fs = FileSystemBuilder::build_unique_by_path(sparse_model);
bool is_local_path = IOUtils::is_local_path(sparse_model);
const std::string key_file(sparse_model + "/key");
const std::string vec_file(sparse_model + "/emb_vector");

#ifdef ENABLE_MPI
HCTR_CHECK_HINT(is_local_path, "Dumping to remote file system in MPI mode is not supported.");
fs->create_dir(sparse_model);
MPI_File key_fh, vec_fh;
HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, key_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY,
MPI_INFO_NULL, &key_fh));
Expand Down Expand Up @@ -933,7 +938,6 @@ void DistributedSlotSparseEmbeddingHash<TypeHashKey, TypeEmbeddingComp>::dump_pa
HCTR_MPI_THROW(MPI_File_close(&vec_fh));
HCTR_MPI_THROW(MPI_Type_free(&TYPE_EMB_VECTOR));
#else
auto fs = FileSystemBuilder::build_unique_by_path(sparse_model);
fs->write(key_file, reinterpret_cast<char *>(h_key_ptr), total_count * key_size, true);
fs->write(vec_file, reinterpret_cast<char *>(h_hash_table_value), total_count * vec_size, true);
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "HugeCTR/include/data_simulator.hpp"
#include "HugeCTR/include/embeddings/localized_slot_sparse_embedding_hash.hpp"
#include "HugeCTR/include/io/filesystem.hpp"
#include "HugeCTR/include/io/io_utils.hpp"
#include "HugeCTR/include/utils.cuh"
#include "HugeCTR/include/utils.hpp"

Expand Down Expand Up @@ -959,11 +960,14 @@ void LocalizedSlotSparseEmbeddingHash<TypeHashKey, TypeEmbeddingComp>::dump_para
size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count();

auto fs = FileSystemBuilder::build_unique_by_path(sparse_model);
bool is_local_path = IOUtils::is_local_path(sparse_model);
const std::string key_file(sparse_model + "/key");
const std::string slot_file(sparse_model + "/slot_id");
const std::string vec_file(sparse_model + "/emb_vector");

#ifdef ENABLE_MPI
HCTR_CHECK_HINT(is_local_path, "Dumping to remote file system in MPI mode is not supported.");
fs->create_dir(sparse_model);
MPI_File key_fh, slot_fh, vec_fh;
HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, key_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY,
MPI_INFO_NULL, &key_fh));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp"
#include "HugeCTR/include/io/filesystem.hpp"
#include "HugeCTR/include/io/io_utils.hpp"

#ifdef ENABLE_MPI
#include <mpi.h>
Expand Down Expand Up @@ -1012,12 +1013,15 @@ void LocalizedSlotSparseEmbeddingOneHot<TypeHashKey, TypeEmbeddingComp>::dump_pa
size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count();

auto fs = FileSystemBuilder::build_unique_by_path(sparse_model);
bool is_local_path = IOUtils::is_local_path(sparse_model);

const std::string key_file(sparse_model + "/key");
const std::string slot_file(sparse_model + "/slot_id");
const std::string vec_file(sparse_model + "/emb_vector");

#ifdef ENABLE_MPI
HCTR_CHECK_HINT(is_local_path, "Dumping to remote file system in MPI mode is not supported.");
fs->create_dir(sparse_model);
MPI_File key_fh, slot_fh, vec_fh;
HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, key_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY,
MPI_INFO_NULL, &key_fh));
Expand Down
6 changes: 4 additions & 2 deletions HugeCTR/src/io/local_filesystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ size_t LocalFileSystem::get_file_size(const std::string& path) const {
}

// Ensures that `path` exists as a directory, creating any missing parent
// directories along the way.
//
// The call is idempotent: it succeeds when the directory is already present.
// This matters because several processes (e.g. MPI ranks dumping the same
// model) may call it concurrently on the same path. A separate
// "exists() then create" sequence is racy in that situation: a peer can create
// the directory between the two calls, create_directories() then returns
// false, and the caller would wrongly report a failure.
//
// Instead of trusting the boolean return value (false also means "nothing had
// to be created"), the post-condition is verified directly.
//
// @param path  Directory to create.
// Fails the HCTR_CHECK_HINT check if, after the attempt, `path` is not a
// directory (creation failed, or `path` exists but is something else).
void LocalFileSystem::create_dir(const std::string& path) {
  std::error_code ec;
  // Non-throwing overload; an already existing directory is not an error.
  std::filesystem::create_directories(path, ec);
  // Only query the path when creation did not report an error.
  const bool is_ready = !ec && std::filesystem::is_directory(path, ec);
  HCTR_CHECK_HINT(is_ready, std::string("Failed to create the directory: " + path).c_str());
}

void LocalFileSystem::delete_file(const std::string& path, bool recursive) {
Expand Down

0 comments on commit f262e9d

Please sign in to comment.