From 8a941472f15bc3c16df4ba55610b026276844841 Mon Sep 17 00:00:00 2001 From: Jerry Shi Date: Fri, 21 Oct 2022 01:30:31 -0700 Subject: [PATCH] Resolve the create dir error during MPI dumping --- .../embeddings/distributed_slot_sparse_embedding_hash.cu | 6 +++++- .../src/embeddings/localized_slot_sparse_embedding_hash.cu | 4 ++++ .../embeddings/localized_slot_sparse_embedding_one_hot.cu | 4 ++++ HugeCTR/src/io/local_filesystem.cpp | 6 ++++-- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/HugeCTR/src/embeddings/distributed_slot_sparse_embedding_hash.cu b/HugeCTR/src/embeddings/distributed_slot_sparse_embedding_hash.cu index ddd9e3fb9d..d1aa9df30e 100644 --- a/HugeCTR/src/embeddings/distributed_slot_sparse_embedding_hash.cu +++ b/HugeCTR/src/embeddings/distributed_slot_sparse_embedding_hash.cu @@ -24,6 +24,7 @@ #include "HugeCTR/include/data_simulator.hpp" #include "HugeCTR/include/embeddings/distributed_slot_sparse_embedding_hash.hpp" #include "HugeCTR/include/io/filesystem.hpp" +#include "HugeCTR/include/io/io_utils.hpp" #include "HugeCTR/include/utils.cuh" namespace HugeCTR { @@ -805,10 +806,14 @@ void DistributedSlotSparseEmbeddingHash::dump_pa CudaDeviceContext context; size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); + auto fs = FileSystemBuilder::build_unique_by_path(sparse_model); + bool is_local_path = IOUtils::is_local_path(sparse_model); const std::string key_file(sparse_model + "/key"); const std::string vec_file(sparse_model + "/emb_vector"); #ifdef ENABLE_MPI + HCTR_CHECK_HINT(is_local_path, "Dumping to remote file system in MPI mode is not supported."); + fs->create_dir(sparse_model); MPI_File key_fh, vec_fh; HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, key_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &key_fh)); @@ -933,7 +938,6 @@ void DistributedSlotSparseEmbeddingHash::dump_pa HCTR_MPI_THROW(MPI_File_close(&vec_fh)); HCTR_MPI_THROW(MPI_Type_free(&TYPE_EMB_VECTOR)); #else - 
auto fs = FileSystemBuilder::build_unique_by_path(sparse_model); fs->write(key_file, reinterpret_cast(h_key_ptr), total_count * key_size, true); fs->write(vec_file, reinterpret_cast(h_hash_table_value), total_count * vec_size, true); #endif diff --git a/HugeCTR/src/embeddings/localized_slot_sparse_embedding_hash.cu b/HugeCTR/src/embeddings/localized_slot_sparse_embedding_hash.cu index 214990dc05..be8231f3a1 100644 --- a/HugeCTR/src/embeddings/localized_slot_sparse_embedding_hash.cu +++ b/HugeCTR/src/embeddings/localized_slot_sparse_embedding_hash.cu @@ -23,6 +23,7 @@ #include "HugeCTR/include/data_simulator.hpp" #include "HugeCTR/include/embeddings/localized_slot_sparse_embedding_hash.hpp" #include "HugeCTR/include/io/filesystem.hpp" +#include "HugeCTR/include/io/io_utils.hpp" #include "HugeCTR/include/utils.cuh" #include "HugeCTR/include/utils.hpp" @@ -959,11 +960,14 @@ void LocalizedSlotSparseEmbeddingHash::dump_para size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); auto fs = FileSystemBuilder::build_unique_by_path(sparse_model); + bool is_local_path = IOUtils::is_local_path(sparse_model); const std::string key_file(sparse_model + "/key"); const std::string slot_file(sparse_model + "/slot_id"); const std::string vec_file(sparse_model + "/emb_vector"); #ifdef ENABLE_MPI + HCTR_CHECK_HINT(is_local_path, "Dumping to remote file system in MPI mode is not supported."); + fs->create_dir(sparse_model); MPI_File key_fh, slot_fh, vec_fh; HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, key_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &key_fh)); diff --git a/HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu b/HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu index 1b5d27811b..f839069600 100644 --- a/HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu +++ b/HugeCTR/src/embeddings/localized_slot_sparse_embedding_one_hot.cu @@ -16,6 +16,7 @@ #include 
"HugeCTR/include/embeddings/localized_slot_sparse_embedding_one_hot.hpp" #include "HugeCTR/include/io/filesystem.hpp" +#include "HugeCTR/include/io/io_utils.hpp" #ifdef ENABLE_MPI #include @@ -1012,12 +1013,15 @@ void LocalizedSlotSparseEmbeddingOneHot::dump_pa size_t local_gpu_count = embedding_data_.get_resource_manager().get_local_gpu_count(); auto fs = FileSystemBuilder::build_unique_by_path(sparse_model); + bool is_local_path = IOUtils::is_local_path(sparse_model); const std::string key_file(sparse_model + "/key"); const std::string slot_file(sparse_model + "/slot_id"); const std::string vec_file(sparse_model + "/emb_vector"); #ifdef ENABLE_MPI + HCTR_CHECK_HINT(is_local_path, "Dumping to remote file system in MPI mode is not supported."); + fs->create_dir(sparse_model); MPI_File key_fh, slot_fh, vec_fh; HCTR_MPI_THROW(MPI_File_open(MPI_COMM_WORLD, key_file.c_str(), MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &key_fh)); diff --git a/HugeCTR/src/io/local_filesystem.cpp b/HugeCTR/src/io/local_filesystem.cpp index bef959b9f3..699d4229a9 100644 --- a/HugeCTR/src/io/local_filesystem.cpp +++ b/HugeCTR/src/io/local_filesystem.cpp @@ -37,8 +37,10 @@ size_t LocalFileSystem::get_file_size(const std::string& path) const { } void LocalFileSystem::create_dir(const std::string& path) { - bool success = std::filesystem::create_directory(path); - HCTR_CHECK_HINT(success, std::string("Failed to create the directory: " + path).c_str()); + if (!std::filesystem::exists(path)) { + bool success = std::filesystem::create_directories(path); + HCTR_CHECK_HINT(success, std::string("Failed to create the directory: " + path).c_str()); + } } void LocalFileSystem::delete_file(const std::string& path, bool recursive) {