From 6061115b874f96db6e908b122023e066129dd945 Mon Sep 17 00:00:00 2001 From: Hariharan Devarajan Date: Sun, 29 Sep 2024 17:06:10 -0700 Subject: [PATCH 1/3] fix type of number for offset and size --- dlio_benchmark/reader/indexed_binary_mmap_reader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dlio_benchmark/reader/indexed_binary_mmap_reader.py b/dlio_benchmark/reader/indexed_binary_mmap_reader.py index 500f9d2c..f398a5dd 100644 --- a/dlio_benchmark/reader/indexed_binary_mmap_reader.py +++ b/dlio_benchmark/reader/indexed_binary_mmap_reader.py @@ -57,10 +57,10 @@ def load_index_file(self, global_sample_idx, filename, sample_index): self.file_map_ibr[filename] = [] bin_buffer_mmap = np.memmap(offset_file, mode='r', order='C') bin_buffer = memoryview(bin_buffer_mmap) - self.file_map_ibr[filename].append(np.frombuffer(bin_buffer, dtype=np.uint8)) + self.file_map_ibr[filename].append(np.frombuffer(bin_buffer, dtype=np.uint64)) bin_buffer_mmap = np.memmap(sz_file, mode='r', order='C') bin_buffer = memoryview(bin_buffer_mmap) - self.file_map_ibr[filename].append(np.frombuffer(bin_buffer, dtype=np.uint8)) + self.file_map_ibr[filename].append(np.frombuffer(bin_buffer, dtype=np.uint64)) @dlp.log def load_index(self): @@ -113,4 +113,4 @@ def is_index_based(self): return True def is_iterator_based(self): - return True \ No newline at end of file + return True From 4e6367447dac05fbcd939f9dd62613a2a7cd2be5 Mon Sep 17 00:00:00 2001 From: Hariharan Devarajan Date: Sun, 29 Sep 2024 18:38:16 -0700 Subject: [PATCH 2/3] optimization for reading in deep speed framework. --- dlio_benchmark/configs/workload/megatron_deepspeed.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dlio_benchmark/configs/workload/megatron_deepspeed.yaml b/dlio_benchmark/configs/workload/megatron_deepspeed.yaml index 20e4a3aa..9f4da4ab 100644 --- a/dlio_benchmark/configs/workload/megatron_deepspeed.yaml +++ b/dlio_benchmark/configs/workload/megatron_deepspeed.yaml @@ -12,12 +12,12 @@ dataset: data_folder: dataset/megatron-deepspeed/ format: mmap_indexed_binary num_files_train: 1 - num_samples_per_file: 277203535 - record_length: 2048 + num_samples_per_file: 270706 + record_length: 2097152 reader: data_loader: pytorch - batch_size: 1024 + batch_size: 1 read_threads: 1 file_shuffle: seed sample_shuffle: seed From 90d812c626ec63ffa9a3ef7d4081b3fbe289271d Mon Sep 17 00:00:00 2001 From: Hariharan Devarajan Date: Sun, 29 Sep 2024 18:52:47 -0700 Subject: [PATCH 3/3] increase buffer to 16 MB --- dlio_benchmark/data_generator/indexed_binary_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/data_generator/indexed_binary_generator.py b/dlio_benchmark/data_generator/indexed_binary_generator.py index 6a7013b9..719bf238 100644 --- a/dlio_benchmark/data_generator/indexed_binary_generator.py +++ b/dlio_benchmark/data_generator/indexed_binary_generator.py @@ -69,7 +69,7 @@ def generate(self): out_path_spec_off_idx = self.index_file_path_off(out_path_spec) out_path_spec_sz_idx = self.index_file_path_size(out_path_spec) fh = MPI.File.Open(comm, out_path_spec, amode) - samples_per_loop = int(MB / sample_size) + samples_per_loop = int(MB * 16 / sample_size) for sample_index in range(self.my_rank*samples_per_rank, samples_per_rank*(self.my_rank+1), samples_per_loop): #logging.info(f"{utcnow()} rank {self.my_rank} writing {sample_index} * {samples_per_loop} for {samples_per_rank} samples")