diff --git a/dlio_benchmark/data_generator/hdf5_generator.py b/dlio_benchmark/data_generator/hdf5_generator.py
index 02fe1e68..01d0704c 100644
--- a/dlio_benchmark/data_generator/hdf5_generator.py
+++ b/dlio_benchmark/data_generator/hdf5_generator.py
@@ -45,38 +45,29 @@ def generate(self):
         """
         super().generate()
         np.random.seed(10)
-        samples_per_iter=max(1, int(self._args.generation_buffer_size/self._args.record_length))
         record_labels = [0] * self.num_samples
         dim = self.get_dimension(self.total_files_to_generate)
+        chunks = None
+        if self.enable_chunking:
+            chunk_dimension = int(math.ceil(math.sqrt(self.chunk_size)))
+            if chunk_dimension > self._dimension:
+                chunk_dimension = self._dimension
+            chunks = (1, chunk_dimension, chunk_dimension)
+        compression = None
+        compression_level = None
+        if self.compression != Compression.NONE:
+            compression = str(self.compression)
+            if self.compression == Compression.GZIP:
+                compression_level = self.compression_level
         for i in dlp.iter(range(self.my_rank, int(self.total_files_to_generate), self.comm_size)):
-            progress(i, self.total_files_to_generate, "Generating HDF5 Data")
             dim1 = dim[2*i]
             dim2 = dim[2*i+1]
-            records = np.random.randint(255, size=(samples_per_iter, dim1, dim2), dtype=np.uint8)
+            records = np.random.randint(255, size=(dim1, dim2, self.num_samples), dtype=np.uint8)
             out_path_spec = self.storage.get_uri(self._file_list[i])
+            progress(i+1, self.total_files_to_generate, "Generating NPZ Data")
             hf = h5py.File(out_path_spec, 'w')
-            chunks = None
-            if self.enable_chunking:
-                chunk_dimension = int(math.ceil(math.sqrt(self.chunk_size)))
-                if chunk_dimension > self._dimension:
-                    chunk_dimension = self._dimension
-                chunks = (1, chunk_dimension, chunk_dimension)
-            compression = None
-            compression_level = None
-            if self.compression != Compression.NONE:
-                compression = str(self.compression)
-                if self.compression == Compression.GZIP:
-                    compression_level = self.compression_level
-            dset = hf.create_dataset('records', (self.num_samples, dim1, dim2), chunks=chunks, compression=compression,
-                                     compression_opts=compression_level, dtype=np.uint8)
-            samples_written = 0
-            while samples_written < self.num_samples:
-                if samples_per_iter < self.num_samples-samples_written:
-                    samples_to_write = samples_per_iter
-                else:
-                    samples_to_write = self.num_samples-samples_written
-                dset[samples_written:samples_written+samples_to_write] = records[:samples_to_write]
-                samples_written += samples_to_write
+            hf.create_dataset('records', (self.num_samples, dim1, dim2), chunks=chunks, compression=compression,
+                              compression_opts=compression_level, dtype=np.uint8, data=records)
             hf.create_dataset('labels', data=record_labels)
             hf.close()
         np.random.seed()
diff --git a/tests/dlio_benchmark_test.py b/tests/dlio_benchmark_test.py
index 3e0a1508..99e2faa4 100644
--- a/tests/dlio_benchmark_test.py
+++ b/tests/dlio_benchmark_test.py
@@ -48,7 +48,8 @@ def init():
     DLIOMPI.get_instance().initialize()
 
 def finalize():
-    DLIOMPI.get_instance().finalize()
+    # DLIOMPI.get_instance().finalize()
+    pass
 
 def clean(storage_root="./") -> None:
     comm.Barrier()