From c78a1c7bd0da13fdb2ec3b2bf2207a8cf8d7ad3e Mon Sep 17 00:00:00 2001 From: Scott Stevenson Date: Tue, 3 Dec 2024 22:19:49 +0000 Subject: [PATCH] Add missing call to parent constructor in SimulationDataset The `SimulationDataset` was missing a call to the parent `StreamingDataset` constructor, which led to errors when accessing attributes that are set in that constructor, such as `epoch_seed_change`: ``` AttributeError: 'SimulationDataset' object has no attribute 'epoch_seed_change' Traceback: File "/home/scott/projects/streaming/.venv/lib64/python3.12/site-packages/streamlit/runtime/scriptrunner/exec_code.py", line 88, in exec_func_with_error_handling result = func() ^^^^^^ File "/home/scott/projects/streaming/.venv/lib64/python3.12/site-packages/streamlit/runtime/scriptrunner/script_runner.py", line 579, in code_to_exec exec(code, module.__dict__) File "/home/scott/projects/streaming/simulation/interfaces/sim_ui.py", line 409, in <module> submit_jobs(shuffle_quality, dataset, time_per_sample, node_internet_bandwidth, File "/home/scott/projects/streaming/simulation/interfaces/sim_ui.py", line 110, in submit_jobs for output in gen_sim: ^^^^^^^ File "/home/scott/projects/streaming/simulation/core/main.py", line 110, in simulate samples_per_node = dataset.get_samples_per_node(epoch, 0) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/scott/projects/streaming/simulation/core/sim_dataset.py", line 367, in get_samples_per_node partition = generate_work(self.batching_method, self, self.world, epoch, sample_in_epoch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/scott/projects/streaming/streaming/base/batching/__init__.py", line 45, in generate_work return get(dataset, world, epoch, sample_in_epoch) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/scott/projects/streaming/streaming/base/batching/random.py", line 49, in generate_work_random_batching shuffle_units, small_per_big = dataset.resample_streams(epoch) 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/scott/projects/streaming/streaming/base/dataset.py", line 878, in resample_streams epoch, self.epoch_seed_change) ^^^^^^^^^^^^^^^^^^^^^^ ``` Closes https://github.com/mosaicml/streaming/issues/831 --- simulation/core/sim_dataset.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/simulation/core/sim_dataset.py b/simulation/core/sim_dataset.py index 8d1fb176e..3d083fb8d 100644 --- a/simulation/core/sim_dataset.py +++ b/simulation/core/sim_dataset.py @@ -135,6 +135,29 @@ def __init__(self, # Time how long it takes for StreamingDataset instantiation t0 = time.time() + super().__init__(streams=streams, + remote=remote, + local=local, + split=split, + download_retry=download_retry, + download_timeout=download_timeout, + validate_hash=validate_hash, + keep_zip=keep_zip, + epoch_size=epoch_size, + predownload=predownload, + cache_limit=cache_limit, + partition_algo=partition_algo, + num_canonical_nodes=num_canonical_nodes, + batch_size=batch_size, + shuffle=shuffle, + shuffle_algo=shuffle_algo, + shuffle_seed=shuffle_seed, + shuffle_block_size=shuffle_block_size, + sampling_method=sampling_method, + sampling_granularity=sampling_granularity, + batching_method=batching_method, + allow_unsafe_types=allow_unsafe_types) + # Global arguments (which do not live in Streams). self.nodes = nodes self.devices = devices