From 22ac1f31b0fd73c07f588979d78e70c336e2a355 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:02:00 +0000
Subject: [PATCH 1/9] changed per step log to debug

---
 dlio_benchmark/utils/statscounter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/utils/statscounter.py b/dlio_benchmark/utils/statscounter.py
index 5d2c8823..86795936 100644
--- a/dlio_benchmark/utils/statscounter.py
+++ b/dlio_benchmark/utils/statscounter.py
@@ -322,7 +322,7 @@ def batch_processed(self, epoch, step, block, t0, computation_time):
         else:
             self.output[epoch]['proc'] = [duration]
             self.output[epoch]['compute']=[computation_time]
-        logging.info(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size} samples in {duration} s")
+        logging.debug(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size} samples in {duration} s")
 
     def compute_metrics_train(self, epoch, block):
         key = f"block{block}"
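The practical effect of patch 1 comes from the standard library's level filtering: once per-step messages are emitted at DEBUG, they disappear under the default INFO configuration without touching any call site. A minimal standalone sketch (not DLIO code; the messages are invented for illustration):

    import logging

    # the default DLIO setup logs at INFO
    logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')

    logging.debug("step 12 processed 4 samples in 0.031 s")  # filtered out at INFO
    logging.info("epoch 1 finished in 12.4 s")               # still printed

Reconfiguring with level=logging.DEBUG brings the per-step lines back, which is exactly the switch the following patch exposes in the workload config.
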
From 3221d9f063b2ad36467bf232310834186f3ca672 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:12:40 +0000
Subject: [PATCH 2/9] added configuration for setting log level,
 workload.workflow.log_level

---
 dlio_benchmark/common/enumerations.py |  9 +++++++++
 dlio_benchmark/utils/config.py        | 15 ++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py
index 2101d61d..2dc30802 100644
--- a/dlio_benchmark/common/enumerations.py
+++ b/dlio_benchmark/common/enumerations.py
@@ -50,6 +50,15 @@ class StorageType(Enum):
     def __str__(self):
         return self.value
 
+class LogLevel(Enum):
+    """
+    Different levels of logging
+    """
+    DEBUG = "debug"
+    INFO = "info"
+    WARN = "warn"
+    def __str__(self):
+        return self.value
 
 class MetadataType(Enum):
     """
diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py
index a0264553..b0908d22 100644
--- a/dlio_benchmark/utils/config.py
+++ b/dlio_benchmark/utils/config.py
@@ -26,7 +26,7 @@
 from dlio_benchmark.common.constants import MODULE_CONFIG
 from dlio_benchmark.common.enumerations import StorageType, FormatType, Shuffle, ReadType, FileAccess, Compression, \
-    FrameworkType, \
+    FrameworkType, LogLevel, \
     DataLoaderType, Profiler, DatasetType, DataLoaderSampler, CheckpointLocationType, CheckpointMechanismType
 from dlio_benchmark.utils.utility import DLIOMPI, get_trace_name, utcnow
 from dataclasses import dataclass
@@ -91,7 +91,7 @@ class ConfigArguments:
     chunk_size: int = 0
     compression: Compression = Compression.NONE
     compression_level: int = 4
-    debug: bool = False
+    log_level: LogLevel = LogLevel.INFO
     total_training_steps: int = -1
     do_eval: bool = False
     batch_size_eval: int = 1
@@ -167,7 +167,12 @@ def configure_dlio_logging(self, is_child=False):
         if is_child and self.multiprocessing_context == "fork":
             return
         # Configure the logging library
-        log_level = logging.DEBUG if self.debug else logging.INFO
+        if self.log_level == LogLevel.DEBUG:
+            log_level = logging.DEBUG
+        elif self.log_level == LogLevel.WARN
+            log_level = logging.WARN
+        else:
+            log_level = logging.INFO
         logging.basicConfig(
             level=log_level,
             force=True,
@@ -569,8 +574,8 @@ def LoadConfig(args, config):
             args.generate_only = True
         else:
             args.generate_only = False
-        if 'debug' in config['workflow']:
-            args.debug = config['workflow']['debug']
+        if 'log_level' in config['workflow']:
+            args.log_level = LogLevel(config['workflow']['log_level'])
         if 'evaluation' in config['workflow']:
             args.do_eval = config['workflow']['evaluation']
         if 'checkpoint' in config['workflow']:
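Because LogLevel is a value-backed Enum, the string read from the YAML config converts through the enum constructor, and an invalid value fails loudly instead of being silently ignored. A sketch of the round trip, assuming the member set from this patch:

    from enum import Enum

    class LogLevel(Enum):
        DEBUG = "debug"
        INFO = "info"
        WARN = "warn"
        def __str__(self):
            return self.value

    assert LogLevel("warn") is LogLevel.WARN   # value lookup, as LoadConfig does
    assert str(LogLevel.DEBUG) == "debug"      # round-trips back to the YAML spelling

    try:
        LogLevel("verbose")                    # not a member
    except ValueError as e:
        print(e)                               # "'verbose' is not a valid LogLevel"

Note that this patch intentionally appears here with the missing colon after ``LogLevel.WARN``; the very next commit in the series fixes it.
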
From 08d8fa30dbfc4fff68a0934f30dbaeb3618d2ca9 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:16:42 +0000
Subject: [PATCH 3/9] fixed bug

---
 dlio_benchmark/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py
index 4ba4dc0d..e31e61a7 100644
--- a/dlio_benchmark/utils/config.py
+++ b/dlio_benchmark/utils/config.py
@@ -169,7 +169,7 @@ def configure_dlio_logging(self, is_child=False):
         # Configure the logging library
         if self.log_level == LogLevel.DEBUG:
             log_level = logging.DEBUG
-        elif self.log_level == LogLevel.WARN
+        elif self.log_level == LogLevel.WARN:
             log_level = logging.WARN
         else:
             log_level = logging.INFO

From 573555c45987cc0bd252edb14f657d04aa72dbac Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:47:14 +0000
Subject: [PATCH 4/9] fixed bugs related to mocking

---
 dlio_benchmark/main.py               |  2 --
 dlio_benchmark/utils/statscounter.py |  2 +-
 dlio_benchmark/utils/utility.py      | 18 +++++++++---------
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py
index dd91fa0f..8e9bed1f 100644
--- a/dlio_benchmark/main.py
+++ b/dlio_benchmark/main.py
@@ -146,8 +146,6 @@ def initialize(self):
         - Start profiling session for Darshan and Tensorboard.
         """
         self.comm.barrier()
-        if self.args.debug and self.args.my_rank == 0:
-            input("Debug mode: Press enter to start\n")
 
         if self.args.generate_data:
             if self.args.my_rank == 0:
diff --git a/dlio_benchmark/utils/statscounter.py b/dlio_benchmark/utils/statscounter.py
index 86795936..4bb47868 100644
--- a/dlio_benchmark/utils/statscounter.py
+++ b/dlio_benchmark/utils/statscounter.py
@@ -358,7 +358,7 @@ def eval_batch_processed(self, epoch, step, t0, computation_time):
         duration = time() - t0
         self.output[epoch]['proc']['eval'].append(duration)
         self.output[epoch]['compute']['eval'].append(computation_time)
-        logging.info(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size_eval} samples in {duration} s")
+        logging.debug(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size_eval} samples in {duration} s")
     def finalize(self):
         self.summary['end'] = utcnow()
     def save_data(self):
diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py
index 7923fded..db0b9fe8 100644
--- a/dlio_benchmark/utils/utility.py
+++ b/dlio_benchmark/utils/utility.py
@@ -37,30 +37,30 @@
     from dftracer.logger import dftracer as PerfTrace, dft_fn as Profile, DFTRACER_ENABLE as DFTRACER_ENABLE
 except:
     class Profile(object):
-        def __init__(self, **kwargs):
+        def __init__(self, cat, name=None, epoch=None, step=None, image_idx=None, image_size=None):
             return
-        def log(self, **kwargs):
+        def log(self, func):
             return
-        def log_init(self, **kwargs):
+        def log_init(self, func):
             return
-        def iter(self, **kwargs):
+        def iter(self, func, iter_name="step"):
             return
         def __enter__(self):
             return
-        def __exit__(self, **kwargs):
+        def __exit__(self, type, value, traceback):
             return
-        def update(self, **kwargs):
+        def update(self, epoch=None, step=None, image_idx=None, image_size=None, args={}):
             return
         def flush(self):
             return
         def reset(self):
             return
-        def log_static(self, **kwargs):
+        def log_static(self, func):
             return
     class dftracer(object):
         def __init__(self,):
             self.type = None
-        def initialize_log(self, **kwargs):
+        def initialize_log(self, logfile=None, data_dir=None, process_id=-1):
             return
         def get_time(self):
             return
         def enter_event(self):
             return
         def exit_event(self):
             return
-        def log_event(self, **kwargs):
+        def log_event(self, name, cat, start_time, duration, string_args=None):
             return
         def finalize(self):
             return
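The commit message only says "fixed bugs related to mocking"; the failure mode implied by the new signatures is that DLIO invokes the real dftracer API positionally (and via decorators), which a ``**kwargs``-only stub rejects. A hypothetical reproduction, with class names invented for illustration:

    class OldProfile:                  # the previous fallback stub
        def __init__(self, **kwargs):
            pass

    class NewProfile:                  # patch 4's fallback stub
        def __init__(self, cat, name=None, epoch=None, step=None,
                     image_idx=None, image_size=None):
            pass

    try:
        OldProfile("IO", name="reader")   # TypeError: takes 1 positional argument
    except TypeError as e:
        print(e)

    NewProfile("IO", name="reader")       # accepted, mirroring the real signature

Matching the real API's parameter names also lets existing keyword call sites work unchanged when dftracer is not installed.
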
From 2cae8161164392ac7b6ea0ed4f2328e7f8a5b51b Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:53:21 +0000
Subject: [PATCH 5/9] fixed mocking

---
 dlio_benchmark/utils/utility.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py
index db0b9fe8..287d80f0 100644
--- a/dlio_benchmark/utils/utility.py
+++ b/dlio_benchmark/utils/utility.py
@@ -40,11 +40,11 @@ class Profile(object):
         def __init__(self, cat, name=None, epoch=None, step=None, image_idx=None, image_size=None):
             return
         def log(self, func):
-            return
+            return func
         def log_init(self, func):
-            return
+            return func
         def iter(self, func, iter_name="step"):
-            return
+            return func
         def __enter__(self):
             return
         def __exit__(self, type, value, traceback):
             return
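Returning ``func`` matters because ``log``, ``log_init`` and (after patch 8) ``log_static`` are used as decorators, and ``iter`` wraps an iterable: a no-op that returns None replaces the decorated function, or the loop's iterable, with None. A sketch of the before/after behaviour of the stub:

    class Stub:
        def log_none(self, func):
            return            # old stub: decorated function becomes None

        def log_identity(self, func):
            return func       # fixed stub: function passes through unchanged

    stub = Stub()

    @stub.log_identity
    def load_batch():
        return "ok"

    assert load_batch() == "ok"

    @stub.log_none
    def read_sample():
        return "ok"

    assert read_sample is None    # the crash mode the old stub produced
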
From c179a56c9a5a12832e12433a03870fd03db744a7 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:59:49 +0000
Subject: [PATCH 6/9] added doc changes

---
 docs/source/config.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/source/config.rst b/docs/source/config.rst
index 79ee1717..0e7ac8d6 100644
--- a/docs/source/config.rst
+++ b/docs/source/config.rst
@@ -100,6 +100,9 @@ workflow
    * - profiling
      - False
      - whether to perform profiling
+   * - log_level
+     - "info"
+     - select the logging level [warn|info|debug]
 
 .. note::
 
@@ -107,6 +110,10 @@ workflow
    Even though ``generate_data`` and ``train`` can be performed together in one job, we suggest performing them separately to eliminate potential caching effects. One can generate the data first by running DLIO with ```generate_data=True``` and ```train=False```, and then run the training benchmark with ```generate_data=False``` and ```train=True```.
 
+.. note::
+
+   ``log_level=debug`` will output detailed logging info per step, whereas ``log_level=info`` only outputs logs at the end of each epoch.
+
 dataset
 ------------------

From 140d86cc1fd1c88e10d5d1d50dbd73d50db61fed Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 3 Sep 2024 19:06:20 +0000
Subject: [PATCH 7/9] added error mode; and changed the metric output to print
 to STDOUT

---
 dlio_benchmark/common/enumerations.py |  6 +++---
 dlio_benchmark/utils/config.py        | 10 ++++++++--
 dlio_benchmark/utils/statscounter.py  |  6 +++---
 docs/source/config.rst                |  3 ++-
 4 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py
index 2dc30802..46ffa54c 100644
--- a/dlio_benchmark/common/enumerations.py
+++ b/dlio_benchmark/common/enumerations.py
@@ -54,9 +54,10 @@ class LogLevel(Enum):
     """
     Different levels of logging
     """
-    DEBUG = "debug"
-    INFO = "info"
+    ERROR = "error"
     WARN = "warn"
+    INFO = "info"
+    DEBUG = "debug"
     def __str__(self):
         return self.value
@@ -116,7 +117,6 @@ class ComputationType(Enum):
     SYNC = 'sync'
     ASYNC = 'async'
-
 class FormatType(Enum):
     """
     Format Type supported by the benchmark.
diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py
index e31e61a7..cbce06b3 100644
--- a/dlio_benchmark/utils/config.py
+++ b/dlio_benchmark/utils/config.py
@@ -167,10 +167,17 @@ def configure_dlio_logging(self, is_child=False):
         if is_child and self.multiprocessing_context == "fork":
             return
         # Configure the logging library
+        log_format_verbose = '[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
+        log_format_simple = '[%(levelname)s] %(message)s'
+        # Use the simple log format unless log_level is debug
+        log_format = log_format_simple
         if self.log_level == LogLevel.DEBUG:
             log_level = logging.DEBUG
+            log_format = log_format_verbose
         elif self.log_level == LogLevel.WARN:
             log_level = logging.WARN
+        elif self.log_level == LogLevel.ERROR:
+            log_level = logging.ERROR
         else:
             log_level = logging.INFO
         logging.basicConfig(
@@ -180,10 +187,9 @@
             level=log_level,
             force=True,
             handlers=[
                 logging.FileHandler(self.logfile_path, mode="a", encoding='utf-8'),
                 logging.StreamHandler()
             ],
-            format='[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
+            format = log_format
             # logging's max timestamp resolution is msecs, we will pass in usecs in the message
         )
-
 def configure_dftracer(self, is_child=False, use_pid=False):
     # with "multiprocessing_context=fork" the profiler file remains open in the child process
     if is_child and self.multiprocessing_context == "fork":
         return
diff --git a/dlio_benchmark/utils/statscounter.py b/dlio_benchmark/utils/statscounter.py
index 4bb47868..29da48d1 100644
--- a/dlio_benchmark/utils/statscounter.py
+++ b/dlio_benchmark/utils/statscounter.py
@@ -182,7 +182,7 @@ def end_run(self):
             metric = metric + f"[METRIC] Eval Throughput (MB/second): {np.mean(eval_throughput)*self.record_size/1024/1024:.6f} ({np.std(eval_throughput)*self.record_size/1024/1024:.6f})\n"
             metric = metric + f"[METRIC] eval_au_meet_expectation: {self.summary['metric']['eval_au_meet_expectation']}\n"
             metric+="[METRIC] ==========================================================\n"
-            logging.info(metric)
+            print(metric)
     def start_train(self, epoch):
         if self.my_rank == 0:
             ts = utcnow()
@@ -282,8 +282,8 @@ def end_block(self, epoch, block, steps_taken):
             logging.info(f"{ts} Ending block {block} - {steps_taken} steps completed in {duration} s")
         self.per_epoch_stats[epoch][f'block{block}']['end'] = ts
         self.per_epoch_stats[epoch][f'block{block}']['duration'] = duration
-        logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Accelerator Utilization [AU] (%): {self.output[epoch]['au'][f'block{block}']:.4f}")
-        logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Throughput (samples/second): {self.output[epoch]['throughput'][f'block{block}']*self.comm_size:.4f}")
+        print(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Accelerator Utilization [AU] (%): {self.output[epoch]['au'][f'block{block}']:.4f}")
+        print(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Throughput (samples/second): {self.output[epoch]['throughput'][f'block{block}']*self.comm_size:.4f}")
 
     def start_ckpt(self, epoch, block, steps_taken):
         if self.my_rank == 0:
diff --git a/docs/source/config.rst b/docs/source/config.rst
index 0e7ac8d6..b16f6ed9 100644
--- a/docs/source/config.rst
+++ b/docs/source/config.rst
@@ -102,7 +102,7 @@ workflow
      - whether to perform profiling
    * - log_level
      - "info"
-     - select the logging level [warn|info|debug]
+     - select the logging level [error|warn|info|debug]
@@ -113,6 +113,7 @@ workflow
 .. note::
 
    ``log_level=debug`` will output detailed logging info per step, whereas ``log_level=info`` only outputs logs at the end of each epoch.
+   For performance runs, we suggest using the error level to suppress unnecessary logs.
 
 dataset
 ------------------

From b9bc737de11681a1b734302945861e72a469e9ea Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 3 Sep 2024 16:06:58 -0500
Subject: [PATCH 8/9] Update utility.py

---
 dlio_benchmark/utils/utility.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py
index 287d80f0..bbc2de87 100644
--- a/dlio_benchmark/utils/utility.py
+++ b/dlio_benchmark/utils/utility.py
@@ -56,7 +56,7 @@ def flush(self):
         def reset(self):
             return
         def log_static(self, func):
-            return
+            return func
     class dftracer(object):
         def __init__(self,):
             self.type = None
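The format switch in patch 7 keeps routine output compact while debug runs keep the file-and-line suffix for locating a message's call site. A condensed, standalone sketch of the same pattern (the helper function and its argument are illustrative, not DLIO's API):

    import logging

    LOG_FORMAT_VERBOSE = '[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
    LOG_FORMAT_SIMPLE = '[%(levelname)s] %(message)s'

    def configure_logging(level_name: str = "info") -> None:
        # verbose format only when per-step debug output is requested
        log_format = LOG_FORMAT_VERBOSE if level_name == "debug" else LOG_FORMAT_SIMPLE
        log_level = getattr(logging, level_name.upper(), logging.INFO)
        logging.basicConfig(level=log_level, force=True, format=log_format)

    configure_logging("debug")
    logging.debug("visible, with pathname and line number appended")

Routing the [METRIC] summary through ``print`` rather than ``logging.info`` means the benchmark's headline numbers still reach STDOUT even when the configured level is warn or error.
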
metric+="[METRIC] ==========================================================\n" - print(metric) + logging.info(metric) def start_train(self, epoch): if self.my_rank == 0: ts = utcnow() @@ -282,8 +282,8 @@ def end_block(self, epoch, block, steps_taken): logging.info(f"{ts} Ending block {block} - {steps_taken} steps completed in {duration} s") self.per_epoch_stats[epoch][f'block{block}']['end'] = ts self.per_epoch_stats[epoch][f'block{block}']['duration'] = duration - print(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Accelerator Utilization [AU] (%): {self.output[epoch]['au'][f'block{block}']:.4f}") - print(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Throughput (samples/second): {self.output[epoch]['throughput'][f'block{block}']*self.comm_size:.4f}") + logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Accelerator Utilization [AU] (%): {self.output[epoch]['au'][f'block{block}']:.4f}") + logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Throughput (samples/second): {self.output[epoch]['throughput'][f'block{block}']*self.comm_size:.4f}") def start_ckpt(self, epoch, block, steps_taken): if self.my_rank == 0: diff --git a/docs/source/config.rst b/docs/source/config.rst index b16f6ed9..0a06eb06 100644 --- a/docs/source/config.rst +++ b/docs/source/config.rst @@ -100,9 +100,6 @@ workflow * - profiling - False - whether to perform profiling - * - log_level - - "info" - - select the logging level [error|warn|info|debug|] .. note:: @@ -110,11 +107,6 @@ workflow Even though ``generate_data`` and ``train`` can be performed together in one job, we suggest to perform them seperately to eliminate potential caching effect. One can generate the data first by running DLIO with ```generate_data=True``` and ```train=False```, and then run training benchmark with ```generate_data=False``` and ```train=True```. -.. note:: - - ``log_level=debug`` will output detailed logging info per steps; whereas ``log_level=info`` only output log at the end of each epoch. - For performance mode, we would suggest using error mode to suppress unnecessory logs. - dataset ------------------ .. list-table:: @@ -365,11 +357,19 @@ output * - log_file - dlio.log - log file name + * - log_level + - "info" + - select the logging level [error|warning|info|debug] .. note:: If ``folder`` is not set (None), the output folder will be ```hydra_log/unet3d/$DATE-$TIME```. +.. note:: + + ``log_level=debug`` will output detailed logging info per steps; whereas ``log_level=info`` only output log at the end of each epoch. + For performance mode, we would suggest using error mode to suppress unnecessory logs. + profiling ------------------ .. list-table::