Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changing logging levels #222

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
9 changes: 9 additions & 0 deletions dlio_benchmark/common/enumerations.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ class StorageType(Enum):

def __str__(self):
return self.value
class LogLevel(Enum):
    """Supported logging verbosity levels for the benchmark.

    The string values are what users put in the workflow YAML
    (e.g. ``log_level: warn``); ``LogLevel("warn")`` looks the
    member up by that value.
    """

    DEBUG = "debug"
    INFO = "info"
    WARN = "warn"

    def __str__(self):
        # Render as the bare level name, e.g. "info".
        return self.value

class MetadataType(Enum):
"""
Expand Down
2 changes: 0 additions & 2 deletions dlio_benchmark/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,6 @@ def initialize(self):
- Start profiling session for Darshan and Tensorboard.
"""
self.comm.barrier()
if self.args.debug and self.args.my_rank == 0:
input("Debug mode: Press enter to start\n")

if self.args.generate_data:
if self.args.my_rank == 0:
Expand Down
15 changes: 10 additions & 5 deletions dlio_benchmark/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

from dlio_benchmark.common.constants import MODULE_CONFIG
from dlio_benchmark.common.enumerations import StorageType, FormatType, Shuffle, ReadType, FileAccess, Compression, \
FrameworkType, \
FrameworkType, LogLevel, \
DataLoaderType, Profiler, DatasetType, DataLoaderSampler, CheckpointLocationType, CheckpointMechanismType
from dlio_benchmark.utils.utility import DLIOMPI, get_trace_name, utcnow
from dataclasses import dataclass
Expand Down Expand Up @@ -91,7 +91,7 @@ class ConfigArguments:
chunk_size: int = 0
compression: Compression = Compression.NONE
compression_level: int = 4
debug: bool = False
log_level: LogLevel = LogLevel.INFO
total_training_steps: int = -1
do_eval: bool = False
batch_size_eval: int = 1
Expand Down Expand Up @@ -167,7 +167,12 @@ def configure_dlio_logging(self, is_child=False):
if is_child and self.multiprocessing_context == "fork":
return
# Configure the logging library
log_level = logging.DEBUG if self.debug else logging.INFO
if self.log_level == LogLevel.DEBUG:
log_level = logging.DEBUG
elif self.log_level == LogLevel.WARN:
log_level = logging.WARN
else:
log_level = logging.INFO
logging.basicConfig(
level=log_level,
force=True,
Expand Down Expand Up @@ -558,8 +563,8 @@ def LoadConfig(args, config):
args.generate_only = True
else:
args.generate_only = False
if 'debug' in config['workflow']:
args.debug = config['workflow']['debug']
if 'log_level' in config['workflow']:
args.log_level = LogLevel(config['workflow']['log_level'])
if 'evaluation' in config['workflow']:
args.do_eval = config['workflow']['evaluation']
if 'checkpoint' in config['workflow']:
Expand Down
4 changes: 2 additions & 2 deletions dlio_benchmark/utils/statscounter.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def batch_processed(self, epoch, step, block, t0, computation_time):
else:
self.output[epoch]['proc'] = [duration]
self.output[epoch]['compute']=[computation_time]
logging.info(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size} samples in {duration} s")
logging.debug(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size} samples in {duration} s")

def compute_metrics_train(self, epoch, block):
key = f"block{block}"
Expand Down Expand Up @@ -358,7 +358,7 @@ def eval_batch_processed(self, epoch, step, t0, computation_time):
duration = time() - t0
self.output[epoch]['proc']['eval'].append(duration)
self.output[epoch]['compute']['eval'].append(computation_time)
logging.info(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size_eval} samples in {duration} s")
logging.debug(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size_eval} samples in {duration} s")
def finalize(self):
self.summary['end'] = utcnow()
def save_data(self):
Expand Down
24 changes: 12 additions & 12 deletions dlio_benchmark/utils/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,38 +37,38 @@
from dftracer.logger import dftracer as PerfTrace, dft_fn as Profile, DFTRACER_ENABLE as DFTRACER_ENABLE
except:
class Profile(object):
    """No-op stand-in for dftracer's ``dft_fn`` Profile when dftracer
    is not installed.

    Mirrors the real Profile API so call sites can decorate and trace
    unconditionally; every operation here does nothing. Decorator-style
    methods (``log``, ``log_init``, ``iter``, ``log_static``) must hand
    the wrapped function back unchanged.
    """

    def __init__(self, cat, name=None, epoch=None, step=None, image_idx=None, image_size=None):
        # Accept and ignore the same arguments the real tracer takes.
        return

    def log(self, func):
        # Decorator stub: return the function unchanged.
        return func

    def log_init(self, func):
        return func

    def iter(self, func, iter_name="step"):
        return func

    def __enter__(self):
        return

    def __exit__(self, exc_type, exc_value, exc_traceback):
        return

    def update(self, epoch=None, step=None, image_idx=None, image_size=None, args=None):
        # ``args`` defaults to None (not a mutable {}) per Python best practice;
        # it is ignored here anyway.
        return

    def flush(self):
        return

    def reset(self):
        return

    def log_static(self, func):
        # Bug fix: a decorator stub must return the wrapped function.
        # Returning None would silently replace any function decorated
        # with the fallback ``log_static`` by None.
        return func
class dftracer(object):
def __init__(self,):
self.type = None
def initialize_log(self, **kwargs):
def initialize_log(self, logfile=None, data_dir=None, process_id=-1):
return
def get_time(self):
return
def enter_event(self):
return
def exit_event(self):
return
def log_event(self, **kwargs):
def log_event(self, name, cat, start_time, duration, string_args=None):
hariharan-devarajan marked this conversation as resolved.
Show resolved Hide resolved
return
def finalize(self):
return
Expand Down
Loading