From 22ac1f31b0fd73c07f588979d78e70c336e2a355 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:02:00 +0000
Subject: [PATCH 1/9] changed per step log to debug

---
 dlio_benchmark/utils/statscounter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/utils/statscounter.py b/dlio_benchmark/utils/statscounter.py
index 5d2c8823..86795936 100644
--- a/dlio_benchmark/utils/statscounter.py
+++ b/dlio_benchmark/utils/statscounter.py
@@ -322,7 +322,7 @@ def batch_processed(self, epoch, step, block, t0, computation_time):
         else:
             self.output[epoch]['proc'] = [duration]
             self.output[epoch]['compute']=[computation_time]
-        logging.info(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size} samples in {duration} s")
+        logging.debug(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size} samples in {duration} s")
 
     def compute_metrics_train(self, epoch, block):
         key = f"block{block}"
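The practical effect of patch 1 comes from the standard library's level filtering: once per-step messages are emitted at DEBUG, they disappear under the default INFO configuration without touching any call site. A minimal standalone sketch (not DLIO code; the messages are invented for illustration):

    import logging

    # the default DLIO setup logs at INFO
    logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')

    logging.debug("step 12 processed 4 samples in 0.031 s")  # filtered out at INFO
    logging.info("epoch 1 finished in 12.4 s")               # still printed

Reconfiguring with level=logging.DEBUG brings the per-step lines back, which is exactly the switch the following patch exposes in the workload config.
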
From 3221d9f063b2ad36467bf232310834186f3ca672 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:12:40 +0000
Subject: [PATCH 2/9] added configuration for setting log level,
 workload.workflow.log_level

---
 dlio_benchmark/common/enumerations.py |  9 +++++++++
 dlio_benchmark/utils/config.py        | 15 ++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py
index 2101d61d..2dc30802 100644
--- a/dlio_benchmark/common/enumerations.py
+++ b/dlio_benchmark/common/enumerations.py
@@ -50,6 +50,15 @@ class StorageType(Enum):
     def __str__(self):
         return self.value
 
+class LogLevel(Enum):
+    """
+    Different levels of logging
+    """
+    DEBUG = "debug"
+    INFO = "info"
+    WARN = "warn"
+    def __str__(self):
+        return self.value
 
 class MetadataType(Enum):
     """
diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py
index a0264553..b0908d22 100644
--- a/dlio_benchmark/utils/config.py
+++ b/dlio_benchmark/utils/config.py
@@ -26,7 +26,7 @@
 from dlio_benchmark.common.constants import MODULE_CONFIG
 from dlio_benchmark.common.enumerations import StorageType, FormatType, Shuffle, ReadType, FileAccess, Compression, \
-    FrameworkType, \
+    FrameworkType, LogLevel, \
     DataLoaderType, Profiler, DatasetType, DataLoaderSampler, CheckpointLocationType, CheckpointMechanismType
 from dlio_benchmark.utils.utility import DLIOMPI, get_trace_name, utcnow
 from dataclasses import dataclass
@@ -91,7 +91,7 @@ class ConfigArguments:
     chunk_size: int = 0
     compression: Compression = Compression.NONE
     compression_level: int = 4
-    debug: bool = False
+    log_level: LogLevel = LogLevel.INFO
     total_training_steps: int = -1
     do_eval: bool = False
     batch_size_eval: int = 1
@@ -167,7 +167,12 @@ def configure_dlio_logging(self, is_child=False):
         if is_child and self.multiprocessing_context == "fork":
             return
         # Configure the logging library
-        log_level = logging.DEBUG if self.debug else logging.INFO
+        if self.log_level == LogLevel.DEBUG:
+            log_level = logging.DEBUG
+        elif self.log_level == LogLevel.WARN
+            log_level = logging.WARN
+        else:
+            log_level = logging.INFO
         logging.basicConfig(
             level=log_level,
             force=True,
@@ -569,8 +574,8 @@ def LoadConfig(args, config):
             args.generate_only = True
         else:
             args.generate_only = False
-        if 'debug' in config['workflow']:
-            args.debug = config['workflow']['debug']
+        if 'log_level' in config['workflow']:
+            args.log_level = LogLevel(config['workflow']['log_level'])
         if 'evaluation' in config['workflow']:
             args.do_eval = config['workflow']['evaluation']
         if 'checkpoint' in config['workflow']:
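Because LogLevel is a value-backed Enum, the string read from the YAML config converts through the enum constructor, and an invalid value fails loudly instead of being silently ignored. A sketch of the round trip, assuming the member set from this patch:

    from enum import Enum

    class LogLevel(Enum):
        DEBUG = "debug"
        INFO = "info"
        WARN = "warn"
        def __str__(self):
            return self.value

    assert LogLevel("warn") is LogLevel.WARN   # value lookup, as LoadConfig does
    assert str(LogLevel.DEBUG) == "debug"      # round-trips back to the YAML spelling

    try:
        LogLevel("verbose")                    # not a member
    except ValueError as e:
        print(e)                               # "'verbose' is not a valid LogLevel"

Note that this patch intentionally appears here with the missing colon after ``LogLevel.WARN``; the very next commit in the series fixes it.
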
From 08d8fa30dbfc4fff68a0934f30dbaeb3618d2ca9 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:16:42 +0000
Subject: [PATCH 3/9] fixed bug

---
 dlio_benchmark/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py
index 4ba4dc0d..e31e61a7 100644
--- a/dlio_benchmark/utils/config.py
+++ b/dlio_benchmark/utils/config.py
@@ -169,7 +169,7 @@ def configure_dlio_logging(self, is_child=False):
         # Configure the logging library
         if self.log_level == LogLevel.DEBUG:
             log_level = logging.DEBUG
-        elif self.log_level == LogLevel.WARN
+        elif self.log_level == LogLevel.WARN:
             log_level = logging.WARN
         else:
             log_level = logging.INFO

From 573555c45987cc0bd252edb14f657d04aa72dbac Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:47:14 +0000
Subject: [PATCH 4/9] fixed bugs related to mocking

---
 dlio_benchmark/main.py               |  2 --
 dlio_benchmark/utils/statscounter.py |  2 +-
 dlio_benchmark/utils/utility.py      | 18 +++++++++---------
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py
index dd91fa0f..8e9bed1f 100644
--- a/dlio_benchmark/main.py
+++ b/dlio_benchmark/main.py
@@ -146,8 +146,6 @@ def initialize(self):
         - Start profiling session for Darshan and Tensorboard.
         """
         self.comm.barrier()
-        if self.args.debug and self.args.my_rank == 0:
-            input("Debug mode: Press enter to start\n")
 
         if self.args.generate_data:
             if self.args.my_rank == 0:
diff --git a/dlio_benchmark/utils/statscounter.py b/dlio_benchmark/utils/statscounter.py
index 86795936..4bb47868 100644
--- a/dlio_benchmark/utils/statscounter.py
+++ b/dlio_benchmark/utils/statscounter.py
@@ -358,7 +358,7 @@ def eval_batch_processed(self, epoch, step, t0, computation_time):
         duration = time() - t0
         self.output[epoch]['proc']['eval'].append(duration)
         self.output[epoch]['compute']['eval'].append(computation_time)
-        logging.info(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size_eval} samples in {duration} s")
+        logging.debug(f"{utcnow()} Rank {self.my_rank} step {step} processed {self.batch_size_eval} samples in {duration} s")
     def finalize(self):
         self.summary['end'] = utcnow()
     def save_data(self):
diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py
index 7923fded..db0b9fe8 100644
--- a/dlio_benchmark/utils/utility.py
+++ b/dlio_benchmark/utils/utility.py
@@ -37,30 +37,30 @@
     from dftracer.logger import dftracer as PerfTrace, dft_fn as Profile, DFTRACER_ENABLE as DFTRACER_ENABLE
 except:
     class Profile(object):
-        def __init__(self, **kwargs):
+        def __init__(self, cat, name=None, epoch=None, step=None, image_idx=None, image_size=None):
             return
-        def log(self, **kwargs):
+        def log(self, func):
             return
-        def log_init(self, **kwargs):
+        def log_init(self, func):
             return
-        def iter(self, **kwargs):
+        def iter(self, func, iter_name="step"):
             return
         def __enter__(self):
             return
-        def __exit__(self, **kwargs):
+        def __exit__(self, type, value, traceback):
             return
-        def update(self, **kwargs):
+        def update(self, epoch=None, step=None, image_idx=None, image_size=None, args={}):
             return
         def flush(self):
             return
         def reset(self):
             return
-        def log_static(self, **kwargs):
+        def log_static(self, func):
             return
     class dftracer(object):
         def __init__(self,):
             self.type = None
-        def initialize_log(self, **kwargs):
+        def initialize_log(self, logfile=None, data_dir=None, process_id=-1):
             return
         def get_time(self):
             return
         def enter_event(self):
             return
         def exit_event(self):
             return
-        def log_event(self, **kwargs):
+        def log_event(self, name, cat, start_time, duration, string_args=None):
             return
         def finalize(self):
             return
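The commit message only says "fixed bugs related to mocking"; the failure mode implied by the new signatures is that DLIO invokes the real dftracer API positionally (and via decorators), which a ``**kwargs``-only stub rejects. A hypothetical reproduction, with class names invented for illustration:

    class OldProfile:                  # the previous fallback stub
        def __init__(self, **kwargs):
            pass

    class NewProfile:                  # patch 4's fallback stub
        def __init__(self, cat, name=None, epoch=None, step=None,
                     image_idx=None, image_size=None):
            pass

    try:
        OldProfile("IO", name="reader")   # TypeError: takes 1 positional argument
    except TypeError as e:
        print(e)

    NewProfile("IO", name="reader")       # accepted, mirroring the real signature

Matching the real API's parameter names also lets existing keyword call sites work unchanged when dftracer is not installed.
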
From 2cae8161164392ac7b6ea0ed4f2328e7f8a5b51b Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:53:21 +0000
Subject: [PATCH 5/9] fixed mocking

---
 dlio_benchmark/utils/utility.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py
index db0b9fe8..287d80f0 100644
--- a/dlio_benchmark/utils/utility.py
+++ b/dlio_benchmark/utils/utility.py
@@ -40,11 +40,11 @@ class Profile(object):
         def __init__(self, cat, name=None, epoch=None, step=None, image_idx=None, image_size=None):
             return
         def log(self, func):
-            return
+            return func
         def log_init(self, func):
-            return
+            return func
         def iter(self, func, iter_name="step"):
-            return
+            return func
         def __enter__(self):
             return
         def __exit__(self, type, value, traceback):
             return
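Returning ``func`` matters because ``log``, ``log_init`` and (after patch 8) ``log_static`` are used as decorators, and ``iter`` wraps an iterable: a no-op that returns None replaces the decorated function, or the loop's iterable, with None. A sketch of the before/after behaviour of the stub:

    class Stub:
        def log_none(self, func):
            return            # old stub: decorated function becomes None

        def log_identity(self, func):
            return func       # fixed stub: function passes through unchanged

    stub = Stub()

    @stub.log_identity
    def load_batch():
        return "ok"

    assert load_batch() == "ok"

    @stub.log_none
    def read_sample():
        return "ok"

    assert read_sample is None    # the crash mode the old stub produced
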
From c179a56c9a5a12832e12433a03870fd03db744a7 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 27 Aug 2024 22:59:49 +0000
Subject: [PATCH 6/9] added doc changes

---
 docs/source/config.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/source/config.rst b/docs/source/config.rst
index 79ee1717..0e7ac8d6 100644
--- a/docs/source/config.rst
+++ b/docs/source/config.rst
@@ -100,6 +100,9 @@ workflow
    * - profiling
      - False
      - whether to perform profiling
+   * - log_level
+     - "info"
+     - select the logging level [warn|info|debug]
 
 .. note::
 
@@ -107,6 +110,10 @@ workflow
    Even though ``generate_data`` and ``train`` can be performed together in one job, we suggest performing them separately to eliminate potential caching effects. One can generate the data first by running DLIO with ```generate_data=True``` and ```train=False```, and then run the training benchmark with ```generate_data=False``` and ```train=True```.
 
+.. note::
+
+   ``log_level=debug`` will output detailed logging info per step, whereas ``log_level=info`` only outputs logs at the end of each epoch.
+
 dataset
 ------------------

From 140d86cc1fd1c88e10d5d1d50dbd73d50db61fed Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 3 Sep 2024 19:06:20 +0000
Subject: [PATCH 7/9] added error mode; and changed the metric output to print
 to STDOUT

---
 dlio_benchmark/common/enumerations.py |  6 +++---
 dlio_benchmark/utils/config.py        | 10 ++++++++--
 dlio_benchmark/utils/statscounter.py  |  6 +++---
 docs/source/config.rst                |  3 ++-
 4 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py
index 2dc30802..46ffa54c 100644
--- a/dlio_benchmark/common/enumerations.py
+++ b/dlio_benchmark/common/enumerations.py
@@ -54,9 +54,10 @@ class LogLevel(Enum):
     """
     Different levels of logging
     """
-    DEBUG = "debug"
-    INFO = "info"
+    ERROR = "error"
     WARN = "warn"
+    INFO = "info"
+    DEBUG = "debug"
     def __str__(self):
         return self.value
@@ -116,7 +117,6 @@ class ComputationType(Enum):
     SYNC = 'sync'
     ASYNC = 'async'
-
 class FormatType(Enum):
     """
     Format Type supported by the benchmark.
diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py
index e31e61a7..cbce06b3 100644
--- a/dlio_benchmark/utils/config.py
+++ b/dlio_benchmark/utils/config.py
@@ -167,10 +167,17 @@ def configure_dlio_logging(self, is_child=False):
         if is_child and self.multiprocessing_context == "fork":
             return
         # Configure the logging library
+        log_format_verbose = '[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
+        log_format_simple = '[%(levelname)s] %(message)s'
+        # Use the simple log format unless log_level is debug
+        log_format = log_format_simple
         if self.log_level == LogLevel.DEBUG:
             log_level = logging.DEBUG
+            log_format = log_format_verbose
         elif self.log_level == LogLevel.WARN:
             log_level = logging.WARN
+        elif self.log_level == LogLevel.ERROR:
+            log_level = logging.ERROR
         else:
             log_level = logging.INFO
         logging.basicConfig(
@@ -180,10 +187,9 @@
             level=log_level,
             force=True,
             handlers=[
                 logging.FileHandler(self.logfile_path, mode="a", encoding='utf-8'),
                 logging.StreamHandler()
             ],
-            format='[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
+            format = log_format
             # logging's max timestamp resolution is msecs, we will pass in usecs in the message
         )
-
 def configure_dftracer(self, is_child=False, use_pid=False):
     # with "multiprocessing_context=fork" the profiler file remains open in the child process
     if is_child and self.multiprocessing_context == "fork":
         return
diff --git a/dlio_benchmark/utils/statscounter.py b/dlio_benchmark/utils/statscounter.py
index 4bb47868..29da48d1 100644
--- a/dlio_benchmark/utils/statscounter.py
+++ b/dlio_benchmark/utils/statscounter.py
@@ -182,7 +182,7 @@ def end_run(self):
             metric = metric + f"[METRIC] Eval Throughput (MB/second): {np.mean(eval_throughput)*self.record_size/1024/1024:.6f} ({np.std(eval_throughput)*self.record_size/1024/1024:.6f})\n"
             metric = metric + f"[METRIC] eval_au_meet_expectation: {self.summary['metric']['eval_au_meet_expectation']}\n"
             metric+="[METRIC] ==========================================================\n"
-            logging.info(metric)
+            print(metric)
     def start_train(self, epoch):
         if self.my_rank == 0:
             ts = utcnow()
@@ -282,8 +282,8 @@ def end_block(self, epoch, block, steps_taken):
             logging.info(f"{ts} Ending block {block} - {steps_taken} steps completed in {duration} s")
         self.per_epoch_stats[epoch][f'block{block}']['end'] = ts
         self.per_epoch_stats[epoch][f'block{block}']['duration'] = duration
-        logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Accelerator Utilization [AU] (%): {self.output[epoch]['au'][f'block{block}']:.4f}")
-        logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Throughput (samples/second): {self.output[epoch]['throughput'][f'block{block}']*self.comm_size:.4f}")
+        print(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Accelerator Utilization [AU] (%): {self.output[epoch]['au'][f'block{block}']:.4f}")
+        print(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Throughput (samples/second): {self.output[epoch]['throughput'][f'block{block}']*self.comm_size:.4f}")
 
     def start_ckpt(self, epoch, block, steps_taken):
         if self.my_rank == 0:
diff --git a/docs/source/config.rst b/docs/source/config.rst
index 0e7ac8d6..b16f6ed9 100644
--- a/docs/source/config.rst
+++ b/docs/source/config.rst
@@ -102,7 +102,7 @@ workflow
      - whether to perform profiling
    * - log_level
      - "info"
-     - select the logging level [warn|info|debug]
+     - select the logging level [error|warn|info|debug]
@@ -113,6 +113,7 @@ workflow
 .. note::
 
    ``log_level=debug`` will output detailed logging info per step, whereas ``log_level=info`` only outputs logs at the end of each epoch.
+   For performance runs, we suggest using the error level to suppress unnecessary logs.
 
 dataset
 ------------------

From b9bc737de11681a1b734302945861e72a469e9ea Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Tue, 3 Sep 2024 16:06:58 -0500
Subject: [PATCH 8/9] Update utility.py

---
 dlio_benchmark/utils/utility.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py
index 287d80f0..bbc2de87 100644
--- a/dlio_benchmark/utils/utility.py
+++ b/dlio_benchmark/utils/utility.py
@@ -56,7 +56,7 @@ def flush(self):
         def reset(self):
             return
         def log_static(self, func):
-            return
+            return func
     class dftracer(object):
         def __init__(self,):
             self.type = None
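The format switch in patch 7 keeps routine output compact while debug runs keep the file-and-line suffix for locating a message's call site. A condensed, standalone sketch of the same pattern (the helper function and its argument are illustrative, not DLIO's API):

    import logging

    LOG_FORMAT_VERBOSE = '[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
    LOG_FORMAT_SIMPLE = '[%(levelname)s] %(message)s'

    def configure_logging(level_name: str = "info") -> None:
        # verbose format only when per-step debug output is requested
        log_format = LOG_FORMAT_VERBOSE if level_name == "debug" else LOG_FORMAT_SIMPLE
        log_level = getattr(logging, level_name.upper(), logging.INFO)
        logging.basicConfig(level=log_level, force=True, format=log_format)

    configure_logging("debug")
    logging.debug("visible, with pathname and line number appended")

Routing the [METRIC] summary through ``print`` rather than ``logging.info`` means the benchmark's headline numbers still reach STDOUT even when the configured level is warn or error.
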
metric+="[METRIC] ==========================================================\n" - print(metric) + logging.info(metric) def start_train(self, epoch): if self.my_rank == 0: ts = utcnow() @@ -282,8 +282,8 @@ def end_block(self, epoch, block, steps_taken): logging.info(f"{ts} Ending block {block} - {steps_taken} steps completed in {duration} s") self.per_epoch_stats[epoch][f'block{block}']['end'] = ts self.per_epoch_stats[epoch][f'block{block}']['duration'] = duration - print(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Accelerator Utilization [AU] (%): {self.output[epoch]['au'][f'block{block}']:.4f}") - print(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Throughput (samples/second): {self.output[epoch]['throughput'][f'block{block}']*self.comm_size:.4f}") + logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Accelerator Utilization [AU] (%): {self.output[epoch]['au'][f'block{block}']:.4f}") + logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Throughput (samples/second): {self.output[epoch]['throughput'][f'block{block}']*self.comm_size:.4f}") def start_ckpt(self, epoch, block, steps_taken): if self.my_rank == 0: diff --git a/docs/source/config.rst b/docs/source/config.rst index b16f6ed9..0a06eb06 100644 --- a/docs/source/config.rst +++ b/docs/source/config.rst @@ -100,9 +100,6 @@ workflow * - profiling - False - whether to perform profiling - * - log_level - - "info" - - select the logging level [error|warn|info|debug|] .. note:: @@ -110,11 +107,6 @@ workflow Even though ``generate_data`` and ``train`` can be performed together in one job, we suggest to perform them seperately to eliminate potential caching effect. One can generate the data first by running DLIO with ```generate_data=True``` and ```train=False```, and then run training benchmark with ```generate_data=False``` and ```train=True```. -.. note:: - - ``log_level=debug`` will output detailed logging info per steps; whereas ``log_level=info`` only output log at the end of each epoch. - For performance mode, we would suggest using error mode to suppress unnecessory logs. - dataset ------------------ .. list-table:: @@ -365,11 +357,19 @@ output * - log_file - dlio.log - log file name + * - log_level + - "info" + - select the logging level [error|warning|info|debug] .. note:: If ``folder`` is not set (None), the output folder will be ```hydra_log/unet3d/$DATE-$TIME```. +.. note:: + + ``log_level=debug`` will output detailed logging info per steps; whereas ``log_level=info`` only output log at the end of each epoch. + For performance mode, we would suggest using error mode to suppress unnecessory logs. + profiling ------------------ .. list-table::