From 867905270344b1fbd366f04e6a0042d29d10aeb3 Mon Sep 17 00:00:00 2001 From: Naren Krishna <92325366+naren-ponder@users.noreply.github.com> Date: Wed, 8 Jun 2022 00:46:18 -0700 Subject: [PATCH] FEAT-#4524: Split Modin API and Memory log files (#4526) Co-authored-by: Yaroslav Igoshev Signed-off-by: Naren Krishna --- docs/release_notes/release_notes-0.15.0.rst | 1 + .../advanced_usage/modin_logging.rst | 39 ++++++++++++------- modin/config/envvars.py | 20 +++++----- modin/logging/config.py | 28 +++++++++---- 4 files changed, 55 insertions(+), 33 deletions(-) diff --git a/docs/release_notes/release_notes-0.15.0.rst b/docs/release_notes/release_notes-0.15.0.rst index 608da9d3831..d273aaf6ae0 100644 --- a/docs/release_notes/release_notes-0.15.0.rst +++ b/docs/release_notes/release_notes-0.15.0.rst @@ -36,6 +36,7 @@ Key Features and Updates * Benchmarking enhancements * FEAT-#4371: Add logging to Modin (#4372) * FEAT-#4501: Add RSS Memory Profiling to Modin Logging (#4502) + * FEAT-#4524: Split Modin API and Memory log files (#4526) * Refactor Codebase * REFACTOR-#4284: use variable length unpacking when getting results from `deploy` function (#4285) * REFACTOR-#3642: Move PyArrow storage format usage from main feature to experimental ones (#4374) diff --git a/docs/usage_guide/advanced_usage/modin_logging.rst b/docs/usage_guide/advanced_usage/modin_logging.rst index 19043681aaa..786874d33ce 100644 --- a/docs/usage_guide/advanced_usage/modin_logging.rst +++ b/docs/usage_guide/advanced_usage/modin_logging.rst @@ -1,19 +1,26 @@ Modin Logging ============= -Modin logging offers users greater insight into their queries by logging internal Modin API calls, partition metadata, -and profiling system memory. When Modin logging is enabled (default disabled), log files are written to a local `.modin` directory at the same -directory level as the notebook/script used to run Modin. Each log file in the `.modin` directory is uniquely named after the job uuid. +Modin logging offers users greater insight into their queries by logging internal Modin API calls, partition metadata, +and profiling system memory. When Modin logging is enabled (default disabled), log files are written to a local ``.modin`` directory at the same +directory level as the notebook/script used to run Modin. It is possible to configure whether to log system memory and additional metadata +in addition to Modin API calls (see the usage examples below). -**Developer Warning:** In some cases, running Modin logging inside of a Jupyterlab or other notebook instance at the root level of the Modin -repository or in the `modin/modin` directory may result in circular dependency issues. This is as a result of a name conflict between the -`modin/logging` directory and the Python `logging` module, which may be used as a default in such environments. As a resolution, one can simply -run Modin logging from a different directory or manually manage the name conflicts. +The logs generated by Modin Logging will be written to a ``.modin/logs/job_`` directory, uniquely named after the job uuid. +The logs that contain the Modin API stack traces are named ``trace.log``. The logs that contain the memory utilization metrics are +named ``memory.log``. By default, if any log file exceeds 10MB (configurable with ``LogFileSize``), that file will be saved and a +separate log file will be created. For instance, if users have 20MB worth of Modin API logs, they can expect to find ``trace.log.1`` +and ``trace.log.2`` in the ``.modin/logs/job_`` directory. After ``10 * LogFileSize`` MB or by default 100MB of logs, the logs will +rollover and the original log files beginning with ``trace.log.1`` will be overwritten with the new log lines. -Usage example -------------- +**Developer Warning:** In some cases, running services like JupyterLab in the ``modin/modin`` directory may result in circular dependency issues. +This is due to a naming conflict between the ``modin/logging`` directory and the Python ``logging`` module, which may be used as a default in +such environments. To resolve this, please run Jupyterlab or other similar services from directories other than ``modin/modin``. -In the example below, we enable logging for internal Modin API calls. +Usage examples +-------------- + +In the example below, we enable logging for internal Modin API calls. .. code-block:: python @@ -21,19 +28,21 @@ In the example below, we enable logging for internal Modin API calls. from modin.config import LogMode LogMode.enable_api_only() - # Your code goes here + # User code goes here In the next example, we add logging for not only internal Modin API calls, but also for partition metadata and memory profiling. -We can set the granularity (in seconds) at which the system memory utilization is logged using `LogMemoryInterval`. +We can set the granularity (in seconds) at which the system memory utilization is logged using ``LogMemoryInterval``. +We can also set the maximum size of the logs (in MBs) using ``LogFileSize``. .. code-block:: python import modin.pandas as pd - from modin.config import LogMode, LogMemoryInterval + from modin.config import LogMode, LogMemoryInterval, LogFileSize LogMode.enable() LogMemoryInterval.put(2) # Defaults to 5 seconds, new interval is 2 seconds + LogFileSize.put(5) # Defaults to 10 MB per log file, new size is 5 MB - # Your code goes here + # User code goes here Disable Modin logging like so: @@ -43,4 +52,4 @@ Disable Modin logging like so: from modin.config import LogMode LogMode.disable() - # Your code goes here + # User code goes here diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 2f1a760dca5..c49770c5120 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -415,16 +415,16 @@ def get(cls): return log_memory_interval -class LogMemorySize(EnvironmentVariable, type=int): - """Size of logs (in MBs) to store per Modin job.""" +class LogFileSize(EnvironmentVariable, type=int): + """Max size of logs (in MBs) to store per Modin job.""" - varname = "MODIN_LOG_MEMORY_SIZE" - default = 100 + varname = "MODIN_LOG_FILE_SIZE" + default = 10 @classmethod def put(cls, value): """ - Set ``LogMemorySize`` with extra checks. + Set ``LogFileSize`` with extra checks. Parameters ---------- @@ -432,21 +432,21 @@ def put(cls, value): Config value to set. """ if value <= 0: - raise ValueError(f"Log memory size should be > 0 MB, passed value {value}") + raise ValueError(f"Log file size should be > 0 MB, passed value {value}") super().put(value) @classmethod def get(cls): """ - Get ``LogMemorySize`` with extra checks. + Get ``LogFileSize`` with extra checks. Returns ------- int """ - log_memory_size = super().get() - assert log_memory_size > 0, "`LogMemorySize` should be > 0" - return log_memory_size + log_file_size = super().get() + assert log_file_size > 0, "`LogFileSize` should be > 0" + return log_file_size class PersistentPickle(EnvironmentVariable, type=bool): diff --git a/modin/logging/config.py b/modin/logging/config.py index 98f708ece83..9cad15eb9be 100644 --- a/modin/logging/config.py +++ b/modin/logging/config.py @@ -27,7 +27,7 @@ import pkg_resources import threading import time -from modin.config import LogMemoryInterval, LogMemorySize, LogMode +from modin.config import LogMemoryInterval, LogFileSize, LogMode __LOGGER_CONFIGURED__: bool = False @@ -93,14 +93,14 @@ def configure_logging(): global __LOGGER_CONFIGURED__ logger = logging.getLogger("modin.logger") job_id = uuid.uuid4().hex - log_filename = f".modin/logs/job_{job_id}.log" + log_filename = f".modin/logs/job_{job_id}/trace.log" os.makedirs(os.path.dirname(log_filename), exist_ok=True) logfile = RotatingFileHandler( filename=log_filename, mode="a", - maxBytes=LogMemorySize.get() * int(1e5), + maxBytes=LogFileSize.get() * int(1e6), backupCount=10, ) formatter = ModinFormatter( @@ -120,19 +120,31 @@ def configure_logging(): pandas_version = pkg_resources.get_distribution("pandas").version num_physical_cores = str(psutil.cpu_count(logical=False)) num_total_cores = str(psutil.cpu_count(logical=True)) - svmem = psutil.virtual_memory() logger.info(f"Modin Version: {modin_version}") logger.info(f"Pandas Version: {pandas_version}") logger.info(f"Physical Cores: {num_physical_cores}") logger.info(f"Total Cores: {num_total_cores}") - logger.info(f"Memory Total: {bytes_int_to_str(svmem.total)}") - logger.info(f"Memory Available: {bytes_int_to_str(svmem.available)}") - logger.info(f"Memory Used: {bytes_int_to_str(svmem.used)}") if LogMode.get() != "enable_api_only": mem_sleep = LogMemoryInterval.get() + mem_logger = logging.getLogger("modin_memory.logger") + mem_log_filename = f".modin/logs/job_{job_id}/memory.log" + logfile = RotatingFileHandler( + filename=mem_log_filename, + mode="a", + maxBytes=LogFileSize.get() * int(1e6), + backupCount=10, + ) + logfile.setFormatter(formatter) + mem_logger.addHandler(logfile) + mem_logger.setLevel(logging.DEBUG) + + svmem = psutil.virtual_memory() + mem_logger.info(f"Memory Total: {bytes_int_to_str(svmem.total)}") + mem_logger.info(f"Memory Available: {bytes_int_to_str(svmem.available)}") + mem_logger.info(f"Memory Used: {bytes_int_to_str(svmem.used)}") mem = threading.Thread( - target=memory_thread, args=[logger, mem_sleep], daemon=True + target=memory_thread, args=[mem_logger, mem_sleep], daemon=True ) mem.start()