From 651a6f73b57a2596102c801dd5597ac0c98194a3 Mon Sep 17 00:00:00 2001 From: Nodar Okroshiashvili Date: Thu, 2 Mar 2023 00:22:55 +0400 Subject: [PATCH 1/4] Add small utility to profile any function --- profiling/profiling.py | 29 +++++++++++++++++++++++++++++ test_requirements.txt | 1 + 2 files changed, 30 insertions(+) create mode 100644 profiling/profiling.py diff --git a/profiling/profiling.py b/profiling/profiling.py new file mode 100644 index 000000000..78df3edd8 --- /dev/null +++ b/profiling/profiling.py @@ -0,0 +1,29 @@ +import functools + +from pyinstrument.profiler import Profiler + + +def profile_function(output_file="profile.html"): + """ + Profiles a function execution time. + + Parameters + ---------- + output_file: file to write profile output. Defaults to "profile.html". + """ + + def decorator(function): + @functools.wraps(function) + def wrapper(*args, **kwargs): + profiler = Profiler() + profiler.start() + result = function(*args, **kwargs) + profiler.stop() + output = profiler.output_html() + with open(output_file, "w") as f: + f.write(output) + return result + + return wrapper + + return decorator diff --git a/test_requirements.txt b/test_requirements.txt index 42ebfeb9b..d2c088546 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -7,3 +7,4 @@ coverage>=6.4.4 flake8>=3.9.2 isort>=5.8.0 mypy>=0.740 +pyinstrument>=4.4.0 From c22a96b87e44d659f3facb848c80bcbf4d28cc2d Mon Sep 17 00:00:00 2001 From: Nodar Okroshiashvili Date: Tue, 14 Mar 2023 19:26:50 +0400 Subject: [PATCH 2/4] Update gitignore file to exclude profiles directory --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 3ba72acd9..724f605cf 100644 --- a/.gitignore +++ b/.gitignore @@ -105,6 +105,9 @@ venv.bak/ # mypy .mypy_cache/ +# profiling +/profiles + # Miscelaneous .idea .vscode From 43c341cf330b594fdc457710b57dd6e22d8d8fc0 Mon Sep 17 00:00:00 2001 From: Nodar Okroshiashvili Date: Tue, 14 Mar 2023 19:27:09 +0400 Subject: 
[PATCH 3/4] Add Bash/Zsh script to run time profiling for any Python module --- profiling/profiling.sh | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 profiling/profiling.sh diff --git a/profiling/profiling.sh b/profiling/profiling.sh new file mode 100644 index 000000000..eca490474 --- /dev/null +++ b/profiling/profiling.sh @@ -0,0 +1,7 @@ +mkdir -p profiles/code_profiles + +file="$@" + +pyinstrument -r html -o profiles/code_profiles/performance_profile_$(date "+%Y.%m.%d-%H:%M").html $file + +pyinstrument -r speedscope -o profiles/code_profiles/speedscope_$(date "+%Y.%m.%d-%H:%M").json $file From 7731a4c2f533de1e1f82914c8ee8ef46accf96ba Mon Sep 17 00:00:00 2001 From: Nodar Okroshiashvili Date: Mon, 24 Apr 2023 13:16:38 +0400 Subject: [PATCH 4/4] Add a section in documentation describing code profiling --- docs/contribute/contribute_code.rst | 70 +++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/docs/contribute/contribute_code.rst b/docs/contribute/contribute_code.rst index 754d5c510..e8e346bfe 100644 --- a/docs/contribute/contribute_code.rst +++ b/docs/contribute/contribute_code.rst @@ -449,6 +449,76 @@ you want to know why we prefer tox, this will tell you everything ;) +Code Profiling +-------------- + +If you want to profile your code, you can use the **profiling** module in the root directory. There you will find two files, +`profiling.py` and `profiling.sh`. Both files do the same thing, but in different ways. The profiling.py file is a Python script +containing a function that must be used as a decorator on the function or method we want to profile. +The profiling.sh file is a bash/zsh script that you can run from the command line to profile a whole .py script file. +Let us see how to use them. First, we start with the profiling.py file. + +I suspect that the `DropDuplicateFeatures` class takes more time than other classes, as it iterates over the columns and +checks if they are duplicated or not. 
So, I will profile the `DropDuplicateFeatures` class. + +First, I will find the module where this class is defined and, at the top of its imports, add the following line:: + + from profiling.profiling import profile_function + +Now, I will decorate the `DropDuplicateFeatures.fit` method with the `profile_function` function:: + + @profile_function(output_file="profile.html") + def fit(self, X: pd.DataFrame, y: pd.Series = None): + ... + +The next step is to create a temporary .py file that will contain the code that we want to profile. + +For example, I will create a file named `temp.py` and copy the following code into it:: + + import pandas as pd + import numpy as np + + from feature_engine.selection import DropDuplicateFeatures + + + if __name__ == "__main__": + rows = 10000 + cols = 60000 + col_names = [f"col_{i}" for i in range(cols)] + df = pd.DataFrame(np.random.randint(0, 100, size=(rows, cols)), columns=col_names) + + transformer = DropDuplicateFeatures() + transformer.fit(df) + + train_t = transformer.transform(df) + + +Now, I will run the `temp.py` file from the command line:: + + $ python temp.py + +This will create a file named `profile.html` in the current working directory (the root directory of the project, if you ran +the command from there). This file contains the profiling results. You can open it with your favorite browser and inspect the results. + +If you would rather not add an extra import and a decorator, then you can use the `profiling.sh` file. This file is a bash/zsh +script that you can run from the command line. Let us see how to use it. + +Again, I will profile the `DropDuplicateFeatures` class. I need to create a temporary .py file and put the same code as above in it. +After that, open a terminal in the root directory and run the following command:: + + $ ./profiling/profiling.sh temp.py + + +This will create a directory, named `profiles`, in the root directory of the project. 
Inside it, the `code_profiles` subdirectory contains two files: +the first is an .html file that you can open with any browser, and the second is a .json file that you can load into +`speedscope <https://www.speedscope.app/>`_ to visualize the results. + + +.. note:: + To profile memory usage, you can use the `memray` package. You can find more information about it + `here <https://bloomberg.github.io/memray/>`_. + + Review Process --------------