Patch optional imports, documented predictor categorization, revamped…

… docs (#293) * add pre-commit to dev deps, update docs * update docs workflows * try updated docs workflow * see effects on docs * try to ignore loggers * doc revamp * keep using doc push on every push while deving * sphinxarg as dependency * more conditions * Bring back jupyter tags * bring back vf cell tag too * update predictor_binning * update agb * Update VF article * update admexplained * Add getting started guide * Further docstring improvements * Fix tests * don't import from python * Add inline dependencies to the cli * Delete uv.lock I don't want this part of the core lib (yet), may add it back later once we reach some more stability * update release ci to use pypi token * try v5 :) * skip one test * Release docs on public version release * Fixed optional imports, documented predictor categorization * tie polars to 1.16 - it was failing
pegasystems · Dec 9, 2024 · a874e24 · a874e24
1 parent 51ac723
commit a874e24
Show file tree

Hide file tree

Showing 11 changed files with 148 additions and 44 deletions.
diff --git a/.github/workflows/Python tests.yml b/.github/workflows/Python tests.yml
@@ -81,7 +81,7 @@ jobs:
         run: uv run pytest --cov=./python/pdstools --cov-report=xml --cov-config=./python/tests/.coveragerc --ignore=python/tests/test_healthcheck.py --ignore=python/tests/test_ADMTrees.py
 
       - name: Upload coverage reports to Codecov
-        uses: codecov/codecov-action@v5.0.7
+        uses: codecov/codecov-action@v5.1.1
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
           fail_ci_if_error: false
diff --git a/.gitignore b/.gitignore
@@ -34,3 +34,4 @@ python/*.ipynb_checkpoints/*
 **/META-INF/*
 r/tests/testthat/d/tmp2
 **/cache
+.venv
diff --git a/examples/articles/ADMExplained.ipynb b/examples/articles/ADMExplained.ipynb
@@ -944,8 +944,7 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "pygments_lexer": "ipython3"
   }
  },
  "nbformat": 4,

diff --git a/pyproject.toml b/pyproject.toml
@@ -26,20 +26,18 @@ classifiers = [
 keywords = [
     "pega",
     "pegasystems",
-    "pds",
     "pdstools",
-    "cdhtools",
     "datascientist",
     "tools",
 ]
 requires-python = ">=3.9"
-dependencies = ['polars>=1.9', 'typing_extensions']
+dependencies = ['polars==1.16', 'typing_extensions']
 
 [tool.setuptools.dynamic]
 version = {attr="pdstools.__version__"}
 
 [project.optional-dependencies]
-adm = ['plotly>=5.5.0']
+adm = ['plotly[express]>=6.0.0rc0', 'requests']
 pega_io = ['aioboto3', 'polars_hash']
 api = ['httpx', 'pydantic', 'anyio']
 healthcheck = ['pdstools[adm]', 'great_tables>=0.13', 'quarto', 'papermill', 'xlsxwriter>=3.0', 'pydot']

diff --git a/python/pdstools/adm/ADMDatamart.py b/python/pdstools/adm/ADMDatamart.py
@@ -354,8 +354,8 @@ def _validate_predictor_data(
     def apply_predictor_categorization(
         self,
         df: Optional[pl.LazyFrame] = None,
-        categorization: Optional[
-            Union[pl.Expr, Callable[..., pl.Expr]]
+        categorization: Union[
+            pl.Expr, Callable[..., pl.Expr]
         ] = cdh_utils.default_predictor_categorization,
     ):
         """Apply a new predictor categorization to the datamart tables
@@ -381,25 +381,35 @@ def apply_predictor_categorization(
 
         See also
         --------
-        pdstools.utils.cdh_utils.default_predictor_categorization : The default
+        pdstools.utils.cdh_utils.default_predictor_categorization : The default method
 
         Examples
         --------
-        >>> #TODO
+        >>> dm = ADMDatamart(my_data) #uses the OOTB predictor categorization
+
+        >>> dm.apply_predictor_categorization(categorization=pl.when(
+        ... pl.col("PredictorName").cast(pl.Utf8).str.contains("Propensity")
+        ... ).then(pl.lit("External Model")
+        ... ).otherwise(pl.lit("Adaptive Model")))
+
+        >>> # Now, every subsequent plot will use the custom categorization
         """
-        if callable(categorization):
-            categorization: pl.Expr = categorization()
+
+        categorization_expr: pl.Expr = (
+            categorization() if callable(categorization) else categorization
+        )
+
 
         if df is not None:
-            return df.with_columns(PredictorCategory=categorization)
+            return df.with_columns(PredictorCategory=categorization_expr)
 
         if hasattr(self, "predictor_data") and self.predictor_data is not None:
             self.predictor_data = self.predictor_data.with_columns(
-                PredictorCategory=categorization
+                PredictorCategory=categorization_expr
             )
         if hasattr(self, "combined_data") and self.combined_data is not None:
             self.combined_data = self.combined_data.with_columns(
-                PredictorCategory=categorization
+                PredictorCategory=categorization_expr
             )
 
     def save_data(

diff --git a/python/pdstools/adm/Plots.py b/python/pdstools/adm/Plots.py
@@ -174,6 +174,7 @@ def distribution_graph(df: pl.LazyFrame, title: str):
 
 class Plots(LazyNamespace):
     dependencies = ["plotly"]
+    dependency_group = "adm"
 
     def __init__(self, datamart: "ADMDatamart"):
         self.datamart = datamart
@@ -295,15 +296,15 @@ def over_time(
 
         metric_formatting = {
             "SuccessRate_weighted_average": ":.4%",
-            "Performance_weighted_average": ":.2", # is not a percentage!
+            "Performance_weighted_average": ":.2",  # is not a percentage!
             "Positives": ":.d",
             "ResponseCount": ":.d",
         }
 
         if metric == "Performance":
-            metric_scaling:pl.Expr = pl.lit(100.0)
+            metric_scaling: pl.Expr = pl.lit(100.0)
         else:
-            metric_scaling:pl.Expr = pl.lit(1.0)
+            metric_scaling: pl.Expr = pl.lit(1.0)
 
         if self.datamart.model_data is None:
             raise ValueError("Visualisation requires model_data")
@@ -333,9 +334,10 @@ def over_time(
                     "SnapshotTime", every=every, group_by=grouping_columns
                 )
                 .agg(
-                    (metric_scaling*cdh_utils.weighted_average_polars(
-                        metric, "ResponseCount"
-                    )).name.suffix("_weighted_average")
+                    (
+                        metric_scaling
+                        * cdh_utils.weighted_average_polars(metric, "ResponseCount")
+                    ).name.suffix("_weighted_average")
                 )
                 .sort("SnapshotTime", by_col)
             )
@@ -660,6 +662,10 @@ def predictor_performance(
             Whether to facet the plot into subplots, by default None
         return_df : bool, optional
             Whether to return a dataframe instead of a plot, by default False
+
+        See also
+        --------
+        pdstools.adm.ADMDatamart.apply_predictor_categorization : how to override the out-of-the-box predictor categorization
         """
 
         metric = "PredictorPerformance" if metric == "Performance" else metric
@@ -762,6 +768,31 @@ def predictor_category_performance(
         facet: Optional[Union[pl.Expr, str]] = None,
         return_df: bool = False,
     ):
+        """Plot the predictor category performance
+
+        Parameters
+        ----------
+        metric : str, optional
+            The metric to plot, by default "Performance"
+        active_only : bool, optional
+            Whether to only analyze active predictors, by default False
+        query : Optional[QUERY], optional
+            An optional query to apply, by default None
+        facet : Optional[Union[pl.Expr, str]], optional
+            By which columns to facet the result, by default None
+        return_df : bool, optional
+            An optional flag to get the dataframe instead, by default False
+
+        Returns
+        -------
+        px.Figure
+            A Plotly figure
+
+
+        See also
+        --------
+        pdstools.adm.ADMDatamart.apply_predictor_categorization : how to override the out-of-the-box predictor categorization
+        """
         metric = "PredictorPerformance" if metric == "Performance" else metric
 
         # Determine columns to select and grouping
@@ -847,6 +878,26 @@ def predictor_contribution(
         query: Optional[QUERY] = None,
         return_df: bool = False,
     ):
+        """Plots the predictor contribution for each configuration
+
+        Parameters
+        ----------
+        by : str, optional
+            By which column to plot the contribution, by default "Configuration"
+        query : Optional[QUERY], optional
+            An optional query to apply to the data, by default None
+        return_df : bool, optional
+            An optional flag to get a Dataframe instead, by default False
+
+        Returns
+        -------
+        px.Figure
+            A plotly figure
+
+        See also
+        --------
+        pdstools.adm.ADMDatamart.apply_predictor_categorization : how to override the out-of-the-box predictor categorization
+        """
         df = (
             cdh_utils._apply_query(
                 self.datamart.aggregates.last(table="combined_data"),

diff --git a/python/pdstools/infinity/__init__.py b/python/pdstools/infinity/__init__.py
@@ -1,9 +1,48 @@
 """
-My module docstring
-
-Does this work?
+Infinity API client for Pega Decision Management.
 """
 
-from .client import AsyncInfinity, Infinity
+from importlib.util import find_spec
+from typing import TYPE_CHECKING, List
+
+from ..utils.namespaces import MissingDependenciesException
+
+if TYPE_CHECKING:
+    from .client import Infinity
+
+
+class DependencyNotFound:
+    def __init__(self, dependencies: List[str]):
+        self.dependencies = dependencies
+        self.namespace = "the DX API Client"
+        self.deps_group = "api"
+
+    def __repr__(self):
+        return f"While importing, one or more dependencies were not found: {self.dependencies}"
+
+    def __call__(self):
+        raise MissingDependenciesException(
+            self.dependencies, namespace=self.namespace, deps_group=self.deps_group
+        )
+
+
+def __getattr__(name: str):
+    """Lazy import to avoid loading httpx until needed."""
+    if name == "Infinity":
+        missing_dependencies: List[str] = []
+        if not find_spec("pydantic"):
+            missing_dependencies.append("pydantic")
+        if not find_spec("httpx"):
+            missing_dependencies.append("httpx")
+
+        if missing_dependencies:
+            return DependencyNotFound(missing_dependencies)
+
+        from .client import Infinity
+
+        return Infinity
+
+    raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
+
 
-__all__ = ["Infinity", "AsyncInfinity"]
+__all__ = ["Infinity"]
diff --git a/python/pdstools/pega_io/API.py b/python/pdstools/pega_io/API.py
@@ -1,7 +1,5 @@
 from os import PathLike
 
-import requests
-
 
 def _read_client_credential_file(credential_file: PathLike):  # pragma: no cover
     outputdict = {}
@@ -37,6 +35,8 @@ def get_token(credential_file: PathLike, verify: bool = True):  # pragma: no cov
         explicitly set verify to False, otherwise Python will yell at you.
 
     """
+    import requests
+
     creds = _read_client_credential_file(credential_file)
     response = requests.post(
         url=creds["Access token endpoint"],

diff --git a/python/pdstools/pega_io/File.py b/python/pdstools/pega_io/File.py
@@ -12,7 +12,6 @@
 from typing import Iterable, List, Literal, Optional, Tuple, Union, overload
 
 import polars as pl
-import requests
 
 from ..utils.cdh_utils import from_prpc_date_time
 
@@ -94,6 +93,8 @@ def read_ds_export(
         logging.debug("Could not find file in directory, checking if URL")
 
         try:
+            import requests
+
             response = requests.get(f"{path}/{filename}")
             logging.info(f"Response: {response}")
             if response.status_code == 200:
@@ -102,6 +103,11 @@ def read_ds_export(
                 file = BytesIO(urllib.request.urlopen(file).read())
                 _, extension = os.path.splitext(filename)
 
+        except ImportError:
+            warnings.warn(
+                "Unable to import `requests`, so not able to check for remote files. If you're trying to read in a file from the internet (or, for instance, using the built-in cdh_sample method), try installing the 'requests' package (`uv pip install requests`)"
+            )
+
         except Exception as e:
             logging.info(e)
             if verbose:
@@ -162,19 +168,19 @@ def import_file(
 
     if extension == ".json":
         try:
-            if isinstance(file, BytesIO):
-                from pyarrow import json
-
-                return pl.LazyFrame(
-                    json.read_json(
-                        file,
-                    )
-                )
-            else:
-                return pl.scan_ndjson(
-                    file,
-                    infer_schema_length=reading_opts.pop("infer_schema_length", 10000),
-                )
+            # if isinstance(file, BytesIO):
+            #     from pyarrow import json
+
+            #     return pl.LazyFrame(
+            #         json.read_json(
+            #             file,
+            #         )
+            #     )
+            # else:
+            return pl.scan_ndjson(
+                file,
+                infer_schema_length=reading_opts.pop("infer_schema_length", 10000),
+            )
         except Exception:  # pragma: no cover
             try:
                 return pl.read_json(file).lazy()

diff --git a/python/pdstools/prediction/Prediction.py b/python/pdstools/prediction/Prediction.py
@@ -201,7 +201,6 @@ def responsecount_trend(
             result.update_layout(yaxis_title="Responses")
         return result
 
-
 class Prediction:
     """Monitor Pega Prediction Studio Predictions"""
 

diff --git a/python/pdstools/reports/HealthCheck.qmd b/python/pdstools/reports/HealthCheck.qmd
@@ -1920,4 +1920,5 @@ except Exception as e:
 # unfortunately no way to get the quarto source file name, so that is hardcoded
 report_utils.show_credits("pega-datascientist-tools/python/pdstools/reports/HealthCheck.qmd")
 
+
 ```
Original file line number	Diff line number	Diff line change
Expand Up		@@ -1920,4 +1920,5 @@ except Exception as e:
		# unfortunately no way to get the quarto source file name, so that is hardcoded
		report_utils.show_credits("pega-datascientist-tools/python/pdstools/reports/HealthCheck.qmd")


		```