yusufuyanik1 · yusufuyanik1 · Oct 10, 2023 · Sep 28, 2023 · Oct 3, 2023 · Oct 3, 2023
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,31 @@
+# app/Dockerfile
+# Image that runs the pdstools Streamlit app (python/pdstools/app/Home.py) on port 8501.
+FROM python:3.11.4
+
+WORKDIR /healthcheckapp
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    software-properties-common \
+    git \
+    gdebi-core \
+    && rm -rf /var/lib/apt/lists/*
+
+# Fetch the Quarto .deb matching the image architecture (amd64/arm64). Remove -k to enable SSL verification.
+RUN curl -k -L "https://quarto.org/download/latest/quarto-linux-$(dpkg --print-architecture).deb" -o /tmp/quarto.deb \
+    && gdebi --non-interactive /tmp/quarto.deb && rm /tmp/quarto.deb
+
+# Delete the following RUN line to re-enable SSL verification for git.
+RUN git config --global http.sslVerify false
+
+RUN git clone https://github.com/pegasystems/pega-datascientist-tools.git .
+
+# To re-enable SSL verification for pip, replace the following RUN line with: RUN pip3 install --no-cache-dir .[app]
+RUN pip3 install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host=files.pythonhosted.org --no-cache-dir .[app]
+
+EXPOSE 8501
+
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+
+ENTRYPOINT ["streamlit", "run", "python/pdstools/app/Home.py", "--server.port=8501", "--server.address=0.0.0.0"]
diff --git a/examples/articles/ADMExplained.ipynb b/examples/articles/ADMExplained.ipynb
@@ -140,7 +140,7 @@
                 "        model.group_by(\"ModelID\")\n",
                 "        .agg(\n",
                 "            number_of_predictors=pl.col(\"PredictorName\").n_unique(),\n",
-                "            model_performance=cdh_utils.weighed_performance_polars() * 100,\n",
+                "            model_performance=cdh_utils.weighted_performance_polars() * 100,\n",
                 "            response_count=pl.sum(\"ResponseCount\"),\n",
                 "        )\n",
                 "        .collect()\n",

diff --git a/python/pdstools/adm/ADMDatamart.py b/python/pdstools/adm/ADMDatamart.py
@@ -970,19 +970,19 @@ def model_summary(
                     .sum()
                     .alias("Count_without_responses"),
                     (
-                        cdh_utils.weighed_performance_polars().alias(
-                            "Performance_weighted"
-                        )
+                        cdh_utils.weighted_performance_polars()
+                        .alias("Performance_weighted")
+                        .fill_nan(0.5)
                     ),
-                    cdh_utils.weighed_average_polars(
-                        "SuccessRate", "ResponseCount"
-                    ).alias("SuccessRate_weighted"),
+                    cdh_utils.weighted_average_polars("SuccessRate", "ResponseCount")
+                    .fill_nan(0.0)
+                    .alias("SuccessRate_weighted"),
                 ],
             )
             .with_columns(
-                (pl.col("Count_without_responses") / pl.col(f"{by}_count")).alias(
-                    "Percentage_without_responses"
-                )
+                (pl.col("Count_without_responses") / pl.col(f"{by}_count"))
+                .alias("Percentage_without_responses")
+                .fill_nan(0.0)
             )
         )
 
@@ -1029,16 +1029,16 @@ def pivot_df(
                 df.unique(subset=[by], keep="first")
                 .group_by(by)
                 .agg(
-                    cdh_utils.weighed_average_polars("PerformanceBin", "ResponseCount")
+                    cdh_utils.weighted_average_polars("PerformanceBin", "ResponseCount")
                 )
                 .sort("PerformanceBin", descending=True)
                 .head(top_n)
                 .select(by)
             )
-            df = top_n_xaxis.join(df, on=by, how="left") 
+            df = top_n_xaxis.join(df, on=by, how="left")
         if by not in ["ModelID", "Name"]:
             df = df.group_by([by, "PredictorName"]).agg(
-                cdh_utils.weighed_average_polars("PerformanceBin", "ResponseCount")
+                cdh_utils.weighted_average_polars("PerformanceBin", "ResponseCount")
             )
         df = (
             df.collect()
@@ -1430,7 +1430,7 @@ def exportTables(self, file: Path = "Tables.xlsx"):
         from xlsxwriter import Workbook
 
         tabs = {tab: getattr(self, tab) for tab in self.ApplicableTables}
-        with Workbook(file) as wb:
+        with Workbook(file, {"nan_inf_to_errors": True}) as wb:
             for tab, data in tabs.items():
                 data = data.with_columns(
                     pl.col(pl.List(pl.Categorical), pl.List(pl.Utf8))

diff --git a/python/pdstools/plots/plot_base.py b/python/pdstools/plots/plot_base.py
@@ -2,8 +2,8 @@
 import polars as pl
 from .plots_plotly import ADMVisualisations as plotly
 from ..utils.cdh_utils import (
-    weighed_performance_polars,
-    weighed_average_polars,
+    weighted_performance_polars,
+    weighted_average_polars,
 )
 from ..utils.errors import NotApplicableError
 from ..utils.types import any_frame
@@ -116,7 +116,7 @@ def top_n(
         if facets:
             df = df.join(
                 df.group_by(facets + ["PredictorName"])
-                .agg(weighed_average_polars(to_plot, "ResponseCountBin"))
+                .agg(weighted_average_polars(to_plot, "ResponseCountBin"))
                 .filter(pl.col(to_plot).is_not_nan())
                 .group_by(*facets)
                 .agg(
@@ -131,7 +131,7 @@ def top_n(
         else:
             df = df.join(
                 df.group_by("PredictorName")
-                .agg(weighed_average_polars(to_plot, "ResponseCountBin"))
+                .agg(weighted_average_polars(to_plot, "ResponseCountBin"))
                 .filter(pl.col(to_plot).is_not_nan())
                 .sort(to_plot, descending=True)
                 .head(top_n)
@@ -484,10 +484,10 @@ def plotOverTime(
                 df.group_by_dynamic("SnapshotTime", every=every, by=group_by)
                 .agg(
                     [
-                        weighed_average_polars("SuccessRate", "ResponseCount").alias(
+                        weighted_average_polars("SuccessRate", "ResponseCount").alias(
                             "SuccessRate"
                         ),
-                        weighed_performance_polars().alias("weighted_performance"),
+                        weighted_performance_polars().alias("weighted_performance"),
                     ]
                 )
                 .with_columns(pl.col("weighted_performance") * 100)
@@ -963,7 +963,7 @@ def plotPredictorCategoryPerformance(
         df = (
             df.group_by(facets + ["ModelID", "PredictorCategory"])
             .agg(
-                weighed_average_polars("PerformanceBin", "ResponseCountBin").alias(
+                weighted_average_polars("PerformanceBin", "ResponseCountBin").alias(
                     "PerformanceBin"
                 )
             )
@@ -1062,7 +1062,9 @@ def plotPredictorContribution(
             .with_columns((pl.col("PerformanceBin") - 0.5) * 2)
             .group_by(by, "PredictorCategory")
             .agg(
-                Performance=weighed_average_polars("PerformanceBin", "BinResponseCount")
+                Performance=weighted_average_polars(
+                    "PerformanceBin", "BinResponseCount"
+                )
             )
             .with_columns(
                 Contribution=(

diff --git a/python/pdstools/utils/cdh_utils.py b/python/pdstools/utils/cdh_utils.py
@@ -556,7 +556,7 @@ def toPRPCDateTime(dt: datetime.datetime) -> str:
     return dt.strftime("%Y%m%dT%H%M%S.%f")[:-3] + dt.strftime(" GMT%z")
 
 
-def weighed_average_polars(
+def weighted_average_polars(
     vals: Union[str, pl.Expr], weights: Union[str, pl.Expr]
 ) -> pl.Expr:
     if isinstance(vals, str):
@@ -566,9 +566,9 @@ def weighed_average_polars(
     return ((vals * weights).sum()) / weights.sum()
 
 
-def weighed_performance_polars() -> pl.Expr:
+def weighted_performance_polars() -> pl.Expr:
     """Polars function to return a weighted performance"""
-    return weighed_average_polars("Performance", "ResponseCount")
+    return weighted_average_polars("Performance", "ResponseCount")
 
 
 def zRatio(
@@ -632,7 +632,7 @@ def LogOdds(
 
 
 def featureImportance(over=["PredictorName", "ModelID"]):
-    varImp = weighed_average_polars(
+    varImp = weighted_average_polars(
         LogOdds(
             pl.col("BinPositives"), pl.col("BinResponseCount") - pl.col("BinPositives")
         ),

diff --git a/python/tests/test_cdh_utils.py b/python/tests/test_cdh_utils.py
@@ -133,7 +133,9 @@ def test_toPRPCDateTime():
         == "20180316T134127.847 GMT-0456"
     )
     assert (
-        cdh_utils.toPRPCDateTime(datetime.datetime(2018, 3, 16, 13, 41, 27, 847000))[:-3]
+        cdh_utils.toPRPCDateTime(datetime.datetime(2018, 3, 16, 13, 41, 27, 847000))[
+            :-3
+        ]
         == "20180316T134127.847 GMT+0000"[:-3]
     )
 
@@ -149,7 +151,7 @@ def test_weighted_average_polars():
     output = (
         input.group_by("Channel")
         .agg(
-            cdh_utils.weighed_average_polars("SuccessRate", "ResponseCount").alias(
+            cdh_utils.weighted_average_polars("SuccessRate", "ResponseCount").alias(
                 "SuccessRate_weighted"
             ),
         )
@@ -165,7 +167,7 @@ def test_weighted_average_polars():
     output = (
         input.filter(pl.col("Channel") == "SMS")
         .with_columns(
-            cdh_utils.weighed_average_polars(
+            cdh_utils.weighted_average_polars(
                 vals="SuccessRate", weights="ResponseCount"
             ).alias("weighted_average")
         )
@@ -184,7 +186,7 @@ def test_weighted_average_polars():
     assert output.frame_equal(expected_output)
 
 
-def test_weighed_performance_polars():
+def test_weighted_performance_polars():
     input = pl.DataFrame(
         {
             "Performance": [0.5, 0.8, 0.75, 0.5],  # 0.6, 0.6
@@ -195,7 +197,7 @@ def test_weighed_performance_polars():
 
     output = (
         input.group_by("Channel")
-        .agg(cdh_utils.weighed_performance_polars())
+        .agg(cdh_utils.weighted_performance_polars())
         .sort("Channel")
     )