diff --git a/examples/articles/ADMExplained.ipynb b/examples/articles/ADMExplained.ipynb index c3d05586..be702779 100644 --- a/examples/articles/ADMExplained.ipynb +++ b/examples/articles/ADMExplained.ipynb @@ -140,7 +140,7 @@ " model.group_by(\"ModelID\")\n", " .agg(\n", " number_of_predictors=pl.col(\"PredictorName\").n_unique(),\n", - " model_performance=cdh_utils.weighed_performance_polars() * 100,\n", + " model_performance=cdh_utils.weighted_performance_polars() * 100,\n", " response_count=pl.sum(\"ResponseCount\"),\n", " )\n", " .collect()\n", diff --git a/python/pdstools/adm/ADMDatamart.py b/python/pdstools/adm/ADMDatamart.py index fe39ca90..4ec7e929 100644 --- a/python/pdstools/adm/ADMDatamart.py +++ b/python/pdstools/adm/ADMDatamart.py @@ -970,19 +970,19 @@ def model_summary( .sum() .alias("Count_without_responses"), ( - cdh_utils.weighed_performance_polars().alias( - "Performance_weighted" - ) + cdh_utils.weighted_performance_polars() + .alias("Performance_weighted") + .fill_nan(0.5) ), - cdh_utils.weighed_average_polars( - "SuccessRate", "ResponseCount" - ).alias("SuccessRate_weighted"), + cdh_utils.weighted_average_polars("SuccessRate", "ResponseCount") + .fill_nan(0.0) + .alias("SuccessRate_weighted"), ], ) .with_columns( - (pl.col("Count_without_responses") / pl.col(f"{by}_count")).alias( - "Percentage_without_responses" - ) + (pl.col("Count_without_responses") / pl.col(f"{by}_count")) + .alias("Percentage_without_responses") + .fill_nan(0.0) ) ) @@ -1029,16 +1029,16 @@ def pivot_df( df.unique(subset=[by], keep="first") .group_by(by) .agg( - cdh_utils.weighed_average_polars("PerformanceBin", "ResponseCount") + cdh_utils.weighted_average_polars("PerformanceBin", "ResponseCount") ) .sort("PerformanceBin", descending=True) .head(top_n) .select(by) ) - df = top_n_xaxis.join(df, on=by, how="left") + df = top_n_xaxis.join(df, on=by, how="left") if by not in ["ModelID", "Name"]: df = df.group_by([by, "PredictorName"]).agg( - cdh_utils.weighed_average_polars("PerformanceBin", "ResponseCount") + cdh_utils.weighted_average_polars("PerformanceBin", "ResponseCount") ) df = ( df.collect() @@ -1430,7 +1430,7 @@ def exportTables(self, file: Path = "Tables.xlsx"): from xlsxwriter import Workbook tabs = {tab: getattr(self, tab) for tab in self.ApplicableTables} - with Workbook(file) as wb: + with Workbook(file, {"nan_inf_to_errors": True}) as wb: for tab, data in tabs.items(): data = data.with_columns( pl.col(pl.List(pl.Categorical), pl.List(pl.Utf8)) diff --git a/python/pdstools/plots/plot_base.py b/python/pdstools/plots/plot_base.py index d5844253..96da5493 100644 --- a/python/pdstools/plots/plot_base.py +++ b/python/pdstools/plots/plot_base.py @@ -2,8 +2,8 @@ import polars as pl from .plots_plotly import ADMVisualisations as plotly from ..utils.cdh_utils import ( - weighed_performance_polars, - weighed_average_polars, + weighted_performance_polars, + weighted_average_polars, ) from ..utils.errors import NotApplicableError from ..utils.types import any_frame @@ -116,7 +116,7 @@ def top_n( if facets: df = df.join( df.group_by(facets + ["PredictorName"]) - .agg(weighed_average_polars(to_plot, "ResponseCountBin")) + .agg(weighted_average_polars(to_plot, "ResponseCountBin")) .filter(pl.col(to_plot).is_not_nan()) .group_by(*facets) .agg( @@ -131,7 +131,7 @@ def top_n( else: df = df.join( df.group_by("PredictorName") - .agg(weighed_average_polars(to_plot, "ResponseCountBin")) + .agg(weighted_average_polars(to_plot, "ResponseCountBin")) .filter(pl.col(to_plot).is_not_nan()) .sort(to_plot, descending=True) .head(top_n) @@ -484,10 +484,10 @@ def plotOverTime( df.group_by_dynamic("SnapshotTime", every=every, by=group_by) .agg( [ - weighed_average_polars("SuccessRate", "ResponseCount").alias( + weighted_average_polars("SuccessRate", "ResponseCount").alias( "SuccessRate" ), - weighed_performance_polars().alias("weighted_performance"), + weighted_performance_polars().alias("weighted_performance"), ] ) .with_columns(pl.col("weighted_performance") * 100) @@ -963,7 +963,7 @@ def plotPredictorCategoryPerformance( df = ( df.group_by(facets + ["ModelID", "PredictorCategory"]) .agg( - weighed_average_polars("PerformanceBin", "ResponseCountBin").alias( + weighted_average_polars("PerformanceBin", "ResponseCountBin").alias( "PerformanceBin" ) ) @@ -1062,7 +1062,9 @@ def plotPredictorContribution( .with_columns((pl.col("PerformanceBin") - 0.5) * 2) .group_by(by, "PredictorCategory") .agg( - Performance=weighed_average_polars("PerformanceBin", "BinResponseCount") + Performance=weighted_average_polars( + "PerformanceBin", "BinResponseCount" + ) ) .with_columns( Contribution=( diff --git a/python/pdstools/utils/cdh_utils.py b/python/pdstools/utils/cdh_utils.py index e9e93d6f..77af9af0 100644 --- a/python/pdstools/utils/cdh_utils.py +++ b/python/pdstools/utils/cdh_utils.py @@ -556,7 +556,7 @@ def toPRPCDateTime(dt: datetime.datetime) -> str: return dt.strftime("%Y%m%dT%H%M%S.%f")[:-3] + dt.strftime(" GMT%z") -def weighed_average_polars( +def weighted_average_polars( vals: Union[str, pl.Expr], weights: Union[str, pl.Expr] ) -> pl.Expr: if isinstance(vals, str): @@ -566,9 +566,9 @@ def weighed_average_polars( return ((vals * weights).sum()) / weights.sum() -def weighed_performance_polars() -> pl.Expr: +def weighted_performance_polars() -> pl.Expr: """Polars function to return a weighted performance""" - return weighed_average_polars("Performance", "ResponseCount") + return weighted_average_polars("Performance", "ResponseCount") def zRatio( @@ -632,7 +632,7 @@ def LogOdds( def featureImportance(over=["PredictorName", "ModelID"]): - varImp = weighed_average_polars( + varImp = weighted_average_polars( LogOdds( pl.col("BinPositives"), pl.col("BinResponseCount") - pl.col("BinPositives") ), diff --git a/python/tests/test_cdh_utils.py b/python/tests/test_cdh_utils.py index 4bdb7448..a4c94b1a 100644 --- a/python/tests/test_cdh_utils.py +++ b/python/tests/test_cdh_utils.py @@ -133,7 +133,9 @@ def test_toPRPCDateTime(): == "20180316T134127.847 GMT-0456" ) assert ( - cdh_utils.toPRPCDateTime(datetime.datetime(2018, 3, 16, 13, 41, 27, 847000))[:-3] + cdh_utils.toPRPCDateTime(datetime.datetime(2018, 3, 16, 13, 41, 27, 847000))[ + :-3 + ] == "20180316T134127.847 GMT+0000"[:-3] ) @@ -149,7 +151,7 @@ def test_weighted_average_polars(): output = ( input.group_by("Channel") .agg( - cdh_utils.weighed_average_polars("SuccessRate", "ResponseCount").alias( + cdh_utils.weighted_average_polars("SuccessRate", "ResponseCount").alias( "SuccessRate_weighted" ), ) @@ -165,7 +167,7 @@ def test_weighted_average_polars(): output = ( input.filter(pl.col("Channel") == "SMS") .with_columns( - cdh_utils.weighed_average_polars( + cdh_utils.weighted_average_polars( vals="SuccessRate", weights="ResponseCount" ).alias("weighted_average") ) @@ -184,7 +186,7 @@ def test_weighted_average_polars(): assert output.frame_equal(expected_output) -def test_weighed_performance_polars(): +def test_weighted_performance_polars(): input = pl.DataFrame( { "Performance": [0.5, 0.8, 0.75, 0.5], # 0.6, 0.6 @@ -195,7 +197,7 @@ def test_weighed_performance_polars(): output = ( input.group_by("Channel") - .agg(cdh_utils.weighed_performance_polars()) + .agg(cdh_utils.weighted_performance_polars()) .sort("Channel") )