diff --git a/R/utils-model.R b/R/utils-model.R
index 2792cfc..aa8d31e 100644
--- a/R/utils-model.R
+++ b/R/utils-model.R
@@ -103,13 +103,17 @@ utils::globalVariables(c("Observation", "Model", "Value"))
plot_model_comparison <- function(x, model, data, compare_with, prediction_functions, sort_by = NULL) {
model_name <- rev(as.character(model$call[[1]]))[1]
- compare_with$xspliner <- x
- compare_with[[model_name]] <- model
+ models_list <- list(xspliner = x)
+ models_list[[model_name]] <- model
+ models_list <- append(models_list, compare_with)
if (length(prediction_functions) == 1) {
- fitted <- compare_with %>%
+ fitted <- models_list %>%
purrr::map(~ prediction_functions[[1]](., data))
} else {
- fitted <- compare_with %>%
+ if (length(models_list) != length(prediction_functions)) {
+ stop("prediction_functions should provide prediction functions for all models (surrogate, original and model to compare), or common one.")
+ }
+ fitted <- models_list %>%
purrr::map2(prediction_functions, function(model, pred_fun) pred_fun(model, data))
}
diff --git a/docs/articles/automation.html b/docs/articles/automation.html
index cee25a3..dac9949 100644
--- a/docs/articles/automation.html
+++ b/docs/articles/automation.html
@@ -88,7 +88,7 @@
Automate your work
Krystian Igras
-
2019-06-20
+
2019-08-31
automation.Rmd
@@ -123,23 +123,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67320 -0.07027 -0.02497 0.10183 0.45924
+## -0.66812 -0.07307 -0.02386 0.10341 0.45717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.42228 0.22783 -6.243 4.49e-09 ***
-## Sepal.Length 0.03024 0.03545 0.853 0.3950
-## xs(Petal.Length) 1.91896 0.36103 5.315 3.94e-07 ***
-## xf(Species)versicolor 0.11858 0.16157 0.734 0.4642
-## xf(Species)virginica 0.42614 0.21904 1.945 0.0537 .
+## (Intercept) -1.32737 0.21623 -6.139 7.56e-09 ***
+## Sepal.Length 0.03222 0.03547 0.908 0.365
+## xs(Petal.Length) 1.85685 0.35456 5.237 5.63e-07 ***
+## xf(Species)versicolor 0.08438 0.17014 0.496 0.621
+## xf(Species)virginica 0.39232 0.22838 1.718 0.088 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03081421)
+## (Dispersion parameter for gaussian family taken to be 0.03096157)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4681 on 145 degrees of freedom
-## AIC: -89.371
+## Residual deviance: 4.4894 on 145 degrees of freedom
+## AIC: -88.655
##
## Number of Fisher Scoring iterations: 2
When the black box model is based on a larger number of variables, it can be problematic to specify local parameters for each predictor. The formula also becomes large and hard to read.
@@ -165,23 +165,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67320 -0.07027 -0.02497 0.10183 0.45924
+## -0.66812 -0.07307 -0.02386 0.10341 0.45717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.42228 0.22783 -6.243 4.49e-09 ***
-## Sepal.Length 0.03024 0.03545 0.853 0.3950
-## xs(Petal.Length) 1.91896 0.36103 5.315 3.94e-07 ***
-## xf(Species)versicolor 0.11858 0.16157 0.734 0.4642
-## xf(Species)virginica 0.42614 0.21904 1.945 0.0537 .
+## (Intercept) -1.32737 0.21623 -6.139 7.56e-09 ***
+## Sepal.Length 0.03222 0.03547 0.908 0.365
+## xs(Petal.Length) 1.85685 0.35456 5.237 5.63e-07 ***
+## xf(Species)versicolor 0.08438 0.17014 0.496 0.621
+## xf(Species)virginica 0.39232 0.22838 1.718 0.088 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03081421)
+## (Dispersion parameter for gaussian family taken to be 0.03096157)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4681 on 145 degrees of freedom
-## AIC: -89.371
+## Residual deviance: 4.4894 on 145 degrees of freedom
+## AIC: -88.655
##
## Number of Fisher Scoring iterations: 2
But still you can specify local parameters that override the global ones.
@@ -201,23 +201,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67745 -0.07577 -0.03086 0.09603 0.46022
+## -0.67121 -0.07495 -0.03046 0.09856 0.45944
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.5234 0.2594 -5.874 2.80e-08 ***
-## xs(Sepal.Length) 0.2598 0.3088 0.841 0.4016
-## xs(Petal.Length) 1.8853 0.3888 4.849 3.16e-06 ***
-## xf(Species)versicolor 0.1269 0.1669 0.761 0.4481
-## xf(Species)virginica 0.4404 0.2274 1.936 0.0548 .
+## (Intercept) -1.46014 0.26089 -5.597 1.06e-07 ***
+## xs(Sepal.Length) 0.31132 0.31424 0.991 0.3235
+## xs(Petal.Length) 1.80229 0.37818 4.766 4.53e-06 ***
+## xf(Species)versicolor 0.09996 0.17451 0.573 0.5677
+## xf(Species)virginica 0.41648 0.23573 1.767 0.0794 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03081848)
+## (Dispersion parameter for gaussian family taken to be 0.03092842)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4687 on 145 degrees of freedom
-## AIC: -89.35
+## Residual deviance: 4.4846 on 145 degrees of freedom
+## AIC: -88.816
##
## Number of Fisher Scoring iterations: 2
In this case last_evaluation variable will be transformed with thin plate regression spline (bs = "tp" is default for mgcv::s) with basis dimension equal to 10. At the same time average_monthly_hours will be transformed with cubic splines.
@@ -279,23 +279,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67928 -0.07680 -0.03180 0.09561 0.46812
+## -0.67513 -0.07534 -0.03094 0.09496 0.46835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.5206 0.2617 -5.810 3.82e-08 ***
-## xs(Sepal.Length) 0.2880 0.3120 0.923 0.3574
-## xs(Petal.Length) 1.8418 0.3845 4.791 4.07e-06 ***
-## xf(Species)versicolor 0.1458 0.1645 0.886 0.3769
-## xf(Species)virginica 0.4660 0.2245 2.075 0.0397 *
+## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 ***
+## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963
+## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 ***
+## xf(Species)versicolor 0.1181 0.1716 0.688 0.4926
+## xf(Species)virginica 0.4411 0.2321 1.901 0.0593 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03089793)
+## (Dispersion parameter for gaussian family taken to be 0.03099558)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4802 on 145 degrees of freedom
-## AIC: -88.964
+## Residual deviance: 4.4944 on 145 degrees of freedom
+## AIC: -88.49
##
## Number of Fisher Scoring iterations: 2
Then each predictor is transformed with the xs and xf symbols, using default parameters or global ones when specified.
@@ -314,23 +314,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67928 -0.07680 -0.03180 0.09561 0.46812
+## -0.67513 -0.07534 -0.03094 0.09496 0.46835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.5206 0.2617 -5.810 3.82e-08 ***
-## xs(Sepal.Length) 0.2880 0.3120 0.923 0.3574
-## xs(Petal.Length) 1.8418 0.3845 4.791 4.07e-06 ***
-## xf(Species)versicolor 0.1458 0.1645 0.886 0.3769
-## xf(Species)virginica 0.4660 0.2245 2.075 0.0397 *
+## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 ***
+## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963
+## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 ***
+## xf(Species)versicolor 0.1181 0.1716 0.688 0.4926
+## xf(Species)virginica 0.4411 0.2321 1.901 0.0593 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03089793)
+## (Dispersion parameter for gaussian family taken to be 0.03099558)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4802 on 145 degrees of freedom
-## AIC: -88.964
+## Residual deviance: 4.4944 on 145 degrees of freedom
+## AIC: -88.49
##
## Number of Fisher Scoring iterations: 2
@@ -354,23 +354,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67928 -0.07680 -0.03180 0.09561 0.46812
+## -0.67513 -0.07534 -0.03094 0.09496 0.46835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.5206 0.2617 -5.810 3.82e-08 ***
-## xs(Sepal.Length) 0.2880 0.3120 0.923 0.3574
-## xs(Petal.Length) 1.8418 0.3845 4.791 4.07e-06 ***
-## Speciesversicolor 0.1458 0.1645 0.886 0.3769
-## Speciesvirginica 0.4660 0.2245 2.075 0.0397 *
+## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 ***
+## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963
+## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 ***
+## Speciesversicolor 0.1181 0.1716 0.688 0.4926
+## Speciesvirginica 0.4411 0.2321 1.901 0.0593 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03089793)
+## (Dispersion parameter for gaussian family taken to be 0.03099558)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4802 on 145 degrees of freedom
-## AIC: -88.964
+## Residual deviance: 4.4944 on 145 degrees of freedom
+## AIC: -88.49
##
## Number of Fisher Scoring iterations: 2
For transformation of factors only:
@@ -422,23 +422,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67928 -0.07680 -0.03180 0.09561 0.46812
+## -0.67513 -0.07534 -0.03094 0.09496 0.46835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.5206 0.2617 -5.810 3.82e-08 ***
-## xs(Sepal.Length) 0.2880 0.3120 0.923 0.3574
-## xs(Petal.Length) 1.8418 0.3845 4.791 4.07e-06 ***
-## xf(Species)versicolor 0.1458 0.1645 0.886 0.3769
-## xf(Species)virginica 0.4660 0.2245 2.075 0.0397 *
+## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 ***
+## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963
+## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 ***
+## xf(Species)versicolor 0.1181 0.1716 0.688 0.4926
+## xf(Species)virginica 0.4411 0.2321 1.901 0.0593 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03089793)
+## (Dispersion parameter for gaussian family taken to be 0.03099558)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4802 on 145 degrees of freedom
-## AIC: -88.964
+## Residual deviance: 4.4944 on 145 degrees of freedom
+## AIC: -88.49
##
## Number of Fisher Scoring iterations: 2
Good practice here is to provide data parameter as well to detect predictors classes, and model type (classification or regression).
@@ -462,23 +462,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67928 -0.07680 -0.03180 0.09561 0.46812
+## -0.67513 -0.07534 -0.03094 0.09496 0.46835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.5206 0.2617 -5.810 3.82e-08 ***
-## xs(Sepal.Length) 0.2880 0.3120 0.923 0.3574
-## xs(Petal.Length) 1.8418 0.3845 4.791 4.07e-06 ***
-## xf(Species)versicolor 0.1458 0.1645 0.886 0.3769
-## xf(Species)virginica 0.4660 0.2245 2.075 0.0397 *
+## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 ***
+## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963
+## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 ***
+## xf(Species)versicolor 0.1181 0.1716 0.688 0.4926
+## xf(Species)virginica 0.4411 0.2321 1.901 0.0593 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03089793)
+## (Dispersion parameter for gaussian family taken to be 0.03099558)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4802 on 145 degrees of freedom
-## AIC: -88.964
+## Residual deviance: 4.4944 on 145 degrees of freedom
+## AIC: -88.49
##
## Number of Fisher Scoring iterations: 2
In the above examples each predictor is transformed by default. You can exclude selected predictors from transformation by specifying global alter = "never" parameters, or by using the bare parameter.
@@ -499,23 +499,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67928 -0.07680 -0.03180 0.09561 0.46812
+## -0.67513 -0.07534 -0.03094 0.09496 0.46835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.5206 0.2617 -5.810 3.82e-08 ***
-## xs(Sepal.Length) 0.2880 0.3120 0.923 0.3574
-## xs(Petal.Length) 1.8418 0.3845 4.791 4.07e-06 ***
-## xf(Species)versicolor 0.1458 0.1645 0.886 0.3769
-## xf(Species)virginica 0.4660 0.2245 2.075 0.0397 *
+## (Intercept) -1.4577 0.2641 -5.520 1.52e-07 ***
+## xs(Sepal.Length) 0.3332 0.3179 1.048 0.2963
+## xs(Petal.Length) 1.7661 0.3734 4.730 5.27e-06 ***
+## xf(Species)versicolor 0.1181 0.1716 0.688 0.4926
+## xf(Species)virginica 0.4411 0.2321 1.901 0.0593 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03089793)
+## (Dispersion parameter for gaussian family taken to be 0.03099558)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4802 on 145 degrees of freedom
-## AIC: -88.964
+## Residual deviance: 4.4944 on 145 degrees of freedom
+## AIC: -88.49
##
## Number of Fisher Scoring iterations: 2
It works! Can it be simpler? Not really, because of the black-box-based transformation and theory, but we can provide some model-based parameters upfront using DALEX’s explainer object (see next section).
## Preparation of a new explainer is initiated
+## -> model label : boston
+## -> data : 506 rows 4 cols (extracted from the model)
+## -> target variable : not specified! (WARNING)
+## -> predict function : yhat.randomForest will be used (default)
+## -> predicted values : numerical, min = 5.975977 , mean = 12.65206 , max = 24.93637
+## -> residual function : difference between y and yhat (default)
+## A new explainer has been created!
## Setting levels: control = No, case = Yes
+## Setting levels: control = No, case = Yes
+
## Warning in coords.roc(roc_surrogate, x = thresholds, input = "threshold", :
+## An upcoming version of pROC will set the 'transpose' argument to FALSE
+## by default. Set transpose = TRUE explicitly to keep the current behavior,
+## or transpose = FALSE to adopt the new one and silence this warning. Type
+## help(coords_transpose) for additional information.
+
## Warning in coords.roc(roc_original, x = thresholds, input = "threshold", :
+## An upcoming version of pROC will set the 'transpose' argument to FALSE
+## by default. Set transpose = TRUE explicitly to keep the current behavior,
+## or transpose = FALSE to adopt the new one and silence this warning. Type
+## help(coords_transpose) for additional information.
+
## 1 - Max ROC diff: 0.8712113
## 1 - Mean ROC diff: 0.9506292
Another set of statistics is generated for prediction functions that return response levels.
Used as compare_stat parameter in xspline method.
Each function has attribute "higher-better".
If "higher-better" is TRUE then model with higher statistic value is treated as better one.
Environment in which newdata is stored (if not provided as parameter).
-
+
Details
The summary output depends strictly on data provided to it.
@@ -185,14 +184,14 @@
Details
1 - Maximum ROC difference$$1 - \max_{t \in T} ||ROC_{o}(t) - ROC_{s}(t)||_{2}$$ Calculates the maximum of Euclidean distances between ROC points for the specified threshold set T. In this implementation T is the union of breakpoints for each ROC curve.
1 - Mean ROC difference Above version using mean instead of max measure.
#> Setting levels: control = versicolor, case = virginica
#> Setting levels: control = versicolor, case = virginica
#> Warning: An upcoming version of pROC will set the 'transpose' argument to FALSE by default. Set transpose = TRUE explicitly to keep the current behavior, or transpose = FALSE to adopt the new one and silence this warning. Type help(coords_transpose) for additional information.
#> Warning: An upcoming version of pROC will set the 'transpose' argument to FALSE by default. Set transpose = TRUE explicitly to keep the current behavior, or transpose = FALSE to adopt the new one and silence this warning. Type help(coords_transpose) for additional information.
#> 1 - Max ROC diff: 0.56
#> 1 - Mean ROC diff: 0.8443484
# Prediction as final categoryresponse_rf<-function(object, newdata) predict(object, newdata=newdata)
response_xs<-function(object, newdata) {
@@ -302,9 +300,7 @@
While constructing formula interpreted by xspliner package, some parameters may be specified within xs(..) or xf(..) symbols.
Below are default parameters. See details in vignette("xspliner")
diff --git a/docs/reference/xspline-1.png b/docs/reference/xspline-1.png
index d704432..a27656d 100644
Binary files a/docs/reference/xspline-1.png and b/docs/reference/xspline-1.png differ
diff --git a/docs/reference/xspline-2.png b/docs/reference/xspline-2.png
index d704432..a27656d 100644
Binary files a/docs/reference/xspline-2.png and b/docs/reference/xspline-2.png differ
diff --git a/docs/reference/xspline-3.png b/docs/reference/xspline-3.png
index d704432..a27656d 100644
Binary files a/docs/reference/xspline-3.png and b/docs/reference/xspline-3.png differ
diff --git a/docs/reference/xspline.html b/docs/reference/xspline.html
index f10147f..760deae 100644
--- a/docs/reference/xspline.html
+++ b/docs/reference/xspline.html
@@ -13,6 +13,7 @@
+
@@ -23,6 +24,7 @@
+
@@ -95,7 +97,6 @@
Changelog
-
@@ -112,11 +113,9 @@
Builds predictive model based GLM.
-
The method provides main functionality on building GLM models with automatic variables transformation.
The transformations are based on specified single variable responses for selected black-box model.
See details in vignette("xspliner").
-
xspline(object, ...)
@@ -133,7 +132,7 @@
Builds predictive model based GLM.
# S3 method for explainerxspline(object, env=parent.frame(), ...)
-
+
Arguments
@@ -184,15 +183,13 @@
Ar
call are considered in transition.
-
+
Value
GLM object of class 'xspliner'.
-
Details
model_surrogate_xspliner is a wrapper of the xspline method that ensures consistency with the https://github.com/ModelOriented/DrWhy tools
-
Examples
# preparing blackbox model
@@ -204,7 +201,8 @@
Examp
# formula based xsplinerxs_iris<-xspline(
Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + xf(Species),
- model=rf_iris)
#> Registered S3 method overwritten by 'DALEX':
+#> method from
+#> print.description questionr
#> Welcome to DALEX (version: 0.4.7).
+#> Find examples and detailed introduction at: https://pbiecek.github.io/PM_VEE/
+#> Additional features will be available after installation of: ingredients, iBreakDown.
+#> Use 'install_dependencies()' to get all suggested dependencies
#> Preparation of a new explainer is initiated
+#> -> model label : randomForest (default)
+#> -> data : 150 rows 4 cols (extracted from the model)
+#> -> target variable : not specified! (WARNING)
+#> -> predict function : yhat.randomForest will be used (default)
+#> -> predicted values : numerical, min = 0.1991898 , mean = 1.199876 , max = 2.134114
+#> -> residual function : difference between y and yhat (default)
+#> A new explainer has been created!