diff --git a/DESCRIPTION b/DESCRIPTION index 2dd9565..1fa4344 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: xspliner Title: Assisted Model Building, using Surrogate Black-Box Models to Train Interpretable Spline Based Additive Models -Version: 0.0.3.9002 +Version: 0.0.3.9003 Authors@R: c( person("Krystian", "Igras", email = "krystian8207@gmail.com", role = c("aut", "cre")), person("Przemyslaw", "Biecek", role = c("aut", "ths"))) diff --git a/R/methods-xspliner.R b/R/methods-xspliner.R index 933cf53..0f38acf 100644 --- a/R/methods-xspliner.R +++ b/R/methods-xspliner.R @@ -14,16 +14,17 @@ #' @param compare_with Named list. Other models that should be compared with xspliner and \code{model}. #' @param n_plots Threshold for number of plots when plotting all variables. #' @param sort_by When comparing models determines according to which model should observations be ordered. +#' @param use_coeff If TRUE both PDP function and its approximation is scaled with corresponding surrogate model coefficient. #' @param prediction_funs Prediction functions that should be used in model comparison. #' @param ... Another arguments passed into model specific method. #' #' @export plot.xspliner <- function(x, variable_names = NULL, model = NULL, plot_response = TRUE, plot_approx = TRUE, - data = NULL, plot_data = FALSE, plot_deriv = FALSE, n_plots = 6, sort_by = NULL, + data = NULL, plot_data = FALSE, plot_deriv = FALSE, n_plots = 6, sort_by = NULL, use_coeff = TRUE, compare_with = list(), prediction_funs = list(function(object, newdata) predict(object, newdata)), ...) { if (is.null(model)) { - plot_variable_transition(x, variable_names, plot_response, plot_approx, data, plot_data, plot_deriv, n_plots) + plot_variable_transition(x, variable_names, plot_response, plot_approx, data, plot_data, plot_deriv, n_plots, use_coeff) } else { if (is.null(data)) { stop("Data must be provided.") diff --git a/R/utils-model.R b/R/utils-model.R index d83b9ba..1ca7723 100644 --- a/R/utils-model.R +++ b/R/utils-model.R @@ -10,7 +10,7 @@ specials <- function(model, type = "all") { } } -plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data, plot_data, plot_deriv) { +plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data, plot_data, plot_deriv, use_coeff) { if (plot_data && is.null(data)) { message("You can plot data points only when data parameter is provided.") plot_data <- FALSE @@ -21,6 +21,10 @@ plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data stop("You must specify at least one plot.") } transition_fun <- transition(x, variable_name, "function") + variable_coeff <- 1 + if (use_coeff) { + variable_coeff <- setNames(x$coefficients[-1], all.vars(x$call$formula)[-1])[variable_name] + } if (plot_data) { plot_range <- range(data[[variable_name]]) @@ -40,6 +44,7 @@ plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data } if (plot_response) { data <- transition(x, variable_name, "data") + data$yhat <- variable_coeff * data$yhat colnames(data)[colnames(data) == "yhat"] <- response_var names(color_values)[2] <- attr(data, "type") data$type <- attr(data, "type") @@ -47,7 +52,7 @@ plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data } if (plot_approx) { x_var <- seq(from = plot_range[1], to = plot_range[2], length.out = 50) - y_var <- transition_fun(x_var) + y_var <- variable_coeff * transition_fun(x_var) data <- data.frame(y_var, x_var) colnames(data) <- c(response_var, variable_name) data$type <- "approximation" @@ -56,7 +61,7 @@ plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data if (plot_deriv) { eps <- (plot_range[2] - plot_range[1]) / 500 x_var <- seq(from = plot_range[1], to = plot_range[2], length.out = 50)[-50] - y_var <- (transition_fun(x_var + eps) - transition_fun(x_var)) / eps + y_var <- variable_coeff * (transition_fun(x_var + eps) - transition_fun(x_var)) / eps data <- data.frame(y_var, x_var) colnames(data) <- c(response_var, variable_name) if (sum(to_plot) == 1) { @@ -116,6 +121,7 @@ utils::globalVariables(c("Observation", "Model", "Value")) #' @param plot_data If TRUE raw data is drawn. #' @param plot_deriv If TRUE derivative of approximation is showed on plot. #' @param n_plots Threshold for number of plots when plotting all variables. +#' @param use_coeff If TRUE both PDP function and its approximation is scaled with corresponding surrogate model coefficient. #' #' @examples #' library(randomForest) @@ -135,7 +141,7 @@ utils::globalVariables(c("Observation", "Model", "Value")) #' #' @export plot_variable_transition <- function(x, variable_names = NULL, plot_response = TRUE, plot_approx = TRUE, - data = NULL, plot_data = FALSE, plot_deriv = FALSE, n_plots = 6) { + data = NULL, plot_data = FALSE, plot_deriv = FALSE, n_plots = 6, use_coeff = TRUE) { if (is.null(variable_names)) { special_vars <- specials(x, "all") special_vars_to_plot <- special_vars[1:min(n_plots, length(special_vars))] @@ -152,7 +158,7 @@ plot_variable_transition <- function(x, variable_names = NULL, plot_response = T } else if (variable_names %in% specials(x, "qualitative")) { plot(transition(x, variable_names, "base")) } else { - plot_quantitative(x, variable_names, plot_response, plot_approx, data, plot_data, plot_deriv) + plot_quantitative(x, variable_names, plot_response, plot_approx, data, plot_data, plot_deriv, use_coeff) } } diff --git a/docs/articles/automation.html b/docs/articles/automation.html index 11e39e9..9b8f875 100644 --- a/docs/articles/automation.html +++ b/docs/articles/automation.html @@ -68,22 +68,22 @@
@@ -100,14 +100,13 @@ -automation.Rmd
In previous sections we learned how to specify parameters for the effect and transition of single variable. Let’s name them local parameters.
A quick example you can see below:
-library(xspliner)
-library(randomForest)
-
-rf_iris <- randomForest(Petal.Width ~ Sepal.Length + Petal.Length + Species, data = iris)
-model_xs <- xspline(Petal.Width ~
- Sepal.Length +
- xs(Petal.Length, effect = list(grid.resolution = 100), transition = list(bs = "cr")) +
- xf(Species, transition = list(stat = "loglikelihood", value = 4)),
- model = rf_iris)
-summary(model_xs)
library(xspliner)
+library(randomForest)
+
+rf_iris <- randomForest(Petal.Width ~ Sepal.Length + Petal.Length + Species, data = iris)
+model_xs <- xspline(Petal.Width ~
+ Sepal.Length +
+ xs(Petal.Length, effect = list(grid.resolution = 100), transition = list(bs = "cr")) +
+ xf(Species, transition = list(stat = "loglikelihood", value = 4)),
+ model = rf_iris)
+summary(model_xs)
##
## Call:
## stats::glm(formula = Petal.Width ~ Sepal.Length + xs(Petal.Length) +
@@ -142,23 +141,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67131 -0.06637 -0.02490 0.10319 0.46224
+## -0.66794 -0.06702 -0.02400 0.10130 0.45988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.38512 0.22444 -6.171 6.42e-09 ***
-## Sepal.Length 0.03075 0.03562 0.863 0.389
-## xs(Petal.Length) 1.98449 0.37852 5.243 5.49e-07 ***
-## xf(Species)versicolor -0.05853 0.19642 -0.298 0.766
-## xf(Species)virginica 0.25372 0.25383 1.000 0.319
+## (Intercept) -1.33910 0.21773 -6.150 7.14e-09 ***
+## Sepal.Length 0.03191 0.03548 0.899 0.370
+## xs(Petal.Length) 1.94344 0.37063 5.244 5.47e-07 ***
+## xf(Species)versicolor -0.05795 0.19628 -0.295 0.768
+## xf(Species)virginica 0.25921 0.25277 1.025 0.307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03095075)
+## (Dispersion parameter for gaussian family taken to be 0.03094921)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4879 on 145 degrees of freedom
-## AIC: -88.707
+## Residual deviance: 4.4876 on 145 degrees of freedom
+## AIC: -88.715
##
## Number of Fisher Scoring iterations: 2
When the black box model is based on higher amount of variables it can be problematic to specify local parameters for each predictor. Also formula becomes large and hard to read.
@@ -168,15 +167,15 @@Its more common that we use similar configuration for each variable, as it’s simple and allows us to build and experiment faster. To do this you can specify xs_opts
and xf_opts
of xspline
function. Let’s name them global parameters.
Each of global parameters can specify effect and transition, that should be used for xs and xf transformations respectively. Then you just need to use base xs and xf symbols without parameters:
-model_xs <- xspline(Petal.Width ~
- Sepal.Length +
- xs(Petal.Length) +
- xf(Species),
- model = rf_iris,
- xs_opts = list(effect = list(grid.resolution = 100), transition = list(bs = "cr")),
- xf_opts = list(transition = list(stat = "loglikelihood", value = 4))
-)
-summary(model_xs)
model_xs <- xspline(Petal.Width ~
+ Sepal.Length +
+ xs(Petal.Length) +
+ xf(Species),
+ model = rf_iris,
+ xs_opts = list(effect = list(grid.resolution = 100), transition = list(bs = "cr")),
+ xf_opts = list(transition = list(stat = "loglikelihood", value = 4))
+)
+summary(model_xs)
##
## Call:
## stats::glm(formula = Petal.Width ~ Sepal.Length + xs(Petal.Length) +
@@ -184,35 +183,35 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67131 -0.06637 -0.02490 0.10319 0.46224
+## -0.66794 -0.06702 -0.02400 0.10130 0.45988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.38512 0.22444 -6.171 6.42e-09 ***
-## Sepal.Length 0.03075 0.03562 0.863 0.389
-## xs(Petal.Length) 1.98449 0.37852 5.243 5.49e-07 ***
-## xf(Species)versicolor -0.05853 0.19642 -0.298 0.766
-## xf(Species)virginica 0.25372 0.25383 1.000 0.319
+## (Intercept) -1.33910 0.21773 -6.150 7.14e-09 ***
+## Sepal.Length 0.03191 0.03548 0.899 0.370
+## xs(Petal.Length) 1.94344 0.37063 5.244 5.47e-07 ***
+## xf(Species)versicolor -0.05795 0.19628 -0.295 0.768
+## xf(Species)virginica 0.25921 0.25277 1.025 0.307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03095075)
+## (Dispersion parameter for gaussian family taken to be 0.03094921)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4879 on 145 degrees of freedom
-## AIC: -88.707
+## Residual deviance: 4.4876 on 145 degrees of freedom
+## AIC: -88.715
##
## Number of Fisher Scoring iterations: 2
But still you can specify local parameters that override the global ones.
-model_xs <- xspline(Petal.Width ~
- xs(Sepal.Length, transition = list(k = 10)) +
- xs(Petal.Length) +
- xf(Species),
- model = rf_iris,
- xs_opts = list(effect = list(grid.resolution = 100), transition = list(bs = "cr")),
- xf_opts = list(transition = list(stat = "loglikelihood", value = 4))
-)
-summary(model_xs)
model_xs <- xspline(Petal.Width ~
+ xs(Sepal.Length, transition = list(k = 10)) +
+ xs(Petal.Length) +
+ xf(Species),
+ model = rf_iris,
+ xs_opts = list(effect = list(grid.resolution = 100), transition = list(bs = "cr")),
+ xf_opts = list(transition = list(stat = "loglikelihood", value = 4))
+)
+summary(model_xs)
##
## Call:
## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) +
@@ -220,33 +219,33 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67299 -0.07637 -0.02785 0.09571 0.46543
+## -0.67087 -0.07655 -0.02882 0.09801 0.46273
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.53462 0.27171 -5.648 8.29e-08 ***
-## xs(Sepal.Length) 0.32640 0.32394 1.008 0.315
-## xs(Petal.Length) 1.91718 0.39978 4.796 3.98e-06 ***
-## xf(Species)versicolor -0.03572 0.20077 -0.178 0.859
-## xf(Species)virginica 0.28561 0.26087 1.095 0.275
+## (Intercept) -1.45787 0.25283 -5.766 4.71e-08 ***
+## xs(Sepal.Length) 0.30133 0.29983 1.005 0.317
+## xs(Petal.Length) 1.88307 0.39366 4.784 4.19e-06 ***
+## xf(Species)versicolor -0.03702 0.20165 -0.184 0.855
+## xf(Species)virginica 0.28875 0.26096 1.107 0.270
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03089351)
+## (Dispersion parameter for gaussian family taken to be 0.03090658)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.4796 on 145 degrees of freedom
-## AIC: -88.985
+## Residual deviance: 4.4815 on 145 degrees of freedom
+## AIC: -88.922
##
## Number of Fisher Scoring iterations: 2
-In this case last_evaluation
variable will be transformed with thin plate regression spline (bs = "tp"
is default for mgcv::s
) with basis dimension equal to 10
. At the same time average_monthly_hours
will be transformed with cubic splines.
In this case last_evaluation
variable will be transformed with thin plate regression spline (bs = "tp"
is default for mgcv::s
) with basis dimension equal to 10
. At the same time average_monthly_hours
will be transformed with cubic splines.
What if you forget specifying local and global parameters?
As you can see in project objects reference, you may find there xs_opts_edfault
and xf_opts_default
objects. These objects specify default parameters for xs
and xf
transformations.
## $effect
## $effect$type
## [1] "pdp"
@@ -258,7 +257,7 @@
##
## $transition$monotonic
## [1] "not"
-
+
## $effect
## $effect$type
## [1] "ice"
@@ -286,11 +285,11 @@
Transform each formula predictor
If you want to transform each predictor of specified formula and not using xs and xf variables you can omit it using consider
parameter for xspline
function.
Possible values are "specials"
the default one and "all"
. For automatic transformation of each variable without specifying xs
and xf
symbols just set consider = "all"
and pass standard formula into xspline
function:
-model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
- model = rf_iris,
- consider = "all"
-)
-summary(model_xs)
+model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
+ model = rf_iris,
+ consider = "all"
+)
+summary(model_xs)
##
## Call:
## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) +
@@ -298,34 +297,34 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67583 -0.07503 -0.02606 0.09536 0.47475
+## -0.67392 -0.07678 -0.02782 0.09576 0.47261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 ***
-## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276
-## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 ***
-## xf(Species)versicolor 0.006079 0.194756 0.031 0.975
-## xf(Species)virginica 0.339118 0.253531 1.338 0.183
+## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 ***
+## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273
+## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 ***
+## xf(Species)versicolor 0.000951 0.196324 0.005 0.996
+## xf(Species)virginica 0.337681 0.254468 1.327 0.187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03104432)
+## (Dispersion parameter for gaussian family taken to be 0.03105078)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.5014 on 145 degrees of freedom
-## AIC: -88.255
+## Residual deviance: 4.5024 on 145 degrees of freedom
+## AIC: -88.223
##
## Number of Fisher Scoring iterations: 2
Then each predictor is transformed with xs and xf symbols and use of default parameters or global ones when specified.
xs
is used for integer and numeric variables - xf
for factors.
-By default xspline function tries to extract the data from model (rf_model
) call and xspline’s parent.frame()
environment then uses it to determine predictor types. So to be sure that variable types are sourced correctly a good practice here is to add data parameter, storing black box’s training data.
-model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
- model = rf_iris,
- data = iris,
- consider = "all"
-)
-summary(model_xs)
+By default xspline function tries to extract the data from model (rf_model
) call and xspline’s parent.frame()
environment then uses it to determine predictor types. So to be sure that variable types are sourced correctly a good practice here is to add data parameter, storing black box’s training data.
+model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
+ model = rf_iris,
+ data = iris,
+ consider = "all"
+)
+summary(model_xs)
##
## Call:
## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) +
@@ -333,23 +332,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67583 -0.07503 -0.02606 0.09536 0.47475
+## -0.67392 -0.07678 -0.02782 0.09576 0.47261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 ***
-## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276
-## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 ***
-## xf(Species)versicolor 0.006079 0.194756 0.031 0.975
-## xf(Species)virginica 0.339118 0.253531 1.338 0.183
+## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 ***
+## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273
+## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 ***
+## xf(Species)versicolor 0.000951 0.196324 0.005 0.996
+## xf(Species)virginica 0.337681 0.254468 1.327 0.187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03104432)
+## (Dispersion parameter for gaussian family taken to be 0.03105078)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.5014 on 145 degrees of freedom
-## AIC: -88.255
+## Residual deviance: 4.5024 on 145 degrees of freedom
+## AIC: -88.223
##
## Number of Fisher Scoring iterations: 2
In some cases you may want to transform only quantitative or qualitative predictors. Looking into default parameters xs_opts_default
and xf_opts_default
we may find alter
parameter for transition.
The parameter is used to specify if predictor for which xs
or xf
was specified needs to be transformed or used as a bare variable in formula. You can specify it in the local or global transition parameter. In this case using the global one is more reasonable.
So, in order to transform only continuous variables just set alter = "always"
(what is default) for xs_opts
and alter = "never"
for xf_opts
:
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
- model = rf_iris,
- data = iris,
- consider = "all",
- xf_opts = list(transition = list(alter = "never"))
-)
-summary(model_xs)
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
+ model = rf_iris,
+ data = iris,
+ consider = "all",
+ xf_opts = list(transition = list(alter = "never"))
+)
+summary(model_xs)
##
## Call:
## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) +
@@ -373,33 +372,33 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67583 -0.07503 -0.02606 0.09536 0.47475
+## -0.67392 -0.07678 -0.02782 0.09576 0.47261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 ***
-## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276
-## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 ***
-## Speciesversicolor 0.006079 0.194756 0.031 0.975
-## Speciesvirginica 0.339118 0.253531 1.338 0.183
+## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 ***
+## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273
+## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 ***
+## Speciesversicolor 0.000951 0.196324 0.005 0.996
+## Speciesvirginica 0.337681 0.254468 1.327 0.187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03104432)
+## (Dispersion parameter for gaussian family taken to be 0.03105078)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.5014 on 145 degrees of freedom
-## AIC: -88.255
+## Residual deviance: 4.5024 on 145 degrees of freedom
+## AIC: -88.223
##
## Number of Fisher Scoring iterations: 2
For transformation of factors only:
-model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
- model = rf_iris,
- data = iris,
- consider = "all",
- xs_opts = list(transition = list(alter = "never"))
-)
-summary(model_xs)
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
+ model = rf_iris,
+ data = iris,
+ consider = "all",
+ xs_opts = list(transition = list(alter = "never"))
+)
+summary(model_xs)
##
## Call:
## stats::glm(formula = Petal.Width ~ Sepal.Length + Petal.Length +
@@ -432,8 +431,8 @@
Model based dot formula
For many existing models in R we usually can specify “dot formulas”, when used predictors are sourced from provided data. xspliner can also handle the form. Let’s return here for iris random forest model.
-
+
##
## Call:
## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) +
@@ -441,23 +440,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67583 -0.07503 -0.02606 0.09536 0.47475
+## -0.67392 -0.07678 -0.02782 0.09576 0.47261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 ***
-## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276
-## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 ***
-## xf(Species)versicolor 0.006079 0.194756 0.031 0.975
-## xf(Species)virginica 0.339118 0.253531 1.338 0.183
+## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 ***
+## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273
+## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 ***
+## xf(Species)versicolor 0.000951 0.196324 0.005 0.996
+## xf(Species)virginica 0.337681 0.254468 1.327 0.187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03104432)
+## (Dispersion parameter for gaussian family taken to be 0.03105078)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.5014 on 145 degrees of freedom
-## AIC: -88.255
+## Residual deviance: 4.5024 on 145 degrees of freedom
+## AIC: -88.223
##
## Number of Fisher Scoring iterations: 2
Good practice here is to provide data
parameter as well to detect predictors classes, and model type (classification or regression).
@@ -469,11 +468,11 @@
xspline provided data parameter, excluding formula response
To assure correct predictors usage, you may also specify predictor names vector in predictors
parameter, and data (optional) to assure source of variable classes:
-model_xs <- xspline(Petal.Width ~ .,
- model = rf_iris,
- predictors = colnames(iris)[-c(2, 4)],
- data = iris)
-summary(model_xs)
+model_xs <- xspline(Petal.Width ~ .,
+ model = rf_iris,
+ predictors = colnames(iris)[-c(2, 4)],
+ data = iris)
+summary(model_xs)
##
## Call:
## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) +
@@ -481,23 +480,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67583 -0.07503 -0.02606 0.09536 0.47475
+## -0.67392 -0.07678 -0.02782 0.09576 0.47261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 ***
-## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276
-## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 ***
-## xf(Species)versicolor 0.006079 0.194756 0.031 0.975
-## xf(Species)virginica 0.339118 0.253531 1.338 0.183
+## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 ***
+## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273
+## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 ***
+## xf(Species)versicolor 0.000951 0.196324 0.005 0.996
+## xf(Species)virginica 0.337681 0.254468 1.327 0.187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03104432)
+## (Dispersion parameter for gaussian family taken to be 0.03105078)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.5014 on 145 degrees of freedom
-## AIC: -88.255
+## Residual deviance: 4.5024 on 145 degrees of freedom
+## AIC: -88.223
##
## Number of Fisher Scoring iterations: 2
In above examples each predictor is transformed by default. You can exclude needed, by specifying global alter = "never"
parameters, or bare
.
@@ -509,8 +508,8 @@
Omit formula
As we could see in previous section, using dot formula the predictors are sourced from provided black box. Why cannot we fully extract formula from black box? We can.
Let’s use previously built rf_iris
model:
-
+
##
## Call:
## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) +
@@ -518,23 +517,23 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -0.67583 -0.07503 -0.02606 0.09536 0.47475
+## -0.67392 -0.07678 -0.02782 0.09576 0.47261
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 ***
-## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276
-## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 ***
-## xf(Species)versicolor 0.006079 0.194756 0.031 0.975
-## xf(Species)virginica 0.339118 0.253531 1.338 0.183
+## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 ***
+## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273
+## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 ***
+## xf(Species)versicolor 0.000951 0.196324 0.005 0.996
+## xf(Species)virginica 0.337681 0.254468 1.327 0.187
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 0.03104432)
+## (Dispersion parameter for gaussian family taken to be 0.03105078)
##
## Null deviance: 86.5699 on 149 degrees of freedom
-## Residual deviance: 4.5014 on 145 degrees of freedom
-## AIC: -88.255
+## Residual deviance: 4.5024 on 145 degrees of freedom
+## AIC: -88.223
##
## Number of Fisher Scoring iterations: 2
Works! Can it be simpler? Actually not because of black box based transformation and theory, but we can provide some model based parameters upfront using DALEX’s explainer
object (see next section).
@@ -549,24 +548,24 @@
Excluding predictors from transformation
For this example consider again Boston Housing Data from pdp package, and build simple svm
model for predicting chas
variable:
-library(pdp)
-library(e1071)
-data(boston)
-svm_boston <- svm(chas ~ cmedv + rad + lstat, data = boston, probability = TRUE)
-str(boston[, "rad"])
+library(pdp)
+library(e1071)
+data(boston)
+svm_boston <- svm(chas ~ cmedv + rad + lstat, data = boston, probability = TRUE)
+str(boston[, "rad"])
## int [1:506] 1 2 2 3 3 3 5 5 5 5 ...
-
+unique(boston$rad)
## [1] 1 2 3 5 4 8 6 7 24
As we can see rad
variable is integer and has only 9 unique values. As a result spline approximation may be misleading, and not possible to perform. We decide here to omit rad
variable when performing transformation, nevertheless remaining predictors should be transformed.
At first setup model based transformation:
-
+xs_model <- xspline(svm_boston)
## Error in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): A term has fewer unique covariate combinations than specified maximum degrees of freedom
As we can see, the error was returned due to the form of rad
variable.
How to exclude rad
from transformation? We can use xspline’s bare
parameter, responsible for specifying predictor which shouldn’t be transformed.
-
+
##
## Call:
## stats::glm(formula = chas ~ xs(cmedv) + rad + xs(lstat), family = family,
@@ -578,10 +577,10 @@
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
-## (Intercept) 51.7328439 62.0866827 0.833 0.40471
-## xs(cmedv) 48.8176925 18.8811816 2.586 0.00972 **
-## rad -0.0008446 0.0214853 -0.039 0.96864
-## xs(lstat) -7.1956058 59.0029904 -0.122 0.90294
+## (Intercept) -7.666e+01 8.474e+01 -0.905 0.36563
+## xs(cmedv) -6.667e+01 2.579e+01 -2.586 0.00972 **
+## rad -8.446e-04 2.149e-02 -0.039 0.96864
+## xs(lstat) 9.827e+00 8.058e+01 0.122 0.90294
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
@@ -602,21 +601,21 @@
Integration with DALEX
As mentioned in the previous section, xspliner is integrated with DALEX package. The main function from the package explain
returns useful black box data (such as bare black model or training data) that can be used by xspline function.
Just check below example
-library(DALEX)
-rf_boston <- randomForest(lstat ~ cmedv + crim + chas, data = boston)
-explainer <- explain(rf_boston, label = "boston")
+library(DALEX)
+rf_boston <- randomForest(lstat ~ cmedv + crim + chas, data = boston)
+explainer <- explain(rf_boston, label = "boston")
## Preparation of a new explainer is initiated
## -> model label : boston
## -> data : 506 rows 4 cols ([33mextracted from the model[39m)
## -> target variable : not specified! ([31mWARNING[39m)
## -> predict function : yhat.randomForest will be used ([33mdefault[39m)
-## -> predicted values : numerical, min = 6.123694 , mean = 12.65864 , max = 24.733
+## -> predicted values : numerical, min = 5.876439 , mean = 12.65189 , max = 24.95396
## -> residual function : difference between y and yhat ([33mdefault[39m)
## [32mA new explainer has been created![39m
-
+
##
## Call:
## stats::glm(formula = lstat ~ xs(cmedv) + xs(crim) + xf(chas),
@@ -624,22 +623,22 @@
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
-## -11.5314 -2.4027 -0.6677 1.7436 21.1974
+## -11.5003 -2.3983 -0.6648 1.7420 21.2034
##
## Coefficients:
-## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -16.29727 1.44377 -11.288 < 2e-16 ***
-## xs(cmedv) 1.69440 0.07224 23.454 < 2e-16 ***
-## xs(crim) 0.58730 0.14448 4.065 5.57e-05 ***
-## xf(chas)1 1.41799 0.69509 2.040 0.0419 *
+## Estimate Std. Error t value Pr(>|t|)
+## (Intercept) -15.4544 1.3259 -11.656 < 2e-16 ***
+## xs(cmedv) 1.6853 0.0719 23.439 < 2e-16 ***
+## xs(crim) 0.5284 0.1337 3.951 8.9e-05 ***
+## xf(chas)1 1.4136 0.6954 2.033 0.0426 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
-## (Dispersion parameter for gaussian family taken to be 15.4412)
+## (Dispersion parameter for gaussian family taken to be 15.45128)
##
## Null deviance: 25752.4 on 505 degrees of freedom
-## Residual deviance: 7751.5 on 502 degrees of freedom
-## AIC: 2826.9
+## Residual deviance: 7756.5 on 502 degrees of freedom
+## AIC: 2827.2
##
## Number of Fisher Scoring iterations: 2
You can provide your own xspline’s parameters that overwrite that sourced from explainer.
@@ -679,19 +678,16 @@