diff --git a/DESCRIPTION b/DESCRIPTION index 2dd9565..1fa4344 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: xspliner Title: Assisted Model Building, using Surrogate Black-Box Models to Train Interpretable Spline Based Additive Models -Version: 0.0.3.9002 +Version: 0.0.3.9003 Authors@R: c( person("Krystian", "Igras", email = "krystian8207@gmail.com", role = c("aut", "cre")), person("Przemyslaw", "Biecek", role = c("aut", "ths"))) diff --git a/R/methods-xspliner.R b/R/methods-xspliner.R index 933cf53..0f38acf 100644 --- a/R/methods-xspliner.R +++ b/R/methods-xspliner.R @@ -14,16 +14,17 @@ #' @param compare_with Named list. Other models that should be compared with xspliner and \code{model}. #' @param n_plots Threshold for number of plots when plotting all variables. #' @param sort_by When comparing models determines according to which model should observations be ordered. +#' @param use_coeff If TRUE both PDP function and its approximation is scaled with corresponding surrogate model coefficient. #' @param prediction_funs Prediction functions that should be used in model comparison. #' @param ... Another arguments passed into model specific method. #' #' @export plot.xspliner <- function(x, variable_names = NULL, model = NULL, plot_response = TRUE, plot_approx = TRUE, - data = NULL, plot_data = FALSE, plot_deriv = FALSE, n_plots = 6, sort_by = NULL, + data = NULL, plot_data = FALSE, plot_deriv = FALSE, n_plots = 6, sort_by = NULL, use_coeff = TRUE, compare_with = list(), prediction_funs = list(function(object, newdata) predict(object, newdata)), ...) 
{ if (is.null(model)) { - plot_variable_transition(x, variable_names, plot_response, plot_approx, data, plot_data, plot_deriv, n_plots) + plot_variable_transition(x, variable_names, plot_response, plot_approx, data, plot_data, plot_deriv, n_plots, use_coeff) } else { if (is.null(data)) { stop("Data must be provided.") diff --git a/R/utils-model.R b/R/utils-model.R index d83b9ba..1ca7723 100644 --- a/R/utils-model.R +++ b/R/utils-model.R @@ -10,7 +10,7 @@ specials <- function(model, type = "all") { } } -plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data, plot_data, plot_deriv) { +plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data, plot_data, plot_deriv, use_coeff) { if (plot_data && is.null(data)) { message("You can plot data points only when data parameter is provided.") plot_data <- FALSE @@ -21,6 +21,10 @@ plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data stop("You must specify at least one plot.") } transition_fun <- transition(x, variable_name, "function") + variable_coeff <- 1 + if (use_coeff) { + variable_coeff <- setNames(x$coefficients[-1], all.vars(x$call$formula)[-1])[variable_name] + } if (plot_data) { plot_range <- range(data[[variable_name]]) @@ -40,6 +44,7 @@ plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data } if (plot_response) { data <- transition(x, variable_name, "data") + data$yhat <- variable_coeff * data$yhat colnames(data)[colnames(data) == "yhat"] <- response_var names(color_values)[2] <- attr(data, "type") data$type <- attr(data, "type") @@ -47,7 +52,7 @@ plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data } if (plot_approx) { x_var <- seq(from = plot_range[1], to = plot_range[2], length.out = 50) - y_var <- transition_fun(x_var) + y_var <- variable_coeff * transition_fun(x_var) data <- data.frame(y_var, x_var) colnames(data) <- c(response_var, variable_name) data$type <- "approximation" @@ 
-56,7 +61,7 @@ plot_quantitative <- function(x, variable_name, plot_response, plot_approx, data if (plot_deriv) { eps <- (plot_range[2] - plot_range[1]) / 500 x_var <- seq(from = plot_range[1], to = plot_range[2], length.out = 50)[-50] - y_var <- (transition_fun(x_var + eps) - transition_fun(x_var)) / eps + y_var <- variable_coeff * (transition_fun(x_var + eps) - transition_fun(x_var)) / eps data <- data.frame(y_var, x_var) colnames(data) <- c(response_var, variable_name) if (sum(to_plot) == 1) { @@ -116,6 +121,7 @@ utils::globalVariables(c("Observation", "Model", "Value")) #' @param plot_data If TRUE raw data is drawn. #' @param plot_deriv If TRUE derivative of approximation is showed on plot. #' @param n_plots Threshold for number of plots when plotting all variables. +#' @param use_coeff If TRUE both PDP function and its approximation is scaled with corresponding surrogate model coefficient. #' #' @examples #' library(randomForest) @@ -135,7 +141,7 @@ utils::globalVariables(c("Observation", "Model", "Value")) #' #' @export plot_variable_transition <- function(x, variable_names = NULL, plot_response = TRUE, plot_approx = TRUE, - data = NULL, plot_data = FALSE, plot_deriv = FALSE, n_plots = 6) { + data = NULL, plot_data = FALSE, plot_deriv = FALSE, n_plots = 6, use_coeff = TRUE) { if (is.null(variable_names)) { special_vars <- specials(x, "all") special_vars_to_plot <- special_vars[1:min(n_plots, length(special_vars))] @@ -152,7 +158,7 @@ plot_variable_transition <- function(x, variable_names = NULL, plot_response = T } else if (variable_names %in% specials(x, "qualitative")) { plot(transition(x, variable_names, "base")) } else { - plot_quantitative(x, variable_names, plot_response, plot_approx, data, plot_data, plot_deriv) + plot_quantitative(x, variable_names, plot_response, plot_approx, data, plot_data, plot_deriv, use_coeff) } } diff --git a/docs/articles/automation.html b/docs/articles/automation.html index 11e39e9..9b8f875 100644 --- 
a/docs/articles/automation.html +++ b/docs/articles/automation.html @@ -68,22 +68,22 @@ @@ -100,14 +100,13 @@ -

Default transition and effect

As you can see in the project objects reference, there are xs_opts_default and xf_opts_default objects. These objects specify default parameters for xs and xf transformations.

- +
xs_opts_default
## $effect
 ## $effect$type
 ## [1] "pdp"
@@ -258,7 +257,7 @@ 

## ## $transition$monotonic ## [1] "not"

- +
xf_opts_default
## $effect
 ## $effect$type
 ## [1] "ice"
@@ -286,11 +285,11 @@ 

Transform each formula predictor

If you want to transform each predictor of the specified formula without using the xs and xf symbols, you can omit them by using the consider parameter of the xspline function.

Possible values are "specials" the default one and "all". For automatic transformation of each variable without specifying xs and xf symbols just set consider = "all" and pass standard formula into xspline function:

-
model_xs <- xspline(Petal.Width ~ Sepal.Length  + Petal.Length + Species,
-  model = rf_iris,
-  consider = "all"
-)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ Sepal.Length  + Petal.Length + Species,
+  model = rf_iris,
+  consider = "all"
+)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
@@ -298,34 +297,34 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67583 -0.07503 -0.02606 0.09536 0.47475 +## -0.67392 -0.07678 -0.02782 0.09576 0.47261 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 *** -## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276 -## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 *** -## xf(Species)versicolor 0.006079 0.194756 0.031 0.975 -## xf(Species)virginica 0.339118 0.253531 1.338 0.183 +## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 *** +## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273 +## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 *** +## xf(Species)versicolor 0.000951 0.196324 0.005 0.996 +## xf(Species)virginica 0.337681 0.254468 1.327 0.187 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03104432) +## (Dispersion parameter for gaussian family taken to be 0.03105078) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.5014 on 145 degrees of freedom -## AIC: -88.255 +## Residual deviance: 4.5024 on 145 degrees of freedom +## AIC: -88.223 ## ## Number of Fisher Scoring iterations: 2

Then each predictor is transformed with xs and xf symbols and use of default parameters or global ones when specified.

xs is used for integer and numeric variables - xf for factors.

-

By default xspline function tries to extract the data from model (rf_model) call and xspline’s parent.frame() environment then uses it to determine predictor types. So to be sure that variable types are sourced correctly a good practice here is to add data parameter, storing black box’s training data.

-
model_xs <- xspline(Petal.Width ~ Sepal.Length  + Petal.Length + Species,
-  model = rf_iris,
-  data = iris,
-  consider = "all"
-)
-summary(model_xs)
+

By default xspline function tries to extract the data from model (rf_model) call and xspline’s parent.frame() environment then uses it to determine predictor types. So to be sure that variable types are sourced correctly a good practice here is to add data parameter, storing black box’s training data.

+
model_xs <- xspline(Petal.Width ~ Sepal.Length  + Petal.Length + Species,
+  model = rf_iris,
+  data = iris,
+  consider = "all"
+)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
@@ -333,23 +332,23 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67583 -0.07503 -0.02606 0.09536 0.47475 +## -0.67392 -0.07678 -0.02782 0.09576 0.47261 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 *** -## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276 -## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 *** -## xf(Species)versicolor 0.006079 0.194756 0.031 0.975 -## xf(Species)virginica 0.339118 0.253531 1.338 0.183 +## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 *** +## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273 +## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 *** +## xf(Species)versicolor 0.000951 0.196324 0.005 0.996 +## xf(Species)virginica 0.337681 0.254468 1.327 0.187 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03104432) +## (Dispersion parameter for gaussian family taken to be 0.03105078) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.5014 on 145 degrees of freedom -## AIC: -88.255 +## Residual deviance: 4.5024 on 145 degrees of freedom +## AIC: -88.223 ## ## Number of Fisher Scoring iterations: 2

@@ -359,13 +358,13 @@

In some cases you may want to transform only quantitative or qualitative predictors. Looking into default parameters xs_opts_default and xf_opts_default we may find alter parameter for transition.

The parameter is used to specify if predictor for which xs or xf was specified needs to be transformed or used as a bare variable in formula. You can specify it in the local or global transition parameter. In this case using the global one is more reasonable.

So, in order to transform only continuous variables, just set alter = "always" (which is the default) for xs_opts and alter = "never" for xf_opts:

-
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
-  model = rf_iris,
-  data = iris,
-  consider = "all",
-  xf_opts = list(transition = list(alter = "never"))
-)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
+  model = rf_iris,
+  data = iris,
+  consider = "all",
+  xf_opts = list(transition = list(alter = "never"))
+)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
@@ -373,33 +372,33 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67583 -0.07503 -0.02606 0.09536 0.47475 +## -0.67392 -0.07678 -0.02782 0.09576 0.47261 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 *** -## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276 -## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 *** -## Speciesversicolor 0.006079 0.194756 0.031 0.975 -## Speciesvirginica 0.339118 0.253531 1.338 0.183 +## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 *** +## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273 +## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 *** +## Speciesversicolor 0.000951 0.196324 0.005 0.996 +## Speciesvirginica 0.337681 0.254468 1.327 0.187 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03104432) +## (Dispersion parameter for gaussian family taken to be 0.03105078) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.5014 on 145 degrees of freedom -## AIC: -88.255 +## Residual deviance: 4.5024 on 145 degrees of freedom +## AIC: -88.223 ## ## Number of Fisher Scoring iterations: 2

For transformation of factors only:

-
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
-  model = rf_iris,
-  data = iris,
-  consider = "all",
-  xs_opts = list(transition = list(alter = "never"))
-)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ Sepal.Length + Petal.Length + Species,
+  model = rf_iris,
+  data = iris,
+  consider = "all",
+  xs_opts = list(transition = list(alter = "never"))
+)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ Sepal.Length + Petal.Length + 
@@ -432,8 +431,8 @@ 

Model based dot formula

For many existing models in R we usually can specify “dot formulas”, when used predictors are sourced from provided data. xspliner can also handle the form. Let’s return here for iris random forest model.

-
model_xs <- xspline(Petal.Width ~ ., model = rf_iris)
-summary(model_xs)
+
model_xs <- xspline(Petal.Width ~ ., model = rf_iris)
+summary(model_xs)
## 
 ## Call:
 ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
@@ -441,23 +440,23 @@ 

## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67583 -0.07503 -0.02606 0.09536 0.47475 +## -0.67392 -0.07678 -0.02782 0.09576 0.47261 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 *** -## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276 -## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 *** -## xf(Species)versicolor 0.006079 0.194756 0.031 0.975 -## xf(Species)virginica 0.339118 0.253531 1.338 0.183 +## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 *** +## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273 +## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 *** +## xf(Species)versicolor 0.000951 0.196324 0.005 0.996 +## xf(Species)virginica 0.337681 0.254468 1.327 0.187 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03104432) +## (Dispersion parameter for gaussian family taken to be 0.03105078) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.5014 on 145 degrees of freedom -## AIC: -88.255 +## Residual deviance: 4.5024 on 145 degrees of freedom +## AIC: -88.223 ## ## Number of Fisher Scoring iterations: 2

Good practice here is to provide data parameter as well to detect predictors classes, and model type (classification or regression).

@@ -469,11 +468,11 @@

  • xspline provided data parameter, excluding formula response
  • To assure correct predictors usage, you may also specify predictor names vector in predictors parameter, and data (optional) to assure source of variable classes:

    - +
    model_xs <- xspline(Petal.Width ~ ., 
    +                    model = rf_iris,
    +                    predictors = colnames(iris)[-c(2, 4)],
    +                    data = iris)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
    @@ -481,23 +480,23 @@ 

    ## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67583 -0.07503 -0.02606 0.09536 0.47475 +## -0.67392 -0.07678 -0.02782 0.09576 0.47261 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 *** -## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276 -## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 *** -## xf(Species)versicolor 0.006079 0.194756 0.031 0.975 -## xf(Species)virginica 0.339118 0.253531 1.338 0.183 +## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 *** +## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273 +## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 *** +## xf(Species)versicolor 0.000951 0.196324 0.005 0.996 +## xf(Species)virginica 0.337681 0.254468 1.327 0.187 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03104432) +## (Dispersion parameter for gaussian family taken to be 0.03105078) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.5014 on 145 degrees of freedom -## AIC: -88.255 +## Residual deviance: 4.5024 on 145 degrees of freedom +## AIC: -88.223 ## ## Number of Fisher Scoring iterations: 2

    In the above examples each predictor is transformed by default. You can exclude selected predictors from transformation by specifying the global alter = "never" parameter, or by using the bare parameter.

    @@ -509,8 +508,8 @@

    Omit formula

    As we could see in previous section, using dot formula the predictors are sourced from provided black box. Why cannot we fully extract formula from black box? We can.

    Let’s use previously built rf_iris model:

    -
    model_xs <- xspline(rf_iris)
    -summary(model_xs)
    +
    model_xs <- xspline(rf_iris)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + 
    @@ -518,23 +517,23 @@ 

    ## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -0.67583 -0.07503 -0.02606 0.09536 0.47475 +## -0.67392 -0.07678 -0.02782 0.09576 0.47261 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -1.515807 0.274254 -5.527 1.47e-07 *** -## xs(Sepal.Length) 0.356926 0.326612 1.093 0.276 -## xs(Petal.Length) 1.842752 0.390148 4.723 5.43e-06 *** -## xf(Species)versicolor 0.006079 0.194756 0.031 0.975 -## xf(Species)virginica 0.339118 0.253531 1.338 0.183 +## (Intercept) -1.446117 0.255595 -5.658 7.92e-08 *** +## xs(Sepal.Length) 0.333286 0.303110 1.100 0.273 +## xs(Petal.Length) 1.815507 0.385453 4.710 5.74e-06 *** +## xf(Species)versicolor 0.000951 0.196324 0.005 0.996 +## xf(Species)virginica 0.337681 0.254468 1.327 0.187 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 0.03104432) +## (Dispersion parameter for gaussian family taken to be 0.03105078) ## ## Null deviance: 86.5699 on 149 degrees of freedom -## Residual deviance: 4.5014 on 145 degrees of freedom -## AIC: -88.255 +## Residual deviance: 4.5024 on 145 degrees of freedom +## AIC: -88.223 ## ## Number of Fisher Scoring iterations: 2

    Works! Can it be simpler? Actually not because of black box based transformation and theory, but we can provide some model based parameters upfront using DALEX’s explainer object (see next section).

    @@ -549,24 +548,24 @@

    Excluding predictors from transformation

    For this example consider again Boston Housing Data from pdp package, and build simple svm model for predicting chas variable:

    -
    library(pdp)
    -library(e1071)
    -data(boston)
    -svm_boston <- svm(chas ~ cmedv + rad + lstat, data = boston, probability = TRUE)
    -str(boston[, "rad"])
    +
    library(pdp)
    +library(e1071)
    +data(boston)
    +svm_boston <- svm(chas ~ cmedv + rad + lstat, data = boston, probability = TRUE)
    +str(boston[, "rad"])
    ##  int [1:506] 1 2 2 3 3 3 5 5 5 5 ...
    -
    unique(boston$rad)
    +
    unique(boston$rad)
    ## [1]  1  2  3  5  4  8  6  7 24

    As we can see, the rad variable is an integer and has only 9 unique values. As a result, spline approximation may be misleading, or even impossible to perform. We decide here to omit the rad variable when performing transformation; nevertheless, the remaining predictors should be transformed.

    At first setup model based transformation:

    - +
    xs_model <- xspline(svm_boston)
    ## Error in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): A term has fewer unique covariate combinations than specified maximum degrees of freedom

    As we can see, the error was returned due to the form of rad variable.

    How to exclude rad from transformation? We can use xspline’s bare parameter, responsible for specifying predictor which shouldn’t be transformed.

    - +
    xs_model <- xspline(
    +  svm_boston,
    +  bare = "rad")
    +summary(xs_model)
    ## 
     ## Call:
     ## stats::glm(formula = chas ~ xs(cmedv) + rad + xs(lstat), family = family, 
    @@ -578,10 +577,10 @@ 

    ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) -## (Intercept) 51.7328439 62.0866827 0.833 0.40471 -## xs(cmedv) 48.8176925 18.8811816 2.586 0.00972 ** -## rad -0.0008446 0.0214853 -0.039 0.96864 -## xs(lstat) -7.1956058 59.0029904 -0.122 0.90294 +## (Intercept) -7.666e+01 8.474e+01 -0.905 0.36563 +## xs(cmedv) -6.667e+01 2.579e+01 -2.586 0.00972 ** +## rad -8.446e-04 2.149e-02 -0.039 0.96864 +## xs(lstat) 9.827e+00 8.058e+01 0.122 0.90294 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## @@ -602,21 +601,21 @@

    Integration with DALEX

    As mentioned in the previous section, xspliner is integrated with DALEX package. The main function from the package explain returns useful black box data (such as bare black model or training data) that can be used by xspline function.

    Just check below example

    -
    library(DALEX)
    -rf_boston <- randomForest(lstat ~ cmedv + crim + chas, data = boston)
    -explainer <- explain(rf_boston, label = "boston")
    +
    library(DALEX)
    +rf_boston <- randomForest(lstat ~ cmedv + crim + chas, data = boston)
    +explainer <- explain(rf_boston, label = "boston")
    ## Preparation of a new explainer is initiated
     ##   -> model label       :  boston 
     ##   -> data              :  506  rows  4  cols ([33mextracted from the model[39m)
     ##   -> target variable   :  not specified! ([31mWARNING[39m)
     ##   -> predict function  :  yhat.randomForest  will be used ([33mdefault[39m)
    -##   -> predicted values  :  numerical, min =  6.123694 , mean =  12.65864 , max =  24.733  
    +##   -> predicted values  :  numerical, min =  5.876439 , mean =  12.65189 , max =  24.95396  
     ##   -> residual function :  difference between y and yhat ([33mdefault[39m)
     ## [32mA new explainer has been created![39m
    - +
    model <- xspline(
    +  explainer
    +)
    +summary(model)
    ## 
     ## Call:
     ## stats::glm(formula = lstat ~ xs(cmedv) + xs(crim) + xf(chas), 
    @@ -624,22 +623,22 @@ 

    ## ## Deviance Residuals: ## Min 1Q Median 3Q Max -## -11.5314 -2.4027 -0.6677 1.7436 21.1974 +## -11.5003 -2.3983 -0.6648 1.7420 21.2034 ## ## Coefficients: -## Estimate Std. Error t value Pr(>|t|) -## (Intercept) -16.29727 1.44377 -11.288 < 2e-16 *** -## xs(cmedv) 1.69440 0.07224 23.454 < 2e-16 *** -## xs(crim) 0.58730 0.14448 4.065 5.57e-05 *** -## xf(chas)1 1.41799 0.69509 2.040 0.0419 * +## Estimate Std. Error t value Pr(>|t|) +## (Intercept) -15.4544 1.3259 -11.656 < 2e-16 *** +## xs(cmedv) 1.6853 0.0719 23.439 < 2e-16 *** +## xs(crim) 0.5284 0.1337 3.951 8.9e-05 *** +## xf(chas)1 1.4136 0.6954 2.033 0.0426 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## (Dispersion parameter for gaussian family taken to be 15.4412) +## (Dispersion parameter for gaussian family taken to be 15.45128) ## ## Null deviance: 25752.4 on 505 degrees of freedom -## Residual deviance: 7751.5 on 502 degrees of freedom -## AIC: 2826.9 +## Residual deviance: 7756.5 on 502 degrees of freedom +## AIC: 2827.2 ## ## Number of Fisher Scoring iterations: 2

    You can provide your own xspline parameters that override those sourced from the explainer.

    @@ -679,19 +678,16 @@

    -
    - diff --git a/docs/articles/cases.html b/docs/articles/cases.html index 4e525db..d9eb579 100644 --- a/docs/articles/cases.html +++ b/docs/articles/cases.html @@ -68,22 +68,22 @@ @@ -100,14 +100,13 @@ -
    -

    Developed by Krystian Igras, Przemyslaw Biecek. Site built by pkgdown.

    -
    - diff --git a/docs/articles/cases_files/figure-html/unnamed-chunk-2-1.png b/docs/articles/cases_files/figure-html/unnamed-chunk-2-1.png index 65da31d..bca9ec4 100644 Binary files a/docs/articles/cases_files/figure-html/unnamed-chunk-2-1.png and b/docs/articles/cases_files/figure-html/unnamed-chunk-2-1.png differ diff --git a/docs/articles/discrete.html b/docs/articles/discrete.html index 95df07d..90949eb 100644 --- a/docs/articles/discrete.html +++ b/docs/articles/discrete.html @@ -68,22 +68,22 @@ @@ -100,14 +100,13 @@ -
    @@ -168,13 +167,13 @@

    In order to customize variable transition, just specify parameters (inherited from the above functions) inside the transition parameter of the xf formula symbol. For example, to use the “fast-adaptive” method for merging groups, with the optimal partition at a GIC statistic value of 4, we set:

    xf(salary, transition = list(method = "fast-adaptive", value = 4))

    In below example, we will transform salary predictor with cutting of GIC statistics at value = 2. As in continuous case we need to use the formula within xspline function:

    - +
    library(xspliner)
    +model_xs <- xspline(
    +  average_montly_hours ~ last_evaluation + xf(salary, transition = list(value = 2)) + satisfaction_level,
    +  model = model_rf
    +)
    +
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = average_montly_hours ~ last_evaluation + 
    @@ -202,13 +201,13 @@ 

    ## Number of Fisher Scoring iterations: 2

    Checking out the model summary, we can realize that “low” and “medium” values were merged into single level (generating “lowmedium” level).

    It can be also found by:

    -
    summary(model_xs, "salary")
    +
    summary(model_xs, "salary")
    ##     orig      pred
     ## 1   high      high
     ## 2    low lowmedium
     ## 3 medium lowmedium

    The graphical result is fully sourced from factorMerger. It is enough to run:

    -
    plot_variable_transition(model_xs, "salary")
    +
    plot_variable_transition(model_xs, "salary")

    @@ -216,19 +215,19 @@

    Quantitative predictors

    xspliner can work with classification problems as well. As the final GLM model can work only with binary classification, the only limit here is the number of levels for predicted value (equals to 2).

    -

    Let’s check below example based on SVM algorithm (e1071::svm), and modified iris data.

    +

    Let’s check below example based on SVM algorithm (e1071::svm), and modified iris data.

    Preparing data (we drop “setosa” level on Species value):

    -
    iris_data <- droplevels(iris[iris$Species != "setosa", ])
    +
    iris_data <- droplevels(iris[iris$Species != "setosa", ])

    Building SVM:

    -
    library(e1071) 
    -library(xspliner)
    -model_svm <- svm(Species ~  Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, 
    -                 data = iris_data, probability = TRUE)
    +
    library(e1071) 
    +library(xspliner)
    +model_svm <- svm(Species ~  Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, 
    +                 data = iris_data, probability = TRUE)

    When the base model response variable is of class factor (or integer with two unique values) then xspliner automatically detects classification problem. To force specific model response distribution you can set family and link parameters. In this case we can use xspliner in standard way.

    As each predictor is continuous variable, let’s transform it with xs usage on standard parameters, and build the model:

    -
    model_xs <- xspline(Species ~  xs(Sepal.Length) + xs(Sepal.Width) + xs(Petal.Length) + xs(Petal.Width),
    -                    model = model_svm)
    -summary(model_xs)
    +
    model_xs <- xspline(Species ~  xs(Sepal.Length) + xs(Sepal.Width) + xs(Petal.Length) + xs(Petal.Width),
    +                    model = model_svm)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Species ~ xs(Sepal.Length) + xs(Sepal.Width) + 
    @@ -240,11 +239,11 @@ 

    ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|) -## (Intercept) -0.5211 0.7715 -0.675 0.49941 -## xs(Sepal.Length) 12.7248 18.3724 0.693 0.48856 -## xs(Sepal.Width) 8.5943 5.2866 1.626 0.10402 -## xs(Petal.Length) 3.2984 1.3432 2.456 0.01406 * -## xs(Petal.Width) 3.7473 1.4287 2.623 0.00872 ** +## (Intercept) -0.9158 0.7547 -1.214 0.22491 +## xs(Sepal.Length) 12.0078 17.3372 0.693 0.48856 +## xs(Sepal.Width) 8.1101 4.9887 1.626 0.10402 +## xs(Petal.Length) 3.1126 1.2675 2.456 0.01406 * +## xs(Petal.Width) 3.5362 1.3482 2.623 0.00872 ** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## @@ -256,7 +255,7 @@

    ## ## Number of Fisher Scoring iterations: 8

    Simple plot for Petal.Width shows that approximation almost fully covers the PDP.

    -
    plot_variable_transition(model_xs, "Petal.Width")
    +
    plot_variable_transition(model_xs, "Petal.Width")

    @@ -281,19 +280,16 @@

    -

    Developed by Krystian Igras, Przemyslaw Biecek. Site built by pkgdown.

    -
    - diff --git a/docs/articles/discrete_files/figure-html/unnamed-chunk-5-1.png b/docs/articles/discrete_files/figure-html/unnamed-chunk-5-1.png index 7f8d167..8aed2e4 100644 Binary files a/docs/articles/discrete_files/figure-html/unnamed-chunk-5-1.png and b/docs/articles/discrete_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/docs/articles/discrete_files/figure-html/unnamed-chunk-9-1.png b/docs/articles/discrete_files/figure-html/unnamed-chunk-9-1.png index 760a307..ad20208 100644 Binary files a/docs/articles/discrete_files/figure-html/unnamed-chunk-9-1.png and b/docs/articles/discrete_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/articles/extras.html b/docs/articles/extras.html index 217c155..8d32d8a 100644 --- a/docs/articles/extras.html +++ b/docs/articles/extras.html @@ -68,22 +68,22 @@ @@ -100,14 +100,13 @@ -
    @@ -162,20 +161,20 @@

    compare_stat - function of lm class object. It defines statistic that should be used in decision between spline model and linear one. The function should have the attribute higher. When the attribute has "better" value then the model with higher statistic value is chosen.

    You can see the feature in above example:

    -
    set.seed(123)
    -boston_rf <- randomForest(cmedv ~ lstat + ptratio + age, data = boston)
    -model_pdp_auto <- xspline(
    -  cmedv ~
    -    xs(lstat, transition = list(k = 6), effect = list(type = "pdp", grid.resolution = 60)) +
    -    xs(ptratio, transition = list(k = 4), effect = list(type = "pdp", grid.resolution = 40)) +
    -    age,
    -  model = boston_rf,
    -  xs_opts = list(transition = list(alter = "auto"))
    -)
    -
    -# aic statistic is used by default
    -
    -summary(model_pdp_auto)
    +
    set.seed(123)
    +boston_rf <- randomForest(cmedv ~ lstat + ptratio + age, data = boston)
    +model_pdp_auto <- xspline(
    +  cmedv ~
    +    xs(lstat, transition = list(k = 6), effect = list(type = "pdp", grid.resolution = 60)) +
    +    xs(ptratio, transition = list(k = 4), effect = list(type = "pdp", grid.resolution = 40)) +
    +    age,
    +  model = boston_rf,
    +  xs_opts = list(transition = list(alter = "auto"))
    +)
    +
    +# aic statistic is used by default
    +
    +summary(model_pdp_auto)
    ## 
     ## Call:
     ## stats::glm(formula = cmedv ~ xs(lstat) + ptratio + age, family = family, 
    @@ -210,19 +209,19 @@ 

    Link parameters stores info about what function should be used to transform the response. The transformation is used in the final model fitting. The standard link is the identity (for gaussian distribution) - for binomial distribution logit is used.

    See more at ??stats::family.glm.

    xspline function allows you to decide which response should be used in the final model. Let’s check the example below in which poisson distribution with log link is used.

    -
    library(xspliner)
    -library(randomForest)
    -x <- rnorm(100)
    -z <- rnorm(100)
    -y <- rpois(100, exp(1 + x + z))
    -data <- data.frame(x, y, z)
    -model_rf <- randomForest(y ~ x + z, data = data)
    -model_xs_1 <- xspline(model_rf)
    -model_xs_2 <- xspline(model_rf, family = poisson(), link = "log")
    +
    library(xspliner)
    +library(randomForest)
    +x <- rnorm(100)
    +z <- rnorm(100)
    +y <- rpois(100, exp(1 + x + z))
    +data <- data.frame(x, y, z)
    +model_rf <- randomForest(y ~ x + z, data = data)
    +model_xs_1 <- xspline(model_rf)
    +model_xs_2 <- xspline(model_rf, family = poisson(), link = "log")

    Let’s compare the two models by checking their AIC statistics:

    - +
    model_xs_1$aic
    ## [1] 672.5753
    - +
    model_xs_2$aic
    ## [1] 580.0274

    As we can see the second model is better.

    @@ -230,15 +229,15 @@

    Transformed response

    In some cases you may want to transform the model response with your own function. Let’s check the example below with a random forest model:

    -
    set.seed(123)
    -x <- rnorm(100, 10)
    -z <- rnorm(100, 10)
    -y <- x * z * rnorm(100, 1, 0.1)
    -data <- data.frame(x, z, y)
    -model_rf <- randomForest(log(y) ~ x + z, data = data)
    +
    set.seed(123)
    +x <- rnorm(100, 10)
    +z <- rnorm(100, 10)
    +y <- x * z * rnorm(100, 1, 0.1)
    +data <- data.frame(x, z, y)
    +model_rf <- randomForest(log(y) ~ x + z, data = data)

    In this case the log transformation of y removes the interaction of x and z. In xspliner the same transformation is used by default:

    -
    model_xs <- xspline(model_rf)
    -summary(model_xs)
    +
    model_xs <- xspline(model_rf)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = log(y) ~ xs(x) + xs(z), family = family, 
    @@ -263,7 +262,7 @@ 

    ## AIC: -193.88 ## ## Number of Fisher Scoring iterations: 2

    -
    plot_model_comparison(model_xs, model = model_rf, data = data)
    +
    plot_model_comparison(model_xs, model = model_rf, data = data)

    @@ -272,14 +271,14 @@

    When interactions between predictors occur, black box models in fact deal with them much better than linear models. xspliner allows using formulas with variable interactions.

    You can do it in two possible forms.

    Let’s start by creating data and building a black box:

    -
    x <- rnorm(100)
    -z <- rnorm(100)
    -y <- x + x * z + z + rnorm(100, 0, 0.1)
    -data <- data.frame(x, y, z)
    -model_rf <- randomForest(y ~ x + z, data = data)
    -

    The first option is specifying formula with * sign, as in standard linear models.

    -
    model_xs <- xspline(y ~ x * z, model = model_rf)
    -summary(model_xs)
    +
    x <- rnorm(100)
    +z <- rnorm(100)
    +y <- x + x * z + z + rnorm(100, 0, 0.1)
    +data <- data.frame(x, y, z)
    +model_rf <- randomForest(y ~ x + z, data = data)
    +

    The first option is specifying formula with * sign, as in standard linear models.

    +
    model_xs <- xspline(y ~ x * z, model = model_rf)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = y ~ x * z, family = family, data = data)
    @@ -304,11 +303,11 @@ 

    ## AIC: -153.1 ## ## Number of Fisher Scoring iterations: 2

    -
    plot_model_comparison(model_xs, model = model_rf, data = data)
    +
    plot_model_comparison(model_xs, model = model_rf, data = data)

    The second one is adding form parameter equal to “multiplicative” in case of passing just the model or dot formula.

    -
    model_xs <- xspline(model_rf, form = "multiplicative")
    -summary(model_xs)
    +
    model_xs <- xspline(model_rf, form = "multiplicative")
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = y ~ xs(x) * xs(z), family = family, data = data)
    @@ -333,10 +332,10 @@ 

    ## AIC: 94.46 ## ## Number of Fisher Scoring iterations: 2

    -
    plot_model_comparison(model_xs, model = model_rf, data = data)
    +
    plot_model_comparison(model_xs, model = model_rf, data = data)

    -
    model_xs <- xspline(y ~ ., model = model_rf, form = "multiplicative")
    -summary(model_xs)
    +
    model_xs <- xspline(y ~ ., model = model_rf, form = "multiplicative")
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = y ~ xs(x) * xs(z), family = family, data = data)
    @@ -361,23 +360,23 @@ 

    ## AIC: 94.46 ## ## Number of Fisher Scoring iterations: 2

    -
    plot_model_comparison(model_xs, model = model_rf, data = data)
    +
    plot_model_comparison(model_xs, model = model_rf, data = data)

    Subset formula

    Every example we saw before used the same variables in the black box and the xspliner model. In fact this is not obligatory. How can it be used? For example, to build a simpler model based on a reduced number of predictors. Let’s see the example below:

    -
    library(randomForest)
    -library(xspliner)
    -data(airquality)
    -air <- na.omit(airquality)
    -model_rf <- randomForest(Ozone ~ ., data = air)
    -varImpPlot(model_rf)
    +
    library(randomForest)
    +library(xspliner)
    +data(airquality)
    +air <- na.omit(airquality)
    +model_rf <- randomForest(Ozone ~ ., data = air)
    +varImpPlot(model_rf)

    As we can see, the Wind and Temp variables are of the highest importance. Let’s build an xspliner model based on just these two variables.

    -
    model_xs <- xspline(Ozone ~ xs(Wind) + xs(Temp), model = model_rf)
    -summary(model_xs)
    +
    model_xs <- xspline(Ozone ~ xs(Wind) + xs(Temp), model = model_rf)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Ozone ~ xs(Wind) + xs(Temp), family = family, 
    @@ -402,11 +401,11 @@ 

    ## AIC: 960.62 ## ## Number of Fisher Scoring iterations: 2

    -
    plot_model_comparison(model_xs, model = model_rf, data = air)
    +
    plot_model_comparison(model_xs, model = model_rf, data = air)

    Or model including variables interaction:

    -
    model_xs <- xspline(Ozone ~ xs(Wind) * xs(Temp), model = model_rf)
    -summary(model_xs)
    +
    model_xs <- xspline(Ozone ~ xs(Wind) * xs(Temp), model = model_rf)
    +summary(model_xs)
    ## 
     ## Call:
     ## stats::glm(formula = Ozone ~ xs(Wind) * xs(Temp), family = family, 
    @@ -432,7 +431,7 @@ 

    ## AIC: 953.43 ## ## Number of Fisher Scoring iterations: 2

    -
    plot_model_comparison(model_xs, model = model_rf, data = air)
    +
    plot_model_comparison(model_xs, model = model_rf, data = air)

    @@ -456,19 +455,16 @@

    -

    Developed by Krystian Igras, Przemyslaw Biecek. Site built by pkgdown.

    -
    - diff --git a/docs/articles/extras_files/figure-html/unnamed-chunk-1-1.png b/docs/articles/extras_files/figure-html/unnamed-chunk-1-1.png index 09c0dc1..e8ec513 100644 Binary files a/docs/articles/extras_files/figure-html/unnamed-chunk-1-1.png and b/docs/articles/extras_files/figure-html/unnamed-chunk-1-1.png differ diff --git a/docs/articles/extras_files/figure-html/unnamed-chunk-1-2.png b/docs/articles/extras_files/figure-html/unnamed-chunk-1-2.png index 6f14def..5cccb47 100644 Binary files a/docs/articles/extras_files/figure-html/unnamed-chunk-1-2.png and b/docs/articles/extras_files/figure-html/unnamed-chunk-1-2.png differ diff --git a/docs/articles/extras_files/figure-html/unnamed-chunk-10-1.png b/docs/articles/extras_files/figure-html/unnamed-chunk-10-1.png index 2dc2c9b..327cca4 100644 Binary files a/docs/articles/extras_files/figure-html/unnamed-chunk-10-1.png and b/docs/articles/extras_files/figure-html/unnamed-chunk-10-1.png differ diff --git a/docs/articles/extras_files/figure-html/unnamed-chunk-11-1.png b/docs/articles/extras_files/figure-html/unnamed-chunk-11-1.png index 780fcb4..d23cf67 100644 Binary files a/docs/articles/extras_files/figure-html/unnamed-chunk-11-1.png and b/docs/articles/extras_files/figure-html/unnamed-chunk-11-1.png differ diff --git a/docs/articles/extras_files/figure-html/unnamed-chunk-12-1.png b/docs/articles/extras_files/figure-html/unnamed-chunk-12-1.png index 9b1576e..e463645 100644 Binary files a/docs/articles/extras_files/figure-html/unnamed-chunk-12-1.png and b/docs/articles/extras_files/figure-html/unnamed-chunk-12-1.png differ diff --git a/docs/articles/extras_files/figure-html/unnamed-chunk-13-1.png b/docs/articles/extras_files/figure-html/unnamed-chunk-13-1.png index 1b01c43..f2a559d 100644 Binary files a/docs/articles/extras_files/figure-html/unnamed-chunk-13-1.png and b/docs/articles/extras_files/figure-html/unnamed-chunk-13-1.png differ diff --git a/docs/articles/extras_files/figure-html/unnamed-chunk-6-1.png 
b/docs/articles/extras_files/figure-html/unnamed-chunk-6-1.png index 97b03af..ca5b067 100644 Binary files a/docs/articles/extras_files/figure-html/unnamed-chunk-6-1.png and b/docs/articles/extras_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/articles/extras_files/figure-html/unnamed-chunk-8-1.png b/docs/articles/extras_files/figure-html/unnamed-chunk-8-1.png index c84a366..54459c3 100644 Binary files a/docs/articles/extras_files/figure-html/unnamed-chunk-8-1.png and b/docs/articles/extras_files/figure-html/unnamed-chunk-8-1.png differ diff --git a/docs/articles/extras_files/figure-html/unnamed-chunk-9-1.png b/docs/articles/extras_files/figure-html/unnamed-chunk-9-1.png index 2dc2c9b..327cca4 100644 Binary files a/docs/articles/extras_files/figure-html/unnamed-chunk-9-1.png and b/docs/articles/extras_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/articles/graphics.html b/docs/articles/graphics.html index c1ed00b..bbded1e 100644 --- a/docs/articles/graphics.html +++ b/docs/articles/graphics.html @@ -68,22 +68,22 @@ @@ -100,14 +100,13 @@ -
    @@ -176,54 +175,54 @@

    Models comparison

    As we may want to compare the final GLM model with its parent black box, xspliner provides one simple tool.

    For comparison just use plot_model_comparison or the general plot method with corresponding parameters:

    -
    plot_model_comparison(model_xs, model = boston.rf, data = boston)
    +
    plot_model_comparison(model_xs, model = boston.rf, data = boston)

    or

    -
    plot(model_xs, model = boston.rf, data = boston)
    +
    plot(model_xs, model = boston.rf, data = boston)

    The resulting graphics compares predicted values for both GLM and black box model.

    For predicting values standard predict method is used: function(object, newdata) predict(object, newdata). So for regression models the results are on the same scale.

    The notable difference occurs in the classification models. GLM models by default return “link” function values, so for classification it can be any real number. Contrary to that, randomForest function returns predicted levels.

    To avoid the problem, the prediction_funs parameter was added. This is the list of prediction functions for each model (in order: black box, xspliner). Let’s see it on an SVM example:

    -
    iris_data <- droplevels(iris[iris$Species != "setosa", ])
    -
    -library(e1071) 
    -library(xspliner)
    -model_svm <- svm(Species ~  Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, 
    -                 data = iris_data, probability = TRUE)
    -
    -model_xs <- xspline(Species ~  xs(Sepal.Length) + xs(Sepal.Width) + xs(Petal.Length) + xs(Petal.Width),
    -                    model = model_svm)
    +
    iris_data <- droplevels(iris[iris$Species != "setosa", ])
    +
    +library(e1071) 
    +library(xspliner)
    +model_svm <- svm(Species ~  Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, 
    +                 data = iris_data, probability = TRUE)
    +
    +model_xs <- xspline(Species ~  xs(Sepal.Length) + xs(Sepal.Width) + xs(Petal.Length) + xs(Petal.Width),
    +                    model = model_svm)

    Now we specify predict functions to return probability of virginica response.

    -
    prob_svm <- function(object, newdata) attr(predict(object, newdata = newdata, probability = TRUE), "probabilities")[, 2]
    -prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response")
    +
    prob_svm <- function(object, newdata) attr(predict(object, newdata = newdata, probability = TRUE), "probabilities")[, 2]
    +prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response")

    And plot the result

    -
    plot_model_comparison(model_xs, model = model_svm, data = iris_data,
    -     prediction_funs = list(prob_xs, prob_svm)
    -)  
    +
    plot_model_comparison(model_xs, model = model_svm, data = iris_data,
    +     prediction_funs = list(prob_xs, prob_svm)
    +)  

    It is also possible to sort the values of heatmap according to chosen model:

    -
    plot_model_comparison(model_xs, model = model_svm, data = iris_data,
    -     prediction_funs = list(prob_xs, prob_svm),
    -     sort_by = "svm"
    -)  
    +
    plot_model_comparison(model_xs, model = model_svm, data = iris_data,
    +     prediction_funs = list(prob_xs, prob_svm),
    +     sort_by = "svm"
    +)  

    In case of class predictions, let’s create class prediction function first:

    -
    class_svm <- function(object, newdata) predict(object, newdata = newdata)
    -response_levels <- levels(iris_data$Species)
    -class_xs <- function(object, newdata) {
    -  probs <- predict(object, newdata = newdata, type = "response")
    -  factor(ifelse(probs > 0.5, response_levels[2], response_levels[1]), levels = response_levels)
    -}
    +
    class_svm <- function(object, newdata) predict(object, newdata = newdata)
    +response_levels <- levels(iris_data$Species)
    +class_xs <- function(object, newdata) {
    +  probs <- predict(object, newdata = newdata, type = "response")
    +  factor(ifelse(probs > 0.5, response_levels[2], response_levels[1]), levels = response_levels)
    +}

    And plot the result:

    -
    plot_model_comparison(model_xs, model = model_svm, data = iris_data,
    -     prediction_funs = list(class_xs, class_svm)
    -)  
    +
    plot_model_comparison(model_xs, model = model_svm, data = iris_data,
    +     prediction_funs = list(class_xs, class_svm)
    +)  

    Sorting values according to specified model is also possible:

    -
    plot_model_comparison(model_xs, model = model_svm, data = iris_data,
    -     prediction_funs = list(class_xs, class_svm),
    -     sort_by = "svm"
    -)  
    +
    plot_model_comparison(model_xs, model = model_svm, data = iris_data,
    +     prediction_funs = list(class_xs, class_svm),
    +     sort_by = "svm"
    +)  

    @@ -231,23 +230,23 @@

    Following the above approach, it’s easy to generate similar graphics for a larger number of models.

    Just include additional models inside the compare_with parameter (named list), and add the corresponding predict functions for them to the prediction_funs parameter (if omitted, the default one is used).

    See below example on airquality data

    -
    library(mgcv)
    -
    -data(airquality)
    -ozone <- subset(na.omit(airquality),
    -                select = c("Ozone", "Solar.R", "Wind", "Temp"))
    -set.seed(123)
    -
    -model_rf <- randomForest(Ozone ~ ., data = ozone)
    -model_xs <- xspline(Ozone ~ xs(Solar.R) + xs(Wind) + xs(Temp), model_rf, data = ozone)
    -model_glm <- glm(Ozone ~ ., data = ozone)
    -model_gam <- mgcv::gam(Ozone ~ s(Solar.R) + s(Wind) + s(Temp), data = ozone)
    -
    -plot_model_comparison(model_xs, 
    -     model = model_rf, 
    -     data = ozone, 
    -     compare_with = list(glm = model_glm, gam = model_gam),
    -     sort_by = "xspliner")
    +
    library(mgcv)
    +
    +data(airquality)
    +ozone <- subset(na.omit(airquality),
    +                select = c("Ozone", "Solar.R", "Wind", "Temp"))
    +set.seed(123)
    +
    +model_rf <- randomForest(Ozone ~ ., data = ozone)
    +model_xs <- xspline(Ozone ~ xs(Solar.R) + xs(Wind) + xs(Temp), model_rf, data = ozone)
    +model_glm <- glm(Ozone ~ ., data = ozone)
    +model_gam <- mgcv::gam(Ozone ~ s(Solar.R) + s(Wind) + s(Temp), data = ozone)
    +
    +plot_model_comparison(model_xs, 
    +     model = model_rf, 
    +     data = ozone, 
    +     compare_with = list(glm = model_glm, gam = model_gam),
    +     sort_by = "xspliner")

    @@ -255,13 +254,13 @@

    Plotting more transitions at once

    If you want to display many transitions on one plot just pass xspliner model to plot:

    - +

    For models trained on many predictors, plots are displayed for the first 6 transformed variables. To change that value just set the n_plots parameter:

    -
    plot_variable_transition(model_xs, n_plots = 2)
    +
    plot_variable_transition(model_xs, n_plots = 2)

    You can select the variables of interest to plot just by passing predictor names in a vector:

    -
    plot_variable_transition(model_xs, c("Wind", "Temp"))
    +
    plot_variable_transition(model_xs, c("Wind", "Temp"))

    @@ -286,19 +285,16 @@

    -

    Developed by Krystian Igras, Przemyslaw Biecek. Site built by pkgdown.

    -
    - diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-12-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-12-1.png index 7f541d8..d18870a 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-12-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-12-1.png differ diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-13-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-13-1.png index 85719b1..8c0917b 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-13-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-13-1.png differ diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-15-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-15-1.png index a1a482b..77b7e2d 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-15-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-15-1.png differ diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-16-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-16-1.png index daf834c..e8276e2 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-16-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-16-1.png differ diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-17-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-17-1.png index de1ae83..98fc6f2 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-17-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-17-1.png differ diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-18-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-18-1.png index 5348986..ac94701 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-18-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-18-1.png differ diff --git 
a/docs/articles/graphics_files/figure-html/unnamed-chunk-19-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-19-1.png index 1501848..39af00e 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-19-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-19-1.png differ diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-20-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-20-1.png index a364ab3..a15a419 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-20-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-20-1.png differ diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-4-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-4-1.png index 723de1c..1727923 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-4-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-4-1.png differ diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-5-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-5-1.png index e66bb55..5d02cb7 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-5-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-6-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-6-1.png index 6f31fed..ed098cc 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-6-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/articles/graphics_files/figure-html/unnamed-chunk-7-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-7-1.png index 08313ca..00ae7b8 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-7-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-7-1.png differ diff --git 
a/docs/articles/graphics_files/figure-html/unnamed-chunk-9-1.png b/docs/articles/graphics_files/figure-html/unnamed-chunk-9-1.png index 1fd6a90..5b0a3c0 100644 Binary files a/docs/articles/graphics_files/figure-html/unnamed-chunk-9-1.png and b/docs/articles/graphics_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/articles/index.html b/docs/articles/index.html index cbd7436..153bf43 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -59,7 +59,6 @@ gtag('config', 'UA-5650686-14'); - @@ -106,22 +105,22 @@ @@ -136,7 +135,6 @@ -
    @@ -162,7 +160,6 @@

    All vignettes

    -

    @@ -170,14 +167,11 @@

    All vignettes

    Site built by pkgdown.

    -
    - - diff --git a/docs/articles/methods.html b/docs/articles/methods.html index 1653edd..7ad94e3 100644 --- a/docs/articles/methods.html +++ b/docs/articles/methods.html @@ -68,22 +68,22 @@ @@ -100,14 +100,13 @@ -
    @@ -304,10 +303,10 @@

    Predictor based summary

    Summary method allows you to check details about transformation of specific variable.

    -

    Standard usage summary(xspliner_object, variable_name)

    +

    Standard usage summary(xspliner_object, variable_name)

    Quantitative variable

    When the predictor is a quantitative variable, its transition is based on a GAM model. In this case summary displays the summary of that model.

    -
    summary(model_xs, "Petal.Length")
    +
    summary(model_xs, "Petal.Length")
    ## 
     ## Family: gaussian 
     ## Link function: identity 
    @@ -317,21 +316,21 @@ 

    ## ## Parametric coefficients: ## Estimate Std. Error t value Pr(>|t|) -## (Intercept) 1.202642 0.004044 297.4 <2e-16 *** +## (Intercept) 1.204707 0.003354 359.2 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Approximate significance of smooth terms: ## edf Ref.df F p-value -## s(Petal.Length) 8.799 8.988 601.2 <2e-16 *** +## s(Petal.Length) 8.816 8.99 728.3 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## -## R-sq.(adj) = 0.982 Deviance explained = 98.4% -## GCV = 0.0018135 Scale est. = 0.0016358 n = 100

    +## R-sq.(adj) = 0.985 Deviance explained = 98.6% +## GCV = 0.0012472 Scale est. = 0.0011248 n = 100

    Qualitative variable

    In case of a qualitative predictor, the method displays a data.frame storing information on how factors were merged during the transition.

    -
    summary(model_xs, "Species")
    +
    summary(model_xs, "Species")
    ##         orig                pred
     ## 1     setosa              setosa
     ## 2 versicolor versicolorvirginica
    @@ -342,16 +341,16 @@ 

    Surrogate vs Black Box comparison

    When the model parameter is provided instead of a predictor, the summary displays a few statistics that compare the original model with the surrogate one. All statistics definitions are included in the summary.xspline documentation.

    Here we show one example for classification model.

    -

    For this example we use ISLR::Default data and build svm model as black box. The model aims to predict default variable, indicating whether the customer defaulted on their debt.

    -
    library(xspliner)
    -library(e1071)
    -set.seed(1)
    -data <- ISLR::Default
    -default.svm <- svm(default ~ ., data = data, probability = TRUE)
    -default.xs <- xspline(default ~ student + xs(balance) + xs(income), model = default.svm)
    +

    For this example we use ISLR::Default data and build svm model as black box. The model aims to predict default variable, indicating whether the customer defaulted on their debt.

    +
    library(xspliner)
    +library(e1071)
    +set.seed(1)
    +data <- ISLR::Default
    +default.svm <- svm(default ~ ., data = data, probability = TRUE)
    +default.xs <- xspline(default ~ student + xs(balance) + xs(income), model = default.svm)

    In order to check the summary, we need to specify prediction functions for each model. In this case predictions are probabilities of success:

    -
    prob_svm <- function(object, newdata) attr(predict(object, newdata = newdata, probability = TRUE), "probabilities")[, 2]
    -prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response")
    +
    prob_svm <- function(object, newdata) attr(predict(object, newdata = newdata, probability = TRUE), "probabilities")[, 2]
    +prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response")

    Almost every summary statistic compares the models based on some data.

    In this case we’re going to compare models on training data providing:

      @@ -362,7 +361,7 @@

    • prediction_funs as a list of prediction functions (for surrogate and original model respectively)
    -
    summary(default.xs, model = default.svm, newdata = data, prediction_funs = list(prob_xs, prob_svm))
    +
    summary(default.xs, model = default.svm, newdata = data, prediction_funs = list(prob_xs, prob_svm))
    ## Models comparison 
     ##   1 - Max prediction normed-diff:  0.5268109 
     ##   R^2:  0.9185403
    @@ -371,13 +370,13 @@

    ##   1 - Max ROC diff:  0.8712113 
     ##   1 - Mean ROC diff:  0.9506292

    Another set of statistics is generated for prediction functions that return response levels.

    -
    response_svm <- function(object, newdata) predict(object, newdata = newdata)
    -response_xs <- function(object, newdata) {
    -  y_levels <- levels(newdata[[environment(object)$response]])
    -  factor(y_levels[(predict.glm(object, newdata = newdata, type = "link") > 0) + 1], levels = y_levels)
    -}
    +
    response_svm <- function(object, newdata) predict(object, newdata = newdata)
    +response_xs <- function(object, newdata) {
    +  y_levels <- levels(newdata[[environment(object)$response]])
    +  factor(y_levels[(predict.glm(object, newdata = newdata, type = "link") > 0) + 1], levels = y_levels)
    +}

    And similarly to previous example:

    -
    summary(default.xs, model = default.svm, newdata = data, prediction_funs = list(response_xs, response_svm))
    +
    summary(default.xs, model = default.svm, newdata = data, prediction_funs = list(response_xs, response_svm))
    ## Models comparison 
     ##   Mean predictions similarity:  0.9966 
     ##   ACC Black Box:  0.9719 
    @@ -420,19 +419,16 @@ 

    -

    Developed by Krystian Igras, Przemyslaw Biecek. Site built by pkgdown.

    -
    - diff --git a/docs/articles/methods_files/figure-html/unnamed-chunk-7-1.png b/docs/articles/methods_files/figure-html/unnamed-chunk-7-1.png index c283ec8..e47168d 100644 Binary files a/docs/articles/methods_files/figure-html/unnamed-chunk-7-1.png and b/docs/articles/methods_files/figure-html/unnamed-chunk-7-1.png differ diff --git a/docs/articles/xspliner.html b/docs/articles/xspliner.html index d5bbf35..4561f4e 100644 --- a/docs/articles/xspliner.html +++ b/docs/articles/xspliner.html @@ -68,22 +68,22 @@ @@ -100,14 +100,13 @@ -
    @@ -345,19 +344,16 @@

    -

    Developed by Krystian Igras, Przemyslaw Biecek. Site built by pkgdown.

    -
    - diff --git a/docs/articles/xspliner_files/figure-html/unnamed-chunk-2-1.png b/docs/articles/xspliner_files/figure-html/unnamed-chunk-2-1.png index 9d91b0b..9e8ef78 100644 Binary files a/docs/articles/xspliner_files/figure-html/unnamed-chunk-2-1.png and b/docs/articles/xspliner_files/figure-html/unnamed-chunk-2-1.png differ diff --git a/docs/articles/xspliner_files/figure-html/unnamed-chunk-2-2.png b/docs/articles/xspliner_files/figure-html/unnamed-chunk-2-2.png index 81e0541..ed4b5b3 100644 Binary files a/docs/articles/xspliner_files/figure-html/unnamed-chunk-2-2.png and b/docs/articles/xspliner_files/figure-html/unnamed-chunk-2-2.png differ diff --git a/docs/articles/xspliner_files/figure-html/unnamed-chunk-3-1.png b/docs/articles/xspliner_files/figure-html/unnamed-chunk-3-1.png index 9179618..5b05277 100644 Binary files a/docs/articles/xspliner_files/figure-html/unnamed-chunk-3-1.png and b/docs/articles/xspliner_files/figure-html/unnamed-chunk-3-1.png differ diff --git a/docs/articles/xspliner_files/figure-html/unnamed-chunk-3-2.png b/docs/articles/xspliner_files/figure-html/unnamed-chunk-3-2.png index eeddc33..d716d3f 100644 Binary files a/docs/articles/xspliner_files/figure-html/unnamed-chunk-3-2.png and b/docs/articles/xspliner_files/figure-html/unnamed-chunk-3-2.png differ diff --git a/docs/articles/xspliner_files/figure-html/unnamed-chunk-9-1.png b/docs/articles/xspliner_files/figure-html/unnamed-chunk-9-1.png index 7e09858..4109f75 100644 Binary files a/docs/articles/xspliner_files/figure-html/unnamed-chunk-9-1.png and b/docs/articles/xspliner_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/authors.html b/docs/authors.html index 15729b3..e930519 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -59,7 +59,6 @@ gtag('config', 'UA-5650686-14'); - @@ -106,22 +105,22 @@ @@ -136,7 +135,6 @@ -
    @@ -161,7 +159,6 @@

    Authors

    -

    @@ -169,14 +166,11 @@

    Authors

    Site built by pkgdown.

    -
    - - diff --git a/docs/index.html b/docs/index.html index 85eac05..d02716a 100644 --- a/docs/index.html +++ b/docs/index.html @@ -12,7 +12,7 @@ @@ -70,22 +70,22 @@ @@ -102,7 +102,6 @@ -
    @@ -124,9 +123,9 @@

  • plot_variable_transition() or plot generic for graphical presentation of variables profiles and related information,
  • -summary() for statistical comparison of surrogate and original ML models,
  • +summary() for statistical comparison of surrogate and original ML models,
  • -print() for getting details about surrogate model components.
  • +print() for getting details about surrogate model components.

    The approach behind the surrogate model construction offered by xspliner is summarized in the graphic below:

    @@ -135,7 +134,7 @@

    -
    - diff --git a/docs/news/index.html b/docs/news/index.html index 344cd90..6a5d246 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -59,7 +59,6 @@ gtag('config', 'UA-5650686-14'); - @@ -106,22 +105,22 @@ @@ -136,7 +135,6 @@
    -
    @@ -146,9 +144,9 @@

    Changelog

    -
    +

    -xspliner 0.0.3 2019-09-05 +xspliner 0.0.3 2019-09-05

    • Summary method extended with comparison statistics
    • @@ -159,9 +157,9 @@

    • Added more informative progress messages
    -
    +

    -xspliner 0.0.2 2018-12-21 +xspliner 0.0.2 2018-12-21

    -

    @@ -197,14 +194,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index f0b864f..b2fe7fe 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,5 +1,5 @@ -pandoc: 2.3.1 -pkgdown: 1.4.0 +pandoc: 2.7.3 +pkgdown: 1.3.0 pkgdown_sha: ~ articles: automation: automation.html diff --git a/docs/reference/approx_with_spline.html b/docs/reference/approx_with_spline.html index 1acc0bd..8299d55 100644 --- a/docs/reference/approx_with_spline.html +++ b/docs/reference/approx_with_spline.html @@ -60,7 +60,6 @@ gtag('config', 'UA-5650686-14'); - @@ -107,22 +106,22 @@ @@ -137,7 +136,6 @@ -
    @@ -153,10 +151,10 @@

    Approximate spline on data

    approx_with_spline(effect_data, response, predictor,
    -  env = parent.frame(), ...)
    +  env = parent.frame(), ...)
     
     approx_with_monotonic_spline(effect_data, response, predictor,
    -  env = parent.frame(), monotonic, ...)
    + env = parent.frame(), monotonic, ...)

    Arguments

    @@ -179,7 +177,7 @@

    Arg

    - + @@ -189,13 +187,13 @@

    Arg

    Value

    -

    Object of class "gam". See gamObject

    +

    Object of class "gam". See gamObject

    Examples

    -
    x <- sort(rnorm(20, 5, 5)) -y <- rnorm(20, 2, 2) -env <- new.env() -approx_with_spline(data.frame(x = x, y = y), "y", "x", env)
    #> +
    x <- sort(rnorm(20, 5, 5)) +y <- rnorm(20, 2, 2) +env <- new.env() +approx_with_spline(data.frame(x = x, y = y), "y", "x", env)
    #> #> Family: gaussian #> Link function: identity #> @@ -206,7 +204,7 @@

    Examp #> 1 total = 2 #> #> GCV score: 4.577151

    -approx_with_monotonic_spline(data.frame(x = x, y = y), "y", "x", env, "up")
    #> Warning: initial point very close to some inequality constraints
    #> Warning: initial parameters very close to inequality constraints
    #> +approx_with_monotonic_spline(data.frame(x = x, y = y), "y", "x", env, "up")
    #> Warning: initial point very close to some inequality constraints
    #> Warning: initial parameters very close to inequality constraints
    #> #> Family: gaussian #> Link function: identity #> @@ -229,7 +227,6 @@

    Contents

    -

    @@ -237,14 +234,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/build_xspliner.html b/docs/reference/build_xspliner.html index bb29584..4b2d8c6 100644 --- a/docs/reference/build_xspliner.html +++ b/docs/reference/build_xspliner.html @@ -60,7 +60,6 @@ gtag('config', 'UA-5650686-14'); - @@ -107,22 +106,22 @@ @@ -137,7 +136,6 @@ -
    @@ -154,7 +152,7 @@

    Helper function for building GLM object with transformed variables.

    build_xspliner(formula, model, data, xf_opts = xf_opts_default,
       xs_opts = xs_opts_default, link = "identity", family = "gaussian",
    -  env = parent.frame(), compare_stat = aic, control, ...)
    + env = parent.frame(), compare_stat = aic, control, ...)

    Arguments

    ...

    Other arguments passed to s function.

    Other arguments passed to s function.

    monotonic
    @@ -182,12 +180,12 @@

    Arg

    +model. By default 'identity'. See family for possibilities.

    +be extracted from model. By default 'gaussian'. See family for possibilities.

    @@ -200,7 +198,7 @@

    Arg

    - + @@ -219,7 +217,6 @@

    Contents

    -

    @@ -227,14 +224,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/index.html b/docs/reference/index.html index b080459..7d758fd 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -59,7 +59,6 @@ gtag('config', 'UA-5650686-14'); - @@ -106,22 +105,22 @@ @@ -136,7 +135,6 @@ -
    @@ -257,7 +255,6 @@

    Contents

    -

    @@ -265,14 +262,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/log_msg.html b/docs/reference/log_msg.html index 36836af..70e3eba 100644 --- a/docs/reference/log_msg.html +++ b/docs/reference/log_msg.html @@ -60,7 +60,6 @@ gtag('config', 'UA-5650686-14'); - @@ -107,22 +106,22 @@ @@ -137,7 +136,6 @@ -
    @@ -174,7 +172,6 @@

    Contents

    -

    @@ -182,14 +179,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/plot.xspliner.html b/docs/reference/plot.xspliner.html index 9c217ff..388ad70 100644 --- a/docs/reference/plot.xspliner.html +++ b/docs/reference/plot.xspliner.html @@ -61,7 +61,6 @@ gtag('config', 'UA-5650686-14'); - @@ -108,22 +107,22 @@ @@ -138,7 +137,6 @@ -
    @@ -155,11 +153,11 @@

    Plot method for 'xspliner' model

    # S3 method for xspliner
    -plot(x, variable_names = NULL, model = NULL,
    +plot(x, variable_names = NULL, model = NULL,
       plot_response = TRUE, plot_approx = TRUE, data = NULL,
       plot_data = FALSE, plot_deriv = FALSE, n_plots = 6,
    -  sort_by = NULL, compare_with = list(),
    -  prediction_funs = list(function(object, newdata) predict(object,
    +  sort_by = NULL, use_coeff = TRUE, compare_with = list(),
    +  prediction_funs = list(function(object, newdata) predict(object,
       newdata)), ...)

    Arguments

    @@ -205,6 +203,10 @@

    Arg

    + + + + @@ -230,7 +232,6 @@

    Contents

    -

    @@ -238,14 +239,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/plot_model_comparison-1.png b/docs/reference/plot_model_comparison-1.png index 41bf0bc..6259881 100644 Binary files a/docs/reference/plot_model_comparison-1.png and b/docs/reference/plot_model_comparison-1.png differ diff --git a/docs/reference/plot_model_comparison-2.png b/docs/reference/plot_model_comparison-2.png index a699ed9..8982b5a 100644 Binary files a/docs/reference/plot_model_comparison-2.png and b/docs/reference/plot_model_comparison-2.png differ diff --git a/docs/reference/plot_model_comparison-3.png b/docs/reference/plot_model_comparison-3.png index 67e7e03..7df258d 100644 Binary files a/docs/reference/plot_model_comparison-3.png and b/docs/reference/plot_model_comparison-3.png differ diff --git a/docs/reference/plot_model_comparison.html b/docs/reference/plot_model_comparison.html index 524c472..313c921 100644 --- a/docs/reference/plot_model_comparison.html +++ b/docs/reference/plot_model_comparison.html @@ -60,7 +60,6 @@ gtag('config', 'UA-5650686-14'); - @@ -107,22 +106,22 @@ @@ -137,7 +136,6 @@ -
    @@ -152,8 +150,8 @@

    Plot models comparison

    The function plots a comparison of models based on their predictions.

    -
    plot_model_comparison(x, model, data, compare_with = list(),
    -  prediction_funs = list(function(object, newdata) predict(object,
    +    
    plot_model_comparison(x, model, data, compare_with = list(),
    +  prediction_funs = list(function(object, newdata) predict(object,
       newdata)), sort_by = NULL)

    Arguments

    @@ -187,35 +185,35 @@

    Arg

    Examples

    -
    iris_data <- droplevels(iris[iris$Species != "setosa", ]) -library(e1071) -library(randomForest)
    #> randomForest 4.6-14
    #> Type rfNews() to see new features/changes/bug fixes.
    library(xspliner) +
    iris_data <- droplevels(iris[iris$Species != "setosa", ]) +library(e1071) +library(randomForest)
    #> randomForest 4.6-14
    #> Type rfNews() to see new features/changes/bug fixes.
    library(xspliner) # Build SVM model, random forest model and surrogate one constructed on top od SVM -model_svm <- svm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, +model_svm <- svm(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris_data, probability = TRUE) -model_rf <- randomForest(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris_data) +model_rf <- randomForest(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris_data) model_xs <- xspline(Species ~ xs(Sepal.Length) + xs(Sepal.Width) + xs(Petal.Length) + xs(Petal.Width), model = model_svm) # Prepare prediction functions returning label probability -prob_svm <- function(object, newdata) attr(predict(object, newdata = newdata, probability = TRUE), "probabilities")[, 2] -prob_rf <- function(object, newdata) predict(object, newdata = newdata, type = "prob")[, 2] -prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response") +prob_svm <- function(object, newdata) attr(predict(object, newdata = newdata, probability = TRUE), "probabilities")[, 2] +prob_rf <- function(object, newdata) predict(object, newdata = newdata, type = "prob")[, 2] +prob_xs <- function(object, newdata) predict(object, newdata = newdata, type = "response") # Plotting predictions for original SVM and surrogate model on training data plot_model_comparison( model_xs, model_svm, data = iris_data, - prediction_funs = list(xs = prob_xs, svm = prob_svm) + prediction_funs = list(xs = prob_xs, svm = prob_svm) )
    # Plotting predictions for original SVM, surrogate model and random forest on training data plot_model_comparison( model_xs, model_svm, data = iris_data, - compare_with = list(rf = model_rf), - prediction_funs = list(xs = prob_xs, svm = prob_svm, rf = prob_rf) + compare_with = list(rf = model_rf), + prediction_funs = list(xs = prob_xs, svm = prob_svm, rf = prob_rf) )
    # Sorting values according to SVM predictions plot_model_comparison( model_xs, model_svm, data = iris_data, - compare_with = list(rf = model_rf), - prediction_funs = list(xs = prob_xs, svm = prob_svm, rf = prob_rf), + compare_with = list(rf = model_rf), + prediction_funs = list(xs = prob_xs, svm = prob_svm, rf = prob_rf), sort_by = "svm" )
    @@ -230,7 +228,6 @@

    Contents

    -

    @@ -238,14 +235,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/plot_variable_transition-1.png b/docs/reference/plot_variable_transition-1.png index 904fe66..0e57ffb 100644 Binary files a/docs/reference/plot_variable_transition-1.png and b/docs/reference/plot_variable_transition-1.png differ diff --git a/docs/reference/plot_variable_transition-2.png b/docs/reference/plot_variable_transition-2.png index 8c9208e..a2475b7 100644 Binary files a/docs/reference/plot_variable_transition-2.png and b/docs/reference/plot_variable_transition-2.png differ diff --git a/docs/reference/plot_variable_transition-3.png b/docs/reference/plot_variable_transition-3.png index 459a9aa..7a1bcaf 100644 Binary files a/docs/reference/plot_variable_transition-3.png and b/docs/reference/plot_variable_transition-3.png differ diff --git a/docs/reference/plot_variable_transition-4.png b/docs/reference/plot_variable_transition-4.png index 0c98039..3a4ddb7 100644 Binary files a/docs/reference/plot_variable_transition-4.png and b/docs/reference/plot_variable_transition-4.png differ diff --git a/docs/reference/plot_variable_transition.html b/docs/reference/plot_variable_transition.html index 6d95b80..e44380c 100644 --- a/docs/reference/plot_variable_transition.html +++ b/docs/reference/plot_variable_transition.html @@ -64,7 +64,6 @@ gtag('config', 'UA-5650686-14'); - @@ -111,22 +110,22 @@ @@ -141,7 +140,6 @@ -
    @@ -162,7 +160,8 @@

    Plot variable profile

    plot_variable_transition(x, variable_names = NULL,
       plot_response = TRUE, plot_approx = TRUE, data = NULL,
    -  plot_data = FALSE, plot_deriv = FALSE, n_plots = 6)
    + plot_data = FALSE, plot_deriv = FALSE, n_plots = 6, + use_coeff = TRUE)

    Arguments

    link

    Link function that should be used in final model. The passed is used when cannot be extracted from -model. By default 'identity'. See family for possibilities.

    family

    Family of response variable that should be used in final model. The passed is used when cannot -be extracted from model. By default 'gaussian'. See family for possibilities.

    env
    control

    Fitting settings. See glm.control.

    Fitting settings. See glm.control.

    ... sort_by

    When comparing models determines according to which model should observations be ordered.

    use_coeff

    If TRUE, both the PDP function and its approximation are scaled by the corresponding surrogate model coefficient.

    compare_with

    Named list. Other models that should be compared with xspliner and model.

    @@ -195,15 +194,19 @@

    Arg

    + + + +
    n_plots

    Threshold for number of plots when plotting all variables.

    use_coeff

    If TRUE, both the PDP function and its approximation are scaled by the corresponding surrogate model coefficient.

    Examples

    -
    library(randomForest) -set.seed(1) +
    library(randomForest) +set.seed(1) data <- iris # regression model -iris.rf <- randomForest(Petal.Width ~ Sepal.Length + Petal.Length + Species, data = data) +iris.rf <- randomForest(Petal.Width ~ Sepal.Length + Petal.Length + Species, data = data) iris.xs <- xspline(iris.rf) # plot Sepal.Length transition plot_variable_transition(iris.xs, "Sepal.Length")
    # plot Species transition @@ -222,7 +225,6 @@

    Contents

    -

    @@ -230,14 +232,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/predict.xspliner.html b/docs/reference/predict.xspliner.html index fa28933..67179c8 100644 --- a/docs/reference/predict.xspliner.html +++ b/docs/reference/predict.xspliner.html @@ -60,7 +60,6 @@ gtag('config', 'UA-5650686-14'); - @@ -107,22 +106,22 @@ @@ -137,7 +136,6 @@ -
    @@ -153,7 +151,7 @@

    Predict xspliner method

    # S3 method for xspliner
    -predict(object, newdata, ...)
    +predict(object, newdata, ...)

    Arguments

    @@ -168,7 +166,7 @@

    Arg

    - +
    ...

    Other arguments passed to the predict.glm method.

    Other arguments passed to the predict.glm method.

    @@ -183,7 +181,6 @@

    Contents

    -

    @@ -191,14 +188,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/print.xspliner.html b/docs/reference/print.xspliner.html index 4c540e5..a22e803 100644 --- a/docs/reference/print.xspliner.html +++ b/docs/reference/print.xspliner.html @@ -60,7 +60,6 @@ gtag('config', 'UA-5650686-14'); - @@ -107,22 +106,22 @@ @@ -137,7 +136,6 @@ -
    @@ -153,7 +151,7 @@

    Print method for xspliner object

    # S3 method for xspliner
    -print(x, predictor, ...)
    +print(x, predictor, ...)

    Arguments

    @@ -183,7 +181,6 @@

    Contents

    -

    @@ -191,14 +188,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/stats.html b/docs/reference/stats.html index fb6a3c7..d5d2274 100644 --- a/docs/reference/stats.html +++ b/docs/reference/stats.html @@ -62,7 +62,6 @@ gtag('config', 'UA-5650686-14'); - @@ -109,22 +108,22 @@ @@ -139,7 +138,6 @@ -
    @@ -180,7 +178,6 @@

    Contents

    -

    @@ -188,14 +185,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/summary.xspliner.html b/docs/reference/summary.xspliner.html index a7f21e4..023525f 100644 --- a/docs/reference/summary.xspliner.html +++ b/docs/reference/summary.xspliner.html @@ -60,7 +60,6 @@ gtag('config', 'UA-5650686-14'); - @@ -107,22 +106,22 @@ @@ -137,7 +136,6 @@ -
    @@ -153,9 +151,9 @@

    Summary method for xspliner object

    # S3 method for xspliner
    -summary(object, predictor, ..., model = NULL,
    -  newdata = NULL, prediction_funs = list(function(object, newdata)
    -  predict(object, newdata)), env = parent.frame())
    +summary(object, predictor, ..., model=NULL, + newdata=NULL, prediction_funs=list(function(object, newdata) + predict(object, newdata)), env=parent.frame())

    Arguments

    @@ -201,8 +199,7 @@

    Details
  • When variable was qualitative and transformed, factor matching is displayed.

  • When variable was not transformed, glm::summary output is displayed for the model.

  • - -

    If both object parameter and model (original black box) was provided, the summary displays comparison of original and surrogate model. +

    If both object parameter and model (original black box) were provided, the summary displays a comparison of the original and surrogate models. The following points describe the rules (\(y_{s}\) and \(y_{o}\) are predictions of the surrogate and original model respectively on the provided dataset). When the comparison statistic is close to 1, the surrogate model is similar to the black-box one (according to this statistic).

    For regression models:

      @@ -212,29 +209,26 @@

      Details $$1 - \frac{\sum_{i = 1}^{n} ({y_{s}^{(i)} - y_{o}^{(i)}}) ^ {2}}{\sum_{i = 1}^{n} ({y_{o}^{(i)} - \overline{y_{o}}}) ^ {2}}$$

    • Mean square errors for each model.

    - -

    For classification models the result depends on prediction type. +

    For classification models the result depends on prediction type. When predictions are classified levels:

    • Mean predictions similarity$$\frac{1}{n} \sum_{i = 1}^{n} I_{y_{s}^{(i)} = y_{o}^{(i)}}$$

    • Accuracies for each models.

    - -

    When predictions are response probabilities:

    @@ -218,7 +216,7 @@

    Arg

    -
    consider

    One of c("specials", "all"). If "specials", only components with xs or xf +

    One of c("specials", "all"). If "specials", only components with xs or xf call are considered in transition.

    @@ -232,8 +230,8 @@

    Details

    Examples

    # preparing blackbox model -library(randomForest) -rf_iris <- randomForest( +library(randomForest) +rf_iris <- randomForest( Petal.Width ~ Sepal.Length + Petal.Length + Species, data = iris) @@ -241,7 +239,7 @@

    Examp xs_iris <- xspline( Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + xf(Species), model = rf_iris) -summary(xs_iris)

    #> +summary(xs_iris)
    #> #> Call: #> stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + #> xf(Species), family = family, data = data) @@ -267,10 +265,10 @@

    Examp #> AIC: -88.349 #> #> Number of Fisher Scoring iterations: 2 -#>

    plot(xs_iris, "Sepal.Length")
    +#>
    plot(xs_iris, "Sepal.Length")
    # passing just the model xs_iris <- xspline(rf_iris) -summary(xs_iris)
    #> +summary(xs_iris)
    #> #> Call: #> stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + #> xf(Species), family = family, data = data) @@ -296,12 +294,14 @@

    Examp #> AIC: -88.349 #> #> Number of Fisher Scoring iterations: 2 -#>

    plot(xs_iris, "Sepal.Length")
    +#>
    plot(xs_iris, "Sepal.Length")
    # using DALEX -library(DALEX)
    #> Registered S3 method overwritten by 'DALEX': +library(DALEX)
    #> Registered S3 method overwritten by 'DALEX': #> method from #> print.description questionr
    #> Welcome to DALEX (version: 0.4.7). -#> Find examples and detailed introduction at: https://pbiecek.github.io/PM_VEE/
    xs_iris_explainer <- explain(rf_iris)
    #> Preparation of a new explainer is initiated +#> Find examples and detailed introduction at: https://pbiecek.github.io/PM_VEE/ +#> Additional features will be available after installation of: iBreakDown. +#> Use 'install_dependencies()' to get all suggested dependencies
    xs_iris_explainer <- explain(rf_iris)
    #> Preparation of a new explainer is initiated #> -> model label : randomForest (default) #> -> data : 150 rows 4 cols (extracted from the model) #> -> target variable : not specified! (WARNING) @@ -309,7 +309,7 @@

    Examp #> -> predicted values : numerical, min = 0.1977761 , mean = 1.199116 , max = 2.143874 #> -> residual function : difference between y and yhat (default) #> A new explainer has been created!

    xs_iris <- xspline(rf_iris) -summary(xs_iris)
    #> +summary(xs_iris)
    #> #> Call: #> stats::glm(formula = Petal.Width ~ xs(Sepal.Length) + xs(Petal.Length) + #> xf(Species), family = family, data = data) @@ -335,7 +335,7 @@

    Examp #> AIC: -88.349 #> #> Number of Fisher Scoring iterations: 2 -#>

    plot(xs_iris, "Sepal.Length")
    +#>
    plot(xs_iris, "Sepal.Length")
    -

    @@ -358,14 +357,11 @@

    Contents

    Site built by pkgdown.

    -
    - - diff --git a/docs/reference/xspliner-package.html b/docs/reference/xspliner-package.html index f7ea0c0..5245cf1 100644 --- a/docs/reference/xspliner-package.html +++ b/docs/reference/xspliner-package.html @@ -60,7 +60,6 @@ gtag('config', 'UA-5650686-14'); - @@ -107,22 +106,22 @@ @@ -137,7 +136,6 @@ -
    @@ -164,7 +162,6 @@

    Contents

    - - - diff --git a/man/plot.xspliner.Rd b/man/plot.xspliner.Rd index ea321e7..16ccd38 100644 --- a/man/plot.xspliner.Rd +++ b/man/plot.xspliner.Rd @@ -7,7 +7,7 @@ \method{plot}{xspliner}(x, variable_names = NULL, model = NULL, plot_response = TRUE, plot_approx = TRUE, data = NULL, plot_data = FALSE, plot_deriv = FALSE, n_plots = 6, - sort_by = NULL, compare_with = list(), + sort_by = NULL, use_coeff = TRUE, compare_with = list(), prediction_funs = list(function(object, newdata) predict(object, newdata)), ...) } @@ -32,6 +32,8 @@ \item{sort_by}{When comparing models determines according to which model should observations be ordered.} +\item{use_coeff}{If TRUE both PDP function and its approximation is scaled with corresponding surrogate model coefficient.} + \item{compare_with}{Named list. Other models that should be compared with xspliner and \code{model}.} \item{prediction_funs}{Prediction functions that should be used in model comparison.} diff --git a/man/plot_variable_transition.Rd b/man/plot_variable_transition.Rd index 0516596..e930eee 100644 --- a/man/plot_variable_transition.Rd +++ b/man/plot_variable_transition.Rd @@ -6,7 +6,8 @@ \usage{ plot_variable_transition(x, variable_names = NULL, plot_response = TRUE, plot_approx = TRUE, data = NULL, - plot_data = FALSE, plot_deriv = FALSE, n_plots = 6) + plot_data = FALSE, plot_deriv = FALSE, n_plots = 6, + use_coeff = TRUE) } \arguments{ \item{variable_names}{Names of predictors which transitions should be plotted.} @@ -22,6 +23,8 @@ plot_variable_transition(x, variable_names = NULL, \item{plot_deriv}{If TRUE derivative of approximation is showed on plot.} \item{n_plots}{Threshold for number of plots when plotting all variables.} + +\item{use_coeff}{If TRUE both PDP function and its approximation is scaled with corresponding surrogate model coefficient.} } \description{ The function plots variable profile.