diff --git a/DESCRIPTION b/DESCRIPTION index 6d4bb8b..b8d3c9c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,5 +21,6 @@ Suggests: factoMerger, testthat, knitr, - rmarkdown + rmarkdown, + ResourceSelection VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 5dd3293..e0f1eaf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,22 +1,16 @@ # Generated by roxygen2: do not edit by hand +S3method(extract_xspliner_function,xspliner) S3method(plot,xspliner) S3method(predict,xspliner) S3method(xspline,default) S3method(xspline,explainer) S3method(xspline,formula) -export(extract_formula_var_names) -export(factor_opts_default) -export(get_component_params) -export(get_formula_details) -export(get_formula_raw_components) -export(get_formula_special) -export(get_special_component_details) -export(get_special_components_info) -export(numeric_opts_default) -export(prepare_call) -export(r_squared) -export(transform_formula_chr) -export(transformed_formula_object) +export(aic) +export(extract_xspliner_function) +export(get_special_predictors) +export(hoslem) +export(xf_opts_default) +export(xs_opts_default) export(xspline) importFrom(magrittr,"%>%") diff --git a/R/plot-xspliner.R b/R/plot-xspliner.R index c7d2d16..6bdc200 100644 --- a/R/plot-xspliner.R +++ b/R/plot-xspliner.R @@ -25,7 +25,7 @@ plot.xspliner <- function(x, variable_name = NULL, plot_response = TRUE, } response_var <- environment(x)$response - xp_call <- environment(x)$xs_call[[variable_name]] + xp_call <- environment(x)$xs_functions[[variable_name]] if (plot_data) { plot_range <- range(data[, variable_name]) @@ -35,7 +35,7 @@ plot.xspliner <- function(x, variable_name = NULL, plot_response = TRUE, plot_data_data <- data - plot_response_data <- environment(x)$xs_env_list[[variable_name]]$blackbox_response_obj + plot_response_data <- environment(x)$quantitative_transitions[[variable_name]]$effect_outcome colnames(plot_response_data) <- c(variable_name, response_var) x_approx <- seq(plot_range[1], plot_range[2], length.out = nrow(plot_response_data)) diff --git a/R/predict-xspliner.R b/R/predict-xspliner.R index 8547675..cb29efe 100644 --- a/R/predict-xspliner.R +++ b/R/predict-xspliner.R @@ -5,5 +5,5 @@ #' #' @export predict.xspliner <- function(xspliner, newdata) { - mgcv::predict.gam(xspliner, newdata = newdata) + stats::predict.glm(xspliner, newdata = newdata) } diff --git a/R/utils-approximations.R b/R/utils-approximations.R index b4c6af0..5eb1d90 100644 --- a/R/utils-approximations.R +++ b/R/utils-approximations.R @@ -1,18 +1,18 @@ -get_spline_formula <- function(response_var, pred_var, env, ...) { - formula_call <- substitute(list(pred_var, ...)) +build_approximation_formula <- function(response, predictor, env, ...) { + formula_call <- substitute(list(predictor, ...)) formula_call[[1]] <- quote(s) formula_call[[2]] <- quote(predictor) - formula_call <- sub("predictor", pred_var, deparse(formula_call), fixed = TRUE) - formula <- as.formula(sprintf("%s ~ %s", response_var, formula_call), env = env) + formula_call <- sub("predictor", predictor, deparse(formula_call), fixed = TRUE) + formula <- as.formula(sprintf("%s ~ %s", response, formula_call), env = env) formula } #' Approximate spline on data #' #' It aproximates data with spline function by fitting GAM model. -#' @param bb_response_data Blackbox response data, for example pdp curve. -#' @param response_var Name of response value from bb_response_data. -#' @param pred_var Name of predictor value from bb_response_data. +#' @param effect_data Blackbox response data, for example pdp curve. +#' @param response Name of response value from effect_data. +#' @param predictor Name of predictor value from effect_data. #' @param env Formula environment that should be used for fitting gam model. #' @param ... Other arguments passed to \link{mgcv::s} function. #' @return @@ -22,20 +22,21 @@ get_spline_formula <- function(response_var, pred_var, env, ...) { #' y <- rnorm(20, 2, 2) #' env <- new.env() #' approx_with_spline(data.frame(x = x, y = y), "y", "x", env) -approx_with_spline <- function(bb_response_data, response_var, pred_var, env = parent.frame(), ...) { +approx_with_spline <- function(effect_data, response, predictor, env = parent.frame(), ...) { s <- mgcv::s - formula <- get_spline_formula(response_var, pred_var, env, ...) - mgcv::gam(formula, data = bb_response_data) + formula <- build_approximation_formula(response, predictor, env, ...) + mgcv::gam(formula, data = effect_data) } #' Approximate monotonic spline on data #' #' It aproximates data with monotonic spline function by fitting GAM model. -#' @param bb_response_data. Blackbox response data, for example pdp curve. -#' @param response_var Name of response value from bb_response_data. -#' @param pred_var Name of predictor value from bb_response_data. +#' @param effect_data. Blackbox response data, for example pdp curve. +#' @param response Name of response value from effect_data. +#' @param predictor Name of predictor value from effect_data. #' @param env Formula environment that should be used for fitting gam model. -#' @param increasing If TRUE, spline approximation is increasing, if FALSE decreasing. +#' @param monotonic Possible options "up", "down" and "auto. If up the spline is incresing, when down decreasing. +#' For auto the better one (based on r.squared statistic) is chosen. #' @param ... Other arguments passed to \link{mgcv::s} function. #' @return #' Object of class "gam". See \link{mgcv::gamObject} @@ -43,22 +44,32 @@ approx_with_spline <- function(bb_response_data, response_var, pred_var, env = p #' x <- sort(rnorm(20, 5, 5)) #' y <- rnorm(20, 2, 2) #' env <- new.env() -#' approx_with_monotonic_spline(data.frame(x = x, y = y), "y", "x", env, TRUE) -approx_with_monotonic_spline <- function(bb_response_data, response_var, - pred_var, env = parent.frame(), increasing, ...) { +#' approx_with_monotonic_spline(data.frame(x = x, y = y), "y", "x", env, "up") +approx_with_monotonic_spline <- function(effect_data, response, + predictor, env = parent.frame(), monotonic, ...) { + if (monotonic == "auto") { + model_up <- approx_with_monotonic_spline(effect_data, response, predictor, env = parent.frame(), "up", ...) + model_down <- approx_with_monotonic_spline(effect_data, response, predictor, env = parent.frame(), "down", ...) + if (summary(model_up)$r.sq > summary(model_up)$r.sq) { + return(model_up) + } else { + return(model_down) + } + } + s <- mgcv::s - formula <- get_spline_formula(response_var, pred_var, env, ...) - G <- mgcv::gam(formula, data = bb_response_data, fit = FALSE) - contraint_sign <- if (increasing) 1 else -1 + formula <- build_approximation_formula(response, predictor, env, ...) + G <- mgcv::gam(formula, data = effect_data, fit = FALSE) + contraint_sign <- if (monotonic == "up") 1 else -1 gam_init <- mgcv::gam(G = G) ## generate constraints, by finite differencing ## using predict.gam .... eps <- 1e-7 - x_range <- range(bb_response_data[[pred_var]]) + x_range <- range(effect_data[[predictor]]) diff_grid_0 <- diff_grid_1 <- data.frame(x = seq(x_range[1], x_range[2], length.out = 100)) - colnames(diff_grid_0) <- colnames(diff_grid_1) <- pred_var - diff_grid_1$x <- diff_grid_1[[pred_var]] + eps + colnames(diff_grid_0) <- colnames(diff_grid_1) <- predictor + diff_grid_1$x <- diff_grid_1[[predictor]] + eps spline_vals_on_interv_start <- predict(gam_init, newdata = diff_grid_0, type = "lpmatrix") spline_vals_on_interv_end <- predict(gam_init, newdata = diff_grid_1, type = "lpmatrix") x_var_constraints <- contraint_sign * (spline_vals_on_interv_end - spline_vals_on_interv_start) / eps ## Xx %*% coef(b) must be positive @@ -75,148 +86,200 @@ approx_with_monotonic_spline <- function(bb_response_data, response_var, gam_init } -prepare_pdp_params <- function() { - -} - -prepare_spline_params_pdp <- function(formula_details, component_details, blackbox, data) { - method_params <- component_details$method_opts - method_params[["type"]] <- NULL - method_params[["object"]] <- blackbox - method_params[["pred.var"]] <- component_details$var - method_params[["train"]] <- data - method_params[["which.class"]] <- 2 # for glm 1st level is failure +prepare_transition_params_pdp <- function(formula_metadata, component_details, blackbox, data) { + effect <- component_details$effect + effect[["type"]] <- NULL + effect[["object"]] <- blackbox + effect[["pred.var"]] <- component_details$var + effect[["train"]] <- data + effect[["which.class"]] <- 2 # for glm 1st level is failure - blackbox_response_obj <- do.call(pdp::partial, method_params) + effect_outcome <- do.call(pdp::partial, effect) - spline_params <- component_details$transform_opts - spline_params[["bb_response_data"]] <- blackbox_response_obj - spline_params[["pred_var"]] <- component_details$var - spline_params[["response_var"]] <- "yhat" - spline_params[["env"]] <- attr(formula_details$formula, ".Environment") + transition <- component_details$transition + transition[["effect_data"]] <- effect_outcome + transition[["predictor"]] <- component_details$var + transition[["response"]] <- "yhat" + transition[["env"]] <- attr(formula_metadata$formula, ".Environment") - spline_params + transition } -prepare_spline_params_ale <- function(formula_details, component_details, blackbox, data) { - method_params <- component_details$method_opts - method_params[["type"]] <- NULL - method_params[["X.model"]] <- blackbox - method_params[["J"]] <- component_details$var - method_params[["X"]] <- data - method_params[["pred.fun"]] <- function(X.model, newdata) predict(object = X.model, newdata = newdata) - method_params[["NA.plot"]] <- FALSE +prepare_transition_params_ale <- function(formula_metadata, component_details, blackbox, data) { + effect <- component_details$effect + effect[["type"]] <- NULL + effect[["X.model"]] <- blackbox + effect[["J"]] <- component_details$var + effect[["X"]] <- data + effect[["pred.fun"]] <- function(X.model, newdata) predict(object = X.model, newdata = newdata) + effect[["NA.plot"]] <- FALSE plot_container <- tempfile() pdf(plot_container) - blackbox_response_obj <- do.call(ALEPlot::ALEPlot, method_params) + effect_outcome <- do.call(ALEPlot::ALEPlot, effect) dev.off() unlink(plot_container) - blackbox_response_obj <- data.frame(blackbox_response_obj$x.values, blackbox_response_obj$f.values) - names(blackbox_response_obj) <- c(component_details$var, "yhat") + effect_outcome <- data.frame(effect_outcome$x.values, effect_outcome$f.values) + names(effect_outcome) <- c(component_details$var, "yhat") - spline_params <- component_details$transform_opts - spline_params[["bb_response_data"]] <- blackbox_response_obj - spline_params[["pred_var"]] <- component_details$var - spline_params[["response_var"]] <- "yhat" - spline_params[["env"]] <- attr(formula_details$formula, ".Environment") + transition <- component_details$transition + transition[["effect_data"]] <- effect_outcome + transition[["predictor"]] <- component_details$var + transition[["response"]] <- "yhat" + transition[["env"]] <- attr(formula_metadata$formula, ".Environment") - spline_params + transition } -prepare_transform_params_fM <- function(formula_details, component_details, blackbox, data) { - method_params <- component_details$method_opts - method_params[["type"]] <- NULL - method_params[["object"]] <- blackbox - method_params[["pred.var"]] <- component_details$var - method_params[["train"]] <- data - method_params[["ice"]] <- TRUE +prepare_transition_params_fM <- function(formula_metadata, component_details, blackbox, data) { + effect <- component_details$effect + effect[["type"]] <- NULL + effect[["object"]] <- blackbox + effect[["pred.var"]] <- component_details$var + effect[["train"]] <- data + effect[["ice"]] <- TRUE - blackbox_response_obj <- do.call(pdp::partial, method_params) + effect_outcome <- do.call(pdp::partial, effect) - transform_params <- component_details$method_opts - transform_params[["response"]] <- blackbox_response_obj[, "yhat"] - transform_params[["factor"]] <- blackbox_response_obj[, component_details$var] - transform_params[["factorMerger"]] <- blackbox_response_obj + transition <- component_details$transition + transition[["response"]] <- effect_outcome[, "yhat"] + transition[["factor"]] <- effect_outcome[, component_details$var] + transition[["factorMerger"]] <- effect_outcome - transform_params + transition } -factor_component_env <- function(formula_details, component_details, blackbox, data) { - if (is.null(component_details$method_opts$type)) { +get_qualitative_transition <- function(formula_metadata, component_details, blackbox, data) { + if (is.null(component_details$effect$type)) { stop("No specified type for method!") } - transform_params <- switch(component_details$method_opts$type, - ice = prepare_transform_params_fM(formula_details, component_details, blackbox, data) + transition <- switch(component_details$effect$type, + fM = prepare_transition_params_fM(formula_metadata, component_details, blackbox, data) ) + alter <- transition[["alter"]] + transition[["alter"]] <- NULL + + if (alter == "never") { + quantitative_transition <- list( + effect_outcome = NULL, + transition_outcome = NULL + ) + } else { + partition_params <- transition[c("stat", "value")] %>% + purrr::keep(~ !is.null(.)) + transition[c("stat", "value")] <- NULL + transition$abbreviate <- FALSE - partition_params <- transform_params[c("stat", "value")] %>% - purrr::keep(~ !is.null(.)) - transform_params[c("stat", "value")] <- NULL - transform_params$abbreviate <- FALSE - - blackbox_response_obj <- do.call(factorMerger::mergeFactors, transform_params) + effect_outcome <- do.call(factorMerger::mergeFactors, transition) - partition_params$factorMerger <- blackbox_response_obj - blackbox_response_transform <- do.call(factorMerger::getOptimalPartitionDf, partition_params) + partition_params$factorMerger <- effect_outcome + transition_outcome <- do.call(factorMerger::getOptimalPartitionDf, partition_params) - list( - blackbox_response_obj = blackbox_response_obj, - blackbox_response_transform = blackbox_response_transform - ) + quantitative_transition <- list( + effect_outcome = transition[["effect_data"]], + transition_outcome = transition_outcome + ) + } + quantitative_transition } -numeric_component_env <- function(formula_details, component_details, blackbox, data) { - if (is.null(component_details$method_opts$type)) { +get_quantitative_transition <- function(formula_metadata, component_details, blackbox, data, + family, compare_stat) { + if (is.null(component_details$effect$type)) { stop("No specified type for method!") } - spline_params <- switch(component_details$method_opts$type, - pdp = prepare_spline_params_pdp(formula_details, component_details, blackbox, data), - ale = prepare_spline_params_ale(formula_details, component_details, blackbox, data) + transition <- switch(component_details$effect$type, + pdp = prepare_transition_params_pdp(formula_metadata, component_details, blackbox, data), + ale = prepare_transition_params_ale(formula_metadata, component_details, blackbox, data) ) - if (is.null(spline_params[["increasing"]])) { - blackbox_response_approx <- do.call(approx_with_spline, spline_params) + monotonic <- transition[["monotonic"]] + transition[["monotonic"]] <- NULL + alter <- transition[["alter"]] + transition[["alter"]] <- NULL + + if (alter == "never") { + quantitative_transition <- list( + effect_outcome = NULL, + transition_outcome = NULL + ) } else { - blackbox_response_approx <- do.call(approx_with_monotonic_spline, spline_params) + if (monotonic == "not") { + transition_outcome <- do.call(approx_with_spline, transition) + } else { # (todo) add auto option + transition_outcome <- do.call(approx_with_monotonic_spline, transition) + } + + quantitative_transition <- list( + effect_outcome = transition[["effect_data"]], + transition_outcome = transition_outcome + ) } - list( - blackbox_response_obj = spline_params[["bb_response_data"]], - blackbox_response_approx = blackbox_response_approx - ) + if (alter == "auto") { + xs_function <- build_xs_function(quantitative_transition, component_details$var) + lm_better <- is_lm_better_than_xs( + data, formula_metadata$lhs, component_details$var, xs_function, family, compare_stat) + if (lm_better) { + message(sprintf("%s fits better than %s. Using bare component.", + component_details$var, component_details$new_call)) + quantitative_transition <- list( + effect_outcome = NULL, + transition_outcome = NULL + ) + } + } + quantitative_transition } -get_common_components_env <- function(formula_details, special_components_details, blackbox, data) { +is_lm_better_than_xs <- function(data, lhs, var, xs_function, family, compare_stat) { + xs_function <- xs_function + linear_formula <- as.formula(sprintf("%s ~ %s", lhs, var)) + transition_formula <- as.formula(sprintf("%s ~ xs_function(%s)", lhs, var)) + linear_model <- glm(linear_formula, data = data, family = family) + transition_model <- glm(transition_formula, data = data, family = family) + comparison <- compare_stat(transition_model) <= compare_stat(linear_model) + if (isTRUE(attr(compare_stat, "higher-better"))) { + lm_better <- comparison + } else { + lm_better <- !comparison + } +} - xs_env <- list() - xf_env <- list() +get_transitions_outcome <- function(formula_metadata, special_components_details, + blackbox, data, family, compare_stat) { - xs_vars <- formula_details$xs_variables - xf_vars <- formula_details$xf_variables + quantitative_variables <- formula_metadata$xs_variables + qualitative_variables <- formula_metadata$xf_variables - if (length(xs_vars)) { - xs_env <- special_components_details %>% - purrr::keep(function(component_details) component_details[["var"]] %in% xs_vars) %>% - purrr::map(function(component_details) numeric_component_env(formula_details, component_details, blackbox, data)) %>% - purrr::set_names(xs_vars) + quantitative <- if (length(quantitative_variables)) { + special_components_details %>% + purrr::keep(function(component_details) component_details[["var"]] %in% quantitative_variables) %>% + purrr::map(function(component_details) get_quantitative_transition( + formula_metadata, component_details, blackbox, data, family, compare_stat)) %>% + purrr::set_names(quantitative_variables) + } else { + list() } - if (length(xf_vars)) { - xf_env <- special_components_details %>% - purrr::keep(function(component_details) component_details[["var"]] %in% xf_vars) %>% - purrr::map(function(component_details) factor_component_env(formula_details, component_details, blackbox, data)) %>% - purrr::set_names(xf_vars) + qualitative <- if (length(qualitative_variables)) { + special_components_details %>% + purrr::keep(function(component_details) component_details[["var"]] %in% qualitative_variables) %>% + purrr::map(function(component_details) get_qualitative_transition( + formula_metadata, component_details, blackbox, data)) %>% + purrr::set_names(qualitative_variables) + } else { + list() } list( - xs_env = xs_env, - xf_env = xf_env + quantitative = quantitative, + qualitative = qualitative ) } @@ -238,10 +301,10 @@ get_common_components_env <- function(formula_details, special_components_detail #' z_var_response <- pdp::partial(blackbox, "z") #' z_var_response_approx <- approx_with_spline(z_var_response, "yhat", "z") #' z_env <- list( -#' blackbox_response_obj = z_var_response, -#' blackbox_response_approx = z_var_response_approx +#' effect_outcome = z_var_response, +#' transition_outcome = z_var_response_approx #' ) -#' z_var_spline <- get_xs_call(z_env, "z") +#' z_var_spline <- build_xs_function(z_env, "z") #' z_range <- attr(z_var_spline, "variable_range") #' z_axis <- seq(z_range[1], z_range[2], length.out = 50) #' @@ -252,93 +315,35 @@ get_common_components_env <- function(formula_details, special_components_detail #' x_var_response <- pdp::partial(blackbox, "x") #' x_var_response_approx <- approx_with_spline(x_var_response, "yhat", "x") #' x_env <- list( -#' blackbox_response_obj = x_var_response, -#' blackbox_response_approx = x_var_response_approx +#' effect_outcome = x_var_response, +#' transition_outcome = x_var_response_approx #' ) -#' x_var_spline <- get_xs_call(x_env, "x") +#' x_var_spline <- build_xs_function(x_env, "x") #' x_range <- attr(x_var_spline, "variable_range") #' x_axis <- seq(x_range[1], x_range[2], length.out = 50) #' #' plot(x, y) #' lines(x_var_response) #' lines(x_axis, x_var_spline(x_axis)) -get_xs_call <- function(xs_env, pred_var_name) { - xs_approximation <- function(pred_var) { - data <- data.frame(pred_var) - names(data) <- pred_var_name - mgcv::predict.gam(xs_env$blackbox_response_approx, newdata = data) +build_xs_function <- function(quantitative_transition, predictor_name) { + xs_approximation <- function(predictor) { + data <- data.frame(predictor) + names(data) <- predictor_name + mgcv::predict.gam(quantitative_transition$transition_outcome, newdata = data) } - attr(xs_approximation, "variable_range") <- range(xs_env$blackbox_response_obj[[pred_var_name]]) + attr(xs_approximation, "variable_range") <- range(quantitative_transition$effect_outcome[[predictor_name]]) xs_approximation } -get_xf_call <- function(xf_env, pred_var_name) { - matched_factors <- xf_env$blackbox_response_transform +build_xf_function <- function(qualitative_transition, predictor_name) { + matched_factors <- qualitative_transition$transition_outcome if (length(unique(matched_factors$pred)) < 2) { - return(function(pred_var) pred_var) + return(function(predictor) predictor) } else { - function(pred_var) { - predictor_values <- data.frame(orig = pred_var) + function(predictor) { + predictor_values <- data.frame(orig = predictor) transformed_predictor <- dplyr::left_join(predictor_values, matched_factors) factor(transformed_predictor, levels = unique(matched_factors$pred)) } } } - -is_lm_better_than_approx <- function(data, response, predictor, approx_fun, compare_stat) { - approx_model_formula <- as.formula(sprintf("%s ~ approx_fun(%s)", response, predictor)) - approx_model <- lm(approx_model_formula, data) - lm_model_formula <- as.formula(sprintf("%s ~ %s", response, predictor)) - lm_model <- lm(lm_model_formula, data) - comparison <- compare_stat(approx_model) <= compare_stat(lm_model) - if (isTRUE(attr(compare_stat, "higher-better"))) { - comparison - } else { - !comparison - } -} - -correct_improved_components <- function(alter, compare_stat, xs, xf, special_components_details, data, response) { - - get_component_call <- function(special_component_details) { - if (special_component_details$call_fun == "xs") { - xs - } else { - xf - } - } - - use_untransformed <- function(special_component_details) { - call_fun <- special_component_details$call_fun - if (call_fun == "xs") { - alter_variable <- switch(alter$numeric, - always = FALSE, - auto = is_lm_better_than_approx(data, response, special_component_details$var, - get_component_call(special_component_details), compare_stat), - never = TRUE - ) - } else if (call_fun == "xf") { - alter_variable <- switch(alter$factor, - always = FALSE, - never = TRUE - ) - } else { - alter_variable <- TRUE - } - - if (is.null(alter_variable)) { - alter_variable <- TRUE - } - - alter_variable - } - - use_bare_call <- function(special_component_details) { - special_component_details$new_call <- special_component_details$var - special_component_details - } - - special_components_details %>% - purrr::map_if(use_untransformed, use_bare_call) - -} diff --git a/R/utils-formula-build.R b/R/utils-formula-build.R index 7959cc8..5be07af 100644 --- a/R/utils-formula-build.R +++ b/R/utils-formula-build.R @@ -1,4 +1,3 @@ -#' @export extract_formula_var_names <- function(formula, data) { formula_variables <- all.vars(formula) @@ -29,13 +28,12 @@ extract_formula_var_names <- function(formula, data) { formula_variables } -#' @export -get_formula_raw_components <- function(formula_terms) { +get_formula_single_components <- function(formula_terms) { as.character(attr(formula_terms,"variables"))[-c(1, 2)] } #' @export -get_formula_special <- function(variable_names, formula_terms, special, index = FALSE) { +get_special_predictors <- function(variable_names, formula_terms, special, index = FALSE) { if (index) { attr(formula_terms, "specials")[[special]] - 1 } else { @@ -43,8 +41,7 @@ get_formula_special <- function(variable_names, formula_terms, special, index = } } -#' @export -get_formula_details <- function(formula, variable_names) { +get_formula_metadata <- function(formula, variable_names) { if (!(length(formula) == 3)) { stop("Not specified response in formula") @@ -52,27 +49,26 @@ get_formula_details <- function(formula, variable_names) { formula_terms <- terms.formula(formula, specials = c("xs", "xf")) - formula_details <- list( + formula_metadata <- list( formula = formula, response = variable_names[1], predictors = variable_names[-1], lhs = get_formula_lhs(formula), rhs = get_formula_rhs(formula), - additive_components = get_formula_raw_components(formula_terms), - xs_variables = get_formula_special(variable_names, formula_terms, "xs"), - xf_variables = get_formula_special(variable_names, formula_terms, "xf"), - xs_variables_idx = get_formula_special(variable_names, formula_terms, "xs", TRUE), - xf_variables_idx = get_formula_special(variable_names, formula_terms, "xf", TRUE) + additive_components = get_formula_single_components(formula_terms), + xs_variables = get_special_predictors(variable_names, formula_terms, "xs"), + xf_variables = get_special_predictors(variable_names, formula_terms, "xf"), + xs_variables_idx = get_special_predictors(variable_names, formula_terms, "xs", TRUE), + xf_variables_idx = get_special_predictors(variable_names, formula_terms, "xf", TRUE) ) - if (length(formula_details$predictors) != length(formula_details$additive_components)) { + if (length(formula_metadata$predictors) != length(formula_metadata$additive_components)) { stop("Number of predictors is different than additive components") } - formula_details + formula_metadata } -#' @export -prepare_call <- function(component_string, add_variable = TRUE) { +clear_special_component <- function(component_string, add_variable = TRUE) { fun <- substr(component_string, 1, 2) if (!(fun %in% c("xs", "xf"))) { return(component_string) @@ -88,54 +84,60 @@ prepare_call <- function(component_string, add_variable = TRUE) { } -#' @export -get_component_params <- function(additive_component, env) { +get_component_params <- function(additive_component, xs_opts, xf_opts, env) { spline_params <- as.list(parse(text = additive_component[1])[[1]]) - transform_opts <- eval(spline_params$transform_opts, envir = env) - method_opts <- eval(spline_params$method_opts, envir = env) + transition <- eval(spline_params$transition, envir = env) + effect <- eval(spline_params$effect, envir = env) + transition_type <- substr(additive_component, 1, 2) + if (transition_type == "xs") { + transition <- match_parameters(transition, xs_opts$transition, xs_opts_default$transition) + effect <- match_parameters(effect, xs_opts$effect, xs_opts_default$effect) + } else { + transition <- match_parameters(transition, xf_opts$transition, xf_opts_default$transition) + effect <- match_parameters(effect, xf_opts$effect, xf_opts_default$effect) + } + list( - transform_opts = transform_opts, - method_opts = method_opts + transition = transition, + effect = effect ) } -#' @export -get_special_component_details <- function(raw_variable_name, additive_component_chr, env) { +get_special_component_metadata <- function(raw_variable_name, additive_component_chr, xs_opts, xf_opts, env) { - transformed_component <- prepare_call(additive_component_chr) - call_function <- prepare_call(additive_component_chr, FALSE) - component_params <- get_component_params(additive_component_chr, env) + transformed_component <- clear_special_component(additive_component_chr) + call_function <- clear_special_component(additive_component_chr, FALSE) + component_params <- get_component_params(additive_component_chr, xs_opts, xf_opts, env) component_details <- list( var = raw_variable_name, call = additive_component_chr, new_call = transformed_component, call_fun = call_function, - transform_opts = component_params$transform_opts, - method_opts = component_params$method_opts + transition = component_params$transition, + effect = component_params$effect ) component_details } -#' @export -get_special_components_info <- function(formula_details) { - special_predictor_indexes <- c(formula_details$xs_variables_idx, formula_details$xf_variables_idx) - special_predictor_names <- formula_details$predictors[special_predictor_indexes] - special_predictor_additive_components <- formula_details$additive_components[special_predictor_indexes] +collect_specials_metadata <- function(formula_metadata, xs_opts, xf_opts) { + special_predictor_indexes <- c(formula_metadata$xs_variables_idx, formula_metadata$xf_variables_idx) + special_predictor_names <- formula_metadata$predictors[special_predictor_indexes] + special_predictor_additive_components <- formula_metadata$additive_components[special_predictor_indexes] - special_component_details <- purrr::map2( + special_component_metadata <- purrr::map2( special_predictor_names, special_predictor_additive_components, - get_special_component_details, - env = attr(formula_details$formula, ".Environment") + get_special_component_metadata, + xs_opts = xs_opts, xf_opts = xf_opts, + env = attr(formula_metadata$formula, ".Environment") ) - names(special_component_details) <- special_predictor_names - special_component_details + names(special_component_metadata) <- special_predictor_names + special_component_metadata } -#' @export -transform_formula_chr <- function(formula_details, special_components_details) { +transform_formula_chr <- function(formula_metadata, special_components_details) { replace_component_call <- function(rhs_string_formula, component_details) { sub(component_details$call, component_details$new_call, rhs_string_formula, fixed = TRUE) @@ -144,82 +146,271 @@ transform_formula_chr <- function(formula_details, special_components_details) { transformed_rhs <- purrr::reduce( special_components_details, replace_component_call, - .init = formula_details$rhs + .init = formula_metadata$rhs ) - sprintf("%s ~ %s", formula_details$lhs, transformed_rhs) + sprintf("%s ~ %s", formula_metadata$lhs, transformed_rhs) } -#' @export -transformed_formula_object <- function(formula_details, blackbox, data, alter, compare_stat) { - - special_components_details <- get_special_components_info(formula_details) - transformed_formula_calls <- get_common_components_env(formula_details, special_components_details, blackbox, data) +correct_improved_components <- function(special_components_details, transformed_variables) { - xs_env_list <- transformed_formula_calls$xs_env - xs_call <- purrr::map2(xs_env_list, names(xs_env_list), get_xs_call) %>% - purrr::set_names(names(xs_env_list)) - xs <- function(variable) { - var_name <- deparse(substitute(variable)) - xs_call[[var_name]](variable) + use_untransformed <- function(special_component_metadata) { + !(special_component_metadata$var %in% transformed_variables) } - xf_env_list <- transformed_formula_calls$xf_env - xf_call <- purrr::map2(xf_env_list, names(xf_env_list), get_xf_call) %>% - purrr::set_names(names(xf_env_list)) - xf <- function(variable) { - var_name <- deparse(substitute(variable)) - xf_call[[var_name]](variable) + use_bare_call <- function(special_component_metadata) { + special_component_metadata$new_call <- special_component_metadata$var + special_component_metadata } - transformed_formula_env <- attr(formula_details$formula, ".Environment") - transformed_formula_env$xs <- xs - transformed_formula_env$xf <- xf - transformed_formula_env$xs_call <- xs_call - transformed_formula_env$xf_call <- xf_call - transformed_formula_env$xs_env_list <- xs_env_list - transformed_formula_env$xf_env_list <- xf_env_list - transformed_formula_env$response <- formula_details$response - special_components_details <- correct_improved_components( - alter, compare_stat, xs, xf, special_components_details, data, formula_details$lhs) - transformed_formula_string <- transform_formula_chr(formula_details, special_components_details) + special_components_details %>% + purrr::map_if(use_untransformed, use_bare_call) - as.formula(transformed_formula_string, env = transformed_formula_env) } -#' @export -factor_opts_default = list( - method_opts = list(type = "ice"), - transform_opts = list(stat = "GIC", value = 3) -) - -#' @export -numeric_opts_default = list( - method_opts = list(type = "pdp"), - transform_opts = list(k = 6) -) - -build_predictor_component <- function(predictor, class, factor_opts, numeric_opts) { +add_special_to_predictor <- function(predictor, class) { if (!(class %in% c("numeric", "integer", "factor"))) { stop("Wrong class passed.") } if (class == "factor") { - sprintf( - "xf(%s, method_opts = %s, transform_opts = %s)", - predictor, deparse(factor_opts$method_opts), deparse(factor_opts$transform_opts) - ) + sprintf("xf(%s)", predictor) } else { - sprintf( - "xs(%s, method_opts = %s, transform_opts = %s)", - predictor, deparse(numeric_opts$method_opts), deparse(numeric_opts$transform_opts) - ) + sprintf("xs(%s)", predictor) } } -build_formula <- function(response, predictors, classes, factor_opts, numeric_opts) { +build_predictor_based_formula <- function(response, predictors, classes, form = "additive") { + if (form == "additive") { + collapse = " + " + } else { + collapse = " * " + } rhs <- purrr::map2_chr( - predictors, classes, build_predictor_component, factor_opts = factor_opts, numeric_opts = numeric_opts) %>% - paste(collapse = " + ") + predictors, classes, add_special_to_predictor) %>% + paste(collapse = collapse) sprintf("%s ~ %s", response, rhs) } + +get_predictors_classes <- function(data) { + purrr::map_chr(1:ncol(data), function(x) class(data[, x])) +} + +try_get <- function(possible) { + possible_response <- try(possible, silent = TRUE) + if (class(possible_response) != "try-error") { + possible_response + } else { + NULL + } +} + +get_model_data <- function(model, data, env = parent.frame()) { + if (is.null(data)) { + data <- try_get(eval(stats::getCall(model)$data, envir = env)) + } + if (is.null(data)) { + stop("Data must be provided.") + } + data +} + +get_model_response <- function(model, data, response) { + if (is.null(response)) { + response <- try_get(all.vars(model$terms[[2]])) + } + if (!is.null(response)) { + response_in_data <- response %in% colnames(data) + if (!all(response_in_data)) { + stop("Response not found in data") + } + response <- response[response_in_data] + } else { + stop("Cannot extract model lhs") + } + response +} + +get_model_lhs <- function(model, lhs) { + if (is.null(lhs)) { + lhs <- try_get(deparse(model$terms[[2]])) + } + if (is.null(lhs)) { + lhs <- try_get(colnames(model.frame(model))[1]) + } + if (is.null(lhs)) { + stop("Cannot extract model lhs") + } + lhs +} + +get_model_predictors <- function(model, data, predictors, response) { + if (is.null(predictors)) { + predictors <- try_get(all.vars(model$terms[[3]])) + } + if (!is.null(predictors)) { + predictors_in_data <- predictors %in% colnames(data) + if (!all(predictors_in_data)) { + stop("Not all predictors found in data") + } + predictors <- predictors[predictors_in_data] + } else { + predictors <- setdiff(colnames(data), response) + } + predictors +} + +get_model_type <- function(model, data) { + response <- get_model_response(model, data, NULL) + + if (inherits(data[[response]], "factor")) { + type <- "classification" + } else if (inherits(data[[response]], "integer") && (length(unique(data[[response]])) <= 2)) { + type <- "classification" + } else { + type <- "regression" + } + type +} + +get_model_family <- function(model, family, type) { + + model_family <- try_get( + match.fun(eval(stats::getCall(model)$family)$family) + ) + + if (is.null(model_family)) { + if (type == "classification") { + model_family <- match.fun("binomial") + } else { + if (is.character(family)) { + model_family <- match.fun(family) + family_name <- family + } else { + model_family <- match.fun(family$family) + family_name <- family$family + } + message(sprintf("Cannot extract model family. Use %s.", family_name)) + } + } + + model_family +} + +get_model_link <- function(model, link, type) { + model_link <- try_get( + eval(stats::getCall(model)$family)$link + ) + + if (is.null(model_link)) { + if (type == "classification") { + model_link <- "logit" + } else { + if (is.null(model_link)) { + message(sprintf("Cannot extract model link. Use %s.", link)) + model_link <- link + } + } + } + + model_link +} + +get_formula_lhs <- function(formula) { + deparse(formula[[2]], width.cutoff = 500) +} + +get_formula_rhs <- function(formula) { + gsub("\\s+", " ", trimws(paste0(deparse(formula[[3]]), collapse = ""))) +} + +get_formula_predictors <- function(formula, data, predictors, response) { + if (is.null(predictors)) { + predictors <- all.vars(formula[[3]]) + } + if (!is.null(predictors)) { + predictors_in_data <- predictors %in% colnames(data) + if (!all(predictors_in_data)) { + stop("Not all predictors found in data") + } + predictors <- predictors[predictors_in_data] + } else { + predictors <- setdiff(colnames(data), response) + } + predictors +} + +get_formula_response <- function(formula, data, response) { + if (is.null(response)) { + response <- try_get(all.vars(formula[[2]])) + } + if (!is.null(response)) { + response_in_data <- response %in% colnames(data) + if (!all(response_in_data)) { + stop("Response not found in data") + } + response <- response[response_in_data] + } else { + stop("Cannot extract formula lhs") + } + response +} + +add_specials_to_formula <- function(formula_components, data, omit = c("xs", "xf", "+", "*", ":", "~")) { + if (length(formula_components) == 1 && !(as.character(formula_components)[1] %in% omit)) { + if (class(data[[as.character(formula_components)]]) == "factor") { + template <- quote(xf(var)) + } else { + template <- quote(xs(var)) + } + template[[2]] <- formula_components + formula_components <- template + formula_components <- formula_components + } else if (length(formula_components) >= 3) { + if (as.character(formula_components[[1]]) %in% omit[-c(1, 2)]) { + for (i in seq_along(formula_components)) { + formula_components[[i]] <- add_specials_to_formula(formula_components[[i]], data, omit) + } + } + } + formula_components +} + +transformed_formula_object <- function(formula_metadata, blackbox, data, family, xs_opts, xf_opts, compare_stat) { + + special_components_details <- collect_specials_metadata(formula_metadata, xs_opts, xf_opts) + transitions <- get_transitions_outcome(formula_metadata, special_components_details, blackbox, data, + family, compare_stat) + + quantitative_transitions <- transitions$quantitative %>% + purrr::keep(~ !(is.null(.$transition_outcome))) + xs_functions <- purrr::map2(quantitative_transitions, names(quantitative_transitions), build_xs_function) %>% + purrr::set_names(names(quantitative_transitions)) + xs <- function(variable) { + var_name <- deparse(substitute(variable)) + xs_functions[[var_name]](variable) + } + + qualitative_transitions <- transitions$qualitative %>% + purrr::keep(~ !(is.null(.$transition_outcome))) + xf_functions <- purrr::map2(qualitative_transitions, names(qualitative_transitions), build_xf_function) %>% + purrr::set_names(names(qualitative_transitions)) + xf <- function(variable) { + var_name <- deparse(substitute(variable)) + xf_functions[[var_name]](variable) + } + + transformed_formula_env <- attr(formula_metadata$formula, ".Environment") + transformed_formula_env$xs <- xs + transformed_formula_env$xf <- xf + transformed_formula_env$xs_functions <- xs_functions + transformed_formula_env$xf_functions <- xf_functions + transformed_formula_env$quantitative_transitions <- quantitative_transitions + transformed_formula_env$qualitative_transitions <- qualitative_transitions + transformed_formula_env$response <- formula_metadata$response + + variables_to_transform <- c(names(quantitative_transitions), names(qualitative_transitions)) + special_components_details <- correct_improved_components(special_components_details, variables_to_transform) + transformed_formula_string <- transform_formula_chr(formula_metadata, special_components_details) + as.formula(transformed_formula_string, env = transformed_formula_env) +} diff --git a/R/utils-model.R b/R/utils-model.R new file mode 100644 index 0000000..1dff9dd --- /dev/null +++ b/R/utils-model.R @@ -0,0 +1,19 @@ +#' @export +extract_xspliner_function <- function(object, ...) { + UseMethod("extract_xspliner_function", object) +} + +#' @export +extract_xspliner_function.xspliner <- function(model, predictor) { + quantity_transition_function <- environment(model)$xs_functions[[predictor]] + quality_transition_function <- environment(model)$xf_functions[[predictor]] + if (!is.null(quantity_transition_function)) { + return(quantity_transition_function) + } else if (!is.null(quantity_transition_function)) { + return(quantity_transition_function) + } else { + message("Variable is not transformed. Use identity.") + return(function(x) x) + } +} + diff --git a/R/utils.R b/R/utils.R index 27c2bcf..8fbf8a3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,163 +1,9 @@ -get_predictors_classes <- function(data) { - purrr::map_chr(1:ncol(data), function(x) class(data[, x])) -} - -try_get <- function(possible) { - possible_response <- try(possible, silent = TRUE) - if (class(possible_response) != "try-error") { - possible_response - } else { - NULL - } -} - -get_model_data <- function(model, data, env = parent.frame()) { - if (is.null(data)) { - data <- try_get(eval(stats::getCall(model)$data, envir = env)) - } - if (is.null(data)) { - stop("Data must be provided.") - } - data -} - -get_model_response <- function(model, data, response) { - if (is.null(response)) { - response <- try_get(all.vars(model$terms[[2]])) - } - if (!is.null(response)) { - response_in_data <- response %in% colnames(data) - if (!all(response_in_data)) { - stop("Response not found in data") - } - response <- response[response_in_data] - } else { - stop("Cannot extract model lhs") - } - response -} - -get_model_lhs <- function(model, lhs) { - if (is.null(lhs)) { - lhs <- try_get(deparse(model$terms[[2]])) - } - if (is.null(lhs)) { - lhs <- try_get(colnames(model.frame(model))[1]) - } - if (is.null(lhs)) { - stop("Cannot extract model lhs") - } - lhs -} - -get_model_predictors <- function(model, data, predictors, response) { - if (is.null(predictors)) { - predictors <- try_get(all.vars(model$terms[[3]])) - } - if (!is.null(predictors)) { - predictors_in_data <- predictors %in% colnames(data) - if (!all(predictors_in_data)) { - stop("Not all predictors found in data") - } - predictors <- predictors[predictors_in_data] - } else { - predictors <- setdiff(colnames(data), response) - } - predictors -} - -get_model_type <- function(model, data) { - response <- get_model_response(model, data, NULL) - - if (inherits(data[[response]], "factor")) { - type <- "classification" - } else if (inherits(data[[response]], "integer") && (length(unique(data[[response]])) <= 2)) { - type <- "classification" - } else { - type <- "regression" - } - type -} - -get_model_family <- function(model, family, type) { +match_parameters <- function(local, global, default) { + local_parameters <- names(local) + global_parameters <- setdiff(names(global), local_parameters) + default_parameters <- setdiff(setdiff(names(default), global_parameters), local_parameters) - model_family <- try_get( - match.fun(eval(stats::getCall(model)$family)$family) + append(local, + append(global[global_parameters], default[default_parameters]) ) - - if (is.null(model_family)) { - if (type == "classification") { - model_family <- match.fun("binomial") - } else { - if (is.character(family)) { - model_family <- match.fun(family) - family_name <- family - } else { - model_family <- match.fun(family$family) - family_name <- family$family - } - message(sprintf("Cannot extract model family. Use %s.", family_name)) - } - } - - model_family -} - -get_model_link <- function(model, link, type) { - model_link <- try_get( - eval(stats::getCall(model)$family)$link - ) - - if (is.null(model_link)) { - if (type == "classification") { - model_link <- "logit" - } else { - if (is.null(model_link)) { - message(sprintf("Cannot extract model link. Use %s.", link)) - model_link <- link - } - } - } - - model_link -} - -get_formula_lhs <- function(formula) { - deparse(formula[[2]], width.cutoff = 500) -} - -get_formula_rhs <- function(formula) { - gsub("\\s+", " ", trimws(paste0(deparse(formula[[3]]), collapse = ""))) -} - -get_formula_predictors <- function(formula, data, predictors, response) { - if (is.null(predictors)) { - predictors <- all.vars(formula[[3]]) - } - if (!is.null(predictors)) { - predictors_in_data <- predictors %in% colnames(data) - if (!all(predictors_in_data)) { - stop("Not all predictors found in data") - } - predictors <- predictors[predictors_in_data] - } else { - predictors <- setdiff(colnames(data), response) - } - predictors -} - -get_formula_response <- function(formula, data, response) { - if (is.null(response)) { - response <- try_get(all.vars(formula[[2]])) - } - if (!is.null(response)) { - response_in_data <- response %in% colnames(data) - if (!all(response_in_data)) { - stop("Response not found in data") - } - response <- response[response_in_data] - } else { - stop("Cannot extract formula lhs") - } - response } diff --git a/R/xspline.R b/R/xspline.R index ef9580e..8b95d34 100644 --- a/R/xspline.R +++ b/R/xspline.R @@ -12,40 +12,40 @@ xspline <- function(object, ...) { } #' @param model Predictive model. Basic model used for extracting predictors transformation. +#' @param lhs Left-hand side of model formula. Can be transformed response. #' @param response Name of response variable of \code{model}. #' @param predictors Predictor values that should be used in final model. #' @param data Training data of \code{model}. +#' @param form Can be 'additive' (default) or 'multiplicative'. Specifies formula form in final model. #' @param env Environment in which optional variables passed into parameters are stored. -#' @param factor_opts Formula parameters used for factor variable transformatoins inherited from factorMerger package. -#' @param numeric_opts Predictive model response method and approximation parameters used for numeric #' variables transformation. See vignette("xspliner") for details. #' #' @rdname xspline #' @export -xspline.default <- function(model, lhs = NULL, response = NULL, predictors = NULL, data = NULL, env = parent.frame(), - factor_opts = factor_opts_default, numeric_opts = numeric_opts_default, ...) { +xspline.default <- function(model, lhs = NULL, response = NULL, predictors = NULL, data = NULL, + form = "additive", env = parent.frame(), ...) { data <- get_model_data(model, data, env) lhs <- get_model_lhs(model, lhs) predictors <- get_model_predictors(model, data, predictors, get_model_response(model, data, response)) classes <- get_predictors_classes(data[, predictors]) formula <- as.formula( - build_formula(lhs, predictors, classes, factor_opts, numeric_opts), + build_predictor_based_formula(lhs, predictors, classes, form), env = env) build_xspliner(formula, model, data, env = env, ...) } #' @param formula xspliner-specific formula object. Check vignette("xspliner") for more details. -#' @param exact If TRUE, exact formula call is used. If not all formula variables are altered. +#' @param consider One of \code{c("specials", "all")}. If "specials", only components with xs or xf +#' call are considered in transition. #' @rdname xspline #' @export -xspline.formula <- function(formula, model, data = NULL, exact = TRUE, env = parent.frame(), ...) { +xspline.formula <- function(formula, model, data = NULL, consider = "specials", env = parent.frame(), ...) { data <- get_model_data(model, data, env) - formula_lhs <- get_formula_lhs(formula) model_lhs <- get_model_lhs(model, NULL) if (model_lhs != formula_lhs) { - warning("Model and formula lhs's must be the same. Using lhs from model.") + message("Model and formula lhs's must be the same. Using lhs from model.") formula[[2]] <- model$terms[[2]] } @@ -58,11 +58,11 @@ xspline.formula <- function(formula, model, data = NULL, exact = TRUE, env = par if (!(all(formula_predictors %in% model_predictors))) { stop("Not all variables from formula are included in model.") } - if (exact) { + if (consider == "specials") { build_xspliner(formula, model, data, env = env, ...) } else { - lhs <- get_formula_lhs(formula) - xspline.default(model, lhs, NULL, formula_predictors, data, env = env, ...) + formula[[3]] <- add_specials_to_formula(formula[[3]], data) + xspline.formula(formula, model, data = NULL, consider = "specials", env = env, ...) } } } @@ -79,45 +79,78 @@ xspline.explainer <- function(explainer, env = parent.frame(), ...) { #' @param formula xspliner-specific formula object. Check vignette("xspliner") for more details. #' @param model Predictive model. Basic model used for extracting predictors transformation. #' @param data Training data of \code{model}. +#' @param xf_opts Formula parameters used for factor variable transformatoins inherited from factorMerger package. +#' @param xs_opts Predictive model response method and approximation parameters used for quantitative. #' @param link Link function that should be used in final model. The passed is used when cannot be extracted from #' model. By default 'identity'. See \link{stats::family} for possibilities. #' @param family Family of response variable that should be used in final model. The passed is used when cannot #' be extracted from model. By default 'gaussian'. See \link{stats::family} for possibilities. #' @param env Environment in which optional variables passed into parameters are stored. -#' @param alter Specifies if each variable transformation should be compared with bare variable usage. -#' List of the form: list(numeric = type, factor = type), type one of c('always', 'auto', 'never). -#' 'auto' option available only for qualitative variable. The better model is specified by \code{compare_stat} parameter then. #' @param compare_stat Function of linear model (lm function output). Statistic that measures if linear model is better that transformed one. #' See \link{stats}. #' -build_xspliner <- function(formula, model, data, link = "identity", family = "gaussian", env = parent.frame(), - alter = list(numeric = 'always', factor = 'never'), compare_stat = r_squared) { +build_xspliner <- function(formula, model, data, xf_opts = xf_opts_default, xs_opts = xs_opts_default, + link = "identity", family = "gaussian", env = parent.frame(), compare_stat = aic) { formula_environment <- new.env(parent = env) attr(formula, ".Environment") <- formula_environment - formula_details <- get_formula_details(formula, extract_formula_var_names(formula, data)) - cleared_formula <- transformed_formula_object(formula_details, model, data, alter, compare_stat) + formula_metadata <- get_formula_metadata(formula, extract_formula_var_names(formula, data)) type <- get_model_type(model, data) model_family <- get_model_family(model, family, type) model_link <- get_model_link(model, link, type) - gam_model <- glm(cleared_formula, data = data, family = model_family(link = model_link)) - gam_model + family <- model_family(link = model_link) + cleared_formula <- transformed_formula_object(formula_metadata, model, data, family, xs_opts, xf_opts, compare_stat) + glm_model <- glm(cleared_formula, data = data, family = family) + environment(glm_model) <- attr(cleared_formula, ".Environment") + class(glm_model) <- c("xspliner", class(glm_model)) + glm_model } +#' Default parameters for transition methods +#' +#' While constructing formula interpreted by xspliner package, some parameters may be specified within xs(..) or xf(..) symbols. +#' Below are default parameters. See details in \code{vignette("xspliner")} +#' +#' @export +xf_opts_default = list( + effect = list(type = "fM"), + transition = list(alter = "newer", stat = "GIC", value = 3) +) + +#' @rdname xf_opts_default +#' @export +xs_opts_default = list( + effect = list(type = "pdp"), + transition = list(alter = "always", monotonic = "not") +) + #' Statistics used for better linear model selection #' #' Used as \code{compare_stat} parameter in \code{xspline} method. #' Each function has attribute "higher-better". #' If "higher-better" is TRUE then model with higher statistic value is treated as better one. #' -#' @param lm_model Linear model - \code{lm} function output. +#' @param glm_model Linear model - \code{glm} function output. +#' @param family Family used for fitting the model. #' @name stats NULL -#' Calculate R Squared for linear model. +#' Calculate AIC for glm model. #' #' @rdname stats #' @export -r_squared <- function(lm_model) { - summary(lm_model)$r.squared +aic <- function(glm_model) { + summary(glm_model)$aic +} +attr(aic, "higher-better") <- FALSE + +#' Calculate Hosmer-Lemeshow Goodness of Fit for glm model. +#' +#' @rdname stats +#' @export +hoslem <- function(glm_model) { + if (glm_model$family$family != "binomial") { + stop("Not classification model.") + } + ResourceSelection::hoslem.test(glm_model$model[, 1], fitted(glm_model))$statistic } -attr(r_squared, "higher-better") <- TRUE +attr(hoslem, "higher-better") <- TRUE diff --git a/docs/articles/xspliner.html b/docs/articles/xspliner.html index b399ebb..cfbb36a 100644 --- a/docs/articles/xspliner.html +++ b/docs/articles/xspliner.html @@ -61,7 +61,7 @@
xspliner.Rmd
method_opts = list(
+effect = list(
type = <method_type> # "pdp" or "ale",
... # named list - other parameters passed for chosen method
)
@@ -170,8 +170,8 @@
Specifying spline approximation parameters
Response function is approximated with mgcv::gam
package and mgcv::s
smoothing function.
xspliner
allows using all smoothing methods provided by mgcv::s
. You can pass corresponding parameters in
-transform_opts = <mgcv::s parameters> # named list
-Remark One of special parameters passed for transform_opts
is increasing
. When the parameter occurs then:
+transition = <mgcv::s parameters> # named list
+Remark One of special parameters passed for transition
is increasing
. When the parameter occurs then:
- for
TRUE
value, approximation will be increasing
- for
FALSE
value, approximation will be decreasing
@@ -185,12 +185,12 @@
yhat
- response function values on points specified in the first column
As we want to store 100 values for response function data, we specify in formula:
-cmedv ~ rm + lstat + xs(nox, method_opts = list(type = "pdp", grid.resolution = 100))
+cmedv ~ rm + lstat + xs(nox, effect = list(type = "pdp", grid.resolution = 100))
We also want to approximate response function with cubic splines and basis dimension equal to 10. As we can see in mgcv::s
documentation, we need to set: k = 10
and bs = "cr"
.
So we get the final formula:
cmedv ~ rm + lstat + xs(nox,
- method_opts = list(type = "pdp", grid.resolution = 100),
- transform_opts = list(k = 10, bs = "cr"))
+ effect = list(type = "pdp", grid.resolution = 100),
+ transition = list(k = 10, bs = "cr"))
xp_model <- xspline(
cmedv ~ rm + lstat +
xs(nox,
- method_opts = list(type = "pdp", grid.resolution = 100),
- transform_opts = list(k = 10, bs = "cr")),
+ effect = list(type = "pdp", grid.resolution = 100),
+ transition = list(k = 10, bs = "cr")),
model = boston_rf
-)
Lets check the model summary:
summary(xp_model)
#>
-#> Family: gaussian
-#> Link function: identity
+#> Call:
+#> glm(formula = cleared_formula, family = family, data = data)
#>
-#> Formula:
-#> cmedv ~ rm + lstat + xs(nox)
-#> <environment: 0x556a5185c8b0>
+#> Deviance Residuals:
+#> Min 1Q Median 3Q Max
+#> -13.3754 -3.3141 -0.9339 2.0499 27.1722
#>
-#> Parametric coefficients:
+#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -32.40688 4.84433 -6.690 5.98e-11 ***
#> rm 5.26616 0.41581 12.665 < 2e-16 ***
@@ -227,9 +229,13 @@
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
-#>
-#> R-sq.(adj) = 0.682 Deviance explained = 68.4%
-#> GCV = 26.984 Scale est. = 26.771 n = 506
As you can see in the summary, the final formula is simplified version of input one. All the information about the response functions are stored in xp_model
environment.
Linear approximation was better for ptratio
response function.
It aproximates data with monotonic spline function by fitting GAM model.
-approx_with_monotonic_spline(bb_response_data, response_var, pred_var, - env = parent.frame(), increasing, ...)+
approx_with_monotonic_spline(effect_data, response, predictor, + env = parent.frame(), monotonic, ...)
response_var | -Name of response value from bb_response_data. |
+ response | +Name of response value from effect_data. |
---|---|---|---|
pred_var | -Name of predictor value from bb_response_data. |
+ predictor | +Name of predictor value from effect_data. |
env | Formula environment that should be used for fitting gam model. |
||
increasing | -If TRUE, spline approximation is increasing, if FALSE decreasing. |
+ monotonic | +Possible options "up", "down" and "auto. If up the spline is incresing, when down decreasing. +For auto the better one (based on r.squared statistic) is chosen. |
... | Other arguments passed to mgcv::s function. |
||
bb_response_data. | +effect_data. | Blackbox response data, for example pdp curve. |
+approx_with_monotonic_spline(data.frame(x = x, y = y), "y", "x", env, "up")x <- sort(rnorm(20, 5, 5)) y <- rnorm(20, 2, 2) env <- new.env() -approx_with_monotonic_spline(data.frame(x = x, y = y), "y", "x", env, TRUE)#> Error in approx_with_monotonic_spline(data.frame(x = x, y = y), "y", "x", env, TRUE): could not find function "approx_with_monotonic_spline"