From cb5a541bb303f15b2b70774adf77578022d7c163 Mon Sep 17 00:00:00 2001 From: Alexander Kowarik Date: Thu, 9 Nov 2023 09:14:25 +0100 Subject: [PATCH] small changes to make the xgboostImpute version closer to CRAN readiness --- DESCRIPTION | 1 + NAMESPACE | 1 + R/xgboostImpute.R | 32 ++++++++++++++++++++---------- inst/tinytest/test_matchImpute.R | 4 ++++ inst/tinytest/test_xgboostImpute.R | 14 ++++++++++--- 5 files changed, 38 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c23af1f..6b68f15 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,6 +26,7 @@ Imports: laeken, ranger, MASS, + xgboost, data.table(>= 1.9.4) Suggests: dplyr, diff --git a/NAMESPACE b/NAMESPACE index 539f0c1..c3182e8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,6 +50,7 @@ export(scattMiss) export(scattmatrixMiss) export(spineMiss) export(tableMiss) +export(xgboostImpute) import(Rcpp) import(colorspace) import(data.table) diff --git a/R/xgboostImpute.R b/R/xgboostImpute.R index ff0ed04..b2923a0 100644 --- a/R/xgboostImpute.R +++ b/R/xgboostImpute.R @@ -6,10 +6,14 @@ #' @param imp_var `TRUE`/`FALSE` if a `TRUE`/`FALSE` variables for each imputed #' variable should be created show the imputation status #' @param imp_suffix suffix used for TF imputation variables -#' @param ... Arguments passed to [xgboost::xgboost()] #' @param verbose Show the number of observations used for training #' and evaluating the RF-Model. This parameter is also passed down to #' [xgboost::xgboost()] to show computation status. +#' @param ... Arguments passed to [xgboost::xgboost()] +#' @param nrounds max number of boosting iterations, +#' argument passed to [xgboost::xgboost()] +#' @param objective objective for xgboost, +#' argument passed to [xgboost::xgboost()] #' @return the imputed data set. #' @family imputation methods #' @examples @@ -63,21 +67,25 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, num_class <- max(labtmp)+1 } - }else if(inherits(labtmp,"numeric")){ - currentClass <- "numeric" + }else if(inherits(labtmp,"integer")){ + currentClass <- "integer" if(length(unique(labtmp))==2){ - warning("binary factor detected but not probably stored as factor.") + lvlsInt <- unique(labtmp) + labtmp <- match(labtmp,lvlsInt)-1 + warning("binary factor detected but not probproperlyably stored as factor.") objective <- "binary:logistic" }else{ - objective <- "reg:squarederror" + objective <- "count:poisson"## Todo: this might not be wise as default } - }else if(inherits(labtmp,"integer")){ - currentClass <- "integer" + }else if(inherits(labtmp,"numeric")){ + currentClass <- "numeric" if(length(unique(labtmp))==2){ - warning("binary factor detected but not probably stored as factor.") + lvlsInt <- unique(labtmp) + labtmp <- match(labtmp,lvlsInt)-1 + warning("binary factor detected but not properly stored as factor.") objective <- "binary:logistic" }else{ - objective <- "count:poisson"## Todo: this might not be wise as default + objective <- "reg:squarederror" } } @@ -85,10 +93,10 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, mm <- model.matrix(form,dattmp) if(!is.null(num_class)){ mod <- xgboost::xgboost(data = mm, label = labtmp, - nrounds=nrounds, objective=objective, num_class = num_class, verbose = FALSE,...) + nrounds=nrounds, objective=objective, num_class = num_class, verbose = verbose,...) }else{ mod <- xgboost::xgboost(data = mm, label = labtmp, - nrounds=nrounds, objective=objective, verbose = FALSE,...) + nrounds=nrounds, objective=objective, verbose = verbose,...) } if (verbose) @@ -101,6 +109,8 @@ xgboostImpute <- function(formula, data, imp_var = TRUE, }else{ data[!rhs_na & lhs_na, lhsV] <- levels(dattmp[,lhsV])[predictions+1] } + }else if(currentClass%in%c("numeric","integer")&objective=="binary:logistic"){ + data[!rhs_na & lhs_na, lhsV] <- lvlsInt[as.numeric(predictions>.5)+1] }else{ data[!rhs_na & lhs_na, lhsV] <- predictions } diff --git a/inst/tinytest/test_matchImpute.R b/inst/tinytest/test_matchImpute.R index c396481..d33230e 100644 --- a/inst/tinytest/test_matchImpute.R +++ b/inst/tinytest/test_matchImpute.R @@ -1,5 +1,9 @@ library(VIM) message("matchImpute general") +setna <- function(d,i,col=2){ + d[i,col] <- NA + d +} d <- data.frame(x=LETTERS[1:6],y=as.double(1:6),z=as.double(1:6), w=ordered(LETTERS[1:6]), stringsAsFactors = FALSE) dorig <- rbind(d,d) diff --git a/inst/tinytest/test_xgboostImpute.R b/inst/tinytest/test_xgboostImpute.R index c51a6f9..8aafba9 100644 --- a/inst/tinytest/test_xgboostImpute.R +++ b/inst/tinytest/test_xgboostImpute.R @@ -13,7 +13,8 @@ max_dist <- function(x, y) { df$y[1:3] <- NA df$fac[3:5] <- NA - +df$binNum <- as.integer(df$fac)+17 +df$binInt <- as.integer(df$fac)+17L # xgboostImpute accuracy", { df.out <- xgboostImpute(y ~ x, df) expect_true( @@ -28,10 +29,17 @@ df$fac[3:5] <- NA # factor response predicted accurately", { df.out <- xgboostImpute(fac ~ x, df) - df.out[df.out$fac_imp,] expect_identical(df.out$fac, as.factor(df$x >= 0)) # - + + # interger binary response predicted accurately", { + expect_warning(df.out <- xgboostImpute(binInt ~ x, df)) + expect_identical(df.out$binInt==19, df$x >= 0) + # + # numeric binary response predicted accurately", { + expect_warning(df.out <- xgboostImpute(binNum ~ x, df)) + expect_identical(df.out$binNum==19, df$x >= 0) + # # factor regressor used reasonably", { df2 <- df df2$x[1:10] <- NA