Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/statistikat/VIM
Browse files Browse the repository at this point in the history
  • Loading branch information
matthias-da committed Nov 10, 2023
2 parents 2f57943 + 0542de0 commit c6214f0
Show file tree
Hide file tree
Showing 21 changed files with 351 additions and 19 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ jobs:
- {os: windows-latest, r: '3.6'}

# Use older ubuntu to maximise backward compatibility
- {os: ubuntu-18.04, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-18.04, r: 'release'}
- {os: ubuntu-18.04, r: 'oldrel-1'}
- {os: ubuntu-20.04, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-20.04, r: 'release'}
- {os: ubuntu-20.04, r: 'oldrel-1'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
Expand Down
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Imports:
laeken,
ranger,
MASS,
xgboost,
data.table(>= 1.9.4)
Suggests:
dplyr,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ export(scattMiss)
export(scattmatrixMiss)
export(spineMiss)
export(tableMiss)
export(xgboostImpute)
import(Rcpp)
import(colorspace)
import(data.table)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# VIM 6.x.x
- fix infinite loop in matchImpute in case all observations of a variable are missing

# VIM 6.2.3
- default robust regression method for irmi for numeric variables changes from rlm to lmrob.

Expand Down
14 changes: 14 additions & 0 deletions R/gowerD.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@ gowerD <- function(data.x, data.y = data.x,
returnMin=FALSE,
methodStand="range") {
stopifnot(length(methodStand)==1&&methodStand%in%c("range", "iqr"))
if(length(levOrders)>0){
stopifnot(length(levOrders)==length(orders))
levOrdersUniqueX <- sapply(orders,function(x)length(unique(data.x[[x]])))
if(any(levOrdersUniqueX!=levOrders)){
warning("The number of unique values in the ordinal variables in data.x
does not correspond to the values given in levOrders")
}
levOrdersUniqueY <- sapply(orders,function(x)length(unique(data.x[[x]])))
if(any(levOrdersUniqueY!=levOrders)){
warning("The number of unique values in the ordinal variables in data.y
does not correspond to the values given in levOrders")
}
}
maxplus1 <- function(x){
if(all(is.na(x)))
return(1)
Expand Down Expand Up @@ -86,6 +99,7 @@ gowerD <- function(data.x, data.y = data.x,
2,min0,na.rm=TRUE)
rmax <- apply(rbind(data.x[,numerical,drop=FALSE],
data.y[,numerical,drop=FALSE]),2,max1,na.rm=TRUE)
print(c(rmin,rmax))
}else if(methodStand == "iqr"){
rmin <- apply(rbind(data.x[,numerical,drop=FALSE],data.y[,numerical,drop=FALSE]),2,quantile,na.rm=TRUE,
probs=.25)
Expand Down
6 changes: 6 additions & 0 deletions R/matchImpute.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,14 @@ matchImpute <- function(data,variable=colnames(data)[!colnames(data)%in%match_va
data <- as.data.table(data)
else
data <- data.table::copy(data)
tfna <- data[,sapply(lapply(.SD,is.na),all),.SDcols=variable]
if(any(tfna)){
stop(paste0(variable[tfna],collapse=", ")," ", ifelse(sum(tfna)>1,"are","is")," completely missing")

}
na_present <- data[,sum(sapply(lapply(.SD,is.na),sum)),.SDcols=variable]


if(imp_var){
data[,paste(variable,imp_suffix,sep="_"):=lapply(.SD,is.na),.SDcols=variable]
}
Expand Down
34 changes: 28 additions & 6 deletions R/rangerImpute.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,22 +31,44 @@ rangerImpute <- function(formula, data, imp_var = TRUE,
for (lhsV in lhs) {
form <- as.formula(paste(lhsV, "~", rhs))
lhs_vector <- data[[lhsV]]
lhs_isfactor <- inherits(lhs_vector, "factor")

if (!any(is.na(lhs_vector))) {
cat(paste0("No missings in ", lhsV, ".\n"))
} else {
lhs_na <- is.na(lhs_vector)
if (verbose)
message("Training model for ", lhsV, " on ", sum(!rhs_na & !lhs_na), " observations")
mod <- ranger::ranger(form, subset(data, !rhs_na & !lhs_na), ..., verbose = verbose)

if(lhs_isfactor){
mod <- ranger::ranger(form, subset(data, !rhs_na & !lhs_na), probability = TRUE, ..., verbose = verbose)
}else{
mod <- ranger::ranger(form, subset(data, !rhs_na & !lhs_na), ..., verbose = verbose)
}

if (verbose)
message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations")
if (median & inherits(lhs_vector, "numeric")) {
predictions <- apply(
predict(mod, subset(data, !rhs_na & lhs_na), predict.all = TRUE)$predictions,
1, median)
} else {

if(lhs_isfactor){
predictions <- predict(mod, subset(data, !rhs_na & lhs_na))$predictions
predict_levels <- colnames(predictions)

predictions <- apply(predictions,1,function(z,lev){
z <- cumsum(z)
z_lev <- lev[z>runif(1)]
return(z_lev[1])
},lev=predict_levels)

}else{
if (median & inherits(lhs_vector, "numeric")) {
predictions <- apply(
predict(mod, subset(data, !rhs_na & lhs_na), predict.all = TRUE)$predictions,
1, median)
} else {
predictions <- predict(mod, subset(data, !rhs_na & lhs_na))$predictions
}
}

data[!rhs_na & lhs_na, lhsV] <- predictions
}

Expand Down
132 changes: 132 additions & 0 deletions R/xgboostImpute.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#' Xgboost Imputation
#'
#' Impute missing values based on a random forest model using [xgboost::xgboost()]
#' @param formula model formula for the imputation
#' @param data A `data.frame` containing the data
#' @param imp_var `TRUE`/`FALSE` if a `TRUE`/`FALSE` variables for each imputed
#' variable should be created show the imputation status
#' @param imp_suffix suffix used for TF imputation variables
#' @param verbose Show the number of observations used for training
#' and evaluating the RF-Model. This parameter is also passed down to
#' [xgboost::xgboost()] to show computation status.
#' @param ... Arguments passed to [xgboost::xgboost()]
#' @param nrounds max number of boosting iterations,
#' argument passed to [xgboost::xgboost()]
#' @param objective objective for xgboost,
#' argument passed to [xgboost::xgboost()]
#' @return the imputed data set.
#' @family imputation methods
#' @examples
#' data(sleep)
#' xgboostImpute(Dream~BodyWgt+BrainWgt,data=sleep)
#' xgboostImpute(Dream+NonD~BodyWgt+BrainWgt,data=sleep)
#' xgboostImpute(Dream+NonD+Gest~BodyWgt+BrainWgt,data=sleep)
#'
#' sleepx <- sleep
#' sleepx$Pred <- as.factor(LETTERS[sleepx$Pred])
#' sleepx$Pred[1] <- NA
#' xgboostImpute(Pred~BodyWgt+BrainWgt,data=sleepx)
#' @export
xgboostImpute <- function(formula, data, imp_var = TRUE,
imp_suffix = "imp", verbose = FALSE,
nrounds=100, objective=NULL,
...){
check_data(data)
formchar <- as.character(formula)
lhs <- gsub(" ", "", strsplit(formchar[2], "\\+")[[1]])
rhs <- formchar[3]
rhs2 <- gsub(" ", "", strsplit(rhs, "\\+")[[1]])
#Missings in RHS variables
rhs_na <- apply(subset(data, select = rhs2), 1, function(x) any(is.na(x)))
#objective should be a vector of lenght equal to the lhs variables
if(!is.null(objective)){
stopifnot(length(objective)!=length(lhs))
}
for (lhsV in lhs) {
form <- as.formula(paste(lhsV, "~", rhs,"-1"))
# formula without left side for prediction
formPred <- as.formula(paste( "~", rhs,"-1"))
lhs_vector <- data[[lhsV]]
num_class <- NULL
if (!any(is.na(lhs_vector))) {
cat(paste0("No missings in ", lhsV, ".\n"))
} else {
lhs_na <- is.na(lhs_vector)
if (verbose)
message("Training model for ", lhsV, " on ", sum(!rhs_na & !lhs_na), " observations")
dattmp <- subset(data, !rhs_na & !lhs_na)
labtmp <- dattmp[[lhsV]]
currentClass <- NULL
if(inherits(labtmp,"factor")){
currentClass <- "factor"
labtmp <- as.integer(labtmp)-1
if(length(unique(labtmp))==2){
objective <- "binary:logistic"
}else if(length(unique(labtmp))>2){
objective <- "multi:softmax"
num_class <- max(labtmp)+1
}

}else if(inherits(labtmp,"integer")){
currentClass <- "integer"
if(length(unique(labtmp))==2){
lvlsInt <- unique(labtmp)
labtmp <- match(labtmp,lvlsInt)-1
warning("binary factor detected but not probproperlyably stored as factor.")
objective <- "binary:logistic"
}else{
objective <- "count:poisson"## Todo: this might not be wise as default
}
}else if(inherits(labtmp,"numeric")){
currentClass <- "numeric"
if(length(unique(labtmp))==2){
lvlsInt <- unique(labtmp)
labtmp <- match(labtmp,lvlsInt)-1
warning("binary factor detected but not properly stored as factor.")
objective <- "binary:logistic"
}else{
objective <- "reg:squarederror"
}
}


mm <- model.matrix(form,dattmp)
if(!is.null(num_class)){
mod <- xgboost::xgboost(data = mm, label = labtmp,
nrounds=nrounds, objective=objective, num_class = num_class, verbose = verbose,...)
}else{
mod <- xgboost::xgboost(data = mm, label = labtmp,
nrounds=nrounds, objective=objective, verbose = verbose,...)
}

if (verbose)
message("Evaluating model for ", lhsV, " on ", sum(!rhs_na & lhs_na), " observations")
predictions <-
predict(mod, model.matrix(formPred,subset(data, !rhs_na & lhs_na)))
if(currentClass=="factor"){
if(is.null(num_class)){
data[!rhs_na & lhs_na, lhsV] <- levels(dattmp[,lhsV])[as.numeric(predictions>.5)+1]
}else{
data[!rhs_na & lhs_na, lhsV] <- levels(dattmp[,lhsV])[predictions+1]
}
}else if(currentClass%in%c("numeric","integer")&objective=="binary:logistic"){
data[!rhs_na & lhs_na, lhsV] <- lvlsInt[as.numeric(predictions>.5)+1]
}else{
data[!rhs_na & lhs_na, lhsV] <- predictions
}

}

if (imp_var) {
if (imp_var %in% colnames(data)) {
data[, paste(lhsV, "_", imp_suffix, sep = "")] <- as.logical(data[, paste(lhsV, "_", imp_suffix, sep = "")])
warning(paste("The following TRUE/FALSE imputation status variables will be updated:",
paste(lhsV, "_", imp_suffix, sep = "")))
} else {
data$NEWIMPTFVARIABLE <- is.na(lhs_vector)
colnames(data)[ncol(data)] <- paste(lhsV, "_", imp_suffix, sep = "")
}
}
}
data
}
26 changes: 26 additions & 0 deletions inst/tinytest/test_matchImpute.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
library(VIM)
message("matchImpute general")
setna <- function(d,i,col=2){
d[i,col] <- NA
d
}
d <- data.frame(x=LETTERS[1:6],y=as.double(1:6),z=as.double(1:6),
w=ordered(LETTERS[1:6]), stringsAsFactors = FALSE)
dorig <- rbind(d,d)
# minimal example with one match var
d1 <- matchImpute(setna(dorig,7:12,1)[,1:2],match_var = "y", variable="x")
expect_identical(d1$x[d1$x_imp],d1$x[!d1$x_imp])

d1b <- matchImpute(setna(dorig,7:12,1)[,1:2],match_var = "y", variable="x", imp_var = FALSE)
expect_identical(d1b$x[d1$x_imp],d1b$x[!d1$x_imp])
expect_false("x_imp" %in% colnames(d1b))
expect_true("x_imp" %in% colnames(d1))


# all missing in x -> error
expect_error(matchImpute(setna(dorig,1:12,1)[,1:2],match_var = "y", variable="x"))


# example with two match vars
d1 <- matchImpute(setna(dorig,7:12,1)[,1:3],match_var = c("y","z"), variable="x")
expect_identical(d1$x[d1$x_imp],d1$x[!d1$x_imp])
49 changes: 49 additions & 0 deletions inst/tinytest/test_xgboostImpute.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
library(VIM)
set.seed(104)
x <- rnorm(100)
df <- data.frame(
y = x + rnorm(100, sd = .01),
x = x,
fac = as.factor(x >= 0)
)

max_dist <- function(x, y) {
max(abs(x - y))
}

df$y[1:3] <- NA
df$fac[3:5] <- NA
df$binNum <- as.integer(df$fac)+17
df$binInt <- as.integer(df$fac)+17L
# xgboostImpute accuracy", {
df.out <- xgboostImpute(y ~ x, df)
expect_true(
max_dist(df.out$y, df$x)<
0.06
)

# xgboostImpute should do nothing for no missings", {
df.out <- xgboostImpute(x ~ y, df)
expect_identical(df.out$x, df$x)
#

# factor response predicted accurately", {
df.out <- xgboostImpute(fac ~ x, df)
expect_identical(df.out$fac, as.factor(df$x >= 0))
#

# interger binary response predicted accurately", {
expect_warning(df.out <- xgboostImpute(binInt ~ x, df))
expect_identical(df.out$binInt==19, df$x >= 0)
#
# numeric binary response predicted accurately", {
expect_warning(df.out <- xgboostImpute(binNum ~ x, df))
expect_identical(df.out$binNum==19, df$x >= 0)
#
# factor regressor used reasonably", {
df2 <- df
df2$x[1:10] <- NA
df.out <- xgboostImpute(x ~ fac, df2)
expect_identical(as.factor(df.out$x >= 0), df$fac)
#

3 changes: 2 additions & 1 deletion man/hotdeck.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/impPCA.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/irmi.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/kNN.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/matchImpute.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/medianSamp.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit c6214f0

Please sign in to comment.