mlr-org · mb706 · Aug 24, 2024 · Aug 23, 2024
diff --git a/R/PipeOpImpute.R b/R/PipeOpImpute.R
@@ -56,6 +56,9 @@
 #' * `model` :: named `list`\cr
 #'   Model used for imputation. This is a list named by [`Task`][mlr3::Task] features, containing the result of the `private$.train_imputer()` or
 #'   `private$.train_nullmodel()` function for each one.
+#' * `imputed_train` :: `character`\cr
+#'   Names of features that were imputed during training. This is used to ensure that factor levels that were added during training are also added during prediction.
+#'   Note that features that contain missing values during prediction but were not imputed during training can still end up with inconsistent factor levels.
 #'
 #' @section Parameters:
 #' * `affect_columns` :: `function` | [`Selector`] | `NULL` \cr
@@ -195,6 +198,8 @@ PipeOpImpute = R6Class("PipeOpImpute",
 
       self$state$outtasklayout = copy(intask$feature_types)
 
+      self$state$imputed_train = names(imputanda)
+
       if (!is.null(intask$internal_valid_task)) {
         intask$internal_valid_task = private$.predict(list(intask$internal_valid_task))[[1L]]
       }
@@ -215,7 +220,10 @@ PipeOpImpute = R6Class("PipeOpImpute",
       ..col = NULL  # avoid static checker complaints
 
       imputanda = intask$data(cols = self$state$affected_cols)
-      imputanda = imputanda[, map_lgl(imputanda, function(x) anyMissing(x)), with = FALSE]
+      imputanda = imputanda[,
+        colnames(imputanda) %in% self$state$imputed_train |
+          map_lgl(imputanda, function(x) anyMissing(x)),
+        with = FALSE]
 
       imap(imputanda, function(col, colname) {
         type = intask$feature_types[colname, get("type")]
@@ -257,11 +265,14 @@ PipeOpImpute = R6Class("PipeOpImpute",
         # in some edge cases there may be levels during training that are missing during predict.
         levels(feature) = c(levels(feature), as.character(model))
       }
+      nas = which(is.na(feature))
+      if (!length(nas)) return(feature)
+
       if (length(model) == 1) {
-        feature[is.na(feature)] = model
+        feature[nas] = model
       } else {
         outlen = count_missing(feature)
-        feature[is.na(feature)] = sample(model, outlen, replace = TRUE, prob = attr(model, "probabilities"))
+        feature[nas] = sample(model, outlen, replace = TRUE, prob = attr(model, "probabilities"))
       }
       feature
     }

diff --git a/R/PipeOpImputeLearner.R b/R/PipeOpImputeLearner.R
@@ -153,6 +153,8 @@ PipeOpImputeLearner = R6Class("PipeOpImputeLearner",
     },
 
     .impute = function(feature, type, model, context) {
+      nas = which(is.na(feature))
+      if (!length(nas)) return(feature)
       if (is.atomic(model)) {  # handle nullmodel, making use of the fact that `Learner$state` is always a list
         return(super$.impute(feature, type, model, context))
       }
@@ -162,7 +164,7 @@ PipeOpImputeLearner = R6Class("PipeOpImputeLearner",
 
       # Use the trained learner to perform the imputation
       task = private$.create_imputation_task(feature, context)
-      pred = private$.learner$predict(task, which(is.na(feature)))
+      pred = private$.learner$predict(task, nas)
 
       # Replace the missing values with imputed values of the correct format
       imp_vals = private$.convert_to_type(pred$response, type)
@@ -172,7 +174,7 @@ PipeOpImputeLearner = R6Class("PipeOpImputeLearner",
         levels(feature) = c(levels(feature), as.character(type))
       }
 
-      feature[is.na(feature)] = imp_vals
+      feature[nas] = imp_vals
       feature
     },
 
@@ -187,7 +189,11 @@ PipeOpImputeLearner = R6Class("PipeOpImputeLearner",
       if (is.numeric(feature)) {
         feature
       } else {
-        factor(feature, ordered = FALSE)
+        if (!is.null(levels(feature))) {
+          factor(feature, levels = levels(feature), ordered = FALSE)
+        } else {
+          factor(feature, ordered = FALSE)
+        }
       }
     },
 

diff --git a/tests/testthat/test_pipeop_impute.R b/tests/testthat/test_pipeop_impute.R
@@ -414,3 +414,21 @@ test_that("impute, test rows and affect_columns", {
   outpredict = po_impute$predict(list(task$internal_valid_task))[[1L]]
   expect_true(isTRUE(all.equal(outtrain$internal_valid_task$data(), outpredict$data())))
 })
+
+test_that("imputeoor keeps missing level even if no missing data in predict task", {  # train on rows with an NA, predict on rows without one, compare factor levels
+  task = as_task_classif(data.table(target = factor(c("a", "b", "a", "b", "a")), x = factor(c("a", "b", NA, "b", "a"))), target = "target", id = "testtask")  # the only NA in `x` is in row 3
+
+  task_train = task$clone(deep = TRUE)$filter(1:3)  # training rows include the NA in `x`
+  poi = po("imputeoor")
+  expect_identical(
+    poi$train(list(task_train))[[1L]]$data(),
+    data.table(target = factor(c("a", "b", "a")), x = factor(c("a", "b", ".MISSING"), levels = c("a", "b", ".MISSING")))  # the NA is expected to become the additional `.MISSING` level
+  )
+
+  task_predict = task$clone(deep = TRUE)$filter(4:5)  # prediction rows contain no NA in `x`
+  expect_identical(
+    poi$predict(list(task_predict))[[1L]]$data(),
+    data.table(target = factor(c("b", "a")), x = factor(c("b", "a"), levels = c("a", "b", ".MISSING")))  # no value is replaced here, but the `.MISSING` level seen after training must still be present
+  )
+
+})