diff --git a/DESCRIPTION b/DESCRIPTION
index 4a3c8165..1a1eccaa 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -127,6 +127,7 @@ Collate:
     'PipeOpTorchReshape.R'
     'PipeOpTorchSoftmax.R'
     'Select.R'
+    'TaskClassif_cifar.R'
     'TaskClassif_lazy_iris.R'
     'TaskClassif_melanoma.R'
     'TaskClassif_mnist.R'
diff --git a/R/TaskClassif_cifar.R b/R/TaskClassif_cifar.R
new file mode 100644
index 00000000..9f95d2b4
--- /dev/null
+++ b/R/TaskClassif_cifar.R
@@ -0,0 +1,190 @@
+#' @title CIFAR Classification Tasks
+#'
+#' @name mlr_tasks_cifar
+#'
+#' @format [R6::R6Class] inheriting from [mlr3::TaskClassif].
+#' @include aaa.R
+#'
+#' @description
+#' The CIFAR-10 and CIFAR-100 datasets. A subset of the 80 million tiny images dataset
+#' with noisy labels was supplied to student labelers, who were asked to filter out
+#' incorrectly labeled images.
+#'
+#' CIFAR-10 contains 10 classes. CIFAR-100 contains 100 classes, which may be partitioned into 20 superclasses of 5 classes each.
+#' The CIFAR-10 and CIFAR-100 classes are mutually exclusive.
+#' See Chapter 3.1 of [the technical report](https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf) for more details.
+#'
+#' The data is obtained from [`torchvision::cifar10_dataset()`] (or `torchvision::cifar100_dataset()`).
+#'
+#' @section Construction:
+#' ```
+#' tsk("cifar10")
+#' tsk("cifar100")
+#' ```
+#'
+#' @template task_download
+#'
+#' @section Properties:
+#' `r rd_info_task_torch("cifar10", missings = FALSE)`
+#'
+#' @references
+#' `r format_bib("cifar2009")`
+#' @examples
+#' task_cifar10 = tsk("cifar10")
+#' task_cifar100 = tsk("cifar100")
+#' print(task_cifar10)
+#' print(task_cifar100)
+NULL
+
+# torch dataset over an in-memory image array: item `idx` is `list(x = <tensor>)`
+# built from `images[idx, , , ]`; the length is the size of the first dimension.
+cifar_ds_generator = torch::dataset(
+  initialize = function(images) {
+    self$images = images
+  },
+  .getitem = function(idx) {
+    force(idx)
+
+    x = torch_tensor(self$images[idx, , , ])
+
+    return(list(x = x))
+  },
+  .length = function() {
+    dim(self$images)[1L]
+  }
+)
+
+# Loads CIFAR-10 (`type = 10`) or CIFAR-100 (`type = 100`) through torchvision,
+# downloading into `path`, and returns `list(class = <factor>, images = <array>)`.
+# `images` has dim 60000 x 3 x 32 x 32: rows 1-50000 are the train split, rows
+# 50001-60000 the test split. Class labels are read from the downloaded label file.
+constructor_cifar = function(path, type = 10) {
+  if (type == 10) {
+    d_train = torchvision::cifar10_dataset(root = path, train = TRUE, download = TRUE)
+    d_test = torchvision::cifar10_dataset(root = path, train = FALSE, download = FALSE)
+    class_names = readLines(file.path(path, "cifar-10-batches-bin", "batches.meta.txt"))
+    class_names = class_names[class_names != ""]
+  } else if (type == 100) {
+    d_train = torchvision::cifar100_dataset(root = path, train = TRUE, download = TRUE)
+    d_test = torchvision::cifar100_dataset(root = path, train = FALSE, download = FALSE)
+    class_names = readLines(file.path(path, "cifar-100-binary", "fine_label_names.txt"))
+  } else {
+    # fail early instead of hitting "object 'd_train' not found" further down
+    stop("`type` must be 10 or 100.")
+  }
+
+  classes = c(d_train$y, d_test$y)
+  images = array(NA, dim = c(60000, 3, 32, 32))
+  # original data has channel dimension at the end; move it right after the row dimension
+  perm_idx = c(1, 4, 2, 3)
+  images[1:50000, , , ] = aperm(d_train$x, perm_idx, resize = TRUE)
+  images[50001:60000, , , ] = aperm(d_test$x, perm_idx, resize = TRUE)
+
+  return(list(class = factor(classes, labels = class_names), images = images))
+}
+
+# CIFAR-10 constructor handed to `cached()`; requires the torchvision namespace.
+constructor_cifar10 = function(path) {
+  require_namespaces("torchvision")
+
+  return(constructor_cifar(path, type = 10))
+}
+
+# Creates the CIFAR-10 task with identifier `id`. The backend is a `DataBackendLazy`;
+# its `constructor` builds the data.table backend from the cached CIFAR-10 data.
+load_task_cifar10 = function(id = "cifar10") {
+  cached_constructor = function(backend) {
+    data = cached(constructor_cifar10, "datasets", "cifar10")$data
+
+    cifar10_ds = cifar_ds_generator(data$images)
+
+    dd = as_data_descriptor(cifar10_ds, list(x = c(NA, 3, 32, 32)))
+    lt = lazy_tensor(dd)
+
+    dt = data.table(
+      class = data$class,
+      image = lt,
+      split = factor(rep(c("train", "test"), c(50000, 10000))),
+      ..row_id = seq_len(60000)
+    )
+
+    DataBackendDataTable$new(data = dt, primary_key = "..row_id")
+  }
+
+  backend = DataBackendLazy$new(
+    constructor = cached_constructor,
+    rownames = seq_len(60000),
+    col_info = load_col_info("cifar10"),
+    primary_key = "..row_id"
+  )
+
+  task = TaskClassif$new(
+    backend = backend,
+    id = id,
+    target = "class",
+    label = "CIFAR-10 Classification"
+  )
+
+  task$col_roles$feature = "image"
+
+  backend$hash = "mlr3torch::mlr_tasks_cifar10"
+  task$man = "mlr3torch::mlr_tasks_cifar"
+
+  return(task)
+}
+
+register_task("cifar10", load_task_cifar10)
+
+# CIFAR-100 constructor handed to `cached()`; requires the torchvision namespace.
+constructor_cifar100 = function(path) {
+  require_namespaces("torchvision")
+
+  return(constructor_cifar(path, type = 100))
+}
+
+# Creates the CIFAR-100 task with identifier `id`. The backend is a `DataBackendLazy`;
+# its `constructor` builds the data.table backend from the cached CIFAR-100 data.
+load_task_cifar100 = function(id = "cifar100") {
+  cached_constructor = function(backend) {
+    data = cached(constructor_cifar100, "datasets", "cifar100")$data
+
+    cifar100_ds = cifar_ds_generator(data$images)
+
+    dd = as_data_descriptor(cifar100_ds, list(x = c(NA, 3, 32, 32)))
+    lt = lazy_tensor(dd)
+
+    dt = data.table(
+      class = data$class,
+      image = lt,
+      split = factor(rep(c("train", "test"), c(50000, 10000))),
+      ..row_id = seq_len(60000)
+    )
+
+    DataBackendDataTable$new(data = dt, primary_key = "..row_id")
+  }
+
+  backend = DataBackendLazy$new(
+    constructor = cached_constructor,
+    rownames = seq_len(60000),
+    col_info = load_col_info("cifar100"),
+    primary_key = "..row_id"
+  )
+
+  task = TaskClassif$new(
+    backend = backend,
+    id = id,
+    target = "class",
+    label = "CIFAR-100 Classification"
+  )
+
+  task$col_roles$feature = "image"
+
+  backend$hash = "mlr3torch::mlr_tasks_cifar100"
+  task$man = "mlr3torch::mlr_tasks_cifar"
+
+  return(task)
+}
+
+register_task("cifar100", load_task_cifar100)
+
+
diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R
index 3c4c5531..d3a62063 100644
--- a/R/TaskClassif_melanoma.R
+++ b/R/TaskClassif_melanoma.R
@@ -37,6 +37,7 @@
 #'
 #' @references
 #' `r format_bib("melanoma2021")`
+#' @examples
 #' task = tsk("melanoma")
 #' task
 NULL
diff --git a/R/TaskClassif_tiny_imagenet.R b/R/TaskClassif_tiny_imagenet.R
index bc32d60b..8355a627 100644
--- a/R/TaskClassif_tiny_imagenet.R
+++ b/R/TaskClassif_tiny_imagenet.R
@@ -7,7 +7,7 @@
 #' The data is obtained from [`torchvision::tiny_imagenet_dataset()`].
 #'
 #' The underlying [`DataBackend`][mlr3::DataBackend] contains columns `"class"`, `"image"`, `"..row_id"`, `"split"`, where the last column
-#' indicates whether the row belongs to the train, validation or test set that defined provided in torchvision.
+#' indicates whether the row belongs to the train, validation or test set provided in torchvision.
 #'
 #' There are no labels for the test rows, so by default, these observations are inactive, which means that the task
 #' uses only 110000 of the 120000 observations that are defined in the underlying data backend.
diff --git a/R/bibentries.R b/R/bibentries.R
index f9d7fa67..9a1c7667 100644
--- a/R/bibentries.R
+++ b/R/bibentries.R
@@ -121,6 +121,12 @@ bibentries = c(# nolint start
     pages = "34",
     year = "2021",
     doi = "10.1038/s41597-021-00815-z"
+  ),
+  cifar2009 = bibentry("article",
+    title = "Learning Multiple Layers of Features from Tiny Images",
+    author = "Krizhevsky, Alex",
+    journal = "Master's thesis, Department of Computer Science, University of Toronto",
+    year = "2009"
   )
 )
 # nolint end
diff --git a/data-raw/cifar.R b/data-raw/cifar.R
new file mode 100644
index 00000000..0d5e1474
--- /dev/null
+++ b/data-raw/cifar.R
@@ -0,0 +1,115 @@
+# Builds the CIFAR-10 / CIFAR-100 tasks from the raw data and stores their column
+# info in inst/col_info/cifar10.rds and inst/col_info/cifar100.rds.
+devtools::load_all()
+
+library(mlr3misc)
+library(data.table)
+library(torchvision)
+
+# same dataset generator / constructor definitions as in R/TaskClassif_cifar.R
+cifar_ds_generator = torch::dataset(
+  initialize = function(images) {
+    self$images = images
+  },
+  .getitem = function(idx) {
+    force(idx)
+
+    x = torch_tensor(self$images[idx, , , ])
+
+    return(list(x = x))
+  },
+  .length = function() {
+    dim(self$images)[1L]
+  }
+)
+
+constructor_cifar = function(path, type = 10) {
+  if (type == 10) {
+    d_train = torchvision::cifar10_dataset(root = path, train = TRUE, download = TRUE)
+    d_test = torchvision::cifar10_dataset(root = path, train = FALSE, download = FALSE)
+    class_names = readLines(file.path(path, "cifar-10-batches-bin", "batches.meta.txt"))
+    class_names = class_names[class_names != ""]
+  } else if (type == 100) {
+    d_train = torchvision::cifar100_dataset(root = path, train = TRUE, download = TRUE)
+    d_test = torchvision::cifar100_dataset(root = path, train = FALSE, download = FALSE)
+    class_names = readLines(file.path(path, "cifar-100-binary", "fine_label_names.txt"))
+  } else {
+    # fail early instead of hitting "object 'd_train' not found" further down
+    stop("`type` must be 10 or 100.")
+  }
+
+  classes = c(d_train$y, d_test$y)
+  images = array(NA, dim = c(60000, 3, 32, 32))
+  # original data has channel dimension at the end; move it right after the row dimension
+  perm_idx = c(1, 4, 2, 3)
+  images[1:50000, , , ] = aperm(d_train$x, perm_idx, resize = TRUE)
+  images[50001:60000, , , ] = aperm(d_test$x, perm_idx, resize = TRUE)
+
+  return(list(class = factor(classes, labels = class_names), images = images))
+}
+
+constructor_cifar10 = function(path) {
+  require_namespaces("torchvision")
+
+  return(constructor_cifar(path, type = 10))
+}
+
+withr::local_options(mlr3torch.cache = TRUE)
+path = file.path(get_cache_dir(), "datasets", "cifar10", "raw")
+
+# begin CIFAR-10
+data = constructor_cifar10(path)
+
+cifar10_ds = cifar_ds_generator(data$images)
+
+dd = as_data_descriptor(cifar10_ds, list(x = c(NA, 3, 32, 32)))
+lt = lazy_tensor(dd)
+
+tsk_dt = data.table(
+  class = data$class,
+  image = lt,
+  split = factor(rep(c("train", "test"), c(50000, 10000))),
+  ..row_id = seq_len(60000)
+)
+
+# tsk_dt = cbind(data, data.table(image = lt))
+
+tsk_cifar10 = as_task_classif(tsk_dt, target = "class", id = "cifar10")
+tsk_cifar10$col_roles$feature = "image"
+
+ci = col_info(tsk_cifar10$backend)
+
+saveRDS(ci, here::here("inst/col_info/cifar10.rds"))
+# end CIFAR-10
+
+path = file.path(get_cache_dir(), "datasets", "cifar100", "raw")
+
+# begin CIFAR-100
+constructor_cifar100 = function(path) {
+  require_namespaces("torchvision")
+
+  return(constructor_cifar(path, type = 100))
+}
+
+data = constructor_cifar100(path)
+
+cifar100_ds = cifar_ds_generator(data$images)
+
+dd = as_data_descriptor(cifar100_ds, list(x = c(NA, 3, 32, 32)))
+lt = lazy_tensor(dd)
+
+dt = data.table(
+  class = data$class,
+  image = lt,
+  split = factor(rep(c("train", "test"), c(50000, 10000))),
+  ..row_id = seq_len(60000)
+)
+
+task = as_task_classif(dt, target = "class")
+
+task$col_roles$feature = "image"
+
+ci = col_info(task$backend)
+
+saveRDS(ci, here::here("inst/col_info/cifar100.rds"))
+
+
diff --git a/inst/col_info/cifar10.rds b/inst/col_info/cifar10.rds
new file mode 100644
index 00000000..797790c7
Binary files /dev/null and b/inst/col_info/cifar10.rds differ
diff --git a/inst/col_info/cifar100.rds b/inst/col_info/cifar100.rds
new file mode 100644
index 00000000..5dd013ce
Binary files /dev/null and b/inst/col_info/cifar100.rds
differ diff --git a/man/mlr_tasks_cifar.Rd b/man/mlr_tasks_cifar.Rd new file mode 100644 index 00000000..06475198 --- /dev/null +++ b/man/mlr_tasks_cifar.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TaskClassif_cifar.R +\name{mlr_tasks_cifar} +\alias{mlr_tasks_cifar} +\title{CIFAR Classification Tasks} +\format{ +\link[R6:R6Class]{R6::R6Class} inheriting from \link[mlr3:TaskClassif]{mlr3::TaskClassif}. +} +\description{ +The CIFAR-10 and CIFAR-100 datasets. A subset of the 80 million tiny images dataset +with noisy labels was supplied to student labelers, who were asked to filter out +incorrectly labeled images. + +CIFAR-10 contains 10 classes. CIFAR-100 contains 100 classes, which may be partitioned into 20 superclasses of 5 classes each. +The CIFAR-10 and CIFAR-100 classes are mutually exclusive. +See Chapter 3.1 of \href{https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf}{the technical report} for more details. + +The data is obtained from \code{\link[torchvision:cifar10_dataset]{torchvision::cifar10_dataset()}} (or \code{torchvision::cifar100_dataset()}). +} +\section{Construction}{ + + +\if{html}{\out{