diff --git a/DESCRIPTION b/DESCRIPTION index 4a3c8165..1a1eccaa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -127,6 +127,7 @@ Collate: 'PipeOpTorchReshape.R' 'PipeOpTorchSoftmax.R' 'Select.R' + 'TaskClassif_cifar.R' 'TaskClassif_lazy_iris.R' 'TaskClassif_melanoma.R' 'TaskClassif_mnist.R' diff --git a/R/TaskClassif_cifar.R b/R/TaskClassif_cifar.R new file mode 100644 index 00000000..9f95d2b4 --- /dev/null +++ b/R/TaskClassif_cifar.R @@ -0,0 +1,183 @@ +#' @title CIFAR Classification Tasks +#' +#' @name mlr_tasks_cifar +#' +#' @format [R6::R6Class] inheriting from [mlr3::TaskClassif]. +#' @include aaa.R +#' +#' @description +#' The CIFAR-10 and CIFAR-100 datasets. A subset of the 80 million tiny images dataset +#' with noisy labels was supplied to student labelers, who were asked to filter out +#' incorrectly labeled images. +#' +#' CIFAR-10 contains 10 classes. CIFAR-100 contains 100 classes, which may be partitioned into 20 superclasses of 5 classes each. +#' The CIFAR-10 and CIFAR-100 classes are mutually exclusive. +#' See Chapter 3.1 of [the technical report](https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf) for more details. +#' +#' The data is obtained from [`torchvision::cifar10_dataset()`] (or `torchvision::cifar100_dataset()`).
+#' +#' @section Construction: +#' ``` +#' tsk("cifar10") +#' tsk("cifar100") +#' ``` +#' +#' @template task_download +#' +#' @section Properties: +#' `r rd_info_task_torch("cifar10", missings = FALSE)` +#' +#' @references +#' `r format_bib("cifar2009")` +#' @examples +#' task_cifar10 = tsk("cifar10") +#' task_cifar100 = tsk("cifar100") +#' print(task_cifar10) +#' print(task_cifar100) +NULL + +# Torch dataset over an in-memory image array whose first dimension indexes the observations. +cifar_ds_generator = torch::dataset( + initialize = function(images) { + self$images = images + }, + .getitem = function(idx) { + force(idx) + + x = torch_tensor(self$images[idx, , , ]) + + return(list(x = x)) + }, + .length = function() { + dim(self$images)[1L] + } +) + +# Loads the CIFAR train and test splits via torchvision (downloading them into `path`) and returns a list with +# the labels as a factor (`class`) and the images as a 60000 x 3 x 32 x 32 array (`images`); the 50000 train rows +# come first. `type` selects CIFAR-10 (10) or CIFAR-100 (100); any other value is an error. +constructor_cifar = function(path, type = 10) { + if (type == 10) { + d_train = torchvision::cifar10_dataset(root = path, train = TRUE, download = TRUE) + d_test = torchvision::cifar10_dataset(root = path, train = FALSE, download = FALSE) + class_names = readLines(file.path(path, "cifar-10-batches-bin", "batches.meta.txt")) + class_names = class_names[class_names != ""] + } else if (type == 100) { + d_train = torchvision::cifar100_dataset(root = path, train = TRUE, download = TRUE) + d_test = torchvision::cifar100_dataset(root = path, train = FALSE, download = FALSE) + class_names = readLines(file.path(path, "cifar-100-binary", "fine_label_names.txt")) + } else { + stop("`type` must be either 10 or 100.", call. = FALSE) + } + + classes = c(d_train$y, d_test$y) + images = array(NA, dim = c(60000, 3, 32, 32)) + # original data has channel dimension at the end + perm_idx = c(1, 4, 2, 3) + images[1:50000, , , ] = aperm(d_train$x, perm_idx, resize = TRUE) + images[50001:60000, , , ] = aperm(d_test$x, perm_idx, resize = TRUE) + + return(list(class = factor(classes, labels = class_names), images = images)) +} + +constructor_cifar10 = function(path) { + require_namespaces("torchvision") + + return(constructor_cifar(path, type = 10)) +} + +# Creates the CIFAR-10 task with task id `id`; the data itself is only built by the lazy backend's constructor. +load_task_cifar10 = function(id = "cifar10") { + cached_constructor = function(backend) { + data = cached(constructor_cifar10, "datasets",
"cifar10")$data + + cifar10_ds = cifar_ds_generator(data$images) + + dd = as_data_descriptor(cifar10_ds, list(x = c(NA, 3, 32, 32))) + lt = lazy_tensor(dd) + + dt = data.table( + class = data$class, + image = lt, + split = factor(rep(c("train", "test"), c(50000, 10000))), + ..row_id = seq_len(60000) + ) + + DataBackendDataTable$new(data = dt, primary_key = "..row_id") + } + + backend = DataBackendLazy$new( + constructor = cached_constructor, + rownames = seq_len(60000), + col_info = load_col_info("cifar10"), + primary_key = "..row_id" + ) + + task = TaskClassif$new( + backend = backend, + id = id, + target = "class", + label = "CIFAR-10 Classification" + ) + + task$col_roles$feature = "image" + + backend$hash = "mlr3torch::mlr_tasks_cifar10" + task$man = "mlr3torch::mlr_tasks_cifar" + + return(task) +} + +register_task("cifar10", load_task_cifar10) + +constructor_cifar100 = function(path) { + require_namespaces("torchvision") + + return(constructor_cifar(path, type = 100)) +} + +# Creates the CIFAR-100 task with task id `id`; the data itself is only built by the lazy backend's constructor. +load_task_cifar100 = function(id = "cifar100") { + cached_constructor = function(backend) { + data = cached(constructor_cifar100, "datasets", "cifar100")$data + + cifar100_ds = cifar_ds_generator(data$images) + + dd = as_data_descriptor(cifar100_ds, list(x = c(NA, 3, 32, 32))) + lt = lazy_tensor(dd) + + dt = data.table( + class = data$class, + image = lt, + split = factor(rep(c("train", "test"), c(50000, 10000))), + ..row_id = seq_len(60000) + ) + + DataBackendDataTable$new(data = dt, primary_key = "..row_id") + } + + backend = DataBackendLazy$new( + constructor = cached_constructor, + rownames = seq_len(60000), + col_info = load_col_info("cifar100"), + primary_key = "..row_id" + ) + + task = TaskClassif$new( + backend = backend, + id = id, + target = "class", + label = "CIFAR-100 Classification" + ) + + task$col_roles$feature = "image" + + backend$hash = "mlr3torch::mlr_tasks_cifar100" + task$man = "mlr3torch::mlr_tasks_cifar" + + return(task) +} + +register_task("cifar100",
load_task_cifar100) + + diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 3c4c5531..d3a62063 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -37,6 +37,7 @@ #' #' @references #' `r format_bib("melanoma2021")` +#' @examples #' task = tsk("melanoma") #' task NULL diff --git a/R/TaskClassif_tiny_imagenet.R b/R/TaskClassif_tiny_imagenet.R index bc32d60b..8355a627 100644 --- a/R/TaskClassif_tiny_imagenet.R +++ b/R/TaskClassif_tiny_imagenet.R @@ -7,7 +7,7 @@ #' The data is obtained from [`torchvision::tiny_imagenet_dataset()`]. #' #' The underlying [`DataBackend`][mlr3::DataBackend] contains columns `"class"`, `"image"`, `"..row_id"`, `"split"`, where the last column -#' indicates whether the row belongs to the train, validation or test set that defined provided in torchvision. +#' indicates whether the row belongs to the train, validation or test set that are provided in torchvision. #' #' There are no labels for the test rows, so by default, these observations are inactive, which means that the task #' uses only 110000 of the 120000 observations that are defined in the underlying data backend.
diff --git a/R/bibentries.R b/R/bibentries.R index f9d7fa67..9a1c7667 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -121,6 +121,12 @@ bibentries = c(# nolint start pages = "34", year = "2021", doi = "10.1038/s41597-021-00815-z" + ), + cifar2009 = bibentry("article", + title = "Learning Multiple Layers of Features from Tiny Images", + author = "Krizhevsky, Alex", + journal = "Master's thesis, Department of Computer Science, University of Toronto", + year = "2009" ) ) # nolint end diff --git a/data-raw/cifar.R b/data-raw/cifar.R new file mode 100644 index 00000000..0d5e1474 --- /dev/null +++ b/data-raw/cifar.R @@ -0,0 +1,113 @@ +# Creates the col_info files inst/col_info/cifar10.rds and inst/col_info/cifar100.rds. +devtools::load_all() + +library(mlr3misc) +library(data.table) +library(torchvision) + +# cached +cifar_ds_generator = torch::dataset( + initialize = function(images) { + self$images = images + }, + .getitem = function(idx) { + force(idx) + + x = torch_tensor(self$images[idx, , , ]) + + return(list(x = x)) + }, + .length = function() { + dim(self$images)[1L] + } +) + +constructor_cifar = function(path, type = 10) { + if (type == 10) { + d_train = torchvision::cifar10_dataset(root = path, train = TRUE, download = TRUE) + d_test = torchvision::cifar10_dataset(root = path, train = FALSE, download = FALSE) + class_names = readLines(file.path(path, "cifar-10-batches-bin", "batches.meta.txt")) + class_names = class_names[class_names != ""] + } else if (type == 100) { + d_train = torchvision::cifar100_dataset(root = path, train = TRUE, download = TRUE) + d_test = torchvision::cifar100_dataset(root = path, train = FALSE, download = FALSE) + class_names = readLines(file.path(path, "cifar-100-binary", "fine_label_names.txt")) + } else { + stop("`type` must be either 10 or 100.", call. = FALSE) + } + + classes = c(d_train$y, d_test$y) + images = array(NA, dim = c(60000, 3, 32, 32)) + # original data has channel dimension at the end + perm_idx = c(1, 4, 2, 3) + images[1:50000, , , ] = aperm(d_train$x, perm_idx, resize = TRUE) + images[50001:60000, , , ] = aperm(d_test$x, perm_idx, resize = TRUE) + +
return(list(class = factor(classes, labels = class_names), images = images)) +} + +constructor_cifar10 = function(path) { + require_namespaces("torchvision") + + return(constructor_cifar(path, type = 10)) +} + +withr::local_options(mlr3torch.cache = TRUE) +path = file.path(get_cache_dir(), "datasets", "cifar10", "raw") + +# begin CIFAR-10 +data = constructor_cifar10(path) + +cifar10_ds = cifar_ds_generator(data$images) + +dd = as_data_descriptor(cifar10_ds, list(x = c(NA, 3, 32, 32))) +lt = lazy_tensor(dd) + +tsk_dt = data.table( + class = data$class, + image = lt, + split = factor(rep(c("train", "test"), c(50000, 10000))), + ..row_id = seq_len(60000) +) + +# tsk_dt = cbind(data, data.table(image = lt)) + +tsk_cifar10 = as_task_classif(tsk_dt, target = "class", id = "cifar10") +tsk_cifar10$col_roles$feature = "image" + +ci = col_info(tsk_cifar10$backend) + +saveRDS(ci, here::here("inst/col_info/cifar10.rds")) +# end CIFAR-10 + +path = file.path(get_cache_dir(), "datasets", "cifar100", "raw") + +# begin CIFAR-100 +constructor_cifar100 = function(path) { + require_namespaces("torchvision") + + return(constructor_cifar(path, type = 100)) +} + +data = constructor_cifar100(path) + +cifar100_ds = cifar_ds_generator(data$images) + +dd = as_data_descriptor(cifar100_ds, list(x = c(NA, 3, 32, 32))) +lt = lazy_tensor(dd) + +dt = data.table( + class = data$class, + image = lt, + split = factor(rep(c("train", "test"), c(50000, 10000))), + ..row_id = seq_len(60000) +) + +task = as_task_classif(dt, target = "class") + +task$col_roles$feature = "image" + +ci = col_info(task$backend) + +saveRDS(ci, here::here("inst/col_info/cifar100.rds")) + + diff --git a/inst/col_info/cifar10.rds b/inst/col_info/cifar10.rds new file mode 100644 index 00000000..797790c7 Binary files /dev/null and b/inst/col_info/cifar10.rds differ diff --git a/inst/col_info/cifar100.rds b/inst/col_info/cifar100.rds new file mode 100644 index 00000000..5dd013ce Binary files /dev/null and b/inst/col_info/cifar100.rds
differ diff --git a/man/mlr_tasks_cifar.Rd b/man/mlr_tasks_cifar.Rd new file mode 100644 index 00000000..06475198 --- /dev/null +++ b/man/mlr_tasks_cifar.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TaskClassif_cifar.R +\name{mlr_tasks_cifar} +\alias{mlr_tasks_cifar} +\title{CIFAR Classification Tasks} +\format{ +\link[R6:R6Class]{R6::R6Class} inheriting from \link[mlr3:TaskClassif]{mlr3::TaskClassif}. +} +\description{ +The CIFAR-10 and CIFAR-100 datasets. A subset of the 80 million tiny images dataset +with noisy labels was supplied to student labelers, who were asked to filter out +incorrectly labeled images. + +CIFAR-10 contains 10 classes. CIFAR-100 contains 100 classes, which may be partitioned into 20 superclasses of 5 classes each. +The CIFAR-10 and CIFAR-100 classes are mutually exclusive. +See Chapter 3.1 of \href{https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf}{the technical report} for more details. + +The data is obtained from \code{\link[torchvision:cifar10_dataset]{torchvision::cifar10_dataset()}} (or \code{torchvision::cifar100_dataset()}). +} +\section{Construction}{ + + +\if{html}{\out{<div class="sourceCode">}}\preformatted{tsk("cifar10") +tsk("cifar100") +}\if{html}{\out{</div>}} +} + +\section{Download}{ + +The \link[mlr3:Task]{task}'s backend is a \code{\link{DataBackendLazy}} which will download the data once it is requested. +Other meta-data is already available before that. +You can cache these datasets by setting the \code{mlr3torch.cache} option to \code{TRUE} or to a specific path to be used +as the cache directory. +} + +\section{Properties}{ + +\itemize{ +\item Task type: \dQuote{classif} +\item Properties: \dQuote{multiclass} +\item Has Missings: no +\item Target: \dQuote{class} +\item Features: \dQuote{image} +\item Data Dimension: 60000x4 +} +} + +\examples{ +task_cifar10 = tsk("cifar10") +task_cifar100 = tsk("cifar100") +print(task_cifar10) +print(task_cifar100) +} +\references{ +Krizhevsky, Alex (2009). +\dQuote{Learning Multiple Layers of Features from Tiny Images.} +\emph{Master's thesis, Department of Computer Science, University of Toronto}. +} diff --git a/man/mlr_tasks_melanoma.Rd b/man/mlr_tasks_melanoma.Rd index 7ea655ee..5701a022 100644 --- a/man/mlr_tasks_melanoma.Rd +++ b/man/mlr_tasks_melanoma.Rd @@ -57,11 +57,13 @@ as the cache directory. } } +\examples{ +task = tsk("melanoma") +task +} \references{ Rotemberg, V., Kurtansky, N., Betz-Stablein, B., Caffery, L., Chousakos, E., Codella, N., Combalia, M., Dusza, S., Guitera, P., Gutman, D., Halpern, A., Helba, B., Kittler, H., Kose, K., Langer, S., Lioprys, K., Malvehy, J., Musthaq, S., Nanda, J., Reiter, O., Shih, G., Stratigos, A., Tschandl, P., Weber, J., Soyer, P. (2021). \dQuote{A patient-centric dataset of images and metadata for identifying melanomas using clinical context.} \emph{Scientific Data}, \bold{8}, 34. \doi{10.1038/s41597-021-00815-z}. -task = tsk("melanoma") -task } diff --git a/man/mlr_tasks_tiny_imagenet.Rd b/man/mlr_tasks_tiny_imagenet.Rd index eef98c0b..001f78ec 100644 --- a/man/mlr_tasks_tiny_imagenet.Rd +++ b/man/mlr_tasks_tiny_imagenet.Rd @@ -8,7 +8,7 @@ Subset of the famous ImageNet dataset.
The data is obtained from \code{\link[torchvision:tiny_imagenet_dataset]{torchvision::tiny_imagenet_dataset()}}. The underlying \code{\link[mlr3:DataBackend]{DataBackend}} contains columns \code{"class"}, \code{"image"}, \code{"..row_id"}, \code{"split"}, where the last column -indicates whether the row belongs to the train, validation or test set that defined provided in torchvision. +indicates whether the row belongs to the train, validation or test set that are provided in torchvision. There are no labels for the test rows, so by default, these observations are inactive, which means that the task uses only 110000 of the 120000 observations that are defined in the underlying data backend. diff --git a/tests/testthat/test_TaskClassif_cifar.R b/tests/testthat/test_TaskClassif_cifar.R new file mode 100644 index 00000000..86c952db --- /dev/null +++ b/tests/testthat/test_TaskClassif_cifar.R @@ -0,0 +1,43 @@ +skip_on_cran() +# the task constructors call require_namespaces("torchvision"), so skip when it is not installed +skip_if_not_installed("torchvision") + +test_that("CIFAR-10 works", { + withr::local_options(mlr3torch.cache = TRUE) + task = tsk("cifar10") + + expect_equal(task$nrow, 60000) + + task$filter(1:10) + expect_equal(task$id, "cifar10") + expect_equal(task$label, "CIFAR-10 Classification") + expect_equal(task$feature_names, "image") + expect_equal(task$target_names, "class") + expect_equal(task$man, "mlr3torch::mlr_tasks_cifar") + expect_equal(task$backend$hash, "mlr3torch::mlr_tasks_cifar10") + task$data() + expect_true("cifar-10-batches-bin" %in% list.files(file.path(get_cache_dir(), "datasets", "cifar10", "raw"))) + expect_true("data.rds" %in% list.files(file.path(get_cache_dir(), "datasets", "cifar10"))) + expect_equal(task$backend$nrow, 60000) + expect_equal(task$backend$ncol, 4) +}) + +test_that("CIFAR-100 works", { + withr::local_options(mlr3torch.cache = TRUE) + task = tsk("cifar100") + + expect_equal(task$nrow, 60000) + + task$filter(1:10) + expect_equal(task$id, "cifar100") + expect_equal(task$label, "CIFAR-100 Classification") + expect_equal(task$feature_names, "image") +
expect_equal(task$target_names, "class") + expect_equal(task$man, "mlr3torch::mlr_tasks_cifar") + expect_equal(task$backend$hash, "mlr3torch::mlr_tasks_cifar100") + task$data() + expect_true("cifar-100-binary" %in% list.files(file.path(get_cache_dir(), "datasets", "cifar100", "raw"))) + expect_true("data.rds" %in% list.files(file.path(get_cache_dir(), "datasets", "cifar100"))) + expect_equal(task$backend$nrow, 60000) + expect_equal(task$backend$ncol, 4) +})