diff --git a/DESCRIPTION b/DESCRIPTION index a12e282..db2d651 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,22 +1,25 @@ Package: dv.loader Type: Package Title: Data loading module -Version: 2.0.0 +Version: 2.1.0 Authors@R: c( person( "Boehringer-Ingelheim Pharma GmbH & Co.KG", role = c("cph", "fnd")), person( given = "Ming", family = "Yang", role = c("aut", "cre"), email = "ming.yang.ext@boehringer-ingelheim.com"), person( given = "Steven", family = "Brooks", role = "aut", email = "steven.brooks@boehringer-ingelheim.com"), person( given = "Sorin", family = "Voicu", role = "aut", email = "sorin.voicu.ext@boehringer-ingelheim.com") ) -Description: This is a module for loading .RDS / .sas7bdat data files from a network file storage environment. It also allows loading data locally. +Description: A package for loading multiple data files, returning a list of data frames with associated metadata, designed to integrate with the modular DaVinci framework. License: Apache License (>= 2) Encoding: UTF-8 LazyData: true Depends: R (>= 3.5.0) -Imports: haven +Imports: + haven, + checkmate Suggests: testthat, knitr, rmarkdown RoxygenNote: 7.3.0 VignetteBuilder: knitr +Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 79ce8a0..18daabb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,3 +3,4 @@ export(get_cre_path) export(get_nfs_path) export(load_data) +export(load_files) diff --git a/NEWS.md b/NEWS.md index a5c5124..7e7b764 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# dv.loader 2.1.0 + +- Added `load_files()` to load data using explicit file paths. + # dv.loader 2.0.0 - GitHub release with QC report diff --git a/R/dvloader.R b/R/dvloader.R index a36a7b9..f5f879d 100644 --- a/R/dvloader.R +++ b/R/dvloader.R @@ -59,3 +59,41 @@ load_data <- function(sub_dir = NULL, file_names, use_wd = FALSE, prefer_sas = F return(data_list) } + +#' Load data files from explicit paths +#' +#' Read data from provided paths and return it as a list of data frames. +#' Supports both .rds and .sas7bdat formats. +#' +#' @param file_paths [character(1+)] Files to read. Optionally named. +#' +#' @return [list] A named list of data frames, where each name is either: +#' - the name associated to the element in the `file_paths` argument, or, if not provided... +#' - the name of the file itself, after stripping it of its leading path and trailing extension +#' +#' @export +load_files <- function(file_paths) { + checkmate::assert_character(file_paths, min.len = 1) + checkmate::assert_file_exists(file_paths, access = "r", extension = c("rds", "sas7bdat")) + + data_list <- lapply(file_paths, read_file_and_attach_metadata) + + # Use names provided as arguments + arg_names <- names(file_paths) + if (is.null(arg_names)) arg_names <- rep("", length(file_paths)) + names(data_list) <- arg_names + + # If names are not provided, fall back to file names without leading path or trailing extension + empty_name_indices <- which(arg_names == "") + names(data_list)[empty_name_indices] <- tools::file_path_sans_ext(basename(file_paths[empty_name_indices])) + + dup_indices <- duplicated(names(data_list)) + if (any(dup_indices)) { + stop(sprintf( + "Duplicate entries detected (%s). Please review `file_paths` argument.", + paste(names(data_list)[dup_indices], collapse = ", ") + )) + } + + return(data_list) +} diff --git a/R/utils.R b/R/utils.R index a3791e1..2d01574 100644 --- a/R/utils.R +++ b/R/utils.R @@ -36,8 +36,7 @@ create_data_list <- function(file_path, file_names, prefer_sas) { stop(paste("create_data_list(): No RDS or SAS files found for", file_path, x)) } - output <- read_file(file_path, file_name_to_load) - + output <- read_file_and_attach_metadata(file.path(file_path, file_name_to_load)) return(output) }) @@ -47,35 +46,33 @@ create_data_list <- function(file_path, file_names, prefer_sas) { } -#' Reads RDS/SAS file and metadatas from first 6 items from file.info() its file path -#' @param file_path a path to a file -#' @param file_name name of a file -#' @return a data object with an extra attribute of metadata -read_file <- function(file_path, file_name) { - ext <- tools::file_ext(file_name) - - if (!(toupper(ext) %in% c("RDS", "SAS7BDAT"))) { - stop("Usage error: read_file: file_name: file must either be RDS or SAS7BDAT.") +#' Read a data file and attach metadata +#' +#' Reads an .rds or .sas7bdat file from the given path and attaches metadata about the file +#' as an attribute. +#' +#' @param path [character(1)] Path to the data file to read +#' +#' @return A data frame with metadata attached as an attribute named "meta". +#' +#' @keywords internal +read_file_and_attach_metadata <- function(path) { + extension <- tools::file_ext(path) + + if (toupper(extension) == "RDS") { + data <- readRDS(path) + } else if (toupper(extension) == "SAS7BDAT") { + data <- haven::read_sas(path) + } else { + stop("Not supported file type, only .rds or .sas7bdat files can be loaded.") } - is_rds <- toupper(ext) == "RDS" - - file <- file.path(file_path, file_name) - file_name <- tools::file_path_sans_ext(file_name) - - # grab file info - meta <- file.info(file)[1L:6L] - meta[["path"]] <- row.names(meta) - meta[["file_name"]] <- file_name - meta <- data.frame(meta, stringsAsFactors = FALSE) + meta <- file.info(path, extra_cols = FALSE) + meta[["path"]] <- path + meta[["file_name"]] <- basename(path) row.names(meta) <- NULL - if (is_rds) { - out <- readRDS(file) - } else { - out <- haven::read_sas(file) - } - attr(out, "meta") <- meta + attr(data, "meta") <- meta - return(out) + return(data) } diff --git a/README.md b/README.md index 1199129..e59133e 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,70 @@ # Data Loading -The {dv.loader} package provides a simple interface for loading data from a network file storage folder or -locally. It is designed to be used with `.RDS` and `.sas7bdat` file formats. -The package provides a simple function, `load_data()`, which loads R and SAS data files into memory. -Loading data from SQL databases is not yet supported. The function returns a list named by the file names passed, -and containing data frames, along with metadata for that table. By default, the function will look for files in a -sub-directory `sub_dir` of the base path defined by a environment variable "RXD_DATA". You can check if the base path -is set by running `Sys.getenv("RXD_DATA")`. A single file or multiple files can be loaded at once. -To make the loading process faster for large datasets, it is suggested that '.sas7bdat' files are converted to -'.RDS' files. The function will prefer '.RDS' files over '.sas7bdat' files by default. +The `dv.loader` package provides two functions for loading `.rds` and `.sas7bdat` files into R. + +- `load_data()`: loads data files from a specified subdirectory of the base path defined by the environment variable "RXD_DATA". This function is useful when working with data files stored in a centralized location. +- `load_files()`: accepts explicit file paths to load data files from any location on your system. You can optionally provide custom names for the data frames in the returned list. ## Installation +The `dv.loader` package is available on GitHub. To install it, you can use the following commands: + ```r if (!require("remotes")) install.packages("remotes") remotes::install_github("Boehringer-Ingelheim/dv.loader") ``` -## Basic usage +After installation, you can load the package using: ```r -# getting data from a network file storage folder -dv.loader::load_data(sub_dir = "subdir1/subdir2", file_names = c("adsl", "adae")) +library(dv.loader) ``` +## Basic Usage + +### Using `load_data()` + +The `load_data()` function loads data from the specified subdirectory relative to `RXD_DATA`. For the `file_names` argument, you can optionally specify the file extensions in the names. If not provided, the function will attempt to search for `.rds` and `.sas7bdat` files in the subdirectory and decide which one to load based on the `prefer_sas` argument when both file types are present. By default, `prefer_sas` is `FALSE`, meaning `.rds` files are preferred due to their smaller file size and faster loading time. + ```r -# getting data locally (e.g., if you have file `./data/adsl.RDS`) -dv.loader::load_data(sub_dir = "data", file_names = c("adsl"), use_wd = TRUE) +# Set the RXD_DATA environment variable +Sys.setenv(RXD_DATA = "path/to/data/folder") + +# Load data from path/to/data/folder/subdir1 +load_data( + sub_dir = "subdir1", + file_names = c("file1", "file2"), + prefer_sas = TRUE +) + +# Load data from path/to/data/folder/subdir1/subdir2 +load_data( + sub_dir = "subdir1/subdir2", + file_names = c("file1.rds", "file2.sas7bdat"), +) ``` + +### Using `load_files()` + +The `load_files()` function requires you to provide explicit file paths including the file extensions for the data files you want to load. You can optionally provide custom names for the data frames in the returned list. + + +```r +# Load data files with default names +load_files( + file_paths = c( + "path/to/file1.rds", + "path/to/file2.sas7bdat" + ) +) + +# Load data files with custom names +load_files( + file_paths = c( + "file1 (rds)" = "path/to/file1.rds", + "file2 (sas)" = "path/to/file2.sas7bdat" + ) +) +``` + +For more details, please refer to the package vignettes and function documentation. diff --git a/man/load_data.Rd b/man/load_data.Rd index 5367158..f53beb3 100644 --- a/man/load_data.Rd +++ b/man/load_data.Rd @@ -7,7 +7,7 @@ load_data(sub_dir = NULL, file_names, use_wd = FALSE, prefer_sas = FALSE) } \arguments{ -\item{sub_dir}{A relative directory/folder that will be appended to a base path defined by `Sys.getenv("RXD_DATA")`. +\item{sub_dir}{A relative directory/folder that will be appended to a base path defined by `Sys.getenv("RXD_DATA")`. If the argument is left as NULL, the function will load data from the working directory `getwd()`.} \item{file_names}{Study file or file_names name(s) - can be a vector of strings. diff --git a/man/load_files.Rd b/man/load_files.Rd new file mode 100644 index 0000000..b59e46d --- /dev/null +++ b/man/load_files.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dvloader.R +\name{load_files} +\alias{load_files} +\title{Load data files from explicit paths} +\usage{ +load_files(file_paths) +} +\arguments{ +\item{file_paths}{[character(1+)] Files to read. Optionally named.} +} +\value{ +[list] A named list of data frames, where each name is either: + - the name associated to the element in the `file_paths` argument, or, if not provided... + - the name of the file itself, after stripping it of its leading path and trailing extension +} +\description{ +Read data from provided paths and return it as a list of data frames. +Supports both .rds and .sas7bdat formats. +} diff --git a/man/read_file.Rd b/man/read_file.Rd deleted file mode 100644 index e46767b..0000000 --- a/man/read_file.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{read_file} -\alias{read_file} -\title{Reads RDS/SAS file and metadatas from first 6 items from file.info() its file path} -\usage{ -read_file(file_path, file_name) -} -\arguments{ -\item{file_path}{a path to a file} - -\item{file_name}{name of a file} -} -\value{ -a data object with an extra attribute of metadata -} -\description{ -Reads RDS/SAS file and metadatas from first 6 items from file.info() its file path -} diff --git a/man/read_file_and_attach_metadata.Rd b/man/read_file_and_attach_metadata.Rd new file mode 100644 index 0000000..fff95a4 --- /dev/null +++ b/man/read_file_and_attach_metadata.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{read_file_and_attach_metadata} +\alias{read_file_and_attach_metadata} +\title{Read a data file and attach metadata} +\usage{ +read_file_and_attach_metadata(path) +} +\arguments{ +\item{path}{[character(1)] Path to the data file to read} +} +\value{ +A data frame with metadata attached as an attribute named "meta". +} +\description{ +Reads an .rds or .sas7bdat file from the given path and attaches metadata about the file +as an attribute. +} +\keyword{internal} diff --git a/tests/testthat/test-load_files.R b/tests/testthat/test-load_files.R new file mode 100644 index 0000000..3871d1a --- /dev/null +++ b/tests/testthat/test-load_files.R @@ -0,0 +1,90 @@ +test_that("load_files() correctly loads both RDS and SAS files", { + rds_file <- "inst/extdata/dummyads1.RDS" + sas_file <- "inst/extdata/dummyads2.sas7bdat" + + data_list <- load_files(file_paths = c(rds_file, sas_file)) + + # Check that default names are correctly assigned based on filenames + expect_equal(names(data_list), c("dummyads1", "dummyads2")) + + # Verify RDS file contents match direct reading + expect_equal(data_list[["dummyads1"]], readRDS(rds_file), ignore_attr = "meta") + + # Verify SAS file contents match direct reading + expect_equal(data_list[["dummyads2"]], haven::read_sas(sas_file), ignore_attr = "meta") + + # Create expected metadata for comparison + rds_metadata <- cbind( + file.info(rds_file, extra_cols = FALSE), + path = rds_file, + file_name = basename(rds_file) + ) + sas_metadata <- cbind( + file.info(sas_file, extra_cols = FALSE), + path = sas_file, + file_name = basename(sas_file) + ) + row.names(rds_metadata) <- NULL + row.names(sas_metadata) <- NULL + + # Verify metadata is correctly attached to loaded data + expect_equal(attr(data_list[["dummyads1"]], "meta"), rds_metadata) + expect_equal(attr(data_list[["dummyads2"]], "meta"), sas_metadata) +}) + +test_that("load_files() works with different file extensions", { + # GitHub Actions (Assertion on 'file_paths' failed: File does not exist) + expect_error( + load_files(file_paths = c( + "inst/extdata/dummyads1.rds", # extension: RDS + "inst/extdata/dummyads2.SAS7BDAT" # extension: sas7bdat + )) + ) +}) + +test_that("load_files() properly validates file extensions", { + expect_error( + load_files(file_paths = c( + "inst/extdata/bad_file_type.myrds", + "inst/extdata/bad_file_type.txt" + )) + ) +}) + +test_that("load_files() can return both default and custom names for loaded data", { + # Check that duplicate names are caught and error is thrown + expect_error( + load_files(file_paths = c( + "inst/extdata/just_rds/dummyads1.RDS", + "inst/extdata/just_sas/dummyads1.sas7bdat" + )), + "Duplicate entries detected \\(dummyads1\\). Please review `file_paths` argument." + ) + + # Loading files with default names + data_list1 <- load_files( + file_paths = c( + "inst/extdata/just_rds/dummyads1.RDS", + "inst/extdata/just_sas/dummyads2.sas7bdat" + ) + ) + expect_equal(names(data_list1), c("dummyads1", "dummyads2")) + + # Loading files with custom names + data_list2 <- load_files( + file_paths = c( + "rds_dummyads1" = "inst/extdata/just_rds/dummyads1.RDS", + "sas_dummyads2" = "inst/extdata/just_sas/dummyads2.sas7bdat" + ) + ) + expect_equal(names(data_list2), c("rds_dummyads1", "sas_dummyads2")) + + # Loading files with mixed naming (custom and default) + data_list3 <- load_files( + file_paths = c( + "rds_dummyads1" = "inst/extdata/just_rds/dummyads1.RDS", + "inst/extdata/dummyads2.sas7bdat" + ) + ) + expect_equal(names(data_list3), c("rds_dummyads1", "dummyads2")) +}) diff --git a/vignettes/loading-data-into-memory.Rmd b/vignettes/loading-data-into-memory.Rmd index 577da15..0ba89fe 100644 --- a/vignettes/loading-data-into-memory.Rmd +++ b/vignettes/loading-data-into-memory.Rmd @@ -14,32 +14,76 @@ knitr::opts_chunk$set( ) ``` +The `dv.loader` package simplifies the process of loading data files into R memory. It provides two main functions - `load_data()` and `load_files()` - that can handle two widely used data formats: + +- `.rds` files: R's native data storage format, which efficiently stores R objects in a compressed binary format +- `.sas7bdat` files: SAS dataset files commonly used in clinical research and other industries + +The package is designed to be flexible, allowing you to load data either from a centralized location using environment variables, or by specifying explicit file paths. Each loaded dataset includes metadata about the source file, such as its size, modification time, and location on disk. + +To demonstrate the package's capabilities, we'll first create some example `.rds` files in a temporary directory that we can work with. + +```{r} +# Create a temporary directory for the example data +temp_dir <- tempdir() + +# Save the cars and mtcars datasets to the temporary directory +saveRDS(cars, file = file.path(temp_dir, "cars.rds")) +saveRDS(mtcars, file = file.path(temp_dir, "mtcars.rds")) +``` + +To begin, we'll need to load the dv.loader package. + ```{r setup} library(dv.loader) ``` -Note: `use_wd = TRUE` can be used to source from local folder. Just set your working directory before using with `setwd()`, or use an explicit path in `sub_dir`. +## Using `load_data()` + +The `load_data()` function requires the `RXD_DATA` environment variable to be set to the base directory containing your data files. This variable defines the root path from which subdirectories will be searched. -## Usage: `load_data()` +When you call `load_data()`, it searches the specified subdirectory for data files and returns them as a named list of data frames. Each data frame in the list is named after its source file. + +For files that exist in both `.rds` and `.sas7bdat` formats, `load_data()` will load the `.rds` version by default since these are more compact and faster to read. You can override this behavior by setting `prefer_sas = TRUE` to prioritize loading `.sas7bdat` files instead. ```{r} -test_data_path <- "../tests/testthat/inst/extdata" -data_list <- load_data( - sub_dir = test_data_path, - file_names = "dummyads2", - use_wd = TRUE +# Set the RXD_DATA environment variable to the temporary directory +Sys.setenv(RXD_DATA = temp_dir) + +# Load the data files into a named list of data frames +data_list1 <- load_data( + sub_dir = ".", + file_names = c("cars", "mtcars") ) +# Display the structure of the resulting list +str(data_list1) +``` + +## Using `load_files()` -class(data_list) +The `load_files()` function accepts explicit file paths and loads them into a named list of data frames. Each data frame includes metadata as an attribute. If no custom names are provided, the function will use the file names (without paths or extensions) as the list names. -class(data_list[["dummyads2"]]) +```{r} +# Load the data files into a named list of data frames +data_list2 <- load_files( + file_paths = c( + file.path(temp_dir, "cars.rds"), + file.path(temp_dir, "mtcars.rds") + ) +) -head(data_list[["dummyads2"]]) +# Display the structure of the resulting list +str(data_list2) ``` -Get the dataframe's metadata through its attributes: +When using `load_files()`, you can specify files from multiple directories and customize the output list names by providing named arguments in the `file_paths` parameter. ```{r} -attr(data_list[["dummyads2"]], "meta") +dv.loader::load_files( + file_paths = c( + "cars (rds)" = file.path(temp_dir, "cars.rds"), + "iris (sas)" = system.file("examples", "iris.sas7bdat", package = "haven") + ) +) |> names() ```