diff --git a/DESCRIPTION b/DESCRIPTION index c447c8c..1798a6e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,6 +16,8 @@ BugReports: https://github.com/CCBR/reneeTools/issues Depends: R (>= 2.10) Imports: + assertthat, + DESeq2, dplyr, tidyr Suggests: diff --git a/NAMESPACE b/NAMESPACE index 6e1f1dc..024a199 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export("%>%") +export(create_deseq_obj) export(filter_low_counts) export(read_raw_counts) importFrom(dplyr,"%>%") diff --git a/NEWS.md b/NEWS.md index b0dfed8..cd5e27e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,3 +5,4 @@ ## New functions - `filter_low_counts()` (#10) +- `create_deseq_obj()` (#13) diff --git a/R/data.R b/R/data.R index e0aae9f..cb0e8a1 100644 --- a/R/data.R +++ b/R/data.R @@ -1,7 +1,7 @@ -#' RSEM gene counts +#' RSEM expected gene counts #' #' @format ## `gene_counts` -#' A data frame with columns 'gene_id', 'GeneName', and a column for each sample's count. +#' A data frame with columns 'gene_id', 'GeneName', and a column for each sample's expected count. #' #' @source Generated by running RENEE v2.5.8 on the #' [test dataset](https://github.com/CCBR/RENEE/tree/e08f7db6c6e638cfd330caa182f64665d2ef37fa/.tests) diff --git a/R/deseq2.R b/R/deseq2.R new file mode 100644 index 0000000..c6459e8 --- /dev/null +++ b/R/deseq2.R @@ -0,0 +1,37 @@ +#' Create DESeq2 object from gene counts and sample metadata +#' +#' @param counts_tbl expected gene counts from RSEM as a data frame or tibble. +#' @param meta_dat sample metadata as a data frame with rownames as sample IDs. +#' @param design model formula for experimental design. Columns must exist in `meta_dat`. +#' +#' @return DESeqDataSet +#' @export +#' +#' @examples +#' sample_meta <- data.frame( +#' row.names = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), +#' condition = factor(c("knockout", "knockout", "wildtype", "wildtype"), +#' levels = c("wildtype", "knockout") +#' ) +#' ) +#' dds <- create_deseq_obj(gene_counts, sample_meta, ~condition) +create_deseq_obj <- function(counts_tbl, meta_dat, design) { + gene_id <- GeneName <- NULL + counts_dat <- counts_tbl %>% + # deseq2 requires integer counts + dplyr::mutate(dplyr::across( + dplyr::where(is.numeric), + \(x) as.integer(round(x, 0)) + )) %>% + as.data.frame() + row.names(counts_dat) <- counts_dat %>% dplyr::pull("gene_id") + # convert counts tibble to matrix + counts_mat <- counts_dat %>% + dplyr::select(-c(gene_id, GeneName)) %>% + as.matrix() + + # sample IDs must be in the same order + assertthat::are_equal(colnames(counts_mat), rownames(meta_dat)) + + return(DESeq2::DESeqDataSetFromMatrix(counts_mat, meta_dat, design)) +} diff --git a/R/filter_low_counts.R b/R/filter_low_counts.R index 4f9494c..6d32e54 100644 --- a/R/filter_low_counts.R +++ b/R/filter_low_counts.R @@ -1,6 +1,6 @@ #' filter_low_counts #' -#' @param counts_dat dataframe of expected gene counts from RSEM +#' @param counts_dat expected gene counts from RSEM as a data frame or tibble #' @param min_counts integer number of minimum counts across all samples (default: 0) #' #' @return filtered counts dataframe diff --git a/man/create_deseq_obj.Rd b/man/create_deseq_obj.Rd new file mode 100644 index 0000000..e0b3d83 --- /dev/null +++ b/man/create_deseq_obj.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deseq2.R +\name{create_deseq_obj} +\alias{create_deseq_obj} +\title{Create DESeq2 object from gene counts and sample metadata} +\usage{ +create_deseq_obj(counts_tbl, meta_dat, design) +} +\arguments{ +\item{counts_tbl}{expected gene counts from RSEM as a data frame or tibble.} + +\item{meta_dat}{sample metadata as a data frame with rownames as sample IDs.} + +\item{design}{model formula for experimental design. Columns must exist in \code{meta_dat}.} +} +\value{ +DESeqDataSet +} +\description{ +Create DESeq2 object from gene counts and sample metadata +} +\examples{ +sample_meta <- data.frame( + row.names = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), + condition = factor(c("knockout", "knockout", "wildtype", "wildtype"), + levels = c("wildtype", "knockout") + ) +) +dds <- create_deseq_obj(gene_counts, sample_meta, ~condition) +} diff --git a/man/filter_low_counts.Rd b/man/filter_low_counts.Rd index 47386fc..d83cdc4 100644 --- a/man/filter_low_counts.Rd +++ b/man/filter_low_counts.Rd @@ -7,7 +7,7 @@ filter_low_counts(counts_dat, min_counts = 0) } \arguments{ -\item{counts_dat}{dataframe of expected gene counts from RSEM} +\item{counts_dat}{expected gene counts from RSEM as a data frame or tibble} \item{min_counts}{integer number of minimum counts across all samples (default: 0)} } diff --git a/man/gene_counts.Rd b/man/gene_counts.Rd index 6d3f8e8..0c250df 100644 --- a/man/gene_counts.Rd +++ b/man/gene_counts.Rd @@ -3,11 +3,11 @@ \docType{data} \name{gene_counts} \alias{gene_counts} -\title{RSEM gene counts} +\title{RSEM expected gene counts} \format{ \subsection{\code{gene_counts}}{ -A data frame with columns 'gene_id', 'GeneName', and a column for each sample's count. +A data frame with columns 'gene_id', 'GeneName', and a column for each sample's expected count. } } \source{ @@ -18,6 +18,6 @@ Generated by running RENEE v2.5.8 on the gene_counts } \description{ -RSEM gene counts +RSEM expected gene counts } \keyword{datasets} diff --git a/tests/testthat/test-deseq2.R b/tests/testthat/test-deseq2.R new file mode 100644 index 0000000..321bb01 --- /dev/null +++ b/tests/testthat/test-deseq2.R @@ -0,0 +1,48 @@ +set.seed(20231228) +test_that("create_deseq_obj works", { + sample_meta <- + data.frame( + row.names = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), + condition = factor( + c("knockout", "knockout", "wildtype", "wildtype"), + levels = c("wildtype", "knockout") + ) + ) + dds <- create_deseq_obj(gene_counts, sample_meta, ~condition) + + expect_equal( + dds@colData %>% as.data.frame(), + structure( + list(condition = structure( + c(2L, 2L, 1L, 1L), + levels = c( + "wildtype", + "knockout" + ), + class = "factor" + )), + class = "data.frame", + row.names = c( + "KO_S3", + "KO_S4", "WT_S1", "WT_S2" + ) + ) + ) + expect_equal( + dds@assays@data@listData %>% as.data.frame() %>% dplyr::filter(counts.KO_S3 > 15), + structure( + list( + counts.KO_S3 = c(25L, 16L, 19L), + counts.KO_S4 = c(22L, 10L, 26L), + counts.WT_S1 = c(74L, 0L, 10L), + counts.WT_S2 = c(104L, 0L, 8L) + ), + class = "data.frame", + row.names = c( + "ENSG00000185658.13", + "ENSG00000233922.2", + "ENSG00000157601.14" + ) + ) + ) +})