diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 96737c2..9693bf4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,8 @@ exclude: | (?x)( ^assets/| ^docs/.*.html| - ^data-raw/*.txt + ^data-raw/*.txt| + ^man/ ) repos: - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/.prettierignore b/.prettierignore index 4f54fd8..63da1a4 100644 --- a/.prettierignore +++ b/.prettierignore @@ -7,3 +7,4 @@ results/ *.code-workspace assets/*.html data-raw/*.txt +man/* diff --git a/DESCRIPTION b/DESCRIPTION index d68038f..c447c8c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,8 +17,9 @@ Depends: R (>= 2.10) Imports: dplyr, - readr + tidyr Suggests: + readr, testthat (>= 3.0.0) Config/testthat/edition: 3 Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index 3e73273..6e1f1dc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,6 @@ # Generated by roxygen2: do not edit by hand +export("%>%") export(filter_low_counts) export(read_raw_counts) +importFrom(dplyr,"%>%") diff --git a/R/filter_low_counts.R b/R/filter_low_counts.R index d7ca813..357f8bd 100644 --- a/R/filter_low_counts.R +++ b/R/filter_low_counts.R @@ -1,18 +1,27 @@ #' filter_low_counts #' -#' @param raw_counts_matrix raw_counts_matrix object -#' @param min_counts integer number of min_counts across all samples, default 0 -#' @param min_cpm float minimum cpm value, default 0 -#' @param min_cpm_fraction float fraction of samples that need to satisfy min_cpm filter, default 1.0 +#' @param counts_dat dataframe of expected gene counts from RSEM +#' @param min_counts integer number of minimum counts across all samples (default: 0) #' -#' @return filtered_raw_count_matrix +#' @return filtered counts dataframe #' @export #' #' @examples +#' filter_low_counts(gene_counts) filter_low_counts <- function( - raw_counts_matrix, - min_counts = 0, - min_cpm = 0, - min_cpm_fraction = 1.0) { - + counts_dat, + min_counts = 0) { + gene_id <- count <- count_sum <- NULL + genes_above_threshold <- counts_dat %>% + tidyr::pivot_longer(!c("gene_id", "GeneName"), + names_to = "sample_id", values_to = "count" + ) %>% + dplyr::group_by(gene_id) %>% + dplyr::summarize(count_sum = sum(count)) %>% + dplyr::filter(count_sum >= min_counts) %>% + dplyr::pull(gene_id) + return( + counts_dat %>% + dplyr::filter(gene_id %in% (genes_above_threshold)) + ) } diff --git a/man/filter_low_counts.Rd b/man/filter_low_counts.Rd index b37fb27..ffcff2a 100644 --- a/man/filter_low_counts.Rd +++ b/man/filter_low_counts.Rd @@ -4,25 +4,19 @@ \alias{filter_low_counts} \title{filter_low_counts} \usage{ -filter_low_counts( - raw_counts_matrix, - min_counts = 0, - min_cpm = 0, - min_cpm_fraction = 1 -) +filter_low_counts(counts_dat, min_counts = 0) } \arguments{ -\item{raw_counts_matrix}{raw_counts_matrix object} +\item{counts_dat}{dataframe of expected gene counts from RSEM} -\item{min_counts}{integer number of min_counts across all samples, default 0} - -\item{min_cpm}{float minimum cpm value, default 0} - -\item{min_cpm_fraction}{float fraction of samples that need to satisfy min_cpm filter, default 1.0} +\item{min_counts}{integer number of minimum counts across all samples (default: 0)} } \value{ -filtered_raw_count_matrix +filtered counts dataframe } \description{ filter_low_counts } +\examples{ +filter_low_counts(gene_counts) +} diff --git a/man/gene_counts.Rd b/man/gene_counts.Rd new file mode 100644 index 0000000..6d3f8e8 --- /dev/null +++ b/man/gene_counts.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{gene_counts} +\alias{gene_counts} +\title{RSEM gene counts} +\format{ +\subsection{\code{gene_counts}}{ + +A data frame with columns 'gene_id', 'GeneName', and a column for each sample's count. +} +} +\source{ +Generated by running RENEE v2.5.8 on the +\href{https://github.com/CCBR/RENEE/tree/e08f7db6c6e638cfd330caa182f64665d2ef37fa/.tests}{test dataset} +} +\usage{ +gene_counts +} +\description{ +RSEM gene counts +} +\keyword{datasets} diff --git a/man/reexports.Rd b/man/reexports.Rd new file mode 100644 index 0000000..9c345fa --- /dev/null +++ b/man/reexports.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reexports.R +\docType{import} +\name{reexports} +\alias{reexports} +\alias{\%>\%} +\title{dplyr pipe} +\keyword{internal} +\description{ +These objects are imported from other packages. Follow the links +below to see their documentation. + +\describe{ + \item{dplyr}{\code{\link[dplyr:reexports]{\%>\%}}} +}} + diff --git a/man/reneeTools-package.Rd b/man/reneeTools-package.Rd new file mode 100644 index 0000000..e3ee607 --- /dev/null +++ b/man/reneeTools-package.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reneeTools-package.R +\docType{package} +\name{reneeTools-package} +\alias{reneeTools} +\alias{reneeTools-package} +\title{reneeTools: R helper functions for RENEE} +\description{ +\code{reneeTools} implements helper functions for RENEE, a comprehensive +quality-control and quantification RNA-seq pipeline +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/CCBR/reneeTools} + \item \url{https://ccbr.github.io/reneeTools/} + \item Report bugs at \url{https://github.com/CCBR/reneeTools/issues} +} + +} +\author{ +\strong{Maintainer}: Vishal Koparde \email{vishal.koparde@nih.gov} (\href{https://orcid.org/0000-0001-8978-8495}{ORCID}) + +Authors: +\itemize{ + \item Kelly Sovacool \email{kelly.sovacool@nih.gov} (\href{https://orcid.org/0000-0003-3283-829X}{ORCID}) +} + +Other contributors: +\itemize{ + \item CCR Collaborative Bioinformatics Resource [copyright holder] +} + +} +\keyword{internal} diff --git a/tests/testthat/test-filter_low_counts.R b/tests/testthat/test-filter_low_counts.R index 4e547c4..441d338 100644 --- a/tests/testthat/test-filter_low_counts.R +++ b/tests/testthat/test-filter_low_counts.R @@ -1,3 +1,33 @@ -test_that("multiplication works", { - expect_equal(2 * 2, 4) # TODO write real tests +test_that("filter_low_counts works", { + test_dat <- data.frame( + gene_id = c("A", "B", "C"), + GeneName = c("geneA", "geneB", "geneC"), + s1 = c(0, 0, 0), + s2 = c(0, 1, 0), + s3 = c(0, 0, 3) + ) + expect_equal(filter_low_counts(test_dat), test_dat) + expect_equal( + filter_low_counts(test_dat, min_counts = 1), + data.frame( + gene_id = c("B", "C"), + GeneName = c("geneB", "geneC"), + s1 = c(0, 0), s2 = c(1, 0), s3 = c(0, 3) + ) + ) + expect_equal( + filter_low_counts(test_dat, min_counts = 2), + data.frame( + gene_id = "C", GeneName = "geneC", + s1 = 0, s2 = 0, s3 = 3 + ) + ) + expect_equal( + filter_low_counts(test_dat, min_counts = 5), + data.frame( + gene_id = character(0), + GeneName = character(0), + s1 = numeric(0), s2 = numeric(0), s3 = numeric(0) + ) + ) })