From 9df94f0a916ff9fe86a32c07049a6bfa7422b0f0 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 7 Oct 2024 22:51:39 -0400 Subject: [PATCH] docs: document all params --- NAMESPACE | 2 +- R/0_renee-class.R | 7 +- R/counts.R | 4 +- R/filter.R | 80 +++++++------- R/metadata.R | 2 + man/as_integer_df.Rd | 23 ++++ man/calc_cpm.Rd | 31 ++++++ man/calc_cpm_df.Rd | 22 ++++ man/counts_dat_to_matrix.Rd | 3 + man/create_reneeDataSet_from_dataframes.Rd | 9 +- man/create_reneeDataSet_from_files.Rd | 13 ++- man/filter_counts.Rd | 122 ++++++++++++++++++--- man/meta_tbl_to_dat.Rd | 2 + man/plot_heatmap.Rd | 6 +- man/plot_pca.Rd | 30 ++++- man/remove_low_count_genes.Rd | 26 +++-- man/rename_samples.Rd | 8 +- man/reneeDataSet.Rd | 5 +- man/reneeTools-package.Rd | 1 + man/validate_sample_metadata.Rd | 6 +- 20 files changed, 311 insertions(+), 91 deletions(-) create mode 100644 man/as_integer_df.Rd create mode 100644 man/calc_cpm.Rd create mode 100644 man/calc_cpm_df.Rd diff --git a/NAMESPACE b/NAMESPACE index 204f7d0..3e8830d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,7 @@ # Generated by roxygen2: do not edit by hand export("%>%") -export(counts_dat_to_matrix) +export(calc_cpm) export(create_reneeDataSet_from_dataframes) export(create_reneeDataSet_from_files) export(filter_counts) diff --git a/R/0_renee-class.R b/R/0_renee-class.R index e8aef2b..601fb6c 100644 --- a/R/0_renee-class.R +++ b/R/0_renee-class.R @@ -1,9 +1,8 @@ #' reneeDataSet class #' -#' @param count_dat expected gene counts from RSEM as a data frame or tibble. -#' Must contain a `gene_id` column and a column for each sample ID in the metadata. #' @param sample_meta_dat sample metadata as a data frame or tibble. #' Must contain a `sample_id` column. +#' @param counts_lst named list of dataframes containing counts, e.g. expected gene counts from RSEM. Each data frame is expected to contain a `gene_id` column and a column for each sample ID in the metadata. 
#' #' reneeDataSet <- S7::new_class("renee", @@ -34,6 +33,8 @@ reneeDataSet <- S7::new_class("renee", #' #' @param sample_meta_filepath path to tsv file with sample IDs and metadata for differential analysis. #' @param gene_counts_filepath path to tsv file of expected gene counts from RSEM. +#' @param count_type type to assign the values of `gene_counts_filepath` to in the `counts` slot +#' @param sample_id_colname name of the column in `sample_meta_filepath` that contains the sample IDs #' #' @return reneeDataSet object #' @export @@ -67,6 +68,8 @@ create_reneeDataSet_from_files <- function(sample_meta_filepath, gene_counts_fil #' Construct a reneeDataSet object from data frames #' #' @inheritParams reneeDataSet +#' @inheritParams create_reneeDataSet_from_files +#' @param count_dat data frame of feature counts (e.g. expected gene counts from RSEM) #' #' @return reneeDataSet object #' @export diff --git a/R/counts.R b/R/counts.R index af9ab9e..42892c8 100644 --- a/R/counts.R +++ b/R/counts.R @@ -40,7 +40,7 @@ calc_cpm_df <- function(dat, gene_colname = "gene_id", ...) { gene_ids <- dat %>% dplyr::pull(gene_colname) row_names <- rownames(dat) dat_cpm <- dat %>% - dplyr::select(-any_of(gene_colname)) %>% + dplyr::select(-tidyselect::any_of(gene_colname)) %>% as.matrix() %>% edgeR::cpm(...) %>% as.data.frame() @@ -61,7 +61,9 @@ calc_cpm_df <- function(dat, gene_colname = "gene_id", ...) { #' @keywords internal #' #' @examples +#' \dontrun{ #' counts_dat_to_matrix(head(gene_counts)) +#' } counts_dat_to_matrix <- function(counts_tbl, gene_colname = "gene_id") { gene_colnames <- c("gene_id", "GeneName", "gene_name", "Gene", gene_colname) %>% unique() diff --git a/R/filter.R b/R/filter.R index 3e93072..7e51381 100644 --- a/R/filter.R +++ b/R/filter.R @@ -19,39 +19,42 @@ #' another based on unsupervised clustering. 
#' #' -#' @param renee_ds -#' @param gene_names_column -#' @param sample_names_column -#' @param group_column -#' @param label_column -#' @param columns_to_include -#' @param outlier_samples_to_remove -#' @param minimum_count_value_to_be_considered_nonzero -#' @param minimum_number_of_samples_with_nonzero_counts_in_total -#' @param use_group_based_filtering -#' @param minimum_number_of_samples_with_nonzero_counts_in_a_group -#' @param principal_component_on_x_axis -#' @param principal_component_on_y_axis -#' @param legend_position_for_pca -#' @param point_size_for_pca -#' @param add_label_to_pca -#' @param label_font_size -#' @param label_offset_y_ -#' @param label_offset_x_ -#' @param samples_to_rename_manually -#' @param color_histogram_by_group -#' @param set_min_max_for_x_axis_for_histogram -#' @param minimum_for_x_axis_for_histogram -#' @param maximum_for_x_axis_for_histogram -#' @param legend_position_for_histogram -#' @param legend_font_size_for_histogram -#' @param number_of_histogram_legend_columns -#' @param colors_for_plots -#' @param number_of_image_rows -#' @param interactive_plots -#' @param plot_correlation_matrix_heatmap +#' @param renee_ds reneeDataSet object (see `create_reneeDataSet_from_dataframes()`) +#' @param count_type the type of counts to use -- must be a name in the counts slot (`renee_ds@counts`) +#' @param gene_names_column The column from your input Counts Matrix containing the Feature IDs (Usually Gene or Protein ID). This is usually the first column of your input Counts Matrix. Only columns of Text type from your input Counts Matrix will be available to select for this parameter. +#' @param sample_names_column The column from your input Sample Metadata table containing the sample names. The names in this column must exactly match the names used as the sample column names of your input Counts Matrix. Only columns of Text type from your input Sample Metadata table will be available to select for this parameter. 
+#' @param group_column The column from your input Sample Metadata table containing the sample group information. This is usually a column showing to which experimental treatments each sample belongs (e.g. WildType, Knockout, Tumor, Normal, Before, After, etc.). Only columns of Text type from your input Sample Metadata will be available to select for this parameter. +#' @param label_column The column from your input Sample Metadata table containing the sample labels as you wish them to appear in the plots produced by this template. This can be the same Sample Names Column. However, you may desire different labels to display on your figure (e.g. shorter labels are sometimes preferred on plots). In that case, select the column with your preferred Labels here. The selected column should contain unique names for each sample. +#' @param columns_to_include Which Columns would you like to include? Usually, you will choose a feature ID column (e.g. gene or protein ID) and all sample columns. Columns excluded here will be removed in this step and from further analysis downstream of this step. +#' @param outlier_samples_to_remove A list of sample names to remove from the analysis. +#' @param use_cpm_counts_to_filter If no transformation has been performed on counts matrix (eg Raw Counts) set to TRUE. If TRUE counts will be transformed to CPM and filtered based on given criteria. If gene counts matrix has been transformed (eg log2, CPM, FPKM or some form of Normalization) set to FALSE. If FALSE no further transformation will be applied and features will be filtered as is. For RNAseq data RAW counts should be transformed to CPM in order to properly filter. +#' @param minimum_count_value_to_be_considered_nonzero Minimum count value to be considered non-zero for a sample +#' @param minimum_number_of_samples_with_nonzero_counts_in_total Minimum number of samples (total) with non-zero counts +#' @param use_group_based_filtering If TRUE, only keeps features (e.g. 
genes) that have at least a certain number of samples with nonzero CPM counts in at least one group +#' @param minimum_number_of_samples_with_nonzero_counts_in_a_group Only keeps genes that have at least this number of samples with nonzero CPM counts in at least one group +#' @param make_plots whether to create plots +#' @param principal_component_on_x_axis The principal component to plot on the x-axis for the PCA plot. Choices include 1, 2, 3, ... (default: 1) +#' @param principal_component_on_y_axis The principal component to plot on the y-axis for the PCA plot. Choices include 1, 2, 3, ... (default: 2) +#' @param legend_position_for_pca legend position for the PCA plot +#' @param point_size_for_pca geom point size for the PCA plot +#' @param add_label_to_pca label points on the PCA plot +#' @param label_font_size label font size for the PCA plot +#' @param label_offset_y_ label offset y for the PCA plot +#' @param label_offset_x_ label offset x for the PCA plot +#' @param samples_to_rename_manually If you do not have a Plot Labels Column in your sample metadata table, you can use this parameter to rename samples manually for display on the PCA plot. Use "Add item" to add each additional sample for renaming. Use the following format to describe which old name (in your sample metadata table) you want to rename to which new name: old_name: new_name +#' @param color_histogram_by_group Set to FALSE to label histogram by Sample Names, or set to TRUE to label histogram by the column you select in the "Group Column Used to Color Histogram" parameter (below). Default is FALSE. +#' @param set_min_max_for_x_axis_for_histogram whether to set min/max value for histogram x-axis +#' @param minimum_for_x_axis_for_histogram x-axis minimum for histogram plot +#' @param maximum_for_x_axis_for_histogram x-axis maximum for histogram plot +#' @param legend_position_for_histogram legend position for the histogram plot. consider setting to 'none' for a large number of samples. 
+#' @param legend_font_size_for_histogram legend font size for the histogram plot +#' @param number_of_histogram_legend_columns number of columns for the histogram legend +#' @param colors_for_plots Colors for the PCA and histogram will be picked, in order, from this list. If you have >12 samples or groups, program will choose from a wide range of random colors +#' @param number_of_image_rows number of rows for the plot image. 1 = side-by-side, 2 = stacked +#' @param interactive_plots set to TRUE to make PCA and Histogram plots interactive with `plotly`, allowing you to hover your mouse over a point or line to view sample information. The similarity heat map will not display if this toggle is set to TRUE. Default is FALSE. +#' @param plot_correlation_matrix_heatmap Data sets with a large number of samples may be too large to create a correlation matrix heat map. If this template takes longer than 5 minutes to run, Toggle switch to FALSE and the correlation matrix will not be created. Default is TRUE. 
#' -#' @return reneeDataSet with filtered counts +#' @return `reneeDataSet` with filtered counts #' @export #' #' @examples @@ -59,12 +62,13 @@ #' as.data.frame(nidap_sample_metadata), #' as.data.frame(nidap_clean_raw_counts), #' sample_id_colname = "Sample" -#' ) -#' set.seed(10) -#' renee_ds2 <- renee_ds %>% -#' calc_cpm() %>% -#' filter_counts() -#' head(renee_ds2@counts$filt) +#' ) %>% +#' calc_cpm(gene_colname = "Gene") %>% +#' filter_counts( +#' sample_names_column = "Sample", +#' gene_names_column = "Gene" +#' ) +#' head(renee_ds@counts$filt) #' filter_counts <- function(renee_ds, count_type = "raw", diff --git a/R/metadata.R b/R/metadata.R index 276594f..3b7a9fc 100644 --- a/R/metadata.R +++ b/R/metadata.R @@ -2,6 +2,8 @@ #' #' @param meta_tbl tibble with `sample_id` column #' +#' @inheritParams create_reneeDataSet_from_files +#' #' @return dataframe where row names are the sample IDs #' @export #' diff --git a/man/as_integer_df.Rd b/man/as_integer_df.Rd new file mode 100644 index 0000000..ea852e3 --- /dev/null +++ b/man/as_integer_df.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/counts.R +\name{as_integer_df} +\alias{as_integer_df} +\title{Convert all numeric columns in a dataframe to integers} +\usage{ +as_integer_df(counts_tbl) +} +\arguments{ +\item{counts_tbl}{data frame with numeric columns} +} +\value{ +data frame with any numeric columns as integers +} +\description{ +Round doubles to integers and convert to integer type +} +\examples{ +\dontrun{ +data.frame(a = c(0, 0.1, 2.3, 5L, 6.9)) \%>\% as_integer_df() +} +} +\keyword{internal} diff --git a/man/calc_cpm.Rd b/man/calc_cpm.Rd new file mode 100644 index 0000000..fcce4f5 --- /dev/null +++ b/man/calc_cpm.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/counts.R +\name{calc_cpm} +\alias{calc_cpm} +\title{Calculate counts-per-million (CPM) on raw counts in a reneeDataSet} +\usage{ 
+calc_cpm(renee_ds, ...) +} +\arguments{ +\item{renee_ds}{reneeDataSet object} + +\item{...}{additional arguments to pass to edgeR::cpm()} +} +\value{ +reneeDataSet with cpm-transformed counts +} +\description{ +Calculate counts-per-million (CPM) on raw counts in a reneeDataSet +} +\examples{ +sample_meta <- data.frame( + sample_id = c("KO_S3", "KO_S4", "WT_S1", "WT_S2"), + condition = factor( + c("knockout", "knockout", "wildtype", "wildtype"), + levels = c("wildtype", "knockout") + ) +) +renee_ds <- create_reneeDataSet_from_dataframes(sample_meta, gene_counts) \%>\% + calc_cpm() +head(renee_ds@counts$cpm) +} diff --git a/man/calc_cpm_df.Rd b/man/calc_cpm_df.Rd new file mode 100644 index 0000000..fb0f680 --- /dev/null +++ b/man/calc_cpm_df.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/counts.R +\name{calc_cpm_df} +\alias{calc_cpm_df} +\title{Calculate CPM on a data frame} +\usage{ +calc_cpm_df(dat, gene_colname = "gene_id", ...) +} +\arguments{ +\item{dat}{data frame of counts with a gene column} + +\item{gene_colname}{name of the gene column (default: "gene_id")} + +\item{...}{additional arguments to pass to edger::cpm()} +} +\value{ +cpm-transformed counts as a data frame +} +\description{ +Calculate CPM on a data frame +} +\keyword{internal} diff --git a/man/counts_dat_to_matrix.Rd b/man/counts_dat_to_matrix.Rd index 3b17a30..c833338 100644 --- a/man/counts_dat_to_matrix.Rd +++ b/man/counts_dat_to_matrix.Rd @@ -16,5 +16,8 @@ matrix of gene counts with rows as gene IDs Convert a data frame of gene counts to a matrix } \examples{ +\dontrun{ counts_dat_to_matrix(head(gene_counts)) } +} +\keyword{internal} diff --git a/man/create_reneeDataSet_from_dataframes.Rd b/man/create_reneeDataSet_from_dataframes.Rd index 2f60eef..7b6f4a2 100644 --- a/man/create_reneeDataSet_from_dataframes.Rd +++ b/man/create_reneeDataSet_from_dataframes.Rd @@ -13,10 +13,13 @@ create_reneeDataSet_from_dataframes( } \arguments{ 
\item{sample_meta_dat}{sample metadata as a data frame or tibble. -Must contain a \code{sample_ID} column.} +Must contain a \code{sample_id} column.} -\item{count_dat}{expected gene counts from RSEM as a data frame or tibble. -Must contain a \code{gene_id} column and a column for each sample ID in the metadata.} +\item{count_dat}{data frame of feature counts (e.g. expected gene counts from RSEM)} + +\item{sample_id_colname}{name of the column in \code{sample_meta_filepath} that contains the sample IDs} + +\item{count_type}{type to assign the values of \code{gene_counts_filepath} to in the \code{counts} slot} } \value{ reneeDataSet object diff --git a/man/create_reneeDataSet_from_files.Rd b/man/create_reneeDataSet_from_files.Rd index 1dfb9f6..a21e1b9 100644 --- a/man/create_reneeDataSet_from_files.Rd +++ b/man/create_reneeDataSet_from_files.Rd @@ -15,6 +15,10 @@ create_reneeDataSet_from_files( \item{sample_meta_filepath}{path to tsv file with sample IDs and metadata for differential analysis.} \item{gene_counts_filepath}{path to tsv file of expected gene counts from RSEM.} + +\item{count_type}{type to assign the values of \code{gene_counts_filepath} to in the \code{counts} slot} + +\item{sample_id_colname}{name of the column in \code{sample_meta_filepath} that contains the sample IDs} } \value{ reneeDataSet object @@ -25,10 +29,13 @@ Construct a reneeDataSet object from tsv files. 
\examples{ renee_ds <- create_reneeDataSet_from_files( sample_meta_filepath = system.file("extdata", - "sample_metadata.tsv", package = "reneeTools"), + "sample_metadata.tsv", + package = "reneeTools" + ), gene_counts_filepath = system.file("extdata", - "RSEM.genes.expected_count.all_samples.txt", - package = "reneeTools") + "RSEM.genes.expected_count.all_samples.txt", + package = "reneeTools" + ) ) renee_ds@counts$raw \%>\% head() renee_ds@sample_meta diff --git a/man/filter_counts.Rd b/man/filter_counts.Rd index 08ab705..85cc898 100644 --- a/man/filter_counts.Rd +++ b/man/filter_counts.Rd @@ -6,22 +6,23 @@ \usage{ filter_counts( renee_ds, - gene_names_column = "Gene", - sample_names_column = "Sample", - groups_column = "Group", - labels_column = "Label", + count_type = "raw", + gene_names_column = "gene_id", + sample_names_column = "sample_id", + group_column = "Group", + label_column = "Label", columns_to_include = c("Gene", "A1", "A2", "A3", "B1", "B2", "B3", "C1", "C2", "C3"), outlier_samples_to_remove = c(), + minimum_count_value_to_be_considered_nonzero = 8, + minimum_number_of_samples_with_nonzero_counts_in_total = 7, use_cpm_counts_to_filter = TRUE, - Minimum_Count_Value_to_be_Considered_Nonzero = 8, - Minimum_Number_of_Samples_with_Nonzero_Counts_in_Total = 7, - Use_Group_Based_Filtering = FALSE, - Minimum_Number_of_Samples_with_Nonzero_Counts_in_a_Group = 3, + use_group_based_filtering = FALSE, + minimum_number_of_samples_with_nonzero_counts_in_a_group = 3, principal_component_on_x_axis = 1, principal_component_on_y_axis = 2, legend_position_for_pca = "top", point_size_for_pca = 1, - add_labels_to_pca = TRUE, + add_label_to_pca = TRUE, label_font_size = 3, label_offset_y_ = 2, label_offset_x_ = 2, @@ -37,28 +38,113 @@ filter_counts( "coral", "azure", "green", "rum", "orange", "olive"), number_of_image_rows = 2, interactive_plots = FALSE, - plot_correlation_matrix_heatmap = TRUE + plot_correlation_matrix_heatmap = TRUE, + make_plots = TRUE ) } \arguments{ 
-\item{counts_matrix}{The input Counts Matrix. Usually, this will be your Cleaned Counts matrix.} +\item{renee_ds}{reneeDataSet object (see \code{create_reneeDataSet_from_dataframes()})} -\item{sample_metadata}{The Sample Metadata table containing your sample metadata. At minimum, this table must include one column each of the following: Samples, Groups, Batches, and Labels. The names in the Samples column of your input Sample Metadata must match the Sample Column Names of your input Counts Matrix exactly. You may have more than one column showing different Groups by which your samples may be organized (e.g. Genotype, Response, Time, etc.).} +\item{count_type}{the type of counts to use -- must be a name in the counts slot (\code{renee_ds@counts})} + +\item{gene_names_column}{The column from your input Counts Matrix containing the Feature IDs (Usually Gene or Protein ID). This is usually the first column of your input Counts Matrix. Only columns of Text type from your input Counts Matrix will be available to select for this parameter.} + +\item{sample_names_column}{The column from your input Sample Metadata table containing the sample names. The names in this column must exactly match the names used as the sample column names of your input Counts Matrix. Only columns of Text type from your input Sample Metadata table will be available to select for this parameter.} + +\item{group_column}{The column from your input Sample Metadata table containing the sample group information. This is usually a column showing to which experimental treatments each sample belongs (e.g. WildType, Knockout, Tumor, Normal, Before, After, etc.). Only columns of Text type from your input Sample Metadata will be available to select for this parameter.} + +\item{label_column}{The column from your input Sample Metadata table containing the sample labels as you wish them to appear in the plots produced by this template. This can be the same Sample Names Column. 
However, you may desire different labels to display on your figure (e.g. shorter labels are sometimes preferred on plots). In that case, select the column with your preferred Labels here. The selected column should contain unique names for each sample.} + +\item{columns_to_include}{Which Columns would you like to include? Usually, you will choose a feature ID column (e.g. gene or protein ID) and all sample columns. Columns excluded here will be removed in this step and from further analysis downstream of this step.} + +\item{outlier_samples_to_remove}{A list of sample names to remove from the analysis.} + +\item{minimum_count_value_to_be_considered_nonzero}{Minimum count value to be considered non-zero for a sample} + +\item{minimum_number_of_samples_with_nonzero_counts_in_total}{Minimum number of samples (total) with non-zero counts} + +\item{use_cpm_counts_to_filter}{If no transformation has been performed on counts matrix (eg Raw Counts) set to TRUE. If TRUE counts will be transformed to CPM and filtered based on given criteria. If gene counts matrix has been transformed (eg log2, CPM, FPKM or some form of Normalization) set to FALSE. If FALSE no further transformation will be applied and features will be filtered as is. For RNAseq data RAW counts should be transformed to CPM in order to properly filter.} + +\item{use_group_based_filtering}{If TRUE, only keeps features (e.g. genes) that have at least a certain number of samples with nonzero CPM counts in at least one group} + +\item{minimum_number_of_samples_with_nonzero_counts_in_a_group}{Only keeps genes that have at least this number of samples with nonzero CPM counts in at least one group} + +\item{principal_component_on_x_axis}{The principal component to plot on the x-axis for the PCA plot. Choices include 1, 2, 3, ... (default: 1)} + +\item{principal_component_on_y_axis}{The principal component to plot on the y-axis for the PCA plot. Choices include 1, 2, 3, ... 
(default: 2)} + +\item{legend_position_for_pca}{legend position for the PCA plot} + +\item{point_size_for_pca}{geom point size for the PCA plot} + +\item{add_label_to_pca}{label points on the PCA plot} + +\item{label_font_size}{label font size for the PCA plot} + +\item{label_offset_y_}{label offset y for the PCA plot} + +\item{label_offset_x_}{label offset x for the PCA plot} + +\item{samples_to_rename_manually}{If you do not have a Plot Labels Column in your sample metadata table, you can use this parameter to rename samples manually for display on the PCA plot. Use "Add item" to add each additional sample for renaming. Use the following format to describe which old name (in your sample metadata table) you want to rename to which new name: old_name: new_name} + +\item{color_histogram_by_group}{Set to FALSE to label histogram by Sample Names, or set to TRUE to label histogram by the column you select in the "Group Column Used to Color Histogram" parameter (below). Default is FALSE.} + +\item{set_min_max_for_x_axis_for_histogram}{whether to set min/max value for histogram x-axis} + +\item{minimum_for_x_axis_for_histogram}{x-axis minimum for histogram plot} + +\item{maximum_for_x_axis_for_histogram}{x-axis maximum for histogram plot} + +\item{legend_position_for_histogram}{legend position for the histogram plot. consider setting to 'none' for a large number of samples.} + +\item{legend_font_size_for_histogram}{legend font size for the histogram plot} + +\item{number_of_histogram_legend_columns}{number of columns for the histogram legend} + +\item{colors_for_plots}{Colors for the PCA and histogram will be picked, in order, from this list. If you have >12 samples or groups, program will choose from a wide range of random colors} + +\item{number_of_image_rows}{number of rows for the plot image. 
1 = side-by-side, 2 = stacked} + +\item{interactive_plots}{set to TRUE to make PCA and Histogram plots interactive with \code{plotly}, allowing you to hover your mouse over a point or line to view sample information. The similarity heat map will not display if this toggle is set to TRUE. Default is FALSE.} + +\item{plot_correlation_matrix_heatmap}{Data sets with a large number of samples may be too large to create a correlation matrix heat map. If this template takes longer than 5 minutes to run, Toggle switch to FALSE and the correlation matrix will not be created. Default is TRUE.} + +\item{make_plots}{whether to create plots} } \value{ -Filtered counts as a dataframe +\code{reneeDataSet} with filtered counts } \description{ -Filter low counts +This is often the first step in the QC portion of an analysis to filter out +features that have very low raw counts across most or all of your samples. +} +\details{ +This function takes a reneeDataSet containing raw counts and a sample +metadata table, and returns the reneeDataSet object with filtered counts. +It also produces an image consisting of three QC plots. + +You can tune the threshold for how low the counts for a given gene can be +before they are deemed "too low" and filtered out of downstream analysis. By +default, this parameter (\code{minimum_count_value_to_be_considered_nonzero}) +is set to 8, meaning any count value less than 8 will count as "too low". + +The QC plots are provided to help you assess: (1) PCA Plot: the within and +between group variance in expression after dimensionality reduction; (2) +Count Density Histogram: the dis/similarity of count distributions between +samples; and (3) Similarity Heatmap: the overall similarity of samples to one +another based on unsupervised clustering. 
} \examples{ renee_ds <- create_reneeDataSet_from_dataframes( as.data.frame(nidap_sample_metadata), as.data.frame(nidap_clean_raw_counts), sample_id_colname = "Sample" -) -set.seed(10) -renee_ds2 <- filter_counts(renee_ds) -head(renee_ds2@counts[["filt"]]) + ) \%>\% + calc_cpm(gene_colname = "Gene") \%>\% + filter_counts( + sample_names_column = "Sample", + gene_names_column = "Gene" + ) +head(renee_ds@counts$filt) } diff --git a/man/meta_tbl_to_dat.Rd b/man/meta_tbl_to_dat.Rd index 9495bd9..90b3239 100644 --- a/man/meta_tbl_to_dat.Rd +++ b/man/meta_tbl_to_dat.Rd @@ -8,6 +8,8 @@ meta_tbl_to_dat(meta_tbl, sample_id_colname = sample_id) } \arguments{ \item{meta_tbl}{tibble with \code{sample_id} column} + +\item{sample_id_colname}{name of the column in \code{sample_meta_filepath} that contains the sample IDs} } \value{ dataframe where row names are the sample IDs diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd index 19f856a..8d8ffb0 100644 --- a/man/plot_heatmap.Rd +++ b/man/plot_heatmap.Rd @@ -8,15 +8,15 @@ plot_heatmap( counts_matrix, sample_metadata, sample_names_column, - labels_column, + label_column, anno_column, anno_colors ) } \arguments{ -\item{counts_matrix}{The input Counts Matrix. Usually, this will be your Cleaned Counts matrix.} +\item{sample_names_column}{The column from your input Sample Metadata table containing the sample names. The names in this column must exactly match the names used as the sample column names of your input Counts Matrix. Only columns of Text type from your input Sample Metadata table will be available to select for this parameter.} -\item{sample_metadata}{The Sample Metadata table containing your sample metadata. At minimum, this table must include one column each of the following: Samples, Groups, Batches, and Labels. The names in the Samples column of your input Sample Metadata must match the Sample Column Names of your input Counts Matrix exactly. 
You may have more than one column showing different Groups by which your samples may be organized (e.g. Genotype, Response, Time, etc.).} +\item{label_column}{The column from your input Sample Metadata table containing the sample labels as you wish them to appear in the plots produced by this template. This can be the same Sample Names Column. However, you may desire different labels to display on your figure (e.g. shorter labels are sometimes preferred on plots). In that case, select the column with your preferred Labels here. The selected column should contain unique names for each sample.} \item{anno_column}{annotation (group) column} diff --git a/man/plot_pca.Rd b/man/plot_pca.Rd index 0b8a01b..41b0d8b 100644 --- a/man/plot_pca.Rd +++ b/man/plot_pca.Rd @@ -9,14 +9,14 @@ plot_pca( sample_metadata, samples_to_include, samples_to_rename_manually, - groups_column, - labels_column, + group_column, + label_column, color_values, principal_component_on_x_axis = 1, principal_component_on_y_axis = 2, legend_position_for_pca = "top", point_size_for_pca = 1, - add_labels_to_pca = TRUE, + add_label_to_pca = TRUE, label_font_size = 3, label_offset_y_ = 2, label_offset_x_ = 2 @@ -25,9 +25,29 @@ plot_pca( \arguments{ \item{log_counts}{log-transformed filtered counts} -\item{sample_metadata}{The Sample Metadata table containing your sample metadata. At minimum, this table must include one column each of the following: Samples, Groups, Batches, and Labels. The names in the Samples column of your input Sample Metadata must match the Sample Column Names of your input Counts Matrix exactly. You may have more than one column showing different Groups by which your samples may be organized (e.g. 
Genotype, Response, Time, etc.).} - \item{samples_to_include}{samples in \code{sample_metadata} to include in the analysis} + +\item{samples_to_rename_manually}{If you do not have a Plot Labels Column in your sample metadata table, you can use this parameter to rename samples manually for display on the PCA plot. Use "Add item" to add each additional sample for renaming. Use the following format to describe which old name (in your sample metadata table) you want to rename to which new name: old_name: new_name} + +\item{group_column}{The column from your input Sample Metadata table containing the sample group information. This is usually a column showing to which experimental treatments each sample belongs (e.g. WildType, Knockout, Tumor, Normal, Before, After, etc.). Only columns of Text type from your input Sample Metadata will be available to select for this parameter.} + +\item{label_column}{The column from your input Sample Metadata table containing the sample labels as you wish them to appear in the plots produced by this template. This can be the same Sample Names Column. However, you may desire different labels to display on your figure (e.g. shorter labels are sometimes preferred on plots). In that case, select the column with your preferred Labels here. The selected column should contain unique names for each sample.} + +\item{principal_component_on_x_axis}{The principal component to plot on the x-axis for the PCA plot. Choices include 1, 2, 3, ... (default: 1)} + +\item{principal_component_on_y_axis}{The principal component to plot on the y-axis for the PCA plot. Choices include 1, 2, 3, ... 
(default: 2)} + +\item{legend_position_for_pca}{legend position for the PCA plot} + +\item{point_size_for_pca}{geom point size for the PCA plot} + +\item{add_label_to_pca}{label points on the PCA plot} + +\item{label_font_size}{label font size for the PCA plot} + +\item{label_offset_y_}{label offset y for the PCA plot} + +\item{label_offset_x_}{label offset x for the PCA plot} } \value{ PCA plot diff --git a/man/remove_low_count_genes.Rd b/man/remove_low_count_genes.Rd index 64138d7..62973f9 100644 --- a/man/remove_low_count_genes.Rd +++ b/man/remove_low_count_genes.Rd @@ -8,24 +8,34 @@ remove_low_count_genes( counts_matrix, sample_metadata, gene_names_column, - groups_column, + group_column, use_cpm_counts_to_filter = TRUE, - Use_Group_Based_Filtering = FALSE, - Minimum_Count_Value_to_be_Considered_Nonzero = 8, - Minimum_Number_of_Samples_with_Nonzero_Counts_in_Total = 7, - Minimum_Number_of_Samples_with_Nonzero_Counts_in_a_Group = 3 + use_group_based_filtering = FALSE, + minimum_count_value_to_be_considered_nonzero = 8, + minimum_number_of_samples_with_nonzero_counts_in_total = 7, + minimum_number_of_samples_with_nonzero_counts_in_a_group = 3 ) } \arguments{ -\item{counts_matrix}{The input Counts Matrix. Usually, this will be your Cleaned Counts matrix.} +\item{gene_names_column}{The column from your input Counts Matrix containing the Feature IDs (Usually Gene or Protein ID). This is usually the first column of your input Counts Matrix. Only columns of Text type from your input Counts Matrix will be available to select for this parameter.} -\item{sample_metadata}{The Sample Metadata table containing your sample metadata. At minimum, this table must include one column each of the following: Samples, Groups, Batches, and Labels. The names in the Samples column of your input Sample Metadata must match the Sample Column Names of your input Counts Matrix exactly. You may have more than one column showing different Groups by which your samples may be organized (e.g. 
Genotype, Response, Time, etc.).} +\item{group_column}{The column from your input Sample Metadata table containing the sample group information. This is usually a column showing to which experimental treatments each sample belongs (e.g. WildType, Knockout, Tumor, Normal, Before, After, etc.). Only columns of Text type from your input Sample Metadata will be available to select for this parameter.} + +\item{use_cpm_counts_to_filter}{If no transformation has been performed on counts matrix (e.g. Raw Counts) set to TRUE. If TRUE counts will be transformed to CPM and filtered based on given criteria. If gene counts matrix has been transformed (e.g. log2, CPM, FPKM or some form of Normalization) set to FALSE. If FALSE no further transformation will be applied and features will be filtered as is. For RNAseq data RAW counts should be transformed to CPM in order to properly filter.} + +\item{use_group_based_filtering}{If TRUE, only keeps features (e.g. genes) that have at least a certain number of samples with nonzero CPM counts in at least one group} + +\item{minimum_count_value_to_be_considered_nonzero}{Minimum count value to be considered non-zero for a sample} + +\item{minimum_number_of_samples_with_nonzero_counts_in_total}{Minimum number of samples (total) with non-zero counts} + +\item{minimum_number_of_samples_with_nonzero_counts_in_a_group}{Only keeps genes that have at least this number of samples with nonzero CPM counts in at least one group} } \value{ counts matrix with low-count genes removed } \description{ -TODO this function also transforms raw counts to CPM, but that should be a separate function before this step +TODO this function also transforms raw counts to CPM, but that should be a separate function before this step, before filter_counts function() TODO document \code{isexpr1} column in output } \keyword{internal} diff --git a/man/rename_samples.Rd b/man/rename_samples.Rd index 5055b0d..626851f 100644 --- a/man/rename_samples.Rd +++
b/man/rename_samples.Rd @@ -9,13 +9,15 @@ rename_samples(dat, samples_to_rename_manually) \arguments{ \item{dat}{data frame containing a \code{sample} column} -\item{samples_to_rename_manually}{TODO ask Phil for expected format} +\item{samples_to_rename_manually}{TODO use sample metadata spreadsheet custom column} } \value{ data frame with samples renamed } \description{ -TODO this should probably be performed earlier on in the template? -why wait til after PCA is calculated? +TODO this should happen right at the beginning of the template? +} +\details{ +TODO accept new names for samples in sample metadata spreadsheet } \keyword{internal} diff --git a/man/reneeDataSet.Rd b/man/reneeDataSet.Rd index 923dec5..6f6e9f5 100644 --- a/man/reneeDataSet.Rd +++ b/man/reneeDataSet.Rd @@ -8,10 +8,9 @@ reneeDataSet(sample_meta_dat, counts_lst) } \arguments{ \item{sample_meta_dat}{sample metadata as a data frame or tibble. -Must contain a \code{sample_ID} column.} +Must contain a \code{sample_id} column.} -\item{count_dat}{expected gene counts from RSEM as a data frame or tibble. -Must contain a \code{gene_id} column and a column for each sample ID in the metadata.} +\item{counts_lst}{named list of dataframes containing counts, e.g. expected gene counts from RSEM. 
Each data frame is expected to contain a \code{gene_id} column and a column for each sample ID in the metadata.} } \description{ reneeDataSet class diff --git a/man/reneeTools-package.Rd b/man/reneeTools-package.Rd index e3ee607..50908a4 100644 --- a/man/reneeTools-package.Rd +++ b/man/reneeTools-package.Rd @@ -24,6 +24,7 @@ Useful links: Authors: \itemize{ \item Kelly Sovacool \email{kelly.sovacool@nih.gov} (\href{https://orcid.org/0000-0003-3283-829X}{ORCID}) + \item Samantha Chill \email{samantha.chill@nih.gov} (\href{https://orcid.org/0000-0002-8734-9875}{ORCID}) } Other contributors: diff --git a/man/validate_sample_metadata.Rd b/man/validate_sample_metadata.Rd index d7c2cae..951256d 100644 --- a/man/validate_sample_metadata.Rd +++ b/man/validate_sample_metadata.Rd @@ -8,13 +8,13 @@ validate_sample_metadata( counts_matrix, sample_metadata, sample_names_column = "Sample", - groups_column = "Group" + group_column = "Group" ) } \arguments{ -\item{counts_matrix}{The input Counts Matrix. Usually, this will be your Cleaned Counts matrix.} +\item{sample_names_column}{The column from your input Sample Metadata table containing the sample names. The names in this column must exactly match the names used as the sample column names of your input Counts Matrix. Only columns of Text type from your input Sample Metadata table will be available to select for this parameter.} -\item{sample_metadata}{The Sample Metadata table containing your sample metadata. At minimum, this table must include one column each of the following: Samples, Groups, Batches, and Labels. The names in the Samples column of your input Sample Metadata must match the Sample Column Names of your input Counts Matrix exactly. You may have more than one column showing different Groups by which your samples may be organized (e.g. Genotype, Response, Time, etc.).} +\item{group_column}{The column from your input Sample Metadata table containing the sample group information. 
This is usually a column showing to which experimental treatments each sample belongs (e.g. WildType, Knockout, Tumor, Normal, Before, After, etc.). Only columns of Text type from your input Sample Metadata will be available to select for this parameter.} } \value{ sample metadata with empty cells removed and special characters replaced