diff --git a/R/annot_rse.R b/R/annot_rse.R index 130ba7e..0b345b1 100644 --- a/R/annot_rse.R +++ b/R/annot_rse.R @@ -20,7 +20,7 @@ #' plus strand). #' @param drop If TRUE, remove sites overlapping SNPs #' @param RLE If TRUE, columns added will returned as [S4Vectors::Rle()] vectors -#' to reduce memory +#' to reduce memory usage. #' #' @param ... For the generic, further arguments to pass to specific methods. #' Unused for now. @@ -179,14 +179,15 @@ annot_snps.SummarizedExperiment <- function(obj, ...) { #' #' @description Utility function to map annotations from GRanges to rowData of #' SummarizedExperiment or to mcols of GRanges object. If multiple features overlap then -#' they will be concatenated as comma separated values. +#' they will be concatenated with the specified separtor string . #' #' @param obj RangedSummarizedExperiment or GRanges object #' @param gr GRanges with annotations to map to obj -#' @param cols_to_map character vector of columns from gr to map to obj. If the -#' vector has names, the names will be the column names in the output obj +#' @param cols_to_map character vector of columns from GRanges to map to SummarizedExperiment. +#' If the vector has names, the names will be the column names in the output. #' @param RLE If TRUE, columns added will returned as [S4Vectors::Rle()] vectors #' to reduce memory +#' @param sep separator string, defaults to comma. #' @param ... additional arguments to pass to [GenomicRanges::findOverlaps()] #' #' @return Either a SummarizedExperiment or GRanges object with additional @@ -210,7 +211,7 @@ annot_snps.SummarizedExperiment <- function(obj, ...) { #' @importFrom GenomeInfoDb seqlevelsStyle seqlevelsStyle<- seqlevels #' #' @export -annot_from_gr <- function(obj, gr, cols_to_map, RLE = TRUE, ...) { +annot_from_gr <- function(obj, gr, cols_to_map, RLE = TRUE, sep = ",", ...) { if (is(obj, "RangedSummarizedExperiment")) { gr_sites <- rowRanges(obj) return_se <- TRUE @@ -241,7 +242,7 @@ annot_from_gr <- function(obj, gr, cols_to_map, RLE = TRUE, ...) { overlaps, tmp = unstrsplit( unique(eval(parse(text = col))), - "," + sep = sep ), drop = FALSE ) diff --git a/R/calc_AEI.R b/R/calc_AEI.R index 197900e..7ece5ba 100644 --- a/R/calc_AEI.R +++ b/R/calc_AEI.R @@ -6,7 +6,7 @@ #' ALU elements in the human genome, and these regions have a high A-to-I #' editing signal compared to other regions such as coding exons. This #' function will perform pileup at specified repeat regions and return a -#' summary AEI metric. +#' summary AEI metric. #' #' @references Roth, S.H., Levanon, E.Y. & Eisenberg, E. Genome-wide #' quantification of ADAR adenosine-to-inosine RNA editing activity. Nat Methods diff --git a/R/calc_scAEI.R b/R/calc_scAEI.R index 159819d..67dd9f7 100644 --- a/R/calc_scAEI.R +++ b/R/calc_scAEI.R @@ -5,8 +5,9 @@ #' bases) observed at A positions. The vast majority A-to-I editing occurs in #' ALU elements in the human genome, and these regions have a high A-to-I #' editing signal compared to other regions such as coding exons. This -#' function will examine potential editing sites and return -#' summary AEI metric per cell. +#' function will examine enumerate edited and non-edited base counts at the supplied +#' sites and return summary AEI metric per cell. Potential editing sites within +#' repeat regions can be generated using `get_scAEI_sites()`. #' #' @references Roth, S.H., Levanon, E.Y. & Eisenberg, E. Genome-wide #' quantification of ADAR adenosine-to-inosine RNA editing activity. Nat Methods diff --git a/R/differential_editing.R b/R/differential_editing.R index 6f190d5..1a96c47 100644 --- a/R/differential_editing.R +++ b/R/differential_editing.R @@ -6,7 +6,7 @@ #' for each site (`edit_freq`), depth of coverage computed #' using the indicated edited nucleotides (`depth`) and new `colData` #' columns with the number of edited sites (`n_sites`) and the -#' fraction of edits (`edit_idx`) is returned. +#' fraction of edits (`edit_idx`) is returned. #' #' @param rse A [RangedSummarizedExperiment] object created by [pileup_sites()] #' @param edit_from This should correspond to a nucleotide or assay @@ -234,11 +234,8 @@ make_de_object <- function(rse, #' Perform differential editing #' #' @description Use `edgeR` or `DESeq2` to perform differential editing -#' analysis. This will work for simple designs that have 1 treatment and 1 -#' control. For more complex designs, we suggest you perform your own. -#' -#' At the moment, this function will only find editing events specific to the -#' treatment. +#' analysis. This will work for designs that have 1 treatment and 1 +#' control group. For more complex designs, we suggest you perform your own modeling. #' #' @param deobj A [RangedSummarizedExperiment] object prepared for differential #' editing analysis by [make_de_object()] diff --git a/R/filter_rse.R b/R/filter_rse.R index 33757fa..9d851d4 100644 --- a/R/filter_rse.R +++ b/R/filter_rse.R @@ -41,7 +41,7 @@ filter_multiallelic <- function(se) { #' Extract regions surrounding splice sites #' -#' @description Find intervals containing splice sites and their adjacent +#' @description Extract intervals at splice sites and their adjacent #' regions. #' #' @param txdb `GenomicFeatures::TxDb` @@ -162,10 +162,10 @@ filter_splice_variants <- function(rse, txdb, #' Filter out clustered sequence variants #' -#' @description Sequence variants of multiple allele types (e.g., `A>G`, `A>C`) -#' in nearby regions can be due to mis-alignment. Remove variants if multiple -#' allele types are present within a given distance in genomic or -#' transcriptome coordinate space. +#' @description Sequence variants of multiple allele types (e.g., `A -> G`, `A -> C`) +#' proximal to a putative editing site can be indicative of a region prone to mis-alignment +#' artifacts. Sites will be removed if variants of multiple allele types are present +#' within a given distance in genomic or transcriptome coordinate space. #' #' @param rse `SummarizedExperiment::SummarizedExperiment` containing editing sites #' @param txdb `GenomicFeatures::TxDb` @@ -298,7 +298,10 @@ filter_clustered_variants <- function(rse, txdb, #' @param edit_from non-edited base #' @param per_sample if TRUE, calculate confidence per sample, otherwise edited #' and non-edited counts will be summed across all samples. -#' @param exp_fraction Numeric, confidence margin parameter for +#' @param exp_fraction Numeric value between 0 and 1, specifying the expected error +#' rate +#' @param alpha Pseudo-count to add to non-edited base counts +#' @param beta Pseudo-count to add to edited base counts #' #' @examples #' rse_adar_ifn <- mock_rse() @@ -319,14 +322,21 @@ calc_confidence <- function(se, edit_to = "G", edit_from = "A", per_sample = FALSE, - exp_fraction = 0.01) { - if (length(exp_fraction) != 1) { - cli::cli_abort("exp_fraction must be numeric(1)") + exp_fraction = 0.01, + alpha = 0L, + beta = 0L) { + if (length(exp_fraction) != 1 || (exp_fraction < 0 || exp_fraction > 1)) { + cli::cli_abort("exp_fraction must be numeric(1) and between 0 and 1") } + + if (length(alpha) != 1 || length(beta) != 1) { + cli::cli_abort("alpha and beta must be length 1") + } + edit_to <- paste0("n", edit_to) edit_from <- paste0("n", edit_from) - alt <- assay(se, edit_to) - ref <- assay(se, edit_from) + alt <- assay(se, edit_to) + as.integer(beta) + ref <- assay(se, edit_from) + as.integer(alpha) if (per_sample) { nc <- ncol(se) res <- vapply(seq_len(nc), function(i) { diff --git a/R/pileup.R b/R/pileup.R index a205514..4cc7f64 100644 --- a/R/pileup.R +++ b/R/pileup.R @@ -1,19 +1,26 @@ #' Generate base counts using pileup -#' +#' +#' @description This function uses a pileup routine to examine numerate base counts from +#' alignments at specified sites, regions, or across all read alignments, from one or +#' more BAM files. Alignment and site filtering +#' options are controlled by the `FilterParam` class.A [RangedSummarizedExperiment] object +#' is returned, populated with base count statistics for each supplied BAM file. +#' #' @param bamfiles a character vector, [BamFile] or [BamFileList] indicating 1 or #' more BAM files to process. If named, the names will be included in the [colData] #' of the [RangedSummarizedExperiment] as a `sample` column, otherwise the names will #' be taken from the basename of the BAM file. -#' @param fasta path to genome fasta file used for read alignment. Can be in -#' gzip or bgzip format. +#' @param fasta path to genome fasta file used for read alignment. Can be provided in +#' compressed gzip or bgzip format. #' @param sites a [GRanges] object containing regions or sites to process. #' @param region samtools region query string (i.e. `chr1:100-1000`). Can be combined #' with sites, in which case sites will be filtered to keep only sites within the #' region. -#' @param chroms chromosomes to process, not to be used with region. +#' @param chroms chromosomes to process, provided as a character vector. Not to be used with the +#' region parameter. #' @param param object of class [FilterParam()] which specify various #' filters to apply to reads and sites during pileup. -#' @param umi_tag The bam tag containing a UMI sequence. If supplied, multiple +#' @param umi_tag The BAM tag containing a UMI sequence. If supplied, multiple #' reads with the same UMI sequence will only be counted once per position. #' @param BPPARAM A [BiocParallel] class to control parallel execution. Parallel #' processing occurs per chromosome and is disabled when run on a single @@ -33,7 +40,7 @@ #' * `REF`: The reference base #' * `rpbz`: Mann-Whitney U test of Read Position Bias from bcftools, #' extreme negative or positive values indicate more bias. -#' * `vdb`: Variant Distance Bias for filtering splice-site artefacts from +#' * `vdb`: Variant Distance Bias for filtering splice-site artifacts from #' bcftools, lower values indicate more bias. #' * `sor` Strand Odds Ratio Score, strand bias estimated by the Symmetric #' Odds Ratio test, based on GATK code. Higher values indicate more bias. diff --git a/R/sc-pileup.R b/R/sc-pileup.R index c4bb556..57f2f4c 100644 --- a/R/sc-pileup.R +++ b/R/sc-pileup.R @@ -1,22 +1,22 @@ -#' Pileup sites per cell +#' Generate base counts per cell #' -#' @description This function performs a pileup operation at specified -#' sites, returning counts for Reference (e.g. unedited) or Alternate (e.g. -#' edited) bases. `pileup_cells` can process either a 10x Genomic's style -#' library, from a aligned BAM file containing a tag with a cell-barcode and a -#' tag with a UMI, or smart-seq style libraries without cell-barcodes. +#' @description This function processes scRNA-seq library to enumerate base counts +#' for Reference (unedited) or Alternate ( +#' edited) bases at specified sites in single cells. `pileup_cells` can process +#' droplet scRNA-seq libraries, from a BAM file containing a cell-barcode and UMI, +#' or well-based libraries that do not contain cell-barcodes. #' -#' The `sites` parameter specifies sites to pileup. This must be a [GRanges] +#' The `sites` parameter specifies sites to quantify. This must be a [GRanges] #' object with 1 base intervals, a strand (+ or -), and supplemented with #' metadata columns named `REF` and `ALT` containing the reference and -#' alternate base to query. See examples for an example format. +#' alternate base to query. See examples for the required format. #' #' At each site, bases from overlapping reads will be examined, and counts of #' each ref and alt base enumerated for each cell-barcode present. A single #' base will be counted once for each UMI sequence present in each cell. #' -#' @param bamfiles a path to a BAM file (for 10x libraries), or a vector of paths -#' to BAM files (smart-seq2). Can be supplied as a character vector, [BamFile], or +#' @param bamfiles a path to a BAM file (for droplet scRNA-seq), or a vector of paths +#' to BAM files (Smart-seq2). Can be supplied as a character vector, [BamFile], or #' [BamFileList]. #' @param sites a GRanges object containing sites to process. See examples for #' valid formatting. @@ -25,7 +25,7 @@ #' @param chroms A character vector of chromosomes to process. If supplied, only #' sites present in the listed chromosomes will be processed #' @param cell_barcodes A character vector of single cell barcodes to process. If -#' processing multiple BAM files (e.g. smart-seq-2), provide a character vector +#' processing multiple BAM files (e.g. Smart-seq2), provide a character vector #' of unique identifiers for each input BAM, to name each BAM file in the output files. #' @param param object of class [FilterParam()] which specify various filters to #' apply to reads and sites during pileup. Note that the `min_depth` and @@ -34,11 +34,9 @@ #' set to 2, then at least 2 reads (from any cell) must have a variant in order #' to report the site. Setting `min_depth` and `min_variant_reads` to 0 reports #' all sites present in the `sites` object. The following options are not enabled -#' for pileup_cells(): max_mismatch_type, homopolymer_len, and min_allelic_freq. +#' for pileup_cells(): `max_mismatch_type`, `homopolymer_len`, and `min_allelic_freq`. #' @param umi_tag tag in BAM containing the UMI sequence #' @param cb_tag tag in BAM containing the cell-barcode sequence -#' @param paired_end set to TRUE if data is paired-end to prevent double counting -#' overlapping read pairs. #' @param return_sce if `TRUE`, data is returned as a SingleCellExperiment, if #' `FALSE` a character vector of the output files, specified by #' `outfile_prefix`, will be returned. @@ -78,16 +76,19 @@ #' sce <- pileup_cells(bam_fn, gr, cbs, outdir, param = fp) #' sce #' -#' # example of processing multiple smart-seq2 style libraries +#' # example of processing multiple Smart-seq2 style libraries #' #' many_small_bams <- rep(bam_fn, 10) #' bam_ids <- LETTERS[1:10] +#' +#' fp <- FilterParam(library_type = "unstranded", +#' remove_overlaps = TRUE) +#' #' pileup_cells(many_small_bams, #' sites = gr, #' cell_barcodes = bam_ids, #' cb_tag = NULL, #' umi_tag = NULL, -#' paired_end = TRUE, #' outdir, #' param = fp #' ) @@ -107,7 +108,6 @@ pileup_cells <- function(bamfiles, chroms = NULL, umi_tag = "UB", cb_tag = "CB", - paired_end = FALSE, param = FilterParam(), BPPARAM = SerialParam(), return_sce = TRUE, @@ -189,7 +189,6 @@ pileup_cells <- function(bamfiles, cb_tag = cb_tag, umi_tag = umi_tag, param = param, - pe = paired_end, verbose = verbose ), BPPARAM = BPPARAM, @@ -211,7 +210,6 @@ pileup_cells <- function(bamfiles, cb_tag = cb_tag, umi_tag = umi_tag, param = param, - pe = paired_end, verbose = verbose ), BPPARAM = BPPARAM, @@ -261,7 +259,7 @@ defaultScBamFlags <- Rsamtools::scanBamFlag( get_sc_pileup <- function(bamfn, index, id, sites, barcodes, outfile_prefix, chrom, umi_tag, cb_tag, param, - pe, verbose) { + verbose) { if (length(chrom) > 0) { sites <- sites[seqnames(sites) %in% chrom, ] } @@ -293,7 +291,7 @@ get_sc_pileup <- function(bamfn, index, id, sites, barcodes, fp[["library_type"]], plp_outfns, umi_tag, - pe, + fp$lgl_args[["remove_overlaps"]], fp[["min_mapq"]], max(fp$int_args["min_variant_reads"], fp$int_args["min_depth"]) ) diff --git a/man/annot_from_gr.Rd b/man/annot_from_gr.Rd index 6b33ef8..e8a0002 100644 --- a/man/annot_from_gr.Rd +++ b/man/annot_from_gr.Rd @@ -4,19 +4,21 @@ \alias{annot_from_gr} \title{Annotate sites using GRanges object} \usage{ -annot_from_gr(obj, gr, cols_to_map, RLE = TRUE, ...) +annot_from_gr(obj, gr, cols_to_map, RLE = TRUE, sep = ",", ...) } \arguments{ \item{obj}{RangedSummarizedExperiment or GRanges object} \item{gr}{GRanges with annotations to map to obj} -\item{cols_to_map}{character vector of columns from gr to map to obj. If the -vector has names, the names will be the column names in the output obj} +\item{cols_to_map}{character vector of columns from GRanges to map to SummarizedExperiment. +If the vector has names, the names will be the column names in the output.} \item{RLE}{If TRUE, columns added will returned as \code{\link[S4Vectors:Rle-class]{S4Vectors::Rle()}} vectors to reduce memory} +\item{sep}{separator string, defaults to comma.} + \item{...}{additional arguments to pass to \code{\link[GenomicRanges:findOverlaps-methods]{GenomicRanges::findOverlaps()}}} } \value{ @@ -26,7 +28,7 @@ annotations provided by the supplied GRanges object. \description{ Utility function to map annotations from GRanges to rowData of SummarizedExperiment or to mcols of GRanges object. If multiple features overlap then -they will be concatenated as comma separated values. +they will be concatenated with the specified separtor string . } \examples{ library(SummarizedExperiment) diff --git a/man/annot_snps.Rd b/man/annot_snps.Rd index 758580d..8ff6159 100644 --- a/man/annot_snps.Rd +++ b/man/annot_snps.Rd @@ -48,7 +48,7 @@ comparing against the SNP db (which always returns sequences w.r.t the plus strand).} \item{RLE}{If TRUE, columns added will returned as \code{\link[S4Vectors:Rle-class]{S4Vectors::Rle()}} vectors -to reduce memory} +to reduce memory usage.} } \value{ Either a GRanges or SummarizedExperiment object with diff --git a/man/calc_confidence.Rd b/man/calc_confidence.Rd index 91ae960..435c599 100644 --- a/man/calc_confidence.Rd +++ b/man/calc_confidence.Rd @@ -9,7 +9,9 @@ calc_confidence( edit_to = "G", edit_from = "A", per_sample = FALSE, - exp_fraction = 0.01 + exp_fraction = 0.01, + alpha = 0L, + beta = 0L ) } \arguments{ @@ -22,7 +24,12 @@ calc_confidence( \item{per_sample}{if TRUE, calculate confidence per sample, otherwise edited and non-edited counts will be summed across all samples.} -\item{exp_fraction}{Numeric, confidence margin parameter for} +\item{exp_fraction}{Numeric value between 0 and 1, specifying the expected error +rate} + +\item{alpha}{Pseudo-count to add to non-edited base counts} + +\item{beta}{Pseudo-count to add to edited base counts} } \value{ \code{SummarizedExperiment::SummarizedExperiment} with either a new assay diff --git a/man/calc_scAEI.Rd b/man/calc_scAEI.Rd index 7abd8bc..128e544 100644 --- a/man/calc_scAEI.Rd +++ b/man/calc_scAEI.Rd @@ -70,8 +70,9 @@ editing in a sample. The index is a weighted average of editing events (G bases) observed at A positions. The vast majority A-to-I editing occurs in ALU elements in the human genome, and these regions have a high A-to-I editing signal compared to other regions such as coding exons. This -function will examine potential editing sites and return -summary AEI metric per cell. +function will examine enumerate edited and non-edited base counts at the supplied +sites and return summary AEI metric per cell. Potential editing sites within +repeat regions can be generated using \code{get_scAEI_sites()}. } \examples{ suppressPackageStartupMessages(library(Rsamtools)) diff --git a/man/filter_clustered_variants.Rd b/man/filter_clustered_variants.Rd index 7ab9e9f..8dc09df 100644 --- a/man/filter_clustered_variants.Rd +++ b/man/filter_clustered_variants.Rd @@ -27,10 +27,10 @@ variants} object dependent on filtering applied. } \description{ -Sequence variants of multiple allele types (e.g., \code{A>G}, \code{A>C}) -in nearby regions can be due to mis-alignment. Remove variants if multiple -allele types are present within a given distance in genomic or -transcriptome coordinate space. +Sequence variants of multiple allele types (e.g., \code{A -> G}, \code{A -> C}) +proximal to a putative editing site can be indicative of a region prone to mis-alignment +artifacts. Sites will be removed if variants of multiple allele types are present +within a given distance in genomic or transcriptome coordinate space. } \examples{ library(GenomicFeatures) diff --git a/man/find_de_sites.Rd b/man/find_de_sites.Rd index 1f2cdac..66a51e6 100644 --- a/man/find_de_sites.Rd +++ b/man/find_de_sites.Rd @@ -45,11 +45,8 @@ A named list: } \description{ Use \code{edgeR} or \code{DESeq2} to perform differential editing -analysis. This will work for simple designs that have 1 treatment and 1 -control. For more complex designs, we suggest you perform your own. - -At the moment, this function will only find editing events specific to the -treatment. +analysis. This will work for designs that have 1 treatment and 1 +control group. For more complex designs, we suggest you perform your own modeling. } \examples{ library(SummarizedExperiment) diff --git a/man/get_splice_sites.Rd b/man/get_splice_sites.Rd index e02ca6e..0fe8a40 100644 --- a/man/get_splice_sites.Rd +++ b/man/get_splice_sites.Rd @@ -17,7 +17,7 @@ extract} flanking bases. } \description{ -Find intervals containing splice sites and their adjacent +Extract intervals at splice sites and their adjacent regions. } \examples{ diff --git a/man/pileup_cells.Rd b/man/pileup_cells.Rd index 57d5da6..3fc1b21 100644 --- a/man/pileup_cells.Rd +++ b/man/pileup_cells.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/sc-pileup.R \name{pileup_cells} \alias{pileup_cells} -\title{Pileup sites per cell} +\title{Generate base counts per cell} \usage{ pileup_cells( bamfiles, @@ -12,7 +12,6 @@ pileup_cells( chroms = NULL, umi_tag = "UB", cb_tag = "CB", - paired_end = FALSE, param = FilterParam(), BPPARAM = SerialParam(), return_sce = TRUE, @@ -20,15 +19,15 @@ pileup_cells( ) } \arguments{ -\item{bamfiles}{a path to a BAM file (for 10x libraries), or a vector of paths -to BAM files (smart-seq2). Can be supplied as a character vector, \link{BamFile}, or +\item{bamfiles}{a path to a BAM file (for droplet scRNA-seq), or a vector of paths +to BAM files (Smart-seq2). Can be supplied as a character vector, \link{BamFile}, or \link{BamFileList}.} \item{sites}{a GRanges object containing sites to process. See examples for valid formatting.} \item{cell_barcodes}{A character vector of single cell barcodes to process. If -processing multiple BAM files (e.g. smart-seq-2), provide a character vector +processing multiple BAM files (e.g. Smart-seq2), provide a character vector of unique identifiers for each input BAM, to name each BAM file in the output files.} \item{output_directory}{Output directory for output matrix files. The directory @@ -41,9 +40,6 @@ sites present in the listed chromosomes will be processed} \item{cb_tag}{tag in BAM containing the cell-barcode sequence} -\item{paired_end}{set to TRUE if data is paired-end to prevent double counting -overlapping read pairs.} - \item{param}{object of class \code{\link[=FilterParam]{FilterParam()}} which specify various filters to apply to reads and sites during pileup. Note that the \code{min_depth} and \code{min_variant_reads} parameters if set > 0 specify the number of reads @@ -51,7 +47,7 @@ from any cell required in order to report a site. E.g. if \code{min_variant_read set to 2, then at least 2 reads (from any cell) must have a variant in order to report the site. Setting \code{min_depth} and \code{min_variant_reads} to 0 reports all sites present in the \code{sites} object. The following options are not enabled -for pileup_cells(): max_mismatch_type, homopolymer_len, and min_allelic_freq.} +for pileup_cells(): \code{max_mismatch_type}, \code{homopolymer_len}, and \code{min_allelic_freq}.} \item{BPPARAM}{BiocParallel instance. Parallel computation occurs across chromosomes.} @@ -77,16 +73,16 @@ sparseMatrix files (\code{barcodes.txt.gz}, \code{sites.txt.gz}, \code{counts.mt will be returned. These files can be imported using \code{\link[=read_sparray]{read_sparray()}}. } \description{ -This function performs a pileup operation at specified -sites, returning counts for Reference (e.g. unedited) or Alternate (e.g. -edited) bases. \code{pileup_cells} can process either a 10x Genomic's style -library, from a aligned BAM file containing a tag with a cell-barcode and a -tag with a UMI, or smart-seq style libraries without cell-barcodes. +This function processes scRNA-seq library to enumerate base counts +for Reference (unedited) or Alternate ( +edited) bases at specified sites in single cells. \code{pileup_cells} can process +droplet scRNA-seq libraries, from a BAM file containing a cell-barcode and UMI, +or well-based libraries that do not contain cell-barcodes. -The \code{sites} parameter specifies sites to pileup. This must be a \link{GRanges} +The \code{sites} parameter specifies sites to quantify. This must be a \link{GRanges} object with 1 base intervals, a strand (+ or -), and supplemented with metadata columns named \code{REF} and \code{ALT} containing the reference and -alternate base to query. See examples for an example format. +alternate base to query. See examples for the required format. At each site, bases from overlapping reads will be examined, and counts of each ref and alt base enumerated for each cell-barcode present. A single @@ -111,16 +107,19 @@ fp <- FilterParam(library_type = "fr-second-strand") sce <- pileup_cells(bam_fn, gr, cbs, outdir, param = fp) sce -# example of processing multiple smart-seq2 style libraries +# example of processing multiple Smart-seq2 style libraries many_small_bams <- rep(bam_fn, 10) bam_ids <- LETTERS[1:10] + +fp <- FilterParam(library_type = "unstranded", + remove_overlaps = TRUE) + pileup_cells(many_small_bams, sites = gr, cell_barcodes = bam_ids, cb_tag = NULL, umi_tag = NULL, - paired_end = TRUE, outdir, param = fp ) diff --git a/man/pileup_sites.Rd b/man/pileup_sites.Rd index f68aa1c..1b139c9 100644 --- a/man/pileup_sites.Rd +++ b/man/pileup_sites.Rd @@ -47,8 +47,8 @@ more BAM files to process. If named, the names will be included in the \link{col of the \link{RangedSummarizedExperiment} as a \code{sample} column, otherwise the names will be taken from the basename of the BAM file.} -\item{fasta}{path to genome fasta file used for read alignment. Can be in -gzip or bgzip format.} +\item{fasta}{path to genome fasta file used for read alignment. Can be provided in +compressed gzip or bgzip format.} \item{sites}{a \link{GRanges} object containing regions or sites to process.} @@ -56,7 +56,8 @@ gzip or bgzip format.} with sites, in which case sites will be filtered to keep only sites within the region.} -\item{chroms}{chromosomes to process, not to be used with region.} +\item{chroms}{chromosomes to process, provided as a character vector. Not to be used with the +region parameter.} \item{param}{object of class \code{\link[=FilterParam]{FilterParam()}} which specify various filters to apply to reads and sites during pileup.} @@ -65,7 +66,7 @@ filters to apply to reads and sites during pileup.} processing occurs per chromosome and is disabled when run on a single region.} -\item{umi_tag}{The bam tag containing a UMI sequence. If supplied, multiple +\item{umi_tag}{The BAM tag containing a UMI sequence. If supplied, multiple reads with the same UMI sequence will only be counted once per position.} \item{verbose}{if TRUE, then report progress and warnings.} @@ -150,7 +151,7 @@ The \code{\link[=rowRanges]{rowRanges()}} contains the genomic interval for each \item \code{REF}: The reference base \item \code{rpbz}: Mann-Whitney U test of Read Position Bias from bcftools, extreme negative or positive values indicate more bias. -\item \code{vdb}: Variant Distance Bias for filtering splice-site artefacts from +\item \code{vdb}: Variant Distance Bias for filtering splice-site artifacts from bcftools, lower values indicate more bias. \item \code{sor} Strand Odds Ratio Score, strand bias estimated by the Symmetric Odds Ratio test, based on GATK code. Higher values indicate more bias. @@ -161,7 +162,11 @@ The rownames will be populated with the format as 1 = +, 2 = -, and 3 = *. } \description{ -Generate base counts using pileup +This function uses a pileup routine to examine numerate base counts from +alignments at specified sites, regions, or across all read alignments, from one or +more BAM files. Alignment and site filtering +options are controlled by the \code{FilterParam} class.A \link{RangedSummarizedExperiment} object +is returned, populated with base count statistics for each supplied BAM file. } \examples{ library(SummarizedExperiment) diff --git a/vignettes/raer.Rmd b/vignettes/raer.Rmd index 26f4d87..68691b6 100644 --- a/vignettes/raer.Rmd +++ b/vignettes/raer.Rmd @@ -26,29 +26,23 @@ knitr::opts_chunk$set(message = FALSE, warning = FALSE) The *raer* (RNA Adenosine editing in R) package provides tools to characterize A-to-I editing in single cell and bulk RNA-sequencing datasets. Both novel and known editing sites can be detected and quantified beginning with BAM alignment files. At it's core the *raer* package uses the pileup routines from the HTSlib C library (@Bonfield2021-yo) to identify candidate RNA editing sites, and leverages the annotation resources in the Bioconductor ecosystem to further characterize and identify high-confidence RNA editing sites. -Here we demonstrate how to use the *raer* package to a) quantify known RNA editing in droplet scRNA-seq dataset, b) identify editing sites with condition specific editing in bulk RNA-seq data, and c) predict novel editing sites from bulk RNA-seq. +Here we demonstrate how to use the *raer* package to a) quantify RNA editing sites in droplet scRNA-seq dataset, b) identify editing sites with condition specific editing in bulk RNA-seq data, and c) predict novel editing sites from bulk RNA-seq. # Characterizing RNA editing sites in scRNA-seq data -Here we will use the *raer* package to examine RNA editing in droplet-based single cell RNA-seq data. For this example we will examine a scRNA-seq dataset from human PBMC cells provided by [10x Genomics](https://www.10xgenomics.com/resources/datasets/10k-human-pbmcs-3-v3-1-chromium-x-with-intronic-reads-3-1-high). The single cell data was aligned and processed using the 10x Genomics cellranger pipeline. +Here we will use the *raer* package to examine RNA editing in droplet-based single cell RNA-seq data. `pileup_cells()` enables quantification of edited and non-edited bases at specified sites from scRNA-seq data. + +For this example we will examine a scRNA-seq dataset from human PBMC cells provided by [10x Genomics](https://www.10xgenomics.com/resources/datasets/10k-human-pbmcs-3-v3-1-chromium-x-with-intronic-reads-3-1-high). The single cell data was aligned and processed using the 10x Genomics cellranger pipeline. The PBMC scRNA-seq dataset from 10x Genomics, along with other needed files will downloaded and cached using `pbmc_10x()`from the *raerdata* ExperimentHub package. For this vignette, the BAM file was subset to retain 2 million alignments that overlap human RNA editing sites on chromosome 16. +`pbmc_10x()` returns a list containing a `BamFile` object, a `GRanges` object with known RNA editing sites from the `REDIportal` database @Mansi2021-za, and a `SingleCellExperiment` populated with the gene expression data and cell type annotations. + ```{r message=FALSE} library(raer) library(raerdata) -library(scater) -library(SingleCellExperiment) -library(TxDb.Hsapiens.UCSC.hg38.knownGene) -library(AnnotationHub) -library(SNPlocs.Hsapiens.dbSNP144.GRCh38) -library(ComplexHeatmap) -``` -`pbmc_10x()` returns a list containing a `BamFile` object, a `GRanges` object with known RNA editing sites from the `REDIportal` database, and a `SingleCellExperiment` populated with the gene expression data and cell type annotations. - -```{r} pbmc <- pbmc_10x() pbmc_bam <- pbmc$bam @@ -56,14 +50,17 @@ editing_sites <- pbmc$sites sce <- pbmc$sce ``` +This dataset contains T-cell, B-cells, and monocyte cell populations. ```{r} +library(scater) +library(SingleCellExperiment) plotUMAP(sce, colour_by = "celltype") ``` ## Specifying sites to quantify -Next we'll select editing sites to query. For this analysis we will use RNA editing sites cataloged in the REDIportal database @Mansi2021-za. +Next we'll select editing sites to quantify. For this analysis we will use RNA editing sites cataloged in the REDIportal database @Mansi2021-za. ```{r} editing_sites @@ -80,9 +77,9 @@ editing_sites ## Quantifying sites in single cells using *pileup_cells* -`pileup_cells()` quantifies edited and non-edited UMI counts per cell barcode, then organizes the site counts into a `SingleCellExperiment` object. `pileup_cells()` accepts a `FilterParam()` object that specifies parameters for multiple read-level and site-level filtering and processing options. Note that `pileup_cells()` is strand sensitive by default, so it is important to ensure that the strand of the input sites is correctly annotated, and that the `library-type` is set correctly for the strandedness of the sequencing library. For 10x Genomics data, the library type is set to `fr-second-strand`, indicating that the strand of the BAM alignments is the same strand as the RNA. See [quantifying smart-seq scRNA-seq libraries](#ss2) for an example of using *pileup_cells()* to handle unstranded data and data from libraries that produce 1 BAM file for each cell. +`pileup_cells()` quantifies edited and non-edited UMI counts per cell barcode, then organizes the site counts into a `SingleCellExperiment` object. `pileup_cells()` accepts a `FilterParam()` object that specifies parameters for multiple read-level and site-level filtering and processing options. Note that `pileup_cells()` is strand sensitive by default, so it is important to ensure that the strand of the input sites is correctly annotated, and that the `library-type` is set correctly for the strandedness of the sequencing library. For 10x Genomics data, the library type is set to `fr-second-strand`, indicating that the strand of the BAM alignments is the same strand as the RNA. See [quantifying Smart-seq2 scRNA-seq libraries](#ss2) for an example of using *pileup_cells()* to handle unstranded data and data from libraries that produce 1 BAM file for each cell. -To exclude duplicate reads derived from PCR, `pileup_cells()` can use a UMI sequence, supplied via the `umi_tag` argument, to only count 1 read for each CB-UMI pair at each editing site position. Note however that by default the `bam_flags` argument for the `FilterParam` class is set to **include** duplicate reads when using `pileup_cells()`. Droplet single cell libraries produce multiple cDNA fragments from a single reverse transcription event. The cDNA fragments have different alignment positions due to fragmentation despite being derived from a single RNA molecule. scRNA-seq data processed by cellranger from 10x Genomics will set the "Not primary alignment" bam flag for every read except one read for each UMI. If duplicates are removed based on this bam flag, then only 1 representative fragment for a single UMI will be examined, which will exclude many valid regions. +To exclude duplicate reads derived from PCR, `pileup_cells()` can use a UMI sequence, supplied via the `umi_tag` argument, to only count 1 read for each CB-UMI pair at each editing site position. Note however that by default the `bam_flags` argument for the `FilterParam` class is set to **include** duplicate reads when using `pileup_cells()`. Droplet single cell libraries produce multiple cDNA fragments from a single reverse transcription event. The cDNA fragments have different alignment positions due to fragmentation despite being derived from a single RNA molecule. scRNA-seq data processed by cellranger from 10x Genomics will set the "Not primary alignment" BAM flag for every read except one read for each UMI. If duplicates are removed based on this BAM flag, then only 1 representative fragment for a single UMI will be examined, which will exclude many valid regions. To reduce processing time many functions in the *raer* package operate in parallel across multiple chromosomes. To enable parallel processing, a `BiocParallel` backend can be supplied via the `BPPARAM` argument (e.g. `MultiCoreParam()`). @@ -156,11 +153,11 @@ plotGroupedHeatmap(altExp(sce), - `calc_scAEI()` will calculate the Alu Editing Index (`AEI`) metric in single cells. -**If the editing sites of interest are not known, we recommend the following approach. First, treat the single cell data as a bulk RNA-seq experiment, and follow the [bulk RNA-seq workflow](#bulk) to identify putative editing sites. Then query these sites in single cell mode using `pileup_cells()`** +**If the editing sites of interest are not known, we recommend the following approach. First, treat the single cell data as a bulk RNA-seq experiment, and follow approaches described in the [Novel editing site detection](#novel) to identify putative editing sites. Then query these sites in single cell mode using `pileup_cells()`** -## Quantifying sites in Smart-seq libaries {#ss2} +## Quantifying sites in Smart-seq2 libaries {#ss2} -`pileup_cells()` can also process Smart-seq style single cell libraries. These datasets typically store data from each cell in separate BAM files and the library type for these alignments are generally unstranded. To process these datasets the `library-type` should be set to `unstranded`, and the reference editing sites need to be reported all on the `+` strand. +`pileup_cells()` can also process Smart-seq2 single cell libraries. These datasets typically store data from each cell in separate BAM files and the library type for these alignments are generally unstranded. To process these datasets the `library-type` should be set to `unstranded`, and the reference editing sites need to be reported all on the `+` strand. For example, the editing sites on the minus strand will need to be complemented (set as T -> C rather than A -> G). Additionally the `umi_tag` and `cb_tag` arguments should be set as follows to disable UMI and cell barcode detection. @@ -177,14 +174,16 @@ vector_of_cell_ids <- c("cell1", "cell2", "cell3") pileup_cells(bamfiles = vector_of_bam_files, cell_barcodes = vector_of_cell_ids, sites = editing_sites, - umi_tag = NULL, # no UMI tag in most smart-seq libraries - cb_tag = NULL) # no cell barcode in most smart-seq libraries + umi_tag = NULL, # no UMI tag in most Smart-seq2 libraries + cb_tag = NULL) # no cell barcode in most Smart-seq2 libraries ``` # Quantifying RNA editing sites in bulk RNA-Seq {#bulk} -Next we will perform a reanalysis of a published bulk RNA-seq dataset, [GSE99249](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE99249), which consists of ADAR1 mutants and control human cell lines, conditionally treated with Interferon-Beta. We will examine data from two genotypes, ADAR1 WT and KO, both treated with Ifn-B, with triplicate samples. +Next we will perform a reanalysis of a published bulk RNA-seq dataset using the *raer* package. The `pileup_sites()` function enable quantification of base counts from bulk RNA-seq data and can be used to identify novel sites (see [Novel editing site detection](#novel)). + +For this reanalysis, we will examine a bulk RNA-seq dataset from accession [GSE99249](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE99249), which consists of RNA-seq data from ADAR1 mutants and control human cell lines, conditionally treated with Interferon-Beta. We will examine data from two genotypes, ADAR1 WT and KO, both treated with Interferon-B, with triplicate samples. Aligned BAM files and other necessary files have been preprocessed for this vignette and are available using `GSE99249()` from the `raerdata` package. Calling `GSE99249()` will downloaded and cache the necessary files and return a list containing the data. @@ -310,6 +309,7 @@ de_results$sig_results[1:5, ] ``` ```{r, fig.height=7, fig.width=5} +library(ComplexHeatmap) top_sites <- rownames(de_results$sig_results)[1:20] Heatmap(assay(rse, "edit_freq")[top_sites, ], @@ -333,6 +333,9 @@ First we will use the `AnnotationHub` package to obtain coordinates for ALU elem ```{r} +library(AnnotationHub) +library(SNPlocs.Hsapiens.dbSNP144.GRCh38) + ah <- AnnotationHub() rmsk_hg38 <- ah[["AH99003"]] @@ -375,11 +378,13 @@ Heatmap(alu_index$AEI, The AEI in the `Wildtype` samples is highest for `A-to-G`, and sharply reduced in the `ADAR1KO` samples as expected. -# Novel RNA editing site detection +# Novel RNA editing site detection {#novel} + +Next we will demonstrate how to identify novel RNA editing sites using the *raer* package. It is best practice to have a matched DNA sequencing dataset to exclude sample specific genetic variation and common alignment artifacts. However, high confidence editing sites can also be identified if the dataset contains many samples and there are high coverage SNP databases for the organism queried. Additionally high confidence editing sites can also be identified if a dataset contains a sample with reduced or absent ADAR activity. A false-positive rate estimate can be obtained by examining the proportion of A->I editing sites recovered, relative to other variants, (e.g. G->C, C->A). -Next we will demonstrate how to identify novel RNA editing sites using the *raer* package. In this analysis a published RNA-seq and whole genome sequencing dataset will be analyzed. High coverage whole-genome sequencing was conducted [ERR262997](https://www.ebi.ac.uk/ena/browser/view/ERR262997?show=reads) along with paired-end RNA-seq [SRR1258218](https://www.ebi.ac.uk/ena/browser/view/SRR1258218?show=reads) in a human cell line (`NA12878`). +In this analysis a published RNA-seq and whole genome sequencing dataset will be analyzed. High coverage whole-genome sequencing was conducted [ERR262997](https://www.ebi.ac.uk/ena/browser/view/ERR262997?show=reads) along with paired-end RNA-seq [SRR1258218](https://www.ebi.ac.uk/ena/browser/view/SRR1258218?show=reads) in a human cell line (`NA12878`). -Aligned BAM files, a genome fasta file, and a GRanges object containing SNPs corresponding to the first 1Mb region of chr4 have been prepared for this vignette and can be downloaded and cached using `NA12878()`. +Aligned BAM files, a genome FASTA file, and a GRanges object containing SNPs corresponding to the first 1Mb region of chr4 have been prepared for this vignette and can be downloaded and cached using `NA12878()`. ```{r} rna_wgs <- NA12878() @@ -388,18 +393,20 @@ names(rna_wgs) Additionally we will use the following additional annotation resources: - - A database of known SNPs, for example the `SNPlocs.Hsapiens.dbSNP155.GRCh38` package. Due to space and memory constraints in this vignette we will only examine SNPs from the first 1Mb region of chr4. + - A database of known SNPs, for example from the `SNPlocs.Hsapiens.dbSNP144.GRCh38` package. Due to space and memory constraints in this vignette we will only examine SNPs from the first 1Mb region of chr4. - `TxDb.Hsapiens.UCSC.hg38.knownGene`, a database of transcript models. Alternatively these can be generated from a `.gtf` file using `makeTxDbFromGRanges()` from the `GenomicFeatures` package. - RepeatMasker annotations, which can be obtained from the `AnnotationHub()` for hg38, as shown in the [bulk RNA-seq](#bulk) tutorial. ```{r} +library(TxDb.Hsapiens.UCSC.hg38.knownGene) + txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene chr4snps <- rna_wgs$snps ``` -The `pileup_sites()` function accept multiple bam files, here we supply one from RNA-seq, and one from whole genome sequencing. A subset of the filtering parameters (`FilterParam()`) can accept multiple arguments matched to each of the input bam files. This allows us to have distinct settings for the WGS and RNA-seq BAM files. +The `pileup_sites()` function accept multiple BAM files, here we supply one from RNA-seq, and one from whole genome sequencing. A subset of the filtering parameters (`FilterParam()`) can accept multiple arguments matched to each of the input BAM files. This allows us to have distinct settings for the WGS and RNA-seq BAM files. ```{r} bams <- rna_wgs$bams @@ -407,14 +414,14 @@ names(bams) <- c("rna", "dna") fp <- FilterParam( min_depth = 1, # minimum read depth across all samples min_base_quality = 30, # minimum base quality - min_mapq = c(255, 30), # minimum MAPQ for each bam file - library_type = c("fr-first-strand", "unstranded"), # library-type for each bam file + min_mapq = c(255, 30), # minimum MAPQ for each BAM file + library_type = c("fr-first-strand", "unstranded"), # library-type for each BAM file trim_5p = 5, # bases to trim from 5' end of alignment trim_3p = 5, # bases to trim from 3' end of alignment indel_dist = 4, # ignore read if contains an indel within distance from site min_splice_overhang = 10, # required alignment overhang in order to count read with splice read_bqual = c(0.25, 20), # minimum fraction of the read (0.25) that must have base quality of (20) - only_keep_variants = c(TRUE, FALSE), # report site if rnaseq bam has variant + only_keep_variants = c(TRUE, FALSE), # report site if rnaseq BAM has variant report_multiallelic = FALSE, # do not report sites with multiple variant alleles ) @@ -502,7 +509,7 @@ With the above filtering approach we obtain a set of putative editing sites. The rowRanges(rse) ``` -Finally once a set of sites has been identified, additional packges in the bioconductor ecosystem, such the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) package, can be used to determine the genomic context and potential molecular consequences of the editing event. +Finally once a set of sites has been identified, additional packages in the bioconductor ecosystem, such the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) package, can be used to determine the genomic context and potential molecular consequences of the editing event.