diff --git a/NEWS.md b/NEWS.md index a00e65c..795e813 100644 --- a/NEWS.md +++ b/NEWS.md @@ -21,6 +21,9 @@ NEW FEATURES - Accelerated the computation of quantile position by ~20 times. - New `resetCAGEexp()` function. - New `flagByUpstreamSequences()` function. +- The `annotateCTSS` and `annotateConsensusClusters` functions gain an + `upstream` and a `downstream` parameter to change the width of promoter + regions. # Changes in version 2.6.0 diff --git a/R/Annotations.R b/R/Annotations.R index b13b4e4..0dfce0a 100644 --- a/R/Annotations.R +++ b/R/Annotations.R @@ -374,17 +374,21 @@ msScope_annotation <- function(libs) { } -#' @name annotateCTSS +#' Annotate and compute summary statistics #' -#' @title Annotate and compute summary statistics -#' -#' @description `annotateCTSS` annotates the _CTSS_ of a [`CAGEexp`] object and -#' computes annotation statistics. +#' `annotateCTSS` annotates the _CTSS_ of a [`CAGEexp`] object and computes +#' annotation statistics. #' #' @param object `CAGEexp` object. #' #' @param ranges A [`GRanges`] object, optionally containing `gene_name`, #' `type` and `transcript_type` metadata. +#' +#' @param upstream Number of bases _upstream_ the start of the transcript models +#' to be considered as part of the _promoter region_. +#' +#' @param downstream Number of bases _downstream_ the start of the transcript +#' models to be considered as part of the _promoter region_. 
#' #' @return `annotateCTSS` returns the input object with the following #' modifications: @@ -411,13 +415,14 @@ msScope_annotation <- function(libs) { #' #' @export -setGeneric("annotateCTSS", function(object, ranges) standardGeneric("annotateCTSS")) +setGeneric("annotateCTSS", function(object, ranges, upstream=500, downstream=500) + standardGeneric("annotateCTSS")) #' @rdname annotateCTSS -setMethod("annotateCTSS", c("CAGEexp", "GRanges"), function (object, ranges){ +setMethod("annotateCTSS", c("CAGEexp", "GRanges"), function (object, ranges, upstream=500, downstream=500){ CTSScoordinatesGR(object)$genes <- ranges2genes(CTSScoordinatesGR(object), ranges) - CTSScoordinatesGR(object)$annotation <- ranges2annot(CTSScoordinatesGR(object), ranges) + CTSScoordinatesGR(object)$annotation <- ranges2annot(CTSScoordinatesGR(object), ranges, upstream, downstream) annot <- sapply( CTSStagCountDF(object) , function(X) tapply(X, CTSScoordinatesGR(object)$annotation, sum)) @@ -443,14 +448,15 @@ setMethod("annotateCTSS", c("CAGEexp", "GRanges"), function (object, ranges){ #' #' @export -setGeneric("annotateConsensusClusters", function(object, ranges) standardGeneric("annotateConsensusClusters")) +setGeneric("annotateConsensusClusters", function(object, ranges, upstream=500, downstream=500) + standardGeneric("annotateConsensusClusters")) #' @rdname annotateCTSS -setMethod("annotateConsensusClusters", c("CAGEexp", "GRanges"), function (object, ranges){ +setMethod("annotateConsensusClusters", c("CAGEexp", "GRanges"), function (object, ranges, upstream=500, downstream=500){ if(is.null(experiments(object)$tagCountMatrix)) stop("Input does not contain CTSS expressiond data, see ", dQuote("getCTSS()"), ".") - consensusClustersGR(object)$annotation <- ranges2annot(consensusClustersGR(object), ranges) + consensusClustersGR(object)$annotation <- ranges2annot(consensusClustersGR(object), ranges, upstream, downstream) if(!is.null(ranges$gene_name)) consensusClustersGR(object)$genes <- 
ranges2genes(consensusClustersGR(object), ranges) validObject(object) @@ -458,24 +464,28 @@ setMethod("annotateConsensusClusters", c("CAGEexp", "GRanges"), function (object }) -#' @name ranges2annot +#' Hierarchical annotation of genomic regions. #' -#' @title Hierarchical annotation of CTSSes +#' Assigns region types such as `promoter`, `exon` or `unknown` to genomic +#' regions such as _CTSS_, _tag clusters_, or _consensus clusters_. #' -#' @description Assigns region types such as `promoter`, `exon` or `unknown` -#' to CTSSes. -#' -#' @param ranges A [`CTSS`] object, for example extracted from a -#' `RangedSummarizedExperiment` object with the [`rowRanges`] +#' @param ranges A [`GenomicRanges::GRanges`] object, for example extracted from +#' a `RangedSummarizedExperiment` object with the [`rowRanges`] #' command. #' -#' @param annot A [`GRanges`] from which promoter positions will be inferred. +#' @param annot A `GRanges` from which promoter positions will be inferred. #' Typically GENCODE. If the `type` metadata is present, it should #' contain `gene`, `exon` and `transcript` among its values. Otherwise, #' all entries are considered transcripts. If the `transcript_type` #' metadata is available, the entries that may not be primary products #' (for instance \sQuote{snoRNA}) are discarded. #' +#' @param upstream Number of bases _upstream_ the start of the transcript models +#' to be considered as part of the _promoter region_. +#' +#' @param downstream Number of bases _downstream_ the start of the transcript +#' models to be considered as part of the _promoter region_. 
+#' #' @return A Run-length-encoded ([`Rle`]) factor of same length as the `CTSS` #' object, indicating if the interval is `promoter`, `exon`, `intron` or #' `unknown`, or just `promoter`, `gene`, `unknown` if the `type` @@ -507,12 +517,12 @@ setMethod("annotateConsensusClusters", c("CAGEexp", "GRanges"), function (object #' gr2 <- gr1 #' gr2$type <- c("transcript", "exon", "transcript") #' gr2$transcript_type <- c("protein_coding", "protein_coding", "miRNA") -#' CAGEr:::ranges2annot(ctss, gr2) +#' CAGEr:::ranges2annot(ctss, gr2, up=500, down=20) #' #' @importFrom GenomicRanges findOverlaps promoters #' @importFrom S4Vectors Rle -ranges2annot <- function(ranges, annot) { +ranges2annot <- function(ranges, annot, upstream=500, downstream=500) { typesWithPromoter <- c( "protein_coding", "processed_transcript", "lincRNA" , "antisense", "processed_pseudogene" , "unprocessed_pseudogene") @@ -527,7 +537,7 @@ ranges2annot <- function(ranges, annot) { if(!is.null(annot$type)) { classes <- c("promoter", "exon", "intron", "unknown") - p <- findOverlapsBool(ranges, promoters(annot[annot$type == "transcript"], 500, 500)) + p <- findOverlapsBool(ranges, promoters(annot[annot$type == "transcript"], upstream, downstream)) e <- findOverlapsBool(ranges, annot[annot$type == "exon"]) t <- findOverlapsBool(ranges, annot[annot$type == "transcript"]) annot <- sapply( 1:length(ranges), function(i) { @@ -538,7 +548,7 @@ ranges2annot <- function(ranges, annot) { }) } else { classes <- c("promoter", "gene", "unknown") - p <- findOverlapsBool(ranges, promoters(annot, 500, 500)) + p <- findOverlapsBool(ranges, promoters(annot, upstream, downstream)) g <- findOverlapsBool(ranges, annot) annot <- sapply( 1:length(ranges), function(i) { if (p[i]) {classes[1]} diff --git a/man/annotateCTSS.Rd b/man/annotateCTSS.Rd index 7ae5d29..7f6771b 100644 --- a/man/annotateCTSS.Rd +++ b/man/annotateCTSS.Rd @@ -7,19 +7,25 @@ \alias{annotateConsensusClusters,CAGEexp,GRanges-method} \title{Annotate and compute 
summary statistics} \usage{ -annotateCTSS(object, ranges) +annotateCTSS(object, ranges, upstream = 500, downstream = 500) -\S4method{annotateCTSS}{CAGEexp,GRanges}(object, ranges) +\S4method{annotateCTSS}{CAGEexp,GRanges}(object, ranges, upstream = 500, downstream = 500) -annotateConsensusClusters(object, ranges) +annotateConsensusClusters(object, ranges, upstream = 500, downstream = 500) -\S4method{annotateConsensusClusters}{CAGEexp,GRanges}(object, ranges) +\S4method{annotateConsensusClusters}{CAGEexp,GRanges}(object, ranges, upstream = 500, downstream = 500) } \arguments{ \item{object}{\code{CAGEexp} object.} \item{ranges}{A \code{\link{GRanges}} object, optionally containing \code{gene_name}, \code{type} and \code{transcript_type} metadata.} + +\item{upstream}{Number of bases \emph{upstream} the start of the transcript models +to be considered as part of the \emph{promoter region}.} + +\item{downstream}{Number of bases \emph{downstream} the start of the transcript +models to be considered as part of the \emph{promoter region}.} } \value{ \code{annotateCTSS} returns the input object with the following @@ -40,8 +46,8 @@ detected. modifications as above. } \description{ -\code{annotateCTSS} annotates the \emph{CTSS} of a \code{\link{CAGEexp}} object and -computes annotation statistics. +\code{annotateCTSS} annotates the \emph{CTSS} of a \code{\link{CAGEexp}} object and computes +annotation statistics. \code{annotateConsensusClusters} annotates the \emph{consensus clusters} of a CAGEr object. 
diff --git a/man/ranges2annot.Rd b/man/ranges2annot.Rd index 279b971..180c0ed 100644 --- a/man/ranges2annot.Rd +++ b/man/ranges2annot.Rd @@ -2,21 +2,27 @@ % Please edit documentation in R/Annotations.R \name{ranges2annot} \alias{ranges2annot} -\title{Hierarchical annotation of CTSSes} +\title{Hierarchical annotation of genomic regions.} \usage{ -ranges2annot(ranges, annot) +ranges2annot(ranges, annot, upstream = 500, downstream = 500) } \arguments{ -\item{ranges}{A \code{\link{CTSS}} object, for example extracted from a -\code{RangedSummarizedExperiment} object with the \code{\link{rowRanges}} +\item{ranges}{A \code{\link[GenomicRanges:GRanges-class]{GenomicRanges::GRanges}} object, for example extracted from +a \code{RangedSummarizedExperiment} object with the \code{\link{rowRanges}} command.} -\item{annot}{A \code{\link{GRanges}} from which promoter positions will be inferred. +\item{annot}{A \code{GRanges} from which promoter positions will be inferred. Typically GENCODE. If the \code{type} metadata is present, it should contain \code{gene}, \code{exon} and \code{transcript} among its values. Otherwise, all entries are considered transcripts. If the \code{transcript_type} metadata is available, the entries that may not be primary products (for instance \sQuote{snoRNA}) are discarded.} + +\item{upstream}{Number of bases \emph{upstream} the start of the transcript models +to be considered as part of the \emph{promoter region}.} + +\item{downstream}{Number of bases \emph{downstream} the start of the transcript +models to be considered as part of the \emph{promoter region}.} } \value{ A Run-length-encoded (\code{\link{Rle}}) factor of same length as the \code{CTSS} @@ -25,8 +31,8 @@ object, indicating if the interval is \code{promoter}, \code{exon}, \code{intron metadata is absent. } \description{ -Assigns region types such as \code{promoter}, \code{exon} or \code{unknown} -to CTSSes. 
+Assigns region types such as \code{promoter}, \code{exon} or \code{unknown} to genomic +regions such as \emph{CTSS}, \emph{tag clusters}, or \emph{consensus clusters}. } \details{ Only the biotypes that are likely to have a pol II promoter will be @@ -49,7 +55,7 @@ CAGEr:::ranges2annot(ctss, gr1) gr2 <- gr1 gr2$type <- c("transcript", "exon", "transcript") gr2$transcript_type <- c("protein_coding", "protein_coding", "miRNA") -CAGEr:::ranges2annot(ctss, gr2) +CAGEr:::ranges2annot(ctss, gr2, up=500, down=20) } \seealso{