Skip to content

Commit

Permalink
Allow annotation functions to vary promoter width.
Browse files Browse the repository at this point in the history
Closes #98
  • Loading branch information
charles-plessy committed Jul 27, 2023
1 parent 1c7c400 commit 59b847a
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 37 deletions.
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ NEW FEATURES
- Accelerated the computation of quantile position by ~20 times.
- New `resetCAGEexp()` function.
- New `flagByUpstreamSequences()` function.
- The `annotateCTSS` and `annotateConsensusClusters` function gain a
`upstream` and a `downstream` parameter to change the width of promoter
regions.

# Changes in version 2.6.0

Expand Down
56 changes: 33 additions & 23 deletions R/Annotations.R
Original file line number Diff line number Diff line change
Expand Up @@ -374,17 +374,21 @@ msScope_annotation <- function(libs) {
}


#' @name annotateCTSS
#' Annotate and compute summary statistics
#'
#' @title Annotate and compute summary statistics
#'
#' @description `annotateCTSS` annotates the _CTSS_ of a [`CAGEexp`] object and
#' computes annotation statistics.
#' `annotateCTSS` annotates the _CTSS_ of a [`CAGEexp`] object and computes
#' annotation statistics.
#'
#' @param object `CAGEexp` object.
#'
#' @param ranges A [`GRanges`] object, optionally containing `gene_name`,
#' `type` and `transcript_type` metadata.
#'
#' @param upstream Number of bases _upstream_ the start of the transcript models
#' to be considered as part of the _promoter region_.
#'
#' @param downstream Number of bases _downstream_ the start of the transcript
#' models to be considered as part of the _promoter region_.
#'
#' @return `annotateCTSS` returns the input object with the following
#' modifications:
Expand All @@ -411,13 +415,14 @@ msScope_annotation <- function(libs) {
#'
#' @export

setGeneric("annotateCTSS", function(object, ranges) standardGeneric("annotateCTSS"))
setGeneric("annotateCTSS", function(object, ranges, upstream=500, downstream=500)
standardGeneric("annotateCTSS"))

#' @rdname annotateCTSS

setMethod("annotateCTSS", c("CAGEexp", "GRanges"), function (object, ranges){
setMethod("annotateCTSS", c("CAGEexp", "GRanges"), function (object, ranges, upstream=500, downstream=500){
CTSScoordinatesGR(object)$genes <- ranges2genes(CTSScoordinatesGR(object), ranges)
CTSScoordinatesGR(object)$annotation <- ranges2annot(CTSScoordinatesGR(object), ranges)
CTSScoordinatesGR(object)$annotation <- ranges2annot(CTSScoordinatesGR(object), ranges, upstream, downstream)

annot <- sapply( CTSStagCountDF(object)
, function(X) tapply(X, CTSScoordinatesGR(object)$annotation, sum))
Expand All @@ -443,39 +448,44 @@ setMethod("annotateCTSS", c("CAGEexp", "GRanges"), function (object, ranges){
#'
#' @export

setGeneric("annotateConsensusClusters", function(object, ranges) standardGeneric("annotateConsensusClusters"))
setGeneric("annotateConsensusClusters", function(object, ranges, upstream=500, downstream=500)
standardGeneric("annotateConsensusClusters"))

#' @rdname annotateCTSS

setMethod("annotateConsensusClusters", c("CAGEexp", "GRanges"), function (object, ranges){
setMethod("annotateConsensusClusters", c("CAGEexp", "GRanges"), function (object, ranges, upstream=500, downstream=500){
if(is.null(experiments(object)$tagCountMatrix))
stop("Input does not contain CTSS expressiond data, see ", dQuote("getCTSS()"), ".")
consensusClustersGR(object)$annotation <- ranges2annot(consensusClustersGR(object), ranges)
consensusClustersGR(object)$annotation <- ranges2annot(consensusClustersGR(object), ranges, upstream, downstream)
if(!is.null(ranges$gene_name))
consensusClustersGR(object)$genes <- ranges2genes(consensusClustersGR(object), ranges)
validObject(object)
object
})


#' @name ranges2annot
#' Hierarchical annotation of genomic regions.
#'
#' @title Hierarchical annotation of CTSSes
#' Assigns region types such as `promoter`, `exon` or `unknown` to genomic
#' regions such as _CTSS_, _tag clusters_, or _consensus clusters_.
#'
#' @description Assigns region types such as `promoter`, `exon` or `unknown`
#' to CTSSes.
#'
#' @param ranges A [`CTSS`] object, for example extracted from a
#' `RangedSummarizedExperiment` object with the [`rowRanges`]
#' @param ranges A [`GenomicRanges::GRanges`] object, for example extracted from
#' a `RangedSummarizedExperiment` object with the [`rowRanges`]
#' command.
#'
#' @param annot A [`GRanges`] from which promoter positions will be inferred.
#' @param annot A `GRanges` from which promoter positions will be inferred.
#' Typically GENCODE. If the `type` metadata is present, it should
#' contain `gene`, `exon` and `transcript` among its values. Otherwise,
#' all entries are considered transcripts. If the `transcript_type`
#' metadata is available, the entries that may not be primary products
#' (for instance \sQuote{snoRNA}) are discarded.
#'
#' @param upstream Number of bases _upstream_ the start of the transcript models
#' to be considered as part of the _promoter region_.
#'
#' @param downstream Number of bases _downstream_ the start of the transcript
#' models to be considered as part of the _promoter region_.
#'
#' @return A Run-length-encoded ([`Rle`]) factor of same length as the `CTSS`
#' object, indicating if the interval is `promoter`, `exon`, `intron` or
#' `unknown`, or just `promoter`, `gene`, `unknown` if the `type`
Expand Down Expand Up @@ -507,12 +517,12 @@ setMethod("annotateConsensusClusters", c("CAGEexp", "GRanges"), function (object
#' gr2 <- gr1
#' gr2$type <- c("transcript", "exon", "transcript")
#' gr2$transcript_type <- c("protein_coding", "protein_coding", "miRNA")
#' CAGEr:::ranges2annot(ctss, gr2)
#' CAGEr:::ranges2annot(ctss, gr2, up=500, down=20)
#'
#' @importFrom GenomicRanges findOverlaps promoters
#' @importFrom S4Vectors Rle

ranges2annot <- function(ranges, annot) {
ranges2annot <- function(ranges, annot, upstream=500, downstream=500) {
typesWithPromoter <- c( "protein_coding", "processed_transcript", "lincRNA"
, "antisense", "processed_pseudogene"
, "unprocessed_pseudogene")
Expand All @@ -527,7 +537,7 @@ ranges2annot <- function(ranges, annot) {

if(!is.null(annot$type)) {
classes <- c("promoter", "exon", "intron", "unknown")
p <- findOverlapsBool(ranges, promoters(annot[annot$type == "transcript"], 500, 500))
p <- findOverlapsBool(ranges, promoters(annot[annot$type == "transcript"], upstream, downstream))
e <- findOverlapsBool(ranges, annot[annot$type == "exon"])
t <- findOverlapsBool(ranges, annot[annot$type == "transcript"])
annot <- sapply( 1:length(ranges), function(i) {
Expand All @@ -538,7 +548,7 @@ ranges2annot <- function(ranges, annot) {
})
} else {
classes <- c("promoter", "gene", "unknown")
p <- findOverlapsBool(ranges, promoters(annot, 500, 500))
p <- findOverlapsBool(ranges, promoters(annot, upstream, downstream))
g <- findOverlapsBool(ranges, annot)
annot <- sapply( 1:length(ranges), function(i) {
if (p[i]) {classes[1]}
Expand Down
18 changes: 12 additions & 6 deletions man/annotateCTSS.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 14 additions & 8 deletions man/ranges2annot.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 59b847a

Please sign in to comment.