Skip to content

Commit

Permalink
Merge pull request #89 from charles-plessy/filterByUpstreamSequences
Browse files Browse the repository at this point in the history
New flagByUpstreamSequences function.
  • Loading branch information
charles-plessy authored Jul 18, 2023
2 parents 6e0e7a2 + 7ae9328 commit c1d9d19
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 32 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ export(cumulativeCTSSdistribution)
export(exportToTrack)
export(expressionClasses)
export(findStrandInvaders)
export(flagByUpstreamSequences)
export(genomeName)
export(getCTSS)
export(getExpressionProfiles)
Expand Down Expand Up @@ -72,6 +73,7 @@ exportClasses(CAGEr)
exportClasses(CTSS)
exportMethods(coerce)
exportMethods(findStrandInvaders)
exportMethods(flagByUpstreamSequences)
exportMethods(quickEnhancers)
exportMethods(removeStrandInvaders)
import(BiocGenerics)
Expand All @@ -80,6 +82,7 @@ import(DelayedMatrixStats)
import(MultiAssayExperiment)
import(SummarizedExperiment)
import(methods)
importFrom(BSgenome,getBSgenome)
importFrom(BSgenome,getSeq)
importFrom(BSgenome,installed.genomes)
importFrom(BiocParallel,MulticoreParam)
Expand Down
126 changes: 102 additions & 24 deletions R/StrandInvaders.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,15 @@
#' (defaults to `TATAGGG`).
#'
#' @return `findStrandInvaders` returns a logical-[Rle] vector indicating the
#' position of the strand invaders in the input ranges. With [CTSS] objects as
#' input `removeStrandInvaders` returns the object after removing the CTSS
#' positions identified as strand invaders. In the case of `CAGEexp` objects,
#' the input object is modified in place. Its sample metadata is also
#' updated by creating a new `strandInvaders` column that indicates the number
#' of molecule counts removed. This value is subtracted from the `counts` colum
#' so that the total number of tags is still equal to `librarySizes`.
#' position of the strand invaders in the input ranges.
#'
#' @return With [CTSS] objects as input `removeStrandInvaders` returns the
#' object after removing the CTSS positions identified as strand invaders.
#' In the case of `CAGEexp` objects, a modified object is returned. Its sample
#' metadata is also updated by creating a new `strandInvaders` column that
#' indicates the number of molecule counts removed. This value is subtracted
#' from the `counts` colum so that the total number of tags is still equal to
#' `librarySizes`.
#'
# @family CAGEr object modifiers
# @family CAGEr TSS functions
Expand All @@ -57,14 +59,11 @@
#' @examples
#' # Note that these examples do not do much on the example data since it was
#' # not constructed using a protocol based using the template-switching method.
#' "BSgenome.Drerio.UCSC.danRer7"
#'
#' findStrandInvaders(exampleCAGEexp)
#' removeStrandInvaders(exampleCAGEexp)
#'
#' @importFrom BSgenome getSeq
#' @importFrom stringdist stringdist
#' @importFrom stringi stri_sub

NULL

#' @rdname strandInvaders
Expand Down Expand Up @@ -93,6 +92,7 @@ setMethod( "findStrandInvaders", "CAGEexp"

setMethod( "removeStrandInvaders", "CAGEexp"
, function (object, distance, barcode, linker) {
if ( ! is.null(barcode)) stop("Sorry, the barcode is not implemented yet.\nPlease ask by opening an issue on GitHub.")
strandInvaders <- findStrandInvaders(object, distance, barcode, linker)
se <- CTSStagCountSE(object)
CTSStagCountSE(object) <- se[!decode(strandInvaders),]
Expand All @@ -107,19 +107,7 @@ setMethod( "removeStrandInvaders", "CAGEexp"

setMethod( "findStrandInvaders", "CTSS"
, function (object, distance, barcode, linker) {
upstreamSeq <- getSeq( getRefGenome(genomeName(object))
, suppressWarnings(trim(promoters(GRanges(object), nchar(linker), 0)))
, as.character = TRUE)
strandInvaders <- Rle(stringdist(upstreamSeq, linker) <= distance)
# Only remove if at least 2 of the last 3 nucleotides are guanosines,
# as in https://github.com/davetang/23180801/blob/master/find_strand_invasion-20130307.pl
if (distance > 1) {
strandInvaders <-
strandInvaders &
stringdist( stri_sub(upstreamSeq, nchar(linker) - 2, nchar(linker))
, stri_sub(linker, nchar(linker) - 2, nchar(linker))) < 2
}
strandInvaders
.flagByUpstreamSequences(object, target = linker, distance = distance, strandInvaders = TRUE)
})

#' @rdname strandInvaders
Expand All @@ -128,3 +116,93 @@ setMethod( "findStrandInvaders", "CTSS"
setMethod( "removeStrandInvaders", "CTSS"
, function (object, distance, barcode, linker)
object[!decode(findStrandInvaders(object))])


#' Filter by upstream sequences
#'
#' Looks up the bases directly upstream provided _genomic ranges_ and searches
#' for a gapless match with a _target_ seqence within a given edit _distance_.
#'
#' If the provided `object` represents _tag clusters_ or _consensus clusters_,
#' the search will be done upstream its _dominant peak_. Convert the object
#' to the `GRanges` class if this is not the behaviour you want.
#'
#' @param object A [`CTSS`], a [`TagClusters`], [`ConsensusClusters`] or a
#' [`GenomicRanges::GRanges`] object from which a _BSgenome_ object can
#' be reached.
#'
#' @param target A target sequence.
#'
#' @param distance The maximal edit distance between the genome and the target
#' sequence (default: 0).
#'
#' @returns A `logical-RLe` vector indicating if ranges matched the target.
#'
#' @importFrom BSgenome getBSgenome
#' @importFrom BSgenome getSeq
#' @importFrom stringdist stringdist
#' @importFrom stringi stri_sub
#'
#' @author Charles Plessy
#'
#' @family CAGEr filter functions
#'
#' @export

setGeneric( "flagByUpstreamSequences"
, function (object, target, distance = 0)
standardGeneric("flagByUpstreamSequences"))

#' @rdname flagByUpstreamSequences
#' @export

setMethod( "flagByUpstreamSequences", "CTSS"
, function (object, target, distance = 0) {
flagByUpstreamSequences(GRanges(object), target = target, distance = distance)
})

#' @rdname flagByUpstreamSequences
#' @export

setMethod( "flagByUpstreamSequences", "TagClusters"
, function (object, target, distance = 0) {
flagByUpstreamSequences(object$dominant_ctss, target = target, distance = distance)
})

#' @rdname flagByUpstreamSequences
#' @export

setMethod( "flagByUpstreamSequences", "ConsensusClusters"
, function (object, target, distance = 0) {
flagByUpstreamSequences(object$dominant_ctss, target = target, distance = distance)
})

#' @rdname flagByUpstreamSequences
#' @export

setMethod( "flagByUpstreamSequences", "GRanges"
, function (object, target, distance = 0) {
.flagByUpstreamSequences(GRanges(object), target = target, distance = distance)
})

#' @noRd
#' @examples
#' # Note that we do not expect TATAGGG artefacts in this example dataset
#' CAGEr:::.flagByUpstreamSequences(CTSScoordinatesGR(exampleCAGEexp), "TATAGGG", 2)
#' CAGEr:::.flagByUpstreamSequences(CTSScoordinatesGR(exampleCAGEexp), "TATAGGG", 2, strandInvaders = TRUE)

.flagByUpstreamSequences <- function (object, target, distance = 0, strandInvaders = FALSE) {
bsgenome <- object |> genome() |> unique() |> getBSgenome()
updreamRanges <- promoters(object, nchar(target), 0) |> trim() |> suppressWarnings()
upstreamSeq <- getSeq(bsgenome, updreamRanges)
matching <- Rle(stringdist(upstreamSeq, target) <= distance)
# Only remove if at least 2 of the last 3 nucleotides are guanosines,
# as in https://github.com/davetang/23180801/blob/master/find_strand_invasion-20130307.pl
if (distance > 1 & isTRUE(strandInvaders)) {
matching <-
matching &
stringdist( stri_sub(upstreamSeq, nchar(target) - 2, nchar(target))
, stri_sub(target, nchar(target) - 2, nchar(target))) < 2
}
matching
}
46 changes: 46 additions & 0 deletions man/flagByUpstreamSequences.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 9 additions & 8 deletions man/strandInvaders.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit c1d9d19

Please sign in to comment.