Skip to content

Commit

Permalink
Totally replace clusterCTSS() with distclu() and paraclu()
Browse files Browse the repository at this point in the history
The expression filtering is now done by `filterLowExpCTSS` because 1) it has consequences
beyond tag cluster generation and 2) because it simplifies the argument list of the
clustering functions.

The remaining `clusterCTSS` function had a set of mutually exclusive arguments for `distclu`
and `paraclu`; I find it easier to study and use the two methods when they are taken care
of by different functions.
  • Loading branch information
Charles Plessy committed May 24, 2024
1 parent 4702c46 commit ad92aa3
Show file tree
Hide file tree
Showing 36 changed files with 516 additions and 408 deletions.
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ export(aggregateTagClusters)
export(annotateCTSS)
export(annotateConsensusClusters)
export(annotateTagClusters)
export(clusterCTSS)
export(consensusClustersDESeq2)
export(consensusClustersGR)
export(consensusClustersSE)
Expand All @@ -40,8 +39,10 @@ export(cumulativeCTSSdistribution)
export(distclu)
export(exportToTrack)
export(expressionClasses)
export(filterLowExpCTSS)
export(findStrandInvaders)
export(flagByUpstreamSequences)
export(flagLowExpCTSS)
export(genomeName)
export(getCTSS)
export(getExpressionProfiles)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,14 @@ BACKWARDS-INCOMPATIBLE CHANGES
standard behavior.
- In cluster objects, the dominant CTSS score is now stored in the
`dominantCTSS` object directly.
- The `clusterCTSS` function is replaced by the new `paraclu` and `distclu`
function. CTSS filtering is done beforehand with the new `filterLowExpCTSS`
function.

BUG FIXES

- The `importPublicData` function was repaired for FANTOM samples.
- CTSS filtering now works correctly with `threshold = 0, thresholdIsTpm = TRUE`.

NEW FEATURES

Expand Down
6 changes: 3 additions & 3 deletions R/AggregationMethods.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
#' @param nrCores Number of cores to use when `useMulticore = TRUE`. Default
#' (`NULL`) uses all detected cores.
#'
#' @details Since the tag clusters (TCs) returned by the [`clusterCTSS`]
#' @details Since the tag clusters (TCs) returned by the CTSS clustering functions
#' function are constructed separately for every CAGE sample within the CAGEr
#' object, they can differ between samples in both their number, genomic
#' coordinates, position of dominant TSS and overall signal. To be able to
Expand Down Expand Up @@ -122,7 +122,7 @@ setMethod( "aggregateTagClusters", "CAGEr"
.aggregateTagClustersGRL(gr.list = TC.list, CAGEexp_obj = object, maxDist = maxDist)

if (excludeSignalBelowThreshold) {
filter <- .filterCtss( object
filter <- flagLowExpCTSS( object
, threshold = tpmThreshold
, nrPassThreshold = 1
, thresholdIsTpm = TRUE)
Expand Down Expand Up @@ -244,7 +244,7 @@ setMethod( "CustomConsensusClusters", c("CAGEexp", "GRanges")

clusters <- .ConsensusClusters(clusters)

filter <- .filterCtss( object
filter <- flagLowExpCTSS( object
, threshold = threshold
, nrPassThreshold = nrPassThreshold
, thresholdIsTpm = thresholdIsTpm)
Expand Down
3 changes: 2 additions & 1 deletion R/CAGEexp.R
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ setMethod( "initialize", "CAGEexp"
#' CTSStoGenes() |>
#' normalizeTagCount() |>
#' getExpressionProfiles("CTSS") |>
#' clusterCTSS() |>
#' filterLowExpCTSS() |>
#' distclu() |>
#' annotateTagClusters(exampleZv9_annot) |>
#' cumulativeCTSSdistribution("tagClusters") |>
#' quantilePositions("tagClusters") |>
Expand Down
88 changes: 66 additions & 22 deletions R/CAGEr.R
Original file line number Diff line number Diff line change
Expand Up @@ -159,41 +159,85 @@ setMethod("validSamples", "CAGEr", function (object, x){
})


#' @name .filterCtss
#' @noRd
#' @param threshold,nrPassThreshold Only CTSSs with signal \code{>= threshold} in
#' \code{>= nrPassThreshold} experiments will be used for clustering and will
#' contribute towards total signal of the cluster.
#' @param thresholdIsTpm Logical, is threshold raw tag count value (FALSE) or
#' normalized signal (TRUE).
#' @title Private function
#' @details Check if a vector of strings or numbers can be used to identify a sample.
#' Flag CTSSes based on sample expression
#'
#' Flag CTSSes for that do not pass an expression threshold in at least a given
#' number of samples. This is typically used to ignore CTSSes that have been
#' seen only once in a single sample, as they can be considered to not be
#' reproduced.
#'
#' @param object An object from the _CAGEr_ package that contains expression
#' values for multiple samples.
#'
#' @param threshold Flag CTSSs with signal `< threshold`.
#'
#' @param nrPassThreshold Only flag CTSSs when signal is below threshold in at
#' least `nrPassThreshold` samples.
#'
#' @param thresholdIsTpm Logical, is threshold raw tag count value (`FALSE`) or
#' normalized signal (`TRUE`).
#'
#' @returns `flagLowExpCTSS` returns a [`Rle`] vector where `TRUE` indicates the
#' index of a CTSS that passes the filter.
#'
#' @export
#'
#' @examples
#' flagLowExpCTSS(exampleCAGEexp, threshold = 100, nrPassThreshold = 2)

setGeneric(".filterCtss", function( object
, threshold = 0
, nrPassThreshold = 1
, thresholdIsTpm = TRUE) {
if (threshold == 0) return(Rle(TRUE))
standardGeneric(".filterCtss")
})
setGeneric("flagLowExpCTSS", function( object
, threshold = 1
, nrPassThreshold = 1
, thresholdIsTpm = TRUE)
standardGeneric("flagLowExpCTSS")
)

setMethod(".filterCtss", "CAGEr", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
.filterCtss(CTSStagCountSE(object), threshold, nrPassThreshold, thresholdIsTpm)
#' @rdname flagLowExpCTSS

setMethod("flagLowExpCTSS", "CAGEr", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
flagLowExpCTSS(CTSStagCountSE(object), threshold, nrPassThreshold, thresholdIsTpm)
})

setMethod(".filterCtss", "RangedSummarizedExperiment", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
#' @rdname flagLowExpCTSS

setMethod("flagLowExpCTSS", "RangedSummarizedExperiment", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
assay <- ifelse(thresholdIsTpm, "normalizedTpmMatrix", "counts")
if(assay == "normalizedTpmMatrix" & is.null(assays(object)[[assay]]))
stop("Normalise the CAGEr object first with ", sQuote("normalizeTagCount()"), ".")
.filterCtss(assays(object)[[assay]], threshold, nrPassThreshold, thresholdIsTpm)
flagLowExpCTSS(assays(object)[[assay]], threshold, nrPassThreshold, thresholdIsTpm)
})

setMethod(".filterCtss", "DataFrame", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
#' @rdname flagLowExpCTSS

setMethod("flagLowExpCTSS", "DataFrame", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
nr.pass.threshold <- rowSums.RleDataFrame(lapply(object, \(x) x > threshold) |> DataFrame())
nr.pass.threshold >= min(nrPassThreshold, ncol(object))
})

setMethod(".filterCtss", "matrix", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
#' @rdname flagLowExpCTSS

setMethod("flagLowExpCTSS", "matrix", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
nr.pass.threshold <- rowSums(object > threshold)
nr.pass.threshold >= min(nrPassThreshold, ncol(object))
})

#' @rdname flagLowExpCTSS
#'
#' @return `filterLowExpCTSS` returns the `CAGEr` object where the output of
#' `flagLowExpCTSS` was stored internally.
#'
#' @export

setGeneric("filterLowExpCTSS", function( object
, threshold = 1
, nrPassThreshold = 1
, thresholdIsTpm = TRUE)
standardGeneric("filterLowExpCTSS")
)

#' @rdname flagLowExpCTSS

setMethod("filterLowExpCTSS", "CAGEr", function (object, threshold, nrPassThreshold, thresholdIsTpm) {
filteredCTSSidx(object) <- flagLowExpCTSS(CTSStagCountSE(object), threshold, nrPassThreshold, thresholdIsTpm)
object
})
151 changes: 0 additions & 151 deletions R/ClusteringMethods.R
Original file line number Diff line number Diff line change
@@ -1,156 +1,5 @@
#' @include CTSS.R Multicore.R

#' @name clusterCTSS
#'
#' @title Cluster CTSSs into tag clusters
#'
#' @description Clusters individual CAGE transcription start sites (CTSSs) along
#' the genome into tag clusters (TCs) using specified _ab initio_ method, or
#' assigns them to predefined genomic regions.
#'
#' @param object A [`CAGEr`] object.
#'
#' @param threshold,nrPassThreshold Ignore CTSSs with signal `< threshold`
#' in `< nrPassThreshold` experiments.
#'
#' @param thresholdIsTpm Logical indicating if `threshold` is expressed in
#' raw tag counts (`FALSE`) or normalized signal (`TRUE`).
#'
#' @param method Clustering method: `"distclu"` or `"paraclu"`.
#'
#' @param maxDist Maximal distance between two neighbouring CTSSs for them to be
#' part of the same cluster. Used only when `method = "distclu"`,
#' otherwise ignored.
#'
#' @param keepSingletonsAbove Remove "singleton" tag clusters of width 1 with
#' signal `< keepSingletonsAbove`. Default value `0` results in keeping
#' all TCs by default. Setting it to `Inf` removes all singletons.
#'
#' @param minStability Minimal stability of the cluster, where stability is
#' defined as ratio between maximal and minimal density value for which
#' this cluster is maximal scoring. For definition of stability refer to
#' Frith _et al._, Genome Research, 2007. Clusters with stability
#' `< minStability` will be discarded. Used only when `method = "paraclu"`.
#'
#' @param maxLength Maximal length of cluster in base-pairs. Clusters with length
#' `> maxLength` will be discarded.
#'
#' @param reduceToNonoverlapping Logical, should smaller clusters contained
#' within bigger cluster be removed to make a final set of tag clusters
#' non-overlapping. Used only `method = "paraclu"`.
#'
#' @param useMulticore Logical, should multicore be used. `useMulticore = TRUE`
#' has no effect on non-Unix-like platforms.
#'
#' @param nrCores Number of cores to use when `useMulticore = TRUE`. Default
#' value `NULL` uses all detected cores.
#'
#' @details The `"distclu"` method is an implementation of simple distance-based
#' clustering of data attached to sequences, where two neighbouring TSSs are
#' joined together if they are closer than some specified distance (see
#' [`GenomicRanges::reduce`] for implementation details.
#'
#' `"paraclu"` is an implementation of Paraclu algorithm for parametric
#' clustering of data attached to sequences (Frith _et al._, Genome Research,
#' 2007). Since Paraclu finds clusters within clusters (unlike distclu),
#' additional parameters (`keepSingletonsAbove`,
#' `minStability`, `maxLength` and `reduceToNonoverlapping`) can be specified to
#' simplify the output by discarding too small (singletons) or too big clusters,
#' and to reduce the clusters to a final set of non-overlapping clusters.
#'
#' Clustering is done for every CAGE dataset within the CAGEr object separately,
#' resulting in a different set of tag clusters for every CAGE dataset. TCs from
#' different datasets can further be aggregated into a single referent set of
#' consensus clusters by calling the [`aggregateTagClusters`] function.
#'
#' @return Returns the [`CAGEexp`] object, in which, the results will be stored as a `GRangesList` of
#' [`TagClusters`] objects in the metadata slot `tagClusters`. The
#' `TagClusters` objects will contain a `filteredCTSSidx` column if appropriate.
#' The clustering method name is saved in the metadata slot of the `GRangesList`.
#'
#' @references Frith _et al._ (2007) A code for transcription initiation in
#' mammalian genomes, _Genome Research_ **18**(1):1-12,
#' (\href{http://www.cbrc.jp/paraclu/}{http://www.cbrc.jp/paraclu/}).
#'
#' @author Vanja Haberle
#'
#' @seealso [`aggregateTagClusters`]
#'
#' @family CAGEr object modifiers
#' @family CAGEr clusters functions
#'
#' @examples
#'
#' # Using 'distclu', notice argument 'maxDist'
#' ce <- clusterCTSS( exampleCAGEexp, threshold = 50, thresholdIsTpm = TRUE
#' , nrPassThreshold = 1, method = "distclu", maxDist = 20
#' , keepSingletonsAbove = 100)
#' tagClustersGR(ce, "Zf.30p.dome")
#'
#' # Using 'paraclu', notice arguments 'maxLength' and 'minStability'
#' ce <- clusterCTSS( exampleCAGEexp, threshold = 50, thresholdIsTpm = TRUE
#' , nrPassThreshold = 1, method = "paraclu"
#' , keepSingletonsAbove = 100
#' , maxLength = 500, minStability = 1
#' , reduceToNonoverlapping = TRUE)
#' tagClustersGR(ce, "Zf.30p.dome")
#'
#' @export

setGeneric( "clusterCTSS"
, function( object
, threshold = 1, nrPassThreshold = 1, thresholdIsTpm = TRUE
, method = c("distclu", "paraclu"), maxDist = 20
, keepSingletonsAbove = 0
, minStability = 1, maxLength = 500
, reduceToNonoverlapping = TRUE
, useMulticore = FALSE, nrCores = NULL)
standardGeneric("clusterCTSS"))

#' @rdname clusterCTSS

setMethod( "clusterCTSS", "CAGEexp"
, function( object, threshold, nrPassThreshold, thresholdIsTpm, method, maxDist
, keepSingletonsAbove, minStability, maxLength
, reduceToNonoverlapping, useMulticore, nrCores) {

assay <- ifelse(isTRUE(thresholdIsTpm), "normalizedTpmMatrix", "counts")
data <- CTSStagCountSE(object)

if (! "normalizedTpmMatrix" %in% assayNames(data))
stop( "Could not find normalized CAGE signal values, see ?normalizeTagCount.\n"
, "clusterCTSS() needs normalized values to create its output tables, that "
, "include TPM expression columns.")

message("\nFiltering out CTSSs below threshold...")
filteredCTSSidx(object) <-
.filterCtss(data, threshold = threshold
, nrPassThreshold = nrPassThreshold, thresholdIsTpm = thresholdIsTpm)

message("Clustering...")
method <- match.arg(method)

if (method == "distclu") {
ctss.cluster.list <- distclu( object = data[decode(filteredCTSSidx(object)),]
, max.dist = maxDist, keepSingletonsAbove = keepSingletonsAbove)
} else if (method == "paraclu") {
ctss.cluster.list <- paraclu( object = data[decode(filteredCTSSidx(object)),]
, minStability = minStability, maxLength = maxLength
, keepSingletonsAbove = keepSingletonsAbove
, reduceToNonoverlapping = reduceToNonoverlapping
, useMulticore = useMulticore, nrCores = nrCores)
} else if(method == "custom") {
stop("Deprecated method. See ", dQuote("CustomConsensusClusters()"), " instead.")
}

seqlevels(ctss.cluster.list) <- seqlevels(CTSStagCountSE(object))
seqinfo(ctss.cluster.list) <- seqinfo(CTSStagCountSE(object))
# Changing the sequence levels may change the sort order. Re-sort
ctss.cluster.list <- sort(ctss.cluster.list)
metadata(object)$tagClusters <- ctss.cluster.list
object
})

#' @rdname byCtss
#'
#' @title Apply functions to identical CTSSes.
Expand Down
Loading

0 comments on commit ad92aa3

Please sign in to comment.