Skip to content

Commit

Permalink
Singleton filtering now done by paraclu and distclu directly
Browse files Browse the repository at this point in the history
The updated `.ctss_summary_for_clusters` does not remove clusters anymore.

The information on dominant CTSS score and total number of CTSS is now
stored in the dominantCTSS object directly.

Also, reorder computations so that score is the first metadata column in the cluster objects.
  • Loading branch information
Charles Plessy committed May 23, 2024
1 parent 8ed1da1 commit 4702c46
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 29 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export("genomeName<-")
export("inputFiles<-")
export("inputFilesType<-")
export("sampleLabels<-")
export(.ctss_summary_for_clusters)
export(CAGEexp)
export(CTSS)
export(CTSScoordinatesGR)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ BACKWARDS-INCOMPATIBLE CHANGES
- The `removeSingletons` option of clustering methods is removed and the
default value of `keepSingletonsAbove` is set to `0`, which keeps the
standard behavior.
- In cluster objects, the dominant CTSS score is now stored in the
`dominantCTSS` object directly.

BUG FIXES

Expand All @@ -34,6 +36,9 @@ NEW FEATURES
OTHER CHANGES

- Accelerated the computation of cumulative sums ~10×.
- Singleton filtering is now done by the `paraclu` and `distclu` functions
themeselves; `.ctss_summary_for_clusters` does not change the input clusters
except for adding information.

# Changes in version 2.11.1

Expand Down
46 changes: 30 additions & 16 deletions R/ClusteringFunctions.R
Original file line number Diff line number Diff line change
@@ -1,29 +1,46 @@
#' @include CAGEexp.R CTSS.R
NULL

#' @noRd
#' Summarise CTSSs included in clusters
#'
#' @param ctss A [`CTSS`] object.
#'
#' @param clusters A [`TagClusters`], [`ConsensusClusters`] or any other
#' object implementing the [`GRanges`] class.
#'
#' @return The `clusters` object with a new `dominant_CTSS` metadata in `CTSS`
#' format reporting the genomic coordinate and expression score of most
#' highly expressed position in each cluster, plus a `nr_ctss` metadata reporting
#' the number of expressed CTSSs in each cluster.
#'
#' @importFrom S4Vectors queryHits subjectHits runLength runValue
#' @export
#'
#' @examples
#' # See also benchmarks/dominant_ctss.md
#' (ctss <- GRanges('chr1', IRanges(start = 1:10, end = 1:10), '+', score = c(1, 0, 0, 1, 2, 0, 2, 1, 0, 1)))
#' (clusters <- GRanges('chr1', IRanges(start = c(1,9), end = c(8,10)), '+'))
#' (ctss <- CTSS( 'chr1', IRanges(start = 1:10, end = 1:10)
#' , '+', score = c(1, 0, 0, 1, 2, 0, 2, 1, 0, 1)))
#' (clusters <- GRanges( 'chr1', IRanges(start = c(1,9)
#' , end = c(8,10)), '+')) |> as("TagClusters")
#'
#' # The function assumes that all CTSSes have a score above zero
#' .ctss_summary_for_clusters(ctss[score(ctss)>0], clusters, keepSingletonsAbove = Inf)
#' .ctss_summary_for_clusters(ctss[score(ctss)>0], clusters)
#' # If not the case, it will give incorrect nr_ctss and fail to remove singletons
#' .ctss_summary_for_clusters(ctss, clusters, keepSingletonsAbove = Inf)
#' .ctss_summary_for_clusters(ctss, clusters)
#'
#' # The function needs its output to be sorted and is not going to check it.
#' .ctss_summary_for_clusters(rev(ctss), clusters)
#' .ctss_summary_for_clusters(ctss, rev(clusters))
#'
#' # Ties are resolved with 5' preference for both plus and minus strands.
#' # This may create a small bias.
#' .ctss_summary_for_clusters(ctss |> plyranges::mutate(strand = '-'), clusters |> plyranges::mutate(strand = '-'))
#' ctss_minus <- ctss
#' strand(ctss_minus) <- '-'
#' clusters_minus <- clusters
#' strand(clusters_minus) <- '-'
#' .ctss_summary_for_clusters(ctss_minus, clusters_minus)

.ctss_summary_for_clusters <- function(ctss, clusters, keepSingletonsAbove = 0) {
.ctss_summary_for_clusters <- function(ctss, clusters) {
# Match the clusters and the CTSS
o <- findOverlaps(clusters, ctss)

Expand All @@ -47,21 +64,18 @@ NULL
# Find absolute position of dominant CTSS in each run.
global_max_ids <- cluster_start_idx + local_max_idx - 1

# Record dominant CTSS as GRanges object.
clusters$dominant_ctss <- granges(ctss)[subjectHits(o)][global_max_ids]

# Record dominant CTSS score. Mabye we should use its GRanges's score instead.
clusters$tpm.dominant_ctss <- score(ctss)[subjectHits(o)][global_max_ids]

# Record total expression of the cluster
score(clusters) <- Rle(sum(grouped_scores))

# Record dominant CTSS as CTSS object.
clusters$dominant_ctss <- CTSS(granges(ctss)[subjectHits(o)][global_max_ids])

# Record dominant CTSS score.
score(clusters$dominant_ctss) <- score(ctss)[subjectHits(o)][global_max_ids]

# Count the number of clusters
clusters$nr_ctss <- rl

# Remove clusters that match only one CTSS unless their expression is high enough
clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)

# Give numerical names to the clusters
names(clusters) <- seq_along(clusters)

Expand Down
5 changes: 3 additions & 2 deletions R/Distclu.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ setMethod("distclu", "SummarizedExperiment",

.distclu_CTSS <- function(object, max.dist, keepSingletonsAbove) {
clusters <- reduce(GRanges(object), min = max.dist)
clusters <- .ctss_summary_for_clusters(object, clusters,
keepSingletonsAbove = keepSingletonsAbove)
clusters <- .ctss_summary_for_clusters(object, clusters)
# Remove clusters that match only one CTSS unless their expression is high enough
clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
names(clusters) <- seq_along(clusters)
as(clusters, "TagClusters")
}
Expand Down
9 changes: 1 addition & 8 deletions R/ExportMethods.R
Original file line number Diff line number Diff line change
Expand Up @@ -557,17 +557,10 @@ function( object, what, qLow, qUp, colorByExpressionProfile, oneTrack) {


.exportToTrack_clusters <- function( object, what, qLow, qUp, colorByExpressionProfile, oneTrack) {
# Simplify this after the format of dominant_ctss is standardised in all cluster objects.
ranges_ <- function(x) {
if (inherits(x, "GRanges")) return(IRanges(ranges(x)))
IRanges(x)
}
object$thick <- ranges_(object$dominant_ctss)
object$thick <- IRanges(ranges(object$dominant_ctss))
object$dominant_ctss <- NULL
names(object) <- NULL
object$name <- NA
object$nr_ctss <- NULL
object$tpm.dominant_ctss <- NULL
exportToTrack( GRanges(object), qLow = qLow, qUp = qUp
, colorByExpressionProfile = colorByExpressionProfile
, oneTrack = oneTrack)
Expand Down
6 changes: 3 additions & 3 deletions R/Paraclu.R
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,9 @@ setMethod("paraclu", "CTSS",
clusters <- clusters[(clusters$max_d >= (minStability * clusters$min_d)) &
(width(clusters) <= maxLength)]
# Compute score and dominant CTSs, and remove singletons as wanted.
clusters <-
.ctss_summary_for_clusters( object, clusters
, keepSingletonsAbove = keepSingletonsAbove)
clusters <- .ctss_summary_for_clusters(object, clusters)
# Remove clusters that match only one CTSS unless their expression is high enough
clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
# Reduce to non-overlapping as wanted
if(reduceToNonoverlapping == TRUE){
o <- findOverlaps(clusters, drop.self = TRUE, type = "within")
Expand Down

0 comments on commit 4702c46

Please sign in to comment.