From 4702c46257a758970d520bed15ad250286236688 Mon Sep 17 00:00:00 2001
From: Charles Plessy <charles-plessy@.oist.jp>
Date: Thu, 23 May 2024 10:19:27 +0900
Subject: [PATCH] Singleton filtering now done by `paraclu` and `distclu`
 directly

The updated `.ctss_summary_for_clusters` does not remove clusters anymore.

The information on dominant CTSS score and total number of CTSS is now
stored in the dominantCTSS object directly.

Also, reorder computations so that score is the first metadata column in the cluster objects.
---
 NAMESPACE               |  1 +
 NEWS.md                 |  5 +++++
 R/ClusteringFunctions.R | 46 +++++++++++++++++++++++++++--------------
 R/Distclu.R             |  5 +++--
 R/ExportMethods.R       |  9 +-------
 R/Paraclu.R             |  6 +++---
 6 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 96d7863..e8763c4 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -11,6 +11,7 @@ export("genomeName<-")
 export("inputFiles<-")
 export("inputFilesType<-")
 export("sampleLabels<-")
+export(.ctss_summary_for_clusters)
 export(CAGEexp)
 export(CTSS)
 export(CTSScoordinatesGR)
diff --git a/NEWS.md b/NEWS.md
index 73a9993..29e4b4c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -16,6 +16,8 @@ BACKWARDS-INCOMPATIBLE CHANGES
 -   The `removeSingletons` option of clustering methods is removed and the
     default value of `keepSingletonsAbove` is set to `0`, which keeps the
     standard behavior.
+-   In cluster objects, the dominant CTSS score is now stored in the
+    `dominantCTSS` object directly.
 
 BUG FIXES
 
@@ -34,6 +36,9 @@ NEW FEATURES
 OTHER CHANGES
 
 -   Accelerated the computation of cumulative sums ~10×.
+-   Singleton filtering is now done by the `paraclu` and `distclu` functions
+    themeselves; `.ctss_summary_for_clusters` does not change the input clusters
+    except for adding information.
 
 # Changes in version 2.11.1
 
diff --git a/R/ClusteringFunctions.R b/R/ClusteringFunctions.R
index 7c98620..d75ec5a 100644
--- a/R/ClusteringFunctions.R
+++ b/R/ClusteringFunctions.R
@@ -1,19 +1,32 @@
 #' @include CAGEexp.R CTSS.R
 NULL
 
-#' @noRd
+#' Summarise CTSSs included in clusters
+#' 
+#' @param ctss A [`CTSS`] object.
+#' 
+#' @param clusters A [`TagClusters`], [`ConsensusClusters`] or any other
+#' object implementing the [`GRanges`] class.
+#' 
+#' @return The `clusters` object with a new `dominant_CTSS` metadata in `CTSS`
+#' format reporting the genomic coordinate and expression score of most
+#' highly expressed position in each cluster, plus a `nr_ctss` metadata reporting
+#' the number of expressed CTSSs in each cluster.
 #' 
 #' @importFrom S4Vectors queryHits subjectHits runLength runValue
+#' @export
 #' 
 #' @examples 
 #' # See also benchmarks/dominant_ctss.md
-#' (ctss <- GRanges('chr1', IRanges(start = 1:10, end = 1:10), '+', score = c(1, 0, 0, 1, 2, 0, 2, 1, 0, 1)))
-#' (clusters <- GRanges('chr1', IRanges(start = c(1,9), end = c(8,10)), '+'))
+#' (ctss <- CTSS( 'chr1', IRanges(start = 1:10, end = 1:10)
+#'              , '+', score = c(1, 0, 0, 1, 2, 0, 2, 1, 0, 1)))
+#' (clusters <- GRanges( 'chr1', IRanges(start = c(1,9)
+#'                     , end = c(8,10)), '+')) |> as("TagClusters")
 #' 
 #' # The function assumes that all CTSSes have a score above zero
-#' .ctss_summary_for_clusters(ctss[score(ctss)>0], clusters, keepSingletonsAbove = Inf)
+#' .ctss_summary_for_clusters(ctss[score(ctss)>0], clusters)
 #' # If not the case, it will give incorrect nr_ctss and  fail to remove singletons
-#' .ctss_summary_for_clusters(ctss, clusters, keepSingletonsAbove = Inf)
+#' .ctss_summary_for_clusters(ctss, clusters)
 #' 
 #' # The function needs its output to be sorted and is not going to check it.
 #' .ctss_summary_for_clusters(rev(ctss), clusters)
@@ -21,9 +34,13 @@ NULL
 #' 
 #' # Ties are resolved with 5' preference for both plus and minus strands.
 #' # This may create a small bias.
-#' .ctss_summary_for_clusters(ctss |> plyranges::mutate(strand = '-'), clusters |> plyranges::mutate(strand = '-'))
+#' ctss_minus <- ctss
+#' strand(ctss_minus) <- '-'
+#' clusters_minus <- clusters
+#' strand(clusters_minus) <- '-'
+#' .ctss_summary_for_clusters(ctss_minus, clusters_minus)
 
-.ctss_summary_for_clusters <- function(ctss, clusters, keepSingletonsAbove = 0) {
+.ctss_summary_for_clusters <- function(ctss, clusters) {
   # Match the clusters and the CTSS
   o <- findOverlaps(clusters, ctss)
 
@@ -47,21 +64,18 @@ NULL
   # Find absolute position of dominant CTSS in each run.
   global_max_ids <- cluster_start_idx + local_max_idx - 1
   
-  # Record dominant CTSS as GRanges object.
-  clusters$dominant_ctss <- granges(ctss)[subjectHits(o)][global_max_ids]
-  
-  # Record dominant CTSS score.  Mabye we should use its GRanges's score instead.
-  clusters$tpm.dominant_ctss <-   score(ctss)[subjectHits(o)][global_max_ids]
-  
   # Record total expression of the cluster
   score(clusters) <- Rle(sum(grouped_scores))
+  
+  # Record dominant CTSS as CTSS object.
+  clusters$dominant_ctss <- CTSS(granges(ctss)[subjectHits(o)][global_max_ids])
+  
+  # Record dominant CTSS score.
+  score(clusters$dominant_ctss) <- score(ctss)[subjectHits(o)][global_max_ids]
 
   # Count the number of clusters   
   clusters$nr_ctss <- rl
   
-  # Remove clusters that match only one CTSS unless their expression is high enough
-  clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
-  
   # Give numerical names to the clusters
   names(clusters) <- seq_along(clusters)
   
diff --git a/R/Distclu.R b/R/Distclu.R
index 3325b30..095abc7 100644
--- a/R/Distclu.R
+++ b/R/Distclu.R
@@ -50,8 +50,9 @@ setMethod("distclu", "SummarizedExperiment",
 
 .distclu_CTSS <- function(object, max.dist, keepSingletonsAbove) {
   clusters <- reduce(GRanges(object), min = max.dist)
-  clusters <- .ctss_summary_for_clusters(object, clusters,
-                                         keepSingletonsAbove = keepSingletonsAbove)
+  clusters <- .ctss_summary_for_clusters(object, clusters)
+  # Remove clusters that match only one CTSS unless their expression is high enough
+  clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
   names(clusters) <- seq_along(clusters)
   as(clusters, "TagClusters")
 }
diff --git a/R/ExportMethods.R b/R/ExportMethods.R
index 2c9f794..770579c 100644
--- a/R/ExportMethods.R
+++ b/R/ExportMethods.R
@@ -557,17 +557,10 @@ function( object, what, qLow, qUp, colorByExpressionProfile, oneTrack) {
 
 
 .exportToTrack_clusters <- function( object, what, qLow, qUp, colorByExpressionProfile, oneTrack) {
-  # Simplify this after the format of dominant_ctss is standardised in all cluster objects.
-  ranges_ <- function(x) {
-    if (inherits(x, "GRanges")) return(IRanges(ranges(x)))
-    IRanges(x)
-  }
-  object$thick <- ranges_(object$dominant_ctss)
+  object$thick <- IRanges(ranges(object$dominant_ctss))
   object$dominant_ctss <- NULL
   names(object) <- NULL
   object$name <- NA
-  object$nr_ctss <- NULL
-  object$tpm.dominant_ctss <- NULL
   exportToTrack( GRanges(object), qLow = qLow, qUp = qUp
                  , colorByExpressionProfile = colorByExpressionProfile
                  , oneTrack = oneTrack)
diff --git a/R/Paraclu.R b/R/Paraclu.R
index c36d38c..d5fa179 100644
--- a/R/Paraclu.R
+++ b/R/Paraclu.R
@@ -150,9 +150,9 @@ setMethod("paraclu", "CTSS",
   clusters <- clusters[(clusters$max_d  >= (minStability * clusters$min_d)) &
                        (width(clusters) <= maxLength)]
   # Compute score and dominant CTSs, and remove singletons as wanted.
-  clusters <-
-    .ctss_summary_for_clusters( object, clusters
-                              , keepSingletonsAbove = keepSingletonsAbove)
+  clusters <- .ctss_summary_for_clusters(object, clusters)
+  # Remove clusters that match only one CTSS unless their expression is high enough
+  clusters <- subset(clusters, clusters$nr_ctss > 1 | score(clusters) >= keepSingletonsAbove)
   # Reduce to non-overlapping as wanted
   if(reduceToNonoverlapping == TRUE){
     o <- findOverlaps(clusters, drop.self = TRUE, type = "within")