Fix QC karyotype plot for mm10
weber8thomas committed Feb 8, 2024
1 parent 3473e3e commit b6bf5c1
Showing 22 changed files with 305,962 additions and 12 deletions.
30,583 changes: 30,583 additions & 0 deletions workflow/data/normalization/T2T/HGSVC.100000.txt

15,285 changes: 15,285 additions & 0 deletions workflow/data/normalization/T2T/HGSVC.200000.txt

61,110 changes: 61,110 additions & 0 deletions workflow/data/normalization/T2T/HGSVC.50000.txt

30,377 changes: 30,377 additions & 0 deletions workflow/data/normalization/hg19/HGSVC.100000.txt

15,193 changes: 15,193 additions & 0 deletions workflow/data/normalization/hg19/HGSVC.200000.txt

60,740 changes: 60,740 additions & 0 deletions workflow/data/normalization/hg19/HGSVC.50000.txt

File renamed without changes.
26,351 changes: 26,351 additions & 0 deletions workflow/data/normalization/mm10/HGSVC.100000.txt

13,182 changes: 13,182 additions & 0 deletions workflow/data/normalization/mm10/HGSVC.200000.txt

52,686 changes: 52,686 additions & 0 deletions workflow/data/normalization/mm10/HGSVC.50000.txt

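These normalization tables share one layout. Judging from the assertions in the scripts added below, each HGSVC.<binsize>.txt file is a tab-separated table with chrom, start, end, scalar and class columns; a minimal loading sketch (the mm10 100 kb path is taken from the file list above, the column check mirrors merge-blacklist.py):

import pandas as pd

# Load one of the new normalization tables (assumed TSV with a header line).
norm = pd.read_csv("workflow/data/normalization/mm10/HGSVC.100000.txt", sep="\t")

# The downstream scripts assert exactly these columns.
assert set(norm.columns) == {"chrom", "start", "end", "scalar", "class"}
print(norm.head())
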
15 changes: 11 additions & 4 deletions workflow/rules/common.smk
@@ -298,11 +298,17 @@ class HandleInput:
                 "strandphaser",
             ]

-            for sample in [
+            l_to_process = [
                 e
                 for e in os.listdir(thisdir)
                 if e not in exclude and e.endswith(".zip") is False
-            ]:
+            ]
+            if config["samples_to_process"]:
+                l_to_process = [
+                    e for e in l_to_process if e in config["samples_to_process"]
+                ]
+
+            for sample in l_to_process:
                 # Create a list of files to process for each sample
                 l_files_all = [
                     f
@@ -313,12 +319,13 @@
                     )
                     if f.endswith(ext)
                 ]
+                # print(l_files_all)

                 for f in l_files_all:
                     if len(f.split("_")) == 4:
                         assert (
                             len(f.split("_")) != 4
-                        ), "Your file name is using 4 times the '_' character, which is currently not supported by ashleys-qc, please rename your files"
+                        ), "Your file name is using 3 times the '_' character, which is currently not supported by ashleys-qc, please rename your files"

                 # print(l_files_all)
                 # Dataframe creation
@@ -528,7 +535,7 @@ def publishdir_fct(wildcards):
     Function to generate a list of files and directories for backup.
     """

-    print(config)
+    # print(config)

     list_files_to_copy = [
         e
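
For context, the new samples_to_process behavior boils down to an optional second filter over the sample folders; a minimal sketch (folder names and config values below are hypothetical): an empty samples_to_process list keeps every sample, a non-empty list restricts the run to the named ones.

# Sketch of the sample-filtering logic introduced in the hunk above.
exclude = ["config", "log", "strandphaser"]
listing = ["sampleA", "sampleB", "sampleC", "archive.zip", "log"]
config = {"samples_to_process": ["sampleB"]}

l_to_process = [e for e in listing if e not in exclude and e.endswith(".zip") is False]
if config["samples_to_process"]:
    l_to_process = [e for e in l_to_process if e in config["samples_to_process"]]

print(l_to_process)  # ['sampleB']; with samples_to_process == [], all three samples remain
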
104 changes: 104 additions & 0 deletions workflow/scripts/normalization/merge-blacklist.py
@@ -0,0 +1,104 @@
#!/usr/bin/env python
"""
Merge two blacklisted intervals if the distance between them is below a given distance.
"""

import sys
from argparse import ArgumentParser

import pandas as pd


def main():
    parser = ArgumentParser(prog="merge-blacklist.py", description=__doc__)
    parser.add_argument(
        "--merge_distance",
        default=500000,
        type=int,
        help="If the distance between two blacklisted intervals is below this threshold, they are merged.",
    )
    parser.add_argument("--output", default=None, help="Output file name")
    parser.add_argument(
        "--whitelist", default=None, help="TSV file with intervals to be removed from the blacklist (columns: chrom, start, end)."
    )
    parser.add_argument("--min_whitelist_interval_size", default=400000, type=int, help="Ignore whitelisted intervals below this size.")

    parser.add_argument("normalization", metavar="NORM", help="File (tsv) with normalization and blacklist data")

    args = parser.parse_args()

    print("Reading", args.normalization, file=sys.stderr)
    norm_table = pd.read_csv(args.normalization, sep="\t")

    assert set(norm_table.columns) == set(["chrom", "start", "end", "scalar", "class"])

    whitelist = None
    if args.whitelist is not None:
        whitelist = pd.read_csv(args.whitelist, sep="\t")
        assert set(whitelist.columns) == set(["chrom", "start", "end"])
        print("Read", len(whitelist), "whitelisted intervals from", args.whitelist, file=sys.stderr)
        whitelist = whitelist[whitelist.end - whitelist.start >= args.min_whitelist_interval_size]
        print(" -->", len(whitelist), "remained after removing intervals below", args.min_whitelist_interval_size, "bp", file=sys.stderr)

    # Merge blacklisted ("None") intervals that are closer than --merge_distance:
    # every bin between two nearby blacklisted bins becomes blacklisted as well.
    additional_blacklist = 0
    prev_blacklist_index = None
    prev_blacklist_chrom = None
    prev_blacklist_end = None
    for i in range(len(norm_table)):
        row = norm_table.iloc[i]
        # print('Processing row', i, ' -->', tuple(row), file=sys.stderr)
        # is row blacklisted?
        if row["class"] == "None":
            # print(' --> is black', file=sys.stderr)
            if (prev_blacklist_chrom == row["chrom"]) and (row["start"] - prev_blacklist_end <= args.merge_distance):
                # print(' --> black listing', prev_blacklist_index+1, 'to', i, file=sys.stderr)
                for j in range(prev_blacklist_index + 1, i):
                    norm_table.loc[[j], "class"] = "None"
                    row_j = norm_table.iloc[j]
                    additional_blacklist += row_j.end - row_j.start
            prev_blacklist_index = i
            prev_blacklist_chrom = row["chrom"]
            prev_blacklist_end = row["end"]

    print("Additionally blacklisted", additional_blacklist, "bp of sequence", file=sys.stderr)

    # Whitelisting: re-enable blacklisted bins that overlap a whitelisted interval.
    additional_whitelist = 0
    if whitelist is not None:
        for i in range(len(norm_table)):
            row = norm_table.iloc[i]
            if row["class"] == "None":
                if len(whitelist[(whitelist.chrom == row.chrom) & (row.start < whitelist.end) & (whitelist.start < row.end)]) > 0:
                    norm_table.loc[[i], "class"] = "good"
                    additional_whitelist += row.end - row.start

    print("White listing: Removed", additional_whitelist, "bp of sequence from the blacklist", file=sys.stderr)

    norm_table.to_csv(args.output, index=False, sep="\t")

    ## Identify "complex" intervals
    # segments = calls.groupby(by=['chrom','start','end']).sv_call_name.agg({'is_complex':partial(is_complex, ignore_haplotypes=args.ignore_haplotypes, min_cell_count=args.min_cell_count)}).reset_index().sort_values(['chrom','start','end'])

    ## merge complex segments if closer than args.merge_distance
    # complex_segments = pd.DataFrame(columns=['chrom','start','end'])
    # cur_chrom, cur_start, cur_end = None, None, None
    # for chrom, start, end in segments[segments.is_complex][['chrom','start','end']].values:
    #     if cur_chrom is None:
    #         cur_chrom, cur_start, cur_end = chrom, start, end
    #     elif (cur_chrom == chrom) and (start - cur_end < args.merge_distance):
    #         cur_end = end
    #     else:
    #         complex_segments = complex_segments.append({'chrom': cur_chrom, 'start': cur_start, 'end': cur_end}, ignore_index=True)
    #         cur_chrom, cur_start, cur_end = chrom, start, end
    # if cur_chrom is not None:
    #     complex_segments = complex_segments.append({'chrom': cur_chrom, 'start': cur_start, 'end': cur_end}, ignore_index=True)

    # print(complex_segments, file=sys.stderr)
    # total_complex = sum(complex_segments.end - complex_segments.start)

    # print('Total amount of complex sequence: {}Mbp'.format(total_complex/1000000), file=sys.stderr)
    # complex_segments[['chrom','start','end']].to_csv(sys.stdout, index=False, sep='\t')
    ## print(complex_segments, file=sys.stderr)


if __name__ == "__main__":
    main()
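
To see the merge step in action on a toy table (values below are made up; the real input is one of the HGSVC normalization TSVs): two blacklisted ("None") bins on the same chromosome that lie closer than --merge_distance pull every bin between them into the blacklist.

import pandas as pd

# Toy normalization table: the outer bins are blacklisted, the middle two are not.
norm = pd.DataFrame({
    "chrom": ["chr1", "chr1", "chr1", "chr1"],
    "start": [0, 100000, 200000, 300000],
    "end": [100000, 200000, 300000, 400000],
    "scalar": [1.0, 0.9, 1.1, 1.0],
    "class": ["None", "good", "good", "None"],
})

merge_distance = 500000
prev_index, prev_chrom, prev_end = None, None, None
for i in range(len(norm)):
    row = norm.iloc[i]
    if row["class"] == "None":
        # The gap to the previous blacklisted bin is small enough: blacklist it.
        if prev_chrom == row["chrom"] and row["start"] - prev_end <= merge_distance:
            for j in range(prev_index + 1, i):
                norm.loc[[j], "class"] = "None"
        prev_index, prev_chrom, prev_end = i, row["chrom"], row["end"]

print(norm["class"].tolist())  # ['None', 'None', 'None', 'None']
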
160 changes: 160 additions & 0 deletions workflow/scripts/normalization/normalize.R
@@ -0,0 +1,160 @@
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
suppressMessages(library(assertthat))


args <- commandArgs(trailingOnly = T)
if (length(args) != 4) {
    print("Usage: Rscript normalize.R <count table> <norm factors> <out> <apply normalization: True|False>")
    print("")
    print("   Normalize Strand-seq read counts. Divide the counts of all bins")
    print("   by a scaling factor (norm$scalar) and further black-list bins")
    print("   if requested in the normalization file (norm$class).")
    options(show.error.messages = F)
    stop()
}

# Read counts
message(" * Reading counts from ", args[1])
counts <- fread(paste("zcat", args[1]))
assert_that(
    is.data.table(counts),
    "chrom" %in% colnames(counts),
    "start" %in% colnames(counts),
    "end" %in% colnames(counts),
    "class" %in% colnames(counts),
    "sample" %in% colnames(counts),
    "cell" %in% colnames(counts),
    "w" %in% colnames(counts),
    "c" %in% colnames(counts)
) %>% invisible()
setkey(counts, chrom, start, end)

# Check that all cells have the same bins
bins <- unique(counts[, .(chrom, start, end)])
counts[,
    assert_that(all(.SD == bins), msg = "Not the same bins in all cells"),
    by = .(sample, cell),
    .SDcols = c("chrom", "start", "end")
] %>% invisible()


# remove bad cells
bad_cells <- counts[class == "None", .N, by = .(sample, cell)][N == nrow(bins)]
if (nrow(bad_cells) > 0) {
    message(" * Removing ", nrow(bad_cells), " cells because they were black-listed.")
    counts <- counts[!bad_cells, on = c("sample", "cell")]
}

# Check that the "None" bins are all the same across cells
none_bins <- unique(counts[!bad_cells, on = c("sample", "cell")][class == "None", .(chrom, start, end)])
if (nrow(none_bins) > 0) {
    counts[!bad_cells, on = c("sample", "cell")][class == "None",
        assert_that(all(.SD == none_bins), msg = "None bins are not the same in all cells (excl. bad cells)"),
        by = .(sample, cell),
        .SDcols = c("chrom", "start", "end")
    ] %>% invisible()
}


# Read normalization factors
message(" * Reading norm file from ", args[2])
norm <- fread(args[2])
assert_that(
    is.data.table(norm),
    "chrom" %in% colnames(norm),
    "start" %in% colnames(norm),
    "end" %in% colnames(norm),
    "scalar" %in% colnames(norm)
) %>% invisible()
if ("class" %in% colnames(norm)) {
    norm <- norm[, .(chrom, start, end, scalar, norm_class = class)]
} else {
    norm <- norm[, .(chrom, start, end, scalar, norm_class = "good")]
}
setkey(norm, chrom, start, end)

# Check normalization type from snakemake
if (args[4] == "False") {
    norm$scalar <- 1
}

# Set particular values of the norm_class to "None":
norm[scalar < 0.01, norm_class := "None"]


# annotate counts with scaling factor
counts <- merge(counts,
    norm,
    by = c("chrom", "start", "end"),
    all.x = T
)

if (any(is.na(counts$scalar))) {
    message(
        " * Assign scalars: Could not match ",
        unique(counts[, .(chrom, start, end, scalar)])[is.na(scalar), .N],
        " bins (out of ",
        unique(counts[, .(chrom, start, end)])[, .N],
        ") -> set those to 1"
    )
}

# Fill gaps in the norm file
counts[is.na(scalar), `:=`(scalar = 1, norm_class = "good")]

# Black-listing bins
test <- counts[!bad_cells, on = c("sample", "cell")][cell == unique(cell)[1]]
test <- test[, .(
    count_None = sum(class == "None"),
    norm_None = sum(norm_class == "None"),
    final_None = sum(class == "None" | norm_class == "None")
)]
message(" * ", test$count_None, " bins were already black-listed; ", test$norm_None, " are blacklisted via the normalization, leading to a total of ", test$final_None)
counts[norm_class == "None", class := "None"]



# Apply normalization factor
counts[, `:=`(c = as.numeric(c), w = as.numeric(w))]

counts[class != "None", `:=`(
    c = c * scalar,
    w = w * scalar
)]

# # Check normalization type from snakemake
# if (args[4] == "True") {
#     counts[class != "None", `:=`(
#         c = c * scalar,
#         w = w * scalar
#     )]
# } else {
#     scalar <- 1
# }



counts[class == "None", `:=`(c = 0.0, w = 0.0)]

message(
    " * Applying normalization: min = ",
    round(min(counts[class != "None", scalar]), 3),
    ", max = ",
    round(max(counts[class != "None", scalar]), 3),
    ", median = ",
    median(unique(counts[, .(chrom, start, end, class, scalar)][class != "None", scalar]))
)


# Remove helper columns
counts[, norm_class := NULL]
counts[, scalar := NULL]


# Write table to disk
message(" * Write data to ", args[3])
gz1 <- gzfile(args[3], "w")
write.table(counts, gz1, sep = "\t", quote = F, col.names = T, row.names = F)
close(gz1)
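
The scaling step itself is plain arithmetic; a pandas sketch of the equivalent operation (toy numbers, Python used for brevity): counts in usable bins are multiplied by the bin scalar, blacklisted bins are zeroed, exactly as the two data.table updates above do.

import pandas as pd

# Toy counts already merged with their per-bin scalars.
counts = pd.DataFrame({
    "w": [10.0, 20.0, 30.0],
    "c": [12.0, 18.0, 25.0],
    "scalar": [1.0, 0.5, 2.0],
    "class": ["good", "good", "None"],
})

usable = counts["class"] != "None"
# Scale Watson/Crick counts of usable bins by the bin scalar ...
counts.loc[usable, ["w", "c"]] = counts.loc[usable, ["w", "c"]].mul(counts.loc[usable, "scalar"], axis=0)
# ... and zero out blacklisted bins.
counts.loc[~usable, ["w", "c"]] = 0.0

print(counts[["w", "c"]].to_numpy().tolist())  # [[10.0, 12.0], [10.0, 9.0], [0.0, 0.0]]
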
8 changes: 4 additions & 4 deletions workflow/scripts/plotting/qc.R
@@ -156,9 +156,9 @@ if (add_overview_plot == T) {
     d_mv <- d[class != "None", .(mean = mean(w + c), var = var(w + c)), by = .(sample, cell)]
     print(d_mv)

-    if(nrow(d_mv) > 0) {
+    if (nrow(d_mv) > 0) {
         d_p <- d_mv[, .(p = sum(mean * mean) / sum(mean * var)), by = sample]
-
+
         ov_meanvar <- ggplot(d_mv) +
             geom_point(aes(mean, var), alpha = 0.4) +
             facet_wrap(~sample, nrow = 1) +
@@ -168,7 +168,7 @@ if (add_overview_plot == T) {
             ggtitle("Mean variance relationship of reads per bin") +
             xlab("Mean") +
             ylab("Variance")
-
+
         # Arranging overview plot
         content <- ggdraw() +
             draw_plot(ov_binsizes, x = 0, y = .66, width = .5, height = .33) +
@@ -177,7 +177,7 @@ if (add_overview_plot == T) {
             draw_plot(ov_meanvar, x = 0, y = 0, width = min(n_samples / 3, 1), height = .33)
     } else {
         print("d_mv is empty. Skipping the overview mean/variance plot...")
-
+
         # Arranging overview plot without ov_meanvar
         content <- ggdraw() +
             draw_plot(ov_binsizes, x = 0, y = .66, width = .5, height = .33) +