Fix QC karyotype plot for mm10
weber8thomas committed Feb 8, 2024
1 parent 3473e3e commit b6bf5c1
Showing 22 changed files with 305,962 additions and 12 deletions.
30,583 changes: 30,583 additions & 0 deletions workflow/data/normalization/T2T/HGSVC.100000.txt

15,285 changes: 15,285 additions & 0 deletions workflow/data/normalization/T2T/HGSVC.200000.txt

61,110 changes: 61,110 additions & 0 deletions workflow/data/normalization/T2T/HGSVC.50000.txt

30,377 changes: 30,377 additions & 0 deletions workflow/data/normalization/hg19/HGSVC.100000.txt

15,193 changes: 15,193 additions & 0 deletions workflow/data/normalization/hg19/HGSVC.200000.txt

60,740 changes: 60,740 additions & 0 deletions workflow/data/normalization/hg19/HGSVC.50000.txt

File renamed without changes.
26,351 changes: 26,351 additions & 0 deletions workflow/data/normalization/mm10/HGSVC.100000.txt

13,182 changes: 13,182 additions & 0 deletions workflow/data/normalization/mm10/HGSVC.200000.txt

52,686 changes: 52,686 additions & 0 deletions workflow/data/normalization/mm10/HGSVC.50000.txt

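These normalization tables share one layout. Judging from the assertions in the scripts added below, each HGSVC.<binsize>.txt file is a tab-separated table with chrom, start, end, scalar and class columns; a minimal loading sketch (the mm10 100 kb path is taken from the file list above, the column check mirrors merge-blacklist.py):

import pandas as pd

# Load one of the new normalization tables (assumed TSV with a header line).
norm = pd.read_csv("workflow/data/normalization/mm10/HGSVC.100000.txt", sep="\t")

# The downstream scripts assert exactly these columns.
assert set(norm.columns) == {"chrom", "start", "end", "scalar", "class"}
print(norm.head())
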
15 changes: 11 additions & 4 deletions workflow/rules/common.smk
@@ -298,11 +298,17 @@ class HandleInput:
                 "strandphaser",
             ]

-            for sample in [
+            l_to_process = [
                 e
                 for e in os.listdir(thisdir)
                 if e not in exclude and e.endswith(".zip") is False
-            ]:
+            ]
+            if config["samples_to_process"]:
+                l_to_process = [
+                    e for e in l_to_process if e in config["samples_to_process"]
+                ]
+
+            for sample in l_to_process:
                 # Create a list of files to process for each sample
                 l_files_all = [
                     f
@@ -313,12 +319,13 @@
                     )
                     if f.endswith(ext)
                 ]
+                # print(l_files_all)

                 for f in l_files_all:
                     if len(f.split("_")) == 4:
                         assert (
                             len(f.split("_")) != 4
-                        ), "Your file name is using 4 times the '_' character, which is currently not supported by ashleys-qc, please rename your files"
+                        ), "Your file name is using 3 times the '_' character, which is currently not supported by ashleys-qc, please rename your files"

                 # print(l_files_all)
                 # Dataframe creation
@@ -528,7 +535,7 @@ def publishdir_fct(wildcards):
     Function to generate a list of files and directories for backup.
     """

-    print(config)
+    # print(config)

     list_files_to_copy = [
         e
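
For context, the new samples_to_process behavior boils down to an optional second filter over the sample folders; a minimal sketch (folder names and config values below are hypothetical): an empty samples_to_process list keeps every sample, a non-empty list restricts the run to the named ones.

# Sketch of the sample-filtering logic introduced in the hunk above.
exclude = ["config", "log", "strandphaser"]
listing = ["sampleA", "sampleB", "sampleC", "archive.zip", "log"]
config = {"samples_to_process": ["sampleB"]}

l_to_process = [e for e in listing if e not in exclude and e.endswith(".zip") is False]
if config["samples_to_process"]:
    l_to_process = [e for e in l_to_process if e in config["samples_to_process"]]

print(l_to_process)  # ['sampleB']; with samples_to_process == [], all three samples remain
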
104 changes: 104 additions & 0 deletions workflow/scripts/normalization/merge-blacklist.py
@@ -0,0 +1,104 @@
#!/usr/bin/env python
"""
Merge two blacklisted intervals if the distance between them is below a given distance.
"""

import sys
from argparse import ArgumentParser

import pandas as pd


def main():
    parser = ArgumentParser(prog="merge-blacklist.py", description=__doc__)
    parser.add_argument(
        "--merge_distance",
        default=500000,
        type=int,
        help="If the distance between two blacklisted intervals is below this threshold, they are merged.",
    )
    parser.add_argument("--output", default=None, help="Output file name")
    parser.add_argument(
        "--whitelist", default=None, help="TSV file with intervals to be removed from the blacklist (columns: chrom, start, end)."
    )
    parser.add_argument("--min_whitelist_interval_size", default=400000, type=int, help="Ignore whitelisted intervals below this size.")

    parser.add_argument("normalization", metavar="NORM", help="File (tsv) with normalization and blacklist data")

    args = parser.parse_args()

    print("Reading", args.normalization, file=sys.stderr)
    norm_table = pd.read_csv(args.normalization, sep="\t")

    assert set(norm_table.columns) == set(["chrom", "start", "end", "scalar", "class"])

    whitelist = None
    if args.whitelist is not None:
        whitelist = pd.read_csv(args.whitelist, sep="\t")
        assert set(whitelist.columns) == set(["chrom", "start", "end"])
        print("Read", len(whitelist), "whitelisted intervals from", args.whitelist, file=sys.stderr)
        whitelist = whitelist[whitelist.end - whitelist.start >= args.min_whitelist_interval_size]
        print(" -->", len(whitelist), "remained after removing intervals below", args.min_whitelist_interval_size, "bp", file=sys.stderr)

    # Merge blacklisted ("None") intervals that are closer than --merge_distance:
    # every bin between two nearby blacklisted bins becomes blacklisted as well.
    additional_blacklist = 0
    prev_blacklist_index = None
    prev_blacklist_chrom = None
    prev_blacklist_end = None
    for i in range(len(norm_table)):
        row = norm_table.iloc[i]
        # print('Processing row', i, ' -->', tuple(row), file=sys.stderr)
        # is row blacklisted?
        if row["class"] == "None":
            # print(' --> is black', file=sys.stderr)
            if (prev_blacklist_chrom == row["chrom"]) and (row["start"] - prev_blacklist_end <= args.merge_distance):
                # print(' --> black listing', prev_blacklist_index+1, 'to', i, file=sys.stderr)
                for j in range(prev_blacklist_index + 1, i):
                    norm_table.loc[[j], "class"] = "None"
                    row_j = norm_table.iloc[j]
                    additional_blacklist += row_j.end - row_j.start
            prev_blacklist_index = i
            prev_blacklist_chrom = row["chrom"]
            prev_blacklist_end = row["end"]

    print("Additionally blacklisted", additional_blacklist, "bp of sequence", file=sys.stderr)

    # Whitelisting: re-enable blacklisted bins that overlap a whitelisted interval.
    additional_whitelist = 0
    if whitelist is not None:
        for i in range(len(norm_table)):
            row = norm_table.iloc[i]
            if row["class"] == "None":
                if len(whitelist[(whitelist.chrom == row.chrom) & (row.start < whitelist.end) & (whitelist.start < row.end)]) > 0:
                    norm_table.loc[[i], "class"] = "good"
                    additional_whitelist += row.end - row.start

    print("White listing: Removed", additional_whitelist, "bp of sequence from the blacklist", file=sys.stderr)

    norm_table.to_csv(args.output, index=False, sep="\t")

    ## Identify "complex" intervals
    # segments = calls.groupby(by=['chrom','start','end']).sv_call_name.agg({'is_complex':partial(is_complex, ignore_haplotypes=args.ignore_haplotypes, min_cell_count=args.min_cell_count)}).reset_index().sort_values(['chrom','start','end'])

    ## merge complex segments if closer than args.merge_distance
    # complex_segments = pd.DataFrame(columns=['chrom','start','end'])
    # cur_chrom, cur_start, cur_end = None, None, None
    # for chrom, start, end in segments[segments.is_complex][['chrom','start','end']].values:
    #     if cur_chrom is None:
    #         cur_chrom, cur_start, cur_end = chrom, start, end
    #     elif (cur_chrom == chrom) and (start - cur_end < args.merge_distance):
    #         cur_end = end
    #     else:
    #         complex_segments = complex_segments.append({'chrom': cur_chrom, 'start': cur_start, 'end': cur_end}, ignore_index=True)
    #         cur_chrom, cur_start, cur_end = chrom, start, end
    # if cur_chrom is not None:
    #     complex_segments = complex_segments.append({'chrom': cur_chrom, 'start': cur_start, 'end': cur_end}, ignore_index=True)

    # print(complex_segments, file=sys.stderr)
    # total_complex = sum(complex_segments.end - complex_segments.start)

    # print('Total amount of complex sequence: {}Mbp'.format(total_complex/1000000), file=sys.stderr)
    # complex_segments[['chrom','start','end']].to_csv(sys.stdout, index=False, sep='\t')
    ## print(complex_segments, file=sys.stderr)


if __name__ == "__main__":
    main()
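
To see the merge step in action on a toy table (values below are made up; the real input is one of the HGSVC normalization TSVs): two blacklisted ("None") bins on the same chromosome that lie closer than --merge_distance pull every bin between them into the blacklist.

import pandas as pd

# Toy normalization table: the outer bins are blacklisted, the middle two are not.
norm = pd.DataFrame({
    "chrom": ["chr1", "chr1", "chr1", "chr1"],
    "start": [0, 100000, 200000, 300000],
    "end": [100000, 200000, 300000, 400000],
    "scalar": [1.0, 0.9, 1.1, 1.0],
    "class": ["None", "good", "good", "None"],
})

merge_distance = 500000
prev_index, prev_chrom, prev_end = None, None, None
for i in range(len(norm)):
    row = norm.iloc[i]
    if row["class"] == "None":
        # The gap to the previous blacklisted bin is small enough: blacklist it.
        if prev_chrom == row["chrom"] and row["start"] - prev_end <= merge_distance:
            for j in range(prev_index + 1, i):
                norm.loc[[j], "class"] = "None"
        prev_index, prev_chrom, prev_end = i, row["chrom"], row["end"]

print(norm["class"].tolist())  # ['None', 'None', 'None', 'None']
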
160 changes: 160 additions & 0 deletions workflow/scripts/normalization/normalize.R
@@ -0,0 +1,160 @@
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
suppressMessages(library(assertthat))


args <- commandArgs(trailingOnly = T)
if (length(args) != 4) {
    print("Usage: Rscript normalize.R <count table> <norm factors> <out> <apply normalization: True|False>")
    print("")
    print("   Normalize Strand-seq read counts. Divide the counts of all bins")
    print("   by a scaling factor (norm$scalar) and further black-list bins")
    print("   if requested in the normalization file (norm$class).")
    options(show.error.messages = F)
    stop()
}

# Read counts
message(" * Reading counts from ", args[1])
counts <- fread(paste("zcat", args[1]))
assert_that(
    is.data.table(counts),
    "chrom" %in% colnames(counts),
    "start" %in% colnames(counts),
    "end" %in% colnames(counts),
    "class" %in% colnames(counts),
    "sample" %in% colnames(counts),
    "cell" %in% colnames(counts),
    "w" %in% colnames(counts),
    "c" %in% colnames(counts)
) %>% invisible()
setkey(counts, chrom, start, end)

# Check that all cells have the same bins
bins <- unique(counts[, .(chrom, start, end)])
counts[,
    assert_that(all(.SD == bins), msg = "Not the same bins in all cells"),
    by = .(sample, cell),
    .SDcols = c("chrom", "start", "end")
] %>% invisible()


# remove bad cells
bad_cells <- counts[class == "None", .N, by = .(sample, cell)][N == nrow(bins)]
if (nrow(bad_cells) > 0) {
    message(" * Removing ", nrow(bad_cells), " cells because they were black-listed.")
    counts <- counts[!bad_cells, on = c("sample", "cell")]
}

# Check that the "None" bins are all the same across cells
none_bins <- unique(counts[!bad_cells, on = c("sample", "cell")][class == "None", .(chrom, start, end)])
if (nrow(none_bins) > 0) {
    counts[!bad_cells, on = c("sample", "cell")][class == "None",
        assert_that(all(.SD == none_bins), msg = "None bins are not the same in all cells (excl. bad cells)"),
        by = .(sample, cell),
        .SDcols = c("chrom", "start", "end")
    ] %>% invisible()
}


# Read normalization factors
message(" * Reading norm file from ", args[2])
norm <- fread(args[2])
assert_that(
    is.data.table(norm),
    "chrom" %in% colnames(norm),
    "start" %in% colnames(norm),
    "end" %in% colnames(norm),
    "scalar" %in% colnames(norm)
) %>% invisible()
if ("class" %in% colnames(norm)) {
    norm <- norm[, .(chrom, start, end, scalar, norm_class = class)]
} else {
    norm <- norm[, .(chrom, start, end, scalar, norm_class = "good")]
}
setkey(norm, chrom, start, end)

# Check normalization type from snakemake
if (args[4] == "False") {
    norm$scalar <- 1
}

# Set particular values of the norm_class to "None":
norm[scalar < 0.01, norm_class := "None"]


# annotate counts with scaling factor
counts <- merge(counts,
    norm,
    by = c("chrom", "start", "end"),
    all.x = T
)

if (any(is.na(counts$scalar))) {
    message(
        " * Assign scalars: Could not match ",
        unique(counts[, .(chrom, start, end, scalar)])[is.na(scalar), .N],
        " bins (out of ",
        unique(counts[, .(chrom, start, end)])[, .N],
        ") -> set those to 1"
    )
}

# Fill gaps in the norm file
counts[is.na(scalar), `:=`(scalar = 1, norm_class = "good")]

# Black-listing bins
test <- counts[!bad_cells, on = c("sample", "cell")][cell == unique(cell)[1]]
test <- test[, .(
    count_None = sum(class == "None"),
    norm_None = sum(norm_class == "None"),
    final_None = sum(class == "None" | norm_class == "None")
)]
message(" * ", test$count_None, " bins were already black-listed; ", test$norm_None, " are blacklisted via the normalization, leading to a total of ", test$final_None)
counts[norm_class == "None", class := "None"]



# Apply normalization factor
counts[, `:=`(c = as.numeric(c), w = as.numeric(w))]

counts[class != "None", `:=`(
    c = c * scalar,
    w = w * scalar
)]

# # Check normalization type from snakemake
# if (args[4] == "True") {
#     counts[class != "None", `:=`(
#         c = c * scalar,
#         w = w * scalar
#     )]
# } else {
#     scalar <- 1
# }



counts[class == "None", `:=`(c = 0.0, w = 0.0)]

message(
    " * Applying normalization: min = ",
    round(min(counts[class != "None", scalar]), 3),
    ", max = ",
    round(max(counts[class != "None", scalar]), 3),
    ", median = ",
    median(unique(counts[, .(chrom, start, end, class, scalar)][class != "None", scalar]))
)


# Remove helper columns
counts[, norm_class := NULL]
counts[, scalar := NULL]


# Write table to disk
message(" * Write data to ", args[3])
gz1 <- gzfile(args[3], "w")
write.table(counts, gz1, sep = "\t", quote = F, col.names = T, row.names = F)
close(gz1)
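
The scaling step itself is plain arithmetic; a pandas sketch of the equivalent operation (toy numbers, Python used for brevity): counts in usable bins are multiplied by the bin scalar, blacklisted bins are zeroed, exactly as the two data.table updates above do.

import pandas as pd

# Toy counts already merged with their per-bin scalars.
counts = pd.DataFrame({
    "w": [10.0, 20.0, 30.0],
    "c": [12.0, 18.0, 25.0],
    "scalar": [1.0, 0.5, 2.0],
    "class": ["good", "good", "None"],
})

usable = counts["class"] != "None"
# Scale Watson/Crick counts of usable bins by the bin scalar ...
counts.loc[usable, ["w", "c"]] = counts.loc[usable, ["w", "c"]].mul(counts.loc[usable, "scalar"], axis=0)
# ... and zero out blacklisted bins.
counts.loc[~usable, ["w", "c"]] = 0.0

print(counts[["w", "c"]].to_numpy().tolist())  # [[10.0, 12.0], [10.0, 9.0], [0.0, 0.0]]
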
8 changes: 4 additions & 4 deletions workflow/scripts/plotting/qc.R
@@ -156,9 +156,9 @@ if (add_overview_plot == T) {
     d_mv <- d[class != "None", .(mean = mean(w + c), var = var(w + c)), by = .(sample, cell)]
     print(d_mv)

-    if(nrow(d_mv) > 0) {
+    if (nrow(d_mv) > 0) {
         d_p <- d_mv[, .(p = sum(mean * mean) / sum(mean * var)), by = sample]
-
+
         ov_meanvar <- ggplot(d_mv) +
             geom_point(aes(mean, var), alpha = 0.4) +
             facet_wrap(~sample, nrow = 1) +
@@ -168,7 +168,7 @@ if (add_overview_plot == T) {
             ggtitle("Mean variance relationship of reads per bin") +
             xlab("Mean") +
             ylab("Variance")
-
+
         # Arranging overview plot
         content <- ggdraw() +
             draw_plot(ov_binsizes, x = 0, y = .66, width = .5, height = .33) +
@@ -177,7 +177,7 @@ if (add_overview_plot == T) {
             draw_plot(ov_meanvar, x = 0, y = 0, width = min(n_samples / 3, 1), height = .33)
     } else {
         print("d_mv is empty. Skipping the overview mean/variance plot...")
-
+
         # Arranging overview plot without ov_meanvar
         content <- ggdraw() +
             draw_plot(ov_binsizes, x = 0, y = .66, width = .5, height = .33) +