Commit b6bf5c1 (1 parent: 3473e3e)
Showing 22 changed files with 305,962 additions and 12 deletions.
workflow/data/normalization/T2T/HGSVC.100000.txt: 30,583 additions, 0 deletions (large diff not rendered)
workflow/data/normalization/T2T/HGSVC.200000.txt: 15,285 additions, 0 deletions (large diff not rendered)
workflow/data/normalization/T2T/HGSVC.50000.txt: 61,110 additions, 0 deletions (large diff not rendered)
workflow/data/normalization/hg19/HGSVC.100000.txt: 30,377 additions, 0 deletions (large diff not rendered)
workflow/data/normalization/hg19/HGSVC.200000.txt: 15,193 additions, 0 deletions (large diff not rendered)
workflow/data/normalization/hg19/HGSVC.50000.txt: 60,740 additions, 0 deletions (large diff not rendered)
6 files renamed without changes.
workflow/data/normalization/mm10/HGSVC.100000.txt: 26,351 additions, 0 deletions (large diff not rendered)
workflow/data/normalization/mm10/HGSVC.200000.txt: 13,182 additions, 0 deletions (large diff not rendered)
workflow/data/normalization/mm10/HGSVC.50000.txt: 52,686 additions, 0 deletions (large diff not rendered)
@@ -0,0 +1,104 @@
#!/usr/bin/env python
"""
Merge blacklisted intervals if the distance between them is below a given threshold.
"""

import sys
from argparse import ArgumentParser
import pandas as pd


def main():
    parser = ArgumentParser(prog="merge-blacklist.py", description=__doc__)
    parser.add_argument(
        "--merge_distance",
        default=500000,
        type=int,
        help="If the distance between two blacklisted intervals is below this threshold, they are merged.",
    )
    parser.add_argument("--output", default=None, help="Output file name")
    parser.add_argument(
        "--whitelist",
        default=None,
        help="TSV file with intervals to be removed from the blacklist (columns: chrom, start, end).",
    )
    parser.add_argument(
        "--min_whitelist_interval_size",
        default=400000,
        type=int,
        help="Ignore whitelisted intervals below this size.",
    )

    parser.add_argument("normalization", metavar="NORM", help="File (TSV) with normalization and blacklist data")

    args = parser.parse_args()

    print("Reading", args.normalization, file=sys.stderr)
    norm_table = pd.read_csv(args.normalization, sep="\t")

    assert set(norm_table.columns) == set(["chrom", "start", "end", "scalar", "class"])

    # Read the whitelist (if given) and drop intervals below the minimum size.
    whitelist = None
    if args.whitelist is not None:
        whitelist = pd.read_csv(args.whitelist, sep="\t")
        assert set(whitelist.columns) == set(["chrom", "start", "end"])
        print("Read", len(whitelist), "whitelisted intervals from", args.whitelist, file=sys.stderr)
        whitelist = whitelist[whitelist.end - whitelist.start >= args.min_whitelist_interval_size]
        print(" -->", len(whitelist), "remained after removing intervals below", args.min_whitelist_interval_size, "bp", file=sys.stderr)

    # Merge blacklisted intervals: whenever two blacklisted bins on the same chromosome
    # are closer than --merge_distance, blacklist every bin in between as well.
    additional_blacklist = 0
    prev_blacklist_index = None
    prev_blacklist_chrom = None
    prev_blacklist_end = None
    for i in range(len(norm_table)):
        row = norm_table.iloc[i]
        # print('Processing row', i, ' -->', tuple(row), file=sys.stderr)
        # is row blacklisted?
        if row["class"] == "None":
            # print(' --> is black', file=sys.stderr)
            if (prev_blacklist_chrom == row["chrom"]) and (row["start"] - prev_blacklist_end <= args.merge_distance):
                # print(' --> black listing', prev_blacklist_index+1, 'to', i, file=sys.stderr)
                for j in range(prev_blacklist_index + 1, i):
                    norm_table.loc[[j], "class"] = "None"
                    row_j = norm_table.iloc[j]
                    additional_blacklist += row_j.end - row_j.start
            prev_blacklist_index = i
            prev_blacklist_chrom = row["chrom"]
            prev_blacklist_end = row["end"]

    print("Additionally blacklisted", additional_blacklist, "bp of sequence", file=sys.stderr)

    # Whitelisting: un-blacklist bins that overlap any remaining whitelisted interval.
    additional_whitelist = 0
    if whitelist is not None:
        for i in range(len(norm_table)):
            row = norm_table.iloc[i]
            if row["class"] == "None":
                # Overlap test: [row.start, row.end) intersects [whitelist.start, whitelist.end).
                if len(whitelist[(whitelist.chrom == row.chrom) & (row.start < whitelist.end) & (whitelist.start < row.end)]) > 0:
                    norm_table.loc[[i], "class"] = "good"
                    additional_whitelist += row.end - row.start

    print("Whitelisting: Removed", additional_whitelist, "bp of sequence from the blacklist", file=sys.stderr)

    norm_table.to_csv(args.output, index=False, sep="\t")

    ## Identify "complex" intervals
    # segments = calls.groupby(by=['chrom','start','end']).sv_call_name.agg({'is_complex':partial(is_complex, ignore_haplotypes=args.ignore_haplotypes, min_cell_count=args.min_cell_count)}).reset_index().sort_values(['chrom','start','end'])

    ## merge complex segments if closer than args.merge_distance
    # complex_segments = pd.DataFrame(columns=['chrom','start','end'])
    # cur_chrom, cur_start, cur_end = None, None, None
    # for chrom, start, end in segments[segments.is_complex][['chrom','start','end']].values:
    #     if cur_chrom is None:
    #         cur_chrom, cur_start, cur_end = chrom, start, end
    #     elif (cur_chrom == chrom) and (start - cur_end < args.merge_distance):
    #         cur_end = end
    #     else:
    #         complex_segments = complex_segments.append({'chrom': cur_chrom, 'start': cur_start, 'end': cur_end}, ignore_index=True)
    #         cur_chrom, cur_start, cur_end = chrom, start, end
    # if cur_chrom is not None:
    #     complex_segments = complex_segments.append({'chrom': cur_chrom, 'start': cur_start, 'end': cur_end}, ignore_index=True)

    # print(complex_segments, file=sys.stderr)
    # total_complex = sum(complex_segments.end - complex_segments.start)

    # print('Total amount of complex sequence: {}Mbp'.format(total_complex/1000000), file=sys.stderr)
    # complex_segments[['chrom','start','end']].to_csv(sys.stdout, index=False, sep='\t')
    ## print(complex_segments, file=sys.stderr)


if __name__ == "__main__":
    main()
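Both scripts operate on the same normalization table layout. A minimal sketch of that tab-separated file, with hypothetical coordinates and scaling factors chosen only to illustrate the columns the scripts check for (chrom, start, end, scalar, class; black-listed bins carry class "None"):

chrom   start    end      scalar  class
chr1    0        100000   1.02    good
chr1    100000   200000   0.00    None
chr1    200000   300000   0.97    good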
@@ -0,0 +1,160 @@
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
suppressMessages(library(assertthat))


args <- commandArgs(trailingOnly = T)
if (length(args) != 4) {
  print("Usage: Rscript scale.R <count table> <norm factors> <out> <normalize: True|False>")
  print("")
  print("  Normalize Strand-seq read counts. Divide the counts of all bins")
  print("  by a scaling factor (norm$scalar) and further black-list bins")
  print("  if requested in the normalization file (norm$class).")
  options(show.error.messages = F)
  stop()
}
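# Example invocation (hypothetical file names, matching the four positional
# arguments checked above; the fourth argument toggles whether the scaling
# factors are applied or forced to 1):
#   Rscript scale.R counts.txt.gz normalization.txt counts.normalized.txt.gz True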

# Read counts
message(" * Reading counts from ", args[1])
counts <- fread(paste("zcat", args[1]))
assert_that(
  is.data.table(counts),
  "chrom" %in% colnames(counts),
  "start" %in% colnames(counts),
  "end" %in% colnames(counts),
  "class" %in% colnames(counts),
  "sample" %in% colnames(counts),
  "cell" %in% colnames(counts),
  "w" %in% colnames(counts),
  "c" %in% colnames(counts)
) %>% invisible()
setkey(counts, chrom, start, end)

# Check that all cells have the same bins
bins <- unique(counts[, .(chrom, start, end)])
counts[,
  assert_that(all(.SD == bins), msg = "Not the same bins in all cells"),
  by = .(sample, cell),
  .SDcols = c("chrom", "start", "end")
] %>% invisible()


# Remove bad cells, i.e. cells in which every bin is black-listed
bad_cells <- counts[class == "None", .N, by = .(sample, cell)][N == nrow(bins)]
if (nrow(bad_cells) > 0) {
  message(" * Removing ", nrow(bad_cells), " cells because they were black-listed.")
  counts <- counts[!bad_cells, on = c("sample", "cell")]
}

# Check that the "None" bins are all the same across cells
none_bins <- unique(counts[!bad_cells, on = c("sample", "cell")][class == "None", .(chrom, start, end)])
if (nrow(none_bins) > 0) {
  counts[!bad_cells, on = c("sample", "cell")][class == "None",
    assert_that(all(.SD == none_bins), msg = "None bins are not the same in all cells (excl. bad cells)"),
    by = .(sample, cell),
    .SDcols = c("chrom", "start", "end")
  ] %>% invisible()
}


# Read normalization factors
message(" * Reading norm file from ", args[2])
norm <- fread(args[2])
assert_that(
  is.data.table(norm),
  "chrom" %in% colnames(norm),
  "start" %in% colnames(norm),
  "end" %in% colnames(norm),
  "scalar" %in% colnames(norm)
) %>% invisible()
if ("class" %in% colnames(norm)) {
  norm <- norm[, .(chrom, start, end, scalar, norm_class = class)]
} else {
  norm <- norm[, .(chrom, start, end, scalar, norm_class = "good")]
}
setkey(norm, chrom, start, end)

# Check normalization type from snakemake: if normalization is disabled,
# keep all scaling factors at 1 so that only the black-listing below takes effect
if (args[4] == "False") {
  norm$scalar <- 1
}

# Black-list bins with a very small scaling factor
norm[scalar < 0.01, norm_class := "None"]


# Annotate counts with scaling factor
counts <- merge(counts,
  norm,
  by = c("chrom", "start", "end"),
  all.x = T
)

if (any(is.na(counts$scalar))) {
  message(
    " * Assign scalars: Could not match ",
    unique(counts[, .(chrom, start, end, scalar)])[is.na(scalar), .N],
    " bins (out of ",
    unique(counts[, .(chrom, start, end)])[, .N],
    ") -> set those to 1"
  )
}

# Fill gaps in the norm file
counts[is.na(scalar), `:=`(scalar = 1, norm_class = "good")]

# Black-listing bins
test <- counts[!bad_cells, on = c("sample", "cell")][cell == unique(cell)[1]]
test <- test[, .(
  count_None = sum(class == "None"),
  norm_None = sum(norm_class == "None"),
  final_None = sum(class == "None" | norm_class == "None")
)]
message(" * ", test$count_None, " bins were already black-listed; ", test$norm_None, " are black-listed via the normalization, leading to a total of ", test$final_None)
counts[norm_class == "None", class := "None"]


# Apply normalization factor
counts[, `:=`(c = as.numeric(c), w = as.numeric(w))]

counts[class != "None", `:=`(
  c = c * scalar,
  w = w * scalar
)]

# # Check normalization type from snakemake
# if (args[4] == "True") {
#   counts[class != "None", `:=`(
#     c = c * scalar,
#     w = w * scalar
#   )]
# } else {
#   scalar <- 1
# }


counts[class == "None", `:=`(c = 0.0, w = 0.0)]

message(
  " * Applying normalization: min = ",
  round(min(counts[class != "None", scalar]), 3),
  ", max = ",
  round(max(counts[class != "None", scalar]), 3),
  ", median = ",
  median(unique(counts[, .(chrom, start, end, class, scalar)][class != "None", scalar]))
)


# Remove helper columns
counts[, norm_class := NULL]
counts[, scalar := NULL]


# Write down table
message(" * Write data to ", args[3])
gz1 <- gzfile(args[3], "w")
write.table(counts, gz1, sep = "\t", quote = F, col.names = T, row.names = F)
close(gz1)