diff --git a/config_hpf.yaml b/config_hpf.yaml index cc2ff4f..f93b38a 100644 --- a/config_hpf.yaml +++ b/config_hpf.yaml @@ -23,10 +23,11 @@ tools: cre: "~/cre" crg: "~/crg" crg2: "~/crg2" - ehdn: "/hpf/largeprojects/ccmbio/arun/Tools/EHDN.TCAG/ExpansionHunterDenovo-v0.7.0" + ehdn: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/EHDN.TCAG/ExpansionHunterDenovo-v0.7.0" mity: "/hpf/largeprojects/ccmbio/ajain/mity/mity_package/bin" melt: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/MELT/MELTv2.2.2/MELT.jar" orad: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/orad_2_6_1/orad" + annovar: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/annovar" ref: @@ -123,6 +124,14 @@ annotation: 1000g: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunter/1000G_EH_v1.0.tsv" ehdn: files: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/" + g1k_outlier: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/1000G_outlier" + g1k_manifest: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/manifest.1000G.txt" + g1k_samples: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/1000G.samples.txt" + trf: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/UCSC_simple_repeats_hg19_coord_motif.tsv" + omim: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/OMIM_hgnc_join_omim_phenos_2023-06-22.tsv" + gnomad: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo//gnomad.v2.1.1.lof_metrics.by_gene.txt" + annovar_db: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/annovar/humandb/" + cre: database_path: "/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/variation/" melt: diff --git a/documentation/str.md b/documentation/str.md new file mode 100644 index 0000000..7b215ac --- /dev/null +++ b/documentation/str.md @@ -0,0 +1,45 @@ +# ExpansionHunterDenovo version: + EHDN v0.7.0 is no longer available from GitHub or conda, and the 1000G profiles from Brett Trost are based on it, so this version is kept; the tool path in `config.yaml` is updated to + `config['tools']['ehdn']: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/EHDN.TCAG/ExpansionHunterDenovo-v0.7.0"` + +# Description of scripts used before the changes: +## BASH and R scripts: + +1. scripts are fetched from the `crg/str` repo +2. ~/crg/crg.ehdn.sh: runs EHDN; includes a number of checks and searches for the relevant BAM files based on run type (exome/genome) and the cre/crg/crg2 pipeline directory structures +3. ~/crg/ehdn_report.sh: main script that uses the R scripts from ~/crg/str and the 1000G EHDN profiles, and generates the report + - ~/crg/str/DBSCAN.EHdn.parallel.R: library(dbscan), library(ggplot2), library(data.table), library(parallel), library(doSNOW) + - ~/crg/str/mergeExpansions.R: library(data.table), library(GenomicRanges), library(ggplot2), library(cowplot), library(Biostrings) + + +## Python scripts: +1. ~/crg/str/compare_anchored_irrs.py: depends on "core" package present locally; takes the EHDN manifest (format shown below) +2. ~/crg/str/find_outliers.py: numpy, matplotlib +3. ~/crg/str/generate_EH_genotype_table.generic.py: argparse, glob, BTlib(local), collections, pandas, path +4. ~/crg/str/combine_counts.py: argparse, json, core(local) +5. ~/crg/str/BTlib.py: re, statistics, string, random, itertools, copy, collections, numpy, pandas, docx +6. ~/crg/str/add_gene+threshold_to_EH_column_headings2.py: argparse +7. ~/crg/str/format_for_annovar.py: collections, re +8. ~/crg/str/format_from_annovar.py: pandas, xlsxwriter
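+
+Both `combine_counts.py` and `compare_anchored_irrs.py` take a `--manifest` file with one tab-separated line per sample: sample id, case/control status, and the path to that sample's EHDN JSON profile. `crg.ehdn.sh` writes each family sample as `case` and then concatenates `manifest.1000G.txt` (the 1000G background profiles). A minimal illustration (the sample IDs, the `control` label on the 1000G line, and the paths are placeholders):
+
+    `NA12878    case    str/EHDN/FAM1_NA12878.json`
+    `HG00096    control    /path/to/1000G_JSON/HG00096.json`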
+ + +# Summary of changes related to issue 142: +1. Moved all required scripts from `~/crg` and `~/cre` to `~/crg2/scripts/str` +2. Removed hard-coded paths from the STR-related scripts and added them to `config.yaml`, to be passed as arguments via snakemake params. +3. Moved the following annotations and tools to /hpf/largeprojects/ccm_dccforge/dccdipg/Common/: + annotation/ExpansionHunterDenovo/UCSC_simple_repeats_hg19_coord_motif.tsv + crg2-non-conda-tools/EHDN.TCAG/ExpansionHunterDenovo-v0.7.0 + annotation/ExpansionHunterDenovo/1000G_JSON + updated annotation/ExpansionHunterDenovo/manifest.1000G.txt +4. Split the single "EHdn_report" rule into four rules, with a separate env for each: + - EHDN_mark_outliers + - EHDN_DBSCAN_outlier + - EHDN_merge_expansions + - EHDN_annovar +5. Created `envs/ehdn-dbscan.yaml` for rules "EHDN_DBSCAN_outlier" and "EHDN_merge_expansions". The versions of the R packages are not the same as on `ccmmarvin`; constraining them to the versions available on `ccmmarvin` makes conda fail to create the env due to conflicts (the `ccmmarvin` versions were installed manually in 2020, not via conda). +6. Fixed minor bugs and removed unwanted yaml. +7. Edited the above R scripts to remove hard-coded file paths, add command-line arguments, and drop the date suffix from output names, since Snakemake requires output names to be known before execution (using dynamic only works if all outputs of a rule are dynamic). +8. Tested with NA12878 + + + diff --git a/envs/ehdn-dbscan.yaml b/envs/ehdn-dbscan.yaml new file mode 100644 index 0000000..5e280d0 --- /dev/null +++ b/envs/ehdn-dbscan.yaml @@ -0,0 +1,12 @@ +channels: + - bioconda + - conda-forge +dependencies: + - r + - r-dbscan=1.1-8 + - r-ggplot2 + - r-data.table + - r-doSNOW=1.0.19 + - r-cowplot + - bioconductor-genomicranges + - bioconductor-biostrings diff --git a/envs/ehdn-report.yaml b/envs/ehdn-report.yaml index 719bfb0..e39c081 100644 --- a/envs/ehdn-report.yaml +++ b/envs/ehdn-report.yaml @@ -2,12 +2,20 @@ channels: - bioconda - conda-forge dependencies: + - python=3.7.8 + - python-docx=0.8.10 + - pandas=1.1.2 + - numpy=1.19.1 + - scipy=1.5.2 + - xlsxwriter=1.3.7 + - r-base=3.4.1 - r-dbscan=1.1.5 - - r-ggplot2 - - r-data.table - - r-doSNOW - - r-cowplot - - bioconductor-genomicranges - - bioconductor-genomeinfodb - - bioconductor-genomeinfodbdata - - bioconductor-biostrings \ No newline at end of file + - r-ggplot2=3.1.0 + - r-data.table=1.11.4 + - r-doSNOW=1.0.20 + - r-cowplot=0.9.3 + - bioconductor-genomicranges=1.32.7 + - bioconductor-genomeinfodb=1.16.0 + - bioconductor-genomeinfodbdata=1.1.0 + - bioconductor-biostrings=2.48.0 + diff --git a/envs/ehdn.yaml b/envs/ehdn.yaml index 3e0bfa6..b1a482e 100644 --- a/envs/ehdn.yaml +++ b/envs/ehdn.yaml @@ -3,4 +3,5 @@ channels: - conda-forge dependencies: - python =3.7.7 - - scipy =1.7.3 \ No newline at end of file + - scipy =1.7.3 + - numpy \ No newline at end of file diff --git a/envs/ehdn_outlier.yaml b/envs/ehdn_outlier.yaml new file mode 100644 index 0000000..f3f321f --- /dev/null +++ b/envs/ehdn_outlier.yaml @@ -0,0 +1,10 @@ +channels: + - bioconda + - conda-forge +dependencies: + - python=3.7.8 + - python-docx=0.8.10 + - pandas=1.1.2 + - numpy + - matplotlib + - xlsxwriter=1.3.7 \ No newline at end of file diff --git a/rules/str.smk b/rules/str.smk index 8adfaca..1ad12a7 100644 --- a/rules/str.smk +++ 
b/rules/str.smk @@ -8,7 +8,7 @@ rule EH: bam = "str/EH/{family}_{sample}_realigned.bam" params: ref = config["ref"]["genome"], - sex = lambda w: "`sh {}/scripts/str_helper.sh decoy_rm/{}_{}.no_decoy_reads.bam`".format(workflow.basedir, w.family, w.sample), + sex = lambda w: "`sh {}/scripts/str/str_helper.sh decoy_rm/{}_{}.no_decoy_reads.bam`".format(workflow.basedir, w.family, w.sample), catalog = config["annotation"]["eh"]["catalog"] log: "logs/str/{family}_{sample}-EH.log" @@ -31,28 +31,28 @@ rule EH_report: conda: "../envs/eh-report.yaml" shell: - ''' - echo "generating multi-sample genotypes" > {log} - python {params.crg2}/scripts/generate_EH_genotype_table.generic.py str/EH > {output.tsv} - echo "annotating gene name & size threshold" > {log} - python {params.crg2}/scripts/add_gene+threshold_to_EH_column_headings2.py {output.tsv} {params.trf} > {output.annot} - echo "generating final xlsx file" > {log} - python {params.crg2}/scripts/eh_sample_report.py {output.annot} {params.g1000} {output.xlsx} + """ + echo "generating multi-sample genotypes" >> {log} + python {params.crg2}/scripts/str/generate_EH_genotype_table.generic.py str/EH > {output.tsv} + echo "annotating gene name & size threshold" >> {log} + python {params.crg2}/scripts/str/add_gene+threshold_to_EH_column_headings2.py {output.tsv} {params.trf} > {output.annot} + echo "generating final xlsx file" >> {log} + python {params.crg2}/scripts/str/eh_sample_report.py {output.annot} {params.g1000} {output.xlsx} prefix=`echo {output.xlsx} | awk '{{split($1,a,".xlsx"); print a[1]; }}'`; d=`date +%Y-%m-%d` outfile="${{prefix}}.${{d}}.xlsx"; - echo "Copying final report to filaname with timestamp: $outfile" > {log} + echo "Copying final report to filaname with timestamp: $outfile" >> {log} cp {output.xlsx} $outfile - ''' + """ rule EHdn: input: expand("decoy_rm/{family}_{sample}.no_decoy_reads.bam",family=config["run"]["project"], sample=samples.index) output: json = "str/EHDN/{family}_EHDN_str.tsv" params: - crg = config["tools"]["crg"], + crg2 = config["tools"]["crg2"], ehdn = config["tools"]["ehdn"], - ehdn_files = config["annotation"]["ehdn"]["files"], + g1k_manifest = config["annotation"]["ehdn"]["g1k_manifest"], ref = config["ref"]["genome"], # ref = config["ref"]["genome"], # irr_mapq = config["params"]["EHDN"]["irr_mapq"], @@ -63,27 +63,107 @@ rule EHdn: "../envs/ehdn.yaml" shell: ''' - EHDN={params.ehdn} EHDN_files={params.ehdn_files} ref={params.ref} script_dir={params.crg} sh {params.crg}/crg.ehdn.sh {wildcards.family} crg2 + EHDN={params.ehdn} g1k_manifest={params.g1k_manifest} ref={params.ref} script_dir={params.crg2}/scripts/str sh {params.crg2}/scripts/str/crg.ehdn.sh {wildcards.family} crg2 ''' -rule EHdn_report: +#rule EHdn_report: +# input: "str/EHDN/{family}_EHDN_str.tsv".format(family=config["run"]["project"]) +# output: "report/str/{family}.EHDN.xlsx" +# log: "logs/str/{family}_EHdn_report.log" +# params: +# repdir = "report/str", +# crg2 = config["tools"]["crg2"], +# family = config["run"]["project"], +# outdir = "str/EHDN", +# conda: "../envs/ehdn-report.yaml" +# shell: +# ''' +# export PATH="/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/bin/:$PATH" +# sh {params.crg2}/scripts/str/ehdn_report.sh {params.family} {params.outdir} +# date=`date +%Y-%m-%d`; +# f={params.outdir}/outliers/{params.family}.EHDN.${{date}}.xlsx; +# if [ -f $f ]; then +# if [ ! 
-d {params.repdir} ]; then mkdir -p {params.repdir}; fi; +# mv $f {params.repdir} +# cp {params.repdir}/{params.family}.EHDN.${{date}}.xlsx {params.repdir}/{params.family}.EHDN.xlsx +# else exit; fi; +# ''' + +rule EHDN_mark_outliers: input: "str/EHDN/{family}_EHDN_str.tsv".format(family=config["run"]["project"]) - output: "report/str/{family}.EHDN.xlsx" - log: "logs/str/{family}_EHdn_report.log" + output: "str/EHDN/{family}_outliers.txt".format(family=config["run"]["project"]) params: - repdir = "report/str", - crg = config["tools"]["crg"], - family = config["run"]["project"], - outdir = "str/EHDN", + crg2 = config["tools"]["crg2"], + g1k_outlier = config["annotation"]["ehdn"]["g1k_outlier"] + conda: + "../envs/ehdn.yaml" shell: - ''' - PATH="/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/bin/:$PATH" - sh {params.crg}/ehdn_report.sh {params.family} {params.outdir} - date=`date +%Y-%m-%d`; - f={params.outdir}/outliers/{params.family}.EHDN.${{date}}.xlsx; - if [ -f $f ]; then - if [ ! -d {params.repdir} ]; then mkdir -p {params.repdir}; fi; - mv $f {params.repdir} - cp {params.repdir}/{params.family}.EHDN.${{date}}.xlsx {params.repdir}/{params.family}.EHDN.xlsx - else exit; fi; - ''' + """ + python {params.crg2}/scripts/str/find_outliers.py {input} {output} + cat {output} {params.g1k_outlier} > temp && mv temp {output} + """ + +rule EHDN_DBSCAN_outlier: + input: + profile = "str/EHDN/{family}_EHDN_str.tsv".format(family=config["run"]["project"]), + outliers = "str/EHDN/{family}_outliers.txt".format(family=config["run"]["project"]), + output: + clean="str/EHDN/outliers/clean.samples.txt", + expansions="str/EHDN/outliers/EHdn.expansions.tsv", + tr="str/EHDN/outliers/samples.with.TRcount.txt" + + params: + outdir="str/EHDN/outliers", + crg2=config["tools"]["crg2"], + trf=config["annotation"]["ehdn"]["trf"], + g1k_samples=config["annotation"]["ehdn"]["g1k_samples"] + conda: "../envs/ehdn-dbscan.yaml" + shell: + """ + Rscript {params.crg2}/scripts/str/DBSCAN.EHdn.parallel.R --infile {input.profile} --outpath {params.outdir} --outlierlist {input.outliers} --a1000g {params.g1k_samples} --exp {output.expansions} + echo "Finished running DBSCAN.EHdn.parallel.R"; + echo "`ls {params.outdir}`"; + + """ + +rule EHDN_merge_expansions: + input: "str/EHDN/outliers/EHdn.expansions.tsv", "str/EHDN/{family}_EHDN_str.tsv".format(family=config["run"]["project"]) + output: "str/EHDN/outliers/merged.rare.expansions.tsv","str/EHDN/outliers/merged.rare.expansions.forannotation.tsv", + "str/EHDN/outliers/map.TRF.EHdn.0.66.tsv", "str/EHDN/outliers/merged.ehdn.tsv", + params: + outdir="str/EHDN/outliers", + crg2=config["tools"]["crg2"], + trf=config["annotation"]["ehdn"]["trf"], + g1k_samples=config["annotation"]["ehdn"]["g1k_samples"] + conda: "../envs/ehdn-dbscan.yaml" + shell: + """ + Rscript {params.crg2}/scripts/str/mergeExpansions.R --rscript {params.crg2}/scripts/str/ExpansionAnalysisFunctions.R --ehdn {input[1]} --trf {params.trf} --outlier {input[0]} --outpath {params.outdir} + """ + + +rule EHDN_annovar: + input: + merged_exp = "str/EHDN/outliers/merged.rare.expansions.tsv", + ehdn_exp = "str/EHDN/outliers/EHdn.expansions.tsv" + output: + merged_rare_exp = "str/EHDN/outliers/merged.rare.EHdn.expansion.tsv", + omim_out = "str/EHDN/outliers/merged.rare.EHdn.expansion-OMIM.hg19_multianno.txt", + gnomad_out = "str/EHDN/outliers/merged.rare.EHdn.expansion-gnoMAD.hg19_multianno.txt", + xlsx="report/str/{family}.EHDN.xlsx".format(family=config["run"]["project"]) + params: + crg2 = config["tools"]["crg2"], + 
g1k_manifest = config["annotation"]["ehdn"]["g1k_manifest"], + annovar = config["tools"]["annovar"], + annovar_db = config["annotation"]["ehdn"]["annovar_db"], + omim = config["annotation"]["ehdn"]["omim"], + gnomad = config["annotation"]["ehdn"]["gnomad"], + prefix = "str/EHDN/outliers/merged.rare.EHdn.expansion" + conda: "../envs/eh-report.yaml" + shell: + """ + python {params.crg2}/scripts/str/format_for_annovar.py {input.ehdn_exp} {input.merged_exp} {params.g1k_manifest} + {params.annovar}/table_annovar.pl {output.merged_rare_exp} {params.annovar_db} -buildver hg19 -outfile {params.prefix}"-OMIM" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . -xreffile {params.omim} + {params.annovar}/table_annovar.pl {output.merged_rare_exp} {params.annovar_db} -buildver hg19 -outfile {params.prefix}"-gnoMAD" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . -xreffile {params.gnomad} + python {params.crg2}/scripts/str/format_from_annovar.py {output.gnomad_out} {output.omim_out} {output.xlsx} + """ \ No newline at end of file diff --git a/scripts/BTlib.py b/scripts/str/BTlib.py similarity index 100% rename from scripts/BTlib.py rename to scripts/str/BTlib.py diff --git a/scripts/str/DBSCAN.EHdn.parallel.R b/scripts/str/DBSCAN.EHdn.parallel.R new file mode 100755 index 0000000..b39eede --- /dev/null +++ b/scripts/str/DBSCAN.EHdn.parallel.R @@ -0,0 +1,170 @@ +rm(list = ls()) +#### to run +#### Rscript DBSCAN.EHdn.parallel.R --infile /hpf/largeprojects/tcagstor/users/btrost/papers/STRs/Qatar_ASD/output_regions.min2.1000G+SSC+MSSNG+QASD.txt +#### --outpath output/ --a1000g /hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/1000G.samples.txt --samplelist samples.txt --outlierlist outliers.txt +#### a1000g <- readLines("/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/1000G.samples.txt") + +###("/hpf/largeprojects/tcagstor/users/worrawat/Expansion/1000G.samples.txt") + +#### read arguments from command line +args = commandArgs(trailingOnly=TRUE) +if(length(args) < 2){ + stop(call. = T, "Require more argument(s)") +} + +paramNames <- grep("--", args) +paramValues <- paramNames + 1 + +if(length(grep("--", args[paramValues])) > 0){ + stop(call. 
= T, "Failed in reading arguments, '--' found in argument value") +} + +if(length(args) != length(c(paramNames, paramValues))){ + message(sprintf("Ignore argument(s):%s", paste(args[-union(paramNames, paramValues)], collapse=","))) +} + +params <- list() +paramNames <- gsub("--", "", args[paramNames]) +for(i in 1:length(paramNames)){ + params[paramNames[i]] <- args[paramValues[i]] +} + +library(dbscan) +library(ggplot2) +library(data.table) + +set.seed(2000) +a1000g <- readLines(params$a1000g) +outpath <- params$outpath +if(length(grep("\\/$", outpath)) == 0){ + outpath <- paste0(outpath, "/") +} + +if(!dir.exists(outpath)){ + dir.create(outpath, recursive = T) +} + +infile <- params$infile +message(sprintf("reading %s##", infile)) +dt <- fread(infile) +dt <- data.frame(dt) +dt$varid <- paste(dt$V1, dt$V2, dt$V3, sep="_") + +### get count per sample +sample.count <- paste(dt$V7, collapse = ",") +sample.count <- gsub(":", ",", sample.count) +sample.count <- strsplit(sample.count, ",")[[1]] +sample.count <- matrix(sample.count, ncol = 2, byrow = T) +sample.count <- data.frame(table(sample.count[, 1]), stringsAsFactors = F) +write.table(sample.count, + sprintf("%ssamples.with.TRcount.txt", outpath), sep="\t", row.names=F, quote=F, col.names=T) + +if("samplelist" %in% names(params)){ + samples <- readLines(params$samplelist) + sample.count <- sample.count[sample.count$Var1 %in% samples, ] +}else{ + samples <- sample.count$Var1 +} + +if(!"outlierlist" %in% names(params)){ + sd <- sd(sample.count$Freq) + mean <- mean(sample.count$Freq) + outlier.count <- sum(sample.count$Freq > mean+3*sd | sample.count$Freq < mean-3*sd) + + if(outlier.count > 0){ + message(sprintf("Remove %s outliers", outlier.count)) + samples <- samples[!samples %in% sample.count$Var[which(sample.count$Freq > mean+3*sd | sample.count$Freq < mean-3*sd)]] + + write.table(sample.count[which(sample.count$Freq > mean+3*sd | sample.count$Freq < mean-3*sd), ], + sprintf("%soutlier.samples.txt", outpath)) + message(sprintf("Samples excluded from the analysis are in %soutlier.samples.txt", outpath)) + } +}else{ + outliers <- readLines(params$outlierlist) + message(sprintf("Remove %s outliers", length(outliers))) + samples <- samples[!samples %in% outliers] + sample.count <- sample.count[!sample.count$Var1 %in% outliers, ] +} + +message(sprintf("Samples included in the analysis are in %sclean.samples.txt", outpath)) +writeLines(as.character(sample.count$Var1), sprintf("%sclean.samples.txt", outpath)) + +norm.dist <- round(rnorm(length(samples), mean = 1, sd = 0.25), digits = 1) +norm.dist <- sort(norm.dist) + +library(parallel) +library(doSNOW) + +time.start <- Sys.time() +coreNumber <- 12 +cl <- makeCluster(coreNumber-1) +registerDoSNOW(cl) + +dt.out <- foreach (i = 1:nrow(dt), .combine=rbind) %dopar% { + tmp <- gsub(":|,", "#", dt[i, 7]) + tmp <- unlist(strsplit(tmp, "#")[[1]]) + tmp <- data.frame(matrix(tmp, ncol = 2, byrow = T), stringsAsFactors = F) + tmp[, 2] <- as.numeric(tmp[, 2]) + tmp <- tmp[tmp$X1 %in% samples, ] + if(nrow(tmp) == 0){ + dt.here <- data.frame("repeatID" = "", "motif" = "", "outliers" = "", "size" = "", "ref" = "", + "chr" = "", "start" = "", "end" = "")[-1, ] + }else{ + if(nrow(tmp) < length(norm.dist)){ + dt.tmp <- norm.dist[-c(1:nrow(tmp))] + sampleIDs <- c(paste0("Sim", 1:length(dt.tmp)), tmp$X1) + dt.tmp <- c(dt.tmp, tmp$X2) + }else{ + sampleIDs <- tmp$X1 + dt.tmp <- tmp$X2 + } + + ref <- as.numeric(names(which.max(table(dt.tmp)))) + range <- max(2*ref, quantile(dt.tmp, 0.95, na.rm = T) - quantile(dt.tmp, 
0.05, na.rm = T)) + + scan <- dbscan::dbscan(matrix(dt.tmp), eps = range, minPts = ceiling(log2(length(samples)))) + + if(length(unique(scan$cluster)) == 1 | sum(scan$cluster == 0) == 0){ + cutoff <- Inf + }else{ + cutoff <- max(dt.tmp[scan$cluster != 0]) + cutoff <- ifelse(cutoff < 2, 2, cutoff) + } + + if(sum(dt.tmp > cutoff) > 0){ + sim.samples <- grep("Sim", sampleIDs) + if(length(sim.samples) > 0){ + dt.tmp <- dt.tmp[-sim.samples] + sampleIDs <- sampleIDs[-sim.samples] + } + + outliers <- paste(sampleIDs[dt.tmp > cutoff], collapse=";") + size <- paste(dt.tmp[dt.tmp > cutoff], collapse=";") + dt.here <- data.frame("repeatID" = dt$varid[i], "motif" = dt$V4[i], outliers, size, ref, + "chr" = dt$V1[i], "start" = dt$V2[i], "end" = dt$V3[i], stringsAsFactors = F) + }else{ + dt.here <- data.frame("repeatID" = "", "motif" = "", "outliers" = "", "size" = "", "ref" = "", + "chr" = "", "start" = "", "end" = "", stringsAsFactors = F)[-1, ] + } + } + + dt.here +} + + +get1000Freq <- function(outlier, fam.1000g){ + outlier <- strsplit(outlier, ";")[[1]] + count <- sum(unique(outlier) %in% fam.1000g) + return(round(count/length(fam.1000g) * 100, digits = 2)) +} + +dt.out$a1000g_freq_perc <- sapply(as.character(dt.out$outliers), get1000Freq, a1000g) + +stopCluster(cl) +#write.table(dt.out, sprintf("%sEHdn.expansions.%s.tsv", outpath, Sys.Date()), sep="\t", row.names=F, col.names=T, quote=F) +write.table(dt.out, params$exp, sep="\t", row.names=F, col.names=T, quote=F) +time.end <- Sys.time() +print(difftime(time.end, time.start)) + +#print("Printing sessioninfo from DBSCAN.EHdn.parallel.R") +#sessionInfo() diff --git a/scripts/str/ExpansionAnalysisFunctions.R b/scripts/str/ExpansionAnalysisFunctions.R new file mode 100755 index 0000000..44a3dfd --- /dev/null +++ b/scripts/str/ExpansionAnalysisFunctions.R @@ -0,0 +1,166 @@ +get1000Freq <- function(outlier, fam.1000g){ + outlier <- strsplit(outlier, ";")[[1]] + count <- sum(unique(outlier) %in% fam.1000g) + return(round(count/length(fam.1000g) * 100, digits = 2)) +} + + +mergeLoci <- function(loci, mergeAny = T){ + + cnv.tmp <- loci + + firstIt <- T + olap <- data.frame() + while(nrow(olap) != 0 | firstIt){ + firstIt <- F + cnv.query <- GRanges(cnv.tmp$chr, IRanges(cnv.tmp$start, cnv.tmp$end), "*") + + olap <- data.frame(findOverlaps(cnv.query, cnv.query)) + olap <- olap[which(olap$queryHits < olap$subjectHits), ] + olap <- olap[!duplicated(olap$queryHits), ] + olap <- olap[!olap$queryHits %in% olap$subjectHits, ] + if(!mergeAny){ + olap <- olap[nchar(cnv.tmp$motif[olap$queryHits]) == nchar(cnv.tmp$motif[olap$subjectHits]), ] + } + + message(sprintf("%s found overlapped, from %s loci", nrow(olap), nrow(cnv.tmp))) + flush.console() + + if(nrow(olap) > 0){ + ids <- union(olap$queryHits, olap$subjectHits) + merge.dt <- data.frame() + for(i in 1:nrow(olap)){ + olap.th <- olap[i, ] + + olap.th[, c("qstart", "qend")] <- cnv.tmp[olap.th$queryHits, c("start", "end")] + olap.th[, c("sstart", "send")] <- cnv.tmp[olap.th$subjectHits, c("start", "end")] + + + repeatID <- paste(union(strsplit(cnv.tmp$repeatID[olap.th$queryHits], ";")[[1]], + strsplit(cnv.tmp$repeatID[olap.th$subjectHits], ";")[[1]]), collapse=";") + motif <- paste(union(strsplit(cnv.tmp$motif[olap.th$queryHits], ";")[[1]], + strsplit(cnv.tmp$motif[olap.th$subjectHits], ";")[[1]]), collapse = ";") + trf.id <- paste(na.omit(union(strsplit(cnv.tmp$trf.id[olap.th$queryHits], ";")[[1]], + strsplit(cnv.tmp$trf.id[olap.th$subjectHits], ";")[[1]])), collapse = ";") + reciprocalIdentity <- 
max(cnv.tmp$reciprocalIdentity[olap.th$queryHits], cnv.tmp$reciprocalIdentity[olap.th$subjectHits], na.rm =T) + length.diff <- 0 + + merge.dt <- rbind(merge.dt, data.frame(repeatID, "chr" = cnv.tmp$chr[olap.th$queryHits], + "start" = pmin(olap.th$qstart, olap.th$sstart), "end" = pmax(olap.th$qend, olap.th$send), + motif, trf.id, reciprocalIdentity, length.diff)) + } + + + cnv.tmp <- cnv.tmp[-(ids), ] + cnv.tmp <- rbind(cnv.tmp, merge.dt) + } + + gc(); + } + + return(cnv.tmp) +} + +mapOutlierToMergeLoci <- function(outlier.loci, merge.loci){ + tmp <- unique(as.numeric(unlist(sapply(outlier.loci$repeatID, grep, merge.loci$repeatID)))) + merge.loci <- merge.loci[tmp, ] + + for(i in 1:nrow(merge.loci)){ + repeatIDs <- strsplit(merge.loci$repeatID[i], ";")[[1]] + outliers <- paste(unique(unlist(sapply(outlier.loci$outliers[outlier.loci$repeatID %in% repeatIDs], strsplit, ";"))), collapse = ";") + merge.loci$outliers[i] <- outliers + } + + return(merge.loci) +} + +getOutlierMergeData <- function(i, ehdn.result.m, fam.data){ + ehdn.rec <- ehdn.result.m[i, ] + outliers <- strsplit(ehdn.rec$outliers, ";")[[1]] + + + ehdn.data <- fam.data[fam.data$ID %in% outliers, c("ID", "Affection")] + if(nrow(ehdn.data) == 0){ + return(NULL) + }else{ + + ehdn.data[, c("repeatID", "motif", "chr", "start", "end", "trf.id", "gene_symbol", "entrez_id", "typeseq_priority")] <- + ehdn.rec[, c("repeatID", "motif", "chr", "start", "end", "trf.id", "gene_symbol", "entrez_id", "typeseq_priority")] + } + return(ehdn.data) +} + + +getCountTable <- function(dt, label, label.list, fam.data, ehdn.genotype.data, cov){ + if(nrow(dt) == 0){ + return(NA) + }else{ + dt <- dt[dt$ID %in% fam.data$ID, ] + + if(sum(names(fam.data) %in% cov) > 0) + cov.dt <- unique(fam.data[, c("ID", names(fam.data)[names(fam.data) %in% cov])]) + + table.tmp <- aggregate(repeatID ~ ID, dt, length) + names(table.tmp)[2] <- "ExpansionCount" + + table.tmp <- merge(table.tmp, fam.data[, c("ID", label)], by.x = "ID", by.y = "ID", all = T) + table.tmp[is.na(table.tmp)] <- 0 + table.tmp[, label] <- ifelse(table.tmp[, label] == label.list[1], 1, 0) + table.tmp[, label] <- factor(table.tmp[, label]) + if(sum(names(fam.data) %in% cov) > 0) + table.tmp <- merge(table.tmp, cov.dt, by.x = "ID", by.y = "ID", all.x = T) + table.tmp <- merge(table.tmp, ehdn.genotype.data, by = "ID", all.x = T) + table.tmp[is.na(table.tmp)] <- 0 + return(table.tmp) + } +} + +testGLM <- function(dt, label, cov, feature, standardization = T){ + if(is.null(nrow(dt))){ + return(list("OR" = NA, "upper" = NA, "lower" = NA, "pvalue" = NA)) + }else if (length(unique(na.omit(dt[, label]))) < 2){ + return(list("OR" = NA, "upper" = NA, "lower" = NA, "pvalue" = NA)) + }else{ + for(cov.each in cov){ + if(length(unique(dt[, cov.each])) < 2){ + cov <- cov[-which(cov == cov.each)] + } + } + + if(standardization){ + dt[, feature] <- scale(dt[, feature]) + } + ref <- paste0(label, " ~ ", paste(cov, collapse = " + ")) + add <- paste0(ref, " + ", feature) + ref.lm <- glm(ref, dt, family = binomial(link = "logit")) + add.lm <- glm(add, dt, family = binomial(link = "logit")) + ano <- anova(ref.lm, add.lm, test = "Chisq") + + or <- exp(add.lm$coefficients[feature]) + pvalue <- ano$`Pr(>Chi)`[2] + + conf <- confint(add.lm) + low <- exp(conf[feature, 1]) + up <- exp(conf[feature, 2]) + + return(list("OR" = or, "upper" = up, "lower" = low, "pvalue" = pvalue)) + } +} + + + +testByAggregate <- function(repeatIDs, tmp.f, fam.data, ehdn.genotype.data, cov, standardization = F){ + caseCtrl.dt <- 
tmp.f[which(tmp.f$repeatID %in% repeatIDs & + tmp.f$Affection %in% 1:2), ] + + feature <- "ExpansionCount" + + all.table <- getCountTable(caseCtrl.dt, "Affection", + c(2, 1), + fam.data, ehdn.genotype.data, cov) + + res <- testGLM(all.table, "Affection", cov, feature, standardization) + + return(data.frame(res, "affected.count" = sum(all.table$Affection == 1 & all.table$ExpansionCount > 0), + "unaffected.count" = sum(all.table$Affection == 0 & all.table$ExpansionCount > 0))) +} diff --git a/scripts/str/__init__.py b/scripts/str/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/scripts/add_gene+threshold_to_EH_column_headings2.py b/scripts/str/add_gene+threshold_to_EH_column_headings2.py similarity index 100% rename from scripts/add_gene+threshold_to_EH_column_headings2.py rename to scripts/str/add_gene+threshold_to_EH_column_headings2.py diff --git a/scripts/str/combine_counts.py b/scripts/str/combine_counts.py new file mode 100755 index 0000000..336f593 --- /dev/null +++ b/scripts/str/combine_counts.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 + +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import argparse +import logging +import sys +import json + +from core import regiontools, common + + +def load_parameters(): + '''Capture command-line parameters. 
+ + ''' + parser = argparse.ArgumentParser( + description='combine counts of in-repeat reads across multiple samples') + parser.add_argument( + '--manifest', + help='TSV file with id, case/control status, and path of each sample', + required=True) + parser.add_argument( + '--combinedCounts', + help='file with combined counts of anchored in-repeat reads', + required=True) + parser.add_argument( + '--minUnitSize', + help='size of smallest repeat unit to process (in bp)', + default=2, type=int, required=False) + parser.add_argument( + '--maxUnitSize', + help='size of the largest repeat unit to process (in bp)', + default=20, type=int, required=False) + args = parser.parse_args() + + parameter_encoding = ' '.join(sys.argv[1:]) + logging.info('Starting with these parameters: %s', parameter_encoding) + + return {'manifest_path': args.manifest, 'output_path': args.combinedCounts, + 'min_unit_size': args.minUnitSize, + 'max_unit_size': args.maxUnitSize} + + +def merge_combined_regions(combined_air_regions): + for unit in combined_air_regions: + combined_air_regions[unit].merge() + + +def process_sample(parameters, denovo_json, sample_id, combined_air_regions, + combined_pir_regions): + min_unit_size = parameters['min_unit_size'] + max_unit_size = parameters['max_unit_size'] + + for unit in denovo_json: + if not min_unit_size <= len(unit) <= max_unit_size: + continue + + if unit == 'Depth' or unit == 'ReadLength': + continue + + if 'RegionsWithIrrAnchors' in denovo_json[unit]: + anc_irr_record = denovo_json[unit]['RegionsWithIrrAnchors'] + current_regions = regiontools.create_region_collection_from_denovo_record( + sample_id, anc_irr_record) + + if unit not in combined_air_regions: + combined_air_regions[unit] = regiontools.RegionCollection() + combined_air_regions[unit].extend(current_regions) + + if denovo_json[unit]['IrrPairCount']: + num_irr_pairs = denovo_json[unit]['IrrPairCount'] + if unit not in combined_pir_regions: + combined_pir_regions[unit] = {} + combined_pir_regions[unit][sample_id] = num_irr_pairs + + +def process_samples(parameters, samples, sample_depths): + combined_air_regions = {} + combined_pir_regions = {} + + for sample_num, sample in enumerate(samples): + logging.info('Processing sample %s', sample.id) + with open(sample.path, 'r') as json_file: + denovo_json = json.load(json_file) + sample_depths[sample.id] = denovo_json['Depth'] + process_sample(parameters, denovo_json, sample.id, + combined_air_regions, + combined_pir_regions) + + if sample_num > 0 and sample_num % 50 == 0: + logging.info('Merging regions') + merge_combined_regions(combined_air_regions) + + logging.info('Performing final merge') + merge_combined_regions(combined_air_regions) + + return {'air': combined_air_regions, 'pir': combined_pir_regions} + + +def normalize_count(sample_depth, count): + target_depth = 40 + return target_depth * count / sample_depth + + +def normalize_counts(sample_depths, combined_regions): + for unit, regions in combined_regions['air'].items(): + for region in regions: + for sample_id, count in region.feature_counts._count_dict.items(): + sample_depth = sample_depths[sample_id] + region.feature_counts._count_dict[sample_id] = normalize_count( + sample_depth, count) + for unit, sample_counts in combined_regions['pir'].items(): + for sample_id, count in sample_counts.items(): + sample_depth = sample_depths[sample_id] + combined_regions['pir'][unit][sample_id] = normalize_count( + sample_depth, count) + + +def create_json(combined_regions): + output_json = {} + for unit, regions in 
combined_regions['air'].items(): + output_json[unit] = {} + output_json[unit]['RegionsWithIrrAnchors'] = regions.as_dict() + + for unit, counts in combined_regions['pir'].items(): + if unit not in output_json: + output_json[unit] = {} + output_json[unit]['IrrPairCounts'] = counts + + return output_json + + +def write_json(parameters, output_json): + with open(parameters['output_path'], 'w') as output_file: + json_encoding = json.dumps(output_json, sort_keys=True, + indent=4, separators=(',', ': ')) + print(json_encoding, file=output_file) + + +def main(): + common.init_logger() + parameters = load_parameters() + samples = common.load_manifest(parameters['manifest_path']) + sample_depths = {} + combined_regions = process_samples(parameters, samples, sample_depths) + normalize_counts(sample_depths, combined_regions) + output_json = create_json(combined_regions) + write_json(parameters, output_json) + + +if __name__ == '__main__': + main() diff --git a/scripts/str/common.py b/scripts/str/common.py new file mode 100755 index 0000000..85ac2e3 --- /dev/null +++ b/scripts/str/common.py @@ -0,0 +1,129 @@ +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import collections +import logging +import json +import scipy.stats as stats + +from . import regiontools + +from .wilcoxontest import wilcoxon_rank_sum_test + + +def init_logger(): + logging.basicConfig(format='%(asctime)s: %(message)s', level=logging.INFO) + + +def load_manifest(manifest_path): + '''Extract sample information from a manifest file. 
+ + ''' + + # pylint: disable=I0011,C0103 + Sample = collections.namedtuple('Sample', 'id status path') + + samples = [] + with open(manifest_path, 'r') as manifest_file: + for line in manifest_file: + sample_id, status, path = line.split() + sample = Sample(id=sample_id, status=status, path=path) + samples.append(sample) + return samples + + +def load_combined_json(json_path): + logging.info('Loading %s', json_path) + with open(json_path, 'r') as json_file: + combined_json = json.load(json_file) + return combined_json + + +def filter_counts_by_magnitude(count_table, count_cutoff): + filtered_count_table = [] + for row in count_table: + max_count = max(count for _, count in row['sample_counts'].items()) + if max_count >= count_cutoff: + filtered_count_table.append(row) + + return filtered_count_table + + +def filter_counts_by_region(count_table, target_regions): + filtered_count_table = [] + for row in count_table: + chrom, start, end = row['region'].replace(':', '-').split('-') + start, end = int(start), int(end) + region = regiontools.Region(chrom, start, end) + overlaps_target_region = any(regiontools.compute_distance( + region, target) == 0 for target in target_regions) + + if overlaps_target_region: + filtered_count_table.append(row) + + return filtered_count_table + + +def filter_counts(count_table, count_cutoff, target_regions=None): + filtered_count_table = filter_counts_by_magnitude( + count_table, count_cutoff) + if target_regions: + filtered_count_table = filter_counts_by_region( + filtered_count_table, target_regions) + return filtered_count_table + + +def extract_case_control_assignments(samples): + sample_status = {} + for sample in samples: + sample_status[sample.id] = sample.status + return sample_status + + +def test_samples(test_params, sample_status, sample_counts): + control_samples = [sample for sample, + status in sample_status.items() if status == 'control'] + case_samples = [sample for sample, + status in sample_status.items() if status == 'case'] + + control_counts = [sample_counts[s] + if s in sample_counts else 0 for s in control_samples] + + case_counts = [sample_counts[s] + if s in sample_counts else 0 for s in case_samples] + + pvalue = wilcoxon_rank_sum_test(test_params, case_counts, control_counts) + + return pvalue + + +def compare_counts(test_params, sample_status, count_table): + for row in count_table: + # Generate counts before testing + pvalue = test_samples(test_params, sample_status, row['sample_counts']) + row['pvalue'] = pvalue + + +def correct_pvalues(count_table): + num_tests = len(count_table) + for row in count_table: + row['bonf_pvalue'] = min(row['pvalue'] * num_tests, 1.0) diff --git a/scripts/str/compare_anchored_irrs.py b/scripts/str/compare_anchored_irrs.py new file mode 100755 index 0000000..9806083 --- /dev/null +++ b/scripts/str/compare_anchored_irrs.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 + +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import argparse +import logging +import sys +import json + +from core import regiontools, common + + +def load_parameters(): + '''Capture command-line parameters. + + ''' + parser = argparse.ArgumentParser( + description='compare counts of anchored in-repeat reads') + + # Add required arguments. + manifest_help = 'TSV file with id, case/control status, and path of each sample' + input_help = 'JSON file with combined counts of anchored in-repeat reads' + output_help = ('BED file with regions containing anchored in-repeat reads and associated' + ' p-values') + required_arg_group = parser.add_argument_group('required arguments') + required_arg_group.add_argument( + '--manifest', help=manifest_help, required=True) + required_arg_group.add_argument( + '--inputCounts', help=input_help, required=True) + required_arg_group.add_argument( + '--outputRegions', help=output_help, required=True) + + # Add optional arguments. + def_min_count = 5 + min_count_help = ('minimum number reads in a region for downstream analysis' + ' (default: {})').format(def_min_count) + target_regions_help = 'BED file with regions to which analysis should be restricted' + def_test_method = 'normal' + test_method_help = 'Method of calculating Wilcoxon Rank-Sum Test p-value (default: {})'.format( + def_test_method) + def_num_resamples = 1000000 + num_resamples_help = 'Number of iterations for the resampling test (default: {})'.format( + def_num_resamples) + parser.add_argument( + '--minCount', help=min_count_help, default=def_min_count, type=int) + parser.add_argument('--targetRegions', + help=target_regions_help, default=None) + parser.add_argument('--testMethod', help=test_method_help, + choices=['normal', 'resample'], default=def_test_method) + parser.add_argument( + '--numResamples', help=num_resamples_help, default=def_num_resamples, type=int) + + args = parser.parse_args() + + parameter_encoding = ' '.join(sys.argv[1:]) + logging.info('Starting with these parameters: %s', parameter_encoding) + + test_params = {'method': args.testMethod, + 'num_resamples': args.numResamples} + return {'manifest_path': args.manifest, 'counts_path': args.inputCounts, + 'min_count': args.minCount, 'output_path': args.outputRegions, + 'target_regions': args.targetRegions, 'test_params': test_params} + + +def generate_count_table(combined_counts): + count_table = [] + for unit, rec in combined_counts.items(): + if 'RegionsWithIrrAnchors' not in rec: + continue + + for region, sample_counts in rec['RegionsWithIrrAnchors'].items(): + table_row = {'region': region, 'unit': unit} + table_row['sample_counts'] = sample_counts + count_table.append(table_row) + + logging.info('Loaded %i regions', len(count_table)) + return count_table + + +def load_target_regions(fname): + logging.info('Loading target regions from %s', fname) + regions = [] + with open(fname, 'r') as bed_file: + for line in bed_file: + chrom, start, end, *_ = line.split() + start, end = int(start), int(end) + region = regiontools.Region(chrom, start, end) + regions.append(region) + return regions + + +def output_results(count_table, output_path): + with open(output_path, 'w') as output_file: + for row in count_table: + chrom, start, end = row['region'].replace(':', '-').split('-') + unit = row['unit'] + pvalue, bonf_pvalue = row['pvalue'], row['bonf_pvalue'] + + sample_counts = row['sample_counts'] + encoded_counts = ['{}:{}'.format(s, c) + for s, c in 
sample_counts.items()] + encoded_counts = ','.join(encoded_counts) + print(chrom, start, end, unit, pvalue, bonf_pvalue, encoded_counts, + sep='\t', file=output_file) + + +def main(): + common.init_logger() + parameters = load_parameters() + combined_counts = common.load_combined_json(parameters['counts_path']) + samples = common.load_manifest(parameters['manifest_path']) + target_regions = None + if parameters['target_regions']: + target_regions = load_target_regions(parameters['target_regions']) + count_table = generate_count_table(combined_counts) + count_table = common.filter_counts( + count_table, parameters['min_count'], target_regions) + logging.info('%i regions left after initial filtering', len(count_table)) + sample_status = common.extract_case_control_assignments(samples) + common.compare_counts( + parameters['test_params'], sample_status, count_table) + common.correct_pvalues(count_table) + output_results(count_table, parameters['output_path']) + logging.info('Done') + + +if __name__ == '__main__': + main() diff --git a/scripts/str/core/__init__.py b/scripts/str/core/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/scripts/str/core/common.py b/scripts/str/core/common.py new file mode 100755 index 0000000..85ac2e3 --- /dev/null +++ b/scripts/str/core/common.py @@ -0,0 +1,129 @@ +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import collections +import logging +import json +import scipy.stats as stats + +from . import regiontools + +from .wilcoxontest import wilcoxon_rank_sum_test + + +def init_logger(): + logging.basicConfig(format='%(asctime)s: %(message)s', level=logging.INFO) + + +def load_manifest(manifest_path): + '''Extract sample information from a manifest file. 
+ + ''' + + # pylint: disable=I0011,C0103 + Sample = collections.namedtuple('Sample', 'id status path') + + samples = [] + with open(manifest_path, 'r') as manifest_file: + for line in manifest_file: + sample_id, status, path = line.split() + sample = Sample(id=sample_id, status=status, path=path) + samples.append(sample) + return samples + + +def load_combined_json(json_path): + logging.info('Loading %s', json_path) + with open(json_path, 'r') as json_file: + combined_json = json.load(json_file) + return combined_json + + +def filter_counts_by_magnitude(count_table, count_cutoff): + filtered_count_table = [] + for row in count_table: + max_count = max(count for _, count in row['sample_counts'].items()) + if max_count >= count_cutoff: + filtered_count_table.append(row) + + return filtered_count_table + + +def filter_counts_by_region(count_table, target_regions): + filtered_count_table = [] + for row in count_table: + chrom, start, end = row['region'].replace(':', '-').split('-') + start, end = int(start), int(end) + region = regiontools.Region(chrom, start, end) + overlaps_target_region = any(regiontools.compute_distance( + region, target) == 0 for target in target_regions) + + if overlaps_target_region: + filtered_count_table.append(row) + + return filtered_count_table + + +def filter_counts(count_table, count_cutoff, target_regions=None): + filtered_count_table = filter_counts_by_magnitude( + count_table, count_cutoff) + if target_regions: + filtered_count_table = filter_counts_by_region( + filtered_count_table, target_regions) + return filtered_count_table + + +def extract_case_control_assignments(samples): + sample_status = {} + for sample in samples: + sample_status[sample.id] = sample.status + return sample_status + + +def test_samples(test_params, sample_status, sample_counts): + control_samples = [sample for sample, + status in sample_status.items() if status == 'control'] + case_samples = [sample for sample, + status in sample_status.items() if status == 'case'] + + control_counts = [sample_counts[s] + if s in sample_counts else 0 for s in control_samples] + + case_counts = [sample_counts[s] + if s in sample_counts else 0 for s in case_samples] + + pvalue = wilcoxon_rank_sum_test(test_params, case_counts, control_counts) + + return pvalue + + +def compare_counts(test_params, sample_status, count_table): + for row in count_table: + # Generate counts before testing + pvalue = test_samples(test_params, sample_status, row['sample_counts']) + row['pvalue'] = pvalue + + +def correct_pvalues(count_table): + num_tests = len(count_table) + for row in count_table: + row['bonf_pvalue'] = min(row['pvalue'] * num_tests, 1.0) diff --git a/scripts/str/core/regiontools.py b/scripts/str/core/regiontools.py new file mode 100755 index 0000000..e935edf --- /dev/null +++ b/scripts/str/core/regiontools.py @@ -0,0 +1,180 @@ +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import sys +import copy + + +class FeatureCounts(object): + def __init__(self, count_dict): + self._count_dict = count_dict + + def __len__(self): + return len(self._count_dict) + + def __getitem__(self, sample_id): + return self._count_dict[sample_id] + + def __repr__(self): + encoding = ('{}={}'.format(s, self._count_dict[s]) + for s in sorted(self._count_dict)) + encoding = ','.join(encoding) + return encoding + + def __iter__(self): + for sample, count in self._count_dict.items(): + yield (sample, count) + + def __eq__(self, other): + # pylint: disable=I0011,W0212 + return self._count_dict == other._count_dict + + def combine(self, other): + # pylint: disable=I0011,W0212 + for sample, count in other._count_dict.items(): + if sample in self._count_dict: + self._count_dict[sample] += count + else: + self._count_dict[sample] = count + + +class Region(object): + def __init__(self, chrom=None, start=0, end=0, feature_counts=None): + self.chrom = chrom + self.start = start + self.end = end + self.feature_counts = feature_counts + + def is_unset(self): + return self.chrom is None + + def __repr__(self): + region_repr = '{}:{}-{}'.format(self.chrom, self.start, self.end) + if self.feature_counts: + region_repr += '({})'.format(self.feature_counts) + return region_repr + + def __eq__(self, other): + return (self.chrom == other.chrom and self.start == other.start and + self.end == other.end and + self.feature_counts == other.feature_counts) + + def __lt__(self, other): + self_tuple = (self.chrom, self.start, self.end) + other_tuple = (other.chrom, other.start, other.end) + return self_tuple < other_tuple + + +def compute_distance(region_a, region_b): + distance = sys.maxsize + if region_a.is_unset() or region_b.is_unset(): + raise Exception('Cannot compute distance between unset regions') + + if region_a.chrom == region_b.chrom: + if region_a.end < region_b.start: # Is region_a left of region_b? + distance = region_b.start - region_a.end + elif region_b.end < region_a.start: # Is region_b left or region_a? + return region_a.start - region_b.end + else: # If neither is true, the regions must overlap. 
+ distance = 0 + + return distance + + +class RegionCollection(object): + def __init__(self, regions=None): + self._regions = regions + if self._regions is None: + self._regions = [] + + def __len__(self): + return len(self._regions) + + def __getitem__(self, index): + return self._regions[index] + + def __repr__(self): + encoding = '\n'.join(repr(r) for r in self._regions) + return encoding + + def __eq__(self, other): + # pylint: disable=I0011,W0212 + for region_a, region_b in zip(self._regions, other._regions): + if region_a != region_b: + return False + return True + + def __iter__(self): + for region in self._regions: + yield region + + def as_dict(self): + region_dict = {} + for region in self._regions: + region_encoding = '{}:{}-{}'.format(region.chrom, + region.start, region.end) + region_dict[region_encoding] = dict(region.feature_counts) + return region_dict + + def merge(self, max_dist=500): + self._regions.sort() + + merged_regions = [] + aggregate_region = Region() + + for region in self._regions: + if aggregate_region.is_unset(): + aggregate_region = region + continue + + if compute_distance(aggregate_region, region) <= max_dist: + aggregate_region.end = max(aggregate_region.end, region.end) + aggregate_region.feature_counts.combine(region.feature_counts) + else: + merged_regions.append(aggregate_region) + aggregate_region = region + + if not aggregate_region.is_unset(): + merged_regions.append(aggregate_region) + + self._regions = merged_regions + + def extend(self, other): + # pylint: disable=I0011,W0212 + self._regions += other._regions + + +def create_region_from_denovo_record(sample_id, region_count): + region_encoding, count = region_count + region_encoding = region_encoding.replace(':', '-').split('-') + chrom, start, end = region_encoding + start, end = int(start), int(end) + feture_counts = FeatureCounts({sample_id: count}) + return Region(chrom, start, end, feture_counts) + + +def create_region_collection_from_denovo_record(sample_id, denovo_record): + regions = [] + for region_count in denovo_record.items(): + region = create_region_from_denovo_record(sample_id, region_count) + regions.append(region) + return RegionCollection(regions) diff --git a/scripts/str/core/wilcoxontest.py b/scripts/str/core/wilcoxontest.py new file mode 100755 index 0000000..9035d9b --- /dev/null +++ b/scripts/str/core/wilcoxontest.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + +import numpy as np +import scipy.special +import scipy.stats + + +def calculate_approximate_pvalue(cases, controls): + all_counts = cases + controls + ranks = scipy.stats.rankdata(all_counts) + case_rank_sum = np.sum(ranks[:len(cases)]) + + mu_cases = len(cases) * (len(cases) + len(controls) + 1) / 2 + sigma_cases = np.sqrt(len(cases) * len(controls) * + (len(cases) + len(controls) + 1) / 12) + z_cases = (case_rank_sum - mu_cases) / sigma_cases + + return 1 - scipy.stats.norm.cdf(z_cases) + + +def calculate_permutation_pvalue(cases, controls, num_permutations): + all_counts = cases + controls + ranks = scipy.stats.rankdata(all_counts) + num_cases = len(cases) + true_case_rank_sum = np.sum(ranks[:num_cases]) + + permuted_case_ranks = np.random.choice( + ranks, size=(num_permutations, num_cases)) + permuted_case_rank_sums = np.sum(permuted_case_ranks, axis=1) + + num_case_rank_sums_as_extreme_as_true = np.sum( + permuted_case_rank_sums >= true_case_rank_sum) + + return (num_case_rank_sums_as_extreme_as_true + 1) / (num_permutations + 1) + + +def wilcoxon_rank_sum_test(test_params, cases, controls): + test_method = test_params['method'] + if test_method == 'normal': + return calculate_approximate_pvalue(cases, controls) + elif test_method == 'resample': + return calculate_permutation_pvalue(cases, controls, test_params['num_resamples']) + else: + assert False, '{} is an unknown method type'.format(test_method) diff --git a/scripts/str/crg.ehdn.sh b/scripts/str/crg.ehdn.sh new file mode 100644 index 0000000..99b19fc --- /dev/null +++ b/scripts/str/crg.ehdn.sh @@ -0,0 +1,88 @@ +#PBS -N EHDN +#PBS -l vmem=80g,mem=80g,walltime=24:00:00 +#PBS -joe +#PBS -d . + +if [ -z $EHDN ] +then + exit; +fi + +if [ -z ${g1k_manifest} ] +then + exit; +fi + +if [ -z $ref ] +then + exit; +fi + +if [ -z $script_dir ] +then + exit; +fi + +scripts="${script_dir}"; + + + +if [ -z $family ]; then family=$1; fi; +if [ -z $pipeline ]; then pipeline=$2; fi #pipeline is crg or crg2 + +if [ $pipeline == "crg" ]; then + bcbio="${family}/bcbio-align/${family}/final/${family}_*"; + if [ -z "`ls ${bcbio}/${family}_*-ready.bam`" ]; then + if [ -z "`ls ${family}/${family}*-ready.bam`" ]; then + echo "BAM files for $family not found inside bcbio-align/ and $family/. Exiting!"; + exit + else + dir="$family/${family}*-ready.bam"; + fi; + else + dir="$bcbio/${family}*-ready.bam"; + fi; + outdir="${family}/str/expansion_hunter_denovo"; +else + dir="decoy_rm/*.bam" + outdir="str/EHDN"; +fi; + +manifest="${outdir}/${family}_manifest.txt"; +echo "Running for pipeline: ${pipeline} with directories => ${outdir}, ${dir}, $family"; + +for i in `ls $dir`; do + + if [ ! -d "$outdir" ]; then + mkdir -p $outdir; + fi; + if [ $pipeline == "crg" ]; then + prefix=`basename $i -ready.bam`; + else + prefix=`basename $i .bam | cut -d "." 
-f1`; + fi + reads=`readlink -f $i`; + ehdn_prefix="${outdir}/${prefix}"; + echo -e "STEP1: ExpansionHunterDenovo-v0.7.0 --reference $ref --reads $reads --output-prefix $ehdn_prefix --min-anchor-mapq 50 --max-irr-mapq 40 \n" + $EHDN --reference $ref --reads $reads --output-prefix $ehdn_prefix --min-anchor-mapq 50 --max-irr-mapq 40 & + echo -e "${prefix}\tcase\t${ehdn_prefix}.json" >> $manifest; + +done; + +wait + +cat $manifest ${g1k_manifest} > temp && mv temp $manifest; + +##EHDN report per family +echo "STEP2: generating multi-sample EHDN STR profile and report named ${outdir}/${family}_EHDN_str.tsv" +#module load python/3.7.7 + +countsfile="${outdir}/${family}_counts.txt"; +profile="${outdir}/${family}_EHDN_str.tsv"; +python ${scripts}/combine_counts.py --manifest $manifest --combinedCounts $countsfile +python ${scripts}/compare_anchored_irrs.py --manifest $manifest --inputCounts $countsfile --outputRegions $profile --minCount 2 --testMethod normal +#module unload python/3.7.7 + +##add chr to chromosome column +echo "STEP2.1: add chr prefix to first column ${outdir}/${family}_EHDN_str.tsv" +awk -vFS="\t" -vOFS="\t" '{ if(!($1~/^chr/)) {$1="chr"$1;} print $0; }' ${profile} > temp && mv temp ${profile} diff --git a/scripts/eh_sample_report.py b/scripts/str/eh_sample_report.py similarity index 95% rename from scripts/eh_sample_report.py rename to scripts/str/eh_sample_report.py index da49d36..ccabcb9 100755 --- a/scripts/eh_sample_report.py +++ b/scripts/str/eh_sample_report.py @@ -11,7 +11,7 @@ from xlsxwriter.workbook import Workbook annot_tsv = sys.argv[1] #output from add_gene+threshold_to_EH_column_headings2.py -g1000 = sys.argv[2] #median/mean/std GT sizes from 1000G EH +g1000 = sys.argv[2] xlsx = sys.argv[3] #output xlsx filename def outlier_gt(threshold, gt_dict): @@ -23,6 +23,8 @@ def outlier_gt(threshold, gt_dict): y = x.split("/") if "/" in x else x if not "." in y: y = max(list(map(int,y))) if isinstance(y,list) else int(y) + else: + return " " if y: if "(" in threshold or " " in threshold: continue @@ -54,7 +56,7 @@ def outlier_gt(threshold, gt_dict): with open(g1000) as f: for i in f: if not i.startswith("#"): - annot, mean, std, median = i.strip().split("\t") + annot, mean, median, std = i.strip().split("\t") if not annot in G1K: G1K[annot] = [mean, std, median] @@ -93,7 +95,6 @@ def outlier_gt(threshold, gt_dict): worksheet.write_row(0, 0, header) row = 1 for i in trf: - # for i in eh_gt[sample]: info = trf[i][samples[0]] content = [info.pos, info.motif, info.gene, info.size] content += [ trf[i][s].gt for s in samples ] diff --git a/scripts/str/ehdn_report.sh b/scripts/str/ehdn_report.sh new file mode 100644 index 0000000..24b187d --- /dev/null +++ b/scripts/str/ehdn_report.sh @@ -0,0 +1,63 @@ +#PBS -N EHDN_REPORT +#PBS -l vmem=80g,mem=80g,walltime=48:00:00 +#PBS -joe +#PBS -d . 
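+#
+# Stand-alone wrapper for the per-family EHDN outlier/annotation workflow; the same
+# steps are now split across the EHDN_mark_outliers, EHDN_DBSCAN_outlier,
+# EHDN_merge_expansions and EHDN_annovar rules in rules/str.smk.
+# Usage: sh ehdn_report.sh <family> <outdir>
+# Steps: find_outliers.py -> DBSCAN.EHdn.parallel.R -> mergeExpansions.R ->
+#        format_for_annovar.py -> table_annovar.pl (OMIM and gnomAD xref) -> format_from_annovar.py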
+ + +scripts=~/crg/str; ###remove### +EHDN_files=/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo; ###remove### +g1k_outlier="${EHDN_files}/1000G_outlier"; +g1k_manifest="${EHDN_files}/manifest.1000G.txt"; +omim=${EHDN_files}/OMIM_hgnc_join_omim_phenos_2023-06-22.tsv; +gnomad=${EHDN_files}/gnomad.v2.1.1.lof_metrics.by_gene.txt; +annovar=/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/annovar; ###remove### +annovar_db=/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/annovar/humandb/; ###remove### + +family=$1; +outdir=$2; + +##find outlier(noise) samples in current cohort to be removed later +echo "STEP1: write outlier samples to a file" +profile="${outdir}/${family}_EHDN_str.tsv"; +outliers="${outdir}/${family}_outliers.txt"; +echo -e "CMD: python ${scripts}/find_outliers.py ${profile} ${outliers}\n" +python ${scripts}/find_outliers.py ${profile} ${outliers} +cat ${outliers} ${g1k_outlier} > temp && mv temp ${outliers} + +##run outlier detection using DBSCAN clustering with 1000G as control +echo "STEP2: Outlier detection using DBSCAN clustering" +outpath="${outdir}/outliers" +echo -e "CMD: Rscript ${scripts}/DBSCAN.EHdn.parallel.R --infile ${profile} --outpath ${outpath} --outlierlist ${outliers}\n" +Rscript ${scripts}/DBSCAN.EHdn.parallel.R --infile ${profile} --outpath ${outpath} --outlierlist ${outliers} +outlier_tsv=`ls -t ${outpath}/EHdn.expansions*tsv | head -n1`; +echo -e "CMD: Rscript ${scripts}/mergeExpansions.R --ehdn ${profile} --outlier ${outlier_tsv} --outpath ${outpath}\n" +Rscript ${scripts}/mergeExpansions.R --ehdn ${profile} --outlier ${outlier_tsv} --outpath ${outpath} + + +#following Python script requires python3 and xlsxwriter +#installed in my conda env "str" +#source /hpf/largeprojects/ccmbio/aarthi/miniconda3/etc/profile.d/conda.sh +#conda activate str +##format above output for annovar annotation +echo "STEP3.1: format DBSCAN output for ANNOVAR annotation" +merge_tsv=`ls -t ${outpath}/merged.rare.expansions.[0-9]*.tsv | head -n1`; +echo -e "CMD:python ${scripts}/format_for_annovar.py ${outlier_tsv} ${merge_tsv} ${g1k_manifest}\n" +python ${scripts}/format_for_annovar.py ${outlier_tsv} ${merge_tsv} ${g1k_manifest} + +echo "STEP3.2: annotate with OMIM" +annovar_input=`ls -t ${outpath}/merged.rare.EHdn.expansion.[0-9]*tsv | head -n1`; #incase there > 1 file because of multiple runs +outfile=`echo ${merge_tsv} | awk '{ split($1,a,".tsv"); print a[1]; }'`; +echo -e "CMD: ${annovar}/table_annovar.pl ${annovar_input} ${annovar_db} -buildver hg19 -outfile \"${outfile}-OMIM\" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . -xreffile ${omim}\n" +${annovar}/table_annovar.pl ${annovar_input} ${annovar_db} -buildver hg19 -outfile "${outfile}-OMIM" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . -xreffile ${omim} + +echo "STEP3.3: annotate with gnoMAD" +echo -e "CMD: ${annovar}/table_annovar.pl ${annovar_input} ${annovar_db} -buildver hg19 -outfile \"${outfile}-gnoMAD\" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . -xreffile ${gnomad}\n" +${annovar}/table_annovar.pl ${annovar_input} ${annovar_db} -buildver hg19 -outfile "${outfile}-gnoMAD" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . 
-xreffile ${gnomad}
+
+echo "STEP3.4: format annotated reports to xlsx: EHDN.rare.merged.annotated.xlsx"
+gnomad_out=`ls -t ${outfile}-gnoMAD.hg19_multianno.txt | head -n1`
+omim_out=`ls -t ${outfile}-OMIM.hg19_multianno.txt | head -n1`
+date=`date +%Y-%m-%d`
+xlsx="${outpath}/${family}.EHDN.${date}.xlsx"
+echo -e "CMD: python ${scripts}/format_from_annovar.py ${gnomad_out} ${omim_out} ${xlsx}\n"
+python ${scripts}/format_from_annovar.py ${gnomad_out} ${omim_out} ${xlsx}
diff --git a/scripts/str/find_outliers.py b/scripts/str/find_outliers.py
new file mode 100755
index 0000000..db4de3f
--- /dev/null
+++ b/scripts/str/find_outliers.py
@@ -0,0 +1,38 @@
+import sys, numpy as np
+#from matplotlib import pyplot as plt
+
+
+infile = sys.argv[1]
+outfile = sys.argv[2]
+tr = {}
+rec = []
+with open(infile) as f:
+    for i in f:
+        i = i.strip().split("\t")
+        if any(s in i for s in ['motif', 'zscore', 'end', 'start']):
+            header = i
+        else:
+            depth = i[-1]
+            if "," in depth:
+                j = depth.split(",")
+                for item in j:
+                    k = item.split(":")[0]
+                    if not k in tr:
+                        tr[k] = 0
+                    tr[k] += 1
+            else:
+                k = depth.split(":")[0]
+                if not k in tr:
+                    tr[k] = 0
+                tr[k] += 1
+
+allcounts = [ counts for sample, counts in tr.items() if not sample.startswith(("HG","NA"))]
+stats = lambda l: (np.mean(l), np.std(l), np.median(l))
+allcounts_stats = stats(allcounts)
+not_outlier = lambda l, c: (l[0] - 3 * l[1]) < c and (l[0] + 3 * l[1]) > c
+with open(outfile, "w") as f:
+    for sample, counts in tr.items():
+        if not sample.startswith(("HG","NA")):
+            if not not_outlier(allcounts_stats, counts):
+                f.write("%s\n" % sample)  #one sample per line; the newline keeps the later cat with the 1000G outlier list line-separated
diff --git a/scripts/str/format_for_annovar.py b/scripts/str/format_for_annovar.py
new file mode 100755
index 0000000..dcdb0cc
--- /dev/null
+++ b/scripts/str/format_for_annovar.py
@@ -0,0 +1,137 @@
+import sys, os, re
+from collections import namedtuple
+
+'''
+merge output files of EHDN after running through DBSCAN script into one file
+get values for outlier, repeat size, a1000g_freq from EHdn.expansion..tsv
+and merge with merged.rare.expansion..tsv for ANNOVAR annotation
+manifest_1000G.txt:
+    EHDN manifest file (three-column: sampleid\tcase/control\tpath to EHDN JSON)
+    you can also provide a single-column file with 1000G sample ids; this is used
+    to remove rows whose only outlier samples are 1000G samples
+python format_for_annovar.py EHdn.expansions.2021-02-18.tsv merged.rare.expansions.2021-02-18.tsv manifest_1000G.txt
+'''
+
+#not using pandas.merge because the two files have some duplicate lines (chr#start#end)
+#merged.rare.expansion has altered intervals (merging nearby motifs) so won't exactly match those
+#in EHdn.expansion..tsv file entries
+
+ehdn_out = sys.argv[1]
+dbscan_out = sys.argv[2]
+g1k_manifest = sys.argv[3]
+#date = ehdn_out.split(".")[-2]
+d = os.path.dirname(dbscan_out)
+outfile = os.path.join(d,"merged.rare.EHdn.expansion.tsv")
+samples = []
+print("input files: ", ehdn_out, dbscan_out, g1k_manifest)
+print("out:", outfile)
+
+#read 1000G sample names from manifest file
+g1k = []
+with open(g1k_manifest) as f:
+    for i in f:
+        i = i.strip("\n")
+        if re.split("\t|' '",i)[0]:
+            g1k.append(re.split("\t|' '",i)[0])
+        else:
+            g1k.append(i)
+all_g1k_sample = lambda x: set(x).difference(g1k)
+
+#read DBSCAN output
+dbscan = namedtuple("dbscan", "chr start end motif outliers key")
+merged_exp = []
+with open(dbscan_out) as f:
+    f.readline()
+    for i in f:
+        i = i.strip().split("\t")
+        key = "#".join(i[0:3])
+        outlier = re.split(";",i[4])
+        if all_g1k_sample(outlier):
merged_exp.append(dbscan(chr=i[0],start=i[1],end=i[2],motif=i[3],outliers=i[4],key=key)) + for o in outlier: + if not o in g1k: + samples.append(o) + +samples = list(set(samples)) +print("samples = ", samples) +#read EHDN output +ehdn = namedtuple("ehdn", "motif outliers size ref chr start end a1000g key") +ehdn_exp = [] +with open(ehdn_out) as f: + f.readline() + for i in f: + i = i.strip().split("\t") + key = "#".join(i[5:8]) + ehdn_exp.append(ehdn(motif=i[1],outliers=i[2],size=i[3],ref=i[4],chr=i[5],start=i[6],end=i[7],a1000g=i[8],key=key)) +ehdn_exp + +#match reacords from DBSCAN and EHDN to retrieve all info +#DBSCAN repeats are merged based on locations, so may not match original EHDN output + +def split_outlier_size(outlier,size,samples): + outlier_l = re.split(";",outlier) + size_l = re.split(";",size) + if len(size_l) == len(outlier_l): + len_samples = len(samples) + outlier_samples = ["0"]*(len_samples+1) + outlier_sizes = ["0"]*len_samples + for i,s in enumerate(samples): + if s in outlier_l: + ind = outlier_l.index(s) + outlier_samples[i] = "1" + outlier_sizes[i] = size_l[ind] + else: + if s in g1k: + outlier_samples[len_samples] += 1 + return outlier_samples, outlier_sizes + +found, allkey, out, seen = [], [], [], [] +ref, alt = "0", "-" +for i in merged_exp: + allkey.append(i) + for j in ehdn_exp: + if i.key == j.key: + if i.outliers == j.outliers: + found.append(i) + outliers, sizes = split_outlier_size(i.outliers, j.size, samples) + print(outliers, sizes, i.key) + # out.append([i.chr, i.start, i.end, ref, alt, i.motif, i.outliers, i.key, j.size, j.a1000g]) + # out.append([i.chr, i.start, i.end, ref, alt, i.motif, outliers, sizes, i.key, j.a1000g]) + o = [i.chr, i.start, i.end, ref, alt, i.motif ] + outliers + sizes + [i.key, j.a1000g] + out.append(o) + print(o) + # seen.append(i) + continue + + +notfound = list(set(allkey).difference(found)) +within = lambda a,b,c,d: ( c <= a and a <= d ) or ( c <= b and b <= d) +for i in notfound: + for j in ehdn_exp: + if i.chr == j.chr and i.motif == j.motif and i.outliers == j.outliers: + r1 = range(int(i.start),int(i.end)) + r2 = range(int(j.start),int(j.end)) +# if i.start == j.start or i.end == j.end or len(set(r1).intersection(r2)) > 5: + if within(i.start,i.end,j.start,j.end) or within(j.start,j.end,i.start,i.end): + # out.append([i.chr, i.start, i.end, ref, alt, i.motif, i.outliers, i.key, j.size, j.a1000g]) + outliers, sizes = split_outlier_size(i.outliers, j.size, samples) + o = [i.chr, i.start, i.end, ref, alt, i.motif ] + outliers + sizes + [i.key, j.a1000g] + # out.append([i.chr, i.start, i.end, ref, alt, i.motif, outliers, sizes, i.key, j.a1000g]) + out.append(o) + print(o) +# seen.append(i) + continue + +columns = ["chr", "start", "end", "ref", "alt", "motif", ] +columns += ["outlier." + s for s in samples] +columns += ["1000G_outlier_count"] +columns += ["size." 
+ s for s in samples]
+columns += ["key", "1000G_freq_perc"]
+with open(outfile, "w") as f:
+    for i in columns:
+        f.writelines("%s\t"%i)
+    f.writelines("\n")
+    for i in out:
+        for j in i:
+            f.writelines("%s\t"%j)
+        f.writelines("\n")
diff --git a/scripts/str/format_from_annovar.py b/scripts/str/format_from_annovar.py
new file mode 100755
index 0000000..8d915cf
--- /dev/null
+++ b/scripts/str/format_from_annovar.py
@@ -0,0 +1,46 @@
+import pandas as pd, sys
+import xlsxwriter
+
+'''
+format the outputs from ANNOVAR table_annovar.pl with OMIM and gnomAD
+merged.rare.EHdn.annovar.omim.hg19_multianno.txt
+merged.rare.EHdn.annovar.gnomad.hg19_multianno.txt
+'''
+
+gnomad_file = sys.argv[1]
+omim_file = sys.argv[2]
+outfile = sys.argv[3]
+
+
+#read OMIM annotated file as table and fix header
+omim = pd.read_csv(omim_file, sep="\t", header=[0,1], index_col=None)
+omim.columns = [j if i.startswith("Otherinfo") else i for i, j in omim.columns ]
+omim.rename(columns=lambda x: x.replace(".refGene",""), inplace=True)
+omim_col = ['omim_inheritance', 'omim_phenotype' ]
+
+#read gnomAD annotated file as table and fix header
+gnomad = pd.read_csv(gnomad_file, sep="\t", header=[0,1], index_col=None)
+gnomad.columns = [j if i.startswith("Otherinfo") else i for i, j in gnomad.columns ]
+gnomad.rename(columns=lambda x: x.replace(".refGene",""), inplace=True)
+
+#sample-wise outlier and size column names for reporting
+sample_outliers, size_outliers = [], []
+for i in list(gnomad.columns):
+    if i.startswith("outlier.") or i.startswith("1000G_outlier_count"):
+        sample_outliers.append(i)
+    if i.startswith("size."):
+        size_outliers.append(i)
+print(sample_outliers, size_outliers)
+print(gnomad.columns)
+
+gnomad_col = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Func', 'Gene', 'motif' ] + sample_outliers + size_outliers + ['key', '1000G_freq_perc', 'GeneDetail','ExonicFunc', 'AAChange', 'transcript', 'oe_lof', 'oe_lof_upper', 'oe_mis', 'oe_mis_upper', 'pLI', 'pRec']
+print(gnomad_col)
+annot_rare = pd.merge(gnomad[gnomad_col],omim[omim_col],right_index=True, left_index=True)
+print(annot_rare.head(5))
+
+output_columns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Function', 'Gene', 'motif', ]
+output_columns += sample_outliers + size_outliers
+output_columns += ['chr#start#end', 'a1000g_freq_perc', 'GeneDetail','ExonicFunc', 'AAChange', 'transcript', 'oe_lof', 'oe_lof_upper', 'oe_mis', 'oe_mis_upper', 'pLI', 'pRec', 'omim_inheritance', 'omim_phenotype' ]
+annot_rare.columns = output_columns #apply the report-friendly headers defined above; otherwise output_columns is never used
+if not ".xlsx" in outfile:
+    outfile = outfile + ".xlsx"
+annot_rare.to_excel(outfile, index=False)
diff --git a/scripts/generate_EH_genotype_table.generic.py b/scripts/str/generate_EH_genotype_table.generic.py
similarity index 100%
rename from scripts/generate_EH_genotype_table.generic.py
rename to scripts/str/generate_EH_genotype_table.generic.py
diff --git a/scripts/str/mergeExpansions.R b/scripts/str/mergeExpansions.R
new file mode 100755
index 0000000..ff35349
--- /dev/null
+++ b/scripts/str/mergeExpansions.R
@@ -0,0 +1,189 @@
+rm(list=ls())
+
+#### parameters
+#### to run
+#### Rscript mergeExpansions.R --rscript ~/crg2/scripts/str/ExpansionAnalysis.R --ehdn /hpf/largeprojects/tcagstor/users/btrost/papers/STRs/Qatar_ASD/output_regions.min2.1000G+SSC+MSSNG+QASD.txt
+#### --outlier dbscan.tsv --outpath output/ --trf /hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/UCSC_simple_repeats_hg19_coord_motif.tsv
+###trf <- 
"/hpf/largeprojects/ccmbio/aarthi/old_projects/proj_CHEO/CRG/str/ExpansionAnalysisPackage/UCSC_simple_repeats_hg19_coord_motif.tsv" +###source(path.expand("~/crg2/str/ExpansionAnalysisFunctions.R")) + +#### read arguments from command line +args = commandArgs(trailingOnly=TRUE) +if(length(args) < 2){ + stop(call. = T, "Require more argument(s)") +} + +paramNames <- grep("--", args) +paramValues <- paramNames + 1 + +if(length(grep("--", args[paramValues])) > 0){ + stop(call. = T, "Failed in reading arguments, '--' found in argument value") +} + +if(length(args) != length(c(paramNames, paramValues))){ + message(sprintf("Ignore argument(s):%s", paste(args[-union(paramNames, paramValues)], collapse=","))) +} + +params <- list() +paramNames <- gsub("--", "", args[paramNames]) +for(i in 1:length(paramNames)){ + params[paramNames[i]] <- args[paramValues[i]] +} + +source(path.expand(params$rscript)) +outpath <- params$outpath +if(length(grep("\\/$", outpath)) == 0){ + outpath <- paste0(outpath, "/") +} + +if(!dir.exists(outpath)){ + dir.create(outpath, recursive = T) +} + +############### + +library(data.table) +library(GenomicRanges) +library(ggplot2) +library(cowplot) +library(Biostrings) + +queryString <- function(str1, str2){ + ori.str2 <- str2 + str2 <- paste0(str2, str2) + + length.str1 <- nchar(str1) + length.str2 <- nchar(str2) + revcom.str1 <- as.character(reverseComplement(DNAString(str1))) + + length.diff <- ifelse(length.str2 > length.str1, length.str2 - length.str1, 1) + + identity <- c() + for(i in 1:length.diff){ + tmp <- substr(str2, i, length.str1+i-1) + match <- sum(strsplit(tmp, "")[[1]] == strsplit(str1, "")[[1]]) + match.recom <- sum(strsplit(tmp, "")[[1]] == strsplit(revcom.str1, "")[[1]]) + + match <- min(max(match, match.recom), nchar(ori.str2)) + tmp.identity <- match/length.str1 + + identity <- c(identity, tmp.identity) + } + return(max(identity)) +} + +#### prepare map TRF +message(sprintf("reading %s##", params$ehdn)) +ehdn <- fread(params$ehdn) +trf <- fread(params$trf) +ehdn <- as.data.frame(ehdn)[, c(1:6)] +ehdn$repeatID <- paste(ehdn$V1, ehdn$V2, ehdn$V3, ehdn$V4, sep="#") +names(ehdn) <- c("chr", "start", "end", "motif", "var1", "var2", "repeatID") +if(length(grep("chr", ehdn$chr[1:5])) == 0) + ehdn$chr <- paste0("chr", ehdn$chr) +trf <- as.data.frame(trf) +trf$repeatID <- paste(trf$V1, trf$V2, trf$V3, trf$V4, sep="#") + +ehdn.g <- GRanges(ehdn$chr, IRanges(ehdn$start, ehdn$end), "*") +trf.g <- GRanges(trf$V1, IRanges(trf$V2, trf$V3), "*") + +olap <- data.frame(findOverlaps(trf.g, ehdn.g)) +olap$trf.motif <- trf$V4[olap$queryHits] +olap$ehdn.motif <- ehdn$motif[olap$subjectHits] +olap$trf.id <- trf$repeatID[olap$queryHits] +olap$ehdn.id <- ehdn$repeatID[olap$subjectHits] + +for(i in 1:nrow(olap)){ + trf.identity <- queryString(olap$trf.motif[i], olap$ehdn.motif[i]) + ehdn.identity <- queryString(olap$ehdn.motif[i], olap$trf.motif[i]) + + olap$reciprocalIdentity[i] <- min(trf.identity, ehdn.identity) +} + +olap$length.diff <- nchar(olap$trf.motif) - nchar(olap$ehdn.motif) + +olap.f <- olap[order(olap$reciprocalIdentity, decreasing = T), ] +olap.f <- olap.f[!duplicated(olap.f$subjectHits), ] +olap.f <- olap.f[olap.f$reciprocalIdentity > 0.66, ] +olap.f <- aggregate(trf.id ~ ehdn.id + reciprocalIdentity + length.diff, olap.f, paste, collapse = ";") +olap.f <- unique(olap.f) +olap.f <- olap.f[order(olap.f$reciprocalIdentity, decreasing = T), ] + +olap.f <- olap.f[!duplicated(olap.f$ehdn.id), ] + +write.table(olap.f[, c(4, 1, 2, 3)], sprintf("%smap.TRF.EHdn.0.66.tsv", 
outpath), sep="\t", row.names=F, quote=F, col.names=T)
+
+#########################################
+### merge
+outfile <- sprintf("%smerged.ehdn.tsv", outpath)
+
+dt <- fread(params$ehdn)
+dt <- data.frame(dt)
+ehdn <- dt
+dt$varid <- paste(dt$V1, dt$V2, dt$V3, sep="_")
+
+clean.sample <- readLines(sprintf("%sclean.samples.txt", outpath))
+
+ehdn$repeatID <- paste(ehdn$V1, ehdn$V2, ehdn$V3, ehdn$V4, sep="#")
+ehdn$samples <- 0
+for(i in 1:nrow(ehdn)){
+  tmp <- strsplit(gsub(",", ":", ehdn$V7[i]), ":")[[1]]
+  tmp <- tmp[seq(1, length(tmp), 2)]
+  tmp <- tmp[tmp %in% clean.sample]
+
+  ehdn$samples[i] <- length(tmp)
+}
+
+ehdn <- ehdn[ehdn$samples > 0, ]
+map <- read.delim(sprintf("%smap.TRF.EHdn.0.66.tsv", outpath), stringsAsFactors = F)
+
+ehdn.all <- merge(ehdn[, -c(5:7, 9)], map, by.x = "repeatID", by.y = "ehdn.id", all.x = T)
+names(ehdn.all) <- c("repeatID", "chr", "start", "end", "motif", "trf.id", "reciprocalIdentity", "length.diff")
+ehdn.all <- ehdn.all[ehdn.all$chr %in% paste0("chr", c(1:22, "X", "Y")), ]
+
+ehdn.m <- mergeLoci(ehdn.all)
+ehdn.m$uniqueMotif <- sapply(sapply(ehdn.m$motif, strsplit, ";"), length)
+
+ehdn.m$allRepeat <- ehdn.m$repeatID
+ehdn.m$repeatID <- paste(ehdn.m$chr, ehdn.m$start, ehdn.m$end, sep="#")
+
+write.table(ehdn.m, outfile, sep="\t", row.names=F, quote=F, col.names=T)
+
+##################
+### extract rare merged expansion
+expansion <- read.delim(params$outlier, stringsAsFactors = F)
+expansion <- expansion[expansion$a1000g_freq_perc < 0.1, ]
+
+ehdn.m.g <- GRanges(ehdn.m$chr, IRanges(ehdn.m$start, ehdn.m$end), "*")
+expansion.g <- GRanges(expansion$chr, IRanges(expansion$start, expansion$end), "*")
+
+olap <- data.frame(findOverlaps(expansion.g, ehdn.m.g))
+olap <- aggregate(queryHits ~ subjectHits, olap, paste, collapse = ";")
+
+expansion.m <- data.frame()
+for(i in 1:nrow(olap)){
+  idx <- as.numeric(strsplit(olap$queryHits[i], ";")[[1]])
+  tmp.exp <- expansion[idx, ]
+  motif <- paste(tmp.exp$motif, collapse=";") ## collapse (not sep) so multiple overlapping records become one ;-separated string
+  outliers <- paste(tmp.exp$outliers, collapse=";")
+  repeats <- paste(tmp.exp$repeatID, collapse=";")
+  chr <- tmp.exp$chr[1]
+  start <- min(tmp.exp$start)
+  end <- max(tmp.exp$end)
+
+  expansion.m <- rbind(expansion.m, data.frame(chr, start, end, motif, outliers, stringsAsFactors = F))
+}
+
+write.table(expansion.m, sprintf("%smerged.rare.expansions.tsv", outpath), sep="\t", row.names=F, col.names=T, quote=F)
+
+expansion.m$ref <- "0"
+expansion.m$alt <- "-"
+expansion.m$varid <- paste(expansion.m$chr, expansion.m$start, expansion.m$end, sep="#")
+names(expansion.m)[1] <- "#chr"
+exp.annovar <- expansion.m[, c("#chr", "start", "end", "ref", "alt", "varid")]
+
+write.table(exp.annovar, sprintf("%smerged.rare.expansions.forannotation.tsv",outpath), sep="\t", row.names=F, quote=F, col.names=T)
+#####
+#####
+###print("Printing sessioninfo from mergeExpansions.R")
+###sessionInfo()
diff --git a/scripts/str/regiontools.py b/scripts/str/regiontools.py
new file mode 100755
index 0000000..e935edf
--- /dev/null
+++ b/scripts/str/regiontools.py
@@ -0,0 +1,180 @@
+#
+# Expansion Hunter Denovo
+# Copyright (c) 2017 Illumina, Inc.
+#
+# Author: Egor Dolzhenko ,
+#         Sai Chen
+# Concept: Michael Eberle
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import sys +import copy + + +class FeatureCounts(object): + def __init__(self, count_dict): + self._count_dict = count_dict + + def __len__(self): + return len(self._count_dict) + + def __getitem__(self, sample_id): + return self._count_dict[sample_id] + + def __repr__(self): + encoding = ('{}={}'.format(s, self._count_dict[s]) + for s in sorted(self._count_dict)) + encoding = ','.join(encoding) + return encoding + + def __iter__(self): + for sample, count in self._count_dict.items(): + yield (sample, count) + + def __eq__(self, other): + # pylint: disable=I0011,W0212 + return self._count_dict == other._count_dict + + def combine(self, other): + # pylint: disable=I0011,W0212 + for sample, count in other._count_dict.items(): + if sample in self._count_dict: + self._count_dict[sample] += count + else: + self._count_dict[sample] = count + + +class Region(object): + def __init__(self, chrom=None, start=0, end=0, feature_counts=None): + self.chrom = chrom + self.start = start + self.end = end + self.feature_counts = feature_counts + + def is_unset(self): + return self.chrom is None + + def __repr__(self): + region_repr = '{}:{}-{}'.format(self.chrom, self.start, self.end) + if self.feature_counts: + region_repr += '({})'.format(self.feature_counts) + return region_repr + + def __eq__(self, other): + return (self.chrom == other.chrom and self.start == other.start and + self.end == other.end and + self.feature_counts == other.feature_counts) + + def __lt__(self, other): + self_tuple = (self.chrom, self.start, self.end) + other_tuple = (other.chrom, other.start, other.end) + return self_tuple < other_tuple + + +def compute_distance(region_a, region_b): + distance = sys.maxsize + if region_a.is_unset() or region_b.is_unset(): + raise Exception('Cannot compute distance between unset regions') + + if region_a.chrom == region_b.chrom: + if region_a.end < region_b.start: # Is region_a left of region_b? + distance = region_b.start - region_a.end + elif region_b.end < region_a.start: # Is region_b left or region_a? + return region_a.start - region_b.end + else: # If neither is true, the regions must overlap. 
+ distance = 0 + + return distance + + +class RegionCollection(object): + def __init__(self, regions=None): + self._regions = regions + if self._regions is None: + self._regions = [] + + def __len__(self): + return len(self._regions) + + def __getitem__(self, index): + return self._regions[index] + + def __repr__(self): + encoding = '\n'.join(repr(r) for r in self._regions) + return encoding + + def __eq__(self, other): + # pylint: disable=I0011,W0212 + for region_a, region_b in zip(self._regions, other._regions): + if region_a != region_b: + return False + return True + + def __iter__(self): + for region in self._regions: + yield region + + def as_dict(self): + region_dict = {} + for region in self._regions: + region_encoding = '{}:{}-{}'.format(region.chrom, + region.start, region.end) + region_dict[region_encoding] = dict(region.feature_counts) + return region_dict + + def merge(self, max_dist=500): + self._regions.sort() + + merged_regions = [] + aggregate_region = Region() + + for region in self._regions: + if aggregate_region.is_unset(): + aggregate_region = region + continue + + if compute_distance(aggregate_region, region) <= max_dist: + aggregate_region.end = max(aggregate_region.end, region.end) + aggregate_region.feature_counts.combine(region.feature_counts) + else: + merged_regions.append(aggregate_region) + aggregate_region = region + + if not aggregate_region.is_unset(): + merged_regions.append(aggregate_region) + + self._regions = merged_regions + + def extend(self, other): + # pylint: disable=I0011,W0212 + self._regions += other._regions + + +def create_region_from_denovo_record(sample_id, region_count): + region_encoding, count = region_count + region_encoding = region_encoding.replace(':', '-').split('-') + chrom, start, end = region_encoding + start, end = int(start), int(end) + feture_counts = FeatureCounts({sample_id: count}) + return Region(chrom, start, end, feture_counts) + + +def create_region_collection_from_denovo_record(sample_id, denovo_record): + regions = [] + for region_count in denovo_record.items(): + region = create_region_from_denovo_record(sample_id, region_count) + regions.append(region) + return RegionCollection(regions) diff --git a/scripts/str_helper.sh b/scripts/str/str_helper.sh similarity index 100% rename from scripts/str_helper.sh rename to scripts/str/str_helper.sh diff --git a/scripts/str/wilcoxontest.py b/scripts/str/wilcoxontest.py new file mode 100755 index 0000000..9035d9b --- /dev/null +++ b/scripts/str/wilcoxontest.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + +import numpy as np +import scipy.special +import scipy.stats + + +def calculate_approximate_pvalue(cases, controls): + all_counts = cases + controls + ranks = scipy.stats.rankdata(all_counts) + case_rank_sum = np.sum(ranks[:len(cases)]) + + mu_cases = len(cases) * (len(cases) + len(controls) + 1) / 2 + sigma_cases = np.sqrt(len(cases) * len(controls) * + (len(cases) + len(controls) + 1) / 12) + z_cases = (case_rank_sum - mu_cases) / sigma_cases + + return 1 - scipy.stats.norm.cdf(z_cases) + + +def calculate_permutation_pvalue(cases, controls, num_permutations): + all_counts = cases + controls + ranks = scipy.stats.rankdata(all_counts) + num_cases = len(cases) + true_case_rank_sum = np.sum(ranks[:num_cases]) + + permuted_case_ranks = np.random.choice( + ranks, size=(num_permutations, num_cases)) + permuted_case_rank_sums = np.sum(permuted_case_ranks, axis=1) + + num_case_rank_sums_as_extreme_as_true = np.sum( + permuted_case_rank_sums >= true_case_rank_sum) + + return (num_case_rank_sums_as_extreme_as_true + 1) / (num_permutations + 1) + + +def wilcoxon_rank_sum_test(test_params, cases, controls): + test_method = test_params['method'] + if test_method == 'normal': + return calculate_approximate_pvalue(cases, controls) + elif test_method == 'resample': + return calculate_permutation_pvalue(cases, controls, test_params['num_resamples']) + else: + assert False, '{} is an unknown method type'.format(test_method)
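For reference, a minimal sketch of how the helpers above can be exercised on their own. The counts and the resample size are illustrative, the import assumes `scripts/str` is on `PYTHONPATH`, and the `test_params` dictionary mirrors the `--testMethod normal` option passed in crg.ehdn.sh (the resampling branch additionally expects a `num_resamples` key):

    import wilcoxontest

    # Anchored in-repeat read counts at one locus: case samples first, then controls (made-up values).
    cases = [12, 15, 30]
    controls = [2, 3, 1, 4, 2, 3]

    # Normal approximation of the rank-sum p-value (one-sided: cases larger than controls).
    p_normal = wilcoxontest.wilcoxon_rank_sum_test({"method": "normal"}, cases, controls)

    # Permutation-based p-value; "num_resamples" sets how many permuted case rank sums are drawn.
    p_resample = wilcoxontest.wilcoxon_rank_sum_test(
        {"method": "resample", "num_resamples": 10000}, cases, controls)

    print(p_normal, p_resample)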