diff --git a/config_hpf.yaml b/config_hpf.yaml index cc2ff4f..f93b38a 100644 --- a/config_hpf.yaml +++ b/config_hpf.yaml @@ -23,10 +23,11 @@ tools: cre: "~/cre" crg: "~/crg" crg2: "~/crg2" - ehdn: "/hpf/largeprojects/ccmbio/arun/Tools/EHDN.TCAG/ExpansionHunterDenovo-v0.7.0" + ehdn: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/EHDN.TCAG/ExpansionHunterDenovo-v0.7.0" mity: "/hpf/largeprojects/ccmbio/ajain/mity/mity_package/bin" melt: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/MELT/MELTv2.2.2/MELT.jar" orad: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/orad_2_6_1/orad" + annovar: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/annovar" ref: @@ -123,6 +124,14 @@ annotation: 1000g: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunter/1000G_EH_v1.0.tsv" ehdn: files: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/" + g1k_outlier: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/1000G_outlier" + g1k_manifest: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/manifest.1000G.txt" + g1k_samples: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/1000G.samples.txt" + trf: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/UCSC_simple_repeats_hg19_coord_motif.tsv" + omim: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/OMIM_hgnc_join_omim_phenos_2023-06-22.tsv" + gnomad: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo//gnomad.v2.1.1.lof_metrics.by_gene.txt" + annovar_db: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/annovar/humandb/" + cre: database_path: "/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/genomes/Hsapiens/GRCh37/variation/" melt: diff --git a/documentation/str.md b/documentation/str.md new file mode 100644 index 0000000..7b215ac --- /dev/null +++ b/documentation/str.md @@ -0,0 +1,45 @@ +# ExpansionHunterDenovo version: + EHDN v0.7.0 is no longer available from GitHub or conda, and the 1000G profiles from Brett Trost are based on it, so this version is kept; the tool path in `config.yaml` is updated to + `config['tools']['ehdn']: "/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/EHDN.TCAG/ExpansionHunterDenovo-v0.7.0"` + +# Description of scripts used before the changes: +## BASH and R scripts: + +1. scripts are fetched from the `crg/str` repo +2. ~/crg/crg.ehdn.sh: runs EHDN; includes a number of checks and searches for the relevant BAM files based on run type (exome/genome) and the cre/crg/crg2 pipeline directory structures +3. ~/crg/ehdn_report.sh: main script that uses the R scripts from ~/crg/str and the 1000G EHDN profiles, and generates the report + - ~/crg/str/DBSCAN.EHdn.parallel.R: library(dbscan), library(ggplot2), library(data.table), library(parallel), library(doSNOW) + - ~/crg/str/mergeExpansions.R: library(data.table), library(GenomicRanges), library(ggplot2), library(cowplot), library(Biostrings) + + +## Python scripts: +1. ~/crg/str/compare_anchored_irrs.py: depends on "core" package present locally; takes the EHDN manifest (format shown below) +2. ~/crg/str/find_outliers.py: numpy, matplotlib +3. ~/crg/str/generate_EH_genotype_table.generic.py: argparse, glob, BTlib(local), collections, pandas, path +4. ~/crg/str/combine_counts.py: argparse, json, core(local) +5. ~/crg/str/BTlib.py: re, statistics, string, random, itertools, copy, collections, numpy, pandas, docx +6. ~/crg/str/add_gene+threshold_to_EH_column_headings2.py: argparse +7. ~/crg/str/format_for_annovar.py: collections, re +8. ~/crg/str/format_from_annovar.py: pandas, xlsxwriter
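+
+Both `combine_counts.py` and `compare_anchored_irrs.py` take a `--manifest` file with one tab-separated line per sample: sample id, case/control status, and the path to that sample's EHDN JSON profile. `crg.ehdn.sh` writes each family sample as `case` and then concatenates `manifest.1000G.txt` (the 1000G background profiles). A minimal illustration (the sample IDs, the `control` label on the 1000G line, and the paths are placeholders):
+
+    `NA12878    case    str/EHDN/FAM1_NA12878.json`
+    `HG00096    control    /path/to/1000G_JSON/HG00096.json`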
+ + +# Summary of changes related to issue 142: +1. Moved all required scripts from `~/crg` and `~/cre` to `~/crg2/scripts/str` +2. Removed hard-coded paths from the STR-related scripts and added them to `config.yaml`, to be passed as arguments via snakemake params. +3. Moved the following annotations and tools to /hpf/largeprojects/ccm_dccforge/dccdipg/Common/: + annotation/ExpansionHunterDenovo/UCSC_simple_repeats_hg19_coord_motif.tsv + crg2-non-conda-tools/EHDN.TCAG/ExpansionHunterDenovo-v0.7.0 + annotation/ExpansionHunterDenovo/1000G_JSON + updated annotation/ExpansionHunterDenovo/manifest.1000G.txt +4. Split the single "EHdn_report" rule into four rules, with a separate env for each: + - EHDN_mark_outliers + - EHDN_DBSCAN_outlier + - EHDN_merge_expansions + - EHDN_annovar +5. Created `envs/ehdn-dbscan.yaml` for rules "EHDN_DBSCAN_outlier" and "EHDN_merge_expansions". The versions of the R packages are not the same as on `ccmmarvin`; constraining them to the versions available on `ccmmarvin` makes conda fail to create the env due to conflicts (the `ccmmarvin` versions were installed manually in 2020, not via conda). +6. Fixed minor bugs and removed unwanted yaml. +7. Edited the above R scripts to remove hard-coded file paths, add command-line arguments, and drop the date suffix from output names, since Snakemake requires output names to be known before execution (using dynamic only works if all outputs of a rule are dynamic). +8. Tested with NA12878 + + + diff --git a/envs/ehdn-dbscan.yaml b/envs/ehdn-dbscan.yaml new file mode 100644 index 0000000..5e280d0 --- /dev/null +++ b/envs/ehdn-dbscan.yaml @@ -0,0 +1,12 @@ +channels: + - bioconda + - conda-forge +dependencies: + - r + - r-dbscan=1.1-8 + - r-ggplot2 + - r-data.table + - r-doSNOW=1.0.19 + - r-cowplot + - bioconductor-genomicranges + - bioconductor-biostrings diff --git a/envs/ehdn-report.yaml b/envs/ehdn-report.yaml index 719bfb0..e39c081 100644 --- a/envs/ehdn-report.yaml +++ b/envs/ehdn-report.yaml @@ -2,12 +2,20 @@ channels: - bioconda - conda-forge dependencies: + - python=3.7.8 + - python-docx=0.8.10 + - pandas=1.1.2 + - numpy=1.19.1 + - scipy=1.5.2 + - xlsxwriter=1.3.7 + - r-base=3.4.1 - r-dbscan=1.1.5 - - r-ggplot2 - - r-data.table - - r-doSNOW - - r-cowplot - - bioconductor-genomicranges - - bioconductor-genomeinfodb - - bioconductor-genomeinfodbdata - - bioconductor-biostrings \ No newline at end of file + - r-ggplot2=3.1.0 + - r-data.table=1.11.4 + - r-doSNOW=1.0.20 + - r-cowplot=0.9.3 + - bioconductor-genomicranges=1.32.7 + - bioconductor-genomeinfodb=1.16.0 + - bioconductor-genomeinfodbdata=1.1.0 + - bioconductor-biostrings=2.48.0 + diff --git a/envs/ehdn.yaml b/envs/ehdn.yaml index 3e0bfa6..b1a482e 100644 --- a/envs/ehdn.yaml +++ b/envs/ehdn.yaml @@ -3,4 +3,5 @@ channels: - conda-forge dependencies: - python =3.7.7 - - scipy =1.7.3 \ No newline at end of file + - scipy =1.7.3 + - numpy \ No newline at end of file diff --git a/envs/ehdn_outlier.yaml b/envs/ehdn_outlier.yaml new file mode 100644 index 0000000..f3f321f --- /dev/null +++ b/envs/ehdn_outlier.yaml @@ -0,0 +1,10 @@ +channels: + - bioconda + - conda-forge +dependencies: + - python=3.7.8 + - python-docx=0.8.10 + - pandas=1.1.2 + - numpy + - matplotlib + - xlsxwriter=1.3.7 \ No newline at end of file diff --git a/rules/str.smk b/rules/str.smk index 8adfaca..1ad12a7 100644 --- a/rules/str.smk +++ 
b/rules/str.smk @@ -8,7 +8,7 @@ rule EH: bam = "str/EH/{family}_{sample}_realigned.bam" params: ref = config["ref"]["genome"], - sex = lambda w: "`sh {}/scripts/str_helper.sh decoy_rm/{}_{}.no_decoy_reads.bam`".format(workflow.basedir, w.family, w.sample), + sex = lambda w: "`sh {}/scripts/str/str_helper.sh decoy_rm/{}_{}.no_decoy_reads.bam`".format(workflow.basedir, w.family, w.sample), catalog = config["annotation"]["eh"]["catalog"] log: "logs/str/{family}_{sample}-EH.log" @@ -31,28 +31,28 @@ rule EH_report: conda: "../envs/eh-report.yaml" shell: - ''' - echo "generating multi-sample genotypes" > {log} - python {params.crg2}/scripts/generate_EH_genotype_table.generic.py str/EH > {output.tsv} - echo "annotating gene name & size threshold" > {log} - python {params.crg2}/scripts/add_gene+threshold_to_EH_column_headings2.py {output.tsv} {params.trf} > {output.annot} - echo "generating final xlsx file" > {log} - python {params.crg2}/scripts/eh_sample_report.py {output.annot} {params.g1000} {output.xlsx} + """ + echo "generating multi-sample genotypes" >> {log} + python {params.crg2}/scripts/str/generate_EH_genotype_table.generic.py str/EH > {output.tsv} + echo "annotating gene name & size threshold" >> {log} + python {params.crg2}/scripts/str/add_gene+threshold_to_EH_column_headings2.py {output.tsv} {params.trf} > {output.annot} + echo "generating final xlsx file" >> {log} + python {params.crg2}/scripts/str/eh_sample_report.py {output.annot} {params.g1000} {output.xlsx} prefix=`echo {output.xlsx} | awk '{{split($1,a,".xlsx"); print a[1]; }}'`; d=`date +%Y-%m-%d` outfile="${{prefix}}.${{d}}.xlsx"; - echo "Copying final report to filaname with timestamp: $outfile" > {log} + echo "Copying final report to filaname with timestamp: $outfile" >> {log} cp {output.xlsx} $outfile - ''' + """ rule EHdn: input: expand("decoy_rm/{family}_{sample}.no_decoy_reads.bam",family=config["run"]["project"], sample=samples.index) output: json = "str/EHDN/{family}_EHDN_str.tsv" params: - crg = config["tools"]["crg"], + crg2 = config["tools"]["crg2"], ehdn = config["tools"]["ehdn"], - ehdn_files = config["annotation"]["ehdn"]["files"], + g1k_manifest = config["annotation"]["ehdn"]["g1k_manifest"], ref = config["ref"]["genome"], # ref = config["ref"]["genome"], # irr_mapq = config["params"]["EHDN"]["irr_mapq"], @@ -63,27 +63,107 @@ rule EHdn: "../envs/ehdn.yaml" shell: ''' - EHDN={params.ehdn} EHDN_files={params.ehdn_files} ref={params.ref} script_dir={params.crg} sh {params.crg}/crg.ehdn.sh {wildcards.family} crg2 + EHDN={params.ehdn} g1k_manifest={params.g1k_manifest} ref={params.ref} script_dir={params.crg2}/scripts/str sh {params.crg2}/scripts/str/crg.ehdn.sh {wildcards.family} crg2 ''' -rule EHdn_report: +#rule EHdn_report: +# input: "str/EHDN/{family}_EHDN_str.tsv".format(family=config["run"]["project"]) +# output: "report/str/{family}.EHDN.xlsx" +# log: "logs/str/{family}_EHdn_report.log" +# params: +# repdir = "report/str", +# crg2 = config["tools"]["crg2"], +# family = config["run"]["project"], +# outdir = "str/EHDN", +# conda: "../envs/ehdn-report.yaml" +# shell: +# ''' +# export PATH="/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/bin/:$PATH" +# sh {params.crg2}/scripts/str/ehdn_report.sh {params.family} {params.outdir} +# date=`date +%Y-%m-%d`; +# f={params.outdir}/outliers/{params.family}.EHDN.${{date}}.xlsx; +# if [ -f $f ]; then +# if [ ! 
-d {params.repdir} ]; then mkdir -p {params.repdir}; fi; +# mv $f {params.repdir} +# cp {params.repdir}/{params.family}.EHDN.${{date}}.xlsx {params.repdir}/{params.family}.EHDN.xlsx +# else exit; fi; +# ''' + +rule EHDN_mark_outliers: input: "str/EHDN/{family}_EHDN_str.tsv".format(family=config["run"]["project"]) - output: "report/str/{family}.EHDN.xlsx" - log: "logs/str/{family}_EHdn_report.log" + output: "str/EHDN/{family}_outliers.txt".format(family=config["run"]["project"]) params: - repdir = "report/str", - crg = config["tools"]["crg"], - family = config["run"]["project"], - outdir = "str/EHDN", + crg2 = config["tools"]["crg2"], + g1k_outlier = config["annotation"]["ehdn"]["g1k_outlier"] + conda: + "../envs/ehdn.yaml" shell: - ''' - PATH="/hpf/largeprojects/ccmbio/naumenko/tools/bcbio/bin/:$PATH" - sh {params.crg}/ehdn_report.sh {params.family} {params.outdir} - date=`date +%Y-%m-%d`; - f={params.outdir}/outliers/{params.family}.EHDN.${{date}}.xlsx; - if [ -f $f ]; then - if [ ! -d {params.repdir} ]; then mkdir -p {params.repdir}; fi; - mv $f {params.repdir} - cp {params.repdir}/{params.family}.EHDN.${{date}}.xlsx {params.repdir}/{params.family}.EHDN.xlsx - else exit; fi; - ''' + """ + python {params.crg2}/scripts/str/find_outliers.py {input} {output} + cat {output} {params.g1k_outlier} > temp && mv temp {output} + """ + +rule EHDN_DBSCAN_outlier: + input: + profile = "str/EHDN/{family}_EHDN_str.tsv".format(family=config["run"]["project"]), + outliers = "str/EHDN/{family}_outliers.txt".format(family=config["run"]["project"]), + output: + clean="str/EHDN/outliers/clean.samples.txt", + expansions="str/EHDN/outliers/EHdn.expansions.tsv", + tr="str/EHDN/outliers/samples.with.TRcount.txt" + + params: + outdir="str/EHDN/outliers", + crg2=config["tools"]["crg2"], + trf=config["annotation"]["ehdn"]["trf"], + g1k_samples=config["annotation"]["ehdn"]["g1k_samples"] + conda: "../envs/ehdn-dbscan.yaml" + shell: + """ + Rscript {params.crg2}/scripts/str/DBSCAN.EHdn.parallel.R --infile {input.profile} --outpath {params.outdir} --outlierlist {input.outliers} --a1000g {params.g1k_samples} --exp {output.expansions} + echo "Finished running DBSCAN.EHdn.parallel.R"; + echo "`ls {params.outdir}`"; + + """ + +rule EHDN_merge_expansions: + input: "str/EHDN/outliers/EHdn.expansions.tsv", "str/EHDN/{family}_EHDN_str.tsv".format(family=config["run"]["project"]) + output: "str/EHDN/outliers/merged.rare.expansions.tsv","str/EHDN/outliers/merged.rare.expansions.forannotation.tsv", + "str/EHDN/outliers/map.TRF.EHdn.0.66.tsv", "str/EHDN/outliers/merged.ehdn.tsv", + params: + outdir="str/EHDN/outliers", + crg2=config["tools"]["crg2"], + trf=config["annotation"]["ehdn"]["trf"], + g1k_samples=config["annotation"]["ehdn"]["g1k_samples"] + conda: "../envs/ehdn-dbscan.yaml" + shell: + """ + Rscript {params.crg2}/scripts/str/mergeExpansions.R --rscript {params.crg2}/scripts/str/ExpansionAnalysisFunctions.R --ehdn {input[1]} --trf {params.trf} --outlier {input[0]} --outpath {params.outdir} + """ + + +rule EHDN_annovar: + input: + merged_exp = "str/EHDN/outliers/merged.rare.expansions.tsv", + ehdn_exp = "str/EHDN/outliers/EHdn.expansions.tsv" + output: + merged_rare_exp = "str/EHDN/outliers/merged.rare.EHdn.expansion.tsv", + omim_out = "str/EHDN/outliers/merged.rare.EHdn.expansion-OMIM.hg19_multianno.txt", + gnomad_out = "str/EHDN/outliers/merged.rare.EHdn.expansion-gnoMAD.hg19_multianno.txt", + xlsx="report/str/{family}.EHDN.xlsx".format(family=config["run"]["project"]) + params: + crg2 = config["tools"]["crg2"], + 
g1k_manifest = config["annotation"]["ehdn"]["g1k_manifest"], + annovar = config["tools"]["annovar"], + annovar_db = config["annotation"]["ehdn"]["annovar_db"], + omim = config["annotation"]["ehdn"]["omim"], + gnomad = config["annotation"]["ehdn"]["gnomad"], + prefix = "str/EHDN/outliers/merged.rare.EHdn.expansion" + conda: "../envs/eh-report.yaml" + shell: + """ + python {params.crg2}/scripts/str/format_for_annovar.py {input.ehdn_exp} {input.merged_exp} {params.g1k_manifest} + {params.annovar}/table_annovar.pl {output.merged_rare_exp} {params.annovar_db} -buildver hg19 -outfile {params.prefix}"-OMIM" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . -xreffile {params.omim} + {params.annovar}/table_annovar.pl {output.merged_rare_exp} {params.annovar_db} -buildver hg19 -outfile {params.prefix}"-gnoMAD" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . -xreffile {params.gnomad} + python {params.crg2}/scripts/str/format_from_annovar.py {output.gnomad_out} {output.omim_out} {output.xlsx} + """ \ No newline at end of file diff --git a/scripts/BTlib.py b/scripts/str/BTlib.py similarity index 100% rename from scripts/BTlib.py rename to scripts/str/BTlib.py diff --git a/scripts/str/DBSCAN.EHdn.parallel.R b/scripts/str/DBSCAN.EHdn.parallel.R new file mode 100755 index 0000000..b39eede --- /dev/null +++ b/scripts/str/DBSCAN.EHdn.parallel.R @@ -0,0 +1,170 @@ +rm(list = ls()) +#### to run +#### Rscript DBSCAN.EHdn.parallel.R --infile /hpf/largeprojects/tcagstor/users/btrost/papers/STRs/Qatar_ASD/output_regions.min2.1000G+SSC+MSSNG+QASD.txt +#### --outpath output/ --a1000g /hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/1000G.samples.txt --samplelist samples.txt --outlierlist outliers.txt +#### a1000g <- readLines("/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/1000G.samples.txt") + +###("/hpf/largeprojects/tcagstor/users/worrawat/Expansion/1000G.samples.txt") + +#### read arguments from command line +args = commandArgs(trailingOnly=TRUE) +if(length(args) < 2){ + stop(call. = T, "Require more argument(s)") +} + +paramNames <- grep("--", args) +paramValues <- paramNames + 1 + +if(length(grep("--", args[paramValues])) > 0){ + stop(call. 
= T, "Failed in reading arguments, '--' found in argument value") +} + +if(length(args) != length(c(paramNames, paramValues))){ + message(sprintf("Ignore argument(s):%s", paste(args[-union(paramNames, paramValues)], collapse=","))) +} + +params <- list() +paramNames <- gsub("--", "", args[paramNames]) +for(i in 1:length(paramNames)){ + params[paramNames[i]] <- args[paramValues[i]] +} + +library(dbscan) +library(ggplot2) +library(data.table) + +set.seed(2000) +a1000g <- readLines(params$a1000g) +outpath <- params$outpath +if(length(grep("\\/$", outpath)) == 0){ + outpath <- paste0(outpath, "/") +} + +if(!dir.exists(outpath)){ + dir.create(outpath, recursive = T) +} + +infile <- params$infile +message(sprintf("reading %s##", infile)) +dt <- fread(infile) +dt <- data.frame(dt) +dt$varid <- paste(dt$V1, dt$V2, dt$V3, sep="_") + +### get count per sample +sample.count <- paste(dt$V7, collapse = ",") +sample.count <- gsub(":", ",", sample.count) +sample.count <- strsplit(sample.count, ",")[[1]] +sample.count <- matrix(sample.count, ncol = 2, byrow = T) +sample.count <- data.frame(table(sample.count[, 1]), stringsAsFactors = F) +write.table(sample.count, + sprintf("%ssamples.with.TRcount.txt", outpath), sep="\t", row.names=F, quote=F, col.names=T) + +if("samplelist" %in% names(params)){ + samples <- readLines(params$samplelist) + sample.count <- sample.count[sample.count$Var1 %in% samples, ] +}else{ + samples <- sample.count$Var1 +} + +if(!"outlierlist" %in% names(params)){ + sd <- sd(sample.count$Freq) + mean <- mean(sample.count$Freq) + outlier.count <- sum(sample.count$Freq > mean+3*sd | sample.count$Freq < mean-3*sd) + + if(outlier.count > 0){ + message(sprintf("Remove %s outliers", outlier.count)) + samples <- samples[!samples %in% sample.count$Var[which(sample.count$Freq > mean+3*sd | sample.count$Freq < mean-3*sd)]] + + write.table(sample.count[which(sample.count$Freq > mean+3*sd | sample.count$Freq < mean-3*sd), ], + sprintf("%soutlier.samples.txt", outpath)) + message(sprintf("Samples excluded from the analysis are in %soutlier.samples.txt", outpath)) + } +}else{ + outliers <- readLines(params$outlierlist) + message(sprintf("Remove %s outliers", length(outliers))) + samples <- samples[!samples %in% outliers] + sample.count <- sample.count[!sample.count$Var1 %in% outliers, ] +} + +message(sprintf("Samples included in the analysis are in %sclean.samples.txt", outpath)) +writeLines(as.character(sample.count$Var1), sprintf("%sclean.samples.txt", outpath)) + +norm.dist <- round(rnorm(length(samples), mean = 1, sd = 0.25), digits = 1) +norm.dist <- sort(norm.dist) + +library(parallel) +library(doSNOW) + +time.start <- Sys.time() +coreNumber <- 12 +cl <- makeCluster(coreNumber-1) +registerDoSNOW(cl) + +dt.out <- foreach (i = 1:nrow(dt), .combine=rbind) %dopar% { + tmp <- gsub(":|,", "#", dt[i, 7]) + tmp <- unlist(strsplit(tmp, "#")[[1]]) + tmp <- data.frame(matrix(tmp, ncol = 2, byrow = T), stringsAsFactors = F) + tmp[, 2] <- as.numeric(tmp[, 2]) + tmp <- tmp[tmp$X1 %in% samples, ] + if(nrow(tmp) == 0){ + dt.here <- data.frame("repeatID" = "", "motif" = "", "outliers" = "", "size" = "", "ref" = "", + "chr" = "", "start" = "", "end" = "")[-1, ] + }else{ + if(nrow(tmp) < length(norm.dist)){ + dt.tmp <- norm.dist[-c(1:nrow(tmp))] + sampleIDs <- c(paste0("Sim", 1:length(dt.tmp)), tmp$X1) + dt.tmp <- c(dt.tmp, tmp$X2) + }else{ + sampleIDs <- tmp$X1 + dt.tmp <- tmp$X2 + } + + ref <- as.numeric(names(which.max(table(dt.tmp)))) + range <- max(2*ref, quantile(dt.tmp, 0.95, na.rm = T) - quantile(dt.tmp, 
0.05, na.rm = T)) + + scan <- dbscan::dbscan(matrix(dt.tmp), eps = range, minPts = ceiling(log2(length(samples)))) + + if(length(unique(scan$cluster)) == 1 | sum(scan$cluster == 0) == 0){ + cutoff <- Inf + }else{ + cutoff <- max(dt.tmp[scan$cluster != 0]) + cutoff <- ifelse(cutoff < 2, 2, cutoff) + } + + if(sum(dt.tmp > cutoff) > 0){ + sim.samples <- grep("Sim", sampleIDs) + if(length(sim.samples) > 0){ + dt.tmp <- dt.tmp[-sim.samples] + sampleIDs <- sampleIDs[-sim.samples] + } + + outliers <- paste(sampleIDs[dt.tmp > cutoff], collapse=";") + size <- paste(dt.tmp[dt.tmp > cutoff], collapse=";") + dt.here <- data.frame("repeatID" = dt$varid[i], "motif" = dt$V4[i], outliers, size, ref, + "chr" = dt$V1[i], "start" = dt$V2[i], "end" = dt$V3[i], stringsAsFactors = F) + }else{ + dt.here <- data.frame("repeatID" = "", "motif" = "", "outliers" = "", "size" = "", "ref" = "", + "chr" = "", "start" = "", "end" = "", stringsAsFactors = F)[-1, ] + } + } + + dt.here +} + + +get1000Freq <- function(outlier, fam.1000g){ + outlier <- strsplit(outlier, ";")[[1]] + count <- sum(unique(outlier) %in% fam.1000g) + return(round(count/length(fam.1000g) * 100, digits = 2)) +} + +dt.out$a1000g_freq_perc <- sapply(as.character(dt.out$outliers), get1000Freq, a1000g) + +stopCluster(cl) +#write.table(dt.out, sprintf("%sEHdn.expansions.%s.tsv", outpath, Sys.Date()), sep="\t", row.names=F, col.names=T, quote=F) +write.table(dt.out, params$exp, sep="\t", row.names=F, col.names=T, quote=F) +time.end <- Sys.time() +print(difftime(time.end, time.start)) + +#print("Printing sessioninfo from DBSCAN.EHdn.parallel.R") +#sessionInfo() diff --git a/scripts/str/ExpansionAnalysisFunctions.R b/scripts/str/ExpansionAnalysisFunctions.R new file mode 100755 index 0000000..44a3dfd --- /dev/null +++ b/scripts/str/ExpansionAnalysisFunctions.R @@ -0,0 +1,166 @@ +get1000Freq <- function(outlier, fam.1000g){ + outlier <- strsplit(outlier, ";")[[1]] + count <- sum(unique(outlier) %in% fam.1000g) + return(round(count/length(fam.1000g) * 100, digits = 2)) +} + + +mergeLoci <- function(loci, mergeAny = T){ + + cnv.tmp <- loci + + firstIt <- T + olap <- data.frame() + while(nrow(olap) != 0 | firstIt){ + firstIt <- F + cnv.query <- GRanges(cnv.tmp$chr, IRanges(cnv.tmp$start, cnv.tmp$end), "*") + + olap <- data.frame(findOverlaps(cnv.query, cnv.query)) + olap <- olap[which(olap$queryHits < olap$subjectHits), ] + olap <- olap[!duplicated(olap$queryHits), ] + olap <- olap[!olap$queryHits %in% olap$subjectHits, ] + if(!mergeAny){ + olap <- olap[nchar(cnv.tmp$motif[olap$queryHits]) == nchar(cnv.tmp$motif[olap$subjectHits]), ] + } + + message(sprintf("%s found overlapped, from %s loci", nrow(olap), nrow(cnv.tmp))) + flush.console() + + if(nrow(olap) > 0){ + ids <- union(olap$queryHits, olap$subjectHits) + merge.dt <- data.frame() + for(i in 1:nrow(olap)){ + olap.th <- olap[i, ] + + olap.th[, c("qstart", "qend")] <- cnv.tmp[olap.th$queryHits, c("start", "end")] + olap.th[, c("sstart", "send")] <- cnv.tmp[olap.th$subjectHits, c("start", "end")] + + + repeatID <- paste(union(strsplit(cnv.tmp$repeatID[olap.th$queryHits], ";")[[1]], + strsplit(cnv.tmp$repeatID[olap.th$subjectHits], ";")[[1]]), collapse=";") + motif <- paste(union(strsplit(cnv.tmp$motif[olap.th$queryHits], ";")[[1]], + strsplit(cnv.tmp$motif[olap.th$subjectHits], ";")[[1]]), collapse = ";") + trf.id <- paste(na.omit(union(strsplit(cnv.tmp$trf.id[olap.th$queryHits], ";")[[1]], + strsplit(cnv.tmp$trf.id[olap.th$subjectHits], ";")[[1]])), collapse = ";") + reciprocalIdentity <- 
max(cnv.tmp$reciprocalIdentity[olap.th$queryHits], cnv.tmp$reciprocalIdentity[olap.th$subjectHits], na.rm =T) + length.diff <- 0 + + merge.dt <- rbind(merge.dt, data.frame(repeatID, "chr" = cnv.tmp$chr[olap.th$queryHits], + "start" = pmin(olap.th$qstart, olap.th$sstart), "end" = pmax(olap.th$qend, olap.th$send), + motif, trf.id, reciprocalIdentity, length.diff)) + } + + + cnv.tmp <- cnv.tmp[-(ids), ] + cnv.tmp <- rbind(cnv.tmp, merge.dt) + } + + gc(); + } + + return(cnv.tmp) +} + +mapOutlierToMergeLoci <- function(outlier.loci, merge.loci){ + tmp <- unique(as.numeric(unlist(sapply(outlier.loci$repeatID, grep, merge.loci$repeatID)))) + merge.loci <- merge.loci[tmp, ] + + for(i in 1:nrow(merge.loci)){ + repeatIDs <- strsplit(merge.loci$repeatID[i], ";")[[1]] + outliers <- paste(unique(unlist(sapply(outlier.loci$outliers[outlier.loci$repeatID %in% repeatIDs], strsplit, ";"))), collapse = ";") + merge.loci$outliers[i] <- outliers + } + + return(merge.loci) +} + +getOutlierMergeData <- function(i, ehdn.result.m, fam.data){ + ehdn.rec <- ehdn.result.m[i, ] + outliers <- strsplit(ehdn.rec$outliers, ";")[[1]] + + + ehdn.data <- fam.data[fam.data$ID %in% outliers, c("ID", "Affection")] + if(nrow(ehdn.data) == 0){ + return(NULL) + }else{ + + ehdn.data[, c("repeatID", "motif", "chr", "start", "end", "trf.id", "gene_symbol", "entrez_id", "typeseq_priority")] <- + ehdn.rec[, c("repeatID", "motif", "chr", "start", "end", "trf.id", "gene_symbol", "entrez_id", "typeseq_priority")] + } + return(ehdn.data) +} + + +getCountTable <- function(dt, label, label.list, fam.data, ehdn.genotype.data, cov){ + if(nrow(dt) == 0){ + return(NA) + }else{ + dt <- dt[dt$ID %in% fam.data$ID, ] + + if(sum(names(fam.data) %in% cov) > 0) + cov.dt <- unique(fam.data[, c("ID", names(fam.data)[names(fam.data) %in% cov])]) + + table.tmp <- aggregate(repeatID ~ ID, dt, length) + names(table.tmp)[2] <- "ExpansionCount" + + table.tmp <- merge(table.tmp, fam.data[, c("ID", label)], by.x = "ID", by.y = "ID", all = T) + table.tmp[is.na(table.tmp)] <- 0 + table.tmp[, label] <- ifelse(table.tmp[, label] == label.list[1], 1, 0) + table.tmp[, label] <- factor(table.tmp[, label]) + if(sum(names(fam.data) %in% cov) > 0) + table.tmp <- merge(table.tmp, cov.dt, by.x = "ID", by.y = "ID", all.x = T) + table.tmp <- merge(table.tmp, ehdn.genotype.data, by = "ID", all.x = T) + table.tmp[is.na(table.tmp)] <- 0 + return(table.tmp) + } +} + +testGLM <- function(dt, label, cov, feature, standardization = T){ + if(is.null(nrow(dt))){ + return(list("OR" = NA, "upper" = NA, "lower" = NA, "pvalue" = NA)) + }else if (length(unique(na.omit(dt[, label]))) < 2){ + return(list("OR" = NA, "upper" = NA, "lower" = NA, "pvalue" = NA)) + }else{ + for(cov.each in cov){ + if(length(unique(dt[, cov.each])) < 2){ + cov <- cov[-which(cov == cov.each)] + } + } + + if(standardization){ + dt[, feature] <- scale(dt[, feature]) + } + ref <- paste0(label, " ~ ", paste(cov, collapse = " + ")) + add <- paste0(ref, " + ", feature) + ref.lm <- glm(ref, dt, family = binomial(link = "logit")) + add.lm <- glm(add, dt, family = binomial(link = "logit")) + ano <- anova(ref.lm, add.lm, test = "Chisq") + + or <- exp(add.lm$coefficients[feature]) + pvalue <- ano$`Pr(>Chi)`[2] + + conf <- confint(add.lm) + low <- exp(conf[feature, 1]) + up <- exp(conf[feature, 2]) + + return(list("OR" = or, "upper" = up, "lower" = low, "pvalue" = pvalue)) + } +} + + + +testByAggregate <- function(repeatIDs, tmp.f, fam.data, ehdn.genotype.data, cov, standardization = F){ + caseCtrl.dt <- 
tmp.f[which(tmp.f$repeatID %in% repeatIDs & + tmp.f$Affection %in% 1:2), ] + + feature <- "ExpansionCount" + + all.table <- getCountTable(caseCtrl.dt, "Affection", + c(2, 1), + fam.data, ehdn.genotype.data, cov) + + res <- testGLM(all.table, "Affection", cov, feature, standardization) + + return(data.frame(res, "affected.count" = sum(all.table$Affection == 1 & all.table$ExpansionCount > 0), + "unaffected.count" = sum(all.table$Affection == 0 & all.table$ExpansionCount > 0))) +} diff --git a/scripts/str/__init__.py b/scripts/str/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/scripts/add_gene+threshold_to_EH_column_headings2.py b/scripts/str/add_gene+threshold_to_EH_column_headings2.py similarity index 100% rename from scripts/add_gene+threshold_to_EH_column_headings2.py rename to scripts/str/add_gene+threshold_to_EH_column_headings2.py diff --git a/scripts/str/combine_counts.py b/scripts/str/combine_counts.py new file mode 100755 index 0000000..336f593 --- /dev/null +++ b/scripts/str/combine_counts.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 + +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import argparse +import logging +import sys +import json + +from core import regiontools, common + + +def load_parameters(): + '''Capture command-line parameters. 
+ + ''' + parser = argparse.ArgumentParser( + description='combine counts of in-repeat reads across multiple samples') + parser.add_argument( + '--manifest', + help='TSV file with id, case/control status, and path of each sample', + required=True) + parser.add_argument( + '--combinedCounts', + help='file with combined counts of anchored in-repeat reads', + required=True) + parser.add_argument( + '--minUnitSize', + help='size of smallest repeat unit to process (in bp)', + default=2, type=int, required=False) + parser.add_argument( + '--maxUnitSize', + help='size of the largest repeat unit to process (in bp)', + default=20, type=int, required=False) + args = parser.parse_args() + + parameter_encoding = ' '.join(sys.argv[1:]) + logging.info('Starting with these parameters: %s', parameter_encoding) + + return {'manifest_path': args.manifest, 'output_path': args.combinedCounts, + 'min_unit_size': args.minUnitSize, + 'max_unit_size': args.maxUnitSize} + + +def merge_combined_regions(combined_air_regions): + for unit in combined_air_regions: + combined_air_regions[unit].merge() + + +def process_sample(parameters, denovo_json, sample_id, combined_air_regions, + combined_pir_regions): + min_unit_size = parameters['min_unit_size'] + max_unit_size = parameters['max_unit_size'] + + for unit in denovo_json: + if not min_unit_size <= len(unit) <= max_unit_size: + continue + + if unit == 'Depth' or unit == 'ReadLength': + continue + + if 'RegionsWithIrrAnchors' in denovo_json[unit]: + anc_irr_record = denovo_json[unit]['RegionsWithIrrAnchors'] + current_regions = regiontools.create_region_collection_from_denovo_record( + sample_id, anc_irr_record) + + if unit not in combined_air_regions: + combined_air_regions[unit] = regiontools.RegionCollection() + combined_air_regions[unit].extend(current_regions) + + if denovo_json[unit]['IrrPairCount']: + num_irr_pairs = denovo_json[unit]['IrrPairCount'] + if unit not in combined_pir_regions: + combined_pir_regions[unit] = {} + combined_pir_regions[unit][sample_id] = num_irr_pairs + + +def process_samples(parameters, samples, sample_depths): + combined_air_regions = {} + combined_pir_regions = {} + + for sample_num, sample in enumerate(samples): + logging.info('Processing sample %s', sample.id) + with open(sample.path, 'r') as json_file: + denovo_json = json.load(json_file) + sample_depths[sample.id] = denovo_json['Depth'] + process_sample(parameters, denovo_json, sample.id, + combined_air_regions, + combined_pir_regions) + + if sample_num > 0 and sample_num % 50 == 0: + logging.info('Merging regions') + merge_combined_regions(combined_air_regions) + + logging.info('Performing final merge') + merge_combined_regions(combined_air_regions) + + return {'air': combined_air_regions, 'pir': combined_pir_regions} + + +def normalize_count(sample_depth, count): + target_depth = 40 + return target_depth * count / sample_depth + + +def normalize_counts(sample_depths, combined_regions): + for unit, regions in combined_regions['air'].items(): + for region in regions: + for sample_id, count in region.feature_counts._count_dict.items(): + sample_depth = sample_depths[sample_id] + region.feature_counts._count_dict[sample_id] = normalize_count( + sample_depth, count) + for unit, sample_counts in combined_regions['pir'].items(): + for sample_id, count in sample_counts.items(): + sample_depth = sample_depths[sample_id] + combined_regions['pir'][unit][sample_id] = normalize_count( + sample_depth, count) + + +def create_json(combined_regions): + output_json = {} + for unit, regions in 
combined_regions['air'].items(): + output_json[unit] = {} + output_json[unit]['RegionsWithIrrAnchors'] = regions.as_dict() + + for unit, counts in combined_regions['pir'].items(): + if unit not in output_json: + output_json[unit] = {} + output_json[unit]['IrrPairCounts'] = counts + + return output_json + + +def write_json(parameters, output_json): + with open(parameters['output_path'], 'w') as output_file: + json_encoding = json.dumps(output_json, sort_keys=True, + indent=4, separators=(',', ': ')) + print(json_encoding, file=output_file) + + +def main(): + common.init_logger() + parameters = load_parameters() + samples = common.load_manifest(parameters['manifest_path']) + sample_depths = {} + combined_regions = process_samples(parameters, samples, sample_depths) + normalize_counts(sample_depths, combined_regions) + output_json = create_json(combined_regions) + write_json(parameters, output_json) + + +if __name__ == '__main__': + main() diff --git a/scripts/str/common.py b/scripts/str/common.py new file mode 100755 index 0000000..85ac2e3 --- /dev/null +++ b/scripts/str/common.py @@ -0,0 +1,129 @@ +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import collections +import logging +import json +import scipy.stats as stats + +from . import regiontools + +from .wilcoxontest import wilcoxon_rank_sum_test + + +def init_logger(): + logging.basicConfig(format='%(asctime)s: %(message)s', level=logging.INFO) + + +def load_manifest(manifest_path): + '''Extract sample information from a manifest file. 
+ + ''' + + # pylint: disable=I0011,C0103 + Sample = collections.namedtuple('Sample', 'id status path') + + samples = [] + with open(manifest_path, 'r') as manifest_file: + for line in manifest_file: + sample_id, status, path = line.split() + sample = Sample(id=sample_id, status=status, path=path) + samples.append(sample) + return samples + + +def load_combined_json(json_path): + logging.info('Loading %s', json_path) + with open(json_path, 'r') as json_file: + combined_json = json.load(json_file) + return combined_json + + +def filter_counts_by_magnitude(count_table, count_cutoff): + filtered_count_table = [] + for row in count_table: + max_count = max(count for _, count in row['sample_counts'].items()) + if max_count >= count_cutoff: + filtered_count_table.append(row) + + return filtered_count_table + + +def filter_counts_by_region(count_table, target_regions): + filtered_count_table = [] + for row in count_table: + chrom, start, end = row['region'].replace(':', '-').split('-') + start, end = int(start), int(end) + region = regiontools.Region(chrom, start, end) + overlaps_target_region = any(regiontools.compute_distance( + region, target) == 0 for target in target_regions) + + if overlaps_target_region: + filtered_count_table.append(row) + + return filtered_count_table + + +def filter_counts(count_table, count_cutoff, target_regions=None): + filtered_count_table = filter_counts_by_magnitude( + count_table, count_cutoff) + if target_regions: + filtered_count_table = filter_counts_by_region( + filtered_count_table, target_regions) + return filtered_count_table + + +def extract_case_control_assignments(samples): + sample_status = {} + for sample in samples: + sample_status[sample.id] = sample.status + return sample_status + + +def test_samples(test_params, sample_status, sample_counts): + control_samples = [sample for sample, + status in sample_status.items() if status == 'control'] + case_samples = [sample for sample, + status in sample_status.items() if status == 'case'] + + control_counts = [sample_counts[s] + if s in sample_counts else 0 for s in control_samples] + + case_counts = [sample_counts[s] + if s in sample_counts else 0 for s in case_samples] + + pvalue = wilcoxon_rank_sum_test(test_params, case_counts, control_counts) + + return pvalue + + +def compare_counts(test_params, sample_status, count_table): + for row in count_table: + # Generate counts before testing + pvalue = test_samples(test_params, sample_status, row['sample_counts']) + row['pvalue'] = pvalue + + +def correct_pvalues(count_table): + num_tests = len(count_table) + for row in count_table: + row['bonf_pvalue'] = min(row['pvalue'] * num_tests, 1.0) diff --git a/scripts/str/compare_anchored_irrs.py b/scripts/str/compare_anchored_irrs.py new file mode 100755 index 0000000..9806083 --- /dev/null +++ b/scripts/str/compare_anchored_irrs.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 + +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import argparse +import logging +import sys +import json + +from core import regiontools, common + + +def load_parameters(): + '''Capture command-line parameters. + + ''' + parser = argparse.ArgumentParser( + description='compare counts of anchored in-repeat reads') + + # Add required arguments. + manifest_help = 'TSV file with id, case/control status, and path of each sample' + input_help = 'JSON file with combined counts of anchored in-repeat reads' + output_help = ('BED file with regions containing anchored in-repeat reads and associated' + ' p-values') + required_arg_group = parser.add_argument_group('required arguments') + required_arg_group.add_argument( + '--manifest', help=manifest_help, required=True) + required_arg_group.add_argument( + '--inputCounts', help=input_help, required=True) + required_arg_group.add_argument( + '--outputRegions', help=output_help, required=True) + + # Add optional arguments. + def_min_count = 5 + min_count_help = ('minimum number reads in a region for downstream analysis' + ' (default: {})').format(def_min_count) + target_regions_help = 'BED file with regions to which analysis should be restricted' + def_test_method = 'normal' + test_method_help = 'Method of calculating Wilcoxon Rank-Sum Test p-value (default: {})'.format( + def_test_method) + def_num_resamples = 1000000 + num_resamples_help = 'Number of iterations for the resampling test (default: {})'.format( + def_num_resamples) + parser.add_argument( + '--minCount', help=min_count_help, default=def_min_count, type=int) + parser.add_argument('--targetRegions', + help=target_regions_help, default=None) + parser.add_argument('--testMethod', help=test_method_help, + choices=['normal', 'resample'], default=def_test_method) + parser.add_argument( + '--numResamples', help=num_resamples_help, default=def_num_resamples, type=int) + + args = parser.parse_args() + + parameter_encoding = ' '.join(sys.argv[1:]) + logging.info('Starting with these parameters: %s', parameter_encoding) + + test_params = {'method': args.testMethod, + 'num_resamples': args.numResamples} + return {'manifest_path': args.manifest, 'counts_path': args.inputCounts, + 'min_count': args.minCount, 'output_path': args.outputRegions, + 'target_regions': args.targetRegions, 'test_params': test_params} + + +def generate_count_table(combined_counts): + count_table = [] + for unit, rec in combined_counts.items(): + if 'RegionsWithIrrAnchors' not in rec: + continue + + for region, sample_counts in rec['RegionsWithIrrAnchors'].items(): + table_row = {'region': region, 'unit': unit} + table_row['sample_counts'] = sample_counts + count_table.append(table_row) + + logging.info('Loaded %i regions', len(count_table)) + return count_table + + +def load_target_regions(fname): + logging.info('Loading target regions from %s', fname) + regions = [] + with open(fname, 'r') as bed_file: + for line in bed_file: + chrom, start, end, *_ = line.split() + start, end = int(start), int(end) + region = regiontools.Region(chrom, start, end) + regions.append(region) + return regions + + +def output_results(count_table, output_path): + with open(output_path, 'w') as output_file: + for row in count_table: + chrom, start, end = row['region'].replace(':', '-').split('-') + unit = row['unit'] + pvalue, bonf_pvalue = row['pvalue'], row['bonf_pvalue'] + + sample_counts = row['sample_counts'] + encoded_counts = ['{}:{}'.format(s, c) + for s, c in 
sample_counts.items()] + encoded_counts = ','.join(encoded_counts) + print(chrom, start, end, unit, pvalue, bonf_pvalue, encoded_counts, + sep='\t', file=output_file) + + +def main(): + common.init_logger() + parameters = load_parameters() + combined_counts = common.load_combined_json(parameters['counts_path']) + samples = common.load_manifest(parameters['manifest_path']) + target_regions = None + if parameters['target_regions']: + target_regions = load_target_regions(parameters['target_regions']) + count_table = generate_count_table(combined_counts) + count_table = common.filter_counts( + count_table, parameters['min_count'], target_regions) + logging.info('%i regions left after initial filtering', len(count_table)) + sample_status = common.extract_case_control_assignments(samples) + common.compare_counts( + parameters['test_params'], sample_status, count_table) + common.correct_pvalues(count_table) + output_results(count_table, parameters['output_path']) + logging.info('Done') + + +if __name__ == '__main__': + main() diff --git a/scripts/str/core/__init__.py b/scripts/str/core/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/scripts/str/core/common.py b/scripts/str/core/common.py new file mode 100755 index 0000000..85ac2e3 --- /dev/null +++ b/scripts/str/core/common.py @@ -0,0 +1,129 @@ +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import collections +import logging +import json +import scipy.stats as stats + +from . import regiontools + +from .wilcoxontest import wilcoxon_rank_sum_test + + +def init_logger(): + logging.basicConfig(format='%(asctime)s: %(message)s', level=logging.INFO) + + +def load_manifest(manifest_path): + '''Extract sample information from a manifest file. 
+ + ''' + + # pylint: disable=I0011,C0103 + Sample = collections.namedtuple('Sample', 'id status path') + + samples = [] + with open(manifest_path, 'r') as manifest_file: + for line in manifest_file: + sample_id, status, path = line.split() + sample = Sample(id=sample_id, status=status, path=path) + samples.append(sample) + return samples + + +def load_combined_json(json_path): + logging.info('Loading %s', json_path) + with open(json_path, 'r') as json_file: + combined_json = json.load(json_file) + return combined_json + + +def filter_counts_by_magnitude(count_table, count_cutoff): + filtered_count_table = [] + for row in count_table: + max_count = max(count for _, count in row['sample_counts'].items()) + if max_count >= count_cutoff: + filtered_count_table.append(row) + + return filtered_count_table + + +def filter_counts_by_region(count_table, target_regions): + filtered_count_table = [] + for row in count_table: + chrom, start, end = row['region'].replace(':', '-').split('-') + start, end = int(start), int(end) + region = regiontools.Region(chrom, start, end) + overlaps_target_region = any(regiontools.compute_distance( + region, target) == 0 for target in target_regions) + + if overlaps_target_region: + filtered_count_table.append(row) + + return filtered_count_table + + +def filter_counts(count_table, count_cutoff, target_regions=None): + filtered_count_table = filter_counts_by_magnitude( + count_table, count_cutoff) + if target_regions: + filtered_count_table = filter_counts_by_region( + filtered_count_table, target_regions) + return filtered_count_table + + +def extract_case_control_assignments(samples): + sample_status = {} + for sample in samples: + sample_status[sample.id] = sample.status + return sample_status + + +def test_samples(test_params, sample_status, sample_counts): + control_samples = [sample for sample, + status in sample_status.items() if status == 'control'] + case_samples = [sample for sample, + status in sample_status.items() if status == 'case'] + + control_counts = [sample_counts[s] + if s in sample_counts else 0 for s in control_samples] + + case_counts = [sample_counts[s] + if s in sample_counts else 0 for s in case_samples] + + pvalue = wilcoxon_rank_sum_test(test_params, case_counts, control_counts) + + return pvalue + + +def compare_counts(test_params, sample_status, count_table): + for row in count_table: + # Generate counts before testing + pvalue = test_samples(test_params, sample_status, row['sample_counts']) + row['pvalue'] = pvalue + + +def correct_pvalues(count_table): + num_tests = len(count_table) + for row in count_table: + row['bonf_pvalue'] = min(row['pvalue'] * num_tests, 1.0) diff --git a/scripts/str/core/regiontools.py b/scripts/str/core/regiontools.py new file mode 100755 index 0000000..e935edf --- /dev/null +++ b/scripts/str/core/regiontools.py @@ -0,0 +1,180 @@ +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import sys +import copy + + +class FeatureCounts(object): + def __init__(self, count_dict): + self._count_dict = count_dict + + def __len__(self): + return len(self._count_dict) + + def __getitem__(self, sample_id): + return self._count_dict[sample_id] + + def __repr__(self): + encoding = ('{}={}'.format(s, self._count_dict[s]) + for s in sorted(self._count_dict)) + encoding = ','.join(encoding) + return encoding + + def __iter__(self): + for sample, count in self._count_dict.items(): + yield (sample, count) + + def __eq__(self, other): + # pylint: disable=I0011,W0212 + return self._count_dict == other._count_dict + + def combine(self, other): + # pylint: disable=I0011,W0212 + for sample, count in other._count_dict.items(): + if sample in self._count_dict: + self._count_dict[sample] += count + else: + self._count_dict[sample] = count + + +class Region(object): + def __init__(self, chrom=None, start=0, end=0, feature_counts=None): + self.chrom = chrom + self.start = start + self.end = end + self.feature_counts = feature_counts + + def is_unset(self): + return self.chrom is None + + def __repr__(self): + region_repr = '{}:{}-{}'.format(self.chrom, self.start, self.end) + if self.feature_counts: + region_repr += '({})'.format(self.feature_counts) + return region_repr + + def __eq__(self, other): + return (self.chrom == other.chrom and self.start == other.start and + self.end == other.end and + self.feature_counts == other.feature_counts) + + def __lt__(self, other): + self_tuple = (self.chrom, self.start, self.end) + other_tuple = (other.chrom, other.start, other.end) + return self_tuple < other_tuple + + +def compute_distance(region_a, region_b): + distance = sys.maxsize + if region_a.is_unset() or region_b.is_unset(): + raise Exception('Cannot compute distance between unset regions') + + if region_a.chrom == region_b.chrom: + if region_a.end < region_b.start: # Is region_a left of region_b? + distance = region_b.start - region_a.end + elif region_b.end < region_a.start: # Is region_b left or region_a? + return region_a.start - region_b.end + else: # If neither is true, the regions must overlap. 
+ distance = 0 + + return distance + + +class RegionCollection(object): + def __init__(self, regions=None): + self._regions = regions + if self._regions is None: + self._regions = [] + + def __len__(self): + return len(self._regions) + + def __getitem__(self, index): + return self._regions[index] + + def __repr__(self): + encoding = '\n'.join(repr(r) for r in self._regions) + return encoding + + def __eq__(self, other): + # pylint: disable=I0011,W0212 + for region_a, region_b in zip(self._regions, other._regions): + if region_a != region_b: + return False + return True + + def __iter__(self): + for region in self._regions: + yield region + + def as_dict(self): + region_dict = {} + for region in self._regions: + region_encoding = '{}:{}-{}'.format(region.chrom, + region.start, region.end) + region_dict[region_encoding] = dict(region.feature_counts) + return region_dict + + def merge(self, max_dist=500): + self._regions.sort() + + merged_regions = [] + aggregate_region = Region() + + for region in self._regions: + if aggregate_region.is_unset(): + aggregate_region = region + continue + + if compute_distance(aggregate_region, region) <= max_dist: + aggregate_region.end = max(aggregate_region.end, region.end) + aggregate_region.feature_counts.combine(region.feature_counts) + else: + merged_regions.append(aggregate_region) + aggregate_region = region + + if not aggregate_region.is_unset(): + merged_regions.append(aggregate_region) + + self._regions = merged_regions + + def extend(self, other): + # pylint: disable=I0011,W0212 + self._regions += other._regions + + +def create_region_from_denovo_record(sample_id, region_count): + region_encoding, count = region_count + region_encoding = region_encoding.replace(':', '-').split('-') + chrom, start, end = region_encoding + start, end = int(start), int(end) + feture_counts = FeatureCounts({sample_id: count}) + return Region(chrom, start, end, feture_counts) + + +def create_region_collection_from_denovo_record(sample_id, denovo_record): + regions = [] + for region_count in denovo_record.items(): + region = create_region_from_denovo_record(sample_id, region_count) + regions.append(region) + return RegionCollection(regions) diff --git a/scripts/str/core/wilcoxontest.py b/scripts/str/core/wilcoxontest.py new file mode 100755 index 0000000..9035d9b --- /dev/null +++ b/scripts/str/core/wilcoxontest.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + +import numpy as np +import scipy.special +import scipy.stats + + +def calculate_approximate_pvalue(cases, controls): + all_counts = cases + controls + ranks = scipy.stats.rankdata(all_counts) + case_rank_sum = np.sum(ranks[:len(cases)]) + + mu_cases = len(cases) * (len(cases) + len(controls) + 1) / 2 + sigma_cases = np.sqrt(len(cases) * len(controls) * + (len(cases) + len(controls) + 1) / 12) + z_cases = (case_rank_sum - mu_cases) / sigma_cases + + return 1 - scipy.stats.norm.cdf(z_cases) + + +def calculate_permutation_pvalue(cases, controls, num_permutations): + all_counts = cases + controls + ranks = scipy.stats.rankdata(all_counts) + num_cases = len(cases) + true_case_rank_sum = np.sum(ranks[:num_cases]) + + permuted_case_ranks = np.random.choice( + ranks, size=(num_permutations, num_cases)) + permuted_case_rank_sums = np.sum(permuted_case_ranks, axis=1) + + num_case_rank_sums_as_extreme_as_true = np.sum( + permuted_case_rank_sums >= true_case_rank_sum) + + return (num_case_rank_sums_as_extreme_as_true + 1) / (num_permutations + 1) + + +def wilcoxon_rank_sum_test(test_params, cases, controls): + test_method = test_params['method'] + if test_method == 'normal': + return calculate_approximate_pvalue(cases, controls) + elif test_method == 'resample': + return calculate_permutation_pvalue(cases, controls, test_params['num_resamples']) + else: + assert False, '{} is an unknown method type'.format(test_method) diff --git a/scripts/str/crg.ehdn.sh b/scripts/str/crg.ehdn.sh new file mode 100644 index 0000000..99b19fc --- /dev/null +++ b/scripts/str/crg.ehdn.sh @@ -0,0 +1,88 @@ +#PBS -N EHDN +#PBS -l vmem=80g,mem=80g,walltime=24:00:00 +#PBS -joe +#PBS -d . + +if [ -z $EHDN ] +then + exit; +fi + +if [ -z ${g1k_manifest} ] +then + exit; +fi + +if [ -z $ref ] +then + exit; +fi + +if [ -z $script_dir ] +then + exit; +fi + +scripts="${script_dir}"; + + + +if [ -z $family ]; then family=$1; fi; +if [ -z $pipeline ]; then pipeline=$2; fi #pipeline is crg or crg2 + +if [ $pipeline == "crg" ]; then + bcbio="${family}/bcbio-align/${family}/final/${family}_*"; + if [ -z "`ls ${bcbio}/${family}_*-ready.bam`" ]; then + if [ -z "`ls ${family}/${family}*-ready.bam`" ]; then + echo "BAM files for $family not found inside bcbio-align/ and $family/. Exiting!"; + exit + else + dir="$family/${family}*-ready.bam"; + fi; + else + dir="$bcbio/${family}*-ready.bam"; + fi; + outdir="${family}/str/expansion_hunter_denovo"; +else + dir="decoy_rm/*.bam" + outdir="str/EHDN"; +fi; + +manifest="${outdir}/${family}_manifest.txt"; +echo "Running for pipeline: ${pipeline} with directories => ${outdir}, ${dir}, $family"; + +for i in `ls $dir`; do + + if [ ! -d "$outdir" ]; then + mkdir -p $outdir; + fi; + if [ $pipeline == "crg" ]; then + prefix=`basename $i -ready.bam`; + else + prefix=`basename $i .bam | cut -d "." 
-f1`; + fi + reads=`readlink -f $i`; + ehdn_prefix="${outdir}/${prefix}"; + echo -e "STEP1: ExpansionHunterDenovo-v0.7.0 --reference $ref --reads $reads --output-prefix $ehdn_prefix --min-anchor-mapq 50 --max-irr-mapq 40 \n" + $EHDN --reference $ref --reads $reads --output-prefix $ehdn_prefix --min-anchor-mapq 50 --max-irr-mapq 40 & + echo -e "${prefix}\tcase\t${ehdn_prefix}.json" >> $manifest; + +done; + +wait + +cat $manifest ${g1k_manifest} > temp && mv temp $manifest; + +##EHDN report per family +echo "STEP2: generating multi-sample EHDN STR profile and report named ${outdir}/${family}_EHDN_str.tsv" +#module load python/3.7.7 + +countsfile="${outdir}/${family}_counts.txt"; +profile="${outdir}/${family}_EHDN_str.tsv"; +python ${scripts}/combine_counts.py --manifest $manifest --combinedCounts $countsfile +python ${scripts}/compare_anchored_irrs.py --manifest $manifest --inputCounts $countsfile --outputRegions $profile --minCount 2 --testMethod normal +#module unload python/3.7.7 + +##add chr to chromosome column +echo "STEP2.1: add chr prefix to first column ${outdir}/${family}_EHDN_str.tsv" +awk -vFS="\t" -vOFS="\t" '{ if(!($1~/^chr/)) {$1="chr"$1;} print $0; }' ${profile} > temp && mv temp ${profile} diff --git a/scripts/eh_sample_report.py b/scripts/str/eh_sample_report.py similarity index 95% rename from scripts/eh_sample_report.py rename to scripts/str/eh_sample_report.py index da49d36..ccabcb9 100755 --- a/scripts/eh_sample_report.py +++ b/scripts/str/eh_sample_report.py @@ -11,7 +11,7 @@ from xlsxwriter.workbook import Workbook annot_tsv = sys.argv[1] #output from add_gene+threshold_to_EH_column_headings2.py -g1000 = sys.argv[2] #median/mean/std GT sizes from 1000G EH +g1000 = sys.argv[2] xlsx = sys.argv[3] #output xlsx filename def outlier_gt(threshold, gt_dict): @@ -23,6 +23,8 @@ def outlier_gt(threshold, gt_dict): y = x.split("/") if "/" in x else x if not "." in y: y = max(list(map(int,y))) if isinstance(y,list) else int(y) + else: + return " " if y: if "(" in threshold or " " in threshold: continue @@ -54,7 +56,7 @@ def outlier_gt(threshold, gt_dict): with open(g1000) as f: for i in f: if not i.startswith("#"): - annot, mean, std, median = i.strip().split("\t") + annot, mean, median, std = i.strip().split("\t") if not annot in G1K: G1K[annot] = [mean, std, median] @@ -93,7 +95,6 @@ def outlier_gt(threshold, gt_dict): worksheet.write_row(0, 0, header) row = 1 for i in trf: - # for i in eh_gt[sample]: info = trf[i][samples[0]] content = [info.pos, info.motif, info.gene, info.size] content += [ trf[i][s].gt for s in samples ] diff --git a/scripts/str/ehdn_report.sh b/scripts/str/ehdn_report.sh new file mode 100644 index 0000000..24b187d --- /dev/null +++ b/scripts/str/ehdn_report.sh @@ -0,0 +1,63 @@ +#PBS -N EHDN_REPORT +#PBS -l vmem=80g,mem=80g,walltime=48:00:00 +#PBS -joe +#PBS -d . 
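+#
+# Stand-alone wrapper for the per-family EHDN outlier/annotation workflow; the same
+# steps are now split across the EHDN_mark_outliers, EHDN_DBSCAN_outlier,
+# EHDN_merge_expansions and EHDN_annovar rules in rules/str.smk.
+# Usage: sh ehdn_report.sh <family> <outdir>
+# Steps: find_outliers.py -> DBSCAN.EHdn.parallel.R -> mergeExpansions.R ->
+#        format_for_annovar.py -> table_annovar.pl (OMIM and gnomAD xref) -> format_from_annovar.py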
+ + +scripts=~/crg/str; ###remove### +EHDN_files=/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo; ###remove### +g1k_outlier="${EHDN_files}/1000G_outlier"; +g1k_manifest="${EHDN_files}/manifest.1000G.txt"; +omim=${EHDN_files}/OMIM_hgnc_join_omim_phenos_2023-06-22.tsv; +gnomad=${EHDN_files}/gnomad.v2.1.1.lof_metrics.by_gene.txt; +annovar=/hpf/largeprojects/ccm_dccforge/dccdipg/Common/crg2-non-conda-tools/annovar; ###remove### +annovar_db=/hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/annovar/humandb/; ###remove### + +family=$1; +outdir=$2; + +##find outlier(noise) samples in current cohort to be removed later +echo "STEP1: write outlier samples to a file" +profile="${outdir}/${family}_EHDN_str.tsv"; +outliers="${outdir}/${family}_outliers.txt"; +echo -e "CMD: python ${scripts}/find_outliers.py ${profile} ${outliers}\n" +python ${scripts}/find_outliers.py ${profile} ${outliers} +cat ${outliers} ${g1k_outlier} > temp && mv temp ${outliers} + +##run outlier detection using DBSCAN clustering with 1000G as control +echo "STEP2: Outlier detection using DBSCAN clustering" +outpath="${outdir}/outliers" +echo -e "CMD: Rscript ${scripts}/DBSCAN.EHdn.parallel.R --infile ${profile} --outpath ${outpath} --outlierlist ${outliers}\n" +Rscript ${scripts}/DBSCAN.EHdn.parallel.R --infile ${profile} --outpath ${outpath} --outlierlist ${outliers} +outlier_tsv=`ls -t ${outpath}/EHdn.expansions*tsv | head -n1`; +echo -e "CMD: Rscript ${scripts}/mergeExpansions.R --ehdn ${profile} --outlier ${outlier_tsv} --outpath ${outpath}\n" +Rscript ${scripts}/mergeExpansions.R --ehdn ${profile} --outlier ${outlier_tsv} --outpath ${outpath} + + +#following Python script requires python3 and xlsxwriter +#installed in my conda env "str" +#source /hpf/largeprojects/ccmbio/aarthi/miniconda3/etc/profile.d/conda.sh +#conda activate str +##format above output for annovar annotation +echo "STEP3.1: format DBSCAN output for ANNOVAR annotation" +merge_tsv=`ls -t ${outpath}/merged.rare.expansions.[0-9]*.tsv | head -n1`; +echo -e "CMD:python ${scripts}/format_for_annovar.py ${outlier_tsv} ${merge_tsv} ${g1k_manifest}\n" +python ${scripts}/format_for_annovar.py ${outlier_tsv} ${merge_tsv} ${g1k_manifest} + +echo "STEP3.2: annotate with OMIM" +annovar_input=`ls -t ${outpath}/merged.rare.EHdn.expansion.[0-9]*tsv | head -n1`; #incase there > 1 file because of multiple runs +outfile=`echo ${merge_tsv} | awk '{ split($1,a,".tsv"); print a[1]; }'`; +echo -e "CMD: ${annovar}/table_annovar.pl ${annovar_input} ${annovar_db} -buildver hg19 -outfile \"${outfile}-OMIM\" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . -xreffile ${omim}\n" +${annovar}/table_annovar.pl ${annovar_input} ${annovar_db} -buildver hg19 -outfile "${outfile}-OMIM" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . -xreffile ${omim} + +echo "STEP3.3: annotate with gnoMAD" +echo -e "CMD: ${annovar}/table_annovar.pl ${annovar_input} ${annovar_db} -buildver hg19 -outfile \"${outfile}-gnoMAD\" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . -xreffile ${gnomad}\n" +${annovar}/table_annovar.pl ${annovar_input} ${annovar_db} -buildver hg19 -outfile "${outfile}-gnoMAD" -remove --onetranscript --otherinfo -protocol refGene -operation gx -nastring . 
-xreffile ${gnomad}
+
+echo "STEP3.4: format annotated reports to xlsx: EHDN.rare.merged.annotated.xlsx"
+gnomad_out=`ls -t ${outfile}-gnoMAD.hg19_multianno.txt | head -n1`
+omim_out=`ls -t ${outfile}-OMIM.hg19_multianno.txt | head -n1`
+date=`date +%Y-%m-%d`
+xlsx="${outpath}/${family}.EHDN.${date}.xlsx"
+echo -e "CMD: python ${scripts}/format_from_annovar.py ${gnomad_out} ${omim_out} ${xlsx}\n"
+python ${scripts}/format_from_annovar.py ${gnomad_out} ${omim_out} ${xlsx}
diff --git a/scripts/str/find_outliers.py b/scripts/str/find_outliers.py
new file mode 100755
index 0000000..db4de3f
--- /dev/null
+++ b/scripts/str/find_outliers.py
@@ -0,0 +1,38 @@
+import sys, numpy as np
+#from matplotlib import pyplot as plt
+
+
+infile = sys.argv[1]
+outfile = sys.argv[2]
+tr = {}
+rec = []
+with open(infile) as f:
+    for i in f:
+        i = i.strip().split("\t")
+        if any(s in i for s in ['motif', 'zscore', 'end', 'start']):
+            header = i
+        else:
+            depth = i[-1]
+            if "," in depth:
+                j = depth.split(",")
+                for item in j:
+                    k = item.split(":")[0]
+                    if not k in tr:
+                        tr[k] = 0
+                    tr[k] += 1
+            else:
+                k = depth.split(":")[0]
+                if not k in tr:
+                    tr[k] = 0
+                tr[k] += 1
+
+allcounts = [ counts for sample, counts in tr.items() if not sample.startswith(("HG","NA"))]
+stats = lambda l: (np.mean(l), np.std(l), np.median(l))
+allcounts_stats = stats(allcounts)
+not_outlier = lambda l, c: (l[0] - 3 * l[1]) < c and (l[0] + 3 * l[1]) > c
+with open(outfile, "w") as f:
+    for sample, counts in tr.items():
+        if not sample.startswith(("HG","NA")):
+            if not not_outlier(allcounts_stats, counts):
+                f.write("%s\n" % sample)  #one sample per line; the newline keeps the later cat with the 1000G outlier list line-separated
diff --git a/scripts/str/format_for_annovar.py b/scripts/str/format_for_annovar.py
new file mode 100755
index 0000000..dcdb0cc
--- /dev/null
+++ b/scripts/str/format_for_annovar.py
@@ -0,0 +1,137 @@
+import sys, os, re
+from collections import namedtuple
+
+'''
+merge output files of EHDN after running through DBSCAN script into one file
+get values for outlier, repeat size, a1000g_freq from EHdn.expansion..tsv
+and merge with merged.rare.expansion..tsv for ANNOVAR annotation
+manifest_1000G.txt:
+    EHDN manifest file (three-column: sampleid\tcase/control\tpath to EHDN JSON)
+    you can also provide a single-column file with 1000G sample ids; this is used
+    to remove rows whose only outlier samples are 1000G samples
+python format_for_annovar.py EHdn.expansions.2021-02-18.tsv merged.rare.expansions.2021-02-18.tsv manifest_1000G.txt
+'''
+
+#not using pandas.merge because the two files have some duplicate lines (chr#start#end)
+#merged.rare.expansion has altered intervals (merging nearby motifs) so won't exactly match those
+#in EHdn.expansion..tsv file entries
+
+ehdn_out = sys.argv[1]
+dbscan_out = sys.argv[2]
+g1k_manifest = sys.argv[3]
+#date = ehdn_out.split(".")[-2]
+d = os.path.dirname(dbscan_out)
+outfile = os.path.join(d,"merged.rare.EHdn.expansion.tsv")
+samples = []
+print("input files: ", ehdn_out, dbscan_out, g1k_manifest)
+print("out:", outfile)
+
+#read 1000G sample names from manifest file
+g1k = []
+with open(g1k_manifest) as f:
+    for i in f:
+        i = i.strip("\n")
+        if re.split("\t|' '",i)[0]:
+            g1k.append(re.split("\t|' '",i)[0])
+        else:
+            g1k.append(i)
+all_g1k_sample = lambda x: set(x).difference(g1k)
+
+#read DBSCAN output
+dbscan = namedtuple("dbscan", "chr start end motif outliers key")
+merged_exp = []
+with open(dbscan_out) as f:
+    f.readline()
+    for i in f:
+        i = i.strip().split("\t")
+        key = "#".join(i[0:3])
+        outlier = re.split(";",i[4])
+        if all_g1k_sample(outlier):
merged_exp.append(dbscan(chr=i[0],start=i[1],end=i[2],motif=i[3],outliers=i[4],key=key)) + for o in outlier: + if not o in g1k: + samples.append(o) + +samples = list(set(samples)) +print("samples = ", samples) +#read EHDN output +ehdn = namedtuple("ehdn", "motif outliers size ref chr start end a1000g key") +ehdn_exp = [] +with open(ehdn_out) as f: + f.readline() + for i in f: + i = i.strip().split("\t") + key = "#".join(i[5:8]) + ehdn_exp.append(ehdn(motif=i[1],outliers=i[2],size=i[3],ref=i[4],chr=i[5],start=i[6],end=i[7],a1000g=i[8],key=key)) +ehdn_exp + +#match reacords from DBSCAN and EHDN to retrieve all info +#DBSCAN repeats are merged based on locations, so may not match original EHDN output + +def split_outlier_size(outlier,size,samples): + outlier_l = re.split(";",outlier) + size_l = re.split(";",size) + if len(size_l) == len(outlier_l): + len_samples = len(samples) + outlier_samples = ["0"]*(len_samples+1) + outlier_sizes = ["0"]*len_samples + for i,s in enumerate(samples): + if s in outlier_l: + ind = outlier_l.index(s) + outlier_samples[i] = "1" + outlier_sizes[i] = size_l[ind] + else: + if s in g1k: + outlier_samples[len_samples] += 1 + return outlier_samples, outlier_sizes + +found, allkey, out, seen = [], [], [], [] +ref, alt = "0", "-" +for i in merged_exp: + allkey.append(i) + for j in ehdn_exp: + if i.key == j.key: + if i.outliers == j.outliers: + found.append(i) + outliers, sizes = split_outlier_size(i.outliers, j.size, samples) + print(outliers, sizes, i.key) + # out.append([i.chr, i.start, i.end, ref, alt, i.motif, i.outliers, i.key, j.size, j.a1000g]) + # out.append([i.chr, i.start, i.end, ref, alt, i.motif, outliers, sizes, i.key, j.a1000g]) + o = [i.chr, i.start, i.end, ref, alt, i.motif ] + outliers + sizes + [i.key, j.a1000g] + out.append(o) + print(o) + # seen.append(i) + continue + + +notfound = list(set(allkey).difference(found)) +within = lambda a,b,c,d: ( c <= a and a <= d ) or ( c <= b and b <= d) +for i in notfound: + for j in ehdn_exp: + if i.chr == j.chr and i.motif == j.motif and i.outliers == j.outliers: + r1 = range(int(i.start),int(i.end)) + r2 = range(int(j.start),int(j.end)) +# if i.start == j.start or i.end == j.end or len(set(r1).intersection(r2)) > 5: + if within(i.start,i.end,j.start,j.end) or within(j.start,j.end,i.start,i.end): + # out.append([i.chr, i.start, i.end, ref, alt, i.motif, i.outliers, i.key, j.size, j.a1000g]) + outliers, sizes = split_outlier_size(i.outliers, j.size, samples) + o = [i.chr, i.start, i.end, ref, alt, i.motif ] + outliers + sizes + [i.key, j.a1000g] + # out.append([i.chr, i.start, i.end, ref, alt, i.motif, outliers, sizes, i.key, j.a1000g]) + out.append(o) + print(o) +# seen.append(i) + continue + +columns = ["chr", "start", "end", "ref", "alt", "motif", ] +columns += ["outlier." + s for s in samples] +columns += ["1000G_outlier_count"] +columns += ["size." 
+ s for s in samples]
+columns += ["key", "1000G_freq_perc"]
+with open(outfile, "w") as f:
+    for i in columns:
+        f.writelines("%s\t"%i)
+    f.writelines("\n")
+    for i in out:
+        for j in i:
+            f.writelines("%s\t"%j)
+        f.writelines("\n")
diff --git a/scripts/str/format_from_annovar.py b/scripts/str/format_from_annovar.py
new file mode 100755
index 0000000..8d915cf
--- /dev/null
+++ b/scripts/str/format_from_annovar.py
@@ -0,0 +1,46 @@
+import pandas as pd, sys
+import xlsxwriter
+
+'''
+format the outputs from ANNOVAR table_annovar.pl with OMIM and gnomAD
+merged.rare.EHdn.annovar.omim.hg19_multianno.txt
+merged.rare.EHdn.annovar.gnomad.hg19_multianno.txt
+'''
+
+gnomad_file = sys.argv[1]
+omim_file = sys.argv[2]
+outfile = sys.argv[3]
+
+
+#read OMIM annotated file as table and fix header
+omim = pd.read_csv(omim_file, sep="\t", header=[0,1], index_col=None)
+omim.columns = [j if i.startswith("Otherinfo") else i for i, j in omim.columns ]
+omim.rename(columns=lambda x: x.replace(".refGene",""), inplace=True)
+omim_col = ['omim_inheritance', 'omim_phenotype' ]
+
+#read gnomAD annotated file as table and fix header
+gnomad = pd.read_csv(gnomad_file, sep="\t", header=[0,1], index_col=None)
+gnomad.columns = [j if i.startswith("Otherinfo") else i for i, j in gnomad.columns ]
+gnomad.rename(columns=lambda x: x.replace(".refGene",""), inplace=True)
+
+#sample-wise outlier and size column names for reporting
+sample_outliers, size_outliers = [], []
+for i in list(gnomad.columns):
+    if i.startswith("outlier.") or i.startswith("1000G_outlier_count"):
+        sample_outliers.append(i)
+    if i.startswith("size."):
+        size_outliers.append(i)
+print(sample_outliers, size_outliers)
+print(gnomad.columns)
+
+gnomad_col = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Func', 'Gene', 'motif' ] + sample_outliers + size_outliers + ['key', '1000G_freq_perc', 'GeneDetail','ExonicFunc', 'AAChange', 'transcript', 'oe_lof', 'oe_lof_upper', 'oe_mis', 'oe_mis_upper', 'pLI', 'pRec']
+print(gnomad_col)
+annot_rare = pd.merge(gnomad[gnomad_col],omim[omim_col],right_index=True, left_index=True)
+print(annot_rare.head(5))
+
+output_columns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Function', 'Gene', 'motif', ]
+output_columns += sample_outliers + size_outliers
+output_columns += ['chr#start#end', 'a1000g_freq_perc', 'GeneDetail','ExonicFunc', 'AAChange', 'transcript', 'oe_lof', 'oe_lof_upper', 'oe_mis', 'oe_mis_upper', 'pLI', 'pRec', 'omim_inheritance', 'omim_phenotype' ]
+annot_rare.columns = output_columns #apply the report-friendly headers defined above; otherwise output_columns is never used
+if not ".xlsx" in outfile:
+    outfile = outfile + ".xlsx"
+annot_rare.to_excel(outfile, index=False)
diff --git a/scripts/generate_EH_genotype_table.generic.py b/scripts/str/generate_EH_genotype_table.generic.py
similarity index 100%
rename from scripts/generate_EH_genotype_table.generic.py
rename to scripts/str/generate_EH_genotype_table.generic.py
diff --git a/scripts/str/mergeExpansions.R b/scripts/str/mergeExpansions.R
new file mode 100755
index 0000000..ff35349
--- /dev/null
+++ b/scripts/str/mergeExpansions.R
@@ -0,0 +1,189 @@
+rm(list=ls())
+
+#### parameters
+#### to run
+#### Rscript mergeExpansions.R --rscript ~/crg2/scripts/str/ExpansionAnalysis.R --ehdn /hpf/largeprojects/tcagstor/users/btrost/papers/STRs/Qatar_ASD/output_regions.min2.1000G+SSC+MSSNG+QASD.txt
+#### --outlier dbscan.tsv --outpath output/ --trf /hpf/largeprojects/ccm_dccforge/dccdipg/Common/annotation/ExpansionHunterDenovo/UCSC_simple_repeats_hg19_coord_motif.tsv
+###trf <- 
"/hpf/largeprojects/ccmbio/aarthi/old_projects/proj_CHEO/CRG/str/ExpansionAnalysisPackage/UCSC_simple_repeats_hg19_coord_motif.tsv" +###source(path.expand("~/crg2/str/ExpansionAnalysisFunctions.R")) + +#### read arguments from command line +args = commandArgs(trailingOnly=TRUE) +if(length(args) < 2){ + stop(call. = T, "Require more argument(s)") +} + +paramNames <- grep("--", args) +paramValues <- paramNames + 1 + +if(length(grep("--", args[paramValues])) > 0){ + stop(call. = T, "Failed in reading arguments, '--' found in argument value") +} + +if(length(args) != length(c(paramNames, paramValues))){ + message(sprintf("Ignore argument(s):%s", paste(args[-union(paramNames, paramValues)], collapse=","))) +} + +params <- list() +paramNames <- gsub("--", "", args[paramNames]) +for(i in 1:length(paramNames)){ + params[paramNames[i]] <- args[paramValues[i]] +} + +source(path.expand(params$rscript)) +outpath <- params$outpath +if(length(grep("\\/$", outpath)) == 0){ + outpath <- paste0(outpath, "/") +} + +if(!dir.exists(outpath)){ + dir.create(outpath, recursive = T) +} + +############### + +library(data.table) +library(GenomicRanges) +library(ggplot2) +library(cowplot) +library(Biostrings) + +queryString <- function(str1, str2){ + ori.str2 <- str2 + str2 <- paste0(str2, str2) + + length.str1 <- nchar(str1) + length.str2 <- nchar(str2) + revcom.str1 <- as.character(reverseComplement(DNAString(str1))) + + length.diff <- ifelse(length.str2 > length.str1, length.str2 - length.str1, 1) + + identity <- c() + for(i in 1:length.diff){ + tmp <- substr(str2, i, length.str1+i-1) + match <- sum(strsplit(tmp, "")[[1]] == strsplit(str1, "")[[1]]) + match.recom <- sum(strsplit(tmp, "")[[1]] == strsplit(revcom.str1, "")[[1]]) + + match <- min(max(match, match.recom), nchar(ori.str2)) + tmp.identity <- match/length.str1 + + identity <- c(identity, tmp.identity) + } + return(max(identity)) +} + +#### prepare map TRF +message(sprintf("reading %s##", params$ehdn)) +ehdn <- fread(params$ehdn) +trf <- fread(params$trf) +ehdn <- as.data.frame(ehdn)[, c(1:6)] +ehdn$repeatID <- paste(ehdn$V1, ehdn$V2, ehdn$V3, ehdn$V4, sep="#") +names(ehdn) <- c("chr", "start", "end", "motif", "var1", "var2", "repeatID") +if(length(grep("chr", ehdn$chr[1:5])) == 0) + ehdn$chr <- paste0("chr", ehdn$chr) +trf <- as.data.frame(trf) +trf$repeatID <- paste(trf$V1, trf$V2, trf$V3, trf$V4, sep="#") + +ehdn.g <- GRanges(ehdn$chr, IRanges(ehdn$start, ehdn$end), "*") +trf.g <- GRanges(trf$V1, IRanges(trf$V2, trf$V3), "*") + +olap <- data.frame(findOverlaps(trf.g, ehdn.g)) +olap$trf.motif <- trf$V4[olap$queryHits] +olap$ehdn.motif <- ehdn$motif[olap$subjectHits] +olap$trf.id <- trf$repeatID[olap$queryHits] +olap$ehdn.id <- ehdn$repeatID[olap$subjectHits] + +for(i in 1:nrow(olap)){ + trf.identity <- queryString(olap$trf.motif[i], olap$ehdn.motif[i]) + ehdn.identity <- queryString(olap$ehdn.motif[i], olap$trf.motif[i]) + + olap$reciprocalIdentity[i] <- min(trf.identity, ehdn.identity) +} + +olap$length.diff <- nchar(olap$trf.motif) - nchar(olap$ehdn.motif) + +olap.f <- olap[order(olap$reciprocalIdentity, decreasing = T), ] +olap.f <- olap.f[!duplicated(olap.f$subjectHits), ] +olap.f <- olap.f[olap.f$reciprocalIdentity > 0.66, ] +olap.f <- aggregate(trf.id ~ ehdn.id + reciprocalIdentity + length.diff, olap.f, paste, collapse = ";") +olap.f <- unique(olap.f) +olap.f <- olap.f[order(olap.f$reciprocalIdentity, decreasing = T), ] + +olap.f <- olap.f[!duplicated(olap.f$ehdn.id), ] + +write.table(olap.f[, c(4, 1, 2, 3)], sprintf("%smap.TRF.EHdn.0.66.tsv", 
outpath), sep="\t", row.names=F, quote=F, col.names=T)
+
+#########################################
+### merge
+outfile <- sprintf("%smerged.ehdn.tsv", outpath)
+
+dt <- fread(params$ehdn)
+dt <- data.frame(dt)
+ehdn <- dt
+dt$varid <- paste(dt$V1, dt$V2, dt$V3, sep="_")
+
+clean.sample <- readLines(sprintf("%sclean.samples.txt", outpath))
+
+ehdn$repeatID <- paste(ehdn$V1, ehdn$V2, ehdn$V3, ehdn$V4, sep="#")
+ehdn$samples <- 0
+for(i in 1:nrow(ehdn)){
+  tmp <- strsplit(gsub(",", ":", ehdn$V7[i]), ":")[[1]]
+  tmp <- tmp[seq(1, length(tmp), 2)]
+  tmp <- tmp[tmp %in% clean.sample]
+
+  ehdn$samples[i] <- length(tmp)
+}
+
+ehdn <- ehdn[ehdn$samples > 0, ]
+map <- read.delim(sprintf("%smap.TRF.EHdn.0.66.tsv", outpath), stringsAsFactors = F)
+
+ehdn.all <- merge(ehdn[, -c(5:7, 9)], map, by.x = "repeatID", by.y = "ehdn.id", all.x = T)
+names(ehdn.all) <- c("repeatID", "chr", "start", "end", "motif", "trf.id", "reciprocalIdentity", "length.diff")
+ehdn.all <- ehdn.all[ehdn.all$chr %in% paste0("chr", c(1:22, "X", "Y")), ]
+
+ehdn.m <- mergeLoci(ehdn.all)
+ehdn.m$uniqueMotif <- sapply(sapply(ehdn.m$motif, strsplit, ";"), length)
+
+ehdn.m$allRepeat <- ehdn.m$repeatID
+ehdn.m$repeatID <- paste(ehdn.m$chr, ehdn.m$start, ehdn.m$end, sep="#")
+
+write.table(ehdn.m, outfile, sep="\t", row.names=F, quote=F, col.names=T)
+
+##################
+### extract rare merged expansion
+expansion <- read.delim(params$outlier, stringsAsFactors = F)
+expansion <- expansion[expansion$a1000g_freq_perc < 0.1, ]
+
+ehdn.m.g <- GRanges(ehdn.m$chr, IRanges(ehdn.m$start, ehdn.m$end), "*")
+expansion.g <- GRanges(expansion$chr, IRanges(expansion$start, expansion$end), "*")
+
+olap <- data.frame(findOverlaps(expansion.g, ehdn.m.g))
+olap <- aggregate(queryHits ~ subjectHits, olap, paste, collapse = ";")
+
+expansion.m <- data.frame()
+for(i in 1:nrow(olap)){
+  idx <- as.numeric(strsplit(olap$queryHits[i], ";")[[1]])
+  tmp.exp <- expansion[idx, ]
+  motif <- paste(tmp.exp$motif, collapse=";") ## collapse (not sep) so multiple overlapping records become one ;-separated string
+  outliers <- paste(tmp.exp$outliers, collapse=";")
+  repeats <- paste(tmp.exp$repeatID, collapse=";")
+  chr <- tmp.exp$chr[1]
+  start <- min(tmp.exp$start)
+  end <- max(tmp.exp$end)
+
+  expansion.m <- rbind(expansion.m, data.frame(chr, start, end, motif, outliers, stringsAsFactors = F))
+}
+
+write.table(expansion.m, sprintf("%smerged.rare.expansions.tsv", outpath), sep="\t", row.names=F, col.names=T, quote=F)
+
+expansion.m$ref <- "0"
+expansion.m$alt <- "-"
+expansion.m$varid <- paste(expansion.m$chr, expansion.m$start, expansion.m$end, sep="#")
+names(expansion.m)[1] <- "#chr"
+exp.annovar <- expansion.m[, c("#chr", "start", "end", "ref", "alt", "varid")]
+
+write.table(exp.annovar, sprintf("%smerged.rare.expansions.forannotation.tsv",outpath), sep="\t", row.names=F, quote=F, col.names=T)
+#####
+#####
+###print("Printing sessioninfo from mergeExpansions.R")
+###sessionInfo()
diff --git a/scripts/str/regiontools.py b/scripts/str/regiontools.py
new file mode 100755
index 0000000..e935edf
--- /dev/null
+++ b/scripts/str/regiontools.py
@@ -0,0 +1,180 @@
+#
+# Expansion Hunter Denovo
+# Copyright (c) 2017 Illumina, Inc.
+#
+# Author: Egor Dolzhenko ,
+#         Sai Chen
+# Concept: Michael Eberle
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +import sys +import copy + + +class FeatureCounts(object): + def __init__(self, count_dict): + self._count_dict = count_dict + + def __len__(self): + return len(self._count_dict) + + def __getitem__(self, sample_id): + return self._count_dict[sample_id] + + def __repr__(self): + encoding = ('{}={}'.format(s, self._count_dict[s]) + for s in sorted(self._count_dict)) + encoding = ','.join(encoding) + return encoding + + def __iter__(self): + for sample, count in self._count_dict.items(): + yield (sample, count) + + def __eq__(self, other): + # pylint: disable=I0011,W0212 + return self._count_dict == other._count_dict + + def combine(self, other): + # pylint: disable=I0011,W0212 + for sample, count in other._count_dict.items(): + if sample in self._count_dict: + self._count_dict[sample] += count + else: + self._count_dict[sample] = count + + +class Region(object): + def __init__(self, chrom=None, start=0, end=0, feature_counts=None): + self.chrom = chrom + self.start = start + self.end = end + self.feature_counts = feature_counts + + def is_unset(self): + return self.chrom is None + + def __repr__(self): + region_repr = '{}:{}-{}'.format(self.chrom, self.start, self.end) + if self.feature_counts: + region_repr += '({})'.format(self.feature_counts) + return region_repr + + def __eq__(self, other): + return (self.chrom == other.chrom and self.start == other.start and + self.end == other.end and + self.feature_counts == other.feature_counts) + + def __lt__(self, other): + self_tuple = (self.chrom, self.start, self.end) + other_tuple = (other.chrom, other.start, other.end) + return self_tuple < other_tuple + + +def compute_distance(region_a, region_b): + distance = sys.maxsize + if region_a.is_unset() or region_b.is_unset(): + raise Exception('Cannot compute distance between unset regions') + + if region_a.chrom == region_b.chrom: + if region_a.end < region_b.start: # Is region_a left of region_b? + distance = region_b.start - region_a.end + elif region_b.end < region_a.start: # Is region_b left or region_a? + return region_a.start - region_b.end + else: # If neither is true, the regions must overlap. 
+ distance = 0 + + return distance + + +class RegionCollection(object): + def __init__(self, regions=None): + self._regions = regions + if self._regions is None: + self._regions = [] + + def __len__(self): + return len(self._regions) + + def __getitem__(self, index): + return self._regions[index] + + def __repr__(self): + encoding = '\n'.join(repr(r) for r in self._regions) + return encoding + + def __eq__(self, other): + # pylint: disable=I0011,W0212 + for region_a, region_b in zip(self._regions, other._regions): + if region_a != region_b: + return False + return True + + def __iter__(self): + for region in self._regions: + yield region + + def as_dict(self): + region_dict = {} + for region in self._regions: + region_encoding = '{}:{}-{}'.format(region.chrom, + region.start, region.end) + region_dict[region_encoding] = dict(region.feature_counts) + return region_dict + + def merge(self, max_dist=500): + self._regions.sort() + + merged_regions = [] + aggregate_region = Region() + + for region in self._regions: + if aggregate_region.is_unset(): + aggregate_region = region + continue + + if compute_distance(aggregate_region, region) <= max_dist: + aggregate_region.end = max(aggregate_region.end, region.end) + aggregate_region.feature_counts.combine(region.feature_counts) + else: + merged_regions.append(aggregate_region) + aggregate_region = region + + if not aggregate_region.is_unset(): + merged_regions.append(aggregate_region) + + self._regions = merged_regions + + def extend(self, other): + # pylint: disable=I0011,W0212 + self._regions += other._regions + + +def create_region_from_denovo_record(sample_id, region_count): + region_encoding, count = region_count + region_encoding = region_encoding.replace(':', '-').split('-') + chrom, start, end = region_encoding + start, end = int(start), int(end) + feture_counts = FeatureCounts({sample_id: count}) + return Region(chrom, start, end, feture_counts) + + +def create_region_collection_from_denovo_record(sample_id, denovo_record): + regions = [] + for region_count in denovo_record.items(): + region = create_region_from_denovo_record(sample_id, region_count) + regions.append(region) + return RegionCollection(regions) diff --git a/scripts/str_helper.sh b/scripts/str/str_helper.sh similarity index 100% rename from scripts/str_helper.sh rename to scripts/str/str_helper.sh diff --git a/scripts/str/wilcoxontest.py b/scripts/str/wilcoxontest.py new file mode 100755 index 0000000..9035d9b --- /dev/null +++ b/scripts/str/wilcoxontest.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +# +# Expansion Hunter Denovo +# Copyright (c) 2017 Illumina, Inc. +# +# Author: Egor Dolzhenko , +# Sai Chen +# Concept: Michael Eberle +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + +import numpy as np +import scipy.special +import scipy.stats + + +def calculate_approximate_pvalue(cases, controls): + all_counts = cases + controls + ranks = scipy.stats.rankdata(all_counts) + case_rank_sum = np.sum(ranks[:len(cases)]) + + mu_cases = len(cases) * (len(cases) + len(controls) + 1) / 2 + sigma_cases = np.sqrt(len(cases) * len(controls) * + (len(cases) + len(controls) + 1) / 12) + z_cases = (case_rank_sum - mu_cases) / sigma_cases + + return 1 - scipy.stats.norm.cdf(z_cases) + + +def calculate_permutation_pvalue(cases, controls, num_permutations): + all_counts = cases + controls + ranks = scipy.stats.rankdata(all_counts) + num_cases = len(cases) + true_case_rank_sum = np.sum(ranks[:num_cases]) + + permuted_case_ranks = np.random.choice( + ranks, size=(num_permutations, num_cases)) + permuted_case_rank_sums = np.sum(permuted_case_ranks, axis=1) + + num_case_rank_sums_as_extreme_as_true = np.sum( + permuted_case_rank_sums >= true_case_rank_sum) + + return (num_case_rank_sums_as_extreme_as_true + 1) / (num_permutations + 1) + + +def wilcoxon_rank_sum_test(test_params, cases, controls): + test_method = test_params['method'] + if test_method == 'normal': + return calculate_approximate_pvalue(cases, controls) + elif test_method == 'resample': + return calculate_permutation_pvalue(cases, controls, test_params['num_resamples']) + else: + assert False, '{} is an unknown method type'.format(test_method)
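For reference, a minimal sketch of how the helpers above can be exercised on their own. The counts and the resample size are illustrative, the import assumes `scripts/str` is on `PYTHONPATH`, and the `test_params` dictionary mirrors the `--testMethod normal` option passed in crg.ehdn.sh (the resampling branch additionally expects a `num_resamples` key):

    import wilcoxontest

    # Anchored in-repeat read counts at one locus: case samples first, then controls (made-up values).
    cases = [12, 15, 30]
    controls = [2, 3, 1, 4, 2, 3]

    # Normal approximation of the rank-sum p-value (one-sided: cases larger than controls).
    p_normal = wilcoxontest.wilcoxon_rank_sum_test({"method": "normal"}, cases, controls)

    # Permutation-based p-value; "num_resamples" sets how many permuted case rank sums are drawn.
    p_resample = wilcoxontest.wilcoxon_rank_sum_test(
        {"method": "resample", "num_resamples": 10000}, cases, controls)

    print(p_normal, p_resample)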