From 9e14ec5ec2e2ad8fb9bc2668fdfa050c151f446e Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 6 Mar 2020 12:54:35 -0800 Subject: [PATCH 01/89] added new scripts for extending standalone features --- .../python/extend_stanalone_features.py | 248 +++++++ neusomatic/python/genomic_file_handlers.py | 623 ++++++++++++++++++ neusomatic/python/read_info_extractor.py | 290 ++++++++ neusomatic/python/sequencing_features.py | 269 ++++++++ 4 files changed, 1430 insertions(+) create mode 100755 neusomatic/python/extend_stanalone_features.py create mode 100644 neusomatic/python/genomic_file_handlers.py create mode 100644 neusomatic/python/read_info_extractor.py create mode 100644 neusomatic/python/sequencing_features.py diff --git a/neusomatic/python/extend_stanalone_features.py b/neusomatic/python/extend_stanalone_features.py new file mode 100755 index 0000000..c75748e --- /dev/null +++ b/neusomatic/python/extend_stanalone_features.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python +#------------------------------------------------------------------------- +# extend_standalone_features.py +# add extra features for standalone mode +#------------------------------------------------------------------------- +import argparse +import traceback +import logging +import multiprocessing +import os +import gzip + +import pysam +import numpy as np + +import sequencing_features +import genomic_file_handlers as genome + + +def extract_features(candidate_record): + work, reference, tumor_bam, normal_bam, chrom, pos, ref, alt, min_mapq, min_bq, dbsnp, cosmic = candidate_record + thread_logger = logging.getLogger( + "{} ({})".format(extend_standalone_features.__name__, multiprocessing.current_process().name)) + try: + thread_logger.info( + "---------------------Filter Candidates---------------------") + tbam = pysam.AlignmentFile(tumor_bam) + nbam = pysam.AlignmentFile(normal_bam) + ref_fa = pysam.FastaFile(reference) + + my_coordinate = [chrom, int(pos)] + nBamFeatures = sequencing_features.from_bam( + nbam, my_coordinate, ref, alt, min_mapq, min_bq) + tBamFeatures = sequencing_features.from_bam( + tbam, my_coordinate, ref, alt, min_mapq, min_bq) + + n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev'] + n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev'] + t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev'] + t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev'] + sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt) + + homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( + ref_fa, my_coordinate, ref, alt) + + indel_length = len(alt) - len(ref) + + CHROM = my_coordinate[0] + POS = my_coordinate[1] + REF = ref_base + ALT = first_alt + if_dbsnp = if_dbsnp + COMMON = if_common + if_COSMIC = if_cosmic + COSMIC_CNT = num_cases + Consistent_Mates = tBamFeatures['consistent_mates'] + Inconsistent_Mates = tBamFeatures['inconsistent_mates'] + N_DP = nBamFeatures['dp'] + nBAM_REF_MQ = '%g' % nBamFeatures['ref_mq'] + nBAM_ALT_MQ = '%g' % nBamFeatures['alt_mq'] + nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures['z_ranksums_mq'] + nBAM_REF_BQ = '%g' % nBamFeatures['ref_bq'] + nBAM_ALT_BQ = '%g' % nBamFeatures['alt_bq'] + nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures['z_ranksums_bq'] + nBAM_REF_NM = '%g' % nBamFeatures['ref_NM'] + nBAM_ALT_NM = '%g' % nBamFeatures['alt_NM'] + nBAM_NM_Diff = '%g' % nBamFeatures['NM_Diff'] + nBAM_REF_Concordant = nBamFeatures['ref_concordant_reads'] + nBAM_REF_Discordant = nBamFeatures['ref_discordant_reads'] + nBAM_ALT_Concordant = 
nBamFeatures['alt_concordant_reads'] + nBAM_ALT_Discordant = nBamFeatures['alt_discordant_reads'] + nBAM_Concordance_FET = rescale( + nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) + N_REF_FOR = nBamFeatures['ref_for'] + N_REF_REV = nBamFeatures['ref_rev'] + N_ALT_FOR = nBamFeatures['alt_for'] + N_ALT_REV = nBamFeatures['alt_rev'] + nBAM_StrandBias_FET = rescale( + nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) + nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures['z_ranksums_endpos'] + nBAM_REF_Clipped_Reads = nBamFeatures['ref_SC_reads'] + nBAM_ALT_Clipped_Reads = nBamFeatures['alt_SC_reads'] + nBAM_Clipping_FET = rescale( + nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) + nBAM_MQ0 = nBamFeatures['MQ0'] + nBAM_Other_Reads = nBamFeatures['noise_read_count'] + nBAM_Poor_Reads = nBamFeatures['poor_read_count'] + nBAM_REF_InDel_3bp = nBamFeatures['ref_indel_3bp'] + nBAM_REF_InDel_2bp = nBamFeatures['ref_indel_2bp'] + nBAM_REF_InDel_1bp = nBamFeatures['ref_indel_1bp'] + nBAM_ALT_InDel_3bp = nBamFeatures['alt_indel_3bp'] + nBAM_ALT_InDel_2bp = nBamFeatures['alt_indel_2bp'] + nBAM_ALT_InDel_1bp = nBamFeatures['alt_indel_1bp'] + SOR = sor + MaxHomopolymer_Length = homopolymer_length + SiteHomopolymer_Length = site_homopolymer_length + T_DP = tBamFeatures['dp'] + tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'] + tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'] + tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures['z_ranksums_mq'] + tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'] + tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'] + tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures['z_ranksums_bq'] + tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'] + tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'] + tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'] + tBAM_REF_Concordant = tBamFeatures['ref_concordant_reads'] + tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'] + tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'] + tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'] + tBAM_Concordance_FET = rescale( + tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) + T_REF_FOR = tBamFeatures['ref_for'] + T_REF_REV = tBamFeatures['ref_rev'] + T_ALT_FOR = tBamFeatures['alt_for'] + T_ALT_REV = tBamFeatures['alt_rev'] + tBAM_StrandBias_FET = rescale( + tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) + tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures['z_ranksums_endpos'] + tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'] + tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'] + tBAM_Clipping_FET = rescale( + tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) + tBAM_MQ0 = tBamFeatures['MQ0'] + tBAM_Other_Reads = tBamFeatures['noise_read_count'] + tBAM_Poor_Reads = tBamFeatures['poor_read_count'] + tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'] + tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'] + tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'] + tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'] + tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'] + tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'] + InDel_Length = indel_length + + # thread_logger.info(tBamFeatures) + # aaa + + return 0 + + except Exception as ex: + thread_logger.error(traceback.format_exc()) + thread_logger.error(ex) + return None + + +def extend_standalone_features(candidates_vcf, + reference, tumor_bam, normal_bam, + min_mapq, min_bq, + dbsnp, cosmic, + num_threads, + work): + + logger = logging.getLogger(extend_standalone_features.__name__) + + 
logger.info("----------------------Preprocessing------------------------") + if not os.path.exists(work): + os.mkdir(work) + + if not os.path.exists(tumor_bam): + logger.error("Aborting!") + raise Exception("No tumor BAM file {}".format(tumor_bam)) + if not os.path.exists(normal_bam): + logger.error("Aborting!") + raise Exception("No normal BAM file {}".format(normal_bam)) + if not os.path.exists(tumor_bam + ".bai"): + logger.error("Aborting!") + raise Exception( + "No tumor .bai index file {}".format(tumor_bam + ".bai")) + if not os.path.exists(normal_bam + ".bai"): + logger.error("Aborting!") + raise Exception( + "No normal .bai index file {}".format(normal_bam + ".bai")) + + if dbsnp: + with gzip.open(dbsnp,'rt') as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + continue + print(line) + aaa + pool = multiprocessing.Pool(num_threads) + map_args = [] + with open(candidates_vcf) as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + continue + chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] + map_args.append((work, reference, tumor_bam, normal_bam, + chrom, pos, ref, alt, min_mapq, min_bq, dbsnp, cosmic)) + try: + ext_features = pool.map_async(extract_features, map_args).get() + pool.close() + except Exception as inst: + logger.error(inst) + pool.close() + traceback.print_exc() + raise Exception + + +if __name__ == '__main__': + FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' + logging.basicConfig(level=logging.INFO, format=FORMAT) + logger = logging.getLogger(__name__) + + parser = argparse.ArgumentParser( + description='extract extra features for standalone mode') + parser.add_argument('--candidates_vcf', type=str, help='candidates vcf', + required=True) + parser.add_argument('--reference', type=str, help='reference fasta filename', + required=True) + parser.add_argument('--tumor_bam', type=str, + help='tumor bam', required=True) + parser.add_argument('--normal_bam', type=str, + help='normal bam', required=True) + parser.add_argument('--min_mapq', type=int, + help='minimum mapping quality', default=1) + parser.add_argument('--min_bq', type=float, + help='minimum base quality', default=5) + parser.add_argument('--dbsnp', type=str, + help='dbSNP vcf (to annotate candidate variants)', default=None) + parser.add_argument('--cosmic', type=str, + help='COSMIC vcf (to annotate candidate variants)', default=None) + parser.add_argument('--num_threads', type=int, + help='number of threads', default=1) + parser.add_argument('--work', type=str, + help='work directory', required=True) + args = parser.parse_args() + logger.info(args) + + try: + output = extend_standalone_features(args.candidates_vcf, + args.reference, args.tumor_bam, args.normal_bam, + args.min_mapq, args.min_bq, + args.dbsnp, args.cosmic, + args.num_threads, + args.work) + if output is None: + raise Exception("extend_standalone_features failed!") + except Exception as e: + logger.error(traceback.format_exc()) + logger.error("Aborting!") + logger.error( + "extend_standalone_features.py failure on arguments: {}".format(args)) + raise e diff --git a/neusomatic/python/genomic_file_handlers.py b/neusomatic/python/genomic_file_handlers.py new file mode 100644 index 0000000..cd19a26 --- /dev/null +++ b/neusomatic/python/genomic_file_handlers.py @@ -0,0 +1,623 @@ +#!/usr/bin/env python3 + +from pysam import AlignmentFile +import sys, os, gzip, re, math + +# The regular expression pattern for "chrXX 1234567" in both VarScan2 Output and VCF files: 
+pattern_major_chr_position = re.compile(r'^(?:chr)?(?:[1-9]|1[0-9]|2[0-2]|[XY]|MT?)\t[0-9]+\b') + +# More lenient pattern: +pattern_chr_position = re.compile(r'[^\t]+\t[0-9]+\b') +pattern_chrom = re.compile(r'(?:chr)?([1-9]|1[0-9]|2[0-2]|[XY]|MT?)\W') + + +# Valid Phred+33 quality strings: +valid_q = set() +[valid_q.add( chr(33+i) ) for i in range(42)]; + +nan = float('nan') +inf = float('inf') + +AA_3to1 = {"Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C", "Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I", "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P", "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V"} +AA_1to3 = {"A": "Ala", "R": "Arg", "N": "Asn", "D": "Asp", "C": "Cys", "E": "Glu", "Q": "Gln", "G": "Gly", "H": "His", "I": "Ile", "L": "Leu", "K": "Lys", "M": "Met", "F": "Phe", "P": "Pro", "S": "Ser", "T": "Thr", "W": "Trp", "Y": "Tyr", "V": "Val"} + + +### ### ### ### ### MAJOR CLASSES ### ### ### ### ### +class Vcf_line: + '''Each instance of this object is a line from the vcf file (no header).''' + + def __init__(self, vcf_line): + + '''Argument is a line in pileup file.''' + self.vcf_line = vcf_line.rstrip('\n') + + try: + self.chromosome, self.position, self.identifier, self.refbase, self.altbase, self.qual, self.filters, self.info, *self.has_samples = vcf_line.rstrip('\n').split('\t') + self.position = int(self.position) + + try: + self.field, *self.samples = self.has_samples + except ValueError: + self.field = self.samples = '' + + except ValueError: + self.chromosome = self.identifier = self.refbase = self.altbase = self.qual = self.filters = self.info = self.field = self.samples = '' + self.position = None + + + def get_info_items(self): + return self.info.split(';') + + + def get_info_value(self, variable): + + key_item = re.search(r'\b{}=([^;\s]+)([;\W]|$)'.format(variable), self.vcf_line) + + # The key has a value attached to it, e.g., VAR=1,2,3 + if key_item: + return key_item.groups()[0] + + # Perhaps it's simply a flag without "=" + else: + key_item = self.info.split(';') + return True if variable in key_item else False + + + def get_sample_variable(self): + return self.field.split(':') + + + def get_sample_item(self, idx=0, out_type='d'): + '''d to output a dictionary. l to output a tuple of lists''' + + if out_type.lower() == 'd': + return dict( zip(self.get_sample_variable(), self.samples[idx].split(':') ) ) + elif out_type.lower() == 'l': + return ( self.get_sample_variable(), self.samples[idx].split(':') ) + + + def get_sample_value(self, variable, idx=0): + + var2value = dict( zip( self.field.split(':'), self.samples[idx].split(':') )) + + try: + return var2value[variable] + except KeyError: + return None + + + + +class pysam_header: + ''' + Extract BAM header using pysam. + Only sample name (SM) so far. 
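+    Hypothetical usage sketch: pysam_header('tumor.bam').SM() returns a tuple
+    of the distinct SM tags found in the BAM's @RG header lines, e.g. ('TUMOR',).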
+ ''' + + def __init__(self, bam_file): + + bam = AlignmentFile(bam_file) + self.bam_header = bam.header + + + def SM(self): + '''Sample Name''' + + sample_name = set() + + for header_i in self.bam_header['RG']: + sample_name.add( header_i['SM'] ) + sample_name = tuple(sample_name) + + return sample_name + + +### ### ### ### ### MAJOR CLASSES OVER ### ### ### ### ### + + + + + + + + +### ### ### ### ### FUNCTIONS OF CONVENIENCE ### ### ### ### ### + +def skip_vcf_header(opened_file): + + line_i = opened_file.readline().rstrip() + while line_i.startswith('#'): + line_i = opened_file.readline().rstrip() + + return line_i + + +def faiordict2contigorder(file_name, file_format): + '''Takes either a .fai or .dict file, and return a contig order dictionary, i.e., chrom_seq['chr1'] == 0''' + + assert file_format in ('fai', 'dict') + + contig_sequence = [] + with open(file_name) as gfile: + + for line_i in gfile: + + if file_format == 'fai': + contig_match = re.match(r'([^\t]+)\t', line_i) + + elif file_format == 'dict': + if line_i.startswith('@SQ'): + contig_match = re.match(r'@SQ\tSN:([^\t]+)\tLN:', line_i) + + if contig_match: + contig_i = contig_match.groups()[0].split(' ')[0] # some .fai files have space after the contig for descriptions. + contig_sequence.append( contig_i ) + + chrom_seq = {} + for n,contig_i in enumerate(contig_sequence): + chrom_seq[contig_i] = n + + return chrom_seq + + + +def open_textfile(file_name): + + # See if the input file is a .gz file: + if file_name.lower().endswith('.gz'): + return gzip.open(file_name, 'rt') + + else: + return open(file_name) + + + +def open_bam_file(file_name): + + try: + return AlignmentFile(file_name, 'rb') + except ValueError: + return open(file_name) + + + + +def ascii2phred33(x): + '''Put in an ASCII string, return a Phred+33 score.''' + return ord(x)-33 + + +def phred33toascii(x): + '''Put in a Phred33 score, return the character.''' + return chr(x+33) + + +def p2phred(p, max_phred=inf): + '''Convert p-value to Phred-scale quality score.''' + + if p == 0: + Q = max_phred + + elif p == 1: + Q = 0 + + elif p<0 or p>1: + Q = nan + + elif p > 0: + Q = -10 * math.log10(p) + if Q > max_phred: + Q = max_phred + + elif math.isnan(p): + Q = nan + + return Q + + + +def phred2p(phred): + '''Convert Phred-scale quality score to p-value.''' + return 10**(-phred/10) + + +def findall_index(mylist, tolookfor): + '''Find all instances in a list that matches exactly thestring.''' + all_indices = [i for i,item in enumerate(mylist) if item == tolookfor] + return all_indices + + +def findall_index_regex(mylist, pattern): + '''Find all instances in a list that matches a regex pattern.''' + all_indices = [i for i,item in enumerate(mylist) if re.search(pattern, item)] + return all_indices + + +def count_repeating_bases(sequence): + + '''For a string, count the number of characters that appears in a row. + E.g., for string "ABBCCCDDDDAAAAAAA", the function returns 1, 2, 3, 4, 7, because there is 1 A, 2 B's, 3 C's, 4 D's, and then 7 A's. 
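+    Doctest-style sketch of that example:
+    >>> count_repeating_bases('ABBCCCDDDDAAAAAAA')
+    [1, 2, 3, 4, 7]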
+ ''' + counters = [] + previous_base = None + + for current_base in sequence: + + if current_base == previous_base: + counters[-1] += 1 + else: + counters.append(1) + + previous_base = current_base + + counters + + return counters + + + +def numeric_id(chr_i, pos_i, contig_seq): + + chr_i = contig_seq[chr_i] + numeric_chr_i = float(chr_i) * 1000000000000 + numeric_pos_i = float(pos_i) + + numeric_i = numeric_chr_i + numeric_pos_i + + return numeric_i + + + + + +# Define which chromosome coordinate is ahead for the following function: +chrom_sequence = [str(i) for i in range(1,23)] +chrom_sequence.append('X') +chrom_sequence.append('Y') +chrom_sequence.append('M') + +chrom_seq = {} +for n,contig_i in enumerate(chrom_sequence): + chrom_seq[contig_i] = n + +def whoisbehind(coord_0, coord_1, chrom_sequence): + ''' + coord_0 and coord_1 are two strings or two lists, specifying the chromosome, a (typically) tab, and then the location. + Return the index where the coordinate is behind. Return 10 if they are the same position. + ''' + + end_of_0 = end_of_1 = False + + if coord_0 == '' or coord_0==['',''] or coord_0==('','') or not coord_0: + end_of_0 = True + + if coord_1 == '' or coord_1==['',''] or coord_1==('','') or not coord_1: + end_of_1 = True + + if end_of_0 and end_of_1: + return 10 + + elif end_of_1: + return 0 + + elif end_of_0: + return 1 + + else: + + if isinstance(coord_0, str): + chrom0, position0 = coord_0.split() + elif isinstance(coord_0, list) or isinstance(coord_0, tuple): + chrom0, position0 = coord_0[0], coord_0[1] + + if isinstance(coord_1, str): + chrom1, position1 = coord_1.split() + elif isinstance(coord_1, list) or isinstance(coord_1, tuple): + chrom1, position1 = coord_1[0], coord_1[1] + + if isinstance(chrom_sequence, dict): + chrom0_position = chrom_sequence[chrom0] + chrom1_position = chrom_sequence[chrom1] + elif isinstance(chrom_sequence, list) or isinstance(chrom_sequence, tuple): + chrom0_position = chrom_sequence.index(chrom0) + chrom1_position = chrom_sequence.index(chrom1) + + if chrom0_position < chrom1_position: + return 0 # 1st coordinate is ahead + + elif chrom0_position > chrom1_position: + return 1 # 1st coordinate is ahead + + # Must be in the same chromosome + else: + + position0 = int(position0) + position1 = int(position1) + + if position0 < position1: + return 0 + + elif position0 > position1: + return 1 + + # Same chromosome, same position, then same coordinate: + elif position0 == position1: + return 10 + + + + +def vcf_header_modifier(infile_handle, addons=[], getlost=' '): + + '''addons = A list of INFO, FORMAT, ID, or Filter lines you want to add. 
+ getlost = a regex expression for the ID of INFO/FORMAT/FILTER that you want to get rid of.''' + + line_i = infile_handle.readline().rstrip() + + # First, write into the INFO and FORMAT what I want to add: + vcfheader_info_format_filter = [] + vcfheader_misc = [] + + for additions in addons: + vcfheader_info_format_filter.append(additions) + + while line_i.startswith('##'): + + if re.match(r'##fileformat=', line_i): + vcffileformat = line_i + + elif re.match(r'##(INFO|FORMAT|FILTER)', line_i): + + if not re.match(r'##(INFO|FORMAT|FILTER)= len(ref_base): + + inserted_sequence = variant_call[ len(ref_base):: ] + + ref_for,ref_rev,alt_for,alt_rev = base_calls[0], base_calls[1], base_calls[6].count(inserted_sequence.upper()), base_calls[7].count(inserted_sequence.lower()) + + # Deletion: + elif len(variant_call) < len(ref_base): + + deleted_sequence = ref_base[ len(variant_call):: ] + + ref_for,ref_rev,alt_for,alt_rev = base_calls[0], base_calls[1], base_calls[4].count(deleted_sequence.upper()), base_calls[5].count(deleted_sequence.lower()) + + else: + ref_for = ref_rev = alt_for = alt_rev = 0 + + return ref_for, ref_rev, alt_for, alt_rev + + + + +def rescale(x, original='fraction', rescale_to=None, max_phred=1001): + + if ( rescale_to == None ) or ( original.lower() == rescale_to.lower() ): + y = x if isinstance(x, int) else '%.2f' % x + + elif original.lower() == 'fraction' and rescale_to == 'phred': + y = genome.p2phred(x, max_phred=max_phred) + y = '%.2f' % y + + elif original.lower() == 'phred' and rescale_to == 'fraction': + y = genome.phred2p(x) + y = '%.2f' % y + + return y + + + + + +##### Stuff from VarDict: +def find_MSI(vcf_object): + + msi = vcf_object.get_info_value('MSI') + if msi: + msi = float(msi) + else: + msi = nan + return msi + + +def find_MSILEN(vcf_object): + + msilen = vcf_object.get_info_value('MSILEN') + if msilen: + msilen = float(msilen) + else: + msilen = nan + return msilen + + +def find_SHIFT3(vcf_object): + + shift3 = vcf_object.get_info_value('SHIFT3') + if shift3: + shift3 = float(shift3) + else: + shift3 = nan + return shift3 + + + +# MuTect2's Stuff: +def mutect2_nlod(vcf_object): + nlod = vcf_object.get_info_value('NLOD') + if nlod: + return float(nlod) + else: + return nan + + +def mutect2_tlod(vcf_object): + tlod = vcf_object.get_info_value('TLOD') + if tlod: + return float(tlod) + else: + return nan + + +def mutect2_STR(vcf_object): + if vcf_object.get_info_value('STR'): + return 1 + else: + return 0 + + +def mutect2_ECNT(vcf_object): + ecnt = vcf_object.get_info_value('ECNT') + if ecnt: + try: + ecnt = int( ecnt ) + except ValueError: + ecnt = nan + else: + ecnt = nan + + return ecnt diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py new file mode 100644 index 0000000..1135e1e --- /dev/null +++ b/neusomatic/python/sequencing_features.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python3 + +import sys, os, re, pysam +import scipy.stats as stats +import genomic_file_handlers as genome +from read_info_extractor import * + +nan = float('nan') + + +def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + + ''' + bam is the opened file handle of bam file + my_coordiate is a list or tuple of 0-based (contig, position) + ''' + + indel_length = len(first_alt) - len(ref_base) + reads = bam.fetch( my_coordinate[0], my_coordinate[1]-1, my_coordinate[1] ) + + ref_read_mq = [] + alt_read_mq = [] + ref_read_bq = [] + alt_read_bq = [] + ref_edit_distance = [] + alt_edit_distance = [] + + ref_concordant_reads = 
alt_concordant_reads = ref_discordant_reads = alt_discordant_reads = 0 + ref_for = ref_rev = alt_for = alt_rev = dp = 0 + ref_SC_reads = alt_SC_reads = ref_notSC_reads = alt_notSC_reads = 0 + MQ0 = 0 + + ref_pos_from_end = [] + alt_pos_from_end = [] + ref_flanking_indel = [] + alt_flanking_indel = [] + + noise_read_count = poor_read_count = 0 + + qname_collector = {} + + for read_i in reads: + if not read_i.is_unmapped and dedup_test(read_i): + + dp += 1 + + code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read(read_i, my_coordinate[1]-1 ) + + if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq: + poor_read_count += 1 + + if read_i.mapping_quality == 0: + MQ0 += 1 + + # Reference calls: + if code_i == 1 and base_call_i == ref_base[0]: + + try: + qname_collector[read_i.qname].append(0) + except KeyError: + qname_collector[read_i.qname] = [0] + + ref_read_mq.append( read_i.mapping_quality ) + ref_read_bq.append( read_i.query_qualities[ith_base] ) + + try: + ref_edit_distance.append( read_i.get_tag('NM') ) + except KeyError: + pass + + # Concordance + if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + ref_concordant_reads += 1 + elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + ref_discordant_reads += 1 + + # Orientation + if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + ref_for += 1 + elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + ref_rev += 1 + + # Soft-clipped reads? + if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: + ref_SC_reads += 1 + else: + ref_notSC_reads += 1 + + # Distance from the end of the read: + if ith_base != None: + ref_pos_from_end.append( min(ith_base, read_i.query_length-ith_base) ) + + # Flanking indels: + ref_flanking_indel.append( flanking_indel_i ) + + + # Alternate calls: + # SNV, or Deletion, or Insertion where I do not check for matching indel length + elif (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or \ + (indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or \ + (indel_length > 0 and code_i == 3): + + try: + qname_collector[read_i.qname].append(1) + except KeyError: + qname_collector[read_i.qname] = [1] + + alt_read_mq.append( read_i.mapping_quality ) + alt_read_bq.append( read_i.query_qualities[ith_base] ) + + try: + alt_edit_distance.append( read_i.get_tag('NM') ) + except KeyError: + pass + + # Concordance + if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + alt_concordant_reads += 1 + elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + alt_discordant_reads += 1 + + # Orientation + if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + alt_for += 1 + elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + alt_rev += 1 + + # Soft-clipped reads? 
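+                    # read_i.cigar is a list of (op, length) tuples; BAM op code
+                    # 4 is soft clip ('S'), the value cigar_soft_clip (imported
+                    # from read_info_extractor) is expected to hold, so checking
+                    # the first and last tuples flags reads clipped at either end.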
+ if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: + alt_SC_reads += 1 + else: + alt_notSC_reads += 1 + + # Distance from the end of the read: + if ith_base != None: + alt_pos_from_end.append( min(ith_base, read_i.query_length-ith_base) ) + + # Flanking indels: + alt_flanking_indel.append( flanking_indel_i ) + + + # Inconsistent read or 2nd alternate calls: + else: + + try: + qname_collector[read_i.qname].append(2) + except KeyError: + qname_collector[read_i.qname] = [2] + + noise_read_count += 1 + + # Done extracting info from tumor BAM. Now tally them: + ref_mq = mean(ref_read_mq) + alt_mq = mean(alt_read_mq) + z_ranksums_mq = stats.ranksums(alt_read_mq, ref_read_mq)[0] + + ref_bq = mean(ref_read_bq) + alt_bq = mean(alt_read_bq) + z_ranksums_bq = stats.ranksums(alt_read_bq, ref_read_bq)[0] + + ref_NM = mean(ref_edit_distance) + alt_NM = mean(alt_edit_distance) + z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0] + NM_Diff = alt_NM - ref_NM - abs(indel_length) + + concordance_fet = stats.fisher_exact(( (ref_concordant_reads, alt_concordant_reads), (ref_discordant_reads, alt_discordant_reads) ))[1] + strandbias_fet = stats.fisher_exact(( (ref_for, alt_for), (ref_rev, alt_rev) ))[1] + clipping_fet = stats.fisher_exact(( (ref_notSC_reads, alt_notSC_reads), (ref_SC_reads, alt_SC_reads) ))[1] + + z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] + + ref_indel_1bp = ref_flanking_indel.count(1) + ref_indel_2bp = ref_flanking_indel.count(2) + ref_indel_1bp + ref_indel_3bp = ref_flanking_indel.count(3) + ref_indel_2bp + ref_indel_1bp + alt_indel_1bp = alt_flanking_indel.count(1) + alt_indel_2bp = alt_flanking_indel.count(2) + alt_indel_1bp + alt_indel_3bp = alt_flanking_indel.count(3) + alt_indel_2bp + alt_indel_1bp + + consistent_mates = inconsistent_mates = 0 + for pairs_i in qname_collector: + + # Both are alternative calls: + if qname_collector[pairs_i] == [1,1]: + consistent_mates += 1 + + # One is alternate call but the other one is not: + elif len(qname_collector[pairs_i]) == 2 and 1 in qname_collector[pairs_i]: + inconsistent_mates += 1 + + return vars() + + + + + +def from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt): + + ''' + ref_fa is the opened reference fasta file handle + my_coordiate is a list or tuple of 0-based (contig, position) + ''' + + # Homopolymer eval (Make sure to modify for INDEL): + # The min and max is to prevent the +/- 20 bases from exceeding the ends of the reference sequence + lseq = ref_fa.fetch(my_coordinate[0], max(0, my_coordinate[1]-20), my_coordinate[1]) + rseq = ref_fa.fetch(my_coordinate[0], my_coordinate[1]+1, min(ref_fa.get_reference_length(my_coordinate[0])+1, my_coordinate[1]+21) ) + + # This is to get around buy in old version of pysam that reads the reference sequence in bytes instead of strings + lseq = lseq.decode() if isinstance(lseq, bytes) else lseq + rseq = rseq.decode() if isinstance(rseq, bytes) else rseq + + seq41_ref = lseq + ref_base + rseq + seq41_alt = lseq + first_alt + rseq + + ref_counts = genome.count_repeating_bases(seq41_ref) + alt_counts = genome.count_repeating_bases(seq41_alt) + + homopolymer_length = max( max(ref_counts), max(alt_counts) ) + + # Homopolymer spanning the variant site: + ref_c = 0 + alt_c = 0 + for i in rseq: + if i == ref_base: + ref_c += 1 + else: + break + + for i in lseq[::-1]: + if i == ref_base: + ref_c += 1 + else: + break + + for i in rseq: + if i == first_alt: + alt_c += 1 + else: + break + + for i in 
lseq[::-1]: + if i == first_alt: + alt_c += 1 + else: + break + + site_homopolymer_length = max( alt_c+1, ref_c+1 ) + + return homopolymer_length, site_homopolymer_length + + + + + +def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100): + + # Odds Ratio just like VarDict's output + sor_numerator = n_alt * t_ref + sor_denominator = n_ref * t_alt + if sor_numerator == 0 and sor_denominator == 0: + sor = nan + elif sor_denominator == 0: + sor = max_value + else: + sor = sor_numerator / sor_denominator + if sor >= max_value: + sor = max_value + + return sor From 8720927de67998a57bed44dd709726e1898593a6 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sun, 8 Mar 2020 20:14:09 -0700 Subject: [PATCH 02/89] fix for dbsnp --- .../python/extend_stanalone_features.py | 341 ++++++++++++------ 1 file changed, 224 insertions(+), 117 deletions(-) diff --git a/neusomatic/python/extend_stanalone_features.py b/neusomatic/python/extend_stanalone_features.py index c75748e..5f6fef7 100755 --- a/neusomatic/python/extend_stanalone_features.py +++ b/neusomatic/python/extend_stanalone_features.py @@ -15,127 +15,160 @@ import sequencing_features import genomic_file_handlers as genome +from read_info_extractor import rescale def extract_features(candidate_record): - work, reference, tumor_bam, normal_bam, chrom, pos, ref, alt, min_mapq, min_bq, dbsnp, cosmic = candidate_record + work, reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch = candidate_record thread_logger = logging.getLogger( "{} ({})".format(extend_standalone_features.__name__, multiprocessing.current_process().name)) try: - thread_logger.info( - "---------------------Filter Candidates---------------------") tbam = pysam.AlignmentFile(tumor_bam) nbam = pysam.AlignmentFile(normal_bam) ref_fa = pysam.FastaFile(reference) + if dbsnp: + dbsnp_tb = pysam.TabixFile(dbsnp) - my_coordinate = [chrom, int(pos)] - nBamFeatures = sequencing_features.from_bam( - nbam, my_coordinate, ref, alt, min_mapq, min_bq) - tBamFeatures = sequencing_features.from_bam( - tbam, my_coordinate, ref, alt, min_mapq, min_bq) - - n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev'] - n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev'] - t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev'] - t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev'] - sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt) - - homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( - ref_fa, my_coordinate, ref, alt) - - indel_length = len(alt) - len(ref) - - CHROM = my_coordinate[0] - POS = my_coordinate[1] - REF = ref_base - ALT = first_alt - if_dbsnp = if_dbsnp - COMMON = if_common - if_COSMIC = if_cosmic - COSMIC_CNT = num_cases - Consistent_Mates = tBamFeatures['consistent_mates'] - Inconsistent_Mates = tBamFeatures['inconsistent_mates'] - N_DP = nBamFeatures['dp'] - nBAM_REF_MQ = '%g' % nBamFeatures['ref_mq'] - nBAM_ALT_MQ = '%g' % nBamFeatures['alt_mq'] - nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures['z_ranksums_mq'] - nBAM_REF_BQ = '%g' % nBamFeatures['ref_bq'] - nBAM_ALT_BQ = '%g' % nBamFeatures['alt_bq'] - nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures['z_ranksums_bq'] - nBAM_REF_NM = '%g' % nBamFeatures['ref_NM'] - nBAM_ALT_NM = '%g' % nBamFeatures['alt_NM'] - nBAM_NM_Diff = '%g' % nBamFeatures['NM_Diff'] - nBAM_REF_Concordant = nBamFeatures['ref_concordant_reads'] - nBAM_REF_Discordant = nBamFeatures['ref_discordant_reads'] - nBAM_ALT_Concordant = nBamFeatures['alt_concordant_reads'] - nBAM_ALT_Discordant = 
nBamFeatures['alt_discordant_reads'] - nBAM_Concordance_FET = rescale( - nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) - N_REF_FOR = nBamFeatures['ref_for'] - N_REF_REV = nBamFeatures['ref_rev'] - N_ALT_FOR = nBamFeatures['alt_for'] - N_ALT_REV = nBamFeatures['alt_rev'] - nBAM_StrandBias_FET = rescale( - nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) - nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures['z_ranksums_endpos'] - nBAM_REF_Clipped_Reads = nBamFeatures['ref_SC_reads'] - nBAM_ALT_Clipped_Reads = nBamFeatures['alt_SC_reads'] - nBAM_Clipping_FET = rescale( - nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) - nBAM_MQ0 = nBamFeatures['MQ0'] - nBAM_Other_Reads = nBamFeatures['noise_read_count'] - nBAM_Poor_Reads = nBamFeatures['poor_read_count'] - nBAM_REF_InDel_3bp = nBamFeatures['ref_indel_3bp'] - nBAM_REF_InDel_2bp = nBamFeatures['ref_indel_2bp'] - nBAM_REF_InDel_1bp = nBamFeatures['ref_indel_1bp'] - nBAM_ALT_InDel_3bp = nBamFeatures['alt_indel_3bp'] - nBAM_ALT_InDel_2bp = nBamFeatures['alt_indel_2bp'] - nBAM_ALT_InDel_1bp = nBamFeatures['alt_indel_1bp'] - SOR = sor - MaxHomopolymer_Length = homopolymer_length - SiteHomopolymer_Length = site_homopolymer_length - T_DP = tBamFeatures['dp'] - tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'] - tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'] - tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures['z_ranksums_mq'] - tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'] - tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'] - tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures['z_ranksums_bq'] - tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'] - tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'] - tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'] - tBAM_REF_Concordant = tBamFeatures['ref_concordant_reads'] - tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'] - tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'] - tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'] - tBAM_Concordance_FET = rescale( - tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) - T_REF_FOR = tBamFeatures['ref_for'] - T_REF_REV = tBamFeatures['ref_rev'] - T_ALT_FOR = tBamFeatures['alt_for'] - T_ALT_REV = tBamFeatures['alt_rev'] - tBAM_StrandBias_FET = rescale( - tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) - tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures['z_ranksums_endpos'] - tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'] - tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'] - tBAM_Clipping_FET = rescale( - tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) - tBAM_MQ0 = tBamFeatures['MQ0'] - tBAM_Other_Reads = tBamFeatures['noise_read_count'] - tBAM_Poor_Reads = tBamFeatures['poor_read_count'] - tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'] - tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'] - tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'] - tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'] - tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'] - tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'] - InDel_Length = indel_length - - # thread_logger.info(tBamFeatures) - # aaa - - return 0 + ext_features = [] + for chrom, pos, ref, alt, if_cosmic, num_cosmic_cases in batch: + var_id = "-".join([chrom, pos, ref, alt]) + pos = int(pos) + my_coordinate = [chrom, pos] + nBamFeatures = sequencing_features.from_bam( + nbam, my_coordinate, ref, alt, min_mapq, min_bq) + tBamFeatures = sequencing_features.from_bam( + tbam, my_coordinate, ref, alt, min_mapq, min_bq) + + n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev'] + n_alt = 
nBamFeatures['alt_for'] + nBamFeatures['alt_rev'] + t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev'] + t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev'] + sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt) + + homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( + ref_fa, my_coordinate, ref, alt) + + indel_length = len(alt) - len(ref) + + if_dbsnp = 0 + if_common = 0 + if dbsnp: + region = "{}:{}-{}".format(chrom, pos, pos + 1) + dbsnp_vars = {} + for x in dbsnp_tb.fetch(region=region): + chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[ + 0:8] + for alt_ in alts_.split(","): + dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) + dbsnp_vars[dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 + if var_id in dbsnp_vars: + if_dbsnp = 1 + if_common = dbsnp_vars[var_id] + + p_scale = None + CHROM = my_coordinate[0] + POS = my_coordinate[1] + REF = ref + ALT = alt + if_dbsnp = if_dbsnp + COMMON = if_common + if_COSMIC = if_cosmic + COSMIC_CNT = num_cosmic_cases + Consistent_Mates = tBamFeatures['consistent_mates'] + Inconsistent_Mates = tBamFeatures['inconsistent_mates'] + N_DP = nBamFeatures['dp'] + nBAM_REF_MQ = '%g' % nBamFeatures['ref_mq'] + nBAM_ALT_MQ = '%g' % nBamFeatures['alt_mq'] + nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures['z_ranksums_mq'] + nBAM_REF_BQ = '%g' % nBamFeatures['ref_bq'] + nBAM_ALT_BQ = '%g' % nBamFeatures['alt_bq'] + nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures['z_ranksums_bq'] + nBAM_REF_NM = '%g' % nBamFeatures['ref_NM'] + nBAM_ALT_NM = '%g' % nBamFeatures['alt_NM'] + nBAM_NM_Diff = '%g' % nBamFeatures['NM_Diff'] + nBAM_REF_Concordant = nBamFeatures['ref_concordant_reads'] + nBAM_REF_Discordant = nBamFeatures['ref_discordant_reads'] + nBAM_ALT_Concordant = nBamFeatures['alt_concordant_reads'] + nBAM_ALT_Discordant = nBamFeatures['alt_discordant_reads'] + nBAM_Concordance_FET = rescale( + nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) + N_REF_FOR = nBamFeatures['ref_for'] + N_REF_REV = nBamFeatures['ref_rev'] + N_ALT_FOR = nBamFeatures['alt_for'] + N_ALT_REV = nBamFeatures['alt_rev'] + nBAM_StrandBias_FET = rescale( + nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) + nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures['z_ranksums_endpos'] + nBAM_REF_Clipped_Reads = nBamFeatures['ref_SC_reads'] + nBAM_ALT_Clipped_Reads = nBamFeatures['alt_SC_reads'] + nBAM_Clipping_FET = rescale( + nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) + nBAM_MQ0 = nBamFeatures['MQ0'] + nBAM_Other_Reads = nBamFeatures['noise_read_count'] + nBAM_Poor_Reads = nBamFeatures['poor_read_count'] + nBAM_REF_InDel_3bp = nBamFeatures['ref_indel_3bp'] + nBAM_REF_InDel_2bp = nBamFeatures['ref_indel_2bp'] + nBAM_REF_InDel_1bp = nBamFeatures['ref_indel_1bp'] + nBAM_ALT_InDel_3bp = nBamFeatures['alt_indel_3bp'] + nBAM_ALT_InDel_2bp = nBamFeatures['alt_indel_2bp'] + nBAM_ALT_InDel_1bp = nBamFeatures['alt_indel_1bp'] + SOR = sor + MaxHomopolymer_Length = homopolymer_length + SiteHomopolymer_Length = site_homopolymer_length + T_DP = tBamFeatures['dp'] + tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'] + tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'] + tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures['z_ranksums_mq'] + tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'] + tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'] + tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures['z_ranksums_bq'] + tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'] + tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'] + tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'] + tBAM_REF_Concordant 
= tBamFeatures['ref_concordant_reads'] + tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'] + tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'] + tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'] + tBAM_Concordance_FET = rescale( + tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) + T_REF_FOR = tBamFeatures['ref_for'] + T_REF_REV = tBamFeatures['ref_rev'] + T_ALT_FOR = tBamFeatures['alt_for'] + T_ALT_REV = tBamFeatures['alt_rev'] + tBAM_StrandBias_FET = rescale( + tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) + tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures['z_ranksums_endpos'] + tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'] + tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'] + tBAM_Clipping_FET = rescale( + tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) + tBAM_MQ0 = tBamFeatures['MQ0'] + tBAM_Other_Reads = tBamFeatures['noise_read_count'] + tBAM_Poor_Reads = tBamFeatures['poor_read_count'] + tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'] + tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'] + tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'] + tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'] + tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'] + tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'] + InDel_Length = indel_length + + ext_features.append([CHROM, POS, REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, + Consistent_Mates, Inconsistent_Mates, N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, + nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, + nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, + nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, + nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, + nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, + nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, + MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, + tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, + tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, + tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, + tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, + tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, + tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) + return ext_features except Exception as ex: thread_logger.error(traceback.format_exc()) @@ -152,7 +185,8 @@ def extend_standalone_features(candidates_vcf, logger = logging.getLogger(extend_standalone_features.__name__) - logger.info("----------------------Preprocessing------------------------") + logger.info( + "----------------------Extend Standalone Features------------------------") if not os.path.exists(work): os.mkdir(work) @@ -172,34 +206,107 @@ def extend_standalone_features(candidates_vcf, "No normal .bai index file {}".format(normal_bam + ".bai")) if dbsnp: - with gzip.open(dbsnp,'rt') as i_f: + if not os.path.exists(dbsnp): + logger.error("Aborting!") + raise Exception( + "No dbSNP file {}".format(dbsnp)) + + if dbsnp[-6:] != "vcf.gz": + logger.error("Aborting!") + raise Exception( + "The dbSNP file should be a tabix indexed file with .vcf.gz format") + if 
not os.path.exists(dbsnp + ".tbi"): + logger.error("Aborting!") + raise Exception( + "The dbSNP file should be a tabix indexed file with .vcf.gz format. No {}.tbi file exists.".format(dbsnp)) + + if cosmic: + cosmic_vars = {} + with open(cosmic) as i_f: for line in i_f: if not line.strip(): continue if line[0] == "#": continue - print(line) - aaa + x = line.strip().split("\t") + chrom, pos, _, ref, alts, _, _, info = x[0:8] + num_cases = info.split("CNT=")[1].split( + ";")[0] if "CNT=" in info else float('nan') + for alt in alts.split(","): + var_id = "-".join([chrom, pos, ref, alt]) + cosmic_vars[var_id] = num_cases + + n_variants = 0 + with open(candidates_vcf) as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + continue + n_variants += 1 + logger.info("Number of variants: {}".format(n_variants)) + split_len = n_variants // num_threads pool = multiprocessing.Pool(num_threads) map_args = [] with open(candidates_vcf) as i_f: + i = 0 + batch = [] for line in i_f: if not line.strip(): continue if line[0] == "#": continue + chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] - map_args.append((work, reference, tumor_bam, normal_bam, - chrom, pos, ref, alt, min_mapq, min_bq, dbsnp, cosmic)) + var_id = "-".join([chrom, pos, ref, alt]) + num_cosmic_cases = float('nan') + if_cosmic = 0 + if cosmic and var_id in cosmic_vars: + if_cosmic = 1 + num_cosmic_cases = cosmic_vars[var_id] + batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]) + i += 1 + if len(batch) >= split_len or i == n_variants: + map_args.append((work, reference, tumor_bam, normal_bam, + min_mapq, min_bq, dbsnp, batch)) + batch = [] + + header = ["CHROM", "POS", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT", + "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ", "nBAM_Z_Ranksums_MQ", + "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_NM_Diff", + "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", + "nBAM_Concordance_FET", "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_StrandBias_FET", + "nBAM_Z_Ranksums_EndPos", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET", + "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp", + "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp", "SOR", + "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ", "tBAM_Z_Ranksums_MQ", + "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_NM_Diff", + "tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", + "tBAM_Concordance_FET", "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET", + "tBAM_Z_Ranksums_EndPos", "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_Clipping_FET", + "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp", + "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"] + try: ext_features = pool.map_async(extract_features, map_args).get() pool.close() + output_tsv = os.path.join(work, "features.tsv") + with open(output_tsv, "w") as o_f: + o_f.write( + "\t".join(header) + "\n") + for features in ext_features: + for w in features: + o_f.write( + "\t".join(map(lambda x: str(x).replace("nan", "0"), w)) + "\n") except Exception as inst: logger.error(inst) 
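+        # Close the worker pool even on failure so child processes do not
+        # linger before the exception is re-raised below.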
pool.close() traceback.print_exc() raise Exception + logger.info("Done Extending Standalone Features.") + return ext_features + if __name__ == '__main__': FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' From 9cb9960fdfbc3e041b58ebb24a93e7dd5d8e739a Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 11 Mar 2020 17:00:13 -0700 Subject: [PATCH 03/89] fix_extract_ensemble --- ...analone_features.py => extend_features.py} | 107 +++++++++++------- neusomatic/python/generate_dataset.py | 43 ++++--- neusomatic/python/preprocess.py | 42 ++++++- 3 files changed, 130 insertions(+), 62 deletions(-) rename neusomatic/python/{extend_stanalone_features.py => extend_features.py} (77%) diff --git a/neusomatic/python/extend_stanalone_features.py b/neusomatic/python/extend_features.py similarity index 77% rename from neusomatic/python/extend_stanalone_features.py rename to neusomatic/python/extend_features.py index 5f6fef7..8b275de 100755 --- a/neusomatic/python/extend_stanalone_features.py +++ b/neusomatic/python/extend_features.py @@ -1,6 +1,6 @@ #!/usr/bin/env python #------------------------------------------------------------------------- -# extend_standalone_features.py +# extend_features.py # add extra features for standalone mode #------------------------------------------------------------------------- import argparse @@ -19,9 +19,9 @@ def extract_features(candidate_record): - work, reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch = candidate_record + reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch = candidate_record thread_logger = logging.getLogger( - "{} ({})".format(extend_standalone_features.__name__, multiprocessing.current_process().name)) + "{} ({})".format(extract_features.__name__, multiprocessing.current_process().name)) try: tbam = pysam.AlignmentFile(tumor_bam) nbam = pysam.AlignmentFile(normal_bam) @@ -43,7 +43,8 @@ def extract_features(candidate_record): n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev'] t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev'] t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev'] - sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt) + sor = sequencing_features.somaticOddRatio( + n_ref, n_alt, t_ref, t_alt) homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( ref_fa, my_coordinate, ref, alt) @@ -60,7 +61,8 @@ def extract_features(candidate_record): 0:8] for alt_ in alts_.split(","): dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) - dbsnp_vars[dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 + dbsnp_vars[ + dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 if var_id in dbsnp_vars: if_dbsnp = 1 if_common = dbsnp_vars[var_id] @@ -153,21 +155,21 @@ def extract_features(candidate_record): tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'] InDel_Length = indel_length - ext_features.append([CHROM, POS, REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, - Consistent_Mates, Inconsistent_Mates, N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, - nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, - nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, - nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, - nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, - nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, - nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, 
nBAM_ALT_InDel_1bp, SOR, - MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, - tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, - tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, - tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, - tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, - tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, - tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) + ext_features.append([CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, + Consistent_Mates, Inconsistent_Mates, N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, + nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, + nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, + nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, + nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, + nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, + nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, + MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, + tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, + tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, + tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, + tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, + tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, + tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) return ext_features except Exception as ex: @@ -176,19 +178,18 @@ def extract_features(candidate_record): return None -def extend_standalone_features(candidates_vcf, - reference, tumor_bam, normal_bam, - min_mapq, min_bq, - dbsnp, cosmic, - num_threads, - work): +def extend_features(candidates_vcf, + exclude_variants, + output_tsv, + reference, tumor_bam, normal_bam, + min_mapq, min_bq, + dbsnp, cosmic, + num_threads): - logger = logging.getLogger(extend_standalone_features.__name__) + logger = logging.getLogger(extend_features.__name__) logger.info( "----------------------Extend Standalone Features------------------------") - if not os.path.exists(work): - os.mkdir(work) if not os.path.exists(tumor_bam): logger.error("Aborting!") @@ -236,6 +237,21 @@ def extend_standalone_features(candidates_vcf, var_id = "-".join([chrom, pos, ref, alt]) cosmic_vars[var_id] = num_cases + if exclude_variants: + exclude_vars = [] + with open(exclude_variants) as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + continue + if exclude_variants.split(".")[-1]=="tsv" and line[0:5]=="CHROM": + continue + x = line.strip().split("\t") + chrom, pos, _, ref, alt = x[0:5] + var_id = "-".join([chrom, pos, ref, alt]) + exclude_vars.append(var_id) + n_variants = 0 with open(candidates_vcf) as i_f: for line in i_f: @@ -259,6 +275,9 @@ def extend_standalone_features(candidates_vcf, chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] var_id = "-".join([chrom, pos, ref, alt]) + if exclude_variants: + if 
var_id in exclude_vars: + continue num_cosmic_cases = float('nan') if_cosmic = 0 if cosmic and var_id in cosmic_vars: @@ -267,11 +286,12 @@ def extend_standalone_features(candidates_vcf, batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]) i += 1 if len(batch) >= split_len or i == n_variants: - map_args.append((work, reference, tumor_bam, normal_bam, + map_args.append((reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch)) batch = [] - header = ["CHROM", "POS", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT", + logger.info("Number of batches: {}".format(len(map_args))) + header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT", "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ", "nBAM_Z_Ranksums_MQ", "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_NM_Diff", "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", @@ -290,7 +310,6 @@ def extend_standalone_features(candidates_vcf, try: ext_features = pool.map_async(extract_features, map_args).get() pool.close() - output_tsv = os.path.join(work, "features.tsv") with open(output_tsv, "w") as o_f: o_f.write( "\t".join(header) + "\n") @@ -317,6 +336,10 @@ def extend_standalone_features(candidates_vcf, description='extract extra features for standalone mode') parser.add_argument('--candidates_vcf', type=str, help='candidates vcf', required=True) + parser.add_argument('--exclude_variants', type=str, help='variants to exclude', + default=None) + parser.add_argument('--output_tsv', type=str, help='output features tsv', + required=True) parser.add_argument('--reference', type=str, help='reference fasta filename', required=True) parser.add_argument('--tumor_bam', type=str, @@ -333,23 +356,23 @@ def extend_standalone_features(candidates_vcf, help='COSMIC vcf (to annotate candidate variants)', default=None) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) - parser.add_argument('--work', type=str, - help='work directory', required=True) args = parser.parse_args() logger.info(args) try: - output = extend_standalone_features(args.candidates_vcf, - args.reference, args.tumor_bam, args.normal_bam, - args.min_mapq, args.min_bq, - args.dbsnp, args.cosmic, - args.num_threads, - args.work) + output = extend_features(args.candidates_vcf, + args.exclude_variants, + args.output_tsv, + args.reference, args.tumor_bam, args.normal_bam, + args.min_mapq, args.min_bq, + args.dbsnp, args.cosmic, + args.num_threads, + ) if output is None: - raise Exception("extend_standalone_features failed!") + raise Exception("extend_features failed!") except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") logger.error( - "extend_standalone_features.py failure on arguments: {}".format(args)) + "extend_features.py failure on arguments: {}".format(args)) raise e diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 01bd2d7..908b781 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1369,7 +1369,7 @@ def find_records(input_record): return None -def extract_ensemble(work, ensemble_tsv): +def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend): logger = logging.getLogger(extract_ensemble.__name__) ensemble_data = [] ensemble_pos = [] @@ -1399,15 +1399,23 @@ def extract_ensemble(work, ensemble_tsv): "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", 
"tBAM_REF_InDel_2bp", "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"] + callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", + "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS", + "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", + "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] + + n_vars = 0 with open(ensemble_tsv) as s_f: for line in s_f: if not line.strip(): continue if line[0:5] == "CHROM": header_pos = line.strip().split()[0:5] - header = line.strip().split()[5:105] + header_ = line.strip().split()[5:] + if is_extend: + header_ += callers_features header_en = list(filter( - lambda x: x[1] in expected_features, enumerate(line.strip().split()[5:]))) + lambda x: x[1] in expected_features, enumerate(header_))) header = list(map(lambda x: x[1], header_en)) if set(expected_features) - set(header): logger.error("The following features are missing from ensemble file: {}".format( @@ -1420,9 +1428,15 @@ def extract_ensemble(work, ensemble_tsv): fields = line.strip().split() fields[2] = str(int(fields[1]) + len(fields[3])) ensemble_pos.append(fields[0:5]) + features = fields[5:] + if is_extend: + features += ["0"] * len(callers_features) ensemble_data.append(list(map(lambda x: float( - x.replace("False", "0").replace("True", "1")), fields[5:]))) - ensemble_data = np.array(ensemble_data)[:, order_header] + x.replace("False", "0").replace("True", "1")), features))) + n_vars += 1 + if n_vars > 0: + ensemble_data = np.array(ensemble_data)[:, order_header] + header = np.array(header)[order_header].tolist() cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ "Consistent_Mates", "Inconsistent_Mates", "N_DP", @@ -1502,14 +1516,14 @@ def extract_ensemble(work, ensemble_tsv): ] selected_features = sorted([i for f in min_max_features for i in f[0]]) selected_features_tags = list(map(lambda x: header[x], selected_features)) - for i_s, mn, mx in min_max_features: - s = ensemble_data[:, np.array(i_s)] - s = np.maximum(np.minimum(s, mx), mn) - s = (s - mn) / (mx - mn) - ensemble_data[:, np.array(i_s)] = s - ensemble_data = ensemble_data[:, selected_features] - ensemble_data = ensemble_data.tolist() - ensemble_bed = os.path.join(work, "ensemble.bed") + if n_vars > 0: + for i_s, mn, mx in min_max_features: + s = ensemble_data[:, np.array(i_s)] + s = np.maximum(np.minimum(s, mx), mn) + s = (s - mn) / (mx - mn) + ensemble_data[:, np.array(i_s)] = s + ensemble_data = ensemble_data[:, selected_features] + ensemble_data = ensemble_data.tolist() with open(ensemble_bed, "w")as f_: f_.write( "#" + "\t".join(map(str, header_pos + selected_features_tags)) + "\n") @@ -1546,7 +1560,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be split_batch_size = 10000 if ensemble_tsv and not ensemble_bed: - ensemble_bed = extract_ensemble(work, ensemble_tsv) + ensemble_bed = os.path.join(work, "ensemble.bed") + extract_ensemble(ensemble_tsv, ensemble_bed, False) cmd = "bedtools intersect -a {} -b {} -u".format( tumor_pred_vcf_file, region_bed_file) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index b227def..a82ddf7 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -19,7 +19,8 @@ from filter_candidates import filter_candidates from generate_dataset import generate_dataset, extract_ensemble from scan_alignments 
import scan_alignments -from utils import concatenate_vcfs, run_bedtools_cmd +from extend_features import extend_features +from utils import concatenate_files, concatenate_vcfs, run_bedtools_cmd def split_dbsnp(record): @@ -196,10 +197,10 @@ def extract_candidate_split_regions( for line in f_: if not line.strip(): continue - if line[0]!="#": + if line[0] != "#": is_empty = False break - logger.info([filtered_vcf,is_empty]) + logger.info([filtered_vcf, is_empty]) if not is_empty: cmd = '''grep -v "#" {}'''.format(filtered_vcf) candidates_bed = run_bedtools_cmd(cmd, run_logger=logger) @@ -219,7 +220,6 @@ def extract_candidate_split_regions( prefix="tmpbed_", suffix=".bed", delete=False) candidates_bed = candidates_bed.name - if ensemble_beds: cmd = "cat {} {}".format( candidates_bed, @@ -255,6 +255,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, matrix_width, matrix_base_pad, min_ev_frac_per_col, ensemble_tsv, long_read, restart, first_do_without_qual, filter_duplicate, + add_extra_features, num_threads, scan_alignments_binary,): logger = logging.getLogger(preprocess.__name__) @@ -289,7 +290,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, ensemble_bed = os.path.join(work, "ensemble.bed") logger.info("Extract ensemble info.") if restart or not os.path.exists(ensemble_bed): - ensemble_bed = extract_ensemble(work, ensemble_tsv) + extract_ensemble(ensemble_tsv, ensemble_bed, False) merge_d_for_short_read = 100 candidates_split_regions = [] @@ -380,10 +381,35 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, if os.path.exists(work_dataset_split): shutil.rmtree(work_dataset_split) os.mkdir(work_dataset_split) + ensemble_bed_i = ensemble_beds[i] if ensemble_tsv else None + if add_extra_features: + extra_features_tsv = os.path.join( + work_dataset_split, "ex_features.tsv") + extra_features = extend_features(filtered_vcf, + ensemble_beds[ + i] if ensemble_tsv else None, + extra_features_tsv, + reference, tumor_bam, normal_bam, + min_mapq, snp_min_bq, + dbsnp, None, + num_threads) + extra_features_bed = os.path.join( + work_dataset_split, "ex_features.bed") + extract_ensemble(extra_features_tsv, extra_features_bed, True) + if ensemble_tsv: + merged_features_bed = os.path.join( + work_dataset_split, "merged_features.bed") + concatenate_files([extra_features_bed, ensemble_beds[ + i]], merged_features_bed, check_file_existence=True) + ensemble_bed_i = merged_features_bed + else: + ensemble_bed_i = extra_features_bed + + generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf, candidates_split_region, tumor_count, normal_count, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, - ensemble_beds[i] if ensemble_tsv else None, tsv_batch_size) + ensemble_bed_i, tsv_batch_size) shutil.rmtree(bed_tempdir) tempfile.tempdir = original_tempdir @@ -465,6 +491,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--filter_duplicate', help='filter duplicate reads when preparing pileup information', action="store_true") + parser.add_argument('--add_extra_features', + help='add extra input features', + action="store_true") parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--scan_alignments_binary', type=str, @@ -482,6 +511,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.truth_vcf, args.tsv_batch_size, 
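The is_empty probe above walks the filtered VCF until it finds a non-header record, since running grep and bedtools over a header-only VCF would feed empty BEDs downstream. The check, reduced to a helper (path is hypothetical):

    def vcf_is_empty(vcf_path):
        # True when the VCF holds only '#' header lines or blank lines.
        with open(vcf_path) as f:
            for line in f:
                if line.strip() and line[0] != "#":
                    return False
        return True

    # vcf_is_empty("work_tumor/filtered_candidates.vcf")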
args.matrix_width, args.matrix_base_pad, args.min_ev_frac_per_col, args.ensemble_tsv, args.long_read, args.restart, args.first_do_without_qual, args.filter_duplicate, + args.add_extra_features, args.num_threads, args.scan_alignments_binary) except Exception as e: From 58ccf62f48f5969b03d7e0988eb7143d3d065f4e Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 12 Mar 2020 14:23:35 -0700 Subject: [PATCH 04/89] fix dirnames --- neusomatic/python/preprocess.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index a82ddf7..3bfe814 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -383,8 +383,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, os.mkdir(work_dataset_split) ensemble_bed_i = ensemble_beds[i] if ensemble_tsv else None if add_extra_features: + work_tumor_i = os.dirname(filtered_vcf) extra_features_tsv = os.path.join( - work_dataset_split, "ex_features.tsv") + work_tumor_i, "extra_features.tsv") extra_features = extend_features(filtered_vcf, ensemble_beds[ i] if ensemble_tsv else None, @@ -394,7 +395,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, dbsnp, None, num_threads) extra_features_bed = os.path.join( - work_dataset_split, "ex_features.bed") + work_dataset_split, "extra_features.bed") extract_ensemble(extra_features_tsv, extra_features_bed, True) if ensemble_tsv: merged_features_bed = os.path.join( From cab5ff7772d8e8d1816e8a22ed405f7f37b4c37b Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 13 Mar 2020 19:26:46 -0700 Subject: [PATCH 05/89] small fix --- neusomatic/python/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index ef79035..e5b4bfc 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -345,7 +345,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, os.mkdir(work_dataset_split) ensemble_bed_i = ensemble_beds[i] if ensemble_tsv else None if add_extra_features: - work_tumor_i = os.dirname(filtered_vcf) + work_tumor_i = os.path.dirname(filtered_vcf) extra_features_tsv = os.path.join( work_tumor_i, "extra_features.tsv") extra_features = extend_features(filtered_vcf, From efa419098cfcc57915ae42cf5e9a7cf2ae5c0777 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 19 Mar 2020 16:39:20 -0700 Subject: [PATCH 06/89] fix features --- neusomatic/python/preprocess.py | 69 +++++++++++++++-------- neusomatic/python/read_info_extractor.py | 71 ------------------------ 2 files changed, 47 insertions(+), 93 deletions(-) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index e5b4bfc..a23a515 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -249,7 +249,6 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, raise Exception( "The dbSNP file should be a tabix indexed file with .vcf.gz format. 
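Note the two-step history here: [PATCH 04/89] introduces os.dirname(filtered_vcf), which raises AttributeError because the function lives in os.path, and [PATCH 05/89] below corrects it. For reference:

    import os

    path = "work_tumor_0/filtered_candidates.vcf"  # illustrative path
    # os.dirname(path)           # AttributeError: module 'os' has no attribute 'dirname'
    print(os.path.dirname(path))  # -> 'work_tumor_0'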
No {}.tbi file exists.".format(dbsnp)) - ensemble_bed = None if ensemble_tsv: ensemble_bed = os.path.join(work, "ensemble.bed") @@ -322,15 +321,15 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, os.mkdir(work_normal) logger.info("Scan normal bam (and extracting quality scores).") normal_counts, _, _ = process_split_region("normal", work_normal, region_bed, reference, mode, normal_bam, - None, scan_window_size, 0.2, min_mapq, - None, min_dp, max_dp, - filter_duplicate, - good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, - ins_min_af, del_min_af, del_merge_min_af, - ins_merge_min_af, merge_r, - scan_alignments_binary, restart, num_threads, - calc_qual=True, - regions=candidates_split_regions) + None, scan_window_size, 0.2, min_mapq, + None, min_dp, max_dp, + filter_duplicate, + good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, + ins_min_af, del_min_af, del_merge_min_af, + ins_merge_min_af, merge_r, + scan_alignments_binary, restart, num_threads, + calc_qual=True, + regions=candidates_split_regions) work_dataset = os.path.join(work, "dataset") if restart or not os.path.exists(work_dataset): @@ -348,27 +347,53 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_tumor_i = os.path.dirname(filtered_vcf) extra_features_tsv = os.path.join( work_tumor_i, "extra_features.tsv") - extra_features = extend_features(filtered_vcf, - ensemble_beds[ - i] if ensemble_tsv else None, - extra_features_tsv, - reference, tumor_bam, normal_bam, - min_mapq, snp_min_bq, - dbsnp, None, - num_threads) + if not os.path.exists(extra_features_tsv) or restart: + extend_features(filtered_vcf, + ensemble_beds[ + i] if ensemble_tsv else None, + extra_features_tsv, + reference, tumor_bam, normal_bam, + min_mapq, snp_min_bq, + dbsnp, None, + num_threads) extra_features_bed = os.path.join( work_dataset_split, "extra_features.bed") - extract_ensemble(extra_features_tsv, extra_features_bed, True) + if not os.path.exists(extra_features_bed) or restart: + extract_ensemble(extra_features_tsv, extra_features_bed, True) if ensemble_tsv: merged_features_bed = os.path.join( work_dataset_split, "merged_features.bed") - concatenate_files([extra_features_bed, ensemble_beds[ - i]], merged_features_bed, check_file_existence=True) + if not os.path.exists(merged_features_bed) or restart: + exclude_ens_variants = [] + with open(merged_features_bed, "w") as o_f: + with open(ensemble_beds[i]) as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + o_f.write(line) + continue + chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] + var_id = "-".join([chrom, pos, ref, alt]) + exclude_ens_variants.append(var_id) + o_f.write(line) + with open(extra_features_bed) as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + continue + chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] + var_id = "-".join([chrom, pos, ref, alt]) + if var_id in exclude_ens_variants: + continue + o_f.write(line) + # concatenate_files([extra_features_bed, ensemble_beds[ + # i]], merged_features_bed, check_file_existence=True) ensemble_bed_i = merged_features_bed else: ensemble_bed_i = extra_features_bed - generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf, candidates_split_region, tumor_count, normal_count, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index 4dcec80..b5bf75d 100644 --- 
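[PATCH 06/89] wraps each expensive artifact in an `if not os.path.exists(...) or restart:` guard, which makes preprocessing resumable: files left by a previous run are reused unless a restart is forced. The idiom in isolation (names are illustrative):

    import os

    def run_step(output_path, restart, compute):
        # Recompute only when the artifact is missing or a restart is forced;
        # otherwise trust the file produced by an earlier run.
        if restart or not os.path.exists(output_path):
            result = compute()
            with open(output_path, "w") as f:
                f.write(result)
        return output_path

    run_step("extra_features.tsv", False, lambda: "CHROM\tPOS\n")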
a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -217,74 +217,3 @@ def rescale(x, original='fraction', rescale_to=None, max_phred=1001): return y - - - - -##### Stuff from VarDict: -def find_MSI(vcf_object): - - msi = vcf_object.get_info_value('MSI') - if msi: - msi = float(msi) - else: - msi = nan - return msi - - -def find_MSILEN(vcf_object): - - msilen = vcf_object.get_info_value('MSILEN') - if msilen: - msilen = float(msilen) - else: - msilen = nan - return msilen - - -def find_SHIFT3(vcf_object): - - shift3 = vcf_object.get_info_value('SHIFT3') - if shift3: - shift3 = float(shift3) - else: - shift3 = nan - return shift3 - - - -# MuTect2's Stuff: -def mutect2_nlod(vcf_object): - nlod = vcf_object.get_info_value('NLOD') - if nlod: - return float(nlod) - else: - return nan - - -def mutect2_tlod(vcf_object): - tlod = vcf_object.get_info_value('TLOD') - if tlod: - return float(tlod) - else: - return nan - - -def mutect2_STR(vcf_object): - if vcf_object.get_info_value('STR'): - return 1 - else: - return 0 - - -def mutect2_ECNT(vcf_object): - ecnt = vcf_object.get_info_value('ECNT') - if ecnt: - try: - ecnt = int( ecnt ) - except ValueError: - ecnt = nan - else: - ecnt = nan - - return ecnt From e4be780ed6e00f45b2445afe8ed3073c90b953db Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 10 Apr 2020 00:15:55 -0700 Subject: [PATCH 07/89] fix ensemble --- neusomatic/python/call.py | 15 ++++++++++----- neusomatic/python/train.py | 16 +++++++++++----- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 073d674..6f3c5fb 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -395,7 +395,6 @@ def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_thr def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, batch_size, max_load_candidates, pass_threshold, lowqual_threshold, - ensemble, use_cuda): logger = logging.getLogger(call_neusomatic.__name__) @@ -412,7 +411,17 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, vartype_classes = ['DEL', 'INS', 'NONE', 'SNP'] data_transform = matrix_transform((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + + ensemble = False + with open(candidates_tsv[0]) as i_f: + for line in i_f: + x = line.strip().split() + if len(x) == 97: + ensemble = True + break + num_channels = 119 if ensemble else 26 + logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: logger.info("GPU calling!") @@ -583,9 +592,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, help='output directory', required=True) parser.add_argument('--checkpoint', type=str, help='network model checkpoint path', required=True) - parser.add_argument('--ensemble', - help='Enable calling for ensemble mode', - action="store_true") parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--batch_size', type=int, @@ -607,7 +613,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, args.checkpoint, args.num_threads, args.batch_size, args.max_load_candidates, args.pass_threshold, args.lowqual_threshold, - args.ensemble, use_cuda) except Exception as e: logger.error(traceback.format_exc()) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 3cc5fac..35b1f1e 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -201,7 
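With [PATCH 07/89], call.py stops requiring the --ensemble flag and instead sniffs the mode from the width of the first data row in the candidates TSV: 97 columns means the 93 ensemble features are present (plus 4 bookkeeping columns), which selects 119 = 93 + 26 input channels instead of the 26 standalone ones. The detection, restated in the form the later cleanup ([PATCH 09/89]) settles on:

    NUM_ENS_FEATURES = 93  # ensemble feature columns
    NUM_ST_FEATURES = 26   # standalone input channels

    def detect_ensemble(tsv_path):
        # The first row's column count reveals whether ensemble
        # features were appended to the candidate matrix.
        with open(tsv_path) as f:
            n_cols = len(f.readline().strip().split())
        return n_cols == NUM_ENS_FEATURES + 4

    def num_channels(tsv_path):
        return (NUM_ENS_FEATURES + NUM_ST_FEATURES
                if detect_ensemble(tsv_path) else NUM_ST_FEATURES)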
+201,7 @@ def __len__(self): def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpoint, num_threads, batch_size, max_epochs, learning_rate, lr_drop_epochs, lr_drop_ratio, momentum, boost_none, none_count_scale, - max_load_candidates, coverage_thr, save_freq, ensemble, + max_load_candidates, coverage_thr, save_freq, merged_candidates_per_tsv, merged_max_num_tsvs, overwrite_merged_tsvs, train_split_len, normalize_channels, @@ -219,7 +219,17 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo torch.set_num_threads(num_threads) data_transform = matrix_transform((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + + ensemble = False + with open(candidates_tsv[0]) as i_f: + for line in i_f: + x=line.strip().split() + if len(x)==97: + ensemble=True + break + num_channels = 119 if ensemble else 26 + logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: logger.info("GPU training!") @@ -507,9 +517,6 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo help='pretrained network model checkpoint path', default=None) parser.add_argument('--validation_candidates_tsv', nargs="*", help=' validation candidate tsv files', default=[]) - parser.add_argument('--ensemble', - help='Enable training for ensemble mode', - action="store_true") parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--batch_size', type=int, @@ -568,7 +575,6 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo args.lr, args.lr_drop_epochs, args.lr_drop_ratio, args.momentum, args.boost_none, args.none_count_scale, args.max_load_candidates, args.coverage_thr, args.save_freq, - args.ensemble, args.merged_candidates_per_tsv, args.merged_max_num_tsvs, args.overwrite_merged_tsvs, args.train_split_len, args.normalize_channels, From 8fc85830d8a184b7d112fde190b51591c1ea6565 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 21 Apr 2020 17:12:04 -0700 Subject: [PATCH 08/89] backward compatiblity for call.py --- neusomatic/python/call.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 6f3c5fb..25b76f9 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -592,6 +592,9 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, help='output directory', required=True) parser.add_argument('--checkpoint', type=str, help='network model checkpoint path', required=True) + parser.add_argument('--ensemble', + help='Enable calling for ensemble mode', + action="store_true") parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--batch_size', type=int, From 8e8ec699ab38d26d42f160e222cbd74ab233f92b Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 28 Apr 2020 18:59:37 -0700 Subject: [PATCH 09/89] few fixes --- neusomatic/python/call.py | 12 ++++---- neusomatic/python/defaults.py | 5 ++-- neusomatic/python/extend_features.py | 28 ++++--------------- neusomatic/python/preprocess.py | 42 +++++++++++----------------- neusomatic/python/train.py | 13 ++++----- 5 files changed, 36 insertions(+), 64 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 0527245..6afb6f1 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -27,7 +27,7 @@ from dataloader import NeuSomaticDataset, matrix_transform from utils import get_chromosomes_order, prob2phred from 
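[PATCH 08/89] then restores --ensemble purely so existing wrappers that still pass the flag keep working; the value no longer drives behavior. A common way to keep such a flag while telling users it is now a no-op (an editorial sketch, not what the patch itself does):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--ensemble', action='store_true',
                        help='deprecated; ensemble mode is now auto-detected')
    args = parser.parse_args(['--ensemble'])
    if args.ensemble:
        # Accept the flag for backward compatibility, but only warn.
        print("warning: --ensemble is deprecated and has no effect")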
merge_tsvs import merge_tsvs -from defaults import VARTYPE_CLASSES +from defaults import VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES import torch._utils try: @@ -414,13 +414,11 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, ensemble = False with open(candidates_tsv[0]) as i_f: - for line in i_f: - x = line.strip().split() - if len(x) == 97: - ensemble = True - break + x=i_f.readline().strip().split() + if len(x) == NUM_ENS_FEATURES+4: + ensemble = True - num_channels = 119 if ensemble else 26 + num_channels = NUM_ENS_FEATURES+NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: diff --git a/neusomatic/python/defaults.py b/neusomatic/python/defaults.py index d249a61..97a90ee 100644 --- a/neusomatic/python/defaults.py +++ b/neusomatic/python/defaults.py @@ -1,4 +1,5 @@ -NUM_ENS_FEATURES=93 -VCF_HEADER="##fileformat=VCFv4.2" +NUM_ENS_FEATURES = 93 +NUM_ST_FEATURES = 26 +VCF_HEADER = "##fileformat=VCFv4.2" TYPE_CLASS_DICT = {"DEL": 0, "INS": 1, "NONE": 2, "SNP": 3} VARTYPE_CLASSES = ['DEL', 'INS', 'NONE', 'SNP'] \ No newline at end of file diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 8b275de..f5b2d3b 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -16,6 +16,7 @@ import sequencing_features import genomic_file_handlers as genome from read_info_extractor import rescale +from utils import skip_empty def extract_features(candidate_record): @@ -224,11 +225,7 @@ def extend_features(candidates_vcf, if cosmic: cosmic_vars = {} with open(cosmic) as i_f: - for line in i_f: - if not line.strip(): - continue - if line[0] == "#": - continue + for line in skip_empty(i_f): x = line.strip().split("\t") chrom, pos, _, ref, alts, _, _, info = x[0:8] num_cases = info.split("CNT=")[1].split( @@ -240,11 +237,7 @@ def extend_features(candidates_vcf, if exclude_variants: exclude_vars = [] with open(exclude_variants) as i_f: - for line in i_f: - if not line.strip(): - continue - if line[0] == "#": - continue + for line in skip_empty(i_f): if exclude_variants.split(".")[-1]=="tsv" and line[0:5]=="CHROM": continue x = line.strip().split("\t") @@ -254,25 +247,16 @@ def extend_features(candidates_vcf, n_variants = 0 with open(candidates_vcf) as i_f: - for line in i_f: - if not line.strip(): - continue - if line[0] == "#": - continue + for line in skip_empty(i_f): n_variants += 1 logger.info("Number of variants: {}".format(n_variants)) - split_len = n_variants // num_threads + split_len = (n_variants + num_threads - 1) // num_threads pool = multiprocessing.Pool(num_threads) map_args = [] with open(candidates_vcf) as i_f: i = 0 batch = [] - for line in i_f: - if not line.strip(): - continue - if line[0] == "#": - continue - + for line in skip_empty(i_f): chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] var_id = "-".join([chrom, pos, ref, alt]) if exclude_variants: diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 0c57e98..fb4a6e2 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -348,33 +348,23 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_dataset_split, "merged_features.bed") if not os.path.exists(merged_features_bed) or restart: exclude_ens_variants = [] - with open(merged_features_bed, "w") as o_f: - with open(ensemble_beds[i]) as i_f: - for line in i_f: - if not 
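[PATCH 09/89] also switches split_len to ceiling division. With floor division, the n_variants % num_threads leftover variants spill into an extra batch beyond the intended num_threads; the (n + k - 1) // k form caps the batch count at num_threads. Concretely:

    n_variants, num_threads = 10, 4
    floor_len = n_variants // num_threads                     # 2 -> five batches: 2,2,2,2,2
    ceil_len = (n_variants + num_threads - 1) // num_threads  # 3 -> four batches: 3,3,3,1
    print(floor_len, ceil_len)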
line.strip(): - continue - if line[0] == "#": - o_f.write(line) - continue - chrom, pos, _, ref, alt = line.strip().split("\t")[ - 0:5] - var_id = "-".join([chrom, pos, ref, alt]) - exclude_ens_variants.append(var_id) + with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: + for line in skip_empty(i_f_1, skip_header=False): + if line.startswith("#"): o_f.write(line) - with open(extra_features_bed) as i_f: - for line in i_f: - if not line.strip(): - continue - if line[0] == "#": - continue - chrom, pos, _, ref, alt = line.strip().split("\t")[ - 0:5] - var_id = "-".join([chrom, pos, ref, alt]) - if var_id in exclude_ens_variants: - continue - o_f.write(line) - # concatenate_files([extra_features_bed, ensemble_beds[ - # i]], merged_features_bed, check_file_existence=True) + continue + chrom, pos, _, ref, alt = line.strip().split("\t")[ + 0:5] + var_id = "-".join([chrom, pos, ref, alt]) + exclude_ens_variants.append(var_id) + o_f.write(line) + for line in skip_empty(i_f_2): + chrom, pos, _, ref, alt = line.strip().split("\t")[ + 0:5] + var_id = "-".join([chrom, pos, ref, alt]) + if var_id in exclude_ens_variants: + continue + o_f.write(line) ensemble_bed_i = merged_features_bed else: ensemble_bed_i = extra_features_bed diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 6bf6346..acc44c1 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -24,7 +24,7 @@ from network import NeuSomaticNet from dataloader import NeuSomaticDataset, matrix_transform from merge_tsvs import merge_tsvs -from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES +from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES import torch._utils try: @@ -220,13 +220,12 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo ensemble = False with open(candidates_tsv[0]) as i_f: - for line in i_f: - x=line.strip().split() - if len(x)==97: - ensemble=True - break + x=i_f.readline().strip().split() + if len(x) == NUM_ENS_FEATURES+4: + ensemble = True + + num_channels = NUM_ENS_FEATURES + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES - num_channels = 119 if ensemble else 26 logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: From 2a845091d50a1335dc2d194f3c9115541eae8bc8 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 28 Apr 2020 19:12:29 -0700 Subject: [PATCH 10/89] fix format --- neusomatic/python/call.py | 6 +- neusomatic/python/defaults.py | 2 +- neusomatic/python/extend_features.py | 2 +- .../python/extract_postprocess_targets.py | 1 + neusomatic/python/filter_candidates.py | 5 +- neusomatic/python/generate_dataset.py | 8 +- neusomatic/python/genomic_file_handlers.py | 191 ++++++++---------- neusomatic/python/long_read_indelrealign.py | 42 ++-- neusomatic/python/merge_post_vcfs.py | 1 + neusomatic/python/postprocess.py | 9 +- neusomatic/python/preprocess.py | 3 +- neusomatic/python/read_info_extractor.py | 110 +++++----- neusomatic/python/resolve_scores.py | 2 +- neusomatic/python/scan_alignments.py | 1 + neusomatic/python/sequencing_features.py | 190 ++++++++--------- neusomatic/python/train.py | 8 +- neusomatic/python/utils.py | 6 +- 17 files changed, 290 insertions(+), 297 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 6afb6f1..b6eb055 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -414,11 +414,11 @@ def call_neusomatic(candidates_tsv, 
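The merged_features.bed rewrite keeps every ensemble record and appends only those extra-feature records whose chrom-pos-ref-alt key has not been seen. The patch accumulates seen keys in a list, so each membership test is O(n); a set gives O(1) lookups when both files are large. A sketch under that assumption (file names illustrative):

    def merge_feature_beds(ensemble_bed, extra_bed, out_bed):
        seen = set()
        with open(out_bed, "w") as o_f:
            with open(ensemble_bed) as i_f:
                for line in i_f:
                    if not line.strip():
                        continue
                    if line[0] == "#":
                        o_f.write(line)  # keep header lines
                        continue
                    chrom, pos, _, ref, alt = line.strip().split("\t")[0:5]
                    seen.add("-".join([chrom, pos, ref, alt]))
                    o_f.write(line)
            with open(extra_bed) as i_f:
                for line in i_f:
                    if not line.strip() or line[0] == "#":
                        continue
                    chrom, pos, _, ref, alt = line.strip().split("\t")[0:5]
                    if "-".join([chrom, pos, ref, alt]) in seen:
                        continue  # the ensemble record takes precedence
                    o_f.write(line)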
ref_file, out_dir, checkpoint, num_threads, ensemble = False with open(candidates_tsv[0]) as i_f: - x=i_f.readline().strip().split() - if len(x) == NUM_ENS_FEATURES+4: + x = i_f.readline().strip().split() + if len(x) == NUM_ENS_FEATURES + 4: ensemble = True - num_channels = NUM_ENS_FEATURES+NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES + num_channels = NUM_ENS_FEATURES + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: diff --git a/neusomatic/python/defaults.py b/neusomatic/python/defaults.py index 97a90ee..4cf0d21 100644 --- a/neusomatic/python/defaults.py +++ b/neusomatic/python/defaults.py @@ -2,4 +2,4 @@ NUM_ST_FEATURES = 26 VCF_HEADER = "##fileformat=VCFv4.2" TYPE_CLASS_DICT = {"DEL": 0, "INS": 1, "NONE": 2, "SNP": 3} -VARTYPE_CLASSES = ['DEL', 'INS', 'NONE', 'SNP'] \ No newline at end of file +VARTYPE_CLASSES = ['DEL', 'INS', 'NONE', 'SNP'] diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index f5b2d3b..71afffd 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -238,7 +238,7 @@ def extend_features(candidates_vcf, exclude_vars = [] with open(exclude_variants) as i_f: for line in skip_empty(i_f): - if exclude_variants.split(".")[-1]=="tsv" and line[0:5]=="CHROM": + if exclude_variants.split(".")[-1] == "tsv" and line[0:5] == "CHROM": continue x = line.strip().split("\t") chrom, pos, _, ref, alt = x[0:5] diff --git a/neusomatic/python/extract_postprocess_targets.py b/neusomatic/python/extract_postprocess_targets.py index aa96857..c3dee50 100755 --- a/neusomatic/python/extract_postprocess_targets.py +++ b/neusomatic/python/extract_postprocess_targets.py @@ -11,6 +11,7 @@ from utils import skip_empty from defaults import VCF_HEADER + def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): logger = logging.getLogger(extract_postprocess_targets.__name__) diff --git a/neusomatic/python/filter_candidates.py b/neusomatic/python/filter_candidates.py index 39a1fdf..58fc628 100755 --- a/neusomatic/python/filter_candidates.py +++ b/neusomatic/python/filter_candidates.py @@ -276,14 +276,15 @@ def filter_candidates(candidate_record): for record in final_records: if dbsnp: chrom, pos, ref, alt = record[0:4] - var_id = "-".join(map(str,[chrom, pos, ref, alt])) + var_id = "-".join(map(str, [chrom, pos, ref, alt])) region = "{}:{}-{}".format(chrom, pos, pos + 1) dbsnp_vars = [] for x in dbsnp_tb.fetch(region=region): chrom_, pos_, _, ref_, alts_ = x.strip().split("\t")[ 0:5] for alt_ in alts_.split(","): - dbsnp_var_id = "-".join(map(str,[chrom_, pos_, ref_, alt_])) + dbsnp_var_id = "-".join(map(str, + [chrom_, pos_, ref_, alt_])) dbsnp_vars.append(dbsnp_var_id) if var_id in dbsnp_vars: continue diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index fe4570d..5daca75 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -25,6 +25,7 @@ NUC_to_NUM_tabix = {"A": 1, "C": 2, "G": 3, "T": 4, "-": 0} + def get_type(ref, alt): logger = logging.getLogger(get_type.__name__) len_diff = len(ref) - len(alt.split(",")[0]) @@ -862,11 +863,12 @@ def find_records(input_record): concatenate_vcfs( [split_pred_vcf_file, split_missed_ensemble_bed_file], split_pred_with_missed_file) - tmp_=get_tmp_file() - with open(split_pred_with_missed_file) as i_f, open(tmp_,"w") as o_f: + tmp_ = get_tmp_file() + with open(split_pred_with_missed_file) 
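The reformatted filter_candidates hunk above shows how dbSNP membership is tested: fetch the tabix-indexed VCF around each candidate position and compare chrom-pos-ref-alt keys, one per comma-separated ALT. A hedged sketch of that lookup (assumes a bgzipped dbSNP VCF with a .tbi index):

    import pysam

    def in_dbsnp(dbsnp_tb, chrom, pos, ref, alt):
        # dbsnp_tb is a pysam.TabixFile over a bgzipped, tabix-indexed VCF.
        var_id = "-".join(map(str, [chrom, pos, ref, alt]))
        region = "{}:{}-{}".format(chrom, pos, pos + 1)
        for rec in dbsnp_tb.fetch(region=region):
            chrom_, pos_, _, ref_, alts_ = rec.strip().split("\t")[0:5]
            for alt_ in alts_.split(","):
                if var_id == "-".join(map(str, [chrom_, pos_, ref_, alt_])):
                    return True
        return False

    # dbsnp_tb = pysam.TabixFile("dbsnp.vcf.gz")
    # in_dbsnp(dbsnp_tb, "1", 10177, "A", "AC")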
as i_f, open(tmp_, "w") as o_f: for line in skip_empty(i_f): x = line.strip().split("\t") - o_f.write("\t".join(list(map(str,[x[0],x[1],".",x[3],x[4],".",".",".",".","."])))+"\n") + o_f.write("\t".join( + list(map(str, [x[0], x[1], ".", x[3], x[4], ".", ".", ".", ".", "."]))) + "\n") bedtools_sort(tmp_, output_fn=split_pred_with_missed_file, run_logger=thread_logger) not_in_ensemble_bed = bedtools_window( diff --git a/neusomatic/python/genomic_file_handlers.py b/neusomatic/python/genomic_file_handlers.py index cd19a26..6e45a3a 100644 --- a/neusomatic/python/genomic_file_handlers.py +++ b/neusomatic/python/genomic_file_handlers.py @@ -1,10 +1,16 @@ #!/usr/bin/env python3 from pysam import AlignmentFile -import sys, os, gzip, re, math +import sys +import os +import gzip +import re +import math -# The regular expression pattern for "chrXX 1234567" in both VarScan2 Output and VCF files: -pattern_major_chr_position = re.compile(r'^(?:chr)?(?:[1-9]|1[0-9]|2[0-2]|[XY]|MT?)\t[0-9]+\b') +# The regular expression pattern for "chrXX 1234567" in both VarScan2 +# Output and VCF files: +pattern_major_chr_position = re.compile( + r'^(?:chr)?(?:[1-9]|1[0-9]|2[0-2]|[XY]|MT?)\t[0-9]+\b') # More lenient pattern: pattern_chr_position = re.compile(r'[^\t]+\t[0-9]+\b') @@ -13,13 +19,15 @@ # Valid Phred+33 quality strings: valid_q = set() -[valid_q.add( chr(33+i) ) for i in range(42)]; +[valid_q.add(chr(33 + i)) for i in range(42)] nan = float('nan') inf = float('inf') -AA_3to1 = {"Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C", "Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I", "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P", "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V"} -AA_1to3 = {"A": "Ala", "R": "Arg", "N": "Asn", "D": "Asp", "C": "Cys", "E": "Glu", "Q": "Gln", "G": "Gly", "H": "His", "I": "Ile", "L": "Leu", "K": "Lys", "M": "Met", "F": "Phe", "P": "Pro", "S": "Ser", "T": "Thr", "W": "Trp", "Y": "Tyr", "V": "Val"} +AA_3to1 = {"Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C", "Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I", + "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P", "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V"} +AA_1to3 = {"A": "Ala", "R": "Arg", "N": "Asn", "D": "Asp", "C": "Cys", "E": "Glu", "Q": "Gln", "G": "Gly", "H": "His", "I": "Ile", + "L": "Leu", "K": "Lys", "M": "Met", "F": "Phe", "P": "Pro", "S": "Ser", "T": "Thr", "W": "Trp", "Y": "Tyr", "V": "Val"} ### ### ### ### ### MAJOR CLASSES ### ### ### ### ### @@ -27,12 +35,12 @@ class Vcf_line: '''Each instance of this object is a line from the vcf file (no header).''' def __init__(self, vcf_line): - '''Argument is a line in pileup file.''' self.vcf_line = vcf_line.rstrip('\n') try: - self.chromosome, self.position, self.identifier, self.refbase, self.altbase, self.qual, self.filters, self.info, *self.has_samples = vcf_line.rstrip('\n').split('\t') + self.chromosome, self.position, self.identifier, self.refbase, self.altbase, self.qual, self.filters, self.info, * \ + self.has_samples = vcf_line.rstrip('\n').split('\t') self.position = int(self.position) try: @@ -44,14 +52,13 @@ def __init__(self, vcf_line): self.chromosome = self.identifier = self.refbase = self.altbase = self.qual = self.filters = self.info = self.field = self.samples = '' self.position = None - def get_info_items(self): return self.info.split(';') - def get_info_value(self, variable): - key_item = re.search(r'\b{}=([^;\s]+)([;\W]|$)'.format(variable), self.vcf_line) + key_item = 
re.search( + r'\b{}=([^;\s]+)([;\W]|$)'.format(variable), self.vcf_line) # The key has a value attached to it, e.g., VAR=1,2,3 if key_item: @@ -62,23 +69,21 @@ def get_info_value(self, variable): key_item = self.info.split(';') return True if variable in key_item else False - def get_sample_variable(self): return self.field.split(':') - def get_sample_item(self, idx=0, out_type='d'): '''d to output a dictionary. l to output a tuple of lists''' if out_type.lower() == 'd': - return dict( zip(self.get_sample_variable(), self.samples[idx].split(':') ) ) + return dict(zip(self.get_sample_variable(), self.samples[idx].split(':'))) elif out_type.lower() == 'l': - return ( self.get_sample_variable(), self.samples[idx].split(':') ) - + return (self.get_sample_variable(), self.samples[idx].split(':')) def get_sample_value(self, variable, idx=0): - var2value = dict( zip( self.field.split(':'), self.samples[idx].split(':') )) + var2value = dict(zip(self.field.split( + ':'), self.samples[idx].split(':'))) try: return var2value[variable] @@ -86,8 +91,6 @@ def get_sample_value(self, variable, idx=0): return None - - class pysam_header: ''' Extract BAM header using pysam. @@ -99,14 +102,13 @@ def __init__(self, bam_file): bam = AlignmentFile(bam_file) self.bam_header = bam.header - def SM(self): '''Sample Name''' sample_name = set() for header_i in self.bam_header['RG']: - sample_name.add( header_i['SM'] ) + sample_name.add(header_i['SM']) sample_name = tuple(sample_name) return sample_name @@ -115,20 +117,14 @@ def SM(self): ### ### ### ### ### MAJOR CLASSES OVER ### ### ### ### ### - - - - - - ### ### ### ### ### FUNCTIONS OF CONVENIENCE ### ### ### ### ### def skip_vcf_header(opened_file): - + line_i = opened_file.readline().rstrip() while line_i.startswith('#'): line_i = opened_file.readline().rstrip() - + return line_i @@ -139,9 +135,9 @@ def faiordict2contigorder(file_name, file_format): contig_sequence = [] with open(file_name) as gfile: - + for line_i in gfile: - + if file_format == 'fai': contig_match = re.match(r'([^\t]+)\t', line_i) @@ -150,17 +146,17 @@ def faiordict2contigorder(file_name, file_format): contig_match = re.match(r'@SQ\tSN:([^\t]+)\tLN:', line_i) if contig_match: - contig_i = contig_match.groups()[0].split(' ')[0] # some .fai files have space after the contig for descriptions. - contig_sequence.append( contig_i ) + # some .fai files have space after the contig for descriptions. 
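Vcf_line, reformatted above, is a light one-record VCF parser: get_info_value extracts KEY=VALUE pairs (or bare flags) from INFO by regex, and get_sample_value zips the FORMAT field against a sample column. A usage sketch, assuming neusomatic/python is importable and using a made-up record:

    from genomic_file_handlers import Vcf_line

    record = "chr1\t100\t.\tA\tG\t30\tPASS\tDP=55;SOMATIC\tGT:DP\t0/1:42"
    vcf_i = Vcf_line(record)
    print(vcf_i.get_info_value('DP'))       # '55'  (regex-extracted value)
    print(vcf_i.get_info_value('SOMATIC'))  # True  (bare flag, no value)
    print(vcf_i.get_sample_value('DP', 0))  # '42'  (FORMAT zipped to sample 0)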
+ contig_i = contig_match.groups()[0].split(' ')[0] + contig_sequence.append(contig_i) chrom_seq = {} - for n,contig_i in enumerate(contig_sequence): + for n, contig_i in enumerate(contig_sequence): chrom_seq[contig_i] = n return chrom_seq - def open_textfile(file_name): # See if the input file is a .gz file: @@ -171,7 +167,6 @@ def open_textfile(file_name): return open(file_name) - def open_bam_file(file_name): try: @@ -180,16 +175,14 @@ def open_bam_file(file_name): return open(file_name) - - def ascii2phred33(x): '''Put in an ASCII string, return a Phred+33 score.''' - return ord(x)-33 + return ord(x) - 33 def phred33toascii(x): '''Put in a Phred33 score, return the character.''' - return chr(x+33) + return chr(x + 33) def p2phred(p, max_phred=inf): @@ -201,7 +194,7 @@ def p2phred(p, max_phred=inf): elif p == 1: Q = 0 - elif p<0 or p>1: + elif p < 0 or p > 1: Q = nan elif p > 0: @@ -215,26 +208,25 @@ def p2phred(p, max_phred=inf): return Q - def phred2p(phred): '''Convert Phred-scale quality score to p-value.''' - return 10**(-phred/10) + return 10**(-phred / 10) def findall_index(mylist, tolookfor): '''Find all instances in a list that matches exactly thestring.''' - all_indices = [i for i,item in enumerate(mylist) if item == tolookfor] + all_indices = [i for i, item in enumerate(mylist) if item == tolookfor] return all_indices def findall_index_regex(mylist, pattern): '''Find all instances in a list that matches a regex pattern.''' - all_indices = [i for i,item in enumerate(mylist) if re.search(pattern, item)] + all_indices = [i for i, item in enumerate( + mylist) if re.search(pattern, item)] return all_indices def count_repeating_bases(sequence): - '''For a string, count the number of characters that appears in a row. E.g., for string "ABBCCCDDDDAAAAAAA", the function returns 1, 2, 3, 4, 7, because there is 1 A, 2 B's, 3 C's, 4 D's, and then 7 A's. ''' @@ -255,7 +247,6 @@ def count_repeating_bases(sequence): return counters - def numeric_id(chr_i, pos_i, contig_seq): chr_i = contig_seq[chr_i] @@ -267,19 +258,17 @@ def numeric_id(chr_i, pos_i, contig_seq): return numeric_i - - - # Define which chromosome coordinate is ahead for the following function: -chrom_sequence = [str(i) for i in range(1,23)] +chrom_sequence = [str(i) for i in range(1, 23)] chrom_sequence.append('X') chrom_sequence.append('Y') chrom_sequence.append('M') chrom_seq = {} -for n,contig_i in enumerate(chrom_sequence): +for n, contig_i in enumerate(chrom_sequence): chrom_seq[contig_i] = n + def whoisbehind(coord_0, coord_1, chrom_sequence): ''' coord_0 and coord_1 are two strings or two lists, specifying the chromosome, a (typically) tab, and then the location. @@ -288,10 +277,10 @@ def whoisbehind(coord_0, coord_1, chrom_sequence): end_of_0 = end_of_1 = False - if coord_0 == '' or coord_0==['',''] or coord_0==('','') or not coord_0: + if coord_0 == '' or coord_0 == ['', ''] or coord_0 == ('', '') or not coord_0: end_of_0 = True - if coord_1 == '' or coord_1==['',''] or coord_1==('','') or not coord_1: + if coord_1 == '' or coord_1 == ['', ''] or coord_1 == ('', '') or not coord_1: end_of_1 = True if end_of_0 and end_of_1: @@ -345,10 +334,7 @@ def whoisbehind(coord_0, coord_1, chrom_sequence): return 10 - - def vcf_header_modifier(infile_handle, addons=[], getlost=' '): - '''addons = A list of INFO, FORMAT, ID, or Filter lines you want to add. 
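p2phred and phred2p, reformatted below, implement the standard Phred relation Q = -10·log10(p) and p = 10^(-Q/10), with guards for p = 0, p = 1, out-of-range input, and a max_phred cap. The core round trip, for reference:

    import math

    def p2phred(p):
        return -10 * math.log10(p)  # the module adds the edge-case guards

    def phred2p(q):
        return 10 ** (-q / 10)

    print(p2phred(0.001))  # 30.0
    print(phred2p(30))     # 0.001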
getlost = a regex expression for the ID of INFO/FORMAT/FILTER that you want to get rid of.''' @@ -369,15 +355,14 @@ def vcf_header_modifier(infile_handle, addons=[], getlost=' '): elif re.match(r'##(INFO|FORMAT|FILTER)', line_i): if not re.match(r'##(INFO|FORMAT|FILTER)= 0, current_ref))) + x], filter(lambda x: x > 0, current_ref))) aa = "".join(map(lambda x: NUM_to_NUC[ - x], filter(lambda x: x > 0, current_alt))) - variants.append([current_bias, rr, aa, np.array(current_af)]) - done=False + x], filter(lambda x: x > 0, current_alt))) + variants.append( + [current_bias, rr, aa, np.array(current_af)]) + done = False current_ref = [] current_alt = [] current_af = [] @@ -958,13 +959,14 @@ def find_var(out_fasta_file, snp_min_af, del_min_af, ins_min_af, scale_maf, simp current_ref.append(r) current_alt.append(a) current_af.append(af) - is_ins = r==0 and a!=0 - is_del = r!=0 and a==0 + is_ins = r == 0 and a != 0 + is_del = r != 0 and a == 0 if r != 0: bias += 1 return variants + def TrimREFALT(ref, alt, pos): logger = logging.getLogger(TrimREFALT.__name__) alte = len(alt) @@ -1061,12 +1063,14 @@ def run_realignment(input_record): for var in vars_: pos_, ref_seq, alt_seq, afs = var if ref_seq != alt_seq: - ref, alt, pos = ref_seq, alt_seq, int(region.start) + 1 + pos_ + ref, alt, pos = ref_seq, alt_seq, int( + region.start) + 1 + pos_ if pos > 1: - num_add_before = min(40, pos-1) - before = ref_fasta.fetch(region.chrom, pos - num_add_before, pos-1).upper() + num_add_before = min(40, pos - 1) + before = ref_fasta.fetch( + region.chrom, pos - num_add_before, pos - 1).upper() print(before) - pos -= num_add_before-1 + pos -= num_add_before - 1 ref = before + ref alt = before + alt ref, alt, pos = TrimREFALT( diff --git a/neusomatic/python/merge_post_vcfs.py b/neusomatic/python/merge_post_vcfs.py index 014e56f..111df49 100755 --- a/neusomatic/python/merge_post_vcfs.py +++ b/neusomatic/python/merge_post_vcfs.py @@ -13,6 +13,7 @@ from utils import get_chromosomes_order, skip_empty from defaults import VCF_HEADER + def merge_post_vcfs(ref, resolved_vcf, no_resolve_vcf, out_vcf, pass_threshold, lowqual_threshold): diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index 6df0a17..1c456c8 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -215,8 +215,9 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense os.mkdir(work_lr_indel_realign) ra_resolved_vcf = os.path.join( work, "candidates_preds.ra_resolved.vcf") - not_resolved_bed = os.path.join(work, "candidates_preds.not_ra_resolved.bed") - long_read_indelrealign(work_lr_indel_realign, tumor_bam, None, ra_resolved_vcf, + not_resolved_bed = os.path.join( + work, "candidates_preds.not_ra_resolved.bed") + long_read_indelrealign(work_lr_indel_realign, tumor_bam, None, ra_resolved_vcf, not_resolved_bed, target_bed, reference, num_threads, lr_pad, lr_chunk_size, lr_chunk_scale, lr_snp_min_af, @@ -227,12 +228,12 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense msa_binary) resolve_scores(tumor_bam, ra_resolved_vcf, target_vcf, resolved_vcf) - not_resolved_vcf = os.path.join(work, "candidates_preds.not_ra_resolved.vcf") + not_resolved_vcf = os.path.join( + work, "candidates_preds.not_ra_resolved.vcf") cmd = "bedtools intersect -a {} -b {} -u".format( target_vcf, not_resolved_bed) run_bedtools_cmd(cmd, output_fn=not_resolved_vcf, run_logger=logger) - all_no_resolve = concatenate_files( [no_resolve, ensembled_preds, not_resolved_vcf], 
os.path.join(work, "no_resolve.vcf")) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index fb4a6e2..8cd40bb 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -152,7 +152,8 @@ def extract_candidate_split_regions( logger.info([filtered_vcf, is_empty]) if not is_empty: candidates_bed = get_tmp_file() - vcf_2_bed(filtered_vcf,candidates_bed, len_ref=True, keep_ref_alt=False) + vcf_2_bed(filtered_vcf, candidates_bed, + len_ref=True, keep_ref_alt=False) candidates_bed = bedtools_sort(candidates_bed, run_logger=logger) candidates_bed = bedtools_slop( diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index b5bf75d..e4b4ead 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -2,22 +2,24 @@ import re -cigar_aln_match = 0 -cigar_insertion = 1 -cigar_deletion = 2 -cigar_skip = 3 -cigar_soft_clip = 4 -cigar_hard_clip = 5 -cigar_padding = 6 -cigar_seq_match = 7 +cigar_aln_match = 0 +cigar_insertion = 1 +cigar_deletion = 2 +cigar_skip = 3 +cigar_soft_clip = 4 +cigar_hard_clip = 5 +cigar_padding = 6 +cigar_seq_match = 7 cigar_seq_mismatch = 8 nan = float('nan') inf = float('inf') -## Define functions: +# Define functions: ### PYSAM ### + + def position_of_aligned_read(read_i, target_position): ''' Return the base call of the target position, or if it's a start of insertion/deletion. @@ -42,7 +44,6 @@ def position_of_aligned_read(read_i, target_position): seq_i = align_i[0] break - # If the target position is aligned: try: if seq_i is not None: @@ -54,29 +55,34 @@ def position_of_aligned_read(read_i, target_position): if i != len(read_i.get_aligned_pairs()) - 1: indel_length = 0 - # If the next alignment is the next sequenced base, then the target is either a reference read of a SNP/SNV: - if read_i.get_aligned_pairs()[i+1][0] == seq_i+1 and read_i.get_aligned_pairs()[i+1][1] == target_position + 1: + # If the next alignment is the next sequenced base, then the + # target is either a reference read of a SNP/SNV: + if read_i.get_aligned_pairs()[i + 1][0] == seq_i + 1 and read_i.get_aligned_pairs()[i + 1][1] == target_position + 1: - code = 1 # Reference read for mismatch + code = 1 # Reference read for mismatch - # If the next reference position has no read position to it, it is DELETED in this read: - elif read_i.get_aligned_pairs()[i+1][0] == None and read_i.get_aligned_pairs()[i+1][1] == target_position + 1: + # If the next reference position has no read position to it, it + # is DELETED in this read: + elif read_i.get_aligned_pairs()[i + 1][0] == None and read_i.get_aligned_pairs()[i + 1][1] == target_position + 1: - code = 2 # Deletion + code = 2 # Deletion - for align_j in read_i.get_aligned_pairs()[ i+1:: ]: + for align_j in read_i.get_aligned_pairs()[i + 1::]: if align_j[0] == None: indel_length -= 1 else: break # Opposite of deletion, if the read position cannot be aligned to the reference, it can be an INSERTION. - # Insertions sometimes show up wit soft-clipping at the end, if the inserted sequence is "too long" to align on a single read. In this case, the inserted length derived here is but a lower limit of the real inserted length. - elif read_i.get_aligned_pairs()[i+1][0] == seq_i+1 and read_i.get_aligned_pairs()[i+1][1] == None: + # Insertions sometimes show up wit soft-clipping at the end, if + # the inserted sequence is "too long" to align on a single + # read. 
In this case, the inserted length derived here is but a + # lower limit of the real inserted length. + elif read_i.get_aligned_pairs()[i + 1][0] == seq_i + 1 and read_i.get_aligned_pairs()[i + 1][1] == None: - code = 3 # Insertion or soft-clipping + code = 3 # Insertion or soft-clipping - for align_j in read_i.get_aligned_pairs()[ i+1:: ]: + for align_j in read_i.get_aligned_pairs()[i + 1::]: if align_j[1] == None: indel_length += 1 else: @@ -85,13 +91,13 @@ def position_of_aligned_read(read_i, target_position): # If "i" is the final alignment, cannt exam for indel: else: code = 1 # Assuming no indel - indel_length = nan # Would be zero if certain no indel, but uncertain here + indel_length = nan # Would be zero if certain no indel, but uncertain here - # If the target position is deleted from the sequencing read (i.e., the deletion in this read occurs before the target position): + # If the target position is deleted from the sequencing read (i.e., the + # deletion in this read occurs before the target position): else: code = 0 base_at_target, indel_length, flanking_indel = None, None, None - # See if there is insertion/deletion within 5 bp of "i": if isinstance(indel_length, int): @@ -99,7 +105,7 @@ def position_of_aligned_read(read_i, target_position): left_side_start = seq_i right_side_start = seq_i + abs(indel_length) + 1 switch = 1 - for j in (3,2,1): + for j in (3, 2, 1): for indel_seeker_i in left_side_start, right_side_start: switch = switch * -1 @@ -109,8 +115,9 @@ def position_of_aligned_read(read_i, target_position): if 0 <= seq_j < len(read_i.get_aligned_pairs()): # If the reference position has no base aligned to it, it's a deletion. - # On the other hand, if the base has no reference base aligned to it, it's an insertion. - if read_i.get_aligned_pairs()[ seq_j ][1] == None or read_i.get_aligned_pairs()[ seq_j ][0] == None: + # On the other hand, if the base has no reference base + # aligned to it, it's an insertion. + if read_i.get_aligned_pairs()[seq_j][1] == None or read_i.get_aligned_pairs()[seq_j][0] == None: flanking_indel = j break else: @@ -123,8 +130,7 @@ def position_of_aligned_read(read_i, target_position): return None, None, None, None, None - -## Dedup test for BAM file +# Dedup test for BAM file def dedup_test(read_i, remove_dup_or_not=True): ''' Return False (i.e., remove the read) if the read is a duplicate and if the user specify that duplicates should be removed. 
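position_of_aligned_read classifies the target site by looking one step ahead in pysam's get_aligned_pairs() output, a list of (query_index, reference_position) tuples where None marks a gap: both indices advancing means a match or SNV (code 1), a missing query index means a deletion starts here (code 2), a missing reference position means an insertion or soft clip (code 3). That lookahead, reduced to a self-contained function over a hand-written pair list:

    def classify_site(aligned_pairs, target_ref_pos):
        # aligned_pairs mimics pysam's read.get_aligned_pairs() output.
        for i, (q, r) in enumerate(aligned_pairs):
            if r == target_ref_pos and q is not None:
                if i == len(aligned_pairs) - 1:
                    return 1                      # last pair: assume no indel
                next_q, next_r = aligned_pairs[i + 1]
                if next_q == q + 1 and next_r == target_ref_pos + 1:
                    return 1                      # reference call or SNV
                if next_q is None:
                    return 2                      # deletion starts here
                if next_r is None:
                    return 3                      # insertion / soft clip
        return 0                                  # target deleted in this read

    # A read with a 2-bp deletion right after reference position 101:
    pairs = [(0, 100), (1, 101), (None, 102), (None, 103), (2, 104)]
    print(classify_site(pairs, 101))  # 2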
@@ -136,35 +142,31 @@ def dedup_test(read_i, remove_dup_or_not=True): return True - ### END OF PYSAM ### # Useful to make BED region into an iterator of coordinates def genomic_coordinates(contig_i, start, end): - for pos_i in range(start, end+1): + for pos_i in range(start, end + 1): yield contig_i, pos_i +def mean(stuff): + return sum(stuff) / len(stuff) if stuff else nan -def mean(stuff): - return sum(stuff)/len(stuff) if stuff else nan - - - -##### Extract Indel DP4 info from pileup files: +# Extract Indel DP4 info from pileup files: def pileup_indel_DP4(pileup_object, indel_pattern): if pileup_object.reads: ref_for = pileup_object.reads.count('.') ref_rev = pileup_object.reads.count(',') - alt_for = pileup_object.reads.count( indel_pattern.upper() ) - alt_rev = pileup_object.reads.count( indel_pattern.lower() ) + alt_for = pileup_object.reads.count(indel_pattern.upper()) + alt_rev = pileup_object.reads.count(indel_pattern.lower()) - dp4 = ref_for, ref_rev, alt_for, alt_rev + dp4 = ref_for, ref_rev, alt_for, alt_rev else: - dp4 = nan,nan,nan,nan + dp4 = nan, nan, nan, nan return dp4 @@ -178,21 +180,24 @@ def pileup_DP4(pileup_object, ref_base, variant_call): # SNV if len(variant_call) == len(ref_base): - ref_for,ref_rev,alt_for,alt_rev = base_calls[0], base_calls[1], base_calls[2].count(variant_call.upper()), base_calls[3].count(variant_call.lower()) + ref_for, ref_rev, alt_for, alt_rev = base_calls[0], base_calls[1], base_calls[ + 2].count(variant_call.upper()), base_calls[3].count(variant_call.lower()) # Insertion: elif len(variant_call) > len(ref_base): - inserted_sequence = variant_call[ len(ref_base):: ] + inserted_sequence = variant_call[len(ref_base)::] - ref_for,ref_rev,alt_for,alt_rev = base_calls[0], base_calls[1], base_calls[6].count(inserted_sequence.upper()), base_calls[7].count(inserted_sequence.lower()) + ref_for, ref_rev, alt_for, alt_rev = base_calls[0], base_calls[1], base_calls[ + 6].count(inserted_sequence.upper()), base_calls[7].count(inserted_sequence.lower()) # Deletion: elif len(variant_call) < len(ref_base): - deleted_sequence = ref_base[ len(variant_call):: ] + deleted_sequence = ref_base[len(variant_call)::] - ref_for,ref_rev,alt_for,alt_rev = base_calls[0], base_calls[1], base_calls[4].count(deleted_sequence.upper()), base_calls[5].count(deleted_sequence.lower()) + ref_for, ref_rev, alt_for, alt_rev = base_calls[0], base_calls[1], base_calls[ + 4].count(deleted_sequence.upper()), base_calls[5].count(deleted_sequence.lower()) else: ref_for = ref_rev = alt_for = alt_rev = 0 @@ -200,20 +205,17 @@ def pileup_DP4(pileup_object, ref_base, variant_call): return ref_for, ref_rev, alt_for, alt_rev - - def rescale(x, original='fraction', rescale_to=None, max_phred=1001): - - if ( rescale_to == None ) or ( original.lower() == rescale_to.lower() ): + + if (rescale_to == None) or (original.lower() == rescale_to.lower()): y = x if isinstance(x, int) else '%.2f' % x - + elif original.lower() == 'fraction' and rescale_to == 'phred': y = genome.p2phred(x, max_phred=max_phred) y = '%.2f' % y - + elif original.lower() == 'phred' and rescale_to == 'fraction': y = genome.phred2p(x) y = '%.2f' % y - - return y + return y diff --git a/neusomatic/python/resolve_scores.py b/neusomatic/python/resolve_scores.py index 65ff505..f54c57c 100755 --- a/neusomatic/python/resolve_scores.py +++ b/neusomatic/python/resolve_scores.py @@ -24,7 +24,7 @@ def resolve_scores(input_bam, ra_vcf, target_vcf, output_vcf): final_intervals = read_tsv_file(tmp_) for x in final_intervals: - x[5] = 
str(np.round(-10*np.log10(0.25),4)) + x[5] = str(np.round(-10 * np.log10(0.25), 4)) tmp_ = bedtools_window( ra_vcf, target_vcf, args=" -w 5", run_logger=logger) diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index fe7e343..37b47f2 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -22,6 +22,7 @@ from utils import concatenate_files, run_shell_command, bedtools_sort, bedtools_merge, get_tmp_file, skip_empty from split_bed import split_region + def run_scan_alignments(record): work, reference, scan_alignments_binary, split_region_file, \ input_bam, window_size, maf, min_mapq, max_dp, filter_duplicate, calc_qual = record diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 1135e1e..3d7644a 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -1,57 +1,60 @@ #!/usr/bin/env python3 -import sys, os, re, pysam +import sys +import os +import re +import pysam import scipy.stats as stats import genomic_file_handlers as genome -from read_info_extractor import * +from read_info_extractor import * nan = float('nan') def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): - ''' bam is the opened file handle of bam file my_coordiate is a list or tuple of 0-based (contig, position) ''' - + indel_length = len(first_alt) - len(ref_base) - reads = bam.fetch( my_coordinate[0], my_coordinate[1]-1, my_coordinate[1] ) - + reads = bam.fetch(my_coordinate[0], my_coordinate[1] - 1, my_coordinate[1]) + ref_read_mq = [] alt_read_mq = [] ref_read_bq = [] alt_read_bq = [] ref_edit_distance = [] alt_edit_distance = [] - + ref_concordant_reads = alt_concordant_reads = ref_discordant_reads = alt_discordant_reads = 0 ref_for = ref_rev = alt_for = alt_rev = dp = 0 ref_SC_reads = alt_SC_reads = ref_notSC_reads = alt_notSC_reads = 0 MQ0 = 0 - + ref_pos_from_end = [] alt_pos_from_end = [] ref_flanking_indel = [] alt_flanking_indel = [] - - noise_read_count = poor_read_count = 0 - + + noise_read_count = poor_read_count = 0 + qname_collector = {} - + for read_i in reads: if not read_i.is_unmapped and dedup_test(read_i): - + dp += 1 - - code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read(read_i, my_coordinate[1]-1 ) - + + code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( + read_i, my_coordinate[1] - 1) + if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq: poor_read_count += 1 - + if read_i.mapping_quality == 0: MQ0 += 1 - + # Reference calls: if code_i == 1 and base_call_i == ref_base[0]: @@ -59,27 +62,27 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): qname_collector[read_i.qname].append(0) except KeyError: qname_collector[read_i.qname] = [0] - - ref_read_mq.append( read_i.mapping_quality ) - ref_read_bq.append( read_i.query_qualities[ith_base] ) - + + ref_read_mq.append(read_i.mapping_quality) + ref_read_bq.append(read_i.query_qualities[ith_base]) + try: - ref_edit_distance.append( read_i.get_tag('NM') ) + ref_edit_distance.append(read_i.get_tag('NM')) except KeyError: pass - + # Concordance - if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: ref_concordant_reads += 1 elif (not read_i.is_proper_pair) and 
read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: ref_discordant_reads += 1 - + # Orientation if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: ref_for += 1 - elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: ref_rev += 1 - + # Soft-clipped reads? if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: ref_SC_reads += 1 @@ -88,43 +91,44 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): # Distance from the end of the read: if ith_base != None: - ref_pos_from_end.append( min(ith_base, read_i.query_length-ith_base) ) - + ref_pos_from_end.append( + min(ith_base, read_i.query_length - ith_base)) + # Flanking indels: - ref_flanking_indel.append( flanking_indel_i ) + ref_flanking_indel.append(flanking_indel_i) - # Alternate calls: - # SNV, or Deletion, or Insertion where I do not check for matching indel length + # SNV, or Deletion, or Insertion where I do not check for matching + # indel length elif (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or \ (indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or \ - (indel_length > 0 and code_i == 3): + (indel_length > 0 and code_i == 3): try: qname_collector[read_i.qname].append(1) except KeyError: qname_collector[read_i.qname] = [1] - alt_read_mq.append( read_i.mapping_quality ) - alt_read_bq.append( read_i.query_qualities[ith_base] ) - + alt_read_mq.append(read_i.mapping_quality) + alt_read_bq.append(read_i.query_qualities[ith_base]) + try: - alt_edit_distance.append( read_i.get_tag('NM') ) + alt_edit_distance.append(read_i.get_tag('NM')) except KeyError: pass - + # Concordance - if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: alt_concordant_reads += 1 elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: alt_discordant_reads += 1 - + # Orientation if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: alt_for += 1 - elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: alt_rev += 1 - + # Soft-clipped reads? if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: alt_SC_reads += 1 @@ -133,56 +137,59 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): # Distance from the end of the read: if ith_base != None: - alt_pos_from_end.append( min(ith_base, read_i.query_length-ith_base) ) - + alt_pos_from_end.append( + min(ith_base, read_i.query_length - ith_base)) + # Flanking indels: - alt_flanking_indel.append( flanking_indel_i ) - - + alt_flanking_indel.append(flanking_indel_i) + # Inconsistent read or 2nd alternate calls: else: - + try: qname_collector[read_i.qname].append(2) except KeyError: qname_collector[read_i.qname] = [2] - + noise_read_count += 1 - + # Done extracting info from tumor BAM. 
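# Taken together, the branches above bucket each read at the candidate locus
# into ref / alt / noise. Hedged restatement as a standalone helper (codes per
# read_info_extractor: 1 = aligned base, 2 = deletion, 3 = insertion):
def classify_read(code_i, base_call_i, indel_length_i, ref_base, first_alt):
    indel_length = len(first_alt) - len(ref_base)
    if code_i == 1 and base_call_i == ref_base[0]:
        return "ref"
    if (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or \
       (indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or \
       (indel_length > 0 and code_i == 3):
        return "alt"
    return "noise"   # inconsistent read or a second alternate allele

assert classify_read(1, "A", 0, "A", "T") == "ref"
assert classify_read(3, "A", 2, "A", "AGG") == "alt"   # insertion length unchecked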
Now tally them: - ref_mq = mean(ref_read_mq) - alt_mq = mean(alt_read_mq) + ref_mq = mean(ref_read_mq) + alt_mq = mean(alt_read_mq) z_ranksums_mq = stats.ranksums(alt_read_mq, ref_read_mq)[0] - - ref_bq = mean(ref_read_bq) - alt_bq = mean(alt_read_bq) + + ref_bq = mean(ref_read_bq) + alt_bq = mean(alt_read_bq) z_ranksums_bq = stats.ranksums(alt_read_bq, ref_read_bq)[0] - - ref_NM = mean(ref_edit_distance) - alt_NM = mean(alt_edit_distance) + + ref_NM = mean(ref_edit_distance) + alt_NM = mean(alt_edit_distance) z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0] - NM_Diff = alt_NM - ref_NM - abs(indel_length) - - concordance_fet = stats.fisher_exact(( (ref_concordant_reads, alt_concordant_reads), (ref_discordant_reads, alt_discordant_reads) ))[1] - strandbias_fet = stats.fisher_exact(( (ref_for, alt_for), (ref_rev, alt_rev) ))[1] - clipping_fet = stats.fisher_exact(( (ref_notSC_reads, alt_notSC_reads), (ref_SC_reads, alt_SC_reads) ))[1] - + NM_Diff = alt_NM - ref_NM - abs(indel_length) + + concordance_fet = stats.fisher_exact( + ((ref_concordant_reads, alt_concordant_reads), (ref_discordant_reads, alt_discordant_reads)))[1] + strandbias_fet = stats.fisher_exact( + ((ref_for, alt_for), (ref_rev, alt_rev)))[1] + clipping_fet = stats.fisher_exact( + ((ref_notSC_reads, alt_notSC_reads), (ref_SC_reads, alt_SC_reads)))[1] + z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] - + ref_indel_1bp = ref_flanking_indel.count(1) ref_indel_2bp = ref_flanking_indel.count(2) + ref_indel_1bp ref_indel_3bp = ref_flanking_indel.count(3) + ref_indel_2bp + ref_indel_1bp alt_indel_1bp = alt_flanking_indel.count(1) alt_indel_2bp = alt_flanking_indel.count(2) + alt_indel_1bp alt_indel_3bp = alt_flanking_indel.count(3) + alt_indel_2bp + alt_indel_1bp - + consistent_mates = inconsistent_mates = 0 for pairs_i in qname_collector: - + # Both are alternative calls: - if qname_collector[pairs_i] == [1,1]: + if qname_collector[pairs_i] == [1, 1]: consistent_mates += 1 - + # One is alternate call but the other one is not: elif len(qname_collector[pairs_i]) == 2 and 1 in qname_collector[pairs_i]: inconsistent_mates += 1 @@ -190,33 +197,33 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): return vars() - - - def from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt): - ''' ref_fa is the opened reference fasta file handle my_coordiate is a list or tuple of 0-based (contig, position) ''' # Homopolymer eval (Make sure to modify for INDEL): - # The min and max is to prevent the +/- 20 bases from exceeding the ends of the reference sequence - lseq = ref_fa.fetch(my_coordinate[0], max(0, my_coordinate[1]-20), my_coordinate[1]) - rseq = ref_fa.fetch(my_coordinate[0], my_coordinate[1]+1, min(ref_fa.get_reference_length(my_coordinate[0])+1, my_coordinate[1]+21) ) - - # This is to get around buy in old version of pysam that reads the reference sequence in bytes instead of strings + # The min and max is to prevent the +/- 20 bases from exceeding the ends + # of the reference sequence + lseq = ref_fa.fetch(my_coordinate[0], max( + 0, my_coordinate[1] - 20), my_coordinate[1]) + rseq = ref_fa.fetch(my_coordinate[0], my_coordinate[ + 1] + 1, min(ref_fa.get_reference_length(my_coordinate[0]) + 1, my_coordinate[1] + 21)) + + # This is to get around buy in old version of pysam that reads the + # reference sequence in bytes instead of strings lseq = lseq.decode() if isinstance(lseq, bytes) else lseq rseq = rseq.decode() if isinstance(rseq, bytes) else rseq - - 
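# Toy numbers through the from_bam tallies above: alt support hugging read ends
# yields a negative rank-sum z, and a skewed DP4 strand table a small Fisher p:
import scipy.stats as stats

ref_pos_from_end = [45, 50, 38, 61, 47]
alt_pos_from_end = [3, 5, 2, 4]                      # alt calls near read ends
assert stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] < 0

ref_for, ref_rev, alt_for, alt_rev = 30, 28, 12, 1   # alt skewed to one strand
assert stats.fisher_exact(((ref_for, alt_for), (ref_rev, alt_rev)))[1] < 0.05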
seq41_ref = lseq + ref_base + rseq + + seq41_ref = lseq + ref_base + rseq seq41_alt = lseq + first_alt + rseq - + ref_counts = genome.count_repeating_bases(seq41_ref) alt_counts = genome.count_repeating_bases(seq41_alt) - - homopolymer_length = max( max(ref_counts), max(alt_counts) ) - + + homopolymer_length = max(max(ref_counts), max(alt_counts)) + # Homopolymer spanning the variant site: ref_c = 0 alt_c = 0 @@ -225,37 +232,34 @@ def from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt): ref_c += 1 else: break - + for i in lseq[::-1]: if i == ref_base: ref_c += 1 else: break - + for i in rseq: if i == first_alt: alt_c += 1 else: break - + for i in lseq[::-1]: if i == first_alt: alt_c += 1 else: break - site_homopolymer_length = max( alt_c+1, ref_c+1 ) + site_homopolymer_length = max(alt_c + 1, ref_c + 1) return homopolymer_length, site_homopolymer_length - - - def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100): # Odds Ratio just like VarDict's output - sor_numerator = n_alt * t_ref + sor_numerator = n_alt * t_ref sor_denominator = n_ref * t_alt if sor_numerator == 0 and sor_denominator == 0: sor = nan diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index acc44c1..ddd31b5 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -89,7 +89,7 @@ def test(net, epoch, validation_loader, use_cuda): (matrices, labels, _, var_len_s, _), (paths) = data paths_ = copy.deepcopy(paths) - del paths + del paths paths = paths_ matrices = Variable(matrices) @@ -220,12 +220,12 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo ensemble = False with open(candidates_tsv[0]) as i_f: - x=i_f.readline().strip().split() - if len(x) == NUM_ENS_FEATURES+4: + x = i_f.readline().strip().split() + if len(x) == NUM_ENS_FEATURES + 4: ensemble = True num_channels = NUM_ENS_FEATURES + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES - + logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: diff --git a/neusomatic/python/utils.py b/neusomatic/python/utils.py index d4e9d45..67ba79c 100755 --- a/neusomatic/python/utils.py +++ b/neusomatic/python/utils.py @@ -125,6 +125,7 @@ def write_tsv_file(tsv_file, records, sep='\t', add_fields=[]): for x in records: f_o.write(sep.join(map(str, x + add_fields)) + "\n") + def skip_empty(fh, skip_header=True): for line in fh: if skip_header and line.startswith("#"): @@ -133,6 +134,7 @@ def skip_empty(fh, skip_header=True): continue yield line + def read_tsv_file(tsv_file, sep='\t', fields=None): records = [] with open(tsv_file) as i_f: @@ -143,11 +145,12 @@ def read_tsv_file(tsv_file, sep='\t', fields=None): records.append(x) return records + def vcf_2_bed(vcf_file, bed_file, add_fields=[], len_ref=False, keep_ref_alt=True): with open(bed_file, "w") as f_o, open(vcf_file, "r") as f_i: for line in skip_empty(f_i): x = line.strip().split("\t") - len_=1 if not len_ref else len(x[3]) + len_ = 1 if not len_ref else len(x[3]) if keep_ref_alt: f_o.write( "\t".join(map(str, [x[0], int(x[1]), int(x[1]) + len_, x[3], x[4]] + add_fields)) + "\n") @@ -156,7 +159,6 @@ def vcf_2_bed(vcf_file, bed_file, add_fields=[], len_ref=False, keep_ref_alt=Tru "\t".join(map(str, [x[0], int(x[1]), int(x[1]) + len_] + add_fields)) + "\n") - def bedtools_sort(bed_file, args="", output_fn=None, run_logger=None): cmd = "bedtools sort -i {} {}".format(bed_file, args) if output_fn is None: From 5581ba46dd77a60835ed92b59bd923287f245144 Mon Sep 17 00:00:00 2001 From: Sahraeian 
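# Worked numbers through somaticOddRatio as defined in this module (VarDict-style
# SOR): numerator n_alt*t_ref, denominator n_ref*t_alt, capped at max_value.
def somatic_odd_ratio(n_ref, n_alt, t_ref, t_alt, max_value=100):
    num, den = n_alt * t_ref, n_ref * t_alt
    if num == 0 and den == 0:
        return float("nan")
    return max_value if den == 0 else min(num / den, max_value)

assert somatic_odd_ratio(40, 0, 25, 15) == 0.0   # alt only in tumor: somatic-like
assert somatic_odd_ratio(40, 5, 25, 0) == 100    # alt only in normal: capped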
Date: Tue, 28 Apr 2020 23:47:45 -0700 Subject: [PATCH 11/89] small fix --- neusomatic/python/postprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index 1c456c8..d0d934e 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -21,7 +21,7 @@ from extract_postprocess_targets import extract_postprocess_targets from merge_post_vcfs import merge_post_vcfs from resolve_variants import resolve_variants -from utils import concatenate_files, get_chromosomes_order, bedtools_window, skip_empty +from utils import concatenate_files, get_chromosomes_order, bedtools_window, run_bedtools_cmd, skip_empty from long_read_indelrealign import long_read_indelrealign from resolve_scores import resolve_scores from _version import __version__ From 115d81401a2dca202e475090b6b8afa4462a6cde Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 28 Apr 2020 23:52:10 -0700 Subject: [PATCH 12/89] fix for training loss --- neusomatic/python/train.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index ddd31b5..7b27fc9 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -451,9 +451,11 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo loss.backward() optimizer.step() - loss_s.append(loss.data) + loss_data = copy.deepcopy(loss.cpu().data) + del loss + loss_s.append(loss_data) - running_loss += loss.data + running_loss += loss_data if i_ % print_freq == print_freq - 1: logger.info('epoch: {}, iter: {:>7}, lr: {}, loss: {:.5f}'.format( n_epoch + prev_epochs, len(loss_s), From a3c4f3ea6e15fd60b67fb4fb0db4c08add714970 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 29 Apr 2020 02:07:43 -0700 Subject: [PATCH 13/89] fix for backward compatibility --- neusomatic/python/train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 7b27fc9..1a0e125 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -518,6 +518,9 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo help=' validation candidate tsv files', default=[]) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) + parser.add_argument('--ensemble', + help='Enable training for ensemble mode', + action="store_true") parser.add_argument('--batch_size', type=int, help='batch size', default=1000) parser.add_argument('--max_epochs', type=int, From f9ee72564658414b703d06a7505afdea8c64897b Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 30 Apr 2020 08:47:17 -0700 Subject: [PATCH 14/89] fix train loss --- neusomatic/python/train.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 1a0e125..a841a26 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -451,11 +451,9 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo loss.backward() optimizer.step() - loss_data = copy.deepcopy(loss.cpu().data) - del loss - loss_s.append(loss_data) + loss_s.append(loss.data) - running_loss += loss_data + running_loss += loss.data if i_ % print_freq == print_freq - 1: logger.info('epoch: {}, iter: {:>7}, lr: {}, loss: {:.5f}'.format( n_epoch + prev_epochs, len(loss_s), From 26c4ca4156e48e7caec45f71fa51a41c12e5208f Mon Sep 17 00:00:00 2001 From: Sahraeian Date: 
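# Patches 12 and 14 above go back and forth on how to bank the per-iteration
# loss without pinning GPU memory or the autograd graph. A common alternative
# (a sketch, not what the patch does) is .item(), which detaches and copies
# the scalar off-device:
import torch

w = torch.randn(4, requires_grad=True)
loss_s, running_loss = [], 0.0
for _ in range(3):
    loss = (w * w).sum()
    loss.backward()
    v = loss.item()          # plain python float: no graph, no device tensor
    loss_s.append(v)
    running_loss += v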
Sat, 2 May 2020 22:53:53 -0700 Subject: [PATCH 15/89] fix extend_features --- neusomatic/python/extend_features.py | 3 +++ neusomatic/python/read_info_extractor.py | 20 ++++++++++---------- neusomatic/python/sequencing_features.py | 4 ++-- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 71afffd..91c8359 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -292,6 +292,9 @@ def extend_features(candidates_vcf, "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"] try: + # ext_features=[] + # for w in map_args: + # ext_features.append(extract_features(w)) ext_features = pool.map_async(extract_features, map_args).get() pool.close() with open(output_tsv, "w") as o_f: diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index e4b4ead..b6db804 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -36,8 +36,8 @@ def position_of_aligned_read(read_i, target_position): ''' flanking_deletion, flanking_insertion = nan, nan - - for i, align_i in enumerate(read_i.get_aligned_pairs()): + aligned_pairs=read_i.get_aligned_pairs() + for i, align_i in enumerate(aligned_pairs): # If find a match: if align_i[1] == target_position: @@ -52,22 +52,22 @@ def position_of_aligned_read(read_i, target_position): # Whether if it's a Deletion/Insertion depends on what happens after this position: # If the match (i.e., i, seq_i) is the final alignment, then you cannot know if it's an indel # if "i" is NOT the final alignment: - if i != len(read_i.get_aligned_pairs()) - 1: + if i != len(aligned_pairs) - 1: indel_length = 0 # If the next alignment is the next sequenced base, then the # target is either a reference read of a SNP/SNV: - if read_i.get_aligned_pairs()[i + 1][0] == seq_i + 1 and read_i.get_aligned_pairs()[i + 1][1] == target_position + 1: + if aligned_pairs[i + 1][0] == seq_i + 1 and aligned_pairs[i + 1][1] == target_position + 1: code = 1 # Reference read for mismatch # If the next reference position has no read position to it, it # is DELETED in this read: - elif read_i.get_aligned_pairs()[i + 1][0] == None and read_i.get_aligned_pairs()[i + 1][1] == target_position + 1: + elif aligned_pairs[i + 1][0] == None and aligned_pairs[i + 1][1] == target_position + 1: code = 2 # Deletion - for align_j in read_i.get_aligned_pairs()[i + 1::]: + for align_j in aligned_pairs[i + 1::]: if align_j[0] == None: indel_length -= 1 else: @@ -78,11 +78,11 @@ def position_of_aligned_read(read_i, target_position): # the inserted sequence is "too long" to align on a single # read. In this case, the inserted length derived here is but a # lower limit of the real inserted length. - elif read_i.get_aligned_pairs()[i + 1][0] == seq_i + 1 and read_i.get_aligned_pairs()[i + 1][1] == None: + elif aligned_pairs[i + 1][0] == seq_i + 1 and aligned_pairs[i + 1][1] == None: code = 3 # Insertion or soft-clipping - for align_j in read_i.get_aligned_pairs()[i + 1::]: + for align_j in aligned_pairs[i + 1::]: if align_j[1] == None: indel_length += 1 else: @@ -112,12 +112,12 @@ def position_of_aligned_read(read_i, target_position): displacement = j * switch seq_j = indel_seeker_i + displacement - if 0 <= seq_j < len(read_i.get_aligned_pairs()): + if 0 <= seq_j < len(aligned_pairs): # If the reference position has no base aligned to it, it's a deletion. 
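# The core of the read_info_extractor hunk above: pysam's get_aligned_pairs()
# materializes a fresh list on every call, and position_of_aligned_read called
# it inside loops. Toy stand-in (hypothetical Read class; pysam behaves
# analogously):
class Read:
    def __init__(self, pairs):
        self._pairs, self.calls = pairs, 0
    def get_aligned_pairs(self):
        self.calls += 1
        return list(self._pairs)                  # new list each call

read = Read([(0, 100), (1, 101), (2, 102)])
aligned_pairs = read.get_aligned_pairs()          # hoisted once...
hits = [i for i, p in enumerate(aligned_pairs) if p[1] == 101]
assert hits == [1] and read.calls == 1            # ...not once per lookup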
# On the other hand, if the base has no reference base # aligned to it, it's an insertion. - if read_i.get_aligned_pairs()[seq_j][1] == None or read_i.get_aligned_pairs()[seq_j][0] == None: + if aligned_pairs[seq_j][1] == None or aligned_pairs[seq_j][0] == None: flanking_indel = j break else: diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 3d7644a..b9adfcc 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -178,10 +178,10 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): ref_indel_1bp = ref_flanking_indel.count(1) ref_indel_2bp = ref_flanking_indel.count(2) + ref_indel_1bp - ref_indel_3bp = ref_flanking_indel.count(3) + ref_indel_2bp + ref_indel_1bp + ref_indel_3bp = ref_flanking_indel.count(3) + ref_indel_2bp alt_indel_1bp = alt_flanking_indel.count(1) alt_indel_2bp = alt_flanking_indel.count(2) + alt_indel_1bp - alt_indel_3bp = alt_flanking_indel.count(3) + alt_indel_2bp + alt_indel_1bp + alt_indel_3bp = alt_flanking_indel.count(3) + alt_indel_2bp consistent_mates = inconsistent_mates = 0 for pairs_i in qname_collector: From 757dfab3339d09ba29afbf112aa24aa38b7f6459 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 2 May 2020 23:07:54 -0700 Subject: [PATCH 16/89] improve efficiency in extend_features --- neusomatic/python/extend_features.py | 160 ++++++------- neusomatic/python/sequencing_features.py | 284 +++++++++-------------- 2 files changed, 192 insertions(+), 252 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 91c8359..5c813dc 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -35,17 +35,11 @@ def extract_features(candidate_record): var_id = "-".join([chrom, pos, ref, alt]) pos = int(pos) my_coordinate = [chrom, pos] - nBamFeatures = sequencing_features.from_bam( - nbam, my_coordinate, ref, alt, min_mapq, min_bq) - tBamFeatures = sequencing_features.from_bam( - tbam, my_coordinate, ref, alt, min_mapq, min_bq) + nBamFeatures = sequencing_features.AlignmentFeatures(nbam, my_coordinate, ref, alt, min_mapq, min_bq) + tBamFeatures = sequencing_features.AlignmentFeatures(tbam, my_coordinate, ref, alt, min_mapq, min_bq) - n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev'] - n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev'] - t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev'] - t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev'] - sor = sequencing_features.somaticOddRatio( - n_ref, n_alt, t_ref, t_alt) + sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, + tBamFeatures.nalt) homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( ref_fa, my_coordinate, ref, alt) @@ -58,12 +52,10 @@ def extract_features(candidate_record): region = "{}:{}-{}".format(chrom, pos, pos + 1) dbsnp_vars = {} for x in dbsnp_tb.fetch(region=region): - chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[ - 0:8] + chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[0:8] for alt_ in alts_.split(","): dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) - dbsnp_vars[ - dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 + dbsnp_vars[dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 if var_id in dbsnp_vars: if_dbsnp = 1 if_common = dbsnp_vars[var_id] @@ -77,83 +69,83 @@ def extract_features(candidate_record): COMMON = if_common if_COSMIC = if_cosmic 
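# The flanking-indel fix in the same patch above: the *_InDel_2bp counters are
# already cumulative (they include the 1bp hits), so the old extra
# "+ ref_indel_1bp" term double-counted. Worked numbers:
flank = [1, 1, 2, 3]            # nearest flanking indel per supporting read, bp
c1 = flank.count(1)             # 2 reads with an indel within 1bp
c2 = flank.count(2) + c1        # 3 reads within 2bp
c3 = flank.count(3) + c2        # 4 reads within 3bp (old formula gave 6)
assert (c1, c2, c3) == (2, 3, 4)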
COSMIC_CNT = num_cosmic_cases - Consistent_Mates = tBamFeatures['consistent_mates'] - Inconsistent_Mates = tBamFeatures['inconsistent_mates'] - N_DP = nBamFeatures['dp'] - nBAM_REF_MQ = '%g' % nBamFeatures['ref_mq'] - nBAM_ALT_MQ = '%g' % nBamFeatures['alt_mq'] - nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures['z_ranksums_mq'] - nBAM_REF_BQ = '%g' % nBamFeatures['ref_bq'] - nBAM_ALT_BQ = '%g' % nBamFeatures['alt_bq'] - nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures['z_ranksums_bq'] - nBAM_REF_NM = '%g' % nBamFeatures['ref_NM'] - nBAM_ALT_NM = '%g' % nBamFeatures['alt_NM'] - nBAM_NM_Diff = '%g' % nBamFeatures['NM_Diff'] - nBAM_REF_Concordant = nBamFeatures['ref_concordant_reads'] - nBAM_REF_Discordant = nBamFeatures['ref_discordant_reads'] - nBAM_ALT_Concordant = nBamFeatures['alt_concordant_reads'] - nBAM_ALT_Discordant = nBamFeatures['alt_discordant_reads'] + Consistent_Mates = tBamFeatures.consistent_mates + Inconsistent_Mates = tBamFeatures.inconsistent_mates + N_DP = nBamFeatures.dp + nBAM_REF_MQ = '%g' % nBamFeatures.ref_mq + nBAM_ALT_MQ = '%g' % nBamFeatures.alt_mq + nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures.z_ranksums_mq + nBAM_REF_BQ = '%g' % nBamFeatures.ref_bq + nBAM_ALT_BQ = '%g' % nBamFeatures.alt_bq + nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures.z_ranksums_bq + nBAM_REF_NM = '%g' % nBamFeatures.ref_NM + nBAM_ALT_NM = '%g' % nBamFeatures.alt_NM + nBAM_NM_Diff = '%g' % nBamFeatures.NM_Diff + nBAM_REF_Concordant = nBamFeatures.ref_concordant_reads + nBAM_REF_Discordant = nBamFeatures.ref_discordant_reads + nBAM_ALT_Concordant = nBamFeatures.alt_concordant_reads + nBAM_ALT_Discordant = nBamFeatures.alt_discordant_reads nBAM_Concordance_FET = rescale( - nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) - N_REF_FOR = nBamFeatures['ref_for'] - N_REF_REV = nBamFeatures['ref_rev'] - N_ALT_FOR = nBamFeatures['alt_for'] - N_ALT_REV = nBamFeatures['alt_rev'] + nBamFeatures.concordance_fet, 'fraction', p_scale, 1001) + N_REF_FOR = nBamFeatures.ref_for + N_REF_REV = nBamFeatures.ref_rev + N_ALT_FOR = nBamFeatures.alt_for + N_ALT_REV = nBamFeatures.alt_rev nBAM_StrandBias_FET = rescale( - nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) - nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures['z_ranksums_endpos'] - nBAM_REF_Clipped_Reads = nBamFeatures['ref_SC_reads'] - nBAM_ALT_Clipped_Reads = nBamFeatures['alt_SC_reads'] + nBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) + nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures.z_ranksums_endpos + nBAM_REF_Clipped_Reads = nBamFeatures.ref_SC_reads + nBAM_ALT_Clipped_Reads = nBamFeatures.alt_SC_reads nBAM_Clipping_FET = rescale( - nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) - nBAM_MQ0 = nBamFeatures['MQ0'] - nBAM_Other_Reads = nBamFeatures['noise_read_count'] - nBAM_Poor_Reads = nBamFeatures['poor_read_count'] - nBAM_REF_InDel_3bp = nBamFeatures['ref_indel_3bp'] - nBAM_REF_InDel_2bp = nBamFeatures['ref_indel_2bp'] - nBAM_REF_InDel_1bp = nBamFeatures['ref_indel_1bp'] - nBAM_ALT_InDel_3bp = nBamFeatures['alt_indel_3bp'] - nBAM_ALT_InDel_2bp = nBamFeatures['alt_indel_2bp'] - nBAM_ALT_InDel_1bp = nBamFeatures['alt_indel_1bp'] + nBamFeatures.clipping_fet, 'fraction', p_scale, 1001) + nBAM_MQ0 = nBamFeatures.MQ0 + nBAM_Other_Reads = nBamFeatures.noise_read_count + nBAM_Poor_Reads = nBamFeatures.poor_read_count + nBAM_REF_InDel_3bp = nBamFeatures.ref_indel_3bp + nBAM_REF_InDel_2bp = nBamFeatures.ref_indel_2bp + nBAM_REF_InDel_1bp = nBamFeatures.ref_indel_1bp + nBAM_ALT_InDel_3bp = nBamFeatures.alt_indel_3bp + nBAM_ALT_InDel_2bp = 
nBamFeatures.alt_indel_2bp + nBAM_ALT_InDel_1bp = nBamFeatures.alt_indel_1bp SOR = sor MaxHomopolymer_Length = homopolymer_length SiteHomopolymer_Length = site_homopolymer_length - T_DP = tBamFeatures['dp'] - tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'] - tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'] - tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures['z_ranksums_mq'] - tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'] - tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'] - tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures['z_ranksums_bq'] - tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'] - tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'] - tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'] - tBAM_REF_Concordant = tBamFeatures['ref_concordant_reads'] - tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'] - tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'] - tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'] + T_DP = tBamFeatures.dp + tBAM_REF_MQ = '%g' % tBamFeatures.ref_mq + tBAM_ALT_MQ = '%g' % tBamFeatures.alt_mq + tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures.z_ranksums_mq + tBAM_REF_BQ = '%g' % tBamFeatures.ref_bq + tBAM_ALT_BQ = '%g' % tBamFeatures.alt_bq + tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures.z_ranksums_bq + tBAM_REF_NM = '%g' % tBamFeatures.ref_NM + tBAM_ALT_NM = '%g' % tBamFeatures.alt_NM + tBAM_NM_Diff = '%g' % tBamFeatures.NM_Diff + tBAM_REF_Concordant = tBamFeatures.ref_concordant_reads + tBAM_REF_Discordant = tBamFeatures.ref_discordant_reads + tBAM_ALT_Concordant = tBamFeatures.alt_concordant_reads + tBAM_ALT_Discordant = tBamFeatures.alt_discordant_reads tBAM_Concordance_FET = rescale( - tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) - T_REF_FOR = tBamFeatures['ref_for'] - T_REF_REV = tBamFeatures['ref_rev'] - T_ALT_FOR = tBamFeatures['alt_for'] - T_ALT_REV = tBamFeatures['alt_rev'] + tBamFeatures.concordance_fet, 'fraction', p_scale, 1001) + T_REF_FOR = tBamFeatures.ref_for + T_REF_REV = tBamFeatures.ref_rev + T_ALT_FOR = tBamFeatures.alt_for + T_ALT_REV = tBamFeatures.alt_rev tBAM_StrandBias_FET = rescale( - tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) - tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures['z_ranksums_endpos'] - tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'] - tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'] + tBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) + tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures.z_ranksums_endpos + tBAM_REF_Clipped_Reads = tBamFeatures.ref_SC_reads + tBAM_ALT_Clipped_Reads = tBamFeatures.alt_SC_reads tBAM_Clipping_FET = rescale( - tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) - tBAM_MQ0 = tBamFeatures['MQ0'] - tBAM_Other_Reads = tBamFeatures['noise_read_count'] - tBAM_Poor_Reads = tBamFeatures['poor_read_count'] - tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'] - tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'] - tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'] - tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'] - tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'] - tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'] + tBamFeatures.clipping_fet, 'fraction', p_scale, 1001) + tBAM_MQ0 = tBamFeatures.MQ0 + tBAM_Other_Reads = tBamFeatures.noise_read_count + tBAM_Poor_Reads = tBamFeatures.poor_read_count + tBAM_REF_InDel_3bp = tBamFeatures.ref_indel_3bp + tBAM_REF_InDel_2bp = tBamFeatures.ref_indel_2bp + tBAM_REF_InDel_1bp = tBamFeatures.ref_indel_1bp + tBAM_ALT_InDel_3bp = tBamFeatures.alt_indel_3bp + tBAM_ALT_InDel_2bp = tBamFeatures.alt_indel_2bp + tBAM_ALT_InDel_1bp = tBamFeatures.alt_indel_1bp 
InDel_Length = indel_length ext_features.append([CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, @@ -295,11 +287,11 @@ def extend_features(candidates_vcf, # ext_features=[] # for w in map_args: # ext_features.append(extract_features(w)) + ext_features = pool.map_async(extract_features, map_args).get() pool.close() with open(output_tsv, "w") as o_f: - o_f.write( - "\t".join(header) + "\n") + o_f.write("\t".join(header) + "\n") for features in ext_features: for w in features: o_f.write( diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index b9adfcc..61723c0 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -7,42 +7,40 @@ import scipy.stats as stats import genomic_file_handlers as genome from read_info_extractor import * +from collections import defaultdict nan = float('nan') -def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): - ''' - bam is the opened file handle of bam file - my_coordiate is a list or tuple of 0-based (contig, position) - ''' - - indel_length = len(first_alt) - len(ref_base) - reads = bam.fetch(my_coordinate[0], my_coordinate[1] - 1, my_coordinate[1]) +class AlignmentFeatures: + def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + ''' + bam is the opened file handle of bam file + my_coordiate is a list or tuple of 0-based (contig, position) + ''' - ref_read_mq = [] - alt_read_mq = [] - ref_read_bq = [] - alt_read_bq = [] - ref_edit_distance = [] - alt_edit_distance = [] + indel_length = len(first_alt) - len(ref_base) + reads = bam.fetch(my_coordinate[0], my_coordinate[1] - 1, my_coordinate[1]) - ref_concordant_reads = alt_concordant_reads = ref_discordant_reads = alt_discordant_reads = 0 - ref_for = ref_rev = alt_for = alt_rev = dp = 0 - ref_SC_reads = alt_SC_reads = ref_notSC_reads = alt_notSC_reads = 0 - MQ0 = 0 + # index 0 for ref, 1 for alt + read_mq = [[], []] + read_bq = [[], []] + edit_distance = [[], []] + flanking_indel = [[], []] + pos_from_end = [[], []] + concordance_counts = [[0, 0], [0, 0]] + orientation_counts = [[0, 0], [0, 0]] + soft_clip_counts = [[0, 0], [0, 0]] + dp = 0 + MQ0 = 0 - ref_pos_from_end = [] - alt_pos_from_end = [] - ref_flanking_indel = [] - alt_flanking_indel = [] + noise_read_count = poor_read_count = 0 - noise_read_count = poor_read_count = 0 + qname_collector = defaultdict(list) - qname_collector = {} - - for read_i in reads: - if not read_i.is_unmapped and dedup_test(read_i): + for read_i in reads: + if read_i.is_unmapped or not dedup_test(read_i): + continue dp += 1 @@ -55,146 +53,98 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): if read_i.mapping_quality == 0: MQ0 += 1 - # Reference calls: - if code_i == 1 and base_call_i == ref_base[0]: - - try: - qname_collector[read_i.qname].append(0) - except KeyError: - qname_collector[read_i.qname] = [0] - - ref_read_mq.append(read_i.mapping_quality) - ref_read_bq.append(read_i.query_qualities[ith_base]) - - try: - ref_edit_distance.append(read_i.get_tag('NM')) - except KeyError: - pass - - # Concordance - if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - ref_concordant_reads += 1 - elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - ref_discordant_reads += 1 - - # Orientation - if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and 
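# Call-site sketch for the AlignmentFeatures class introduced in this hunk;
# the BAM path and locus below are placeholders, not values from the patch:
import pysam
import sequencing_features

tbam = pysam.AlignmentFile("tumor.bam")            # indexed BAM assumed
feats = sequencing_features.AlignmentFeatures(
    tbam, ["chr1", 12345], "A", "T", min_mq=20, min_bq=10)
print(feats.dp, feats.nalt, feats.strandbias_fet)  # attributes replace vars() keys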
read_i.query_qualities[ith_base] >= min_bq: - ref_for += 1 - elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - ref_rev += 1 - - # Soft-clipped reads? - if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: - ref_SC_reads += 1 - else: - ref_notSC_reads += 1 - - # Distance from the end of the read: - if ith_base != None: - ref_pos_from_end.append( - min(ith_base, read_i.query_length - ith_base)) - - # Flanking indels: - ref_flanking_indel.append(flanking_indel_i) - - # Alternate calls: - # SNV, or Deletion, or Insertion where I do not check for matching - # indel length - elif (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or \ - (indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or \ - (indel_length > 0 and code_i == 3): - - try: - qname_collector[read_i.qname].append(1) - except KeyError: - qname_collector[read_i.qname] = [1] - - alt_read_mq.append(read_i.mapping_quality) - alt_read_bq.append(read_i.query_qualities[ith_base]) - - try: - alt_edit_distance.append(read_i.get_tag('NM')) - except KeyError: - pass - - # Concordance - if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - alt_concordant_reads += 1 - elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - alt_discordant_reads += 1 - - # Orientation - if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - alt_for += 1 - elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - alt_rev += 1 - - # Soft-clipped reads? - if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: - alt_SC_reads += 1 - else: - alt_notSC_reads += 1 - - # Distance from the end of the read: - if ith_base != None: - alt_pos_from_end.append( - min(ith_base, read_i.query_length - ith_base)) - - # Flanking indels: - alt_flanking_indel.append(flanking_indel_i) - - # Inconsistent read or 2nd alternate calls: - else: - - try: - qname_collector[read_i.qname].append(2) - except KeyError: - qname_collector[read_i.qname] = [2] + is_ref_call = code_i == 1 and base_call_i == ref_base[0] + is_alt_call = (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or ( + indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or ( + indel_length > 0 and code_i == 3) + # inconsistent read or second alternate calls + if not (is_ref_call or is_alt_call): + qname_collector[read_i.qname].append(2) noise_read_count += 1 - - # Done extracting info from tumor BAM. 
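# The positional 2x2 tables built above feed fisher_exact exactly as the old
# explicit tuples did: rows {concordant, discordant} (likewise {for, rev} and
# {not-SC, SC}), columns {ref, alt}. Toy equivalence check:
import scipy.stats as stats

concordance_counts = [[37, 9], [4, 6]]   # [[ref_conc, alt_conc], [ref_disc, alt_disc]]
assert stats.fisher_exact(((37, 9), (4, 6)))[1] == stats.fisher_exact(concordance_counts)[1]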
Now tally them: - ref_mq = mean(ref_read_mq) - alt_mq = mean(alt_read_mq) - z_ranksums_mq = stats.ranksums(alt_read_mq, ref_read_mq)[0] - - ref_bq = mean(ref_read_bq) - alt_bq = mean(alt_read_bq) - z_ranksums_bq = stats.ranksums(alt_read_bq, ref_read_bq)[0] - - ref_NM = mean(ref_edit_distance) - alt_NM = mean(alt_edit_distance) - z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0] - NM_Diff = alt_NM - ref_NM - abs(indel_length) - - concordance_fet = stats.fisher_exact( - ((ref_concordant_reads, alt_concordant_reads), (ref_discordant_reads, alt_discordant_reads)))[1] - strandbias_fet = stats.fisher_exact( - ((ref_for, alt_for), (ref_rev, alt_rev)))[1] - clipping_fet = stats.fisher_exact( - ((ref_notSC_reads, alt_notSC_reads), (ref_SC_reads, alt_SC_reads)))[1] - - z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] - - ref_indel_1bp = ref_flanking_indel.count(1) - ref_indel_2bp = ref_flanking_indel.count(2) + ref_indel_1bp - ref_indel_3bp = ref_flanking_indel.count(3) + ref_indel_2bp - alt_indel_1bp = alt_flanking_indel.count(1) - alt_indel_2bp = alt_flanking_indel.count(2) + alt_indel_1bp - alt_indel_3bp = alt_flanking_indel.count(3) + alt_indel_2bp - - consistent_mates = inconsistent_mates = 0 - for pairs_i in qname_collector: - - # Both are alternative calls: - if qname_collector[pairs_i] == [1, 1]: - consistent_mates += 1 - - # One is alternate call but the other one is not: - elif len(qname_collector[pairs_i]) == 2 and 1 in qname_collector[pairs_i]: - inconsistent_mates += 1 - - return vars() + continue + + index = 1 if is_alt_call else 0 + + qname_collector[read_i.qname].append(index) + + read_mq[index].append(read_i.mapping_quality) + read_bq[index].append(read_i.query_qualities[ith_base]) + + try: + edit_distance[index].append(read_i.get_tag('NM')) + except KeyError: + pass + + if read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + concordance_counts[0 if read_i.is_proper_pair else 1][index] += 1 + orientation_counts[1 if read_i.is_reverse else 0][index] += 1 + + is_soft_clipped = read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip + soft_clip_counts[1 if is_soft_clipped else 0][index] += 1 + + # Distance from the end of the read: + if ith_base is not None: + pos_from_end[index].append(min(ith_base, read_i.query_length - ith_base)) + + flanking_indel[index].append(flanking_indel_i) + + # unpack to get the ref and alt values + ref_pos_from_end, alt_pos_from_end = pos_from_end + self.ref_concordant_reads, self.alt_concordant_reads = concordance_counts[0] + self.ref_discordant_reads, self.alt_discordant_reads = concordance_counts[1] + self.ref_for, self.alt_for = orientation_counts[0] + self.ref_rev, self.alt_rev = orientation_counts[1] + self.ref_notSC_reads, self.alt_notSC_reads = soft_clip_counts[0] + self.ref_SC_reads, self.alt_SC_reads = soft_clip_counts[1] + + # Done extracting info from BAM. 
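# The qname tally later in this hunk collapses to: only pairs with both mates
# seen at the locus vote on mate consistency (calls: 0 = ref, 1 = alt,
# 2 = noise). Toy run of that logic:
from collections import defaultdict

qname_collector = defaultdict(list)
qname_collector.update({"r1": [1, 1],    # both mates alt     -> consistent
                        "r2": [1, 0],    # alt + non-alt mate -> inconsistent
                        "r3": [1],       # mate absent        -> ignored
                        "r4": [0, 0]})   # no alt support     -> ignored
consistent = inconsistent = 0
for one_count in map(lambda x: x.count(1),
                     filter(lambda y: len(y) == 2, qname_collector.values())):
    if one_count == 2:
        consistent += 1
    elif one_count == 1:
        inconsistent += 1
assert (consistent, inconsistent) == (1, 1)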
Now tally them: + ref_read_mq, alt_read_mq = read_mq + self.ref_mq = mean(ref_read_mq) + self.alt_mq = mean(alt_read_mq) + self.z_ranksums_mq = stats.ranksums(alt_read_mq, ref_read_mq)[0] + + ref_read_bq, alt_read_bq = read_bq + self.ref_bq = mean(ref_read_bq) + self.alt_bq = mean(alt_read_bq) + self.z_ranksums_bq = stats.ranksums(alt_read_bq, ref_read_bq)[0] + + ref_edit_distance, alt_edit_distance = edit_distance + self.ref_NM = mean(ref_edit_distance) + self.alt_NM = mean(alt_edit_distance) + self.z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0] + self.NM_Diff = self.alt_NM - self.ref_NM - abs(indel_length) + + self.concordance_fet = stats.fisher_exact(concordance_counts)[1] + self.strandbias_fet = stats.fisher_exact(orientation_counts)[1] + self.clipping_fet = stats.fisher_exact(soft_clip_counts)[1] + + self.z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] + + ref_flanking_indel, alt_flanking_indel = flanking_indel + self.ref_indel_1bp = ref_flanking_indel.count(1) + self.ref_indel_2bp = ref_flanking_indel.count(2) + self.ref_indel_1bp + self.ref_indel_3bp = ref_flanking_indel.count(3) + self.ref_indel_2bp + self.alt_indel_1bp = alt_flanking_indel.count(1) + self.alt_indel_2bp = alt_flanking_indel.count(2) + self.alt_indel_1bp + self.alt_indel_3bp = alt_flanking_indel.count(3) + self.alt_indel_2bp + + self.consistent_mates = self.inconsistent_mates = 0 + for one_count in map(lambda x: x.count(1), filter(lambda y: len(y) == 2, qname_collector.values())): + # Both are alternative calls: + if one_count == 2: + self.consistent_mates += 1 + + # One is alternate call but the other one is not: + elif one_count == 1: + self.inconsistent_mates += 1 + + self.nref = self.ref_for + self.ref_rev + self.nalt = self.alt_for + self.alt_rev + self.dp = dp + self.MQ0 = MQ0 + self.noise_read_count = noise_read_count + self.poor_read_count = poor_read_count def from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt): @@ -266,8 +216,6 @@ def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100): elif sor_denominator == 0: sor = max_value else: - sor = sor_numerator / sor_denominator - if sor >= max_value: - sor = max_value + sor = min(sor_numerator / sor_denominator, max_value) return sor From 776ddcee0d9593a5f4ce78644a32f33626aa57a0 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sun, 3 May 2020 19:28:29 -0700 Subject: [PATCH 17/89] fix for extend_features --- neusomatic/python/extend_features.py | 6 +++++- neusomatic/python/generate_dataset.py | 2 +- neusomatic/python/preprocess.py | 13 +++++++++++-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 5c813dc..ee40526 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -249,6 +249,7 @@ def extend_features(candidates_vcf, i = 0 batch = [] for line in skip_empty(i_f): + i += 1 chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] var_id = "-".join([chrom, pos, ref, alt]) if exclude_variants: @@ -260,11 +261,14 @@ def extend_features(candidates_vcf, if_cosmic = 1 num_cosmic_cases = cosmic_vars[var_id] batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]) - i += 1 if len(batch) >= split_len or i == n_variants: map_args.append((reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch)) batch = [] + if batch: + map_args.append((reference, tumor_bam, normal_bam, + min_mapq, min_bq, dbsnp, tumor_only, batch)) + logger.info("Number of batches: 
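# The batching fix above (count every record as it is read, then flush the
# remainder after the loop) as a standalone generator sketch:
def chunk(records, split_len):
    batch = []
    for rec in records:
        batch.append(rec)
        if len(batch) >= split_len:
            yield batch
            batch = []
    if batch:                      # the remainder the old loop could drop
        yield batch

assert [len(b) for b in chunk(range(10), 4)] == [4, 4, 2]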
{}".format(len(map_args))) header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT", diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 68cfde6..9ef58d0 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1388,7 +1388,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend): n_vars += 1 if n_vars > 0: ensemble_data = np.array(ensemble_data)[:, order_header] - header = np.array(header)[order_header].tolist() + header = np.array(header_)[order_header].tolist() cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ "Consistent_Mates", "Inconsistent_Mates", "N_DP", diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 8cd40bb..32ec235 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -94,7 +94,7 @@ def get_ensemble_region(record): region, reference + ".fai", args=" -b {}".format(matrix_base_pad + 3), run_logger=thread_logger) bedtools_intersect( - ensemble_bed, ensemble_bed_region_file_tmp, args=" -u", + ensemble_bed, ensemble_bed_region_file_tmp, args=" -u -header", output_fn=ensemble_bed_region_file, run_logger=thread_logger) return ensemble_bed_region_file @@ -349,17 +349,26 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_dataset_split, "merged_features.bed") if not os.path.exists(merged_features_bed) or restart: exclude_ens_variants = [] + header_line = "" with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: for line in skip_empty(i_f_1, skip_header=False): if line.startswith("#"): o_f.write(line) + if not header_line: + header_line = line + else: + assert(header_line == line) continue chrom, pos, _, ref, alt = line.strip().split("\t")[ 0:5] var_id = "-".join([chrom, pos, ref, alt]) exclude_ens_variants.append(var_id) o_f.write(line) - for line in skip_empty(i_f_2): + for line in skip_empty(i_f_2, skip_header=False): + if line.startswith("#"): + if header_line: + assert(header_line == line) + continue chrom, pos, _, ref, alt = line.strip().split("\t")[ 0:5] var_id = "-".join([chrom, pos, ref, alt]) From 15ff4a8061d466b89a60a3945a526da5a178cfe9 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 5 May 2020 12:10:39 -0700 Subject: [PATCH 18/89] added seq_complexity --- neusomatic/python/call.py | 59 ++++++++----- neusomatic/python/extend_features.py | 107 +++++++++++++++-------- neusomatic/python/generate_dataset.py | 82 ++++++++++------- neusomatic/python/preprocess.py | 15 +++- neusomatic/python/sequencing_features.py | 58 ++++++++++++ neusomatic/python/train.py | 77 ++++++++++------ 6 files changed, 275 insertions(+), 123 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index b6eb055..3a8229a 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -412,13 +412,38 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, data_transform = matrix_transform((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + logger.info("Load pretrained model from checkpoint {}".format(checkpoint)) + pretrained_dict = torch.load( + checkpoint, map_location=lambda storage, loc: storage) + pretrained_state_dict = pretrained_dict["state_dict"] + model_tag = pretrained_dict["tag"] + logger.info("tag: {}".format(model_tag)) + coverage_thr = pretrained_dict["coverage_thr"] + if "normalize_channels" in pretrained_dict: + normalize_channels = 
pretrained_dict["normalize_channels"] + else: + normalize_channels = False + if "seq_complexity" in pretrained_dict: + seq_complexity = pretrained_dict["seq_complexity"] + else: + seq_complexity = False + + logger.info("coverage_thr: {}".format(coverage_thr)) + logger.info("normalize_channels: {}".format(normalize_channels)) + logger.info("seq_complexity: {}".format(seq_complexity)) + + + num_expected_ensemble = NUM_ENS_FEATURES + if seq_complexity: + num_expected_ensemble += 2 ensemble = False with open(candidates_tsv[0]) as i_f: x = i_f.readline().strip().split() - if len(x) == NUM_ENS_FEATURES + 4: + if len(x) == num_expected_ensemble + 4: ensemble = True + num_channels = num_expected_ensemble + \ + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES - num_channels = NUM_ENS_FEATURES + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: @@ -431,26 +456,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("We use {} GPUs!".format(torch.cuda.device_count())) net = nn.DataParallel(net) - if not os.path.exists(out_dir): - os.mkdir(out_dir) - logger.info("Load pretrained model from checkpoint {}".format(checkpoint)) - pretrained_dict = torch.load( - checkpoint, map_location=lambda storage, loc: storage) - pretrained_state_dict = pretrained_dict["state_dict"] - model_tag = pretrained_dict["tag"] - logger.info("tag: {}".format(model_tag)) - - matrices_dir = "{}/matrices_{}".format(out_dir, model_tag) - if os.path.exists(matrices_dir): - logger.warning("Remove matrices directory: {}".format(matrices_dir)) - shutil.rmtree(matrices_dir) - os.mkdir(matrices_dir) - coverage_thr = pretrained_dict["coverage_thr"] - if "normalize_channels" in pretrained_dict: - normalize_channels = pretrained_dict["normalize_channels"] - else: - normalize_channels = False - model_dict = net.state_dict() # 1. filter out unnecessary keys @@ -472,6 +477,16 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, # 3. 
load the new state dict net.load_state_dict(pretrained_state_dict) + + if not os.path.exists(out_dir): + os.mkdir(out_dir) + matrices_dir = "{}/matrices_{}".format(out_dir, model_tag) + if os.path.exists(matrices_dir): + logger.warning("Remove matrices directory: {}".format(matrices_dir)) + shutil.rmtree(matrices_dir) + os.mkdir(matrices_dir) + + new_split_tsvs_dir = os.path.join(out_dir, "split_tsvs") if os.path.exists(new_split_tsvs_dir): logger.warning( diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index ee40526..f440bda 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -20,7 +20,7 @@ def extract_features(candidate_record): - reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch = candidate_record + reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, seq_complexity, batch = candidate_record thread_logger = logging.getLogger( "{} ({})".format(extract_features.__name__, multiprocessing.current_process().name)) try: @@ -35,8 +35,10 @@ def extract_features(candidate_record): var_id = "-".join([chrom, pos, ref, alt]) pos = int(pos) my_coordinate = [chrom, pos] - nBamFeatures = sequencing_features.AlignmentFeatures(nbam, my_coordinate, ref, alt, min_mapq, min_bq) - tBamFeatures = sequencing_features.AlignmentFeatures(tbam, my_coordinate, ref, alt, min_mapq, min_bq) + nBamFeatures = sequencing_features.AlignmentFeatures( + nbam, my_coordinate, ref, alt, min_mapq, min_bq) + tBamFeatures = sequencing_features.AlignmentFeatures( + tbam, my_coordinate, ref, alt, min_mapq, min_bq) sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, tBamFeatures.nalt) @@ -46,16 +48,31 @@ def extract_features(candidate_record): indel_length = len(alt) - len(ref) + if seq_complexity: + seq_span_80bp = ref_fa.fetch(my_coordinate[0], max( + 0, my_coordinate[1] - 41), my_coordinate[1] + 40) + seq_left_80bp = ref_fa.fetch(my_coordinate[0], max( + 0, my_coordinate[1] - 81), my_coordinate[1]) + seq_right_80bp = ref_fa.fetch(my_coordinate[0], my_coordinate[ + 1], my_coordinate[1] + 81) + LC_spanning = sequencing_features.LC(seq_span_80bp) + LC_adjacent = min(sequencing_features.LC( + seq_left_80bp), sequencing_features.LC(seq_right_80bp)) + LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40) + LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40) + if_dbsnp = 0 if_common = 0 if dbsnp: region = "{}:{}-{}".format(chrom, pos, pos + 1) dbsnp_vars = {} for x in dbsnp_tb.fetch(region=region): - chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[0:8] + chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[ + 0:8] for alt_ in alts_.split(","): dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) - dbsnp_vars[dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 + dbsnp_vars[ + dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 if var_id in dbsnp_vars: if_dbsnp = 1 if_common = dbsnp_vars[var_id] @@ -71,6 +88,10 @@ def extract_features(candidate_record): COSMIC_CNT = num_cosmic_cases Consistent_Mates = tBamFeatures.consistent_mates Inconsistent_Mates = tBamFeatures.inconsistent_mates + if seq_complexity: + Seq_Complexity_Span = LC_spanning_phred + Seq_Complexity_Adj = LC_adjacent_phred + N_DP = nBamFeatures.dp nBAM_REF_MQ = '%g' % nBamFeatures.ref_mq nBAM_ALT_MQ = '%g' % nBamFeatures.alt_mq @@ -148,21 +169,26 @@ def extract_features(candidate_record): tBAM_ALT_InDel_1bp = tBamFeatures.alt_indel_1bp InDel_Length = indel_length - ext_features.append([CHROM, POS, 
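# LC() used above is added to sequencing_features elsewhere in this patch; a
# minimal stand-in, assuming the usual linguistic-complexity definition
# (distinct substrings observed over the maximum possible). p2phred(1 - LC, 40)
# then maps high complexity to a high, capped score:
def linguistic_complexity(seq):
    seq = seq.upper()
    observed = sum(len({seq[i:i + k] for i in range(len(seq) - k + 1)})
                   for k in range(1, len(seq) + 1))
    possible = sum(min(4 ** k, len(seq) - k + 1)
                   for k in range(1, len(seq) + 1))
    return observed / possible

assert linguistic_complexity("AAAAAAAA") < linguistic_complexity("ACGTACGT")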
".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, - Consistent_Mates, Inconsistent_Mates, N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, - nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, - nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, - nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, - nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, - nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, - nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, - MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, - tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, - tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, - tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, - tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, - tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, - tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) + features = [CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, + Consistent_Mates, Inconsistent_Mates] + if seq_complexity: + features.extend([Seq_Complexity_Span, Seq_Complexity_Adj]) + features.extend([N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, + nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, + nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, + nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, + nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, + nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, + nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, + MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, + tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, + tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, + tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, + tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, + tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, + tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) + + ext_features.append(features) return ext_features except Exception as ex: @@ -177,6 +203,7 @@ def extend_features(candidates_vcf, reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, cosmic, + seq_complexity, num_threads): logger = logging.getLogger(extend_features.__name__) @@ -263,29 +290,31 @@ def extend_features(candidates_vcf, batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]) if len(batch) >= split_len or i == n_variants: map_args.append((reference, tumor_bam, normal_bam, - min_mapq, min_bq, dbsnp, batch)) + min_mapq, min_bq, dbsnp, seq_complexity, batch)) batch = [] if batch: map_args.append((reference, tumor_bam, normal_bam, - min_mapq, min_bq, dbsnp, tumor_only, batch)) - + min_mapq, min_bq, dbsnp, seq_complexity, batch)) logger.info("Number of 
batches: {}".format(len(map_args)))
     header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
-              "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ", "nBAM_Z_Ranksums_MQ",
-              "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_NM_Diff",
-              "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant",
-              "nBAM_Concordance_FET", "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_StrandBias_FET",
-              "nBAM_Z_Ranksums_EndPos", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET",
-              "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp",
-              "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp", "SOR",
-              "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ", "tBAM_Z_Ranksums_MQ",
-              "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_NM_Diff",
-              "tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant",
-              "tBAM_Concordance_FET", "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET",
-              "tBAM_Z_Ranksums_EndPos", "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_Clipping_FET",
-              "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp",
-              "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"]
+              "Consistent_Mates", "Inconsistent_Mates"]
+    if seq_complexity:
+        header.extend(["Seq_Complexity_Span", "Seq_Complexity_Adj"])
+    header.extend(["N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ", "nBAM_Z_Ranksums_MQ",
+                   "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_NM_Diff",
+                   "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant",
+                   "nBAM_Concordance_FET", "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_StrandBias_FET",
+                   "nBAM_Z_Ranksums_EndPos", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET",
+                   "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp",
+                   "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp", "SOR",
+                   "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ", "tBAM_Z_Ranksums_MQ",
+                   "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_NM_Diff",
+                   "tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant",
+                   "tBAM_Concordance_FET", "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET",
+                   "tBAM_Z_Ranksums_EndPos", "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_Clipping_FET",
+                   "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp",
+                   "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"])
     try:
         # ext_features=[]
@@ -337,6 +366,9 @@ def extend_features(candidates_vcf,
                         help='dbSNP vcf (to annotate candidate variants)', default=None)
     parser.add_argument('--cosmic', type=str,
                         help='COSMIC vcf (to annotate candidate variants)', default=None)
+    parser.add_argument('--seq_complexity',
+                        help='Compute linguistic sequence complexity features',
+                        action="store_true")
     parser.add_argument('--num_threads', type=int,
                         help='number of threads', default=1)
     args = parser.parse_args()
@@ -349,6 +381,7 @@ def extend_features(candidates_vcf,
                                  args.reference, args.tumor_bam, args.normal_bam,
                                  args.min_mapq, args.min_bq,
                                  args.dbsnp, args.cosmic,
+                                 args.seq_complexity,
                                  args.num_threads,
                                  )
     if output is None:
diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 9ef58d0..19eb077 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -826,7 +826,7 @@ def find_len(ref, alt):


 def find_records(input_record):
-    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, work_index = input_record
+    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, seq_complexity, work_index = input_record
     thread_logger = logging.getLogger(
         "{} ({})".format(find_records.__name__, multiprocessing.current_process().name))
     try:
@@ -848,6 +848,9 @@ def find_records(input_record):
         split_in_ensemble_bed = os.path.join(
             work, "in_ensemble_{}.bed".format(work_index))

+        num_ens_features = NUM_ENS_FEATURES
+        if seq_complexity:
+            num_ens_features += 2
         bedtools_intersect(
             truth_vcf_file, split_bed, args=" -u", output_fn=split_truth_vcf_file, run_logger=thread_logger)
         bedtools_intersect(
@@ -897,7 +900,7 @@ def find_records(input_record):
                         r_ = [[chrom, pos, ref, alt]]
                     for rr in r_:
                         records.append(rr + [str(i)])
-                        anns[i] = [0] * NUM_ENS_FEATURES
+                        anns[i] = [0] * num_ens_features
                         i += 1
                 curren_pos_records = []
@@ -933,7 +936,7 @@ def find_records(input_record):
                     else:
                         r_ = [[chrom, pos, ref, alt]]

-                    ann = [0] * NUM_ENS_FEATURES
+                    ann = [0] * num_ens_features
                     if pos == ens_pos:
                         if ref == ens_ref and alt == ens_alt:
                             ann = record_[15:]
@@ -1323,7 +1326,7 @@ def find_records(input_record):
         return None


-def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):
+def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, is_extend):
     logger = logging.getLogger(extract_ensemble.__name__)
     ensemble_data = []
     ensemble_pos = []
@@ -1335,24 +1338,28 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):
                          "if_SomaticSniper", "if_VarDict", "MuSE_Tier", "if_LoFreq", "if_Scalpel", "if_Strelka",
                          "if_TNscope", "Strelka_Score", "Strelka_QSS", "Strelka_TQSS", "VarScan2_Score",
                          "SNVMix2_Score", "Sniper_Score", "VarDict_Score", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
-                         "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ",
-                         "nBAM_Z_Ranksums_MQ", "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM",
-                         "nBAM_ALT_NM", "nBAM_NM_Diff", "nBAM_REF_Concordant", "nBAM_REF_Discordant",
-                         "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", "nBAM_Concordance_FET", "N_REF_FOR", "N_REF_REV",
-                         "N_ALT_FOR", "N_ALT_REV", "nBAM_StrandBias_FET", "nBAM_Z_Ranksums_EndPos",
-                         "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET", "nBAM_MQ0",
-                         "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp",
-                         "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp",
-                         "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "SOR", "MSI", "MSILEN", "SHIFT3",
-                         "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ",
-                         "tBAM_Z_Ranksums_MQ", "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM",
-                         "tBAM_ALT_NM", "tBAM_NM_Diff", "tBAM_REF_Concordant", "tBAM_REF_Discordant",
-                         "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", "tBAM_Concordance_FET", "T_REF_FOR",
-                         "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET", "tBAM_Z_Ranksums_EndPos",
-                         "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_Clipping_FET", "tBAM_MQ0",
-                         "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp",
-                         "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp",
-                         "InDel_Length"]
+                         "Consistent_Mates", "Inconsistent_Mates"]
+    if seq_complexity:
+        expected_features += ["Seq_Complexity_Span", "Seq_Complexity_Adj"]
+
+    expected_features += ["N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ",
+                          "nBAM_Z_Ranksums_MQ", "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM",
+                          "nBAM_ALT_NM", "nBAM_NM_Diff", "nBAM_REF_Concordant", "nBAM_REF_Discordant",
+                          "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", "nBAM_Concordance_FET", "N_REF_FOR", "N_REF_REV",
+                          "N_ALT_FOR", "N_ALT_REV", "nBAM_StrandBias_FET", "nBAM_Z_Ranksums_EndPos",
+                          "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET", "nBAM_MQ0",
+                          "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp",
+                          "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp",
+                          "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "SOR", "MSI", "MSILEN", "SHIFT3",
+                          "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ",
+                          "tBAM_Z_Ranksums_MQ", "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM",
+                          "tBAM_ALT_NM", "tBAM_NM_Diff", "tBAM_REF_Concordant", "tBAM_REF_Discordant",
+                          "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", "tBAM_Concordance_FET", "T_REF_FOR",
+                          "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET", "tBAM_Z_Ranksums_EndPos",
+                          "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_Clipping_FET", "tBAM_MQ0",
+                          "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp",
+                          "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp",
+                          "InDel_Length"]
     callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier",
                         "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS",
                         "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score",
@@ -1370,8 +1377,9 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):
                 lambda x: x[1] in expected_features, enumerate(header_)))
             header = list(map(lambda x: x[1], header_en))
             if set(expected_features) - set(header):
-                logger.error("The following features are missing from ensemble file: {}".format(
-                    list(set(expected_features) - set(header))))
+                logger.error("The following features are missing from ensemble file {}: {}".format(
+                    ensemble_tsv,
+                    list(set(expected_features) - set(header))))
                 raise Exception
             order_header = []
             for f in expected_features:
@@ -1443,6 +1451,8 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):
         lambda x: x[1] in ["SiteHomopolymer_Length"], enumerate(header))))
     InDel_Length = list(map(lambda x: x[0], filter(
         lambda x: x[1] in ["InDel_Length"], enumerate(header))))
+    Seq_Complexity_ = list(map(lambda x: x[0], filter(
+        lambda x: x[1] in ["Seq_Complexity_Span", "Seq_Complexity_Adj"], enumerate(header))))

     min_max_features = [[cov_features, 0, 2 * COV],
                         [mq_features, 0, 70],
@@ -1466,14 +1476,18 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):
                         [SiteHomopolymer_Length, 0, 50],
                         [InDel_Length, -30, 30],
                         ]
+    if seq_complexity:
+        min_max_features.append([Seq_Complexity_, 0, 40])
+
     selected_features = sorted([i for f in min_max_features for i in f[0]])
     selected_features_tags = list(map(lambda x: header[x], selected_features))
     if n_vars > 0:
         for i_s, mn, mx in min_max_features:
-            s = ensemble_data[:, np.array(i_s)]
-            s = np.maximum(np.minimum(s, mx), mn)
-            s = (s - mn) / (mx - mn)
-            ensemble_data[:, np.array(i_s)] = s
+            if i_s:
+                s = ensemble_data[:, np.array(i_s)]
+                s = np.maximum(np.minimum(s, mx), mn)
+                s = (s - mn) / (mx - mn)
+                ensemble_data[:, np.array(i_s)] = s
         ensemble_data = ensemble_data[:, selected_features]
         ensemble_data = ensemble_data.tolist()
     with open(ensemble_bed, "w")as f_:
@@ -1486,7 +1500,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):

 def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed,
                      normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                      min_cov, num_threads, ensemble_tsv,
-                     ensemble_bed, tsv_batch_size):
+                     ensemble_bed, seq_complexity, tsv_batch_size):
     logger = logging.getLogger(generate_dataset.__name__)

     logger.info("---------------------Generate Dataset----------------------")
@@ -1513,7 +1527,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     split_batch_size = 10000
     if ensemble_tsv and not ensemble_bed:
         ensemble_bed = os.path.join(work, "ensemble.bed")
-        extract_ensemble(ensemble_tsv, ensemble_bed, False)
+        extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, False)

     tmp_ = bedtools_intersect(
         tumor_pred_vcf_file, region_bed_file, args=" -u", run_logger=logger)
@@ -1540,7 +1554,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     map_args = []
     for i, split_region_file in enumerate(split_region_files):
         map_args.append((work, split_region_file, truth_vcf_file,
-                         tumor_pred_vcf_file, ref_file, ensemble_bed, i))
+                         tumor_pred_vcf_file, ref_file, ensemble_bed, seq_complexity, i))
     try:
         records_data = pool.map_async(find_records, map_args).get()
         pool.close()
@@ -1725,6 +1739,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
                         help='Ensemble annotation tsv file (only for short read)', default=None)
     parser.add_argument('--ensemble_bed', type=str,
                         help='Ensemble annotation bed file (only for short read)', default=None)
+    parser.add_argument('--seq_complexity',
+                        help='Compute linguistic sequence complexity features',
+                        action="store_true")
     args = parser.parse_args()
     logger.info(args)

@@ -1743,12 +1760,13 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     num_threads = args.num_threads
     ensemble_tsv = args.ensemble_tsv
     ensemble_bed = args.ensemble_bed
+    seq_complexity = args.seq_complexity
     tsv_batch_size = args.tsv_batch_size

     try:
         generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed,
                          normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                          min_cov, num_threads, ensemble_tsv,
-                         ensemble_bed, tsv_batch_size)
+                         ensemble_bed, seq_complexity, tsv_batch_size)
     except Exception as e:
         logger.error(traceback.format_exc())
         logger.error("Aborting!")
diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py
index 32ec235..d831b55 100755
--- a/neusomatic/python/preprocess.py
+++ b/neusomatic/python/preprocess.py
@@ -78,10 +78,11 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp

 def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference,
-                            matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, tsv_batch_size):
+                            matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, seq_complexity, tsv_batch_size):
     logger = logging.getLogger(generate_dataset_region.__name__)
     generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed,
                      normal_count_bed, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                      min_cov, num_threads, None, ensemble_bed,
+                     seq_complexity,
                      tsv_batch_size)

@@ -193,6 +194,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
               ensemble_tsv, long_read, restart, first_do_without_qual,
               filter_duplicate,
               add_extra_features,
+              seq_complexity,
               num_threads,
               scan_alignments_binary,):
     logger = logging.getLogger(preprocess.__name__)
@@ -237,7 +239,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
         ensemble_bed = os.path.join(work, "ensemble.bed")
         logger.info("Extract ensemble info.")
         if restart or not os.path.exists(ensemble_bed):
-            extract_ensemble(ensemble_tsv, ensemble_bed, False)
+            extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, False)

     merge_d_for_short_read = 100
     candidates_split_regions = []
@@ -338,12 +340,13 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                                     reference, tumor_bam, normal_bam,
                                     min_mapq, snp_min_bq,
                                     dbsnp, None,
+                                    seq_complexity,
                                     num_threads)
                 extra_features_bed = os.path.join(
                     work_dataset_split, "extra_features.bed")
                 if not os.path.exists(extra_features_bed) or restart:
                     extract_ensemble(extra_features_tsv,
-                                     extra_features_bed, True)
+                                     extra_features_bed, seq_complexity, True)
                 if ensemble_tsv:
                     merged_features_bed = os.path.join(
                         work_dataset_split, "merged_features.bed")
@@ -382,7 +385,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
             generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf,
                                     candidates_split_region, tumor_count, normal_count, reference,
                                     matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads,
-                                    ensemble_bed_i, tsv_batch_size)
+                                    ensemble_bed_i, seq_complexity, tsv_batch_size)

     shutil.rmtree(bed_tempdir)
     tempfile.tempdir = original_tempdir
@@ -467,6 +470,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
     parser.add_argument('--add_extra_features',
                         help='add extra input features',
                         action="store_true")
+    parser.add_argument('--seq_complexity',
+                        help='Compute linguistic sequence complexity features',
+                        action="store_true")
     parser.add_argument('--num_threads', type=int,
                         help='number of threads', default=1)
     parser.add_argument('--scan_alignments_binary', type=str,
@@ -485,6 +491,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                    args.ensemble_tsv, args.long_read, args.restart,
                    args.first_do_without_qual,
                    args.filter_duplicate,
                    args.add_extra_features,
+                   args.seq_complexity,
                    args.num_threads,
                    args.scan_alignments_binary)
     except Exception as e:
diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py
index 61723c0..599809c 100644
--- a/neusomatic/python/sequencing_features.py
+++ b/neusomatic/python/sequencing_features.py
@@ -219,3 +219,61 @@ def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100):
         sor = min(sor_numerator / sor_denominator, max_value)

     return sor
+
+def max_vocabularies(seq_length):
+    # According to:
+    # https://doi.org/10.1093/bioinformatics/18.5.679
+    # Assume 4 different nucleotides
+    counts = 0
+    k = 1
+    while k <= seq_length:
+
+        if 4**k < (seq_length - k + 1):
+            counts = counts + 4**k
+        else:
+            counts = counts + (seq_length-k+1 + 1) * (seq_length-k+1 - 1 + 1)/2
+            break
+
+        k += 1
+
+    return counts
+
+
+
+def LC(sequence):
+    # Calculate linguistic sequence complexity according to
+    # https://doi.org/10.1093/bioinformatics/18.5.679
+    # Assume 4 different nucleotides
+    sequence = sequence.upper()
+
+    if not 'N' in sequence:
+
+        number_of_subseqs = 0
+        seq_length = len(sequence)
+        max_number_of_subseqs = max_vocabularies(seq_length)
+
+        for i in range(1, seq_length+1):
+
+            #max_vocab_1 = 4**i
+            #max_vocab_2 = seq_length - i + 1
+            set_of_seq_n = set()
+
+            for n, nth_base in enumerate(sequence):
+
+                if n+i <= len(sequence):
+                    sub_seq = sequence[n:n+i]
+                    set_of_seq_n.add( sub_seq )
+
+                # All possible unique subseqs obtained. Break away and go no further.
+                #if ( max_vocab_1 >= max_vocab_2 ) and ( len(set_of_seq_n) == max_vocab_2 ):
+                #    break
+
+            num_uniq_subseqs = len(set_of_seq_n)
+            number_of_subseqs = number_of_subseqs + num_uniq_subseqs
+
+        lc = number_of_subseqs/max_number_of_subseqs
+
+    else:
+        lc = float('nan')
+
+    return lc
diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py
index a841a26..c58d271 100755
--- a/neusomatic/python/train.py
+++ b/neusomatic/python/train.py
@@ -203,6 +203,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                      merged_candidates_per_tsv, merged_max_num_tsvs, overwrite_merged_tsvs,
                      train_split_len,
                      normalize_channels,
+                     seq_complexity,
                      use_cuda):
     logger = logging.getLogger(train_neusomatic.__name__)

@@ -218,13 +219,50 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
     data_transform = matrix_transform((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

+    if checkpoint:
+        logger.info(
+            "Load pretrained model from checkpoint {}".format(checkpoint))
+        pretrained_dict = torch.load(
+            checkpoint, map_location=lambda storage, loc: storage)
+        pretrained_state_dict = pretrained_dict["state_dict"]
+        tag = pretrained_dict["tag"]
+        sofar_epochs = pretrained_dict["epoch"]
+        logger.info(
+            "sofar_epochs from pretrained checkpoint: {}".format(sofar_epochs))
+        coverage_thr = pretrained_dict["coverage_thr"]
+        logger.info(
+            "Override coverage_thr from pretrained checkpoint: {}".format(coverage_thr))
+        if "normalize_channels" in pretrained_dict:
+            normalize_channels = pretrained_dict["normalize_channels"]
+        else:
+            normalize_channels = False
+        logger.info(
+            "Override normalize_channels from pretrained checkpoint: {}".format(normalize_channels))
+        if "seq_complexity" in pretrained_dict:
+            seq_complexity = pretrained_dict["seq_complexity"]
+        else:
+            seq_complexity = False
+        logger.info(
+            "Override seq_complexity from pretrained checkpoint: {}".format(seq_complexity))
+        prev_epochs = sofar_epochs + 1
+    else:
+        prev_epochs = 0
+        time_now = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
+        tag = "neusomatic_{}".format(time_now)
+    logger.info("tag: {}".format(tag))
+
+    num_expected_ensemble = NUM_ENS_FEATURES
+    if seq_complexity:
+        num_expected_ensemble += 2
+
     ensemble = False
     with open(candidates_tsv[0]) as i_f:
         x = i_f.readline().strip().split()
-        if len(x) == NUM_ENS_FEATURES + 4:
+        if len(x) == num_expected_ensemble + 4:
             ensemble = True

-    num_channels = NUM_ENS_FEATURES + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES
+    num_channels = num_expected_ensemble + \
+        NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES
     logger.info("Number of channels: {}".format(num_channels))

     net = NeuSomaticNet(num_channels)
@@ -242,25 +280,6 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
         os.mkdir("{}/models/".format(out_dir))

     if checkpoint:
-        logger.info(
-            "Load pretrained model from checkpoint {}".format(checkpoint))
-        pretrained_dict = torch.load(
-            checkpoint, map_location=lambda storage, loc: storage)
-        pretrained_state_dict = pretrained_dict["state_dict"]
-        tag = pretrained_dict["tag"]
-        sofar_epochs = pretrained_dict["epoch"]
-        logger.info(
-            "sofar_epochs from pretrained checkpoint: {}".format(sofar_epochs))
-        coverage_thr = pretrained_dict["coverage_thr"]
-        logger.info(
-            "Override coverage_thr from pretrained checkpoint: {}".format(coverage_thr))
-        if "normalize_channels" in pretrained_dict:
-            normalize_channels = pretrained_dict["normalize_channels"]
-        else:
-            normalize_channels = False
-        logger.info(
-            "Override normalize_channels from pretrained checkpoint: {}".format(normalize_channels))
-        prev_epochs = sofar_epochs + 1
         model_dict = net.state_dict()
         # 1. filter out unnecessary keys
         # pretrained_state_dict = {
@@ -278,11 +297,6 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
         model_dict.update(pretrained_state_dict)
         # 3. load the new state dict
         net.load_state_dict(pretrained_state_dict)
-    else:
-        prev_epochs = 0
-        time_now = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
-        tag = "neusomatic_{}".format(time_now)
-        logger.info("tag: {}".format(tag))

     shuffle(candidates_tsv)
@@ -403,8 +417,9 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                     "tag": tag,
                     "epoch": curr_epoch,
                     "coverage_thr": coverage_thr,
-                    "normalize_channels": normalize_channels},
-                   '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch))
+                    "normalize_channels": normalize_channels,
+                    "seq_complexity": seq_complexity
+                    }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch))
         if len(train_sets) == 1:
             train_sets[0].open_candidate_tsvs()
@@ -469,6 +484,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                         "epoch": curr_epoch,
                         "coverage_thr": coverage_thr,
                         "normalize_channels": normalize_channels,
+                        "seq_complexity": seq_complexity,
                         }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch))
            if validation_candidates_tsv:
                 test(net, curr_epoch, validation_loader, use_cuda)
@@ -487,6 +503,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                 "epoch": curr_epoch,
                 "coverage_thr": coverage_thr,
                 "normalize_channels": normalize_channels,
+                "seq_complexity": seq_complexity,
                 }, '{}/models/checkpoint_{}_epoch{}.pth'.format(
         out_dir, tag, curr_epoch))
     if validation_candidates_tsv:
@@ -561,6 +578,9 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                         help='normalize BQ, MQ, and other bam-info channels by frequency of observed alleles. \
                         Will be overridden if pretrained model is provided',
                         action="store_true")
+    parser.add_argument('--seq_complexity',
+                        help='Compute linguistic sequence complexity features',
+                        action="store_true")
     args = parser.parse_args()

     logger.info(args)
@@ -578,6 +598,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                          args.merged_candidates_per_tsv, args.merged_max_num_tsvs,
                          args.overwrite_merged_tsvs, args.train_split_len,
                          args.normalize_channels,
+                         args.seq_complexity,
                          use_cuda)
     except Exception as e:
         logger.error(traceback.format_exc())
From 2c4f45db6f75356cc9d19617faa03a5f31523007 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 5 May 2020 22:00:56 -0700
Subject: [PATCH 19/89] fix ensemble

---
 neusomatic/python/call.py             |   8 +-
 neusomatic/python/extend_features.py  |  51 ++++++++--
 neusomatic/python/generate_dataset.py |  14 ++-
 neusomatic/python/preprocess.py       | 137 ++++++++++++++++++++------
 neusomatic/python/train.py            |   8 +-
 5 files changed, 166 insertions(+), 52 deletions(-)

diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py
index 3a8229a..2e60f47 100755
--- a/neusomatic/python/call.py
+++ b/neusomatic/python/call.py
@@ -433,15 +433,15 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,

     logger.info("seq_complexity: {}".format(seq_complexity))

-    num_expected_ensemble = NUM_ENS_FEATURES
+    expected_ens_fields = NUM_ENS_FEATURES
     if seq_complexity:
-        num_expected_ensemble += 2
+        expected_ens_fields += 2
     ensemble = False
     with open(candidates_tsv[0]) as i_f:
         x = i_f.readline().strip().split()
-        if len(x) == num_expected_ensemble + 4:
+        if len(x) == expected_ens_fields + 4:
             ensemble = True
-    num_channels = num_expected_ensemble + \
+    num_channels = expected_ens_fields + \
         NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES
     logger.info("Number of channels: {}".format(num_channels))

diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py
index f440bda..43fcf8b 100755
--- a/neusomatic/python/extend_features.py
+++ b/neusomatic/python/extend_features.py
@@ -199,6 +199,7 @@ def extract_features(candidate_record):

 def extend_features(candidates_vcf,
                     exclude_variants,
+                    add_variants,
                     output_tsv,
                     reference, tumor_bam, normal_bam,
                     min_mapq, min_bq,
@@ -253,8 +254,8 @@ def extend_features(candidates_vcf,
                     var_id = "-".join([chrom, pos, ref, alt])
                     cosmic_vars[var_id] = num_cases

+    exclude_vars = set([])
     if exclude_variants:
-        exclude_vars = []
         with open(exclude_variants) as i_f:
             for line in skip_empty(i_f):
                 if exclude_variants.split(".")[-1] == "tsv" and line[0:5] == "CHROM":
@@ -262,7 +263,18 @@ def extend_features(candidates_vcf,
                 x = line.strip().split("\t")
                 chrom, pos, _, ref, alt = x[0:5]
                 var_id = "-".join([chrom, pos, ref, alt])
-                exclude_vars.append(var_id)
+                exclude_vars.add(var_id)
+
+    add_vars = set([])
+    if add_variants:
+        with open(add_variants) as i_f:
+            for line in skip_empty(i_f):
+                if add_variants.split(".")[-1] == "tsv" and line[0:5] == "CHROM":
+                    continue
+                x = line.strip().split("\t")
+                chrom, pos, _, ref, alt = x[0:5]
+                var_id = "-".join([chrom, pos, ref, alt])
+                add_vars.add(var_id)

     n_variants = 0
     with open(candidates_vcf) as i_f:
@@ -272,30 +284,46 @@ def extend_features(candidates_vcf,
     split_len = (n_variants + num_threads - 1) // num_threads
     pool = multiprocessing.Pool(num_threads)
     map_args = []
+    batch = []
     with open(candidates_vcf) as i_f:
-        i = 0
-        batch = []
         for line in skip_empty(i_f):
-            i += 1
             chrom, pos, _, ref, alt = line.strip().split("\t")[0:5]
             var_id = "-".join([chrom, pos, ref, alt])
             if exclude_variants:
                 if var_id in exclude_vars:
                     continue
+            if add_variants:
+                if var_id in add_vars:
+                    add_vars = add_vars - set([var_id])
             num_cosmic_cases = float('nan')
             if_cosmic = 0
             if cosmic and var_id in cosmic_vars:
                 if_cosmic = 1
                 num_cosmic_cases = cosmic_vars[var_id]
             batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
-            if len(batch) >= split_len or i == n_variants:
+            if len(batch) >= split_len:
                 map_args.append((reference, tumor_bam, normal_bam,
                                  min_mapq, min_bq, dbsnp, seq_complexity, batch))
                 batch = []
-    if batch:
-        map_args.append((reference, tumor_bam, normal_bam,
-                         min_mapq, min_bq, dbsnp, seq_complexity, batch))
-
+    if add_variants and len(add_vars)>0:
+        for var_id in add_vars-set(exclude_vars):
+            v = var_id.split("-")
+            pos, ref, alt = v[-3:]
+            chrom = "-".join(v[:-3])
+            num_cosmic_cases = float('nan')
+            if_cosmic = 0
+            if cosmic and var_id in cosmic_vars:
+                if_cosmic = 1
+                num_cosmic_cases = cosmic_vars[var_id]
+            batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
+            if len(batch) >= split_len:
+                map_args.append((reference, tumor_bam, normal_bam,
+                                 min_mapq, min_bq, dbsnp, seq_complexity, batch))
+                batch = []
+    if batch:
+        map_args.append((reference, tumor_bam, normal_bam,
+                         min_mapq, min_bq, dbsnp, seq_complexity, batch))
+
     logger.info("Number of batches: {}".format(len(map_args)))
     header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
               "Consistent_Mates", "Inconsistent_Mates"]
@@ -350,6 +378,8 @@ def extend_features(candidates_vcf,
                         required=True)
     parser.add_argument('--exclude_variants', type=str, help='variants to exclude',
                         default=None)
+    parser.add_argument('--add_variants', type=str, help='variants to add if they do not exist in the vcf. (Lower priority than --exclude_variants)',
+                        default=None)
     parser.add_argument('--output_tsv', type=str, help='output features tsv',
                         required=True)
     parser.add_argument('--reference', type=str, help='reference fasta filename',
@@ -377,6 +407,7 @@ def extend_features(candidates_vcf,
     try:
         output = extend_features(args.candidates_vcf,
                                  args.exclude_variants,
+                                 args.add_variants,
                                  args.output_tsv,
                                  args.reference, args.tumor_bam, args.normal_bam,
                                  args.min_mapq, args.min_bq,
diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 19eb077..870a999 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -1326,7 +1326,7 @@ def find_records(input_record):
         return None


-def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, is_extend):
+def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header, is_extend):
     logger = logging.getLogger(extract_ensemble.__name__)
     ensemble_data = []
     ensemble_pos = []
@@ -1376,6 +1376,9 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, is_extend):
             header_en = list(filter(
                 lambda x: x[1] in expected_features, enumerate(header_)))
             header = list(map(lambda x: x[1], header_en))
+            if not enforce_header:
+                expected_features = header
+
             if set(expected_features) - set(header):
                 logger.error("The following features are missing from ensemble file {}: {}".format(
                     ensemble_tsv,
@@ -1500,7 +1503,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, is_extend):

 def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed,
                      normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                      min_cov, num_threads, ensemble_tsv,
-                     ensemble_bed, seq_complexity, tsv_batch_size):
+                     ensemble_bed, seq_complexity, enforce_header, tsv_batch_size):
     logger = logging.getLogger(generate_dataset.__name__)

     logger.info("---------------------Generate Dataset----------------------")
@@ -1527,7 +1530,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     split_batch_size = 10000
     if ensemble_tsv and not ensemble_bed:
         ensemble_bed = os.path.join(work, "ensemble.bed")
-        extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, False)
+        extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header, False)

     tmp_ = bedtools_intersect(
         tumor_pred_vcf_file, region_bed_file, args=" -u", run_logger=logger)
@@ -1742,6 +1745,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     parser.add_argument('--seq_complexity',
                         help='Compute linguistic sequence complexity features',
                         action="store_true")
+    parser.add_argument('--enforce_header',
+                        help='Enforce header match for ensemble_tsv',
+                        action="store_true")
     args = parser.parse_args()
     logger.info(args)

@@ -1766,7 +1772,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     try:
         generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed,
                          normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                          min_cov, num_threads, ensemble_tsv,
-                         ensemble_bed, seq_complexity, tsv_batch_size)
+                         ensemble_bed, seq_complexity, enforce_header, tsv_batch_size)
     except Exception as e:
         logger.error(traceback.format_exc())
         logger.error("Aborting!")
diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py
index d831b55..487a024 100755
--- a/neusomatic/python/preprocess.py
+++ b/neusomatic/python/preprocess.py
@@ -15,6 +15,7 @@
 import logging
 import tempfile

+import numpy as np

 from filter_candidates import filter_candidates
 from generate_dataset import generate_dataset, extract_ensemble
@@ -78,11 +79,13 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp

 def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference,
-                            matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, seq_complexity, tsv_batch_size):
+                            matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, seq_complexity,
+                            no_feature_recomp_for_ensemble, tsv_batch_size):
     logger = logging.getLogger(generate_dataset_region.__name__)
     generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed,
                      normal_count_bed, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                      min_cov, num_threads, None, ensemble_bed, seq_complexity,
+                     no_feature_recomp_for_ensemble,
                      tsv_batch_size)

@@ -195,6 +198,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
               filter_duplicate,
               add_extra_features,
               seq_complexity,
+              no_feature_recomp_for_ensemble,
               num_threads,
               scan_alignments_binary,):
     logger = logging.getLogger(preprocess.__name__)
@@ -239,7 +243,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
         ensemble_bed = os.path.join(work, "ensemble.bed")
         logger.info("Extract ensemble info.")
         if restart or not os.path.exists(ensemble_bed):
-            extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, False)
+            extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, no_feature_recomp_for_ensemble, False)

     merge_d_for_short_read = 100
     candidates_split_regions = []
@@ -335,7 +339,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                 if not os.path.exists(extra_features_tsv) or restart:
                     extend_features(filtered_vcf,
                                     ensemble_beds[
-                                        i] if ensemble_tsv else None,
+                                        i] if (ensemble_tsv and no_feature_recomp_for_ensemble) else None,
+                                    ensemble_beds[
+                                        i] if (ensemble_tsv and not no_feature_recomp_for_ensemble) else None,
                                     extra_features_tsv,
                                     reference, tumor_bam, normal_bam,
                                     min_mapq, snp_min_bq,
@@ -346,38 +352,105 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                     work_dataset_split, "extra_features.bed")
                 if not os.path.exists(extra_features_bed) or restart:
                     extract_ensemble(extra_features_tsv,
-                                     extra_features_bed, seq_complexity, True)
+                                     extra_features_bed, seq_complexity, True, True)
                 if ensemble_tsv:
                     merged_features_bed = os.path.join(
                         work_dataset_split, "merged_features.bed")
                     if not os.path.exists(merged_features_bed) or restart:
                         exclude_ens_variants = []
                         header_line = ""
-                        with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2:
-                            for line in skip_empty(i_f_1, skip_header=False):
-                                if line.startswith("#"):
+                        if no_feature_recomp_for_ensemble:
+                            with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2:
+                                for line in skip_empty(i_f_1, skip_header=False):
+                                    if line.startswith("#"):
+                                        if not header_line:
+                                            header_line = line
+                                            o_f.write(line)
+                                        else:
+                                            if header_line != line:
+                                                logger.error(
+                                                    "{}!={}".format(header_line, line))
+                                                raise Exception
+                                        continue
+                                    chrom, pos, _, ref, alt = line.strip().split("\t")[
+                                        0:5]
+                                    var_id = "-".join([chrom, pos, ref, alt])
+                                    exclude_ens_variants.append(var_id)
+                                    o_f.write(line)
+                                for line in skip_empty(i_f_2, skip_header=False):
+                                    if line.startswith("#"):
+                                        if header_line != line:
+                                            logger.error(
+                                                "{}!={}".format(header_line, line))
+                                            raise Exception
+                                        continue
+                                    chrom, pos, _, ref, alt = line.strip().split("\t")[
+                                        0:5]
+                                    var_id = "-".join([chrom, pos, ref, alt])
+                                    if var_id in exclude_ens_variants:
+                                        continue
                                     o_f.write(line)
-                                    if not header_line:
-                                        header_line = line
-                                    else:
-                                        assert(header_line == line)
-                                    continue
-                                chrom, pos, _, ref, alt = line.strip().split("\t")[
-                                    0:5]
-                                var_id = "-".join([chrom, pos, ref, alt])
-                                exclude_ens_variants.append(var_id)
-                                o_f.write(line)
-                            for line in skip_empty(i_f_2, skip_header=False):
-                                if line.startswith("#"):
-                                    if header_line:
-                                        assert(header_line == line)
-                                    continue
-                                chrom, pos, _, ref, alt = line.strip().split("\t")[
-                                    0:5]
-                                var_id = "-".join([chrom, pos, ref, alt])
-                                if var_id in exclude_ens_variants:
-                                    continue
-                                o_f.write(line)
+                        else:
+                            callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier",
+                                                "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS",
+                                                "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score",
+                                                "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"]
+                            with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2:
+                                ens_variants_info = {}
+                                header_1_found = False
+                                header_2_found = False
+                                for line in skip_empty(i_f_1, skip_header=False):
+                                    if line.startswith("#"):
+                                        if not header_line:
+                                            header_line = line
+                                        else:
+                                            if header_line != line:
+                                                logger.error(
+                                                    "{}!={}".format(header_line, line))
+                                                raise Exception
+                                        header_ = line.strip().split()[5:]
+                                        header_caller = list(filter(
+                                            lambda x: x[1] in callers_features, enumerate(header_)))
+                                        header_caller_ = list(
+                                            map(lambda x: x[1], header_caller))
+                                        header_i = list(
+                                            map(lambda x: x[0], header_caller))
+                                        header_1_found = True
+                                        continue
+                                    assert header_1_found
+                                    fields = line.strip().split("\t")
+                                    chrom, pos, _, ref, alt = fields[0:5]
+                                    var_id = "-".join([chrom, pos, ref, alt])
+                                    ens_variants_info[var_id] = np.array(fields[5:])[
+                                        header_i]
+                                for line in skip_empty(i_f_2, skip_header=False):
+                                    if line.startswith("#"):
+                                        if header_line != line:
+                                            logger.error(
+                                                "{}!={}".format(header_line, line))
+                                        if not header_2_found:
+                                            header_2 = line.strip().split()[5:]
+                                            logger.info(header_2)
+                                            order_header = []
+                                            for f in header_caller_:
+                                                if f not in header_2:
+                                                    logger.info("Missing header field {}".format(f))
+                                                    raise Exception
+                                                order_header.append(header_2.index(f))
+                                            o_f.write(line)
+                                            header_2_found = True
+                                        continue
+                                    assert header_2_found
+                                    fields = line.strip().split("\t")
+                                    chrom, pos, _, ref, alt = fields[0:5]
+                                    var_id = "-".join([chrom, pos, ref, alt])
+                                    if var_id in ens_variants_info:
+                                        fields_ = np.array(fields[5:])
+                                        fields_[order_header] = ens_variants_info[
+                                            var_id]
+                                        fields[5:] = fields_.tolist()
+                                    o_f.write(
+                                        "\t".join(list(map(str, fields))) + "\n")
                     ensemble_bed_i = merged_features_bed
                 else:
                     ensemble_bed_i = extra_features_bed
@@ -385,7 +458,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
             generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf,
                                     candidates_split_region, tumor_count, normal_count, reference,
                                     matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads,
-                                    ensemble_bed_i, seq_complexity, tsv_batch_size)
+                                    ensemble_bed_i, seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size)

     shutil.rmtree(bed_tempdir)
     tempfile.tempdir = original_tempdir
@@ -471,7 +544,10 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                         help='add extra input features',
                         action="store_true")
     parser.add_argument('--seq_complexity',
-                        help='Compute linguistic sequence complexity features',
+                        help='Compute linguistic sequence complexity features',
+                        action="store_true")
+    parser.add_argument('--no_feature_recomp_for_ensemble',
+                        help='Do not recompute features for ensemble_tsv',
                         action="store_true")
     parser.add_argument('--num_threads', type=int,
                         help='number of threads', default=1)
@@ -492,6 +568,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                    args.filter_duplicate,
                    args.add_extra_features,
                    args.seq_complexity,
+                   args.no_feature_recomp_for_ensemble,
                    args.num_threads,
                    args.scan_alignments_binary)
     except Exception as e:
diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py
index c58d271..8ddc301 100755
--- a/neusomatic/python/train.py
+++ b/neusomatic/python/train.py
@@ -251,17 +251,17 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
         tag = "neusomatic_{}".format(time_now)
     logger.info("tag: {}".format(tag))

-    num_expected_ensemble = NUM_ENS_FEATURES
+    expected_ens_fields = NUM_ENS_FEATURES
     if seq_complexity:
-        num_expected_ensemble += 2
+        expected_ens_fields += 2
     ensemble = False
     with open(candidates_tsv[0]) as i_f:
         x = i_f.readline().strip().split()
-        if len(x) == num_expected_ensemble + 4:
+        if len(x) == expected_ens_fields + 4:
             ensemble = True

-    num_channels = num_expected_ensemble + \
+    num_channels = expected_ens_fields + \
         NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES
     logger.info("Number of channels: {}".format(num_channels))
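For reference, a minimal self-contained sketch of the caller-column override performed in the merge branch of PATCH 19 above. The column layout, names, and values below are made up for illustration; dtype=object is used so numpy does not truncate the assigned strings to a fixed width:

import numpy as np

# Hypothetical header of the recomputed extra-features rows (after the 5
# coordinate columns): two BAM-derived features and one caller score.
header_2 = ["T_DP", "tBAM_REF_MQ", "Strelka_Score"]
callers_features = ["Strelka_Score"]

# order_header: positions of the caller fields inside the recomputed rows.
order_header = [header_2.index(f) for f in callers_features]   # -> [2]

# One recomputed row, plus the caller score preserved from the ensemble bed.
fields_ = np.array(["152", "60.0", "0"], dtype=object)
ens_variants_info = {"22-21982892-C-T": np.array(["0.9929"], dtype=object)}

# Overwrite only the caller columns; the recomputed BAM features are kept.
fields_[order_header] = ens_variants_info["22-21982892-C-T"]
print(fields_.tolist())   # ['152', '60.0', '0.9929']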
From e913e83f68fd6cc985123d92f1319c3a4cc284cb Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 5 May 2020 22:48:07 -0700
Subject: [PATCH 20/89] more efficient LC

---
 neusomatic/python/extend_features.py     | 20 +++++------
 neusomatic/python/sequencing_features.py | 43 +++++++++---------------
 2 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py
index 43fcf8b..23b57a9 100755
--- a/neusomatic/python/extend_features.py
+++ b/neusomatic/python/extend_features.py
@@ -55,9 +55,9 @@ def extract_features(candidate_record):
                     0, my_coordinate[1] - 81), my_coordinate[1])
                 seq_right_80bp = ref_fa.fetch(my_coordinate[0], my_coordinate[
                     1], my_coordinate[1] + 81)
-                LC_spanning = sequencing_features.LC(seq_span_80bp)
-                LC_adjacent = min(sequencing_features.LC(
-                    seq_left_80bp), sequencing_features.LC(seq_right_80bp))
+                LC_spanning = sequencing_features.subLC(seq_span_80bp, 20)
+                LC_adjacent = min(sequencing_features.subLC(
+                    seq_left_80bp, 20), sequencing_features.subLC(seq_right_80bp, 20))

                 LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40)
                 LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40)
@@ -305,8 +305,8 @@ def extend_features(candidates_vcf,
                     map_args.append((reference, tumor_bam, normal_bam,
                                      min_mapq, min_bq, dbsnp, seq_complexity, batch))
                     batch = []
-    if add_variants and len(add_vars)>0:
-        for var_id in add_vars-set(exclude_vars):
+    if add_variants and len(add_vars) > 0:
+        for var_id in add_vars - set(exclude_vars):
             v = var_id.split("-")
             pos, ref, alt = v[-3:]
             chrom = "-".join(v[:-3])
@@ -323,7 +323,7 @@ def extend_features(candidates_vcf,
     if batch:
         map_args.append((reference, tumor_bam, normal_bam,
                          min_mapq, min_bq, dbsnp, seq_complexity, batch))
-
+
     logger.info("Number of batches: {}".format(len(map_args)))
     header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
               "Consistent_Mates", "Inconsistent_Mates"]
@@ -345,11 +345,11 @@ def extend_features(candidates_vcf,
               "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"])
     try:
-        # ext_features=[]
-        # for w in map_args:
-        #     ext_features.append(extract_features(w))
+        ext_features=[]
+        for w in map_args:
+            ext_features.append(extract_features(w))

-        ext_features = pool.map_async(extract_features, map_args).get()
+        # ext_features = pool.map_async(extract_features, map_args).get()
         pool.close()
         with open(output_tsv, "w") as o_f:
             o_f.write("\t".join(header) + "\n")
diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py
index 599809c..c495681 100644
--- a/neusomatic/python/sequencing_features.py
+++ b/neusomatic/python/sequencing_features.py
@@ -220,18 +220,20 @@ def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100):
     return sor

-def max_vocabularies(seq_length):
+def max_sub_vocabularies(seq_length, max_subseq_length):
     # According to:
     # https://doi.org/10.1093/bioinformatics/18.5.679
-    # Assume 4 different nucleotides
+    # capping the length of sub_string as an input parameter
+    assert max_subseq_length <= seq_length
+
     counts = 0
     k = 1
-    while k <= seq_length:
+    while k <= max_subseq_length:

         if 4**k < (seq_length - k + 1):
             counts = counts + 4**k
         else:
-            counts = counts + (seq_length-k+1 + 1) * (seq_length-k+1 - 1 + 1)/2
+            counts = counts + (2*seq_length - k - max_subseq_length + 2) * (max_subseq_length - k + 1)/2
             break

         k += 1
@@ -239,38 +241,23 @@ def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100):
     return counts

-
-def LC(sequence):
+def subLC(sequence, max_substring_length=20):
     # Calculate linguistic sequence complexity according to
     # https://doi.org/10.1093/bioinformatics/18.5.679
-    # Assume 4 different nucleotides
+    # Cut off substring at a fixed length
     sequence = sequence.upper()

     if not 'N' in sequence:

         number_of_subseqs = 0
         seq_length = len(sequence)
-        max_number_of_subseqs = max_vocabularies(seq_length)
-
-        for i in range(1, seq_length+1):
-
-            #max_vocab_1 = 4**i
-            #max_vocab_2 = seq_length - i + 1
-            set_of_seq_n = set()
-
-            for n, nth_base in enumerate(sequence):
-
-                if n+i <= len(sequence):
-                    sub_seq = sequence[n:n+i]
-                    set_of_seq_n.add( sub_seq )
-
-                # All possible unique subseqs obtained. Break away and go no further.
-                #if ( max_vocab_1 >= max_vocab_2 ) and ( len(set_of_seq_n) == max_vocab_2 ):
-                #    break
-
-            num_uniq_subseqs = len(set_of_seq_n)
-            number_of_subseqs = number_of_subseqs + num_uniq_subseqs
-
+        max_number_of_subseqs = max_sub_vocabularies(seq_length, max_substring_length)
+
+        set_of_seq_n = set()
+        for i in range(1, min(max_substring_length+1, seq_length+1) ):
+            set_of_seq_n.update((sequence[n: n+i] for n in range(len(sequence) - i + 1)))
+
+        number_of_subseqs = len(set_of_seq_n)
         lc = number_of_subseqs/max_number_of_subseqs

     else:
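For reference, a self-contained sketch of the capped substring counting that subLC implements in the patch above. The example sequences are made up, and the min() guard is an extra assumption here so that the assert in max_sub_vocabularies holds for sequences shorter than the cap:

def max_sub_vocabularies(seq_length, max_subseq_length):
    # Theoretical maximum number of distinct substrings with length <= cap.
    counts, k = 0, 1
    while k <= max_subseq_length:
        if 4 ** k < (seq_length - k + 1):
            counts += 4 ** k
        else:
            counts += (2 * seq_length - k - max_subseq_length + 2) * (max_subseq_length - k + 1) / 2
            break
        k += 1
    return counts

def sub_lc(sequence, max_substring_length=20):
    # Distinct substrings (up to the cap) over the theoretical maximum.
    sequence = sequence.upper()
    if 'N' in sequence:
        return float('nan')
    seq_length = len(sequence)
    cap = min(max_substring_length, seq_length)   # guard for short sequences
    subseqs = set()
    for i in range(1, cap + 1):
        subseqs.update(sequence[n:n + i] for n in range(seq_length - i + 1))
    return len(subseqs) / max_sub_vocabularies(seq_length, cap)

print(sub_lc("ACGTACGTACGT"))   # repetitive sequence: 42/70 = 0.6
print(sub_lc("ACGTAGCTTGCA"))   # more varied sequence: 69/70, about 0.99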
From 2be51cc07115f6cee385b61ab018bcb38edb00d Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 5 May 2020 22:55:36 -0700
Subject: [PATCH 21/89] small fix

---
 neusomatic/python/extend_features.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py
index 23b57a9..c84261d 100755
--- a/neusomatic/python/extend_features.py
+++ b/neusomatic/python/extend_features.py
@@ -345,11 +345,7 @@ def extend_features(candidates_vcf,
               "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"])
     try:
-        ext_features=[]
-        for w in map_args:
-            ext_features.append(extract_features(w))
-
-        # ext_features = pool.map_async(extract_features, map_args).get()
+        ext_features = pool.map_async(extract_features, map_args).get()
         pool.close()
         with open(output_tsv, "w") as o_f:
             o_f.write("\t".join(header) + "\n")
From e9b83dad1d16ea42df3bce57de8bc120da7c822b Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 5 May 2020 23:04:01 -0700
Subject: [PATCH 22/89] filter duplicate by default

---
 neusomatic/python/postprocess.py | 10 ++++++----
 neusomatic/python/preprocess.py  | 10 ++++++----
 test/NeuSomatic_ensemble.vcf     | 16 ++++++++--------
 test/NeuSomatic_standalone.vcf   | 16 ++++++++--------
 4 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py
index d0d934e..b8d0b3a 100755
--- a/neusomatic/python/postprocess.py
+++ b/neusomatic/python/postprocess.py
@@ -168,7 +168,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense
                 lr_pad, lr_chunk_size, lr_chunk_scale, lr_snp_min_af, lr_ins_min_af, lr_del_min_af,
                 lr_match_score, lr_mismatch_penalty, lr_gap_open_penalty,
                 lr_gap_ext_penalty, lr_max_realign_dp, lr_do_split,
-                filter_duplicate,
+                keep_duplicate,
                 pass_threshold, lowqual_threshold,
                 msa_binary, num_threads):
     logger = logging.getLogger(postprocess.__name__)
@@ -177,6 +177,8 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense
     if not os.path.exists(work):
         os.mkdir(work)

+    filter_duplicate = not keep_duplicate
+
     original_tempdir = tempfile.tempdir
     bed_tempdir = os.path.join(work, "bed_tempdir_postprocess")
     if not os.path.exists(bed_tempdir):
@@ -310,8 +312,8 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense
     parser.add_argument('--lowqual_threshold', type=float,
                         help='SCORE for LowQual (PASS for lowqual_threshold <= score < pass_threshold)',
                         default=0.4)
-    parser.add_argument('--filter_duplicate',
-                        help='filter duplicate reads in analysis',
+    parser.add_argument('--keep_duplicate',
+                        help='Do not filter duplicate reads in analysis',
                         action="store_true")
     parser.add_argument('--msa_binary', type=str,
                         help='MSA binary', default="../bin/msa")
@@ -333,7 +335,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense
                     args.lr_gap_open_penalty, args.lr_gap_ext_penalty,
                     args.lr_max_realign_dp,
                     args.lr_do_split,
-                    args.filter_duplicate,
+                    args.keep_duplicate,
                     args.pass_threshold, args.lowqual_threshold,
                     args.msa_binary, args.num_threads)
diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py
index 487a024..dc39010 100755
--- a/neusomatic/python/preprocess.py
+++ b/neusomatic/python/preprocess.py
@@ -195,7 +195,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
               ins_merge_min_af, merge_r, truth_vcf, tsv_batch_size, matrix_width,
               matrix_base_pad, min_ev_frac_per_col,
               ensemble_tsv, long_read, restart, first_do_without_qual,
-              filter_duplicate,
+              keep_duplicate,
               add_extra_features,
               seq_complexity,
               no_feature_recomp_for_ensemble,
@@ -207,6 +207,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
     if restart or not os.path.exists(work):
         os.mkdir(work)

+    filter_duplicate = not keep_duplicate
+
     original_tempdir = tempfile.tempdir
     bed_tempdir = os.path.join(work, "bed_tempdir_preprocess")
     if not os.path.exists(bed_tempdir):
@@ -537,8 +539,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
     parser.add_argument('--first_do_without_qual',
                         help='Perform initial scan without calculating the quality stats',
                         action="store_true")
-    parser.add_argument('--filter_duplicate',
-                        help='filter duplicate reads when preparing pileup information',
+    parser.add_argument('--keep_duplicate',
+                        help='Do not filter duplicate reads when preparing pileup information',
                         action="store_true")
     parser.add_argument('--add_extra_features',
                         help='add extra input features',
@@ -565,7 +567,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                    args.ins_merge_min_af, args.merge_r, args.truth_vcf, args.tsv_batch_size,
                    args.matrix_width, args.matrix_base_pad, args.min_ev_frac_per_col,
                    args.ensemble_tsv, args.long_read, args.restart,
                    args.first_do_without_qual,
-                   args.filter_duplicate,
+                   args.keep_duplicate,
                    args.add_extra_features,
                    args.seq_complexity,
                    args.no_feature_recomp_for_ensemble,
diff --git a/test/NeuSomatic_ensemble.vcf b/test/NeuSomatic_ensemble.vcf
index 2302afe..e3a7d8b 100644
--- a/test/NeuSomatic_ensemble.vcf
+++ b/test/NeuSomatic_ensemble.vcf
@@ -14,11 +14,11 @@
 ##FORMAT=
 ##FORMAT=
 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
-22 21330787 . C T 26.9917 PASS SCORE=0.9980;DP=396;RO=306;AO=88;AF=0.2234 GT:DP:RO:AO:AF 0/1:396:306:88:0.2234
-22 21332122 . G A 28.5402 PASS SCORE=0.9986;DP=285;RO=223;AO=62;AF=0.2175 GT:DP:RO:AO:AF 0/1:285:223:62:0.2175
-22 21334924 . G C 17.5639 PASS SCORE=0.9825;DP=106;RO=83;AO=23;AF=0.217 GT:DP:RO:AO:AF 0/1:106:83:23:0.217
-22 21335259 . C A 19.7149 PASS SCORE=0.9893;DP=249;RO=200;AO=49;AF=0.1968 GT:DP:RO:AO:AF 0/1:249:200:49:0.1968
-22 21384516 . C T 27.6969 PASS SCORE=0.9983;DP=95;RO=68;AO=27;AF=0.2842 GT:DP:RO:AO:AF 0/1:95:68:27:0.2842
-22 21982892 . C T 21.5561 PASS SCORE=0.9930;DP=158;RO=113;AO=45;AF=0.2848 GT:DP:RO:AO:AF 0/1:158:113:45:0.2848
-22 21983260 . A G 31.5494 PASS SCORE=0.9993;DP=118;RO=74;AO=44;AF=0.3729 GT:DP:RO:AO:AF 0/1:118:74:44:0.3729
-22 21989959 . AAG A 33.0106 PASS SCORE=0.9995;DP=139;RO=107;AO=32;AF=0.2302 GT:DP:RO:AO:AF 0/1:139:107:32:0.2302
+22 21330787 . C T 26.9917 PASS SCORE=0.9980;DP=387;RO=298;AO=87;AF=0.226 GT:DP:RO:AO:AF 0/1:387:298:87:0.226
+22 21332122 . G A 28.5402 PASS SCORE=0.9986;DP=268;RO=209;AO=59;AF=0.2201 GT:DP:RO:AO:AF 0/1:268:209:59:0.2201
+22 21334924 . G C 17.6382 PASS SCORE=0.9828;DP=101;RO=78;AO=23;AF=0.2277 GT:DP:RO:AO:AF 0/1:101:78:23:0.2277
+22 21335259 . C A 19.7149 PASS SCORE=0.9893;DP=234;RO=190;AO=44;AF=0.188 GT:DP:RO:AO:AF 0/1:234:190:44:0.188
+22 21384516 . C T 27.9602 PASS SCORE=0.9984;DP=90;RO=64;AO=26;AF=0.2889 GT:DP:RO:AO:AF 0/1:90:64:26:0.2889
+22 21982892 . C T 21.4946 PASS SCORE=0.9929;DP=152;RO=109;AO=43;AF=0.2829 GT:DP:RO:AO:AF 0/1:152:109:43:0.2829
+22 21983260 . A G 31.5494 PASS SCORE=0.9993;DP=112;RO=70;AO=42;AF=0.375 GT:DP:RO:AO:AF 0/1:112:70:42:0.375
+22 21989959 . AAG A 33.0106 PASS SCORE=0.9995;DP=131;RO=99;AO=32;AF=0.2443 GT:DP:RO:AO:AF 0/1:131:99:32:0.2443
diff --git a/test/NeuSomatic_standalone.vcf b/test/NeuSomatic_standalone.vcf
index a7dd79f..bee861b 100644
--- a/test/NeuSomatic_standalone.vcf
+++ b/test/NeuSomatic_standalone.vcf
@@ -14,11 +14,11 @@
 ##FORMAT=
 ##FORMAT=
 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
-22 21330787 . C T 33.0111 PASS SCORE=0.9995;DP=396;RO=306;AO=88;AF=0.2234 GT:DP:RO:AO:AF 0/1:396:306:88:0.2234
-22 21332122 . G A 36.9903 PASS SCORE=0.9998;DP=285;RO=223;AO=62;AF=0.2175 GT:DP:RO:AO:AF 0/1:285:223:62:0.2175
-22 21334924 . G C 12.9061 PASS SCORE=0.9488;DP=106;RO=83;AO=23;AF=0.217 GT:DP:RO:AO:AF 0/1:106:83:23:0.217
-22 21335259 . C A 25.0876 PASS SCORE=0.9969;DP=249;RO=200;AO=49;AF=0.1968 GT:DP:RO:AO:AF 0/1:249:200:49:0.1968
-22 21384516 . C T 32.2191 PASS SCORE=0.9994;DP=95;RO=68;AO=27;AF=0.2842 GT:DP:RO:AO:AF 0/1:95:68:27:0.2842
-22 21982892 . C T 29.5872 PASS SCORE=0.9989;DP=158;RO=113;AO=45;AF=0.2848 GT:DP:RO:AO:AF 0/1:158:113:45:0.2848
-22 21983260 . A G 35.2289 PASS SCORE=0.9997;DP=118;RO=74;AO=44;AF=0.3729 GT:DP:RO:AO:AF 0/1:118:74:44:0.3729
-22 21989959 . AAG A 39.9993 PASS SCORE=0.9999;DP=139;RO=107;AO=32;AF=0.2302 GT:DP:RO:AO:AF 0/1:139:107:32:0.2302
+22 21330787 . C T 33.0111 PASS SCORE=0.9995;DP=387;RO=298;AO=87;AF=0.226 GT:DP:RO:AO:AF 0/1:387:298:87:0.226
+22 21332122 . G A 36.9903 PASS SCORE=0.9998;DP=268;RO=209;AO=59;AF=0.2201 GT:DP:RO:AO:AF 0/1:268:209:59:0.2201
+22 21334924 . G C 13.3787 PASS SCORE=0.9541;DP=101;RO=78;AO=23;AF=0.2277 GT:DP:RO:AO:AF 0/1:101:78:23:0.2277
+22 21335259 . C A 24.9497 PASS SCORE=0.9968;DP=234;RO=190;AO=44;AF=0.188 GT:DP:RO:AO:AF 0/1:234:190:44:0.188
+22 21384516 . C T 33.9800 PASS SCORE=0.9996;DP=90;RO=64;AO=26;AF=0.2889 GT:DP:RO:AO:AF 0/1:90:64:26:0.2889
+22 21982892 . C T 29.2094 PASS SCORE=0.9988;DP=152;RO=109;AO=43;AF=0.2829 GT:DP:RO:AO:AF 0/1:152:109:43:0.2829
+22 21983260 . A G 35.2289 PASS SCORE=0.9997;DP=112;RO=70;AO=42;AF=0.375 GT:DP:RO:AO:AF 0/1:112:70:42:0.375
+22 21989959 . AAG A 39.9993 PASS SCORE=0.9999;DP=131;RO=99;AO=32;AF=0.2443 GT:DP:RO:AO:AF 0/1:131:99:32:0.2443
From a54be6967fa4c82c382ed18ac426130f4400c021 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Thu, 7 May 2020 10:46:51 -0700
Subject: [PATCH 23/89] switched fisher test

---
 neusomatic/python/sequencing_features.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py
index c495681..679ab6d 100644
--- a/neusomatic/python/sequencing_features.py
+++ b/neusomatic/python/sequencing_features.py
@@ -8,17 +8,21 @@
 import genomic_file_handlers as genome
 from read_info_extractor import *
 from collections import defaultdict
+import fisher

 nan = float('nan')

+def fisher_exact_test(mat):
+    return fisher.pvalue(mat[0][0],mat[0][1],mat[1][0],mat[1][1]).two_tail
+

 class AlignmentFeatures:

     def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10):
         '''
         bam is the opened file handle of bam file
         my_coordiate is a list or tuple of 0-based (contig, position)
-        '''
-
+        '''
+
         indel_length = len(first_alt) - len(ref_base)

         reads = bam.fetch(my_coordinate[0], my_coordinate[1] - 1, my_coordinate[1])
@@ -115,9 +119,9 @@ class AlignmentFeatures:
         self.z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0]
         self.NM_Diff = self.alt_NM - self.ref_NM - abs(indel_length)

-        self.concordance_fet = stats.fisher_exact(concordance_counts)[1]
-        self.strandbias_fet = stats.fisher_exact(orientation_counts)[1]
-        self.clipping_fet = stats.fisher_exact(soft_clip_counts)[1]
+        self.concordance_fet = fisher_exact_test(concordance_counts)
+        self.strandbias_fet = fisher_exact_test(orientation_counts)
+        self.clipping_fet = fisher_exact_test(soft_clip_counts)

         self.z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0]

From 4f6bf57cfabf5f0eb9af2ef8d9a4cb5fd69daff8 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Sat, 9 May 2020 12:06:15 -0700
Subject: [PATCH 24/89] fix seq_complexity

---
 neusomatic/python/call.py                | 23 +++---
 neusomatic/python/extend_features.py     | 24 +++----
 neusomatic/python/generate_dataset.py    | 90 +++++++++++++-----------
 neusomatic/python/preprocess.py          | 48 +++++++++----
 neusomatic/python/sequencing_features.py |  2 +-
 neusomatic/python/train.py               | 36 +++++-----
 test/run_test.sh                         |  2 +
 7 files changed, 130 insertions(+), 95 deletions(-)

diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py
index 2e60f47..e8e8fcb 100755
--- a/neusomatic/python/call.py
+++ b/neusomatic/python/call.py
@@ -423,24 +423,29 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,
         normalize_channels = pretrained_dict["normalize_channels"]
     else:
         normalize_channels = False
-    if "seq_complexity" in pretrained_dict:
-        seq_complexity = pretrained_dict["seq_complexity"]
+    if "no_seq_complexity" in pretrained_dict:
+        no_seq_complexity = pretrained_dict["no_seq_complexity"]
     else:
-        seq_complexity = False
+        no_seq_complexity = True

     logger.info("coverage_thr: {}".format(coverage_thr))
     logger.info("normalize_channels: {}".format(normalize_channels))
-    logger.info("seq_complexity: {}".format(seq_complexity))
+    logger.info("no_seq_complexity: {}".format(no_seq_complexity))

     expected_ens_fields = NUM_ENS_FEATURES
-    if seq_complexity:
+    if not no_seq_complexity:
         expected_ens_fields += 2
+
+    logger.info("expected_ens_fields: {}".format(expected_ens_fields))
+
     ensemble = False
-    with open(candidates_tsv[0]) as i_f:
-        x = i_f.readline().strip().split()
-        if len(x) == expected_ens_fields + 4:
-            ensemble = True
+    for tsv in candidates_tsv:
+        with open(tsv) as i_f:
+            x = i_f.readline().strip().split()
+            if len(x) == expected_ens_fields + 4:
+                ensemble = True
+                break
     num_channels = expected_ens_fields + \
         NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES
     logger.info("Number of channels: {}".format(num_channels))

diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py
index c84261d..79743a7 100755
--- a/neusomatic/python/extend_features.py
+++ b/neusomatic/python/extend_features.py
@@ -20,7 +20,7 @@


 def extract_features(candidate_record):
-    reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, seq_complexity, batch = candidate_record
+    reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, no_seq_complexity, batch = candidate_record
     thread_logger = logging.getLogger(
         "{} ({})".format(extract_features.__name__, multiprocessing.current_process().name))
     try:
@@ -48,7 +48,7 @@ def extract_features(candidate_record):

             indel_length = len(alt) - len(ref)

-            if seq_complexity:
+            if not no_seq_complexity:
                 seq_span_80bp = ref_fa.fetch(my_coordinate[0], max(
                     0, my_coordinate[1] - 41), my_coordinate[1] + 40)
                 seq_left_80bp = ref_fa.fetch(my_coordinate[0], max(
@@ -88,7 +88,7 @@ def extract_features(candidate_record):
             Consistent_Mates = tBamFeatures.consistent_mates
             Inconsistent_Mates = tBamFeatures.inconsistent_mates
-            if seq_complexity:
+            if not no_seq_complexity:
                 Seq_Complexity_Span = LC_spanning_phred
                 Seq_Complexity_Adj = LC_adjacent_phred

@@ -171,7 +171,7 @@ def extract_features(candidate_record):
             features = [CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT,
                         Consistent_Mates, Inconsistent_Mates]
-            if seq_complexity:
+            if not no_seq_complexity:
                 features.extend([Seq_Complexity_Span, Seq_Complexity_Adj])
             features.extend([N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ,
                              nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff,
@@ -204,7 +204,7 @@ def extend_features(candidates_vcf,
                     reference, tumor_bam, normal_bam,
                     min_mapq, min_bq,
                     dbsnp, cosmic,
-                    seq_complexity,
+                    no_seq_complexity,
                     num_threads):

     logger = logging.getLogger(extend_features.__name__)
@@ -303,7 +303,7 @@ def extend_features(candidates_vcf,
             batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
             if len(batch) >= split_len:
                 map_args.append((reference, tumor_bam, normal_bam,
-                                 min_mapq, min_bq, dbsnp, seq_complexity, batch))
+                                 min_mapq, min_bq, dbsnp, no_seq_complexity, batch))
                 batch = []
@@ -318,16 +318,16 @@ def extend_features(candidates_vcf,
             batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
             if len(batch) >= split_len:
                 map_args.append((reference, tumor_bam, normal_bam,
-                                 min_mapq, min_bq, dbsnp, seq_complexity, batch))
+                                 min_mapq, min_bq, dbsnp, no_seq_complexity, batch))
                 batch = []
     if batch:
         map_args.append((reference, tumor_bam, normal_bam,
-                         min_mapq, min_bq, dbsnp, seq_complexity, batch))
+                         min_mapq, min_bq, dbsnp, no_seq_complexity, batch))

     logger.info("Number of batches: {}".format(len(map_args)))
     header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
               "Consistent_Mates", "Inconsistent_Mates"]
-    if seq_complexity:
+    if not no_seq_complexity:
         header.extend(["Seq_Complexity_Span", "Seq_Complexity_Adj"])
     header.extend(["N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ", "nBAM_Z_Ranksums_MQ",
                    "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM",
                    "nBAM_ALT_NM", "nBAM_NM_Diff",
@@ -392,8 +392,8 @@ def extend_features(candidates_vcf,
                         help='dbSNP vcf (to annotate candidate variants)', default=None)
     parser.add_argument('--cosmic', type=str,
                         help='COSMIC vcf (to annotate candidate variants)', default=None)
-    parser.add_argument('--seq_complexity',
-                        help='Compute linguistic sequence complexity features',
+    parser.add_argument('--no_seq_complexity',
+                        help='Do not compute linguistic sequence complexity features',
                         action="store_true")
     parser.add_argument('--num_threads', type=int,
                         help='number of threads', default=1)
@@ -408,7 +408,7 @@ def extend_features(candidates_vcf,
                                  args.reference, args.tumor_bam, args.normal_bam,
                                  args.min_mapq, args.min_bq,
                                  args.dbsnp, args.cosmic,
-                                 args.seq_complexity,
+                                 args.no_seq_complexity,
                                  args.num_threads,
                                  )
     if output is None:
diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 870a999..5f9c6b8 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -826,7 +826,7 @@ def find_len(ref, alt):


 def find_records(input_record):
-    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, seq_complexity, work_index = input_record
+    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, no_seq_complexity, work_index = input_record
     thread_logger = logging.getLogger(
         "{} ({})".format(find_records.__name__, multiprocessing.current_process().name))
     try:
@@ -849,7 +849,7 @@ def find_records(input_record):
             work, "in_ensemble_{}.bed".format(work_index))

         num_ens_features = NUM_ENS_FEATURES
-        if seq_complexity:
+        if not no_seq_complexity:
             num_ens_features += 2
         bedtools_intersect(
             truth_vcf_file, split_bed, args=" -u", output_fn=split_truth_vcf_file, run_logger=thread_logger)
@@ -1326,7 +1326,7 @@ def find_records(input_record):
         return None


-def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header, is_extend):
+def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_header, is_extend):
     logger = logging.getLogger(extract_ensemble.__name__)
     ensemble_data = []
     ensemble_pos = []
@@ -1339,7 +1339,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header,
                          "if_TNscope", "Strelka_Score", "Strelka_QSS", "Strelka_TQSS", "VarScan2_Score",
                          "SNVMix2_Score", "Sniper_Score", "VarDict_Score", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
                          "Consistent_Mates", "Inconsistent_Mates"]
-    if seq_complexity:
+    if not no_seq_complexity:
         expected_features += ["Seq_Complexity_Span", "Seq_Complexity_Adj"]

     expected_features += ["N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ",
@@ -1366,37 +1366,42 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header,
     n_vars = 0
-    with open(ensemble_tsv) as s_f:
-        for line in skip_empty(s_f):
-            if line.startswith("CHROM"):
-                header_pos = line.strip().split()[0:5]
-                header_ = line.strip().split()[5:]
+    all_headers = set([])
+    for ensemble_tsv in ensemble_tsvs:
+        with open(ensemble_tsv) as s_f:
+            for line in skip_empty(s_f):
+                if line.startswith("CHROM"):
+                    all_headers.add(line)
+                    header_pos = line.strip().split()[0:5]
+                    header_ = line.strip().split()[5:]
+                    if is_extend:
+                        header_ += callers_features
+                    header_en = list(filter(
+                        lambda x: x[1] in expected_features, enumerate(header_)))
+                    header = list(map(lambda x: x[1], header_en))
+                    if not enforce_header:
+                        expected_features = header
+
+                    if set(expected_features) - set(header):
+                        logger.error("The following features are missing from ensemble file {}: {}".format(
+                            ensemble_tsv,
+                            list(set(expected_features) - set(header))))
+                        raise Exception
+                    order_header = []
+                    for f in expected_features:
+                        order_header.append(header_en[header.index(f)][0])
+                    continue
-                if is_extend:
-                    header_ += callers_features
-                header_en = list(filter(
-                    lambda x: x[1] in expected_features, enumerate(header_)))
-                header = list(map(lambda x: x[1], header_en))
-                if not enforce_header:
-                    expected_features = header
-
-                if set(expected_features) - set(header):
-                    logger.error("The following features are missing from ensemble file {}: {}".format(
-                        ensemble_tsv,
-                        list(set(expected_features) - set(header))))
-                    raise Exception
-                order_header = []
-                for f in expected_features:
-                    order_header.append(header_en[header.index(f)][0])
-                continue
-            fields = line.strip().split()
-            fields[2] = str(int(fields[1]) + len(fields[3]))
-            ensemble_pos.append(fields[0:5])
-            features = fields[5:]
+                fields = line.strip().split()
+                fields[2] = str(int(fields[1]) + len(fields[3]))
+                ensemble_pos.append(fields[0:5])
+                features = fields[5:]
                 if is_extend:
-                features += ["0"] * len(callers_features)
-            ensemble_data.append(list(map(lambda x: float(
-                x.replace("False", "0").replace("True", "1")), features)))
-            n_vars += 1
+                    features += ["0"] * len(callers_features)
+                ensemble_data.append(list(map(lambda x: float(
+                    x.replace("False", "0").replace("True", "1")), features)))
+                n_vars += 1
+    if len(set(all_headers)) != 1:
+        raise(RuntimeError("inconsistent headers in {}".format(ensemble_tsvs)))
     if n_vars > 0:
         ensemble_data = np.array(ensemble_data)[:, order_header]
         header = np.array(header_)[order_header].tolist()
@@ -1479,7 +1484,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header,
                         [SiteHomopolymer_Length, 0, 50],
                         [InDel_Length, -30, 30],
                         ]
-    if seq_complexity:
+    if not no_seq_complexity:
         min_max_features.append([Seq_Complexity_, 0, 40])

     selected_features = sorted([i for f in min_max_features for i in f[0]])
@@ -1503,7 +1508,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header,

 def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed,
                      normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                      min_cov, num_threads, ensemble_tsv,
-                     ensemble_bed, seq_complexity, enforce_header, tsv_batch_size):
+                     ensemble_bed, no_seq_complexity, enforce_header, tsv_batch_size):
     logger = logging.getLogger(generate_dataset.__name__)

     logger.info("---------------------Generate Dataset----------------------")
@@ -1530,7 +1535,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     split_batch_size = 10000
     if ensemble_tsv and not ensemble_bed:
         ensemble_bed = os.path.join(work, "ensemble.bed")
-        extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header, False)
+        extract_ensemble([ensemble_tsv], ensemble_bed,
+                         no_seq_complexity, enforce_header, False)

     tmp_ = bedtools_intersect(
         tumor_pred_vcf_file, region_bed_file, args=" -u", run_logger=logger)
@@ -1557,7 +1563,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     map_args = []
     for i, split_region_file in enumerate(split_region_files):
         map_args.append((work, split_region_file, truth_vcf_file,
-                         tumor_pred_vcf_file, ref_file, ensemble_bed, seq_complexity, i))
+                         tumor_pred_vcf_file, ref_file, ensemble_bed, no_seq_complexity, i))
     try:
         records_data = pool.map_async(find_records, map_args).get()
pool.close() @@ -1742,8 +1748,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be help='Ensemble annotation tsv file (only for short read)', default=None) parser.add_argument('--ensemble_bed', type=str, help='Ensemble annotation bed file (only for short read)', default=None) - parser.add_argument('--seq_complexity', - help='Compute linguistic sequence complexity features', + parser.add_argument('--no_seq_complexity', + help='Dont compute linguistic sequence complexity features', action="store_true") parser.add_argument('--enforce_header', help='Enforce header match for ensemble_tsv', @@ -1766,13 +1772,13 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be num_threads = args.num_threads ensemble_tsv = args.ensemble_tsv ensemble_bed = args.ensemble_bed - seq_complexity = args.seq_complexity + no_seq_complexity = args.no_seq_complexity tsv_batch_size = args.tsv_batch_size try: generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv, - ensemble_bed, seq_complexity, enforce_header, tsv_batch_size) + ensemble_bed, no_seq_complexity, enforce_header, tsv_batch_size) except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index dc39010..11410d1 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -79,12 +79,12 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, - matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, seq_complexity, + matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size): logger = logging.getLogger(generate_dataset_region.__name__) generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, None, ensemble_bed, - seq_complexity, + no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) @@ -197,7 +197,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, ensemble_tsv, long_read, restart, first_do_without_qual, keep_duplicate, add_extra_features, - seq_complexity, + no_seq_complexity, no_feature_recomp_for_ensemble, num_threads, scan_alignments_binary,): @@ -245,7 +245,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, ensemble_bed = os.path.join(work, "ensemble.bed") logger.info("Extract ensemble info.") if restart or not os.path.exists(ensemble_bed): - extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, no_feature_recomp_for_ensemble, False) + extract_ensemble([ensemble_tsv], ensemble_bed, + no_seq_complexity, no_feature_recomp_for_ensemble, False) merge_d_for_short_read = 100 candidates_split_regions = [] @@ -338,23 +339,38 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_tumor_i = os.path.dirname(filtered_vcf) extra_features_tsv = os.path.join( work_tumor_i, "extra_features.tsv") + ex_tsvs = [extra_features_tsv] if not os.path.exists(extra_features_tsv) or restart: 
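The `restart or not os.path.exists(...)` guard recurring through preprocess.py makes each stage idempotent: expensive artifacts are rebuilt only when absent or when --restart forces it. Sketched on its own (build_fn is a hypothetical writer):

    import os

    def maybe_build(output_path, restart, build_fn):
        # recompute only if the artifact is missing or --restart was given
        if restart or not os.path.exists(output_path):
            build_fn(output_path)
        return output_path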
extend_features(filtered_vcf, ensemble_beds[ i] if (ensemble_tsv and no_feature_recomp_for_ensemble) else None, - ensemble_beds[ - i] if (ensemble_tsv and not no_feature_recomp_for_ensemble) else None, + None, extra_features_tsv, reference, tumor_bam, normal_bam, min_mapq, snp_min_bq, dbsnp, None, - seq_complexity, + no_seq_complexity, num_threads) + if ensemble_tsv and not no_feature_recomp_for_ensemble: + extra_features_others_tsv = os.path.join( + work_tumor_i, "extra_features_others.tsv") + ex_tsvs.append(extra_features_others_tsv) + if not os.path.exists(extra_features_others_tsv) or restart: + extend_features(ensemble_beds[i], + extra_features_tsv, + None, + extra_features_others_tsv, + reference, tumor_bam, normal_bam, + min_mapq, snp_min_bq, + dbsnp, None, + no_seq_complexity, + num_threads) + extra_features_bed = os.path.join( work_dataset_split, "extra_features.bed") if not os.path.exists(extra_features_bed) or restart: - extract_ensemble(extra_features_tsv, - extra_features_bed, seq_complexity, True, True) + extract_ensemble(ex_tsvs, + extra_features_bed, no_seq_complexity, True, True) if ensemble_tsv: merged_features_bed = os.path.join( work_dataset_split, "merged_features.bed") @@ -436,9 +452,11 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, order_header = [] for f in header_caller_: if f not in header_2: - logger.info("Missing header field {}".format(f)) + logger.info( + "Missing header field {}".format(f)) raise Exception - order_header.append(header_2.index(f)) + order_header.append( + header_2.index(f)) o_f.write(line) header_2_found = True @@ -460,7 +478,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf, candidates_split_region, tumor_count, normal_count, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, - ensemble_bed_i, seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) + ensemble_bed_i, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) shutil.rmtree(bed_tempdir) tempfile.tempdir = original_tempdir @@ -545,8 +563,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--add_extra_features', help='add extra input features', action="store_true") - parser.add_argument('--seq_complexity', - help='Compute linguistic sequence complexity features', + parser.add_argument('--no_seq_complexity', + help='Dont compute linguistic sequence complexity features', action="store_true") parser.add_argument('--no_feature_recomp_for_ensemble', help='Do not recompute features for ensemble_tsv', @@ -569,7 +587,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.ensemble_tsv, args.long_read, args.restart, args.first_do_without_qual, args.keep_duplicate, args.add_extra_features, - args.seq_complexity, + args.no_seq_complexity, args.no_feature_recomp_for_ensemble, args.num_threads, args.scan_alignments_binary) diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 679ab6d..7e90d1a 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -43,7 +43,7 @@ def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10) qname_collector = defaultdict(list) for read_i in reads: - if read_i.is_unmapped or not dedup_test(read_i): + if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: 
continue dp += 1 diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 8ddc301..966d7f4 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -203,7 +203,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo merged_candidates_per_tsv, merged_max_num_tsvs, overwrite_merged_tsvs, train_split_len, normalize_channels, - seq_complexity, + no_seq_complexity, use_cuda): logger = logging.getLogger(train_neusomatic.__name__) @@ -238,12 +238,12 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo normalize_channels = False logger.info( "Override normalize_channels from pretrained checkpoint: {}".format(normalize_channels)) - if "seq_complexity" in pretrained_dict: - seq_complexity = pretrained_dict["seq_complexity"] + if "no_seq_complexity" in pretrained_dict: + no_seq_complexity = pretrained_dict["no_seq_complexity"] else: - seq_complexity = False + no_seq_complexity = True logger.info( - "Override seq_complexity from pretrained checkpoint: {}".format(seq_complexity)) + "Override no_seq_complexity from pretrained checkpoint: {}".format(no_seq_complexity)) prev_epochs = sofar_epochs + 1 else: prev_epochs = 0 @@ -252,14 +252,18 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo logger.info("tag: {}".format(tag)) expected_ens_fields = NUM_ENS_FEATURES - if seq_complexity: + if not no_seq_complexity: expected_ens_fields += 2 + logger.info("expected_ens_fields: {}".format(expected_ens_fields)) + ensemble = False - with open(candidates_tsv[0]) as i_f: - x = i_f.readline().strip().split() - if len(x) == expected_ens_fields + 4: - ensemble = True + for tsv in candidates_tsv: + with open(tsv) as i_f: + x = i_f.readline().strip().split() + if len(x) == expected_ens_fields + 4: + ensemble = True + break num_channels = expected_ens_fields + \ NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES @@ -418,7 +422,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "epoch": curr_epoch, "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, - "seq_complexity": seq_complexity + "no_seq_complexity": no_seq_complexity }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if len(train_sets) == 1: @@ -484,7 +488,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "epoch": curr_epoch, "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, - "seq_complexity": seq_complexity, + "no_seq_complexity": no_seq_complexity, }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if validation_candidates_tsv: test(net, curr_epoch, validation_loader, use_cuda) @@ -503,7 +507,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "epoch": curr_epoch, "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, - "seq_complexity": seq_complexity, + "no_seq_complexity": no_seq_complexity, }, '{}/models/checkpoint_{}_epoch{}.pth'.format( out_dir, tag, curr_epoch)) if validation_candidates_tsv: @@ -578,8 +582,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo help='normalize BQ, MQ, and other bam-info channels by frequency of observed alleles. 
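Settings such as no_seq_complexity ride along inside the training checkpoint so that call time can reproduce the training configuration; a toy round-trip, assuming torch and a dict-shaped checkpoint like the one train.py saves (path is illustrative):

    import torch

    checkpoint = {"state_dict": {}, "coverage_thr": 100,
                  "normalize_channels": True, "no_seq_complexity": True}
    torch.save(checkpoint, "/tmp/ckpt.pth")

    loaded = torch.load("/tmp/ckpt.pth")
    # older checkpoints predate the field, hence the default
    print(loaded.get("no_seq_complexity", True))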
\ Will be overridden if pretrained model is provided', action="store_true") - parser.add_argument('--seq_complexity', - help='Compute linguistic sequence complexity features', + parser.add_argument('--no_seq_complexity', + help='Dont compute linguistic sequence complexity features', action="store_true") args = parser.parse_args() @@ -598,7 +602,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo args.merged_candidates_per_tsv, args.merged_max_num_tsvs, args.overwrite_merged_tsvs, args.train_split_len, args.normalize_channels, - args.seq_complexity, + args.no_seq_complexity, use_cuda) except Exception as e: logger.error(traceback.format_exc()) diff --git a/test/run_test.sh b/test/run_test.sh index 850132e..22ebbe8 100755 --- a/test/run_test.sh +++ b/test/run_test.sh @@ -36,6 +36,7 @@ python ${neusomatic_dir}/neusomatic/python/preprocess.py \ --ins_min_af 0.05 \ --del_min_af 0.05 \ --num_threads 1 \ + --no_seq_complexity \ --scan_alignments_binary ${neusomatic_dir}/neusomatic/bin/scan_alignments CUDA_VISIBLE_DEVICES= python ${neusomatic_dir}/neusomatic/python/call.py \ @@ -73,6 +74,7 @@ python ${neusomatic_dir}/neusomatic/python/preprocess.py \ --del_min_af 0.05 \ --num_threads 1 \ --ensemble_tsv ${test_dir}/ensemble.tsv \ + --no_seq_complexity \ --scan_alignments_binary ${neusomatic_dir}/neusomatic/bin/scan_alignments CUDA_VISIBLE_DEVICES= python ${neusomatic_dir}/neusomatic/python/call.py \ From dbe9c4af198086faf2d9f29de197cee970945457 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 9 May 2020 12:29:53 -0700 Subject: [PATCH 25/89] fix num fields --- neusomatic/python/call.py | 16 +++++++++++++--- neusomatic/python/train.py | 15 ++++++++++++--- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index e8e8fcb..a5d6468 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -439,13 +439,23 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("expected_ens_fields: {}".format(expected_ens_fields)) + expected_st_fields = 4 + + logger.info("expected_st_fields: {}".format(expected_st_fields)) + ensemble = False for tsv in candidates_tsv: with open(tsv) as i_f: x = i_f.readline().strip().split() - if len(x) == expected_ens_fields + 4: - ensemble = True - break + if x: + if len(x) == expected_ens_fields + 4: + ensemble = True + break + elif len(x) == 4: + break + else: + raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) + num_channels = expected_ens_fields + \ NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 966d7f4..9124cb9 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -257,13 +257,22 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo logger.info("expected_ens_fields: {}".format(expected_ens_fields)) + expected_st_fields = 4 + + logger.info("expected_st_fields: {}".format(expected_st_fields)) + ensemble = False for tsv in candidates_tsv: with open(tsv) as i_f: x = i_f.readline().strip().split() - if len(x) == expected_ens_fields + 4: - ensemble = True - break + if x: + if len(x) == expected_ens_fields + 4: + ensemble = True + break + elif len(x) == 4: + break + else: + raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) num_channels = expected_ens_fields + \ NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES From cb64681a5f6f248912966ce26f609600473e84c4 Mon Sep 
17 00:00:00 2001 From: Sahraeian Date: Sat, 9 May 2020 19:05:48 -0700 Subject: [PATCH 26/89] zero anns columns added --- neusomatic/python/call.py | 20 +++++++++++++++++- neusomatic/python/dataloader.py | 7 +++++++ neusomatic/python/train.py | 37 ++++++++++++++++++++++++++++++--- 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index a5d6468..18b3969 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -396,6 +396,7 @@ def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_thr def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, batch_size, max_load_candidates, pass_threshold, lowqual_threshold, + force_zero_ann_cols, use_cuda): logger = logging.getLogger(call_neusomatic.__name__) @@ -427,10 +428,20 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, no_seq_complexity = pretrained_dict["no_seq_complexity"] else: no_seq_complexity = True + if "zero_ann_cols" in pretrained_dict: + zero_ann_cols = pretrained_dict["zero_ann_cols"] + else: + zero_ann_cols = [] + + if force_zero_ann_cols: + logger.info( + "Override zero_ann_cols from force_zero_ann_cols: {}".format(force_zero_ann_cols)) + zero_ann_cols = force_zero_ann_cols logger.info("coverage_thr: {}".format(coverage_thr)) logger.info("normalize_channels: {}".format(normalize_channels)) logger.info("no_seq_complexity: {}".format(no_seq_complexity)) + logger.info("zero_ann_cols: {}".format(zero_ann_cols)) expected_ens_fields = NUM_ENS_FEATURES @@ -554,7 +565,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, transform=data_transform, is_test=True, num_threads=num_threads, coverage_thr=coverage_thr, - normalize_channels=normalize_channels) + normalize_channels=normalize_channels, + zero_ann_cols=zero_ann_cols) call_loader = torch.utils.data.DataLoader(call_set, batch_size=batch_size, shuffle=True, pin_memory=True, @@ -634,6 +646,11 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, parser.add_argument('--lowqual_threshold', type=float, help='SCORE for LowQual (PASS for lowqual_threshold <= score < pass_threshold)', default=0.4) + parser.add_argument('--force_zero_ann_cols', nargs="*", type=int, + help='force columns to be set to zero in the annotations. 
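What zero_ann_cols does to one loaded annotation vector, in isolation (toy numbers; indices are relative to the annotation columns, i.e., everything after the first five TSV fields):

    import numpy as np

    def zero_out(anns, zero_ann_cols):
        # mirrors the dataloader logic: blank selected annotation
        # columns so the network effectively ignores them
        if zero_ann_cols and len(anns) > 0:
            anns = np.array(anns, dtype=float)
            anns[zero_ann_cols] = 0
            anns = anns.tolist()
        return anns

    print(zero_out([0.3, 7.0, 1.0, 42.0], [1, 3]))  # [0.3, 0.0, 1.0, 0.0]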
Higher priority than \ + --zero_ann_cols and pretrained setting.\ + idx starts from 5th column in candidate.tsv file', + default=[]) args = parser.parse_args() use_cuda = torch.cuda.is_available() @@ -644,6 +661,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, args.checkpoint, args.num_threads, args.batch_size, args.max_load_candidates, args.pass_threshold, args.lowqual_threshold, + args.force_zero_ann_cols, use_cuda) except Exception as e: logger.error(traceback.format_exc()) diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py index 05ea953..c2450dd 100755 --- a/neusomatic/python/dataloader.py +++ b/neusomatic/python/dataloader.py @@ -131,6 +131,7 @@ def __init__(self, roots, max_load_candidates, transform=None, num_threads=1, disable_ensemble=False, data_augmentation=False, nclasses_t=4, nclasses_l=4, coverage_thr=100, normalize_channels=False, + zero_ann_cols=[], max_opended_tsv=-1): soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) @@ -141,6 +142,7 @@ def __init__(self, roots, max_load_candidates, transform=None, max_opended_tsv = min(max_opended_tsv, soft) self.max_opended_tsv = max_opended_tsv self.normalize_channels = normalize_channels + self.zero_ann_cols = zero_ann_cols self.da_shift_p = 0.3 self.da_base_p = 0.05 self.da_rev_p = 0.1 @@ -264,6 +266,11 @@ def __getitem__(self, index): if self.disable_ensemble: anns = [] + if self.zero_ann_cols and len(anns)>0: + anns=np.array(anns) + anns[self.zero_ann_cols] = 0 + anns = anns.tolist() + tag = path.split("/")[-1] _, _, _, _, vartype, center, length, tumor_cov, normal_cov = tag.split( ".") diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 9124cb9..0211443 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -204,6 +204,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo train_split_len, normalize_channels, no_seq_complexity, + zero_ann_cols, + force_zero_ann_cols, use_cuda): logger = logging.getLogger(train_neusomatic.__name__) @@ -244,6 +246,13 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo no_seq_complexity = True logger.info( "Override no_seq_complexity from pretrained checkpoint: {}".format(no_seq_complexity)) + if "zero_ann_cols" in pretrained_dict: + zero_ann_cols = pretrained_dict["zero_ann_cols"] + else: + zero_ann_cols = [] + if not force_zero_ann_cols: + logger.info( + "Override zero_ann_cols from pretrained checkpoint: {}".format(zero_ann_cols)) prev_epochs = sofar_epochs + 1 else: prev_epochs = 0 @@ -251,6 +260,12 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo tag = "neusomatic_{}".format(time_now) logger.info("tag: {}".format(tag)) + if force_zero_ann_cols: + zero_ann_cols = force_zero_ann_cols + logger.info( + "Override zero_ann_cols from force_zero_ann_cols: {}".format(force_zero_ann_cols)) + + expected_ens_fields = NUM_ENS_FEATURES if not no_seq_complexity: expected_ens_fields += 2 @@ -354,7 +369,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo max_load_candidates * len(tsvs) / float(len(candidates_tsv))), transform=data_transform, is_test=False, num_threads=num_threads, coverage_thr=coverage_thr, - normalize_channels=normalize_channels) + normalize_channels=normalize_channels, + zero_ann_cols=zero_ann_cols) train_sets.append(train_set) none_indices = train_set.get_none_indices() var_indices = train_set.get_var_indices() @@ -387,7 +403,8 @@ def 
train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo max_load_candidates=max_load_candidates, transform=data_transform, is_test=True, num_threads=num_threads, coverage_thr=coverage_thr, - normalize_channels=normalize_channels) + normalize_channels=normalize_channels, + zero_ann_cols=zero_ann_cols) validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=batch_size, shuffle=True, num_workers=num_threads, pin_memory=True) @@ -431,7 +448,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "epoch": curr_epoch, "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, - "no_seq_complexity": no_seq_complexity + "no_seq_complexity": no_seq_complexity, + "zero_ann_cols": zero_ann_cols, }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if len(train_sets) == 1: @@ -498,6 +516,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, + "zero_ann_cols": zero_ann_cols, }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if validation_candidates_tsv: test(net, curr_epoch, validation_loader, use_cuda) @@ -517,6 +536,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, + "zero_ann_cols": zero_ann_cols, }, '{}/models/checkpoint_{}_epoch{}.pth'.format( out_dir, tag, curr_epoch)) if validation_candidates_tsv: @@ -594,6 +614,15 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo parser.add_argument('--no_seq_complexity', help='Dont compute linguistic sequence complexity features', action="store_true") + parser.add_argument('--zero_ann_cols', nargs="*", type=int, + help='columns to be set to zero in the annotations \ + idx starts from 5th column in candidate.tsv file', + default=[]) + parser.add_argument('--force_zero_ann_cols', nargs="*", type=int, + help='force columns to be set to zero in the annotations. Higher priority than \ + --zero_ann_cols and pretrained setting \ + idx starts from 5th column in candidate.tsv file', + default=[]) args = parser.parse_args() logger.info(args) @@ -612,6 +641,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo args.overwrite_merged_tsvs, args.train_split_len, args.normalize_channels, args.no_seq_complexity, + args.zero_ann_cols, + args.force_zero_ann_cols, use_cuda) except Exception as e: logger.error(traceback.format_exc()) From 97d16f6ca0c6423631c26dd377cfe4670d7ebf31 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sun, 10 May 2020 01:41:33 -0700 Subject: [PATCH 27/89] fix bug in read_info_extractor.py as in somaticseq --- neusomatic/python/read_info_extractor.py | 62 +++++++++++++----------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index b6db804..5c005f4 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -20,28 +20,28 @@ ### PYSAM ### -def position_of_aligned_read(read_i, target_position): +def position_of_aligned_read(read_i, target_position, win_size=3): ''' - Return the base call of the target position, or if it's a start of insertion/deletion. 
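The patch below replaces the old switch/displacement walk with two explicit scans over aligned_pairs, up to win_size steps to the left and right of the target: a pysam aligned pair with a None reference position is an insertion or soft-clip, one with a None query position is a deletion. The same walk on a toy aligned_pairs-style list of (query_pos, ref_pos) tuples:

    def nearest_flanking_indel(aligned_pairs, idx, win_size=3):
        # distance (1..win_size) from the aligned base at index `idx`
        # to the closest indel signature; inf if none in the window
        inf = float('inf')
        left = right = inf
        for step in range(1, win_size + 1):
            j = idx - step
            if j >= 0 and (aligned_pairs[j][0] is None or aligned_pairs[j][1] is None):
                left = step
                break
        for step in range(1, win_size + 1):
            j = idx + step
            if j < len(aligned_pairs) and (aligned_pairs[j][0] is None or aligned_pairs[j][1] is None):
                right = step
                break
        return min(left, right)

    pairs = [(0, 100), (1, 101), (2, 102), (None, 103), (3, 104)]
    print(nearest_flanking_indel(pairs, 1))  # 2: deletion two steps right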
+    Return the base call of the target position, and if it's a start of insertion/deletion.
     This target position follows pysam convension, i.e., 0-based.
     In VCF files, deletions/insertions occur AFTER the position.

     Return (Code, seq_i, base_at_target, indel_length, nearest insertion/deletion)
-    The first number in result is a code:
-        1) Match to reference, which is either a reference read or a SNV/SNP
-        2) Deletion after the target position
-        3) Insertion after the target position
-        0) The target position does not match to reference, and may be discarded for "reference/alternate" read count purposes, but can be kept for "inconsistent read" metrics.
+    The first number in result is a code:
+        1: Match to reference on CIGAR, which is either a reference read or a SNV (substitution counts as M in CIGAR)
+        2: Deletion after the target position
+        3: Insertion after the target position
+        0: The target position does not match to reference, and may be discarded for "reference/alternate" read count purposes, but can be kept for "inconsistent read" metrics.
     '''
     flanking_deletion, flanking_insertion = nan, nan
-    aligned_pairs=read_i.get_aligned_pairs()
+    aligned_pairs = read_i.get_aligned_pairs()
     for i, align_i in enumerate(aligned_pairs):
         # If find a match:
         if align_i[1] == target_position:
             seq_i = align_i[0]
+            idx_aligned_pair = i
             break
     # If the target position is aligned:
@@ -99,27 +99,33 @@ def position_of_aligned_read(read_i, target_position):
             code = 0
             base_at_target, indel_length, flanking_indel = None, None, None
-    # See if there is insertion/deletion within 5 bp of "i":
+    # See if there is insertion/deletion within 5 bp of "seq_i" on the query.
+    # seq_i is the i_th aligned base
     if isinstance(indel_length, int):
-        flanking_indel = inf
-        left_side_start = seq_i
-        right_side_start = seq_i + abs(indel_length) + 1
-        switch = 1
-        for j in (3, 2, 1):
-            for indel_seeker_i in left_side_start, right_side_start:
-
-                switch = switch * -1
-                displacement = j * switch
-                seq_j = indel_seeker_i + displacement
-
-                if 0 <= seq_j < len(aligned_pairs):
-
-                    # If the reference position has no base aligned to it, it's a deletion.
-                    # On the other hand, if the base has no reference base
-                    # aligned to it, it's an insertion.
-                    if aligned_pairs[seq_j][1] == None or aligned_pairs[seq_j][0] == None:
-                        flanking_indel = j
-                        break
+        right_indel_flanks = inf
+        left_indel_flanks = inf
+        left_side_start = idx_aligned_pair - 1
+        right_side_start = idx_aligned_pair + abs(indel_length) + 1
+
+        #(i, None) = Insertion (or Soft-clips), i.e., means the i_th base in the query is not aligned to a reference
+        #(None, coordinate) = Deletion, i.e., there is no base in it that aligns to this coordinate.
+        # If those two scenarios occur right after an aligned base, that
+        # base position is counted as an indel.
+ for step_right_i in range(min(win_size, len(aligned_pairs) - right_side_start - 1)): + j = right_side_start + step_right_i + + if (aligned_pairs[j + 1][1] == None or aligned_pairs[j + 1][0] == None): + right_indel_flanks = step_right_i + 1 + break + + for step_left_i in range(min(win_size, left_side_start)): + j = left_side_start - step_left_i + + if (aligned_pairs[j][1] == None or aligned_pairs[j][0] == None): + left_indel_flanks = step_left_i + 1 + break + flanking_indel = min(left_indel_flanks, right_indel_flanks) + else: flanking_indel = None From 018e87ac350806b3a4441255676179ad1525ca0a Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Mon, 11 May 2020 21:13:55 -0700 Subject: [PATCH 28/89] cluster variants for feature extraction --- neusomatic/python/extend_features.py | 382 ++++++++++++----------- neusomatic/python/preprocess.py | 7 + neusomatic/python/sequencing_features.py | 106 +++++-- 3 files changed, 285 insertions(+), 210 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 79743a7..a952c00 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -16,7 +16,7 @@ import sequencing_features import genomic_file_handlers as genome from read_info_extractor import rescale -from utils import skip_empty +from utils import skip_empty, get_chromosomes_order def extract_features(candidate_record): @@ -31,164 +31,167 @@ def extract_features(candidate_record): dbsnp_tb = pysam.TabixFile(dbsnp) ext_features = [] - for chrom, pos, ref, alt, if_cosmic, num_cosmic_cases in batch: - var_id = "-".join([chrom, pos, ref, alt]) - pos = int(pos) - my_coordinate = [chrom, pos] - nBamFeatures = sequencing_features.AlignmentFeatures( - nbam, my_coordinate, ref, alt, min_mapq, min_bq) - tBamFeatures = sequencing_features.AlignmentFeatures( - tbam, my_coordinate, ref, alt, min_mapq, min_bq) - - sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, - tBamFeatures.nalt) - - homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( - ref_fa, my_coordinate, ref, alt) - - indel_length = len(alt) - len(ref) - - if not no_seq_complexity: - seq_span_80bp = ref_fa.fetch(my_coordinate[0], max( - 0, my_coordinate[1] - 41), my_coordinate[1] + 40) - seq_left_80bp = ref_fa.fetch(my_coordinate[0], max( - 0, my_coordinate[1] - 81), my_coordinate[1]) - seq_right_80bp = ref_fa.fetch(my_coordinate[0], my_coordinate[ - 1], my_coordinate[1] + 81) - LC_spanning = sequencing_features.subLC(seq_span_80bp, 20) - LC_adjacent = min(sequencing_features.subLC( - seq_left_80bp, 20), sequencing_features.subLC(seq_right_80bp, 20)) - LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40) - LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40) - - if_dbsnp = 0 - if_common = 0 - if dbsnp: - region = "{}:{}-{}".format(chrom, pos, pos + 1) - dbsnp_vars = {} - for x in dbsnp_tb.fetch(region=region): - chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[ - 0:8] - for alt_ in alts_.split(","): - dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) - dbsnp_vars[ - dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 - if var_id in dbsnp_vars: - if_dbsnp = 1 - if_common = dbsnp_vars[var_id] - - p_scale = None - CHROM = my_coordinate[0] - POS = my_coordinate[1] - REF = ref - ALT = alt - if_dbsnp = if_dbsnp - COMMON = if_common - if_COSMIC = if_cosmic - COSMIC_CNT = num_cosmic_cases - Consistent_Mates = tBamFeatures.consistent_mates - Inconsistent_Mates = 
tBamFeatures.inconsistent_mates - if not no_seq_complexity: - Seq_Complexity_Span = LC_spanning_phred - Seq_Complexity_Adj = LC_adjacent_phred - - N_DP = nBamFeatures.dp - nBAM_REF_MQ = '%g' % nBamFeatures.ref_mq - nBAM_ALT_MQ = '%g' % nBamFeatures.alt_mq - nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures.z_ranksums_mq - nBAM_REF_BQ = '%g' % nBamFeatures.ref_bq - nBAM_ALT_BQ = '%g' % nBamFeatures.alt_bq - nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures.z_ranksums_bq - nBAM_REF_NM = '%g' % nBamFeatures.ref_NM - nBAM_ALT_NM = '%g' % nBamFeatures.alt_NM - nBAM_NM_Diff = '%g' % nBamFeatures.NM_Diff - nBAM_REF_Concordant = nBamFeatures.ref_concordant_reads - nBAM_REF_Discordant = nBamFeatures.ref_discordant_reads - nBAM_ALT_Concordant = nBamFeatures.alt_concordant_reads - nBAM_ALT_Discordant = nBamFeatures.alt_discordant_reads - nBAM_Concordance_FET = rescale( - nBamFeatures.concordance_fet, 'fraction', p_scale, 1001) - N_REF_FOR = nBamFeatures.ref_for - N_REF_REV = nBamFeatures.ref_rev - N_ALT_FOR = nBamFeatures.alt_for - N_ALT_REV = nBamFeatures.alt_rev - nBAM_StrandBias_FET = rescale( - nBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) - nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures.z_ranksums_endpos - nBAM_REF_Clipped_Reads = nBamFeatures.ref_SC_reads - nBAM_ALT_Clipped_Reads = nBamFeatures.alt_SC_reads - nBAM_Clipping_FET = rescale( - nBamFeatures.clipping_fet, 'fraction', p_scale, 1001) - nBAM_MQ0 = nBamFeatures.MQ0 - nBAM_Other_Reads = nBamFeatures.noise_read_count - nBAM_Poor_Reads = nBamFeatures.poor_read_count - nBAM_REF_InDel_3bp = nBamFeatures.ref_indel_3bp - nBAM_REF_InDel_2bp = nBamFeatures.ref_indel_2bp - nBAM_REF_InDel_1bp = nBamFeatures.ref_indel_1bp - nBAM_ALT_InDel_3bp = nBamFeatures.alt_indel_3bp - nBAM_ALT_InDel_2bp = nBamFeatures.alt_indel_2bp - nBAM_ALT_InDel_1bp = nBamFeatures.alt_indel_1bp - SOR = sor - MaxHomopolymer_Length = homopolymer_length - SiteHomopolymer_Length = site_homopolymer_length - T_DP = tBamFeatures.dp - tBAM_REF_MQ = '%g' % tBamFeatures.ref_mq - tBAM_ALT_MQ = '%g' % tBamFeatures.alt_mq - tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures.z_ranksums_mq - tBAM_REF_BQ = '%g' % tBamFeatures.ref_bq - tBAM_ALT_BQ = '%g' % tBamFeatures.alt_bq - tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures.z_ranksums_bq - tBAM_REF_NM = '%g' % tBamFeatures.ref_NM - tBAM_ALT_NM = '%g' % tBamFeatures.alt_NM - tBAM_NM_Diff = '%g' % tBamFeatures.NM_Diff - tBAM_REF_Concordant = tBamFeatures.ref_concordant_reads - tBAM_REF_Discordant = tBamFeatures.ref_discordant_reads - tBAM_ALT_Concordant = tBamFeatures.alt_concordant_reads - tBAM_ALT_Discordant = tBamFeatures.alt_discordant_reads - tBAM_Concordance_FET = rescale( - tBamFeatures.concordance_fet, 'fraction', p_scale, 1001) - T_REF_FOR = tBamFeatures.ref_for - T_REF_REV = tBamFeatures.ref_rev - T_ALT_FOR = tBamFeatures.alt_for - T_ALT_REV = tBamFeatures.alt_rev - tBAM_StrandBias_FET = rescale( - tBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) - tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures.z_ranksums_endpos - tBAM_REF_Clipped_Reads = tBamFeatures.ref_SC_reads - tBAM_ALT_Clipped_Reads = tBamFeatures.alt_SC_reads - tBAM_Clipping_FET = rescale( - tBamFeatures.clipping_fet, 'fraction', p_scale, 1001) - tBAM_MQ0 = tBamFeatures.MQ0 - tBAM_Other_Reads = tBamFeatures.noise_read_count - tBAM_Poor_Reads = tBamFeatures.poor_read_count - tBAM_REF_InDel_3bp = tBamFeatures.ref_indel_3bp - tBAM_REF_InDel_2bp = tBamFeatures.ref_indel_2bp - tBAM_REF_InDel_1bp = tBamFeatures.ref_indel_1bp - tBAM_ALT_InDel_3bp = tBamFeatures.alt_indel_3bp - tBAM_ALT_InDel_2bp = 
tBamFeatures.alt_indel_2bp - tBAM_ALT_InDel_1bp = tBamFeatures.alt_indel_1bp - InDel_Length = indel_length - - features = [CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, - Consistent_Mates, Inconsistent_Mates] - if not no_seq_complexity: - features.extend([Seq_Complexity_Span, Seq_Complexity_Adj]) - features.extend([N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, - nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, - nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, - nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, - nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, - nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, - nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, - MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, - tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, - tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, - tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, - tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, - tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, - tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) - - ext_features.append(features) + for nei_cluster in batch: + t_cluster_reads = sequencing_features.ClusterReads(tbam, nei_cluster) + n_cluster_reads = sequencing_features.ClusterReads(nbam, nei_cluster) + for var_i, [chrom, pos, ref, alt, if_cosmic, num_cosmic_cases] in enumerate(nei_cluster): + var_id = "-".join([chrom, str(pos), ref, alt]) + pos = int(pos) + my_coordinate = [chrom, pos] + nBamFeatures = sequencing_features.AlignmentFeatures( + n_cluster_reads.get_var_reads(var_i), my_coordinate, ref, alt, min_mapq, min_bq) + tBamFeatures = sequencing_features.AlignmentFeatures( + t_cluster_reads.get_var_reads(var_i), my_coordinate, ref, alt, min_mapq, min_bq) + + sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, + tBamFeatures.nalt) + + homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( + ref_fa, my_coordinate, ref, alt) + + indel_length = len(alt) - len(ref) + + if not no_seq_complexity: + seq_span_80bp = ref_fa.fetch(my_coordinate[0], max( + 0, my_coordinate[1] - 41), my_coordinate[1] + 40) + seq_left_80bp = ref_fa.fetch(my_coordinate[0], max( + 0, my_coordinate[1] - 81), my_coordinate[1]) + seq_right_80bp = ref_fa.fetch(my_coordinate[0], my_coordinate[ + 1], my_coordinate[1] + 81) + LC_spanning = sequencing_features.subLC(seq_span_80bp, 20) + LC_adjacent = min(sequencing_features.subLC( + seq_left_80bp, 20), sequencing_features.subLC(seq_right_80bp, 20)) + LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40) + LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40) + + if_dbsnp = 0 + if_common = 0 + if dbsnp: + region = "{}:{}-{}".format(chrom, pos, pos + 1) + dbsnp_vars = {} + for x in dbsnp_tb.fetch(region=region): + chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[ + 0:8] + for alt_ in alts_.split(","): + dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) + dbsnp_vars[ + dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 + if var_id in 
dbsnp_vars: + if_dbsnp = 1 + if_common = dbsnp_vars[var_id] + + p_scale = None + CHROM = my_coordinate[0] + POS = my_coordinate[1] + REF = ref + ALT = alt + if_dbsnp = if_dbsnp + COMMON = if_common + if_COSMIC = if_cosmic + COSMIC_CNT = num_cosmic_cases + Consistent_Mates = tBamFeatures.consistent_mates + Inconsistent_Mates = tBamFeatures.inconsistent_mates + if not no_seq_complexity: + Seq_Complexity_Span = LC_spanning_phred + Seq_Complexity_Adj = LC_adjacent_phred + + N_DP = nBamFeatures.dp + nBAM_REF_MQ = '%g' % nBamFeatures.ref_mq + nBAM_ALT_MQ = '%g' % nBamFeatures.alt_mq + nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures.z_ranksums_mq + nBAM_REF_BQ = '%g' % nBamFeatures.ref_bq + nBAM_ALT_BQ = '%g' % nBamFeatures.alt_bq + nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures.z_ranksums_bq + nBAM_REF_NM = '%g' % nBamFeatures.ref_NM + nBAM_ALT_NM = '%g' % nBamFeatures.alt_NM + nBAM_NM_Diff = '%g' % nBamFeatures.NM_Diff + nBAM_REF_Concordant = nBamFeatures.ref_concordant_reads + nBAM_REF_Discordant = nBamFeatures.ref_discordant_reads + nBAM_ALT_Concordant = nBamFeatures.alt_concordant_reads + nBAM_ALT_Discordant = nBamFeatures.alt_discordant_reads + nBAM_Concordance_FET = rescale( + nBamFeatures.concordance_fet, 'fraction', p_scale, 1001) + N_REF_FOR = nBamFeatures.ref_for + N_REF_REV = nBamFeatures.ref_rev + N_ALT_FOR = nBamFeatures.alt_for + N_ALT_REV = nBamFeatures.alt_rev + nBAM_StrandBias_FET = rescale( + nBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) + nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures.z_ranksums_endpos + nBAM_REF_Clipped_Reads = nBamFeatures.ref_SC_reads + nBAM_ALT_Clipped_Reads = nBamFeatures.alt_SC_reads + nBAM_Clipping_FET = rescale( + nBamFeatures.clipping_fet, 'fraction', p_scale, 1001) + nBAM_MQ0 = nBamFeatures.MQ0 + nBAM_Other_Reads = nBamFeatures.noise_read_count + nBAM_Poor_Reads = nBamFeatures.poor_read_count + nBAM_REF_InDel_3bp = nBamFeatures.ref_indel_3bp + nBAM_REF_InDel_2bp = nBamFeatures.ref_indel_2bp + nBAM_REF_InDel_1bp = nBamFeatures.ref_indel_1bp + nBAM_ALT_InDel_3bp = nBamFeatures.alt_indel_3bp + nBAM_ALT_InDel_2bp = nBamFeatures.alt_indel_2bp + nBAM_ALT_InDel_1bp = nBamFeatures.alt_indel_1bp + SOR = sor + MaxHomopolymer_Length = homopolymer_length + SiteHomopolymer_Length = site_homopolymer_length + T_DP = tBamFeatures.dp + tBAM_REF_MQ = '%g' % tBamFeatures.ref_mq + tBAM_ALT_MQ = '%g' % tBamFeatures.alt_mq + tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures.z_ranksums_mq + tBAM_REF_BQ = '%g' % tBamFeatures.ref_bq + tBAM_ALT_BQ = '%g' % tBamFeatures.alt_bq + tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures.z_ranksums_bq + tBAM_REF_NM = '%g' % tBamFeatures.ref_NM + tBAM_ALT_NM = '%g' % tBamFeatures.alt_NM + tBAM_NM_Diff = '%g' % tBamFeatures.NM_Diff + tBAM_REF_Concordant = tBamFeatures.ref_concordant_reads + tBAM_REF_Discordant = tBamFeatures.ref_discordant_reads + tBAM_ALT_Concordant = tBamFeatures.alt_concordant_reads + tBAM_ALT_Discordant = tBamFeatures.alt_discordant_reads + tBAM_Concordance_FET = rescale( + tBamFeatures.concordance_fet, 'fraction', p_scale, 1001) + T_REF_FOR = tBamFeatures.ref_for + T_REF_REV = tBamFeatures.ref_rev + T_ALT_FOR = tBamFeatures.alt_for + T_ALT_REV = tBamFeatures.alt_rev + tBAM_StrandBias_FET = rescale( + tBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) + tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures.z_ranksums_endpos + tBAM_REF_Clipped_Reads = tBamFeatures.ref_SC_reads + tBAM_ALT_Clipped_Reads = tBamFeatures.alt_SC_reads + tBAM_Clipping_FET = rescale( + tBamFeatures.clipping_fet, 'fraction', p_scale, 1001) + tBAM_MQ0 = tBamFeatures.MQ0 + 
tBAM_Other_Reads = tBamFeatures.noise_read_count + tBAM_Poor_Reads = tBamFeatures.poor_read_count + tBAM_REF_InDel_3bp = tBamFeatures.ref_indel_3bp + tBAM_REF_InDel_2bp = tBamFeatures.ref_indel_2bp + tBAM_REF_InDel_1bp = tBamFeatures.ref_indel_1bp + tBAM_ALT_InDel_3bp = tBamFeatures.alt_indel_3bp + tBAM_ALT_InDel_2bp = tBamFeatures.alt_indel_2bp + tBAM_ALT_InDel_1bp = tBamFeatures.alt_indel_1bp + InDel_Length = indel_length + + features = [CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, + Consistent_Mates, Inconsistent_Mates] + if not no_seq_complexity: + features.extend([Seq_Complexity_Span, Seq_Complexity_Adj]) + features.extend([N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, + nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, + nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, + nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, + nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, + nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, + nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, + MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, + tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, + tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, + tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, + tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, + tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, + tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) + + ext_features.append(features) return ext_features except Exception as ex: @@ -205,6 +208,7 @@ def extend_features(candidates_vcf, min_mapq, min_bq, dbsnp, cosmic, no_seq_complexity, + window_extend, num_threads): logger = logging.getLogger(extend_features.__name__) @@ -242,6 +246,7 @@ def extend_features(candidates_vcf, raise Exception( "The dbSNP file should be a tabix indexed file with .vcf.gz format. 
No {}.tbi file exists.".format(dbsnp))

+    chrom_order = get_chromosomes_order(reference)
     if cosmic:
         cosmic_vars = {}
         with open(cosmic) as i_f:
@@ -276,15 +281,7 @@ def extend_features(candidates_vcf,
                     var_id = "-".join([chrom, pos, ref, alt])
                     add_vars.add(var_id)
-    n_variants = 0
-    with open(candidates_vcf) as i_f:
-        for line in skip_empty(i_f):
-            n_variants += 1
-    logger.info("Number of variants: {}".format(n_variants))
-    split_len = (n_variants + num_threads - 1) // num_threads
-    pool = multiprocessing.Pool(num_threads)
-    map_args = []
-    batch = []
+    all_variants=[]
     with open(candidates_vcf) as i_f:
         for line in skip_empty(i_f):
             chrom, pos, _, ref, alt = line.strip().split("\t")[0:5]
@@ -300,11 +297,8 @@ def extend_features(candidates_vcf,
             if cosmic and var_id in cosmic_vars:
                 if_cosmic = 1
                 num_cosmic_cases = cosmic_vars[var_id]
-            batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
-            if len(batch) >= split_len:
-                map_args.append((reference, tumor_bam, normal_bam,
-                                 min_mapq, min_bq, dbsnp, no_seq_complexity, batch))
-                batch = []
+            all_variants.append([chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases])
+
     if add_variants and len(add_vars) > 0:
         for var_id in add_vars - set(exclude_vars):
             v = var_id.split("-")
@@ -315,14 +309,40 @@ def extend_features(candidates_vcf,
             if cosmic and var_id in cosmic_vars:
                 if_cosmic = 1
                 num_cosmic_cases = cosmic_vars[var_id]
-            batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
-            if len(batch) >= split_len:
+            all_variants.append([chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases])
+
+    all_variants = sorted(all_variants,key=lambda x:[chrom_order[x[0]],x[1]])
+    n_variants = len(all_variants)
+    logger.info("Number of variants: {}".format(n_variants))
+    split_len = (n_variants + num_threads - 1) // num_threads
+    pool = multiprocessing.Pool(num_threads)
+    map_args = []
+    nei_cluster = []
+    batch = []
+    n_batch = 0
+    curr_pos = None
+    for i, [chrom, pos, ref, alt, if_cosmic, num_cosmic_cases] in enumerate(all_variants):
+        if curr_pos is None:
+            curr_pos = [chrom, pos]
+            nei_cluster = [[chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]]
+            continue
+        if chrom == curr_pos[0] and abs(curr_pos[1]-pos) < window_extend:
+            nei_cluster.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
+        else:
+            batch.append(nei_cluster)
+            n_batch += len(nei_cluster)
+            curr_pos = [chrom, pos]
+            nei_cluster = [[chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]]
+        if n_batch >= split_len or i == n_variants-1:
+            if i == n_variants-1:
+                batch.append(nei_cluster)
+                curr_pos = None
+                nei_cluster = []
+            if batch:
                 map_args.append((reference, tumor_bam, normal_bam,
                                  min_mapq, min_bq, dbsnp, no_seq_complexity, batch))
-                batch = []
-    if batch:
-        map_args.append((reference, tumor_bam, normal_bam,
-                         min_mapq, min_bq, dbsnp, no_seq_complexity, batch))
+                batch = []
+    assert(n_variants == sum([len(y) for x in map_args for y in x[-1]]))

     logger.info("Number of batches: {}".format(len(map_args)))
     header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
@@ -395,6 +415,9 @@ def extend_features(candidates_vcf,
     parser.add_argument('--no_seq_complexity',
                         help='Dont compute linguistic sequence complexity features',
                         action="store_true")
+    parser.add_argument('--window_extend', type=int,
+                        help='window size for extending input features (should be in the order of readlength)',
+                        default=1000)
     parser.add_argument('--num_threads', type=int,
                         help='number of threads', default=1)
     args = parser.parse_args()
@@ -409,6 +432,7 @@ def extend_features(candidates_vcf,
                                 args.min_mapq, args.min_bq,
                                 args.dbsnp, args.cosmic,
                                 args.no_seq_complexity,
+                                args.window_extend,
                                 args.num_threads,
                                 )
     if output is None:
diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py
index 11410d1..20e60c1 100755
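The clustering loop above groups position-sorted variants into neighborhoods no wider than window_extend, then packs whole neighborhoods into per-thread batches so nearby variants share one read fetch; reduced to its essence (toy positions on a single contig):

    def cluster_positions(positions, window=1000):
        # positions must be sorted; a variant joins the current cluster
        # when it lies within `window` of the cluster's anchor position
        clusters = [[positions[0]]]
        for pos in positions[1:]:
            if abs(pos - clusters[-1][0]) < window:
                clusters[-1].append(pos)
            else:
                clusters.append([pos])
        return clusters

    print(cluster_positions([100, 300, 950, 5000, 5100], window=1000))
    # [[100, 300, 950], [5000, 5100]]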
--- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -199,6 +199,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, add_extra_features, no_seq_complexity, no_feature_recomp_for_ensemble, + window_extend, num_threads, scan_alignments_binary,): logger = logging.getLogger(preprocess.__name__) @@ -350,6 +351,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, min_mapq, snp_min_bq, dbsnp, None, no_seq_complexity, + window_extend, num_threads) if ensemble_tsv and not no_feature_recomp_for_ensemble: extra_features_others_tsv = os.path.join( @@ -364,6 +366,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, min_mapq, snp_min_bq, dbsnp, None, no_seq_complexity, + window_extend, num_threads) extra_features_bed = os.path.join( @@ -569,6 +572,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--no_feature_recomp_for_ensemble', help='Do not recompute features for ensemble_tsv', action="store_true") + parser.add_argument('--window_extend', type=int, + help='window size for extending input features (should be in the order of readlength)', + default=1000) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--scan_alignments_binary', type=str, @@ -589,6 +595,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.add_extra_features, args.no_seq_complexity, args.no_feature_recomp_for_ensemble, + args.window_extend, args.num_threads, args.scan_alignments_binary) except Exception as e: diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 7e90d1a..ebd37db 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -9,22 +9,54 @@ from read_info_extractor import * from collections import defaultdict import fisher +import logging + +FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) nan = float('nan') + def fisher_exact_test(mat): - return fisher.pvalue(mat[0][0],mat[0][1],mat[1][0],mat[1][1]).two_tail + return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).two_tail + + +class ClusterReads: + def __init__(self, bam, variants): + self.chrom = variants[0][0] + self.min_pos = variants[0][1] + self.max_pos = variants[-1][1] + self.reads = [] + for read_i in bam.fetch(self.chrom, self.min_pos - 1, self.max_pos): + if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: + continue + self.reads.append(read_i) + + done_i = -1 + n = len(variants) + self.var_reads = [[] for i in range(len(variants))] + for i, read in enumerate(self.reads): + for j in range(done_i + 1, n): + pos = variants[j][1] + if read.reference_start > pos: + done_i += 1 + continue + if pos < read.reference_end: + self.var_reads[j].append(i) + def get_var_reads(self, var_index): + return [self.reads[i] for i in self.var_reads[var_index]] class AlignmentFeatures: - def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + + def __init__(self, reads, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): ''' bam is the opened file handle of bam file my_coordiate is a list or tuple of 0-based (contig, position) - ''' - + ''' + indel_length = len(first_alt) - len(ref_base) - reads = bam.fetch(my_coordinate[0], my_coordinate[1] - 1, 
my_coordinate[1]) # index 0 for ref, 1 for alt read_mq = [[], []] @@ -59,8 +91,8 @@ def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10) is_ref_call = code_i == 1 and base_call_i == ref_base[0] is_alt_call = (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or ( - indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or ( - indel_length > 0 and code_i == 3) + indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or ( + indel_length > 0 and code_i == 3) # inconsistent read or second alternate calls if not (is_ref_call or is_alt_call): @@ -81,22 +113,27 @@ def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10) pass if read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - concordance_counts[0 if read_i.is_proper_pair else 1][index] += 1 + concordance_counts[ + 0 if read_i.is_proper_pair else 1][index] += 1 orientation_counts[1 if read_i.is_reverse else 0][index] += 1 - is_soft_clipped = read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip + is_soft_clipped = read_i.cigar[0][ + 0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip soft_clip_counts[1 if is_soft_clipped else 0][index] += 1 # Distance from the end of the read: if ith_base is not None: - pos_from_end[index].append(min(ith_base, read_i.query_length - ith_base)) + pos_from_end[index].append( + min(ith_base, read_i.query_length - ith_base)) flanking_indel[index].append(flanking_indel_i) # unpack to get the ref and alt values ref_pos_from_end, alt_pos_from_end = pos_from_end - self.ref_concordant_reads, self.alt_concordant_reads = concordance_counts[0] - self.ref_discordant_reads, self.alt_discordant_reads = concordance_counts[1] + self.ref_concordant_reads, self.alt_concordant_reads = concordance_counts[ + 0] + self.ref_discordant_reads, self.alt_discordant_reads = concordance_counts[ + 1] self.ref_for, self.alt_for = orientation_counts[0] self.ref_rev, self.alt_rev = orientation_counts[1] self.ref_notSC_reads, self.alt_notSC_reads = soft_clip_counts[0] @@ -116,14 +153,16 @@ def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10) ref_edit_distance, alt_edit_distance = edit_distance self.ref_NM = mean(ref_edit_distance) self.alt_NM = mean(alt_edit_distance) - self.z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0] + self.z_ranksums_NM = stats.ranksums( + alt_edit_distance, ref_edit_distance)[0] self.NM_Diff = self.alt_NM - self.ref_NM - abs(indel_length) self.concordance_fet = fisher_exact_test(concordance_counts) self.strandbias_fet = fisher_exact_test(orientation_counts) self.clipping_fet = fisher_exact_test(soft_clip_counts) - self.z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] + self.z_ranksums_endpos = stats.ranksums( + alt_pos_from_end, ref_pos_from_end)[0] ref_flanking_indel, alt_flanking_indel = flanking_indel self.ref_indel_1bp = ref_flanking_indel.count(1) @@ -224,24 +263,27 @@ def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100): return sor + def max_sub_vocabularies(seq_length, max_subseq_length): # According to: # https://doi.org/10.1093/bioinformatics/18.5.679 # capping the length of sub_string as an input parameter assert max_subseq_length <= seq_length - + counts = 0 k = 1 while k <= max_subseq_length: - + if 4**k < (seq_length - k + 1): counts = counts + 4**k else: - counts = counts + (2*seq_length - k - max_subseq_length + 2) * (max_subseq_length - k + 1)/2 + counts = 
counts + \ + (2 * seq_length - k - max_subseq_length + 2) * \ + (max_subseq_length - k + 1) / 2 break - + k += 1 - + return counts @@ -250,20 +292,22 @@ def subLC(sequence, max_substring_length=20): # https://doi.org/10.1093/bioinformatics/18.5.679 # Cut off substring at a fixed length sequence = sequence.upper() - + if not 'N' in sequence: - - number_of_subseqs = 0 - seq_length = len(sequence) - max_number_of_subseqs = max_sub_vocabularies(seq_length, max_substring_length) - + + number_of_subseqs = 0 + seq_length = len(sequence) + max_number_of_subseqs = max_sub_vocabularies( + seq_length, max_substring_length) + set_of_seq_n = set() - for i in range(1, min(max_substring_length+1, seq_length+1) ): - set_of_seq_n.update((sequence[n: n+i] for n in range(len(sequence) - i + 1))) - - number_of_subseqs = len(set_of_seq_n) - lc = number_of_subseqs/max_number_of_subseqs - + for i in range(1, min(max_substring_length + 1, seq_length + 1)): + set_of_seq_n.update((sequence[n: n + i] + for n in range(len(sequence) - i + 1))) + + number_of_subseqs = len(set_of_seq_n) + lc = number_of_subseqs / max_number_of_subseqs + else: lc = float('nan') From 0e394ff9c428be7d1b7eacf521e7935114bd6961 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Mon, 11 May 2020 22:53:56 -0700 Subject: [PATCH 29/89] small fix --- neusomatic/python/extend_features.py | 5 ++++- neusomatic/python/sequencing_features.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index a952c00..338adf7 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -365,7 +365,10 @@ def extend_features(candidates_vcf, "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"]) try: - ext_features = pool.map_async(extract_features, map_args).get() + ext_features = [] + for w in map_args: + ext_features.append(extract_features(w)) + # ext_features = pool.map_async(extract_features, map_args).get() pool.close() with open(output_tsv, "w") as o_f: o_f.write("\t".join(header) + "\n") diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index ebd37db..f6c510b 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -39,10 +39,10 @@ def __init__(self, bam, variants): for i, read in enumerate(self.reads): for j in range(done_i + 1, n): pos = variants[j][1] - if read.reference_start > pos: + if read.reference_start >= pos: done_i += 1 continue - if pos < read.reference_end: + if pos <= read.reference_end: self.var_reads[j].append(i) def get_var_reads(self, var_index): return [self.reads[i] for i in self.var_reads[var_index]] From 3c061b37c49e66e8a2b7db693a227ed7b9915226 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 12 May 2020 01:18:19 -0700 Subject: [PATCH 30/89] record aligned_pairs --- neusomatic/python/extend_features.py | 8 +++--- neusomatic/python/read_info_extractor.py | 27 +++++++++++++++++--- neusomatic/python/sequencing_features.py | 32 ++++++++++++++++++------ 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 338adf7..66ca7ba 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -32,16 +32,14 @@ def extract_features(candidate_record): ext_features = [] for nei_cluster in batch: - t_cluster_reads = sequencing_features.ClusterReads(tbam, 
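# (Usage note, for orientation: each ClusterReads instance fetches every
#  read spanning a sorted cluster of nearby variants with one
#  bam.fetch(chrom, min_pos - 1, max_pos) call, then records which reads
#  overlap each variant. The comparison fix above makes the overlap test
#  agree with the variant's 0-based target position pos - 1: a read can
#  only cover it when reference_start < pos and reference_end >= pos.)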
nei_cluster) n_cluster_reads = sequencing_features.ClusterReads(nbam, nei_cluster) + t_cluster_reads = sequencing_features.ClusterReads(tbam, nei_cluster) for var_i, [chrom, pos, ref, alt, if_cosmic, num_cosmic_cases] in enumerate(nei_cluster): var_id = "-".join([chrom, str(pos), ref, alt]) pos = int(pos) my_coordinate = [chrom, pos] - nBamFeatures = sequencing_features.AlignmentFeatures( - n_cluster_reads.get_var_reads(var_i), my_coordinate, ref, alt, min_mapq, min_bq) - tBamFeatures = sequencing_features.AlignmentFeatures( - t_cluster_reads.get_var_reads(var_i), my_coordinate, ref, alt, min_mapq, min_bq) + nBamFeatures = n_cluster_reads.get_alignment_features(var_i, ref, alt, min_mapq, min_bq) + tBamFeatures = t_cluster_reads.get_alignment_features(var_i, ref, alt, min_mapq, min_bq) sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, tBamFeatures.nalt) diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index 5c005f4..95fd269 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -1,6 +1,12 @@ #!/usr/bin/env python3 import re +import logging +import numpy as np + +FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) cigar_aln_match = 0 cigar_insertion = 1 @@ -19,8 +25,7 @@ ### PYSAM ### - -def position_of_aligned_read(read_i, target_position, win_size=3): +def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3): ''' Return the base call of the target position, and if it's a start of insertion/deletion. This target position follows pysam convension, i.e., 0-based. @@ -33,9 +38,18 @@ def position_of_aligned_read(read_i, target_position, win_size=3): 3: Insertion after the target position 0: The target position does not match to reference, and may be discarded for "reference/alternate" read count purposes, but can be kept for "inconsistent read" metrics. 
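For intuition, a hypothetical illustration (values invented for this note): if aligned_pairs contains (..., (50, 1000), (51, 1001), ...), then querying target_position=1000 returns code 1 with ith_base=50, the read index aligned to the target; a next pair of (None, 1001) would instead signal a deletion starting after the target (code 2), and (51, None) an insertion (code 3).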
''' - flanking_deletion, flanking_insertion = nan, nan - aligned_pairs = read_i.get_aligned_pairs() + # i_match = np.where(aligned_pairs[:,1]==target_position)[0] + # if len(i_match)>0: + # # If find a match: + # seq_i=aligned_pairs[i_match[0],0] + # idx_aligned_pair = i_match[0] + # i = i_match[0] + # else: + # seq_i = None + # idx_aligned_pair = None + # i = aligned_pairs.shape[0]-1 + for i, align_i in enumerate(aligned_pairs): # If find a match: @@ -44,6 +58,11 @@ def position_of_aligned_read(read_i, target_position, win_size=3): idx_aligned_pair = i break + # logger.info([aligned_pairs.shape,i_match,seq_i,idx_aligned_pair,seq_i_,idx_aligned_pair_]) + # assert(i==i_) + # assert(seq_i==seq_i_) + # assert(idx_aligned_pair==idx_aligned_pair_) + # aaa # If the target position is aligned: try: if seq_i is not None: diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index f6c510b..24f944f 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -24,6 +24,7 @@ def fisher_exact_test(mat): class ClusterReads: def __init__(self, bam, variants): + self.variants = variants self.chrom = variants[0][0] self.min_pos = variants[0][1] self.max_pos = variants[-1][1] @@ -44,17 +45,33 @@ def __init__(self, bam, variants): continue if pos <= read.reference_end: self.var_reads[j].append(i) - def get_var_reads(self, var_index): - return [self.reads[i] for i in self.var_reads[var_index]] - + unused_reads = set(range(len(self.reads)))-set([i for j in self.var_reads for i in j]) + for i in unused_reads: + self.reads[i] = None + self.aligned_pairs = [] + for i, read in enumerate(self.reads): + if i not in unused_reads: + self.aligned_pairs.append(np.array(read.get_aligned_pairs())) + else: + self.aligned_pairs.append(None) -class AlignmentFeatures: - def __init__(self, reads, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + def get_alignment_features(self, var_index, ref_base, first_alt, min_mq=1, min_bq=10): ''' bam is the opened file handle of bam file my_coordiate is a list or tuple of 0-based (contig, position) ''' + my_coordinate = self.variants[var_index][0:2] + reads = [self.reads[i] for i in self.var_reads[var_index]] + aligned_pairs = [self.aligned_pairs[i] for i in self.var_reads[var_index]] + bamfeatures = AlignmentFeatures(reads, aligned_pairs, my_coordinate, ref_base, first_alt, min_mq, min_bq) + + return bamfeatures + + +class AlignmentFeatures: + + def __init__(self, reads, aligned_pairs, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): indel_length = len(first_alt) - len(ref_base) @@ -74,14 +91,13 @@ def __init__(self, reads, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=1 qname_collector = defaultdict(list) - for read_i in reads: + for read_i,aligned_pair in zip(reads,aligned_pairs): if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: continue - dp += 1 code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( - read_i, my_coordinate[1] - 1) + read_i, aligned_pair, my_coordinate[1] - 1) if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq: poor_read_count += 1 From 709b64b5fbf1a41eb6c3c5694262c7fbd6208e58 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 12 May 2020 11:06:11 -0700 Subject: [PATCH 31/89] small fix --- neusomatic/python/extend_features.py | 5 +---- neusomatic/python/read_info_extractor.py | 16 ---------------- neusomatic/python/sequencing_features.py | 2 +- 3 files changed, 2 
insertions(+), 21 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 66ca7ba..ba38f00 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -363,10 +363,7 @@ def extend_features(candidates_vcf, "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"]) try: - ext_features = [] - for w in map_args: - ext_features.append(extract_features(w)) - # ext_features = pool.map_async(extract_features, map_args).get() + ext_features = pool.map_async(extract_features, map_args).get() pool.close() with open(output_tsv, "w") as o_f: o_f.write("\t".join(header) + "\n") diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index 95fd269..40f38b7 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -39,17 +39,6 @@ def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3) 0: The target position does not match to reference, and may be discarded for "reference/alternate" read count purposes, but can be kept for "inconsistent read" metrics. ''' flanking_deletion, flanking_insertion = nan, nan - # i_match = np.where(aligned_pairs[:,1]==target_position)[0] - # if len(i_match)>0: - # # If find a match: - # seq_i=aligned_pairs[i_match[0],0] - # idx_aligned_pair = i_match[0] - # i = i_match[0] - # else: - # seq_i = None - # idx_aligned_pair = None - # i = aligned_pairs.shape[0]-1 - for i, align_i in enumerate(aligned_pairs): # If find a match: @@ -58,11 +47,6 @@ def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3) idx_aligned_pair = i break - # logger.info([aligned_pairs.shape,i_match,seq_i,idx_aligned_pair,seq_i_,idx_aligned_pair_]) - # assert(i==i_) - # assert(seq_i==seq_i_) - # assert(idx_aligned_pair==idx_aligned_pair_) - # aaa # If the target position is aligned: try: if seq_i is not None: diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 24f944f..b2d09da 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -51,7 +51,7 @@ def __init__(self, bam, variants): self.aligned_pairs = [] for i, read in enumerate(self.reads): if i not in unused_reads: - self.aligned_pairs.append(np.array(read.get_aligned_pairs())) + self.aligned_pairs.append(read.get_aligned_pairs()) else: self.aligned_pairs.append(None) From 12167e1acf7db67f351094aba4f173d46fd47f36 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 12 May 2020 19:15:52 -0700 Subject: [PATCH 32/89] more efficient read/ref pos match search --- neusomatic/python/extend_features.py | 14 +++--- neusomatic/python/read_info_extractor.py | 22 ++++----- neusomatic/python/sequencing_features.py | 57 ++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 25 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index ba38f00..9e98551 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -323,14 +323,14 @@ def extend_features(candidates_vcf, if curr_pos is None: curr_pos = [chrom, pos] nei_cluster = [[chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]] - continue - if chrom == curr_pos[0] and abs(curr_pos[1]-pos)= split_len or i == n_variants-1: if i == n_variants-1: batch.append(nei_cluster) diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index 
40f38b7..cf71ed0 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -25,7 +25,7 @@ ### PYSAM ### -def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3): +def position_of_aligned_read(read_i, aligned_pairs, read_pos_for_ref_pos, target_position, win_size=3): ''' Return the base call of the target position, and if it's a start of insertion/deletion. This target position follows pysam convension, i.e., 0-based. @@ -39,14 +39,8 @@ def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3) 0: The target position does not match to reference, and may be discarded for "reference/alternate" read count purposes, but can be kept for "inconsistent read" metrics. ''' flanking_deletion, flanking_insertion = nan, nan - for i, align_i in enumerate(aligned_pairs): - - # If find a match: - if align_i[1] == target_position: - seq_i = align_i[0] - idx_aligned_pair = i - break + idx_aligned_pair, seq_i = read_pos_for_ref_pos #get_read_pos_for_ref_pos(read_i, target_position) # If the target position is aligned: try: if seq_i is not None: @@ -55,22 +49,22 @@ def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3) # Whether if it's a Deletion/Insertion depends on what happens after this position: # If the match (i.e., i, seq_i) is the final alignment, then you cannot know if it's an indel # if "i" is NOT the final alignment: - if i != len(aligned_pairs) - 1: + if idx_aligned_pair != len(aligned_pairs) - 1: indel_length = 0 # If the next alignment is the next sequenced base, then the # target is either a reference read of a SNP/SNV: - if aligned_pairs[i + 1][0] == seq_i + 1 and aligned_pairs[i + 1][1] == target_position + 1: + if aligned_pairs[idx_aligned_pair + 1][0] == seq_i + 1 and aligned_pairs[idx_aligned_pair + 1][1] == target_position + 1: code = 1 # Reference read for mismatch # If the next reference position has no read position to it, it # is DELETED in this read: - elif aligned_pairs[i + 1][0] == None and aligned_pairs[i + 1][1] == target_position + 1: + elif aligned_pairs[idx_aligned_pair + 1][0] == None and aligned_pairs[idx_aligned_pair + 1][1] == target_position + 1: code = 2 # Deletion - for align_j in aligned_pairs[i + 1::]: + for align_j in aligned_pairs[idx_aligned_pair + 1::]: if align_j[0] == None: indel_length -= 1 else: @@ -81,11 +75,11 @@ def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3) # the inserted sequence is "too long" to align on a single # read. In this case, the inserted length derived here is but a # lower limit of the real inserted length. 
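# (Worked illustration with hypothetical pairs: after a match (50, 1000),
#  the run (None, 1001), (None, 1002), (51, 1003) gives code 2 and
#  indel_length = -2, two reference bases deleted; the run (51, None),
#  (52, None), (53, 1001) gives code 3 and indel_length = +2, or only a
#  lower bound on the true length when the read ends inside the insertion.)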
- elif aligned_pairs[i + 1][0] == seq_i + 1 and aligned_pairs[i + 1][1] == None: + elif aligned_pairs[idx_aligned_pair + 1][0] == seq_i + 1 and aligned_pairs[idx_aligned_pair + 1][1] == None: code = 3 # Insertion or soft-clipping - for align_j in aligned_pairs[i + 1::]: + for align_j in aligned_pairs[idx_aligned_pair + 1::]: if align_j[1] == None: indel_length += 1 else: diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index b2d09da..9fa0c56 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -21,6 +21,45 @@ def fisher_exact_test(mat): return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).two_tail +def get_read_pos_for_ref_pos(read, ref_pos_s): + cigartuples = read.cigartuples + pos_r = read.reference_start + current_i = 0 + output = {} + while current_i < len(ref_pos_s): + if pos_r > ref_pos_s[current_i] or not cigartuples: + output[ref_pos_s[current_i]]=[None, None] + current_i +=1 + else: + break + if current_i >= len(ref_pos_s): + return output + cigar_aligned = [cigar_aln_match, cigar_seq_match, cigar_seq_mismatch] + cigar_s = 1 if cigartuples[0][0] == cigar_soft_clip else 0 + cigar_e = (len(cigartuples) - 1) if cigartuples[-1][0] == cigar_soft_clip else len(cigartuples) + count = pos_q = cigartuples[0][1] if cigar_s == 1 else 0 + cigar_index = cigar_s + for op, length in cigartuples[cigar_s: cigar_e]: + is_aligned = op == 0 or op >= 7 + delta_r = length if (is_aligned or op == cigar_deletion) else 0 + delta_q = length if (is_aligned or op == cigar_insertion) else 0 + while current_i < len(ref_pos_s): + diff = ref_pos_s[current_i] - pos_r + if diff < delta_r: + output[ref_pos_s[current_i]]=[count + diff, (pos_q + diff) if delta_q else None] + current_i +=1 + else: + break + if current_i >= len(ref_pos_s): + return output + count += max(delta_r, delta_q) + pos_r += delta_r + pos_q += delta_q + cigar_index += 1 + while current_i < len(ref_pos_s): + output[ref_pos_s[current_i]]=[None, None] + current_i +=1 + return output class ClusterReads: def __init__(self, bam, variants): @@ -37,6 +76,7 @@ def __init__(self, bam, variants): done_i = -1 n = len(variants) self.var_reads = [[] for i in range(len(variants))] + self.read_vars = [[] for i in range(len(self.reads))] for i, read in enumerate(self.reads): for j in range(done_i + 1, n): pos = variants[j][1] @@ -45,6 +85,7 @@ def __init__(self, bam, variants): continue if pos <= read.reference_end: self.var_reads[j].append(i) + self.read_vars[i].append(j) unused_reads = set(range(len(self.reads)))-set([i for j in self.var_reads for i in j]) for i in unused_reads: self.reads[i] = None @@ -54,6 +95,13 @@ def __init__(self, bam, variants): self.aligned_pairs.append(read.get_aligned_pairs()) else: self.aligned_pairs.append(None) + self.read_pos_for_ref_pos = [] + for i, read in enumerate(self.reads): + if i not in unused_reads: + self.read_pos_for_ref_pos.append(get_read_pos_for_ref_pos(read, + [self.variants[j][1]-1 for j in self.read_vars[i]])) + else: + self.read_pos_for_ref_pos.append(None) def get_alignment_features(self, var_index, ref_base, first_alt, min_mq=1, min_bq=10): @@ -64,14 +112,15 @@ def get_alignment_features(self, var_index, ref_base, first_alt, min_mq=1, min_b my_coordinate = self.variants[var_index][0:2] reads = [self.reads[i] for i in self.var_reads[var_index]] aligned_pairs = [self.aligned_pairs[i] for i in self.var_reads[var_index]] - bamfeatures = AlignmentFeatures(reads, aligned_pairs, my_coordinate, ref_base, 
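# (How the new get_read_pos_for_ref_pos above works: it walks the CIGAR
#  once, advancing a reference cursor and a query cursor per operation,
#  and for each requested reference position emits the matching query
#  index, or None when the position is unaligned in this read.
#  Hypothetical example: reference_start=100 with CIGAR 50M maps reference
#  position 110 to query index 10, while CIGAR 10M5D40M maps position 112,
#  which falls inside the deletion, to None.)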
first_alt, min_mq, min_bq) + read_pos_for_ref_pos_s = [self.read_pos_for_ref_pos[i][my_coordinate[1]-1] for i in self.var_reads[var_index]] + bamfeatures = AlignmentFeatures(reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, ref_base, first_alt, min_mq, min_bq) return bamfeatures class AlignmentFeatures: - def __init__(self, reads, aligned_pairs, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + def __init__(self, reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): indel_length = len(first_alt) - len(ref_base) @@ -91,13 +140,13 @@ def __init__(self, reads, aligned_pairs, my_coordinate, ref_base, first_alt, min qname_collector = defaultdict(list) - for read_i,aligned_pair in zip(reads,aligned_pairs): + for read_i, aligned_pair, read_pos_for_ref_pos in zip(reads,aligned_pairs,read_pos_for_ref_pos_s): if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: continue dp += 1 code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( - read_i, aligned_pair, my_coordinate[1] - 1) + read_i, aligned_pair, read_pos_for_ref_pos, my_coordinate[1] - 1) if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq: poor_read_count += 1 From ff00be31ad2b5c6d2e2e63690625aaed16bdd9eb Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 12 May 2020 23:23:24 -0700 Subject: [PATCH 33/89] input num_splits --- neusomatic/python/preprocess.py | 14 +++++++++----- neusomatic/python/scan_alignments.py | 13 +++++++++---- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 20e60c1..100f52e 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -31,12 +31,12 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, - scan_alignments_binary, restart, num_threads, calc_qual, regions=[]): + scan_alignments_binary, restart, num_splits, num_threads, calc_qual, regions=[]): logger = logging.getLogger(process_split_region.__name__) logger.info("Scan bam.") scan_outputs = scan_alignments(work, scan_alignments_binary, alignment_bam, - region, reference, num_threads, scan_window_size, scan_maf, + region, reference, num_splits, num_threads, scan_window_size, scan_maf, min_mapq, max_dp, filter_duplicate, restart=restart, split_region_files=regions, calc_qual=calc_qual) if filtered_candidates_vcf: @@ -200,6 +200,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, no_seq_complexity, no_feature_recomp_for_ensemble, window_extend, + num_splits, num_threads, scan_alignments_binary,): logger = logging.getLogger(preprocess.__name__) @@ -268,7 +269,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, snp_min_af, -10000, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, - scan_alignments_binary, restart, num_threads, + scan_alignments_binary, restart, num_splits, num_threads, calc_qual=False) tumor_counts_without_q, split_regions, filtered_candidates_vcfs_without_q = tumor_outputs_without_q @@ -293,7 +294,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, - scan_alignments_binary, restart, num_threads, + 
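# (Context for the num_splits plumbing below: when --num_splits is left
#  unset, scan_alignments keeps its heuristic of roughly one split per
#  10 Mb, rounded to a multiple of num_threads. Worked example with
#  hypothetical inputs: total_len = 3.2e9 and num_threads = 8 gives
#  max(ceil((3.2e9 // 1e7) // 8) * 8, 8) = 320 region splits.)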
scan_alignments_binary, restart, num_splits, num_threads, calc_qual=True, regions=candidates_split_regions) tumor_counts, split_regions, filtered_candidates_vcfs = tumor_outputs @@ -320,7 +321,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, - scan_alignments_binary, restart, num_threads, + scan_alignments_binary, restart, num_splits, num_threads, calc_qual=True, regions=candidates_split_regions) @@ -575,6 +576,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--window_extend', type=int, help='window size for extending input features (should be in the order of readlength)', default=1000) + parser.add_argument('--num_splits', type=int, + help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--scan_alignments_binary', type=str, @@ -596,6 +599,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.no_seq_complexity, args.no_feature_recomp_for_ensemble, args.window_extend, + args.num_splits, args.num_threads, args.scan_alignments_binary) except Exception as e: diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index 37b47f2..5b703a0 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -70,7 +70,7 @@ def run_scan_alignments(record): def scan_alignments(work, scan_alignments_binary, input_bam, - regions_bed_file, reference, + regions_bed_file, reference, num_splits, num_threads, window_size, maf, min_mapq, max_dp, filter_duplicate, restart=True, split_region_files=[], calc_qual=True): @@ -115,8 +115,11 @@ def scan_alignments(work, scan_alignments_binary, input_bam, regions_bed_file = os.path.join(work, "all_regions.bed") shutil.move(regions_bed, regions_bed_file) - num_split = max(int(np.ceil((total_len // 10000000) // - num_threads) * num_threads), num_threads) + if num_splits is not None: + num_split = num_splits + else: + num_split = max(int(np.ceil((total_len // 10000000) // + num_threads) * num_threads), num_threads) split_region_files = split_region(work, regions_bed_file, num_split, min_region=window_size, max_region=1e20) else: @@ -189,6 +192,8 @@ def scan_alignments(work, scan_alignments_binary, input_bam, parser.add_argument('--filter_duplicate', help='filter duplicate reads when preparing pileup information', action="store_true") + parser.add_argument('--num_splits', type=int, + help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) args = parser.parse_args() @@ -196,7 +201,7 @@ def scan_alignments(work, scan_alignments_binary, input_bam, try: outputs = scan_alignments(args.work, args.scan_alignments_binary, args.input_bam, - args.regions_bed_file, args.reference, + args.regions_bed_file, args.reference, args.num_splits, args.num_threads, args.window_size, args.maf, args.min_mapq, args.max_dp, args.filter_duplicate) except Exception as e: From 366964e239e8249d536d5fd4dbbed2702b2bf94d Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 13 May 2020 10:30:10 -0700 Subject: [PATCH 34/89] max_cluster size added --- neusomatic/python/extend_features.py | 42 ++++++++++++++++++---------- neusomatic/python/preprocess.py | 7 +++++ 2 files changed, 35 insertions(+), 14 deletions(-) diff --git 
a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 9e98551..b1606b5 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -32,14 +32,18 @@ def extract_features(candidate_record): ext_features = [] for nei_cluster in batch: - n_cluster_reads = sequencing_features.ClusterReads(nbam, nei_cluster) - t_cluster_reads = sequencing_features.ClusterReads(tbam, nei_cluster) + n_cluster_reads = sequencing_features.ClusterReads( + nbam, nei_cluster) + t_cluster_reads = sequencing_features.ClusterReads( + tbam, nei_cluster) for var_i, [chrom, pos, ref, alt, if_cosmic, num_cosmic_cases] in enumerate(nei_cluster): var_id = "-".join([chrom, str(pos), ref, alt]) pos = int(pos) my_coordinate = [chrom, pos] - nBamFeatures = n_cluster_reads.get_alignment_features(var_i, ref, alt, min_mapq, min_bq) - tBamFeatures = t_cluster_reads.get_alignment_features(var_i, ref, alt, min_mapq, min_bq) + nBamFeatures = n_cluster_reads.get_alignment_features( + var_i, ref, alt, min_mapq, min_bq) + tBamFeatures = t_cluster_reads.get_alignment_features( + var_i, ref, alt, min_mapq, min_bq) sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, tBamFeatures.nalt) @@ -207,6 +211,7 @@ def extend_features(candidates_vcf, dbsnp, cosmic, no_seq_complexity, window_extend, + max_cluster_size, num_threads): logger = logging.getLogger(extend_features.__name__) @@ -279,7 +284,7 @@ def extend_features(candidates_vcf, var_id = "-".join([chrom, pos, ref, alt]) add_vars.add(var_id) - all_variants=[] + all_variants = [] with open(candidates_vcf) as i_f: for line in skip_empty(i_f): chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] @@ -295,7 +300,8 @@ def extend_features(candidates_vcf, if cosmic and var_id in cosmic_vars: if_cosmic = 1 num_cosmic_cases = cosmic_vars[var_id] - all_variants.append([chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases]) + all_variants.append( + [chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases]) if add_variants and len(add_vars) > 0: for var_id in add_vars - set(exclude_vars): @@ -307,9 +313,11 @@ def extend_features(candidates_vcf, if cosmic and var_id in cosmic_vars: if_cosmic = 1 num_cosmic_cases = cosmic_vars[var_id] - all_variants.append([chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases]) + all_variants.append( + [chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases]) - all_variants = sorted(all_variants,key=lambda x:[chrom_order[x[0]],x[1]]) + all_variants = sorted(all_variants, key=lambda x: [ + chrom_order[x[0]], x[1]]) n_variants = len(all_variants) logger.info("Number of variants: {}".format(n_variants)) split_len = (n_variants + num_threads - 1) // num_threads @@ -324,15 +332,17 @@ def extend_features(candidates_vcf, curr_pos = [chrom, pos] nei_cluster = [[chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]] else: - if chrom == curr_pos[0] and abs(curr_pos[1]-pos)= split_len or i == n_variants-1: - if i == n_variants-1: + nei_cluster = [ + [chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]] + if n_batch >= split_len or i == n_variants - 1: + if i == n_variants - 1: batch.append(nei_cluster) curr_pos = None nei_cluster = [] @@ -415,7 +425,10 @@ def extend_features(candidates_vcf, action="store_true") parser.add_argument('--window_extend', type=int, help='window size for extending input features (should be in the order of readlength)', - default=1000) + default=1000) + parser.add_argument('--max_cluster_size', type=int, + help='max cluster size for extending input 
features (should be in the order of readlength)', + default=300) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) args = parser.parse_args() @@ -431,6 +444,7 @@ def extend_features(candidates_vcf, args.dbsnp, args.cosmic, args.no_seq_complexity, args.window_extend, + args.max_cluster_size, args.num_threads, ) if output is None: diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 100f52e..65d777e 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -200,6 +200,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, no_seq_complexity, no_feature_recomp_for_ensemble, window_extend, + max_cluster_size, num_splits, num_threads, scan_alignments_binary,): @@ -353,6 +354,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, dbsnp, None, no_seq_complexity, window_extend, + max_cluster_size, num_threads) if ensemble_tsv and not no_feature_recomp_for_ensemble: extra_features_others_tsv = os.path.join( @@ -368,6 +370,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, dbsnp, None, no_seq_complexity, window_extend, + max_cluster_size, num_threads) extra_features_bed = os.path.join( @@ -576,6 +579,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--window_extend', type=int, help='window size for extending input features (should be in the order of readlength)', default=1000) + parser.add_argument('--max_cluster_size', type=int, + help='max cluster size for extending input features (should be in the order of readlength)', + default=300) parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, @@ -599,6 +605,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.no_seq_complexity, args.no_feature_recomp_for_ensemble, args.window_extend, + args.max_cluster_size, args.num_splits, args.num_threads, args.scan_alignments_binary) From 839cc63e4bb2b6861847ad1913479ca2afa8128e Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 13 May 2020 13:29:22 -0700 Subject: [PATCH 35/89] better memory management for feature extraction --- neusomatic/python/read_info_extractor.py | 7 +- neusomatic/python/sequencing_features.py | 118 ++++++++++++----------- 2 files changed, 68 insertions(+), 57 deletions(-) diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index cf71ed0..b9ae3e7 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -25,7 +25,8 @@ ### PYSAM ### -def position_of_aligned_read(read_i, aligned_pairs, read_pos_for_ref_pos, target_position, win_size=3): + +def position_of_aligned_read(aligned_pairs, read_pos_for_ref_pos, target_position, win_size=3): ''' Return the base call of the target position, and if it's a start of insertion/deletion. This target position follows pysam convension, i.e., 0-based. 
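A note on the pattern this patch moves toward: anything derivable from the read alone is computed once per read and handed down, so the hot per-variant path no longer re-queries pysam. A minimal sketch of the caching idea, using a hypothetical cache_per_read_data helper over pysam-style reads (not the patch's own classes):

def cache_per_read_data(reads):
    # One pass per read: stash the expensive pysam lookups so every
    # variant the read overlaps can reuse them without recomputation.
    cache = {}
    for read in reads:
        cache[read.query_name] = {
            "aligned_pairs": read.get_aligned_pairs(),
            "mapping_quality": read.mapping_quality,
            "mean_bq": sum(read.query_qualities) / max(len(read.query_qualities), 1),
        }
    return cache

The AugmentedAlignedRead class introduced in this patch plays that role in the series itself.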
@@ -40,11 +41,11 @@ def position_of_aligned_read(read_i, aligned_pairs, read_pos_for_ref_pos, target ''' flanking_deletion, flanking_insertion = nan, nan - idx_aligned_pair, seq_i = read_pos_for_ref_pos #get_read_pos_for_ref_pos(read_i, target_position) + # get_read_pos_for_ref_pos(read_i, target_position) + idx_aligned_pair, seq_i, base_at_target, qual_at_target = read_pos_for_ref_pos # If the target position is aligned: try: if seq_i is not None: - base_at_target = read_i.seq[seq_i] # Whether if it's a Deletion/Insertion depends on what happens after this position: # If the match (i.e., i, seq_i) is the final alignment, then you cannot know if it's an indel diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 9fa0c56..b2fe8d9 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -21,6 +21,7 @@ def fisher_exact_test(mat): return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).two_tail + def get_read_pos_for_ref_pos(read, ref_pos_s): cigartuples = read.cigartuples pos_r = read.reference_start @@ -28,15 +29,16 @@ def get_read_pos_for_ref_pos(read, ref_pos_s): output = {} while current_i < len(ref_pos_s): if pos_r > ref_pos_s[current_i] or not cigartuples: - output[ref_pos_s[current_i]]=[None, None] - current_i +=1 + output[ref_pos_s[current_i]] = [None, None, None, None] + current_i += 1 else: break if current_i >= len(ref_pos_s): return output cigar_aligned = [cigar_aln_match, cigar_seq_match, cigar_seq_mismatch] cigar_s = 1 if cigartuples[0][0] == cigar_soft_clip else 0 - cigar_e = (len(cigartuples) - 1) if cigartuples[-1][0] == cigar_soft_clip else len(cigartuples) + cigar_e = (len(cigartuples) - + 1) if cigartuples[-1][0] == cigar_soft_clip else len(cigartuples) count = pos_q = cigartuples[0][1] if cigar_s == 1 else 0 cigar_index = cigar_s for op, length in cigartuples[cigar_s: cigar_e]: @@ -46,8 +48,13 @@ def get_read_pos_for_ref_pos(read, ref_pos_s): while current_i < len(ref_pos_s): diff = ref_pos_s[current_i] - pos_r if diff < delta_r: - output[ref_pos_s[current_i]]=[count + diff, (pos_q + diff) if delta_q else None] - current_i +=1 + output[ref_pos_s[current_i]] = [count + diff, (pos_q + diff) if delta_q else None, + read.seq[ + (pos_q + diff)] if delta_q else None, + read.query_qualities[ + (pos_q + diff)] if delta_q else None, + ] + current_i += 1 else: break if current_i >= len(ref_pos_s): @@ -57,70 +64,75 @@ def get_read_pos_for_ref_pos(read, ref_pos_s): pos_q += delta_q cigar_index += 1 while current_i < len(ref_pos_s): - output[ref_pos_s[current_i]]=[None, None] - current_i +=1 + output[ref_pos_s[current_i]] = [None, None, None, None] + current_i += 1 return output + +class AugmentedAlignedRead: + + def __init__(self, read, vars_pos): + self.qname = read.qname + self.vars_pos = vars_pos + self.read_pos_for_ref_pos = get_read_pos_for_ref_pos(read, vars_pos) + self.aligned_pairs = read.get_aligned_pairs() + self.mapping_quality = read.mapping_quality + self.mean_query_qualities = mean(read.query_qualities) + self.is_proper_pair = read.is_proper_pair + self.is_reverse = read.is_reverse + self.NM = read.get_tag('NM') + self.query_length = read.query_length + self.is_soft_clipped = read.cigar[0][ + 0] == cigar_soft_clip or read.cigar[-1][0] == cigar_soft_clip + + class ClusterReads: + def __init__(self, bam, variants): self.variants = variants self.chrom = variants[0][0] self.min_pos = variants[0][1] self.max_pos = variants[-1][1] self.reads = [] + n = len(variants) + 
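# (The sweep below relies on both reads and variants being
#  coordinate-sorted: done_j permanently skips variants positioned at or
#  before the current read's start, so pairing reads with the variants
#  they cover stays near-linear in cluster size instead of quadratic.)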
self.var_reads = [[] for i in range(len(variants))] + self.read_pos_for_ref_pos = [] + self.aligned_pairs = [] + done_j = -1 + i = 0 for read_i in bam.fetch(self.chrom, self.min_pos - 1, self.max_pos): if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: continue - self.reads.append(read_i) - - done_i = -1 - n = len(variants) - self.var_reads = [[] for i in range(len(variants))] - self.read_vars = [[] for i in range(len(self.reads))] - for i, read in enumerate(self.reads): - for j in range(done_i + 1, n): + read_vars = [] + for j in range(done_j + 1, n): pos = variants[j][1] - if read.reference_start >= pos: - done_i += 1 + if read_i.reference_start >= pos: + done_j += 1 continue - if pos <= read.reference_end: + if pos <= read_i.reference_end: self.var_reads[j].append(i) - self.read_vars[i].append(j) - unused_reads = set(range(len(self.reads)))-set([i for j in self.var_reads for i in j]) - for i in unused_reads: - self.reads[i] = None - self.aligned_pairs = [] - for i, read in enumerate(self.reads): - if i not in unused_reads: - self.aligned_pairs.append(read.get_aligned_pairs()) - else: - self.aligned_pairs.append(None) - self.read_pos_for_ref_pos = [] - for i, read in enumerate(self.reads): - if i not in unused_reads: - self.read_pos_for_ref_pos.append(get_read_pos_for_ref_pos(read, - [self.variants[j][1]-1 for j in self.read_vars[i]])) - else: - self.read_pos_for_ref_pos.append(None) - + read_vars.append(j) + if len(read_vars) > 0: + vars_pos = [self.variants[j][1] - 1 for j in read_vars] + self.reads.append(AugmentedAlignedRead(read_i, vars_pos)) + i += 1 def get_alignment_features(self, var_index, ref_base, first_alt, min_mq=1, min_bq=10): ''' bam is the opened file handle of bam file - my_coordiate is a list or tuple of 0-based (contig, position) + my_coordinate is a list or tuple of 0-based (contig, position) ''' my_coordinate = self.variants[var_index][0:2] reads = [self.reads[i] for i in self.var_reads[var_index]] - aligned_pairs = [self.aligned_pairs[i] for i in self.var_reads[var_index]] - read_pos_for_ref_pos_s = [self.read_pos_for_ref_pos[i][my_coordinate[1]-1] for i in self.var_reads[var_index]] - bamfeatures = AlignmentFeatures(reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, ref_base, first_alt, min_mq, min_bq) + bamfeatures = AlignmentFeatures( + reads, my_coordinate, ref_base, first_alt, min_mq, min_bq) return bamfeatures class AlignmentFeatures: - def __init__(self, reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + def __init__(self, reads, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): indel_length = len(first_alt) - len(ref_base) @@ -140,15 +152,15 @@ def __init__(self, reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, qname_collector = defaultdict(list) - for read_i, aligned_pair, read_pos_for_ref_pos in zip(reads,aligned_pairs,read_pos_for_ref_pos_s): - if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: - continue + for read_i in reads: dp += 1 - + read_pos_for_ref_pos = read_i.read_pos_for_ref_pos[ + my_coordinate[1] - 1] code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( - read_i, aligned_pair, read_pos_for_ref_pos, my_coordinate[1] - 1) + read_i.aligned_pairs, read_pos_for_ref_pos, my_coordinate[1] - 1) + read_i_qual_ith_base = read_pos_for_ref_pos[3] - if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq: + if read_i.mapping_quality < min_mq and read_i.mean_query_qualities 
< min_bq: poor_read_count += 1 if read_i.mapping_quality == 0: @@ -170,21 +182,19 @@ def __init__(self, reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, qname_collector[read_i.qname].append(index) read_mq[index].append(read_i.mapping_quality) - read_bq[index].append(read_i.query_qualities[ith_base]) + read_bq[index].append(read_i_qual_ith_base) try: - edit_distance[index].append(read_i.get_tag('NM')) + edit_distance[index].append(read_i.NM) except KeyError: pass - if read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + if read_i.mapping_quality >= min_mq and read_i_qual_ith_base >= min_bq: concordance_counts[ 0 if read_i.is_proper_pair else 1][index] += 1 orientation_counts[1 if read_i.is_reverse else 0][index] += 1 - is_soft_clipped = read_i.cigar[0][ - 0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip - soft_clip_counts[1 if is_soft_clipped else 0][index] += 1 + soft_clip_counts[1 if read_i.is_soft_clipped else 0][index] += 1 # Distance from the end of the read: if ith_base is not None: @@ -258,7 +268,7 @@ def __init__(self, reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, def from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt): ''' ref_fa is the opened reference fasta file handle - my_coordiate is a list or tuple of 0-based (contig, position) + my_coordinate is a list or tuple of 0-based (contig, position) ''' # Homopolymer eval (Make sure to modify for INDEL): From 050188311b99fdb420cea27057396b1d33e43cc7 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 13 May 2020 15:54:58 -0700 Subject: [PATCH 36/89] not to store aligned_pairs --- neusomatic/python/sequencing_features.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index b2fe8d9..8aa7bf8 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -75,7 +75,12 @@ def __init__(self, read, vars_pos): self.qname = read.qname self.vars_pos = vars_pos self.read_pos_for_ref_pos = get_read_pos_for_ref_pos(read, vars_pos) - self.aligned_pairs = read.get_aligned_pairs() + self.pos_of_aligned_read = {} + for pos in vars_pos: + code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( + read.get_aligned_pairs(), self.read_pos_for_ref_pos[pos], pos) + self.pos_of_aligned_read[pos] = [ + code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i] self.mapping_quality = read.mapping_quality self.mean_query_qualities = mean(read.query_qualities) self.is_proper_pair = read.is_proper_pair @@ -96,8 +101,6 @@ def __init__(self, bam, variants): self.reads = [] n = len(variants) self.var_reads = [[] for i in range(len(variants))] - self.read_pos_for_ref_pos = [] - self.aligned_pairs = [] done_j = -1 i = 0 for read_i in bam.fetch(self.chrom, self.min_pos - 1, self.max_pos): @@ -156,8 +159,8 @@ def __init__(self, reads, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=1 dp += 1 read_pos_for_ref_pos = read_i.read_pos_for_ref_pos[ my_coordinate[1] - 1] - code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( - read_i.aligned_pairs, read_pos_for_ref_pos, my_coordinate[1] - 1) + code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = read_i.pos_of_aligned_read[ + my_coordinate[1] - 1] read_i_qual_ith_base = read_pos_for_ref_pos[3] if read_i.mapping_quality < min_mq and read_i.mean_query_qualities < min_bq: From 
90a68da63b131a8676f5d5360da6d44c32c3c13a Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 13 May 2020 16:17:14 -0700 Subject: [PATCH 37/89] small fix --- neusomatic/python/sequencing_features.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 8aa7bf8..265d0f8 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -76,9 +76,10 @@ def __init__(self, read, vars_pos): self.vars_pos = vars_pos self.read_pos_for_ref_pos = get_read_pos_for_ref_pos(read, vars_pos) self.pos_of_aligned_read = {} + aligned_pairs = read.get_aligned_pairs() for pos in vars_pos: code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( - read.get_aligned_pairs(), self.read_pos_for_ref_pos[pos], pos) + aligned_pairs, self.read_pos_for_ref_pos[pos], pos) self.pos_of_aligned_read[pos] = [ code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i] self.mapping_quality = read.mapping_quality From f83b6b520961d163ebc51ff96acaf3c7e609a069 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 14 May 2020 17:57:28 -0700 Subject: [PATCH 38/89] enable custom header --- neusomatic/python/generate_dataset.py | 7 +++++-- neusomatic/python/preprocess.py | 28 ++++++++++++++++++--------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 5f9c6b8..f9c5bf3 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1535,8 +1535,11 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be split_batch_size = 10000 if ensemble_tsv and not ensemble_bed: ensemble_bed = os.path.join(work, "ensemble.bed") - extract_ensemble([ensemble_tsv], ensemble_bed, - no_seq_complexity, enforce_header, False) + extract_ensemble(ensemble_tsvs=[ensemble_tsv], ensemble_bed=ensemble_bed, + no_seq_complexity=no_seq_complexity, enforce_header=enforce_header, + custom_header=ensemble_custom_header, + is_extend=False) + tmp_ = bedtools_intersect( tumor_pred_vcf_file, region_bed_file, args=" -u", run_logger=logger) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 65d777e..5b742dd 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -194,7 +194,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, truth_vcf, tsv_batch_size, matrix_width, matrix_base_pad, min_ev_frac_per_col, - ensemble_tsv, long_read, restart, first_do_without_qual, + ensemble_tsv, ensemble_custom_header, + long_read, restart, first_do_without_qual, keep_duplicate, add_extra_features, no_seq_complexity, @@ -248,9 +249,10 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, ensemble_bed = os.path.join(work, "ensemble.bed") logger.info("Extract ensemble info.") if restart or not os.path.exists(ensemble_bed): - extract_ensemble([ensemble_tsv], ensemble_bed, - no_seq_complexity, no_feature_recomp_for_ensemble, False) - + extract_ensemble(ensemble_tsvs=[ensemble_tsv], ensemble_bed=ensemble_bed, + no_seq_complexity=no_seq_complexity, enforce_header=no_feature_recomp_for_ensemble, + custom_header=ensemble_custom_header, + is_extend=False) merge_d_for_short_read = 100 candidates_split_regions = [] ensemble_beds = [] @@ -376,8 +378,12 @@ def preprocess(work, 
mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, extra_features_bed = os.path.join( work_dataset_split, "extra_features.bed") if not os.path.exists(extra_features_bed) or restart: - extract_ensemble(ex_tsvs, - extra_features_bed, no_seq_complexity, True, True) + extract_ensemble(ensemble_tsvs=ex_tsvs, + ensemble_bed=extra_features_bed, + no_seq_complexity=no_seq_complexity, + enforce_header=True, + custom_header=False, + is_extend=True) if ensemble_tsv: merged_features_bed = os.path.join( work_dataset_split, "merged_features.bed") @@ -555,6 +561,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, help='minimum frac cov per column to keep columm', default=0.06) parser.add_argument('--ensemble_tsv', type=str, help='Ensemble annotation tsv file (only for short read)', default=None) + parser.add_argument('--ensemble_custom_header', + help='Allow ensemble tsv to have custom header fields', + action="store_true") parser.add_argument('--long_read', help='Enable long_read (high error-rate sequence) indel realignment', action="store_true") @@ -578,10 +587,10 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, action="store_true") parser.add_argument('--window_extend', type=int, help='window size for extending input features (should be in the order of readlength)', - default=1000) + default=1000) parser.add_argument('--max_cluster_size', type=int, help='max cluster size for extending input features (should be in the order of readlength)', - default=300) + default=300) parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, @@ -599,7 +608,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.ins_min_af, args.del_min_af, args.del_merge_min_af, args.ins_merge_min_af, args.merge_r, args.truth_vcf, args.tsv_batch_size, args.matrix_width, args.matrix_base_pad, args.min_ev_frac_per_col, - args.ensemble_tsv, args.long_read, args.restart, args.first_do_without_qual, + args.ensemble_tsv, args.ensemble_custom_header, + args.long_read, args.restart, args.first_do_without_qual, args.keep_duplicate, args.add_extra_features, args.no_seq_complexity, From 49c809f82a8242e38d26d4e4f89c8ca9e9916ff5 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 14 May 2020 17:58:24 -0700 Subject: [PATCH 39/89] fixed a bug --- neusomatic/python/generate_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 5f9c6b8..235f6db 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -984,7 +984,7 @@ def find_records(input_record): else: r_ = [[chrom, pos, ref, alt]] - ann = [0] * NUM_ENS_FEATURES + ann = [0] * num_ens_features if pos == ens_pos: if ref == ens_ref and alt == ens_alt: ann = record_[15:] From 2f9905bbe8504468a9f40c09433e376c1d944aa1 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 14 May 2020 22:14:38 -0700 Subject: [PATCH 40/89] fix bug in region splitting --- neusomatic/python/generate_dataset.py | 45 +++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 235f6db..5e7eacb 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -825,6 +825,41 @@ def find_len(ref, alt): return max(len(ref_), len(alt_)) +def 
keep_in_region(input_file, region_bed, + output_fn): + logger = logging.getLogger(find_len.__name__) + i = 0 + tmp_ = get_tmp_file() + with open(input_file) as i_f, open(tmp_, "w") as o_f: + for line in skip_empty(i_f): + fields = line.strip().split() + chrom, start, end = fields[0:3] + o_f.write( + "\t".join([chrom, start, str(int(start) + 1), str(i)]) + "\n") + i += 1 + + good_i = set([]) + tmp_ = bedtools_intersect( + tmp_, region_bed, args=" -wa -wb", run_logger=logger) + with open(tmp_) as i_f: + for line in skip_empty(i_f): + fields = line.strip().split() + chrom, start, end, i_, chrom_, start_, end_ = fields[0:7] + assert(chrom == chrom_) + if start_ <= start <= end_: + good_i.add(int(i_)) + i = 0 + with open(input_file) as i_f, open(output_fn, "w") as o_f: + for line in skip_empty(i_f, skip_header=False): + if line.startswith("#"): + o_f.write(line) + continue + fields = line.strip().split() + if i in good_i: + o_f.write(line) + i += 1 + + def find_records(input_record): work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, no_seq_complexity, work_index = input_record thread_logger = logging.getLogger( @@ -853,11 +888,17 @@ def find_records(input_record): num_ens_features += 2 bedtools_intersect( truth_vcf_file, split_bed, args=" -u", output_fn=split_truth_vcf_file, run_logger=thread_logger) + tmp_ = get_tmp_file() bedtools_intersect( - pred_vcf_file, split_bed, args=" -u", output_fn=split_pred_vcf_file, run_logger=thread_logger) + pred_vcf_file, split_bed, args=" -u", output_fn=tmp_, run_logger=thread_logger) + keep_in_region(input_file=tmp_, region_bed=split_region_file, + output_fn=split_pred_vcf_file) if ensemble_bed: + tmp_ = get_tmp_file() bedtools_intersect( - ensemble_bed, split_bed, args=" -u", output_fn=split_ensemble_bed_file, run_logger=thread_logger) + ensemble_bed, split_bed, args=" -u", output_fn=tmp_, run_logger=thread_logger) + keep_in_region(input_file=tmp_, region_bed=split_region_file, + output_fn=split_ensemble_bed_file) tmp_ = bedtools_window( split_ensemble_bed_file, split_pred_vcf_file, args=" -w 5 -v", run_logger=thread_logger) From c4f24ace7a294c84a94d7719860ac7f66be6deb5 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 14 May 2020 22:37:42 -0700 Subject: [PATCH 41/89] small fix --- neusomatic/python/generate_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 5e7eacb..ee89e6b 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -839,8 +839,8 @@ def keep_in_region(input_file, region_bed, i += 1 good_i = set([]) - tmp_ = bedtools_intersect( - tmp_, region_bed, args=" -wa -wb", run_logger=logger) + tmp_ = bedtools_window( + tmp_, region_bed, args=" -w 1", run_logger=logger) with open(tmp_) as i_f: for line in skip_empty(i_f): fields = line.strip().split() From 1ce3935b8359fe2f0beb69d10427f872298bbbcc Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 14 May 2020 23:50:07 -0700 Subject: [PATCH 42/89] small_fix --- neusomatic/python/generate_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index ee89e6b..ca2afc5 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -827,7 +827,7 @@ def find_len(ref, alt): def keep_in_region(input_file, region_bed, output_fn): - logger = logging.getLogger(find_len.__name__) + logger = 
logging.getLogger(keep_in_region.__name__) i = 0 tmp_ = get_tmp_file() with open(input_file) as i_f, open(tmp_, "w") as o_f: @@ -846,7 +846,7 @@ def keep_in_region(input_file, region_bed, fields = line.strip().split() chrom, start, end, i_, chrom_, start_, end_ = fields[0:7] assert(chrom == chrom_) - if start_ <= start <= end_: + if int(start_) <= int(start) <= int(end_): good_i.add(int(i_)) i = 0 with open(input_file) as i_f, open(output_fn, "w") as o_f: From 8b29757ea55c431b0153efd2fb83ceeb312a0dbc Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 15 May 2020 11:58:27 -0700 Subject: [PATCH 43/89] small fix --- neusomatic/python/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 0211443..e488058 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -253,7 +253,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo if not force_zero_ann_cols: logger.info( "Override zero_ann_cols from pretrained checkpoint: {}".format(zero_ann_cols)) - prev_epochs = sofar_epochs + 1 + prev_epochs = sofar_epochs else: prev_epochs = 0 time_now = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S") @@ -450,7 +450,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, - }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) + }, '{}/models/checkpoint_{}_epoch{}_.pth'.format(out_dir, tag, curr_epoch)) if len(train_sets) == 1: train_sets[0].open_candidate_tsvs() From 7b0cb751957ab3f4dd9f5572dcc9689ae9646725 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 15 May 2020 15:07:37 -0700 Subject: [PATCH 44/89] small fix --- neusomatic/python/generate_dataset.py | 16 ++++++++++++---- neusomatic/python/preprocess.py | 9 +++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 95ccb1f..3bebdc8 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1367,7 +1367,9 @@ def find_records(input_record): return None -def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_header, is_extend): +def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_header, + ensemble_custom_header, + is_extend): logger = logging.getLogger(extract_ensemble.__name__) ensemble_data = [] ensemble_pos = [] @@ -1549,7 +1551,9 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv, - ensemble_bed, no_seq_complexity, enforce_header, tsv_batch_size): + ensemble_bed, + ensemble_custom_header, + no_seq_complexity, enforce_header, tsv_batch_size): logger = logging.getLogger(generate_dataset.__name__) logger.info("---------------------Generate Dataset----------------------") @@ -1581,7 +1585,6 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be custom_header=ensemble_custom_header, is_extend=False) - tmp_ = bedtools_intersect( tumor_pred_vcf_file, region_bed_file, args=" -u", run_logger=logger) len_candids = 0 @@ -1792,6 +1795,9 @@ def generate_dataset(work, truth_vcf_file, mode, 
tumor_pred_vcf_file, region_be help='Ensemble annotation tsv file (only for short read)', default=None) parser.add_argument('--ensemble_bed', type=str, help='Ensemble annotation bed file (only for short read)', default=None) + parser.add_argument('--ensemble_custom_header', + help='Allow ensemble tsv to have custom header fields', + action="store_true") parser.add_argument('--no_seq_complexity', help='Dont compute linguistic sequence complexity features', action="store_true") @@ -1822,7 +1828,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be try: generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv, - ensemble_bed, no_seq_complexity, enforce_header, tsv_batch_size) + ensemble_bed, + ensemble_custom_header, + no_seq_complexity, enforce_header, tsv_batch_size) except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 5b742dd..049e041 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -79,11 +79,14 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, - matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, no_seq_complexity, + matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, + ensemble_custom_header, + no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size): logger = logging.getLogger(generate_dataset_region.__name__) generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, None, ensemble_bed, + ensemble_custom_header, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) @@ -491,7 +494,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf, candidates_split_region, tumor_count, normal_count, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, - ensemble_bed_i, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) + ensemble_bed_i, + ensemble_custom_header, + no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) shutil.rmtree(bed_tempdir) tempfile.tempdir = original_tempdir From 7706c3b141bf4bdfabd966774284fd48dbbc1373 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 16 May 2020 00:47:15 -0700 Subject: [PATCH 45/89] enable custom heading --- neusomatic/python/call.py | 60 +++--- neusomatic/python/generate_dataset.py | 254 ++++++++++++++------------ neusomatic/python/postprocess.py | 44 +++-- neusomatic/python/preprocess.py | 3 +- neusomatic/python/train.py | 62 ++++--- 5 files changed, 244 insertions(+), 179 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 18b3969..2395947 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -432,6 +432,10 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, zero_ann_cols = pretrained_dict["zero_ann_cols"] else: zero_ann_cols = [] + if "ensemble_custom_header" in pretrained_dict: + 
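
Older checkpoints predate the ensemble_custom_header key, hence the membership test with a False fallback; dict.get expresses the same backward-compatible read. A minimal sketch (the checkpoint path here is hypothetical):

import torch

pretrained_dict = torch.load("checkpoint.pth", map_location="cpu")
ensemble_custom_header = pretrained_dict.get("ensemble_custom_header", False)
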
ensemble_custom_header = pretrained_dict["ensemble_custom_header"] + else: + ensemble_custom_header = False if force_zero_ann_cols: logger.info( @@ -442,33 +446,43 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("normalize_channels: {}".format(normalize_channels)) logger.info("no_seq_complexity: {}".format(no_seq_complexity)) logger.info("zero_ann_cols: {}".format(zero_ann_cols)) - + logger.info("ensemble_custom_header: {}".format(ensemble_custom_header)) - expected_ens_fields = NUM_ENS_FEATURES - if not no_seq_complexity: - expected_ens_fields += 2 - logger.info("expected_ens_fields: {}".format(expected_ens_fields)) - - expected_st_fields = 4 - - logger.info("expected_st_fields: {}".format(expected_st_fields)) + if ensemble_custom_header: + expected_ens_fields = NUM_ENS_FEATURES + if not no_seq_complexity: + expected_ens_fields += 2 + + logger.info("expected_ens_fields: {}".format(expected_ens_fields)) + + expected_st_fields = 4 + + logger.info("expected_st_fields: {}".format(expected_st_fields)) + + ensemble = False + for tsv in candidates_tsv: + with open(tsv) as i_f: + x = i_f.readline().strip().split() + if x: + if len(x) == expected_ens_fields + 4: + ensemble = True + break + elif len(x) == 4: + break + else: + raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) - ensemble = False - for tsv in candidates_tsv: - with open(tsv) as i_f: - x = i_f.readline().strip().split() - if x: - if len(x) == expected_ens_fields + 4: - ensemble = True - break - elif len(x) == 4: + num_channels = expected_ens_fields + \ + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES + else: + num_channels = 0 + for tsv in candidates_tsv: + with open(tsv) as i_f: + x = i_f.readline().strip().split() + if x: + num_channels = len(x) - 4 + NUM_ST_FEATURES break - else: - raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) - - num_channels = expected_ens_fields + \ - NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 3bebdc8..62649ac 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -861,7 +861,7 @@ def keep_in_region(input_file, region_bed, def find_records(input_record): - work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, no_seq_complexity, work_index = input_record + work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, num_ens_features, work_index = input_record thread_logger = logging.getLogger( "{} ({})".format(find_records.__name__, multiprocessing.current_process().name)) try: @@ -883,9 +883,6 @@ def find_records(input_record): split_in_ensemble_bed = os.path.join( work, "in_ensemble_{}.bed".format(work_index)) - num_ens_features = NUM_ENS_FEATURES - if not no_seq_complexity: - num_ens_features += 2 bedtools_intersect( truth_vcf_file, split_bed, args=" -u", output_fn=split_truth_vcf_file, run_logger=thread_logger) tmp_ = get_tmp_file() @@ -1368,7 +1365,7 @@ def find_records(input_record): def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_header, - ensemble_custom_header, + custom_header, is_extend): logger = logging.getLogger(extract_ensemble.__name__) ensemble_data = [] @@ -1417,31 +1414,39 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea all_headers.add(line) header_pos = 
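
In custom-header mode the network width can no longer be derived from the fixed ensemble feature list, so call.py reads it off the first non-empty candidate row: four leading columns are coordinates, and everything after them is an extra channel on top of the standalone ones. A sketch of that inference (26 is only a placeholder for the repo's NUM_ST_FEATURES constant):

NUM_ST_FEATURES = 26  # placeholder; the real constant is defined in the codebase

def infer_num_channels(candidates_tsv):
    with open(candidates_tsv) as i_f:
        for line in i_f:
            x = line.strip().split()
            if x:
                return len(x) - 4 + NUM_ST_FEATURES
    return NUM_ST_FEATURES  # no candidates: standalone channels only
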
line.strip().split()[0:5] header_ = line.strip().split()[5:] - if is_extend: - header_ += callers_features - header_en = list(filter( - lambda x: x[1] in expected_features, enumerate(header_))) - header = list(map(lambda x: x[1], header_en)) - if not enforce_header: - expected_features = header - - if set(expected_features) - set(header): - logger.error("The following features are missing from ensemble file {}: {}".format( - ensemble_tsv, - list(set(expected_features) - set(header)))) - raise Exception - order_header = [] - for f in expected_features: - order_header.append(header_en[header.index(f)][0]) + if not custom_header: + if is_extend: + header_ += callers_features + header_en = list(filter( + lambda x: x[1] in expected_features, enumerate(header_))) + header = list(map(lambda x: x[1], header_en)) + if not enforce_header: + expected_features = header + + if set(expected_features) - set(header): + logger.error("The following features are missing from ensemble file {}: {}".format( + ensemble_tsv, + list(set(expected_features) - set(header)))) + raise Exception + order_header = [] + for f in expected_features: + order_header.append(header_en[header.index(f)][0]) + else: + order_header=range(len(header_)) continue fields = line.strip().split() fields[2] = str(int(fields[1]) + len(fields[3])) ensemble_pos.append(fields[0:5]) features = fields[5:] - if is_extend: + if is_extend and not custom_header: features += ["0"] * len(callers_features) - ensemble_data.append(list(map(lambda x: float( - x.replace("False", "0").replace("True", "1")), features))) + features = list(map(lambda x: float( + x.replace("False", "0").replace("True", "1")), features)) + if custom_header: + if min(features)<0 or max(features)>1: + logger.info("In --ensemble_custom_header mode, feature values in ensemble.tsv should be normalized in [0,1]" ) + raise Exception + ensemble_data.append(features) n_vars += 1 if len(set(all_headers)) != 1: raise(RuntimeError("inconsistent headers in {}".format(ensemble_tsvs))) @@ -1449,98 +1454,102 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea ensemble_data = np.array(ensemble_data)[:, order_header] header = np.array(header_)[order_header].tolist() - cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ - "Consistent_Mates", "Inconsistent_Mates", "N_DP", - "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", - "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", - "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp", "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", - "nBAM_ALT_InDel_1bp", - "T_DP", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", - "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", - "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", - "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp", - "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", - ], enumerate(header)))) - mq_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ - "nBAM_REF_MQ", "nBAM_ALT_MQ", "tBAM_REF_MQ", "tBAM_ALT_MQ"], enumerate(header)))) - bq_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ - "nBAM_REF_BQ", "nBAM_ALT_BQ", "tBAM_REF_BQ", "tBAM_ALT_BQ"], enumerate(header)))) - nm_diff_features = 
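
order_header above is a column permutation built from header names, so the data matrix can be rearranged into the canonical feature order regardless of how columns appear in the file. In miniature:

header_ = ["B", "A", "C"]            # columns as they appear in the file
expected_features = ["A", "B"]       # canonical order the model expects
order_header = [header_.index(f) for f in expected_features]
assert order_header == [1, 0]
# np.array(ensemble_data)[:, order_header] then yields canonically ordered columns
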
list(map(lambda x: x[0], filter( - lambda x: x[1] in ["nBAM_NM_Diff", "tBAM_NM_Diff"], enumerate(header)))) - ranksum_features = list(map(lambda x: x[0], filter(lambda x: x[1] in ["nBAM_Z_Ranksums_MQ", "nBAM_Z_Ranksums_BQ", - "nBAM_Z_Ranksums_EndPos", "tBAM_Z_Ranksums_BQ", "tBAM_Z_Ranksums_MQ", "tBAM_Z_Ranksums_EndPos", ], enumerate(header)))) - zero_to_one_features = list(map(lambda x: x[0], filter(lambda x: x[1] in ["if_MuTect", "if_VarScan2", "if_SomaticSniper", "if_VarDict", - "MuSE_Tier", "if_Strelka"] + ["nBAM_Concordance_FET", "nBAM_StrandBias_FET", "nBAM_Clipping_FET", - "tBAM_Concordance_FET", "tBAM_StrandBias_FET", "tBAM_Clipping_FET"] + ["if_dbsnp", "COMMON"] + ["M2_STR"], enumerate(header)))) - stralka_scor = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["Strelka_Score"], enumerate(header)))) - stralka_qss = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["Strelka_QSS"], enumerate(header)))) - stralka_tqss = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["Strelka_TQSS"], enumerate(header)))) - varscan2_score = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["VarScan2_Score"], enumerate(header)))) - vardict_score = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["VarDict_Score"], enumerate(header)))) - m2_lod = list(map(lambda x: x[0], filter(lambda x: x[1] in [ - "M2_NLOD", "M2_TLOD"], enumerate(header)))) - sniper_score = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["Sniper_Score"], enumerate(header)))) - m2_ecent = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["M2_ECNT"], enumerate(header)))) - sor = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["SOR"], enumerate(header)))) - msi = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["MSI"], enumerate(header)))) - msilen = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["MSILEN"], enumerate(header)))) - shift3 = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["SHIFT3"], enumerate(header)))) - MaxHomopolymer_Length = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["MaxHomopolymer_Length"], enumerate(header)))) - SiteHomopolymer_Length = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["SiteHomopolymer_Length"], enumerate(header)))) - InDel_Length = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["InDel_Length"], enumerate(header)))) - Seq_Complexity_ = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["Seq_Complexity_Span", "Seq_Complexity_Adj"], enumerate(header)))) - - min_max_features = [[cov_features, 0, 2 * COV], - [mq_features, 0, 70], - [bq_features, 0, 41], - [nm_diff_features, -2 * COV, 2 * COV], - [zero_to_one_features, 0, 1], - [ranksum_features, -30, 30], - [stralka_scor, 0, 40], - [stralka_qss, 0, 200], - [stralka_tqss, 0, 4], - [varscan2_score, 0, 60], - [vardict_score, 0, 120], - [m2_lod, 0, 100], - [sniper_score, 0, 120], - [m2_ecent, 0, 40], - [sor, 0, 100], - [msi, 0, 100], - [msilen, 0, 10], - [shift3, 0, 100], - [MaxHomopolymer_Length, 0, 50], - [SiteHomopolymer_Length, 0, 50], - [InDel_Length, -30, 30], - ] - if not no_seq_complexity: - min_max_features.append([Seq_Complexity_, 0, 40]) - - selected_features = sorted([i for f in min_max_features for i in f[0]]) - selected_features_tags = list(map(lambda x: header[x], selected_features)) - if n_vars > 0: - for i_s, mn, mx in min_max_features: - if i_s: - s = ensemble_data[:, np.array(i_s)] - s = np.maximum(np.minimum(s, mx), mn) - s = (s - mn) / (mx - mn) - ensemble_data[:, np.array(i_s)] = s - ensemble_data = ensemble_data[:, selected_features] - ensemble_data = 
ensemble_data.tolist() + if not custom_header: + cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "Consistent_Mates", "Inconsistent_Mates", "N_DP", + "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", + "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", + "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp", "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", + "nBAM_ALT_InDel_1bp", + "T_DP", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", + "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", + "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", + "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp", + "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", + ], enumerate(header)))) + mq_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "nBAM_REF_MQ", "nBAM_ALT_MQ", "tBAM_REF_MQ", "tBAM_ALT_MQ"], enumerate(header)))) + bq_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "nBAM_REF_BQ", "nBAM_ALT_BQ", "tBAM_REF_BQ", "tBAM_ALT_BQ"], enumerate(header)))) + nm_diff_features = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["nBAM_NM_Diff", "tBAM_NM_Diff"], enumerate(header)))) + ranksum_features = list(map(lambda x: x[0], filter(lambda x: x[1] in ["nBAM_Z_Ranksums_MQ", "nBAM_Z_Ranksums_BQ", + "nBAM_Z_Ranksums_EndPos", "tBAM_Z_Ranksums_BQ", "tBAM_Z_Ranksums_MQ", "tBAM_Z_Ranksums_EndPos", ], enumerate(header)))) + zero_to_one_features = list(map(lambda x: x[0], filter(lambda x: x[1] in ["if_MuTect", "if_VarScan2", "if_SomaticSniper", "if_VarDict", + "MuSE_Tier", "if_Strelka"] + ["nBAM_Concordance_FET", "nBAM_StrandBias_FET", "nBAM_Clipping_FET", + "tBAM_Concordance_FET", "tBAM_StrandBias_FET", "tBAM_Clipping_FET"] + ["if_dbsnp", "COMMON"] + ["M2_STR"], enumerate(header)))) + stralka_scor = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Strelka_Score"], enumerate(header)))) + stralka_qss = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Strelka_QSS"], enumerate(header)))) + stralka_tqss = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Strelka_TQSS"], enumerate(header)))) + varscan2_score = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["VarScan2_Score"], enumerate(header)))) + vardict_score = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["VarDict_Score"], enumerate(header)))) + m2_lod = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "M2_NLOD", "M2_TLOD"], enumerate(header)))) + sniper_score = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Sniper_Score"], enumerate(header)))) + m2_ecent = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["M2_ECNT"], enumerate(header)))) + sor = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["SOR"], enumerate(header)))) + msi = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["MSI"], enumerate(header)))) + msilen = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["MSILEN"], enumerate(header)))) + shift3 = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["SHIFT3"], enumerate(header)))) + MaxHomopolymer_Length = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["MaxHomopolymer_Length"], enumerate(header)))) + SiteHomopolymer_Length = list(map(lambda x: x[0], filter( + lambda x: x[1] in 
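
Each of the feature groups above is located with the same map/filter/enumerate idiom; an equivalent, more explicit helper shows what it computes:

def indices_of(header, names):
    # indices of the header columns whose name is in `names`, preserving order
    wanted = set(names)
    return [i for i, h in enumerate(header) if h in wanted]

assert indices_of(["T_DP", "SOR", "MSI"], ["SOR"]) == [1]
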
["SiteHomopolymer_Length"], enumerate(header)))) + InDel_Length = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["InDel_Length"], enumerate(header)))) + Seq_Complexity_ = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Seq_Complexity_Span", "Seq_Complexity_Adj"], enumerate(header)))) + + min_max_features = [[cov_features, 0, 2 * COV], + [mq_features, 0, 70], + [bq_features, 0, 41], + [nm_diff_features, -2 * COV, 2 * COV], + [zero_to_one_features, 0, 1], + [ranksum_features, -30, 30], + [stralka_scor, 0, 40], + [stralka_qss, 0, 200], + [stralka_tqss, 0, 4], + [varscan2_score, 0, 60], + [vardict_score, 0, 120], + [m2_lod, 0, 100], + [sniper_score, 0, 120], + [m2_ecent, 0, 40], + [sor, 0, 100], + [msi, 0, 100], + [msilen, 0, 10], + [shift3, 0, 100], + [MaxHomopolymer_Length, 0, 50], + [SiteHomopolymer_Length, 0, 50], + [InDel_Length, -30, 30], + ] + if not no_seq_complexity: + min_max_features.append([Seq_Complexity_, 0, 40]) + + selected_features = sorted([i for f in min_max_features for i in f[0]]) + selected_features_tags = list(map(lambda x: header[x], selected_features)) + if n_vars > 0: + for i_s, mn, mx in min_max_features: + if i_s: + s = ensemble_data[:, np.array(i_s)] + s = np.maximum(np.minimum(s, mx), mn) + s = (s - mn) / (mx - mn) + ensemble_data[:, np.array(i_s)] = s + ensemble_data = ensemble_data[:, selected_features] + ensemble_data = ensemble_data.tolist() + else: + ensemble_data = ensemble_data.tolist() + selected_features_tags = header_ with open(ensemble_bed, "w")as f_: f_.write( "#" + "\t".join(map(str, header_pos + selected_features_tags)) + "\n") @@ -1606,11 +1615,22 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be fasta_file = pysam.Fastafile(ref_file) chrom_lengths = dict(zip(fasta_file.references, fasta_file.lengths)) + if not ensemble_custom_header: + num_ens_features = NUM_ENS_FEATURES + if not no_seq_complexity: + num_ens_features += 2 + else: + num_ens_features = 0 + with open(ensemble_bed) as i_f: + x = i_f.readline().strip().split() + if x: + num_ens_features = len(x) - 5 + pool = multiprocessing.Pool(num_threads) map_args = [] for i, split_region_file in enumerate(split_region_files): map_args.append((work, split_region_file, truth_vcf_file, - tumor_pred_vcf_file, ref_file, ensemble_bed, no_seq_complexity, i)) + tumor_pred_vcf_file, ref_file, ensemble_bed, num_ens_features, i)) try: records_data = pool.map_async(find_records, map_args).get() pool.close() diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index b8d0b3a..b62f9e7 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -33,6 +33,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, logger = logging.getLogger(add_vcf_info.__name__) ensemble_candids_vcf = None + use_ensemble_candids = False if ensemble_tsv: ensemble_candids_vcf = os.path.join(work, "ensemble_candids.vcf") with open(ensemble_tsv) as e_f, open(ensemble_candids_vcf, "w") as c_f: @@ -40,39 +41,48 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, c_f.write( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") for line in e_f: - if "T_REF_FOR" in line: + if "POS" in line: header = line.strip().split() chrom_id = header.index("CHROM") pos_id = header.index("POS") ref_id = header.index("REF") alt_id = header.index("ALT") - dp_id = header.index("T_DP") - ref_fw_id = header.index("T_REF_FOR") - ref_rv_id = header.index("T_REF_REV") - alt_fw_id = 
header.index("T_ALT_FOR") - alt_rv_id = header.index("T_ALT_REV") + if "T_DP" in line: + dp_id = header.index("T_DP") + ref_fw_id = header.index("T_REF_FOR") + ref_rv_id = header.index("T_REF_REV") + alt_fw_id = header.index("T_ALT_FOR") + alt_rv_id = header.index("T_ALT_REV") + use_ensemble_candids = True + else: + dp_id, ref_fw_id, ref_rv_id, alt_fw_id, alt_rv_id = None, None, None, None, None continue fields = line.strip().split() chrom = fields[chrom_id] pos = fields[pos_id] ref = fields[ref_id] alt = fields[alt_id] - dp = int(fields[dp_id]) - ro_fw = int(fields[ref_fw_id]) - ro_rv = int(fields[ref_rv_id]) - ao_fw = int(fields[alt_fw_id]) - ao_rv = int(fields[alt_rv_id]) - ro = ro_fw + ro_rv - ao = ao_fw + ao_rv - af = np.round(ao / float(ao + ro + 0.0001), 4) - c_f.write( - "\t".join(map(str, [chrom, pos, ".", ref, alt, ".", ".", ".", "GT:DP:RO:AO:AF", ":".join(map(str, ["0/1", dp, ro, ao, af]))])) + "\n") + if dp_id is not None: + dp = int(fields[dp_id]) + ro_fw = int(fields[ref_fw_id]) + ro_rv = int(fields[ref_rv_id]) + ao_fw = int(fields[alt_fw_id]) + ao_rv = int(fields[alt_rv_id]) + ro = ro_fw + ro_rv + ao = ao_fw + ao_rv + af = np.round(ao / float(ao + ro + 0.0001), 4) + c_f.write( + "\t".join(map(str, [chrom, pos, ".", ref, alt, ".", ".", ".", "GT:DP:RO:AO:AF", ":".join(map(str, ["0/1", dp, ro, ao, af]))])) + "\n") + else: + c_f.write( + "\t".join(map(str, [chrom, pos, ".", ref, alt, ".", ".", ".", ".", "."])) + "\n") + in_candidates = bedtools_window( merged_vcf, candidates_vcf, args=" -w 5", run_logger=logger) notin_candidates = bedtools_window( merged_vcf, candidates_vcf, args=" -w 5 -v", run_logger=logger) - if ensemble_tsv: + if ensemble_tsv and use_ensemble_candids: in_ensemble = bedtools_window( merged_vcf, ensemble_candids_vcf, args=" -w 5", run_logger=logger) notin_any = bedtools_window( diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 049e041..f69734e 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -567,7 +567,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--ensemble_tsv', type=str, help='Ensemble annotation tsv file (only for short read)', default=None) parser.add_argument('--ensemble_custom_header', - help='Allow ensemble tsv to have custom header fields', + help='Allow ensemble tsv to have custom header fields. 
(Features should be\ + normalized between [0,1]', action="store_true") parser.add_argument('--long_read', help='Enable long_read (high error-rate sequence) indel realignment', diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index e488058..45fbae2 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -206,6 +206,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo no_seq_complexity, zero_ann_cols, force_zero_ann_cols, + ensemble_custom_header, use_cuda): logger = logging.getLogger(train_neusomatic.__name__) @@ -253,6 +254,10 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo if not force_zero_ann_cols: logger.info( "Override zero_ann_cols from pretrained checkpoint: {}".format(zero_ann_cols)) + if "ensemble_custom_header" in pretrained_dict: + ensemble_custom_header = pretrained_dict["ensemble_custom_header"] + else: + ensemble_custom_header = False prev_epochs = sofar_epochs else: prev_epochs = 0 @@ -265,33 +270,40 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo logger.info( "Override zero_ann_cols from force_zero_ann_cols: {}".format(force_zero_ann_cols)) + if not ensemble_custom_header: + expected_ens_fields = NUM_ENS_FEATURES + if not no_seq_complexity: + expected_ens_fields += 2 - expected_ens_fields = NUM_ENS_FEATURES - if not no_seq_complexity: - expected_ens_fields += 2 + logger.info("expected_ens_fields: {}".format(expected_ens_fields)) - logger.info("expected_ens_fields: {}".format(expected_ens_fields)) + expected_st_fields = 4 - expected_st_fields = 4 + logger.info("expected_st_fields: {}".format(expected_st_fields)) - logger.info("expected_st_fields: {}".format(expected_st_fields)) + ensemble = False + for tsv in candidates_tsv: + with open(tsv) as i_f: + x = i_f.readline().strip().split() + if x: + if len(x) == expected_ens_fields + 4: + ensemble = True + break + elif len(x) == 4: + break + else: + raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) - ensemble = False - for tsv in candidates_tsv: - with open(tsv) as i_f: - x = i_f.readline().strip().split() - if x: - if len(x) == expected_ens_fields + 4: - ensemble = True - break - elif len(x) == 4: + num_channels = expected_ens_fields + \ + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES + else: + num_channels = 0 + for tsv in candidates_tsv: + with open(tsv) as i_f: + x = i_f.readline().strip().split() + if x: + num_channels = len(x) - 4 + NUM_ST_FEATURES break - else: - raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) - - num_channels = expected_ens_fields + \ - NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES - logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: @@ -450,6 +462,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, + "ensemble_custom_header": ensemble_custom_header, }, '{}/models/checkpoint_{}_epoch{}_.pth'.format(out_dir, tag, curr_epoch)) if len(train_sets) == 1: @@ -517,6 +530,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, + "ensemble_custom_header": ensemble_custom_header, }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if 
validation_candidates_tsv: test(net, curr_epoch, validation_loader, use_cuda) @@ -537,6 +551,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, + "ensemble_custom_header": ensemble_custom_header, }, '{}/models/checkpoint_{}_epoch{}.pth'.format( out_dir, tag, curr_epoch)) if validation_candidates_tsv: @@ -623,6 +638,10 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo --zero_ann_cols and pretrained setting \ idx starts from 5th column in candidate.tsv file', default=[]) + parser.add_argument('--ensemble_custom_header', + help='Allow ensemble tsv to have custom header fields. (Features should be\ + normalized between [0,1]', + action="store_true") args = parser.parse_args() logger.info(args) @@ -643,6 +662,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo args.no_seq_complexity, args.zero_ann_cols, args.force_zero_ann_cols, + args.ensemble_custom_header, use_cuda) except Exception as e: logger.error(traceback.format_exc()) From 27f98c8db8cd3479d312373d45bfa269723b73e5 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 16 May 2020 05:04:07 -0700 Subject: [PATCH 46/89] small fix --- neusomatic/python/call.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 2395947..d814a1b 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -449,7 +449,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("ensemble_custom_header: {}".format(ensemble_custom_header)) - if ensemble_custom_header: + if not ensemble_custom_header: expected_ens_fields = NUM_ENS_FEATURES if not no_seq_complexity: expected_ens_fields += 2 From 0bc4655e2d4a1698a3bafe3b3b4754edb0e4fbe1 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 16 May 2020 17:50:43 -0700 Subject: [PATCH 47/89] small fix --- neusomatic/python/generate_dataset.py | 27 ++-- neusomatic/python/preprocess.py | 200 +++++++++++++++++--------- 2 files changed, 152 insertions(+), 75 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 62649ac..53f8ca8 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1405,6 +1405,9 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] + if is_extend and custom_header: + expected_features = list( + filter(lambda x: x not in callers_features, expected_features)) n_vars = 0 all_headers = set([]) for ensemble_tsv in ensemble_tsvs: @@ -1414,8 +1417,10 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea all_headers.add(line) header_pos = line.strip().split()[0:5] header_ = line.strip().split()[5:] - if not custom_header: - if is_extend: + if custom_header and not is_extend: + order_header = range(len(header_)) + else: + if is_extend and not custom_header: header_ += callers_features header_en = list(filter( lambda x: x[1] in expected_features, enumerate(header_))) @@ -1431,8 +1436,6 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea order_header = [] for f in expected_features: order_header.append(header_en[header.index(f)][0]) - else: - 
order_header=range(len(header_)) continue fields = line.strip().split() fields[2] = str(int(fields[1]) + len(fields[3])) @@ -1442,9 +1445,10 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea features += ["0"] * len(callers_features) features = list(map(lambda x: float( x.replace("False", "0").replace("True", "1")), features)) - if custom_header: - if min(features)<0 or max(features)>1: - logger.info("In --ensemble_custom_header mode, feature values in ensemble.tsv should be normalized in [0,1]" ) + if custom_header and not is_extend: + if min(features) < 0 or max(features) > 1: + logger.info( + "In --ensemble_custom_header mode, feature values in ensemble.tsv should be normalized in [0,1]") raise Exception ensemble_data.append(features) n_vars += 1 @@ -1454,7 +1458,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea ensemble_data = np.array(ensemble_data)[:, order_header] header = np.array(header_)[order_header].tolist() - if not custom_header: + if not custom_header or is_extend: cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", @@ -1537,7 +1541,8 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea min_max_features.append([Seq_Complexity_, 0, 40]) selected_features = sorted([i for f in min_max_features for i in f[0]]) - selected_features_tags = list(map(lambda x: header[x], selected_features)) + selected_features_tags = list( + map(lambda x: header[x], selected_features)) if n_vars > 0: for i_s, mn, mx in min_max_features: if i_s: @@ -1548,7 +1553,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea ensemble_data = ensemble_data[:, selected_features] ensemble_data = ensemble_data.tolist() else: - ensemble_data = ensemble_data.tolist() + ensemble_data = ensemble_data.tolist() selected_features_tags = header_ with open(ensemble_bed, "w")as f_: f_.write( @@ -1624,7 +1629,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be with open(ensemble_bed) as i_f: x = i_f.readline().strip().split() if x: - num_ens_features = len(x) - 5 + num_ens_features = len(x) - 5 pool = multiprocessing.Pool(num_threads) map_args = [] diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index f69734e..6b3900f 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -79,7 +79,7 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, - matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, + matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, ensemble_custom_header, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size): @@ -236,6 +236,10 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, logger.error("Aborting!") raise Exception( "No normal .bai index file {}".format(normal_bam + ".bai")) + if no_feature_recomp_for_ensemble and ensemble_custom_header: + logger.error("Aborting!") + raise Exception( + "--ensemble_custom_header and --no_feature_recomp_for_ensemble are incompatible") if dbsnp: if dbsnp[-6:] != "vcf.gz": @@ -382,18 +386,18 @@ def 
preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_dataset_split, "extra_features.bed") if not os.path.exists(extra_features_bed) or restart: extract_ensemble(ensemble_tsvs=ex_tsvs, - ensemble_bed=extra_features_bed, - no_seq_complexity=no_seq_complexity, - enforce_header=True, - custom_header=False, + ensemble_bed=extra_features_bed, + no_seq_complexity=no_seq_complexity, + enforce_header=True, + custom_header=ensemble_custom_header, is_extend=True) if ensemble_tsv: merged_features_bed = os.path.join( work_dataset_split, "merged_features.bed") if not os.path.exists(merged_features_bed) or restart: exclude_ens_variants = [] - header_line = "" if no_feature_recomp_for_ensemble: + header_line = "" with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: for line in skip_empty(i_f_1, skip_header=False): if line.startswith("#"): @@ -425,68 +429,136 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, continue o_f.write(line) else: - callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", - "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS", - "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", - "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] - with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: - ens_variants_info = {} - header_1_found = False - header_2_found = False - for line in skip_empty(i_f_1, skip_header=False): - if line.startswith("#"): - if not header_line: - header_line = line - else: + if not ensemble_custom_header: + header_line = "" + callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", + "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS", + "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", + "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] + with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: + ens_variants_info = {} + header_1_found = False + header_2_found = False + for line in skip_empty(i_f_1, skip_header=False): + if line.startswith("#"): + if not header_line: + header_line = line + else: + if header_line != line: + logger.error( + "{}!={}".format(header_line, line)) + raise Exception + header_ = line.strip().split()[5:] + header_caller = list(filter( + lambda x: x[1] in callers_features, enumerate(header_))) + header_caller_ = list( + map(lambda x: x[1], header_caller)) + header_i = list( + map(lambda x: x[0], header_caller)) + header_1_found = True + continue + assert header_1_found + fields = line.strip().split("\t") + chrom, pos, _, ref, alt = fields[0:5] + var_id = "-".join([chrom, + pos, ref, alt]) + ens_variants_info[var_id] = np.array(fields[5:])[ + header_i] + for line in skip_empty(i_f_2, skip_header=False): + if line.startswith("#"): if header_line != line: logger.error( "{}!={}".format(header_line, line)) - raise Exception - header_ = line.strip().split()[5:] - header_caller = list(filter( - lambda x: x[1] in callers_features, enumerate(header_))) - header_caller_ = list( - map(lambda x: x[1], header_caller)) - header_i = list( - map(lambda x: x[0], header_caller)) - header_1_found = True - continue - assert header_1_found - fields = 
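
Both merge paths above key feature rows by a "chrom-pos-ref-alt" string so the two BEDs can be joined without assuming identical row order. In miniature:

import numpy as np

fields = ["chr1", "100", "101", "A", "T", "0.3", "0.7"]
chrom, pos, _, ref, alt = fields[0:5]
var_id = "-".join([chrom, pos, ref, alt])
assert var_id == "chr1-100-A-T"
ens_variants_info = {var_id: np.array(fields[5:])}
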
line.strip().split("\t") - chrom, pos, _, ref, alt = fields[0:5] - var_id = "-".join([chrom, pos, ref, alt]) - ens_variants_info[var_id] = np.array(fields[5:])[ - header_i] - for line in skip_empty(i_f_2, skip_header=False): - if line.startswith("#"): - if header_line != line: - logger.error( - "{}!={}".format(header_line, line)) - if not header_2_found: - header_2 = line.strip().split()[5:] - logger.info(header_2) - order_header = [] - for f in header_caller_: - if f not in header_2: - logger.info( - "Missing header field {}".format(f)) + if not header_2_found: + header_2 = line.strip().split()[ + 5:] + order_header = [] + for f in header_caller_: + if f not in header_2: + logger.info( + "Missing header field {}".format(f)) + raise Exception + order_header.append( + header_2.index(f)) + o_f.write(line) + header_2_found = True + + assert header_2_found + fields = line.strip().split("\t") + chrom, pos, _, ref, alt = fields[0:5] + var_id = "-".join([chrom, + pos, ref, alt]) + if var_id in ens_variants_info: + fields_ = np.array(fields[5:]) + fields_[order_header] = ens_variants_info[ + var_id] + fields[5:] = fields_.tolist() + o_f.write( + "\t".join(list(map(str, fields))) + "\n") + else: + header_line_1 = "" + header_line_2 = "" + with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: + ens_variants_info = {} + ex_variants_info = {} + header_1_found = False + header_2_found = False + for line in skip_empty(i_f_1, skip_header=False): + if line.startswith("#"): + if not header_line_1: + header_line_1 = line + else: + if header_line_1 != line: + logger.error( + "{}!={}".format(header_line_1, line)) raise Exception - order_header.append( - header_2.index(f)) - o_f.write(line) - header_2_found = True - - assert header_2_found - fields = line.strip().split("\t") - chrom, pos, _, ref, alt = fields[0:5] - var_id = "-".join([chrom, pos, ref, alt]) - if var_id in ens_variants_info: - fields_ = np.array(fields[5:]) - fields_[order_header] = ens_variants_info[ - var_id] - fields[5:] = fields_.tolist() + header_1 = line.strip().split()[5:] + header_1_found = True + continue + assert header_1_found + fields = line.strip().split("\t") + chrom, pos, _, ref, alt = fields[0:5] + var_id = "-".join([chrom, + pos, ref, alt]) + ens_variants_info[ + var_id] = np.array(fields[5:]) + for line in skip_empty(i_f_2, skip_header=False): + if line.startswith("#"): + if not header_line_2: + header_line_2 = line + else: + if header_line_2 != line: + logger.error( + "{}!={}".format(header_line_2, line)) + raise Exception + header_2 = line.strip().split()[5:] + header_2_found = True + continue + assert header_2_found + fields = line.strip().split("\t") + chrom, pos, _, ref, alt = fields[0:5] + var_id = "-".join([chrom, + pos, ref, alt]) + ex_variants_info[ + var_id] = np.array(fields[5:]) + header_mixed = [ + "#CHROM", "POS", "ID", "REF", "ALT"] + header_1 + header_2 o_f.write( - "\t".join(list(map(str, fields))) + "\n") + "\t".join(list(map(str, header_mixed))) + "\n") + for var_id in set(ens_variants_info.keys()) | set(ex_variants_info.keys()): + features = [0.0] * \ + (len(header_1) + len(header_2)) + if var_id in ens_variants_info: + features[0:len(header_1)] = ens_variants_info[ + var_id] + if var_id in ex_variants_info: + features[len(header_1):] = ex_variants_info[ + var_id] + chrom = "-".join(var_id.split("-") + [:-3]) + pos, ref, alt = var_id.split("-")[-3:] + o_f.write( + "\t".join(list(map(str, [chrom, pos, int(pos) + len(ref), ref, alt] + features))) + 
"\n") ensemble_bed_i = merged_features_bed else: ensemble_bed_i = extra_features_bed @@ -494,7 +566,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf, candidates_split_region, tumor_count, normal_count, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, - ensemble_bed_i, + ensemble_bed_i, ensemble_custom_header, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) From 7deb7d6e963607eb920b78837d55e7af7f48954a Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 23 May 2020 16:20:40 -0700 Subject: [PATCH 48/89] small fix --- neusomatic/python/generate_dataset.py | 2 ++ neusomatic/python/resolve_variants.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 53f8ca8..042333e 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1849,6 +1849,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be ensemble_bed = args.ensemble_bed no_seq_complexity = args.no_seq_complexity tsv_batch_size = args.tsv_batch_size + ensemble_custom_header = args.ensemble_custom_header + enforce_header = args.enforce_header try: generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index 0a672c5..c2dbf09 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -61,7 +61,7 @@ def extract_ins(record): continue if C == CIGAR_INS: inss.append([record.reference_name, pos, pos + 1, - record.query[seq_pos:seq_pos + L]]) + record.seq[seq_pos:seq_pos + L]]) seq_pos += L else: if C != CIGAR_DEL: From 7897df8dc0e84c98158a2c1bc29d704aa83e23d5 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 28 May 2020 08:53:33 -0700 Subject: [PATCH 49/89] small fix --- neusomatic/python/split_bed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/split_bed.py b/neusomatic/python/split_bed.py index b9bb1b7..e1be65f 100755 --- a/neusomatic/python/split_bed.py +++ b/neusomatic/python/split_bed.py @@ -37,7 +37,7 @@ def split_region(work, region_bed_file, num_splits, max_region=1000000, min_regi shuffle(intervals) total_len = sum(map(lambda x: int(x[2]) - int(x[1]) + 1, intervals)) logger.info("Total length: {}".format(total_len)) - split_len = total_len // num_splits + split_len = max(total_len // num_splits, min_region) split_regions = [] current_regions = [] sofar_len = 0 From 8ac67a1885c801e9cd1ade685c1a537c96d86f5c Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 30 May 2020 21:57:57 -0700 Subject: [PATCH 50/89] fix ann --- neusomatic/python/generate_dataset.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 042333e..bf808e2 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -975,9 +975,11 @@ def find_records(input_record): r_ = [[chrom, pos, ref, alt]] ann = [0] * num_ens_features + var_match = False if pos == ens_pos: if ref == ens_ref and alt == ens_alt: ann = record_[15:] + var_match = True elif (len(ref) > len(alt) and len(ens_ref) > len(ens_alt) and (alt) == (ens_alt)): if ((len(ref) > len(ens_ref) and 
ref[0:len(ens_ref)] == ens_ref) or ( @@ -990,14 +992,18 @@ def find_records(input_record): ann = record_[15:] if ann: ann = list(map(float, ann)) - rrs.append([r_, ann]) + rrs.append([r_, ann, var_match]) + has_var_match = sum(map(lambda x: x[2], rrs)) + if has_var_match: + rrs = list( + filter(lambda x: x[2], rrs))[0:1] max_ann = max(map(lambda x: sum(x[1]), rrs)) if max_ann > 0: rrs = list( filter(lambda x: sum(x[1]) > 0, rrs)) elif max_ann == 0: rrs = rrs[0:1] - for r_, ann in rrs: + for r_, ann, _ in rrs: for rr in r_: records.append(rr + [str(i)]) anns[i] = ann @@ -1023,9 +1029,11 @@ def find_records(input_record): r_ = [[chrom, pos, ref, alt]] ann = [0] * num_ens_features + var_match = False if pos == ens_pos: if ref == ens_ref and alt == ens_alt: ann = record_[15:] + var_match = True elif (len(ref) > len(alt) and len(ens_ref) > len(ens_alt) and (alt) == (ens_alt)): if ((len(ref) > len(ens_ref) and ref[0:len(ens_ref)] == ens_ref) or ( @@ -1038,13 +1046,17 @@ def find_records(input_record): ann = record_[15:] if ann: ann = list(map(float, ann)) - rrs.append([r_, ann]) + rrs.append([r_, ann, var_match]) + has_var_match = sum(map(lambda x: x[2], rrs)) + if has_var_match: + rrs = list( + filter(lambda x: x[2], rrs))[0:1] max_ann = max(map(lambda x: sum(x[1]), rrs)) if max_ann > 0: rrs = list(filter(lambda x: sum(x[1]) > 0, rrs)) elif max_ann == 0: rrs = rrs[0:1] - for r_, ann in rrs: + for r_, ann, _ in rrs: for rr in r_: records.append(rr + [str(i)]) anns[i] = ann From 607abcdebf9943839a292188cfaf4b879ee6c4b4 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 5 Jun 2020 14:36:01 -0700 Subject: [PATCH 51/89] small fix --- neusomatic/python/long_read_indelrealign.py | 1 - neusomatic/python/postprocess.py | 9 ++++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/neusomatic/python/long_read_indelrealign.py b/neusomatic/python/long_read_indelrealign.py index 4a74eda..53c1635 100755 --- a/neusomatic/python/long_read_indelrealign.py +++ b/neusomatic/python/long_read_indelrealign.py @@ -1069,7 +1069,6 @@ def run_realignment(input_record): num_add_before = min(40, pos - 1) before = ref_fasta.fetch( region.chrom, pos - num_add_before, pos - 1).upper() - print(before) pos -= num_add_before - 1 ref = before + ref alt = before + alt diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index b62f9e7..e549a33 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -21,7 +21,7 @@ from extract_postprocess_targets import extract_postprocess_targets from merge_post_vcfs import merge_post_vcfs from resolve_variants import resolve_variants -from utils import concatenate_files, get_chromosomes_order, bedtools_window, run_bedtools_cmd, skip_empty +from utils import concatenate_files, get_chromosomes_order, bedtools_window, bedtools_intersect, skip_empty from long_read_indelrealign import long_read_indelrealign from resolve_scores import resolve_scores from _version import __version__ @@ -196,7 +196,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense tempfile.tempdir = bed_tempdir candidates_preds = os.path.join(work, "candidates_preds.vcf") - ensembled_preds = os.path.join(work, "ensembled_preds.vcf") + ensembled_preds = os.path.join(work, "ensemble_preds.vcf") bedtools_window( pred_vcf_file, candidates_vcf, args=" -w 5 -v", output_fn=ensembled_preds, run_logger=logger) @@ -242,9 +242,8 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense not_resolved_vcf = 
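
The var_match bookkeeping above changes annotation selection to prefer an exact chrom/pos/ref/alt hit: only when no ensemble row matches exactly does the overlap-based fallback apply. The same priority rule in miniature:

def pick_annotations(rrs):
    # rrs: list of [records, ann, var_match] triples as built in find_records
    if any(var_match for _, _, var_match in rrs):
        return [rr for rr in rrs if rr[2]][0:1]        # exact match wins
    max_ann = max(sum(ann) for _, ann, _ in rrs)
    if max_ann > 0:
        return [rr for rr in rrs if sum(rr[1]) > 0]    # annotated overlaps
    return rrs[0:1]                                    # nothing annotated: keep one
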
os.path.join( work, "candidates_preds.not_ra_resolved.vcf") - cmd = "bedtools intersect -a {} -b {} -u".format( - target_vcf, not_resolved_bed) - run_bedtools_cmd(cmd, output_fn=not_resolved_vcf, run_logger=logger) + bedtools_intersect(target_vcf, not_resolved_bed, args=" -u ", + output_fn=not_resolved_vcf, run_logger=logger) all_no_resolve = concatenate_files( [no_resolve, ensembled_preds, not_resolved_vcf], os.path.join(work, "no_resolve.vcf")) From 867ba5f37ac7429b770fc6014f14c119f1474c38 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 17 Jun 2020 21:59:53 -0700 Subject: [PATCH 52/89] added callers vcf to tsv --- neusomatic/python/extend_features.py | 13 +- neusomatic/python/generate_dataset.py | 20 +- neusomatic/python/preprocess.py | 18 +- neusomatic/python/read_callers_vcf.py | 477 +++++++++++++++++++++++ neusomatic/python/sequencing_features.py | 12 +- 5 files changed, 528 insertions(+), 12 deletions(-) create mode 100755 neusomatic/python/read_callers_vcf.py diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index b1606b5..bf4c118 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -48,6 +48,14 @@ def extract_features(candidate_record): sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, tBamFeatures.nalt) + try: + score_varscan2 = genome.p2phred(sequencing_features.fisher_exact_test( + ((tBamFeatures.nalt, nBamFeatures.nalt), + (tBamFeatures.nref, nBamFeatures.nref)), + alternative='greater')) + except ValueError: + score_varscan2 = nan + homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( ref_fa, my_coordinate, ref, alt) @@ -136,6 +144,7 @@ def extract_features(candidate_record): SOR = sor MaxHomopolymer_Length = homopolymer_length SiteHomopolymer_Length = site_homopolymer_length + score_varscan2 = rescale(score_varscan2, 'phred', p_scale, 1001) T_DP = tBamFeatures.dp tBAM_REF_MQ = '%g' % tBamFeatures.ref_mq tBAM_ALT_MQ = '%g' % tBamFeatures.alt_mq @@ -185,7 +194,7 @@ def extract_features(candidate_record): nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, - MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, + MaxHomopolymer_Length, SiteHomopolymer_Length, score_varscan2, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, @@ -364,7 +373,7 @@ def extend_features(candidates_vcf, "nBAM_Z_Ranksums_EndPos", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET", "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp", "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp", "SOR", - "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ", "tBAM_Z_Ranksums_MQ", + "MaxHomopolymer_Length", "SiteHomopolymer_Length", "VarScan2_Score", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ", "tBAM_Z_Ranksums_MQ", "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_NM_Diff", 
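
The VarScan2_Score added above phred-scales a one-sided Fisher exact p-value on the tumor/normal x alt/ref contingency table. A sketch using scipy directly (the patch itself goes through the repo's own fisher_exact_test and genome.p2phred helpers):

import math
from scipy.stats import fisher_exact

t_alt, n_alt, t_ref, n_ref = 20, 1, 30, 49
_, p = fisher_exact([[t_alt, n_alt], [t_ref, n_ref]], alternative='greater')
score_varscan2 = -10 * math.log10(p) if p > 0 else 100.0   # phred scale, capped
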
"tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", "tBAM_Concordance_FET", "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET", diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index bf808e2..2991274 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1378,6 +1378,7 @@ def find_records(input_record): def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_header, custom_header, + zero_vscore, is_extend): logger = logging.getLogger(extract_ensemble.__name__) ensemble_data = [] @@ -1414,7 +1415,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea "InDel_Length"] callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS", - "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", + "Strelka_TQSS", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] if is_extend and custom_header: @@ -1527,6 +1528,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea Seq_Complexity_ = list(map(lambda x: x[0], filter( lambda x: x[1] in ["Seq_Complexity_Span", "Seq_Complexity_Adj"], enumerate(header)))) + max_varscan2_score = 0 if zero_vscore else 60 min_max_features = [[cov_features, 0, 2 * COV], [mq_features, 0, 70], [bq_features, 0, 41], @@ -1536,7 +1538,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea [stralka_scor, 0, 40], [stralka_qss, 0, 200], [stralka_tqss, 0, 4], - [varscan2_score, 0, 60], + [varscan2_score, 0, max_varscan2_score], [vardict_score, 0, 120], [m2_lod, 0, 100], [sniper_score, 0, 120], @@ -1579,7 +1581,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv, ensemble_bed, ensemble_custom_header, - no_seq_complexity, enforce_header, tsv_batch_size): + no_seq_complexity, enforce_header, + zero_vscore, + tsv_batch_size): logger = logging.getLogger(generate_dataset.__name__) logger.info("---------------------Generate Dataset----------------------") @@ -1609,6 +1613,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be extract_ensemble(ensemble_tsvs=[ensemble_tsv], ensemble_bed=ensemble_bed, no_seq_complexity=no_seq_complexity, enforce_header=enforce_header, custom_header=ensemble_custom_header, + zero_vscore=zero_vscore, is_extend=False) tmp_ = bedtools_intersect( @@ -1841,6 +1846,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be parser.add_argument('--enforce_header', help='Enforce header match for ensemble_tsv', action="store_true") + parser.add_argument('--zero_vscore', + help='set VarScan2_Score to zero', + action="store_true") args = parser.parse_args() logger.info(args) @@ -1863,13 +1871,15 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be tsv_batch_size = args.tsv_batch_size ensemble_custom_header = args.ensemble_custom_header enforce_header = args.enforce_header - + zero_vscore = zero_vscore try: generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col, 
min_cov, num_threads, ensemble_tsv, ensemble_bed, ensemble_custom_header, - no_seq_complexity, enforce_header, tsv_batch_size) + no_seq_complexity, enforce_header, + zero_vscore, + tsv_batch_size) except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 6b3900f..fbbe1ff 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -82,13 +82,16 @@ def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, regi matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, ensemble_custom_header, no_seq_complexity, - no_feature_recomp_for_ensemble, tsv_batch_size): + no_feature_recomp_for_ensemble, + zero_vscore, + tsv_batch_size): logger = logging.getLogger(generate_dataset_region.__name__) generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, None, ensemble_bed, ensemble_custom_header, no_seq_complexity, no_feature_recomp_for_ensemble, + zero_vscore, tsv_batch_size) @@ -251,6 +254,10 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, raise Exception( "The dbSNP file should be a tabix indexed file with .vcf.gz format. No {}.tbi file exists.".format(dbsnp)) + zero_vscore = False + if not ensemble_tsv and add_extra_features: + zero_vscore = True + ensemble_bed = None if ensemble_tsv: ensemble_bed = os.path.join(work, "ensemble.bed") @@ -259,7 +266,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, extract_ensemble(ensemble_tsvs=[ensemble_tsv], ensemble_bed=ensemble_bed, no_seq_complexity=no_seq_complexity, enforce_header=no_feature_recomp_for_ensemble, custom_header=ensemble_custom_header, + zero_vscore=zero_vscore, is_extend=False) + merge_d_for_short_read = 100 candidates_split_regions = [] ensemble_beds = [] @@ -390,6 +399,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, no_seq_complexity=no_seq_complexity, enforce_header=True, custom_header=ensemble_custom_header, + zero_vscore=zero_vscore, is_extend=True) if ensemble_tsv: merged_features_bed = os.path.join( @@ -433,7 +443,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, header_line = "" callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS", - "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", + "Strelka_TQSS", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: ens_variants_info = {} @@ -568,7 +578,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, ensemble_bed_i, ensemble_custom_header, - no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) + no_seq_complexity, no_feature_recomp_for_ensemble, + zero_vscore, + tsv_batch_size) shutil.rmtree(bed_tempdir) tempfile.tempdir = original_tempdir diff --git a/neusomatic/python/read_callers_vcf.py b/neusomatic/python/read_callers_vcf.py new file mode 100755 index 
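
zero_vscore above marks the case where extended features are computed without any ensemble TSV, so the VarScan2_Score column carries no real caller output; the two-statement form is equivalent to a single boolean expression, and downstream it selects the scaling range for the score:

zero_vscore = bool(not ensemble_tsv and add_extra_features)
max_varscan2_score = 0 if zero_vscore else 60   # as used in generate_dataset.py
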
0000000..997270e --- /dev/null +++ b/neusomatic/python/read_callers_vcf.py @@ -0,0 +1,477 @@ +#!/usr/bin/env python +#------------------------------------------------------------------------- +# read_callers_vcf.py +# read callers vcf files and generate ensemble tsv +#------------------------------------------------------------------------- +import argparse +import traceback +import logging +import re +import gzip + +import genomic_file_handlers as genome +from read_info_extractor import rescale +from utils import skip_empty, get_chromosomes_order + +import numpy as np + +# Normal/Tumor index in the Merged VCF file, or any other VCF file that +# puts NORMAL first. +idxN, idxT = 0, 1 +nan = float('nan') + + +def get_info_value(info_field, variable, ith_alt=None): + logger = logging.getLogger(get_info_value.__name__) + key_item = re.search( + r'\b{}=([^;\s]+)([;\W]|$)'.format(variable), info_field) + + # The key has a value attached to it, e.g., VAR=1,2,3 + if key_item: + if ith_alt is None: + return key_item.groups()[0] + else: + return key_item.groups()[0].split(",")[ith_alt] + + # Perhaps it's simply a flag without "=" + else: + key_item = info_field.split(';') + return True if variable in key_item else False + + +def get_sample_value(fields, samples, variable, idx=0): + + var2value = dict(zip(fields.split(':'), samples[idx].split(':'))) + try: + return var2value[variable] + except KeyError: + return None + + +def get_mutect2_info(filters, info, ith_alt): + + mutect_classification = 1 if (get_info_value(info, + 'SOMATIC') or 'PASS' in filters) else 0 + + # MuTect2 has some useful information: + nlod = get_info_value(info, 'NLOD', ith_alt) + nlod = float(nlod) if nlod else nan + + tlod = get_info_value(info, 'TLOD', ith_alt) + tlod = float(tlod) if tlod else nan + + tandem = 1 if get_info_value(info, 'STR') else 0 + + ecnt = get_info_value(info, 'ECNT') + if ecnt: + try: + ecnt = int(ecnt) + except ValueError: + ecnt = nan + else: + ecnt = nan + return mutect_classification, nlod, tlod, tandem, ecnt + + +def get_varscan2_info(info): + varscan_classification = 1 if get_info_value(info, + 'SOMATIC') else 0 + return varscan_classification + + +def get_somaticsniper_info(fields, samples, idxT): + somaticsniper_classification = 1 if get_sample_value(fields, samples, + 'SS', idxT) == '2' else 0 + if somaticsniper_classification == 1: + score_somaticsniper = get_sample_value(fields, samples, + 'SSC', idxT) + score_somaticsniper = int( + score_somaticsniper) if score_somaticsniper else nan + else: + score_somaticsniper = nan + + return somaticsniper_classification, score_somaticsniper + + +def get_vardict_info(filters, info, fields, samples): + + if (filters == 'PASS') and ('Somatic' in info): + vardict_classification = 1 + elif 'Somatic' in info: + vardict_filters = filters.split(';') + + disqualifying_filters = \ + ('d7' in vardict_filters or 'd5' in vardict_filters) or \ + ('DIFF0.2' in vardict_filters) or \ + ('LongAT' in vardict_filters) or \ + ('MAF0.05' in vardict_filters) or \ + ('MSI6' in vardict_filters) or \ + ('NM4' in vardict_filters or 'NM4.25' in vardict_filters) or \ + ('pSTD' in vardict_filters) or \ + ('SN1.5' in vardict_filters) or \ + ( 'P0.05' in vardict_filters and float(get_info_value(info, 'SSF') ) >= 0.15 ) or \ + (('v3' in vardict_filters or 'v4' in vardict_filters) + and int(get_sample_value(fields, samples, 'VD', 1)) < 3) + + no_bad_filter = not disqualifying_filters + filter_fail_times = len(vardict_filters) + + if no_bad_filter and filter_fail_times <= 2: + 
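# Usage sketch for get_info_value() above, traced through its regex/flag
# logic (INFO strings are illustrative, not from a real VCF):
#     get_info_value("NLOD=3.50;STR;ECNT=2", "NLOD")         -> "3.50"
#     get_info_value("NLOD=3.50;STR;ECNT=2", "STR")          -> True   (bare flag)
#     get_info_value("TLOD=4.1,7.9", "TLOD", ith_alt=1)      -> "7.9"  (per-ALT value)
#     get_info_value("NLOD=3.50", "TLOD")                    -> False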
vardict_classification = 0.5 + else: + vardict_classification = 0 + + else: + vardict_classification = 0 + + # Somatic Score: + score_vardict = get_info_value(info, 'SSF') + if score_vardict: + score_vardict = float(score_vardict) + score_vardict = genome.p2phred(score_vardict, max_phred=100) + score_vardict = rescale(score_vardict, 'phred', None, 1001) + else: + score_vardict = nan + + # MSI, MSILEN, and SHIFT3: + msi = get_info_value(info, 'MSI') + if msi: + msi = float(msi) + else: + msi = nan + msilen = get_info_value(info, 'MSILEN') + if msilen: + msilen = float(msilen) + else: + msilen = nan + shift3 = get_info_value(info, 'SHIFT3') + if shift3: + shift3 = float(shift3) + else: + shift3 = nan + + return vardict_classification, msi, msilen, shift3, score_vardict + + +def get_muse_info(filters): + if filters == 'PASS': + muse_classification = 1 + elif filters == 'Tier1': + muse_classification = 0.9 + elif filters == 'Tier2': + muse_classification = 0.8 + elif filters == 'Tier3': + muse_classification = 0.7 + elif filters == 'Tier4': + muse_classification = 0.6 + elif filters == 'Tier5': + muse_classification = 0.5 + else: + muse_classification = 0 + return muse_classification + + +def get_strelka2_info(filters, info): + strelka_classification = 1 if 'PASS' in filters else 0 + somatic_evs = get_info_value(info, 'SomaticEVS') + qss = get_info_value(info, 'QSS') + tqss = get_info_value(info, 'TQSS') + return strelka_classification, somatic_evs, qss, tqss + + +def open_textfile(file_name): + + # See if the input file is a .gz file: + if file_name.lower().endswith('.gz'): + return gzip.open(file_name, 'rt') + + else: + return open(file_name) + + +def read_callers_vcf(reference, + output_tsv, + mutect2_vcfs, + strelka2_vcfs, + varscan2_vcfs, + muse_vcfs, + vardict_vcfs, + somaticsniper_vcfs, + min_caller): + + logger = logging.getLogger(read_callers_vcf.__name__) + + logger.info( + "----------------------Read Callers VCF------------------------") + + mutect2_info = {} + if mutect2_vcfs: + for mutect2_vcf in mutect2_vcfs: + i_f = open_textfile(mutect2_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info = x[0:8] + for ith_alt, alt in enumerate(alts.split(",")): + if ref != alt: + mutect_classification, nlod, tlod, tandem, ecnt = get_mutect2_info( + filters, info, ith_alt) + var_id = "-".join([chrom, pos, ref, alt]) + mutect2_info[var_id] = [ + mutect_classification, nlod, tlod, tandem, ecnt] + i_f.close() + strelka2_info = {} + if strelka2_vcfs: + for strelka2_vcf in strelka2_vcfs: + i_f = open_textfile(strelka2_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info = x[0:8] + strelka_classification, somatic_evs, qss, tqss = get_strelka2_info( + filters, info) + for alt in alts.split(","): + if ref != alt: + var_id = "-".join([chrom, pos, ref, alt]) + strelka2_info[var_id] = [ + strelka_classification, somatic_evs, qss, tqss] + i_f.close() + vardict_info = {} + if vardict_vcfs: + for vardict_vcf in vardict_vcfs: + i_f = open_textfile(vardict_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info, fields = x[0:9] + samples = x[9:] + + # In the REF/ALT field, non-GCTA characters should be + # changed to N to fit the VCF standard: + ref = re.sub(r'[^GCTA]', 'N', ref, flags=re.I) + alts = re.sub(r'[^GCTA]', 'N', alts, flags=re.I) + + vardict_classification, msi, msilen, shift3, score_vardict = get_vardict_info( + filters, info, fields, 
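# Score-path sketch for score_vardict above (assuming the conventional Phred
# transform genome.p2phred(p) = -10 * log10(p), capped at max_phred): a
# VarDict SSF (somatic p-value) of 0.01 becomes Phred 20, and 1e-12 would cap
# at 100 before rescale() maps it onto the 'phred' output scale.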
samples) + for alt in alts.split(","): + if ref != alt: + if 'TYPE=SNV' in info or 'TYPE=Deletion' in info or 'TYPE=Insertion' in info: + var_id = "-".join([chrom, pos, ref, alt]) + vardict_info[var_id] = [ + vardict_classification, msi, msilen, shift3, score_vardict] + elif 'TYPE=Complex' in info and (len(ref) == len(alt)): + for i, (ref_i, alt_i) in enumerate(zip(ref, alt)): + if ref_i != alt_i: + var_id = "-".join([chrom, + str(int(pos) + i), ref_i, alt_i]) + vardict_info[var_id] = [ + vardict_classification, msi, msilen, shift3, score_vardict] + i_f.close() + varscan2_info = {} + if varscan2_vcfs: + for varscan2_vcf in varscan2_vcfs: + i_f = open_textfile(varscan2_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info = x[0:8] + varscan_classification = get_varscan2_info(info) + + # Replace the wrong "G/A" with the correct "G,A" in ALT + # column: + alts = alts.replace('/', ',') + + # multiple sequences in the REF, as is the case in + # VarScan2's indel output: + ref = re.sub(r'[^\w].*$', '', ref) + + # Get rid of non-compliant characters in the ALT column: + alts = re.sub(r'[^\w,.]', '', alts) + + # Eliminate duplicate entries in ALT: + alts = re.sub(r'(\w+),\1', r'\1', alts) + + # VarScan2 may output a line with the REF allele as "M" + if re.search(r'[^GCTAU]', ref, re.I): + continue + + for alt in alts.split(","): + if ref != alt: + var_id = "-".join([chrom, pos, ref, alt]) + varscan2_info[var_id] = varscan_classification + i_f.close() + + muse_info = {} + if muse_vcfs: + for muse_vcf in muse_vcfs: + i_f = open_textfile(muse_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info = x[0:8] + muse_classification = get_muse_info(filters) + for alt in alts.split(","): + if ref != alt: + var_id = "-".join([chrom, pos, ref, alt]) + muse_info[var_id] = muse_classification + i_f.close() + + somaticsniper_info = {} + if somaticsniper_vcfs: + for somaticsniper_vcf in somaticsniper_vcfs: + i_f = open_textfile(somaticsniper_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info, fields = x[0:9] + samples = x[9:] + ref = re.sub(r'[^GCTA]', 'N', ref, flags=re.I) + somaticsniper_classification, score_somaticsniper = get_somaticsniper_info( + fields, samples, idxT) + for alt in alts.split(","): + if ref != alt: + var_id = "-".join([chrom, pos, ref, alt]) + somaticsniper_info[var_id] = [ + somaticsniper_classification, score_somaticsniper] + i_f.close() + + features = {} + for var_id in (set(mutect2_info.keys()) | set(strelka2_info.keys()) | set(vardict_info.keys()) | + set(varscan2_info.keys()) | set(somaticsniper_info.keys()) | set(muse_info.keys())): + num_callers = 0 + if var_id in mutect2_info: + mutect_classification, nlod, tlod, tandem, ecnt = mutect2_info[ + var_id] + num_callers += mutect_classification + else: + mutect_classification = 0 + nlod = tlod = tandem = ecnt = nan + + if var_id in strelka2_info: + strelka_classification, somatic_evs, qss, tqss = strelka2_info[ + var_id] + num_callers += strelka_classification + else: + strelka_classification = 0 + somatic_evs = qss = tqss = nan + + if var_id in vardict_info: + vardict_classification, msi, msilen, shift3, score_vardict = vardict_info[ + var_id] + num_callers += vardict_classification + else: + vardict_classification = 0 + msi = msilen = shift3 = score_vardict = nan + + if var_id in varscan2_info: + varscan_classification = varscan2_info[var_id] + num_callers += varscan_classification +
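# Cleanup examples for the VarScan2 REF/ALT normalization above (each line
# traced through the corresponding replace()/re.sub(); inputs illustrative):
#     "G/A".replace('/', ',')              -> "G,A"
#     re.sub(r'(\w+),\1', r'\1', "GT,GT")  -> "GT"   (duplicate ALT removed)
#     re.sub(r'[^\w].*$', '', "TAA/T")     -> "TAA"  (keep first REF token)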
else: + varscan_classification = 0 + + if var_id in muse_info: + muse_classification = muse_info[var_id] + num_callers += muse_classification + else: + muse_classification = 0 + + if var_id in somaticsniper_info: + somaticsniper_classification, score_somaticsniper = somaticsniper_info[ + var_id] + num_callers += somaticsniper_classification + else: + somaticsniper_classification = 0 + score_somaticsniper = nan + + if num_callers >= min_caller: + features[var_id] = [mutect_classification, nlod, tlod, tandem, ecnt, + strelka_classification, somatic_evs, qss, tqss, + vardict_classification, msi, msilen, shift3, score_vardict, + varscan_classification, + muse_classification, + somaticsniper_classification, score_somaticsniper] + + chrom_order = get_chromosomes_order(reference) + ordered_vars = sorted(features.keys(), key=lambda x: [ + chrom_order["-".join(x.split("-")[:-3])], int(x.split("-")[1])]) + n_variants = len(ordered_vars) + logger.info("Number of variants: {}".format(n_variants)) + header = ["CHROM", "POS", "ID", "REF", "ALT", "if_MuTect", "if_VarScan2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", + "if_Strelka", "Strelka_Score", "Strelka_QSS", + "Strelka_TQSS", "Sniper_Score", "VarDict_Score", + "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] + + with open(output_tsv, "w") as o_f: + o_f.write("\t".join(header) + "\n") + for var_id in ordered_vars: + mutect_classification, nlod, tlod, tandem, ecnt, \ + strelka_classification, somatic_evs, qss, tqss, \ + vardict_classification, msi, msilen, shift3, score_vardict, \ + varscan_classification, \ + muse_classification, \ + somaticsniper_classification, score_somaticsniper = features[ + var_id] + + f = [mutect_classification, varscan_classification, somaticsniper_classification, + vardict_classification, muse_classification, strelka_classification, + somatic_evs, qss, tqss, + score_somaticsniper, score_vardict, + nlod, tlod, tandem, ecnt, + msi, msilen, shift3] + chrom = "-".join(var_id.split("-")[:-3]) + pos, ref, alt = var_id.split("-")[-3:] + o_f.write( + "\t".join([chrom, pos, ".", ref, alt] + list(map(lambda x: str(x).replace("nan", "0"), f))) + "\n") + + logger.info("Done Reading Callers' Features.") + return output_tsv + + +if __name__ == '__main__': + FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' + logging.basicConfig(level=logging.INFO, format=FORMAT) + logger = logging.getLogger(__name__) + + parser = argparse.ArgumentParser( + description='read callers VCF files and generate an ensemble TSV') + parser.add_argument('--reference', type=str, help='reference fasta filename', + required=True) + parser.add_argument('--output_tsv', type=str, help='output features tsv', + required=True) + parser.add_argument('--mutect2_vcfs', type=str, nargs="*", + help='MuTect2 VCFs', + default=None) + parser.add_argument('--strelka2_vcfs', type=str, nargs="*", + help='Strelka2 VCFs', + default=None) + parser.add_argument('--varscan2_vcfs', type=str, nargs="*", + help='VarScan2 VCFs', + default=None) + parser.add_argument('--muse_vcfs', type=str, nargs="*", + help='MuSE VCFs', + default=None) + parser.add_argument('--vardict_vcfs', type=str, nargs="*", + help='VarDict VCFs', + default=None) + parser.add_argument('--somaticsniper_vcfs', type=str, nargs="*", + help='SomaticSniper VCFs', + default=None) + parser.add_argument('--min_caller', type=float, + help='Minimum number of callers that must support a call (summed caller classifications)', + default=0.5) + args = parser.parse_args() + logger.info(args) + + try: + output = read_callers_vcf(args.reference, +
args.output_tsv, + args.mutect2_vcfs, + args.strelka2_vcfs, + args.varscan2_vcfs, + args.muse_vcfs, + args.vardict_vcfs, + args.somaticsniper_vcfs, + args.min_caller, + ) + if output is None: + raise Exception("read_callers_vcf failed!") + except Exception as e: + logger.error(traceback.format_exc()) + logger.error("Aborting!") + logger.error( + "read_callers_vcf.py failure on arguments: {}".format(args)) + raise e diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 265d0f8..6aafc1e 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -18,8 +18,16 @@ nan = float('nan') -def fisher_exact_test(mat): - return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).two_tail +def fisher_exact_test(mat, alternative="two-sided"): + if alternative == "two-sided": + return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).two_tail + elif alternative == "greater": + return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).right_tail + elif alternative == "less": + return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).left_tail + else: + logger.error("Wrong fisher_test alternative: {}".format(alternative)) + raise Exception def get_read_pos_for_ref_pos(read, ref_pos_s): From 635341ee55b63cf02dff34659184a66bad6d2569 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 18 Jun 2020 16:09:25 -0700 Subject: [PATCH 53/89] merge regions for scanning --- neusomatic/python/preprocess.py | 11 ++++++++++- neusomatic/python/scan_alignments.py | 24 ++++++++++++++++++------ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index fbbe1ff..ae1586c 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -31,11 +31,12 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, + merge_d_for_scan, scan_alignments_binary, restart, num_splits, num_threads, calc_qual, regions=[]): logger = logging.getLogger(process_split_region.__name__) logger.info("Scan bam.") - scan_outputs = scan_alignments(work, scan_alignments_binary, alignment_bam, + scan_outputs = scan_alignments(work, merge_d_for_scan, scan_alignments_binary, alignment_bam, region, reference, num_splits, num_threads, scan_window_size, scan_maf, min_mapq, max_dp, filter_duplicate, restart=restart, split_region_files=regions, calc_qual=calc_qual) @@ -208,6 +209,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, no_feature_recomp_for_ensemble, window_extend, max_cluster_size, + merge_d_for_scan, num_splits, num_threads, scan_alignments_binary,): @@ -288,6 +290,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, snp_min_af, -10000, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, + merge_d_for_scan, scan_alignments_binary, restart, num_splits, num_threads, calc_qual=False) tumor_counts_without_q, split_regions, filtered_candidates_vcfs_without_q = tumor_outputs_without_q @@ -313,6 +316,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, + merge_d_for_scan, scan_alignments_binary, restart, num_splits, num_threads, calc_qual=True, 
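# Usage sketch for the fisher_exact_test() change to sequencing_features.py
# above: `alternative` selects which tail of the same 2x2 Fisher exact test
# is returned (counts illustrative):
#     mat = [[8, 2], [1, 9]]
#     fisher_exact_test(mat)                         # two-sided p-value
#     fisher_exact_test(mat, alternative="greater")  # right tail
#     fisher_exact_test(mat, alternative="less")     # left tail
# Note the error branch assumes a module-level `logger` exists in that file.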
regions=candidates_split_regions) @@ -340,6 +344,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, + merge_d_for_scan, scan_alignments_binary, restart, num_splits, num_threads, calc_qual=True, regions=candidates_split_regions) @@ -681,6 +686,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--max_cluster_size', type=int, help='max cluster size for extending input features (should be in the order of readlength)', default=300) + parser.add_argument('--merge_d_for_scan', type=int, + help='-d used to merge regions before scan', + default=None) parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, @@ -706,6 +714,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.no_feature_recomp_for_ensemble, args.window_extend, args.max_cluster_size, + args.merge_d_for_scan, args.num_splits, args.num_threads, args.scan_alignments_binary) diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index 5b703a0..b8aaf11 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -24,7 +24,7 @@ def run_scan_alignments(record): - work, reference, scan_alignments_binary, split_region_file, \ + work, reference, merge_d_for_scan, scan_alignments_binary, split_region_file, \ input_bam, window_size, maf, min_mapq, max_dp, filter_duplicate, calc_qual = record if filter_duplicate: @@ -39,10 +39,19 @@ def run_scan_alignments(record): raise IOError("File not found: {}".format(scan_alignments_binary)) if not os.path.exists(work): os.mkdir(work) - if os.path.getsize(split_region_file) > 0: + + if merge_d_for_scan is not None: + split_region_file_=os.path.join(work,"merged_region.bed") + tmp_ = bedtools_sort(split_region_file, run_logger=thread_logger) + bedtools_merge( + tmp_, output_fn=split_region_file_ , args=" -d {}".format(merge_d_for_scan), run_logger=thread_logger) + else: + split_region_file_=split_region_file + + if os.path.getsize(split_region_file_) > 0: cmd = "{} --ref {} -b {} -L {} --out_vcf_file {}/candidates.vcf --out_count_file {}/count.bed \ --window_size {} --min_af {} --min_mapq {} --max_depth {} {}".format( - scan_alignments_binary, reference, input_bam, split_region_file, + scan_alignments_binary, reference, input_bam, split_region_file_, work, work, window_size, maf, min_mapq, max_dp * window_size / 100.0, filter_duplicate_str) if calc_qual: cmd += " --calculate_qual_stat" @@ -69,7 +78,7 @@ def run_scan_alignments(record): return None -def scan_alignments(work, scan_alignments_binary, input_bam, +def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam, regions_bed_file, reference, num_splits, num_threads, window_size, maf, min_mapq, max_dp, filter_duplicate, restart=True, split_region_files=[], calc_qual=True): @@ -137,7 +146,7 @@ def scan_alignments(work, scan_alignments_binary, input_bam, if os.path.exists(work_): shutil.rmtree(work_) map_args.append((os.path.join(work, "work.{}".format(i)), - reference, scan_alignments_binary, split_region_file, + reference, merge_d_for_scan, scan_alignments_binary, split_region_file, input_bam, window_size, maf, min_mapq, max_dp, filter_duplicate, calc_qual)) not_done.append(i) else: @@ -192,6 +201,9 @@ def scan_alignments(work, 
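# Sketch of the merge_d_for_scan pre-merge above, assuming standard
# `bedtools merge -d N` semantics (intervals separated by at most N bp are
# joined). With -d 100:
#     chr1  100  200    ->  chr1  100  450   (gap of 50 <= 100: merged with
#     chr1  250  450                          the next interval)
#     chr1  600  700    ->  chr1  600  700   (gap of 150 > 100: kept separate)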
scan_alignments_binary, input_bam, parser.add_argument('--filter_duplicate', help='filter duplicate reads when preparing pileup information', action="store_true") + parser.add_argument('--merge_d_for_scan', type=int, + help='-d used to merge regions before scan', + default=None) parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, @@ -200,7 +212,7 @@ def scan_alignments(work, scan_alignments_binary, input_bam, logger.info(args) try: - outputs = scan_alignments(args.work, args.scan_alignments_binary, args.input_bam, + outputs = scan_alignments(args.work, args.merge_d_for_scan, args.scan_alignments_binary, args.input_bam, args.regions_bed_file, args.reference, args.num_splits, args.num_threads, args.window_size, args.maf, args.min_mapq, args.max_dp, args.filter_duplicate) From a52d0c21029d03828d9c3ae3c0eea6a72f719471 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 18 Jun 2020 17:02:12 -0700 Subject: [PATCH 54/89] bug fixes for call/post --- neusomatic/python/call.py | 98 ++-- .../python/extract_postprocess_targets.py | 36 +- neusomatic/python/postprocess.py | 6 +- neusomatic/python/resolve_variants.py | 437 ++++++++++++++---- 4 files changed, 435 insertions(+), 142 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index d814a1b..6b5a8b9 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -146,6 +146,7 @@ def pred_vcf_records_path(record): chrom, pos, ref, alt, _, center, _, _, _ = path.split( ".") + ref, alt = ref.upper(), alt.upper() center = int(center) pos = int(pos) @@ -212,61 +213,82 @@ def pred_vcf_records_path(record): for i in nzref_pos: col_2_pos[i] = cnt cnt += 1 + if vartype_candidate == "INS" and anchor[1] == 0 and 0 not in col_2_pos: + col_2_pos[0] = -1 + nzref_pos = np.array([0] + list(nzref_pos)) if anchor[1] not in col_2_pos: - # print "NNN",path,pred - return vcf_record + if I[0, anchor[1], 0] > 0 and vartype_candidate == "INS" and type_pred == "INS": + ins_no_zref_pos = True + else: + # thread_logger.info(["NNN", path, pred]) + return vcf_record + if not ins_no_zref_pos: + b = (anchor[0] - col_2_pos[anchor[1]]) + for i in nzref_pos: + col_2_pos[i] += b + pos_2_col = {v: k for k, v in col_2_pos.items()} - b = (anchor[0] - col_2_pos[anchor[1]]) - for i in nzref_pos: - col_2_pos[i] += b - pos_2_col = {v: k for k, v in col_2_pos.items()} + if type_pred == "SNP" and len(ref) - len(alt) > 1 and abs(center_pred - center) < center_dist_roundback: + thread_logger.info(["TBC", path, nzref_pos]) if abs(center_pred - center) < too_far_center: if type_pred == "SNP": - pos_ = col_2_pos[center_] - ref_ = "" - alt_ = "" - for i in range(len_pred): - nzp = nzref_pos[nzref_pos >= (center_ + i)] - if len(nzp) > 0: - center__ = nzp[np.argmin(abs(nzp - (center_ + i)))] - rb = np.argmax(I[1:, center__, 0]) - ref_ += ACGT[rb] - II = I.copy() - II[rb + 1, center__, 1] = 0 - alt_ += ACGT[np.argmax(II[1:, center__, 1])] - if sum(I[1:, center__, 1]) == 0: - break - if not ref_: - # print "SSS",path,pred - return vcf_record + if abs(center_pred - center) < center_dist_roundback and len_pred == 1 and len(ref) == 1 and len(alt) == 1: + pos_, ref_, alt_ = pos, ref.upper(), alt.upper() + else: + pos_ = col_2_pos[center_] + ref_ = "" + alt_ = "" + for i in range(len_pred): + nzp = nzref_pos[nzref_pos >= (center_ + i)] + if len(nzp) > 0: + center__ = nzp[np.argmin(abs(nzp - (center_ + i)))] + rb = np.argmax(I[1:, center__, 0]) + ref_ += ACGT[rb] + II = I.copy() + 
II[rb + 1, center__, 1] = 0 + if max(II[1:, center__, 1]) == 0 and center__ == center and ref == ref_ and len(alt) == 1: + alt_ = alt + else: + alt_ += ACGT[np.argmax(II[1:, center__, 1])] + if sum(I[1:, center__, 1]) == 0: + break + if not ref_: + # thread_logger.info(["SSS", path, pred]) + return vcf_record elif type_pred == "INS": if ins_no_zref_pos: pos_, ref_, alt_ = pos, ref.upper(), alt.upper() else: - pos_ = -1 + pos_ = -2 i_ = center_ - 1 - for i_ in range(center_ - 1, 0, -1): + for i_ in range(center_ - 1, -2, -1): if i_ in nzref_pos: pos_ = col_2_pos[i_] break - if pos_ == -1: + if pos_ == -2: # print "PPP-1",path,pred return vcf_record - if (sum(I[1:, i_, 1]) == 0): - # path,pred,i_,nzref_pos,col_2_pos,I[1:,i_,1],true_path[path] - return vcf_record - ref_ = ACGT[np.argmax(I[1:, i_, 0])] - alt_ = ref_ + len_pred_=len_pred if len_pred == 3: len_pred = max(len(alt) - len(ref), len_pred) - for i in range(i_ + 1, Iw): - if i in zref_pos: - alt_ += ACGT[np.argmax(I[1:, i, 1])] - else: - break - if (len(alt_) - len(ref_)) >= len_pred: - break + if (sum(I[1:, i_, 1]) == 0): + # thread_logger.info(["PPP-2", path, pred]) + return vcf_record + if len_pred == len(alt) - len(ref) and pos_ == pos: + pos_, ref_, alt_ = pos, ref.upper(), alt.upper() + else: + ref_ = ACGT[np.argmax(I[1:, i_, 0])] + alt_ = ref_ + for i in range(i_ + 1, Iw): + if i in zref_pos: + alt_ += ACGT[np.argmax(I[1:, i, 1])] + else: + break + if (len(alt_) - len(ref_)) >= len_pred: + break + if len_pred_ == 3 and (len(alt_) - len(ref_)) < len_pred and pos_ == pos: + pos_, ref_, alt_ = pos, ref.upper(), alt.upper() elif type_pred == "DEL": pos_ = col_2_pos[center_] - 1 if pos_ not in pos_2_col: diff --git a/neusomatic/python/extract_postprocess_targets.py b/neusomatic/python/extract_postprocess_targets.py index c3dee50..be0089e 100755 --- a/neusomatic/python/extract_postprocess_targets.py +++ b/neusomatic/python/extract_postprocess_targets.py @@ -7,16 +7,20 @@ import argparse import traceback import logging +import pysam from utils import skip_empty from defaults import VCF_HEADER +from resolve_variants import push_left_var -def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): +def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): logger = logging.getLogger(extract_postprocess_targets.__name__) logger.info("--------------Extract Postprocessing Targets---------------") + ref_fasta = pysam.FastaFile(reference) + base_name = ".".join(input_vcf.split(".")[:-1]) out_vcf = "{}.no_resolve.vcf".format(base_name) redo_vcf = "{}.resolve_target.vcf".format(base_name) @@ -37,10 +41,20 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): if not record_set: record_set.append(record) continue - if len(list(filter(lambda x: (chrom == x[0] and (abs(min(x[1] + len(x[2]), pos + len(ref)) - max(x[1], pos)) <= max_dist)), record_set))) > 0: + chrom_, pos_, ref_, alt_ = push_left_var( + ref_fasta, chrom, pos, ref, alt) + if len(list(filter(lambda x: (chrom == x[0] and + (min(abs(x[1] + len(x[2]) - (pos + len(ref))), + abs(x[1] - pos), + abs(min(x[1] + len(x[2]), pos + len(ref)) - max(x[1], pos))) <= max_dist)), record_set))) > 0 or len( + list(filter(lambda x: (chrom_ == x[0] and + (min(abs(x[1] + len(x[2]) - (pos_ + len(ref_))), + abs(x[1] - pos_), + abs(min(x[1] + len(x[2]), pos_ + len(ref_)) - max(x[1], pos_))) <= max_dist)), record_set))) > 0: record_set.append(record) continue + if record_set: record_sets.append(record_set) record_set = [record] @@ -48,7 +62,18 @@ def 
extract_postprocess_targets(input_vcf, min_len, max_dist, pad): for ii, record_set in enumerate(record_sets): if len(record_set) > 1: - if list(filter(lambda x: len(x[2]) != len(x[3]), record_set)): + varid_pos = {} + for chrom, pos, ref, alt, _, _ in record_set: + if pos not in varid_pos: + varid_pos[pos] = set([]) + vid = "-".join([ref, alt]) + varid_pos[pos].add(vid) + multi_allelic = False + for vid in varid_pos: + if len(varid_pos[vid]) > 1: + multi_allelic = True + + if list(filter(lambda x: len(x[2]) != len(x[3]), record_set)) or multi_allelic: for x in record_set: fields = x[-1].strip().split() fields[2] = str(ii) @@ -60,6 +85,7 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): else: for x in record_set: o_f.write(x[-1]) + elif record_set: if abs(len(record_set[0][2]) - len(record_set[0][3])) >= min_len: fields = record_set[0][-1].strip().split() @@ -80,6 +106,8 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): parser = argparse.ArgumentParser( description='infer genotype by ao and ro counts') + parser.add_argument('--reference', type=str, + help='reference fasta filename', required=True) parser.add_argument('--input_vcf', type=str, help='input vcf', required=True) parser.add_argument('--min_len', type=int, @@ -92,7 +120,7 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): logger.info(args) try: extract_postprocess_targets( - args.input_vcf, args.min_len, args.max_dist, args.pad) + args.reference, args.input_vcf, args.min_len, args.max_dist, args.pad) except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index e549a33..53a7dd8 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -110,13 +110,15 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, dp, ro, ao = list(map(int, info[1:4])) af = float(info[4]) is_same = x[1] == x[11] and x[3] == x[13] and x[4] == x[14] + is_same = 0 if is_same else 1 is_same_type = np.sign( len(x[3]) - len(x[13])) == np.sign(len(x[4]) - len(x[14])) + is_same_type = 0 if is_same_type else 1 dist = abs(int(x[1]) - int(x[11])) len_diff = abs( (len(x[3]) - len(x[13])) - (len(x[4]) - len(x[14]))) tags_info[tag].append( - [~is_same, ~is_same_type, dist, len_diff, s_e, dp, ro, ao, af]) + [is_same, is_same_type, dist, len_diff, s_e, dp, ro, ao, af]) fina_info_tag = {} for tag, hits in tags_info.items(): hits = sorted(hits, key=lambda x: x[0:5]) @@ -206,7 +208,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense logger.info("Extract targets") postprocess_pad = 1 if not long_read else 10 extract_postprocess_targets( - candidates_preds, min_len, postprocess_max_dist, postprocess_pad) + reference, candidates_preds, min_len, postprocess_max_dist, postprocess_pad) no_resolve = os.path.join(work, "candidates_preds.no_resolve.vcf") target_vcf = os.path.join(work, "candidates_preds.resolve_target.vcf") diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index c2dbf09..a73325c 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -33,16 +33,25 @@ NUC_to_NUM = {"A": 1, "C": 2, "G": 3, "T": 4, "-": 0, "N": 5} NUM_to_NUC = {1: "A", 2: "C", 3: "G", 4: "T", 0: "-", 5: "N"} +max_indel = 100 + def extract_del(record): logger = logging.getLogger(extract_del.__name__) dels = [] pos = record.pos - for C, L in record.cigartuples: + 
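# Editorial sketch of the extract_del()/extract_ins() change starting here:
# indels adjacent to a leading/trailing soft or hard clip are now skipped and
# indel length is capped at max_indel = 100 bp. For example, with 0-based
# pos = record.pos:
#     CIGAR 5S10M3D20M -> one deletion [reference_name, pos + 10, pos + 13]
#     CIGAR 5S3D30M    -> no deletion (the D directly follows the soft clip)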
cigartuples = record.cigartuples + first_sc = 1 if cigartuples[0][0] in [ + CIGAR_SOFTCLIP, CIGAR_HARDCLIP] else 0 + last_sc = 1 if cigartuples[-1][0] in [CIGAR_SOFTCLIP, + CIGAR_HARDCLIP] else 0 + for i, (C, L) in enumerate(cigartuples): if C in [CIGAR_SOFTCLIP, CIGAR_HARDCLIP, CIGAR_INS]: continue if C == CIGAR_DEL: - dels.append([record.reference_name, pos, pos + L]) + if i > first_sc and i < len(cigartuples) - 1 - last_sc: + L_ = min(L, max_indel) + dels.append([record.reference_name, pos, pos + L_]) pos += L return dels @@ -53,15 +62,25 @@ def extract_ins(record): inss = [] pos = record.pos seq_pos = 0 - for C, L in record.cigartuples: + cigartuples = record.cigartuples + first_sc = 1 if cigartuples[0][0] in [ + CIGAR_SOFTCLIP, CIGAR_HARDCLIP] else 0 + last_sc = 1 if cigartuples[-1][0] in [CIGAR_SOFTCLIP, + CIGAR_HARDCLIP] else 0 + for i, (C, L) in enumerate(cigartuples): if C == CIGAR_SOFTCLIP: seq_pos += L continue elif C == CIGAR_HARDCLIP: continue if C == CIGAR_INS: - inss.append([record.reference_name, pos, pos + 1, - record.seq[seq_pos:seq_pos + L]]) + if not record.seq[seq_pos:seq_pos + L]: + logger.info([str(record).split("\t"), seq_pos, + L, len(record.seq), len(record.seq)]) + if i > first_sc and i < len(cigartuples) - 1 - last_sc: + L_ = min(L, max_indel) + inss.append([record.reference_name, pos, pos + 1, + record.seq[seq_pos:seq_pos + L_]]) seq_pos += L else: if C != CIGAR_DEL: @@ -69,93 +88,304 @@ def extract_ins(record): pos += L return inss +def push_left_var(ref_fasta, chrom, pos, ref, alt): + logger = logging.getLogger(push_left_var.__name__) + pos = int(pos) + while ref[-1] == alt[-1] and pos > 1: + prev_base = ref_fasta.fetch(chrom, pos - 2, pos - 1) + pos -= 1 + ref = prev_base + ref[:-1] + alt = prev_base + alt[:-1] + while ref[0] == alt[0] and len(ref) == len(alt) and len(ref) > 1: + pos += 1 + ref = ref[1:] + alt = alt[1:] + return [chrom, pos, ref, alt] + + +class Variant: + + def __init__(self, chrom, pos, ref, alt, gt, score, cnt, vtype): + self.chrom = chrom + self.pos = int(pos) + self.ref = ref + self.alt = alt + self.gt = gt + self.score = float(score) + self.cnt = float(cnt) if cnt is not None else None + self.vtype = vtype + self.processed = False + + def push_left(self, ref_fasta): + _, self.pos, self.ref, self.alt = push_left_var( + ref_fasta, self.chrom, self.pos, self.ref, self.alt) + + def var_str(self): + return "-".join(map(str, [self.chrom, self.pos, self.ref, self.alt, self.vtype])) + + def var_pos_vt_str(self): + return "-".join(map(str, [self.chrom, self.pos, self.vtype])) + + def var_gt_str(self): + return "-".join(map(str, [self.chrom, self.pos, self.ref, self.alt, self.gt, self.vtype])) + + def __str__(self): + return "-".join(map(str, [self.chrom, self.pos, self.ref, self.alt, self.gt, + self.score, self.cnt, self.vtype])) + + +def resolve_group(ref_fasta, variants, vars_count): + logger = logging.getLogger(resolve_group.__name__) + chrom = variants[0].chrom + vars_count_ = {} + for var_str in vars_count: + pos, ref, alt, vtype = var_str.split("-")[-4:] + pos = int(pos) + v = Variant(chrom, pos, ref, alt, "0/0", + 0, vars_count[var_str], vtype) + v.push_left(ref_fasta) + s = v.var_str() + if s not in vars_count_: + vars_count_[s] = 0 + vars_count_[s] += vars_count[var_str] + vars_count = vars_count_ + + group_vars = {} + processed = [] + for v in variants: + if v.pos not in group_vars: + group_vars[v.pos] = [] + var_str = v.var_str() + if var_str not in vars_count: + vars_count[var_str] = 0 + cnt = vars_count[var_str] + v.cnt = cnt + 
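# Worked example for push_left_var() above, on a hypothetical reference
# spelling "GAAAC" at chr1:1-5 (1-based): the right-shifted deletion
# (chr1, 3, "AA", "A") walks left one base at a time while the trailing bases
# of REF and ALT match, ending at the left-aligned (chr1, 1, "GA", "G"). This
# normalization lets resolve_group() pool counts for the same indel reported
# at different offsets by different reads.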
processed.append(var_str) + group_vars[v.pos].append(v) + + for var_str in vars_count: + if var_str not in processed: + pos, ref, alt, vtype = var_str.split("-")[-4:] + pos = int(pos) + if pos not in group_vars: + group_vars[pos] = [] + v = Variant(chrom, pos, ref, alt, "0/0", + 0, vars_count[var_str], vtype) + group_vars[pos].append(v) + for pos in group_vars: + var_ = {} + for v in group_vars[pos]: + var_id = v.var_gt_str() + if var_id not in var_: + var_[var_id] = [] + var_[var_id].append(v) + group_vars[pos] = [] + for var_id in var_: + group_vars[pos].append(sorted(var_[var_id], key=lambda x: x.score, reverse=True + )[0]) + + out_variants_ = [] + max_target = [ + v.cnt for pos in group_vars for v in group_vars[pos] if v.score > 0] + if len(max_target) == 0: + # logger.info( + # "No non-zero COUNT with non-zero SCORE: {}".format(list(str(x) for x in group_vars[pos]))) + return [] + + max_count = max(max_target) + for pos in group_vars.keys(): + if max(map(lambda x: x.cnt, group_vars[pos]) + ) < 0.2 * max_count: + continue + mx = max(map(lambda x: x.cnt, group_vars[pos])) + gts = [x.gt for x in group_vars[pos]] + gts = set(gts) - set(["0/0"]) + if len(gts) == 0: + continue + if len(gts) > 1: + gts_count = {"0/1": 0, "0/0": 0} + gts_score = {"0/1": 0, "0/0": 0} + for x in group_vars[pos]: + if x.gt != "0/0" and x.cnt >= 0.4 * mx: + gts_count[x.gt] += x.cnt + gts_score[x.gt] += x.score + priority = {"0/1": 2, "0/0": 1} + sorted_gts = sorted(gts_count.keys(), key=lambda x: [ + gts_count[x], gts_score[x], + priority[x]], reverse=True) + gt = sorted_gts[0] + else: + gt = list(gts)[0] + + all_vars = sorted(group_vars[pos], key=lambda x: [ + x.cnt, x.score, x.gt != "0/0"], reverse=True) + + vtypes = set([x.vtype for x in group_vars[pos] + if x.gt != "0/0" and x.cnt >= 0.4 * mx]) + if not vtypes: + vtypes = set([x.vtype for x in group_vars[pos] + if x.gt != "0/0"]) + all_vars = list( + filter(lambda x: x.vtype in vtypes, all_vars)) + if not all_vars: + logger.info( + "No vars: {}".format(list(str(x) for x in group_vars[pos]))) + logger.info( + "No vars: {}".format([[list(str(x) for x in group_vars[pos_])]for pos_ in group_vars])) + raise Exception + score = max([v.score for v in all_vars]) + v = all_vars[0] + out_variants_.append( + [v.chrom, v.pos, v.ref, v.alt, gt, score, v.cnt]) + + + vars_gt = {} + for chrom_, pos_, ref_, alt_, gt_, score_, cnt_ in out_variants_: + if gt_ not in vars_gt: + vars_gt[gt_] = [] + vars_gt[gt_].append( + Variant(chrom_, pos_, ref_, alt_, gt_, score_, cnt_, "")) + vars_gt = {gt_: sorted(vars_gt[gt_], key=lambda x: [ + x.cnt, x.score], reverse=True) for gt_ in vars_gt} + out_variants_ = [] + for gt_ in vars_gt: + v0 = vars_gt[gt_][0] + good_vs = [v0] + for v in vars_gt[gt_][1:]: + keep=True + for g_v in good_vs: + if min(v.pos + len(v.ref), g_v.pos + len(g_v.ref)) > max(v.pos, g_v.pos): + keep=False + break + if keep: + good_vs.append(v) + for v in good_vs: + out_variants_.append( + [v.chrom, v.pos, v.ref, v.alt, v.gt, v.score]) + return out_variants_ + + def find_resolved_variants(input_record): chrom, start, end, variants, input_bam, filter_duplicate, reference = input_record thread_logger = logging.getLogger( "{} ({})".format(find_resolved_variants.__name__, multiprocessing.current_process().name)) try: - ref = pysam.FastaFile(reference) - out_variants = [] + ref_fasta = pysam.FastaFile(reference) + variants_ = [] + for x in variants: + pos = int(x[1]) + ref = x[3] + alt = x[4] + gt = x[9].split(":")[0] + score = x[5] + vtype = x[-1] + v = Variant(chrom, pos, 
ref, alt, gt, score, None, vtype) + v.push_left(ref_fasta) + variants_.append(v) + variants = variants_ start, end = list(map(int, [start, end])) region = [chrom, start, end] - vartypes = list(map(lambda x: x[-1], variants)) - scores = list(map(lambda x: x[5], variants)) - if len(set(vartypes)) > 1: - out_variants.extend( - list(map(lambda x: [x[0], int(x[1]), x[3], x[4], x[9].split(":")[0], x[5]], variants))) - else: - vartype = vartypes[0] - score = max(scores) - if vartype == "DEL": - dels = [] - with pysam.AlignmentFile(input_bam) as samfile: - for record in samfile.fetch(chrom, start, end): - if not record.is_duplicate or not filter_duplicate: - if record.cigarstring and "D" in record.cigarstring: - dels.extend(extract_del(record)) - dels = list(filter(lambda x: ( - start <= x[1] <= end) or start <= x[2] <= end, dels)) - if dels: - del_strs = list( - map(lambda x: "---".join(map(str, x[0:3])), dels)) - uniq_dels = list(set(del_strs)) - uniq_dels_count = {} - for del_ in uniq_dels: - uniq_dels_count[del_] = del_strs.count(del_) - max_count = max(uniq_dels_count.values()) - for del_ in uniq_dels: - if uniq_dels_count[del_] <= max_count * 0.5: - del uniq_dels_count[del_] - new_bed = get_tmp_file() - with open(new_bed, "w") as f_o: - for k in uniq_dels_count.keys(): - x = k.split("---") - f_o.write( - "\t".join(map(str, x + [".", "."])) + "\n") - new_bed = bedtools_sort(new_bed, run_logger=thread_logger) - new_bed = bedtools_merge( - new_bed, args=" -c 1 -o count", run_logger=thread_logger) - vs = read_tsv_file(new_bed, fields=range(4)) - vs = list(map(lambda x: [x[0], int(x[1]), ref.fetch(x[0], int( - x[1]) - 1, int(x[2])).upper(), ref.fetch(x[0], int(x[1]) - 1, int(x[1])).upper(), "0/1", score], vs)) - out_variants.extend(vs) - elif vartype == "INS": - intervals = [] - inss = [] - with pysam.AlignmentFile(input_bam) as samfile: - for record in samfile.fetch(chrom, start, end): - if not record.is_duplicate or not filter_duplicate: - if record.cigarstring and "I" in record.cigarstring: - inss.extend(extract_ins(record)) - inss = list(filter(lambda x: ( - start <= x[1] <= end) or start <= x[2] <= end, inss)) - if inss: - ins_strs = list( - map(lambda x: "---".join(map(str, x[0:4])), inss)) - uniq_inss = list(set(ins_strs)) - uniq_inss_count = {} - for ins_ in uniq_inss: - uniq_inss_count[ins_] = ins_strs.count(ins_) - max_ins, max_count = sorted( - uniq_inss_count.items(), key=lambda x: x[1])[-1] - max_pos = int(max_ins.split("---")[1]) - for ins_ in uniq_inss: - if uniq_inss_count[ins_] <= max_count * 0.5 or 0 < abs(int(ins_.split("---")[1]) - max_pos) < 4: - del uniq_inss_count[ins_] - - new_bed = get_tmp_file() - with open(new_bed, "w") as f_o: - for k in uniq_inss_count.keys(): - x = k.split("---") - f_o.write( - "\t".join(map(str, x + [".", "."])) + "\n") - new_bed = bedtools_sort(new_bed, run_logger=thread_logger) - vs = read_tsv_file(new_bed, fields=range(4)) - vs = list(map(lambda x: [x[0], int(x[1]), ref.fetch(x[0], int( - x[1]) - 1, int(x[1])).upper(), ref.fetch(x[0], int(x[1]) - 1, int(x[1])).upper() + x[3], "0/1", score], vs)) - out_variants.extend(vs) - return out_variants + vartypes = list(map(lambda x: x.vtype, variants)) + scores = list(map(lambda x: x.score, variants)) + dels = [] + inss = [] + snps = [] + vars_count = {} + with pysam.AlignmentFile(i_bam) as samfile: + cov = 0 + dels_ = [] + inss_ = [] + snps_ = [] + for record in samfile.fetch(chrom, start, end): + if record.is_unmapped: + continue + if record.seq is None: + continue + if not record.is_duplicate or not 
filter_duplicate: + cov += 1 + if record.cigarstring and "D" in record.cigarstring: + dels_.extend(extract_del(record)) + if record.cigarstring and "I" in record.cigarstring: + inss_.extend(extract_ins(record)) + aligned_pairs = np.array( + record.get_aligned_pairs(matches_only=True)) + near_pos = np.where((start <= aligned_pairs[:, 1]) & ( + aligned_pairs[:, 1] <= end))[0] + if len(near_pos) != 0: + for pos_i in near_pos: + seq_pos, ref_pos = aligned_pairs[pos_i, :] + if seq_pos is not None: + ref_snp = ref_fasta.fetch( + chrom, ref_pos, ref_pos + 1).upper() + alt_snp = record.seq[seq_pos] + if alt_snp != ref_snp: + snps_.append( + [chrom, ref_pos + 1, ref_snp, alt_snp]) + + dels.extend([x + [1.0 / (cov)] for x in dels_]) + inss.extend([x + [1.0 / (cov)] for x in inss_]) + snps.extend([x + [1.0 / (cov)] for x in snps_]) + + dels = list(filter(lambda x: ( + start <= x[1] <= end) or start <= x[2] <= end, dels)) + if dels: + del_strs = [] + cnt_ = {} + for x in dels: + chrom, st, en, cnt = x + del_str = "---".join(map(str, [chrom, st, en])) + if del_str not in cnt_: + cnt_[del_str] = 0 + cnt_[del_str] += cnt + del_strs.append(del_str) + + uniq_dels = list(set(del_strs)) + for del_ in uniq_dels: + st, en = map(int, del_.split("---")[1:3]) + del_str = "-".join(list(map(str, [chrom, int(st), ref_fasta.fetch(chrom, st - 1, en).upper(), + ref_fasta.fetch(chrom, st - 1, st).upper(), "DEL"]))) + vars_count[del_str] = np.round(cnt_[del_], 4) + inss = list(filter(lambda x: ( + start <= x[1] <= end) or start <= x[2] <= end, inss)) + if inss: + cnt_ = {} + ins_strs = [] + for x in inss: + chrom, st, en, bases, cnt = x + ins_str = "---".join(map(str, [chrom, st, en, bases])) + if ins_str not in cnt_: + cnt_[ins_str] = 0 + cnt_[ins_str] += cnt + ins_strs.append(ins_str) + uniq_inss = list(set(ins_strs)) + for ins_ in uniq_inss: + st, en, bases = ins_.split("---")[1:4] + st, en = map(int, [st, en]) + ins_str = "-".join(list(map(str, [chrom, int(st), ref_fasta.fetch(chrom, st - 1, st).upper(), + ref_fasta.fetch(chrom, st - 1, st).upper() + bases, "INS"]))) + vars_count[ins_str] = np.round(cnt_[ins_], 4) + + if snps: + cnt_ = {} + snp_strs = [] + for x in snps: + chrom, st, ref_, alt_, cnt = x + snp_str = "---".join(map(str, [chrom, st, ref_, alt_])) + if snp_str not in cnt_: + cnt_[snp_str] = 0 + cnt_[snp_str] += cnt + snp_strs.append(snp_str) + uniq_snps = list(set(snp_strs)) + for snp_ in uniq_snps: + st, ref_, alt_ = snp_.split("---")[1:4] + snp_str = "-".join(list(map(str, [chrom, st, ref_, + alt_, "SNP"]))) + vars_count[snp_str] = np.round(cnt_[snp_], 4) + + out_variants_ = resolve_group(ref_fasta, variants, vars_count) + return out_variants_ + + except Exception as ex: thread_logger.error(traceback.format_exc()) thread_logger.error(ex) return None @@ -191,20 +421,26 @@ def resolve_variants(input_bam, resolved_vcf, reference, target_vcf_file, map_args.append([chrom, start, end, variants[id_], input_bam, filter_duplicate, reference]) - pool = multiprocessing.Pool(num_threads) - try: - out_variants_list = pool.map_async( - find_resolved_variants, map_args).get() - pool.close() - except Exception as inst: - logger.error(inst) - pool.close() - traceback.print_exc() - raise Exception - - for o in out_variants_list: - if o is None: - raise Exception("resolve_variants failed!") + if num_threads > 1: + try: + n_per_batch = min(10 * num_threads, len(map_args)) + out_variants_list = [] + i = 0 + while i < len(map_args): + pool = multiprocessing.Pool(num_threads) + batch_i_s = i + batch_i_e = min(i + n_per_batch,
len(map_args)) + out_variants_list.extend(pool.map_async( + find_resolved_variants, map_args[batch_i_s:batch_i_e]).get()) + i = batch_i_e + pool.close() + except Exception as inst: + logger.error(inst) + pool.close() + traceback.print_exc() + raise Exception + else: + out_variants_list = [find_resolved_variants(w) for w in map_args] out_variants = [x for xs in out_variants_list for x in xs] chroms_order = get_chromosomes_order(bam=input_bam) @@ -214,8 +450,13 @@ def resolve_variants(input_bam, resolved_vcf, reference, target_vcf_file, with open(resolved_vcf, "w") as o_f: o_f.write("{}\n".format(VCF_HEADER)) o_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") + done_id = set([]) for chrom, pos, ref, alt, gt, phred_score in out_variants: if ref != alt: + id_ = "-".join(list(map(str, [chrom, pos, ref, alt]))) + if id_ in done_id: + continue + done_id.add(id_) phred_score = float(phred_score) prob = np.round(1 - (10**(-phred_score / 10)), 4) o_f.write("\t".join([chrom, str(pos), ".", ref, From 2094b3e9c1b95612ee745180abcce98c85aaef92 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 18 Jun 2020 20:47:25 -0700 Subject: [PATCH 55/89] fix_bugs --- docker/Dockerfile | 2 +- neusomatic/python/call.py | 26 +++++++++++++++----------- neusomatic/python/resolve_variants.py | 5 ++++- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 847941b..fc28553 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:16.04 -ENV NEUSOMATIC_VERSION 0.2.1 +ENV NEUSOMATIC_VERSION 0.3.0 ENV ZLIB_VERSION 1.2.11 ENV NUMPY_VERSION 1.15.4 ENV SCIPY_VERSION 1.2.0 diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 6b5a8b9..52f3a69 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -146,7 +146,7 @@ def pred_vcf_records_path(record): chrom, pos, ref, alt, _, center, _, _, _ = path.split( ".") - ref, alt = ref.upper(), alt.upper() + ref, alt = ref.upper(), alt.upper() center = int(center) pos = int(pos) @@ -247,8 +247,14 @@ def pred_vcf_records_path(record): ref_ += ACGT[rb] II = I.copy() II[rb + 1, center__, 1] = 0 - if max(II[1:, center__, 1]) == 0 and center__ == center and ref == ref_ and len(alt) == 1: - alt_ = alt + if max(II[1:, center__, 1]) == 0: + if abs(center_pred - center) < center_dist_roundback * 3 and len_pred == 1: + pos_, ref_, alt_ = pos, ref.upper(), alt.upper() + break + else: + ref_ = "" + alt_ = "" + break else: alt_ += ACGT[np.argmax(II[1:, center__, 1])] if sum(I[1:, center__, 1]) == 0: @@ -269,7 +275,7 @@ def pred_vcf_records_path(record): if pos_ == -2: # print "PPP-1",path,pred return vcf_record - len_pred_=len_pred + len_pred_ = len_pred if len_pred == 3: len_pred = max(len(alt) - len(ref), len_pred) if (sum(I[1:, i_, 1]) == 0): @@ -469,13 +475,12 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("no_seq_complexity: {}".format(no_seq_complexity)) logger.info("zero_ann_cols: {}".format(zero_ann_cols)) logger.info("ensemble_custom_header: {}".format(ensemble_custom_header)) - - - if not ensemble_custom_header: + + if not ensemble_custom_header: expected_ens_fields = NUM_ENS_FEATURES if not no_seq_complexity: expected_ens_fields += 2 - + logger.info("expected_ens_fields: {}".format(expected_ens_fields)) expected_st_fields = 4 @@ -493,7 +498,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, elif len(x) == 4: break else: - raise Exception("Wrong number of fields in {}: 
{}".format(tsv, len(x))) + raise Exception( + "Wrong number of fields in {}: {}".format(tsv, len(x))) num_channels = expected_ens_fields + \ NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES @@ -539,7 +545,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, # 3. load the new state dict net.load_state_dict(pretrained_state_dict) - if not os.path.exists(out_dir): os.mkdir(out_dir) matrices_dir = "{}/matrices_{}".format(out_dir, model_tag) @@ -548,7 +553,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, shutil.rmtree(matrices_dir) os.mkdir(matrices_dir) - new_split_tsvs_dir = os.path.join(out_dir, "split_tsvs") if os.path.exists(new_split_tsvs_dir): logger.warning( diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index a73325c..e08cf5f 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -192,6 +192,9 @@ def resolve_group(ref_fasta, variants, vars_count): # "No non-zero COUNT with non-zero SCORE: {}".format(list(str(x) for x in group_vars[pos]))) return [] + + # logger.info(list([pos, [str(y) for y in x]] for pos,x in group_vars.items())) + max_count = max(max_target) for pos in group_vars.keys(): if max(map(lambda x: x.cnt, group_vars[pos]) @@ -292,7 +295,7 @@ def find_resolved_variants(input_record): inss = [] snps = [] vars_count = {} - with pysam.AlignmentFile(i_bam) as samfile: + with pysam.AlignmentFile(input_bam) as samfile: cov = 0 dels_ = [] inss_ = [] From 30ac6cf2f5e76d339f5f3c0912b2dabdde8571f3 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 19 Jun 2020 01:29:07 -0700 Subject: [PATCH 56/89] updated versions to 0.3.0 --- docker/Dockerfile | 33 +++++++++++++++++---------------- neusomatic/python/_version.py | 2 +- test/NeuSomatic_standalone.vcf | 2 +- test/docker_test.sh | 18 +++++++++--------- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index fc28553..223b8a5 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,19 +2,20 @@ FROM ubuntu:16.04 ENV NEUSOMATIC_VERSION 0.3.0 ENV ZLIB_VERSION 1.2.11 -ENV NUMPY_VERSION 1.15.4 -ENV SCIPY_VERSION 1.2.0 -ENV IMAGEIO_VERSION 2.5.0 -ENV PYTORCH_VERSION 1.1.0 -ENV TORCHVISION_VERSION 0.3.0 -ENV CUDATOOLKIT_VERSION 9.0 -ENV CMAKE_VERSION 3.13.2 -ENV PYBEDTOOLS_VERSION 0.8.0 -ENV PYSAM_VERSION 0.15.2 +ENV NUMPY_VERSION 1.18.5 +ENV SCIPY_VERSION 1.4.1 +ENV IMAGEIO_VERSION 2.8.0 +ENV PILLOW_VERSION 2.8.0 +ENV PYTORCH_VERSION 1.4.0 +ENV TORCHVISION_VERSION 0.5.0 +ENV CUDATOOLKIT_VERSION 9.2 +ENV CMAKE_VERSION 3.17.0 +ENV PYSAM_VERSION 0.15.3 ENV SAMTOOLS_VERSION 1.9 ENV TABIX_VERSION 0.2.6 -ENV BEDTOOLS_VERSION 2.27.1 -ENV BIOPYTHON_VERSION 1.72 +ENV BEDTOOLS_VERSION 2.29.2 +ENV BIOPYTHON_VERSION 1.76 +ENV FISHER_VERSION 0.1.9 ENV GCC_VERSION 5 RUN apt-get update && apt-get install -y --fix-missing \ @@ -30,9 +31,9 @@ RUN conda update -y conda RUN conda install -y zlib=${ZLIB_VERSION} numpy=${NUMPY_VERSION} scipy=${SCIPY_VERSION} \ - imageio=${IMAGEIO_VERSION} && conda clean -a -RUN conda install -y cmake=${CMAKE_VERSION} -c conda-forge && conda clean -a -RUN conda install -y pysam=${PYSAM_VERSION} pybedtools=${PYBEDTOOLS_VERSION} \ + pillow=${PILLOW_VERSION} cmake=${CMAKE_VERSION} imageio=${IMAGEIO_VERSION} && conda clean -a +RUN conda install -y fisher=${FISHER_VERSION} -c conda-forge && conda clean -a +RUN conda install -y pysam=${PYSAM_VERSION} \ samtools=${SAMTOOLS_VERSION} tabix=${TABIX_VERSION} \ 
bedtools=${BEDTOOLS_VERSION} \ biopython=${BIOPYTHON_VERSION} -c bioconda && conda clean -a @@ -42,7 +43,7 @@ RUN conda install -y pytorch=${PYTORCH_VERSION} \ RUN apt-get install -y --fix-missing gcc-${GCC_VERSION} g++-${GCC_VERSION} -ADD https://github.com/bioinform/neusomatic/archive/v${NEUSOMATIC_VERSION}.tar.gz /opt/v${NEUSOMATIC_VERSION}.tar.gz -RUN cd /opt/ && tar -xzvf v${NEUSOMATIC_VERSION}.tar.gz && mv neusomatic-${NEUSOMATIC_VERSION} neusomatic && rm /opt/v${NEUSOMATIC_VERSION}.tar.gz +ADD https://github.com/bioinform/neusomatic/archive/extended_standalone.tar.gz /opt/extended_standalone.tar.gz +RUN cd /opt/ && tar -xzvf extended_standalone.tar.gz && mv neusomatic-extended_standalone neusomatic && rm /opt/extended_standalone.tar.gz RUN cd /opt/neusomatic/ && ./build.sh ENV PATH=/opt/neusomatic/neusomatic/bin:/opt/neusomatic/neusomatic/python/:${PATH} diff --git a/neusomatic/python/_version.py b/neusomatic/python/_version.py index 3ced358..493f741 100755 --- a/neusomatic/python/_version.py +++ b/neusomatic/python/_version.py @@ -1 +1 @@ -__version__ = "0.2.1" +__version__ = "0.3.0" diff --git a/test/NeuSomatic_standalone.vcf b/test/NeuSomatic_standalone.vcf index bee861b..b054942 100644 --- a/test/NeuSomatic_standalone.vcf +++ b/test/NeuSomatic_standalone.vcf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.2 -##NeuSomatic Version=0.2.1 +##NeuSomatic Version=0.3.0 ##INFO= ##INFO= ##INFO= diff --git a/test/docker_test.sh b/test/docker_test.sh index 6118711..fcbead1 100755 --- a/test/docker_test.sh +++ b/test/docker_test.sh @@ -10,16 +10,16 @@ if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa ] then if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "cd /mnt/example/ && wget ftp://ftp.ensembl.org/pub/release-75//fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "cd /mnt/example/ && gunzip -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi if [ ! 
-f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.fai ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "samtools faidx /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa" fi rm -rf work_standalone @@ -27,7 +27,7 @@ rm -rf work_standalone #Stand-alone NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -45,7 +45,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 --num_threads 1 \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_standalone/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -54,7 +54,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neu --num_threads 1 \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ @@ -66,7 +66,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 rm -rf /mnt/example/work_ensemble #Ensemble NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -85,7 +85,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 --ensemble_tsv /mnt/ensemble.tsv \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_ensemble/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -95,7 +95,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neu --ensemble \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ From c2deecb66ff20de1f22c8cbdfc5de0ed0628783e Mon Sep 17 00:00:00 
2001 From: Sahraeian Date: Fri, 19 Jun 2020 12:27:44 -0700 Subject: [PATCH 57/89] small fix --- docker/Dockerfile | 10 +++++----- neusomatic/python/extend_features.py | 2 +- neusomatic/python/generate_dataset.py | 6 ++++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 223b8a5..51696af 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,15 +1,15 @@ -FROM ubuntu:16.04 +FROM ubuntu:18.04 ENV NEUSOMATIC_VERSION 0.3.0 ENV ZLIB_VERSION 1.2.11 -ENV NUMPY_VERSION 1.18.5 +ENV NUMPY_VERSION 1.18.1 ENV SCIPY_VERSION 1.4.1 ENV IMAGEIO_VERSION 2.8.0 -ENV PILLOW_VERSION 2.8.0 +ENV PILLOW_VERSION 7.1.2 ENV PYTORCH_VERSION 1.4.0 ENV TORCHVISION_VERSION 0.5.0 ENV CUDATOOLKIT_VERSION 9.2 -ENV CMAKE_VERSION 3.17.0 +ENV CMAKE_VERSION 3.14.0 ENV PYSAM_VERSION 0.15.3 ENV SAMTOOLS_VERSION 1.9 ENV TABIX_VERSION 0.2.6 @@ -22,7 +22,7 @@ RUN apt-get update && apt-get install -y --fix-missing \ build-essential zlib1g-dev curl less vim bzip2 RUN apt-get install -y --fix-missing git wget -RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh +RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh RUN bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b RUN rm Miniconda3-latest-Linux-x86_64.sh ENV PATH=/miniconda/bin:${PATH} diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index bf4c118..5d4b482 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -54,7 +54,7 @@ def extract_features(candidate_record): (tBamFeatures.nref, nBamFeatures.nref)), alternative='greater')) except ValueError: - score_varscan2 = nan + score_varscan2 = float('nan') homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( ref_fa, my_coordinate, ref, alt) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 2991274..6148fbb 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1528,7 +1528,6 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea Seq_Complexity_ = list(map(lambda x: x[0], filter( lambda x: x[1] in ["Seq_Complexity_Span", "Seq_Complexity_Adj"], enumerate(header)))) - max_varscan2_score = 0 if zero_vscore else 60 min_max_features = [[cov_features, 0, 2 * COV], [mq_features, 0, 70], [bq_features, 0, 41], @@ -1538,7 +1537,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea [stralka_scor, 0, 40], [stralka_qss, 0, 200], [stralka_tqss, 0, 4], - [varscan2_score, 0, max_varscan2_score], + [varscan2_score, 0, 60], [vardict_score, 0, 120], [m2_lod, 0, 100], [sniper_score, 0, 120], @@ -1554,6 +1553,9 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea if not no_seq_complexity: min_max_features.append([Seq_Complexity_, 0, 40]) + if zero_vscore: + ensemble_data[:,np.array(varscan2_score)] = 0 + selected_features = sorted([i for f in min_max_features for i in f[0]]) selected_features_tags = list( map(lambda x: header[x], selected_features)) From bad8004e9d797b1cbe796c121a1f97dbd33b8655 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 19 Jun 2020 12:31:00 -0700 Subject: [PATCH 58/89] small fix --- neusomatic/python/preprocess.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index ae1586c..648a5b6 100755 --- a/neusomatic/python/preprocess.py +++ 
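# A side note on the float('nan') change in the extend_features.py hunk of
# the previous patch: a bare `nan` name raises NameError unless it was
# imported (e.g. from numpy), while float('nan') is always available.
# Minimal illustration:
score_varscan2 = float('nan')
assert score_varscan2 != score_varscan2  # NaN is the only value unequal to itself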
b/neusomatic/python/preprocess.py @@ -689,6 +689,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--merge_d_for_scan', type=int, help='-d used to merge regions before scan', default=None) + parser.add_argument('--zero_vscore', + help='set VarScan2_Score to zero', + action="store_true") parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, From 97e4864ccd967cdd4746822c7581592d9da97d3f Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 19 Jun 2020 22:59:22 -0700 Subject: [PATCH 59/89] fix test --- docker/Dockerfile | 1 + test/NeuSomatic_ensemble.vcf | 2 +- test/docker_test.sh | 18 +++++++++--------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 51696af..f6bdbc9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -43,6 +43,7 @@ RUN conda install -y pytorch=${PYTORCH_VERSION} \ RUN apt-get install -y --fix-missing gcc-${GCC_VERSION} g++-${GCC_VERSION} + ADD https://github.com/bioinform/neusomatic/archive/extended_standalone.tar.gz /opt/extended_standalone.tar.gz RUN cd /opt/ && tar -xzvf extended_standalone.tar.gz && mv neusomatic-extended_standalone neusomatic && rm /opt/extended_standalone.tar.gz RUN cd /opt/neusomatic/ && ./build.sh diff --git a/test/NeuSomatic_ensemble.vcf b/test/NeuSomatic_ensemble.vcf index e3a7d8b..d62986b 100644 --- a/test/NeuSomatic_ensemble.vcf +++ b/test/NeuSomatic_ensemble.vcf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.2 -##NeuSomatic Version=0.2.1 +##NeuSomatic Version=0.3.0 ##INFO= ##INFO= ##INFO= diff --git a/test/docker_test.sh b/test/docker_test.sh index fcbead1..4a8f34f 100755 --- a/test/docker_test.sh +++ b/test/docker_test.sh @@ -10,16 +10,16 @@ if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa ] then if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "cd /mnt/example/ && wget ftp://ftp.ensembl.org/pub/release-75//fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "cd /mnt/example/ && gunzip -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi if [ ! 
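# How the new --zero_vscore flag ties into the generate_dataset.py hunk of
# the previous patch: a minimal sketch of zeroing one feature column by its
# header tag. The header and matrix here are illustrative stand-ins, not
# real NeuSomatic data:
import numpy as np

header = ["CHROM", "POS", "VarScan2_Score", "VarDict_Score"]
ensemble_data = np.array([[22.0, 100.0, 37.5, 80.0],
                          [22.0, 200.0, 12.0, 55.0]])
varscan2_score = [i for i, tag in enumerate(header) if tag == "VarScan2_Score"]
zero_vscore = True
if zero_vscore and len(ensemble_data) > 0:
    ensemble_data[:, np.array(varscan2_score)] = 0
assert ensemble_data[:, 2].sum() == 0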
-f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.fai ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "samtools faidx /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa" fi rm -rf work_standalone @@ -27,7 +27,7 @@ rm -rf work_standalone #Stand-alone NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -45,7 +45,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 --num_threads 1 \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3.0 /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_standalone/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -54,7 +54,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neu --num_threads 1 \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ @@ -66,7 +66,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 rm -rf /mnt/example/work_ensemble #Ensemble NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -85,7 +85,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 --ensemble_tsv /mnt/ensemble.tsv \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3.0 /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_ensemble/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -95,7 +95,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neu --ensemble \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ From fb6ea214ece6b743fb2c070a670c1eafbc702c50 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 23 Jun 2020 17:08:43 -0700 Subject: [PATCH 
60/89] fix in resolve variants --- neusomatic/python/generate_dataset.py | 4 +- neusomatic/python/postprocess.py | 3 + neusomatic/python/resolve_variants.py | 76 ++++++++++++++++++------ neusomatic/python/sequencing_features.py | 2 +- 4 files changed, 64 insertions(+), 21 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 6148fbb..53920c5 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1553,8 +1553,8 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea if not no_seq_complexity: min_max_features.append([Seq_Complexity_, 0, 40]) - if zero_vscore: - ensemble_data[:,np.array(varscan2_score)] = 0 + if zero_vscore and n_vars > 0: + ensemble_data[:, np.array(varscan2_score)] = 0 selected_features = sorted([i for f in min_max_features for i in f[0]]) selected_features_tags = list( diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index 53a7dd8..32043a8 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -40,6 +40,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, c_f.write("{}\n".format(VCF_HEADER)) c_f.write( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") + ensemble_header_found = False for line in e_f: if "POS" in line: header = line.strip().split() @@ -47,6 +48,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, pos_id = header.index("POS") ref_id = header.index("REF") alt_id = header.index("ALT") + ensemble_header_found = True if "T_DP" in line: dp_id = header.index("T_DP") ref_fw_id = header.index("T_REF_FOR") @@ -57,6 +59,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, else: dp_id, ref_fw_id, ref_rv_id, alt_fw_id, alt_rv_id = None, None, None, None, None continue + assert ensemble_header_found fields = line.strip().split() chrom = fields[chrom_id] pos = fields[pos_id] diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index e08cf5f..cd170f7 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -88,6 +88,17 @@ def extract_ins(record): pos += L return inss + +def find_vtype(ref, alt): + if len(alt) < len(ref): + vtype = "DEL" + elif len(alt) > len(ref): + vtype = "INS" + else: + vtype = "SNP" + return vtype + + def push_left_var(ref_fasta, chrom, pos, ref, alt): logger = logging.getLogger(push_left_var.__name__) pos = int(pos) @@ -114,6 +125,7 @@ def __init__(self, chrom, pos, ref, alt, gt, score, cnt, vtype): self.score = float(score) self.cnt = float(cnt) if cnt is not None else None self.vtype = vtype + self.len = abs(len(alt) - len(ref)) self.processed = False def push_left(self, ref_fasta): @@ -185,15 +197,15 @@ def resolve_group(ref_fasta, variants, vars_count): )[0]) out_variants_ = [] - max_target = [ - v.cnt for pos in group_vars for v in group_vars[pos] if v.score > 0] + max_target = [v.cnt for pos in group_vars for v in group_vars[ + pos] if v.score > 0 or v.len >= 3] if len(max_target) == 0: # logger.info( # "No non-zero COUNT with non-zero SCORE: {}".format(list(str(x) for x in group_vars[pos]))) return [] - - # logger.info(list([pos, [str(y) for y in x]] for pos,x in group_vars.items())) + # logger.info(list([pos, [str(y) for y in x]] + # for pos, x in group_vars.items())) max_count = max(max_target) for pos in group_vars.keys(): @@ -202,14 +214,15 @@ def resolve_group(ref_fasta, 
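# The new find_vtype helper above classifies a variant purely by allele
# lengths; three concrete cases (alleles are made up):
assert find_vtype("A", "T") == "SNP"    # equal lengths
assert find_vtype("AAG", "A") == "DEL"  # alt shorter than ref
assert find_vtype("A", "ACC") == "INS"  # alt longer than ref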
variants, vars_count): continue mx = max(map(lambda x: x.cnt, group_vars[pos])) gts = [x.gt for x in group_vars[pos]] - gts = set(gts) - set(["0/0"]) + gts = set([x.gt for x in group_vars[pos] + if x.gt != "0/0" or x.len >= 3]) if len(gts) == 0: continue if len(gts) > 1: gts_count = {"0/1": 0, "0/0": 0} gts_score = {"0/1": 0, "0/0": 0} for x in group_vars[pos]: - if x.gt != "0/0" and x.cnt >= 0.4 * mx: + if (x.gt != "0/0" or x.len >= 3) and x.cnt >= 0.4 * mx: gts_count[x.gt] += x.cnt gts_score[x.gt] += x.score priority = {"0/1": 2, "0/0": 1} @@ -222,12 +235,11 @@ def resolve_group(ref_fasta, variants, vars_count): all_vars = sorted(group_vars[pos], key=lambda x: [ x.cnt, x.score, x.gt != "0/0"], reverse=True) - vtypes = set([x.vtype for x in group_vars[pos] - if x.gt != "0/0" and x.cnt >= 0.4 * mx]) + if (x.gt != "0/0" or x.len >= 3) and x.cnt >= 0.4 * mx]) if not vtypes: vtypes = set([x.vtype for x in group_vars[pos] - if x.gt != "0/0"]) + if (x.gt != "0/0" or x.len >= 3)]) all_vars = list( filter(lambda x: x.vtype in vtypes, all_vars)) if not all_vars: @@ -237,10 +249,44 @@ def resolve_group(ref_fasta, variants, vars_count): "No vars: {}".format([[list(str(x) for x in group_vars[pos_])]for pos_ in group_vars])) raise Exception score = max([v.score for v in all_vars]) + if gt == "0/0": + nz_vars = [x for x in all_vars if x.gt != + "0/0" and x.vtype == all_vars[0].vtype] + if nz_vars: + nz_vars = sorted(nz_vars, key=lambda x: [ + x.score], reverse=True)[0] + gt = nz_vars.gt v = all_vars[0] out_variants_.append( [v.chrom, v.pos, v.ref, v.alt, gt, score, v.cnt]) + if len(out_variants_) == 1 and out_variants_[0][4] == "0/0" and abs(len(out_variants_[0][2]) - len(out_variants_[0][3])) >= 3: + chrom_, pos_, ref_, alt_, gt_, score_, cnt_ = out_variants_[0] + vtype = find_vtype(ref_, alt_) + resolve_candids = [] + for pos in group_vars.keys(): + for y in group_vars[pos]: + if y.vtype == vtype and y.gt != "0/0": + resolve_candids.append(y) + if resolve_candids: + resolve_candids = sorted(resolve_candids, key=lambda x: [ + x.score], reverse=True)[0] + out_variants_ = [[chrom_, pos_, ref_, alt_, + resolve_candids.gt, resolve_candids.score, cnt_]] + + if len(out_variants_) > 1 and "0/0" in [x[4] for x in out_variants_]: + nz_vars = [x for x in out_variants_ if x[4] != "0/0"] + if nz_vars: + nz_vtypes = [find_vtype(x[2], x[3]) for x in nz_vars] + out_variants__ = [] + for x in out_variants_: + if x[4] != "0/0": + out_variants__.append(x) + else: + vtype = find_vtype(x[2], x[3]) + if vtype not in nz_vtypes: + out_variants__.append(x) + out_variants_ = out_variants__ vars_gt = {} for chrom_, pos_, ref_, alt_, gt_, score_, cnt_ in out_variants_: @@ -255,10 +301,10 @@ def resolve_group(ref_fasta, variants, vars_count): v0 = vars_gt[gt_][0] good_vs = [v0] for v in vars_gt[gt_][1:]: - keep=True + keep = True for g_v in good_vs: if min(v.pos + len(v.ref), g_v.pos + len(g_v.ref)) > max(v.pos, g_v.pos): - keep=False + keep = False break if keep: good_vs.append(v) @@ -268,7 +314,6 @@ def resolve_group(ref_fasta, variants, vars_count): return out_variants_ - def find_resolved_variants(input_record): chrom, start, end, variants, input_bam, filter_duplicate, reference = input_record thread_logger = logging.getLogger( @@ -406,12 +451,7 @@ def resolve_variants(input_bam, resolved_vcf, reference, target_vcf_file, for line in skip_empty(tv_f): fields = line.strip().split() id_ = int(fields[2]) - if len(fields[4]) < len(fields[3]): - vartype = "DEL" - elif len(fields[4]) > len(fields[3]): - vartype = "INS" - else: - 
vartype = "SNP" + vartype = find_vtype(fields[3], fields[4]) if id_ not in variants: variants[id_] = [] variants[id_].append(fields + [vartype]) diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 6aafc1e..364f76b 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -385,7 +385,7 @@ def subLC(sequence, max_substring_length=20): number_of_subseqs = 0 seq_length = len(sequence) max_number_of_subseqs = max_sub_vocabularies( - seq_length, max_substring_length) + seq_length, min(seq_length, max_substring_length)) set_of_seq_n = set() for i in range(1, min(max_substring_length + 1, seq_length + 1)): From 1bf40b46ab37eeb9ee28d87062f4f3329d959d33 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 25 Jun 2020 00:03:33 -0700 Subject: [PATCH 61/89] ensemble with internal features --- neusomatic/python/filter_candidates.py | 32 +-------------- neusomatic/python/preprocess.py | 54 ++++++++++++++------------ neusomatic/python/read_callers_vcf.py | 11 +++--- test/NeuSomatic_ensemble.vcf | 16 ++++---- 4 files changed, 45 insertions(+), 68 deletions(-) diff --git a/neusomatic/python/filter_candidates.py b/neusomatic/python/filter_candidates.py index 58fc628..1fa395e 100755 --- a/neusomatic/python/filter_candidates.py +++ b/neusomatic/python/filter_candidates.py @@ -17,7 +17,7 @@ def filter_candidates(candidate_record): - candidates_vcf, filtered_candidates_vcf, reference, dbsnp, min_dp, max_dp, good_ao, \ + candidates_vcf, filtered_candidates_vcf, reference, min_dp, max_dp, good_ao, \ min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, \ del_merge_min_af, ins_merge_min_af, merge_r = candidate_record thread_logger = logging.getLogger( @@ -26,16 +26,6 @@ def filter_candidates(candidate_record): thread_logger.info( "---------------------Filter Candidates---------------------") - if dbsnp: - if not dbsnp.endswith("vcf.gz"): - thread_logger.error("Aborting!") - raise Exception( - "The dbSNP file should be a tabix indexed file with .vcf.gz format") - if not os.path.exists(dbsnp + ".tbi"): - thread_logger.error("Aborting!") - raise Exception( - "The dbSNP file should be a tabix indexed file with .vcf.gz format. 
No {}.tbi file exists.".format(dbsnp)) - records = {} with open(candidates_vcf) as v_f: for line in skip_empty(v_f): @@ -267,27 +257,11 @@ def filter_candidates(candidate_record): "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)]) final_records.append([chrom, pos - 1, ref, alt, line]) final_records = sorted(final_records, key=lambda x: x[0:2]) - if dbsnp: - dbsnp_tb = pysam.TabixFile(dbsnp) with open(filtered_candidates_vcf, "w") as o_f: o_f.write("{}\n".format(VCF_HEADER)) o_f.write( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") for record in final_records: - if dbsnp: - chrom, pos, ref, alt = record[0:4] - var_id = "-".join(map(str, [chrom, pos, ref, alt])) - region = "{}:{}-{}".format(chrom, pos, pos + 1) - dbsnp_vars = [] - for x in dbsnp_tb.fetch(region=region): - chrom_, pos_, _, ref_, alts_ = x.strip().split("\t")[ - 0:5] - for alt_ in alts_.split(","): - dbsnp_var_id = "-".join(map(str, - [chrom_, pos_, ref_, alt_])) - dbsnp_vars.append(dbsnp_var_id) - if var_id in dbsnp_vars: - continue o_f.write(record[-1] + "\n") return filtered_candidates_vcf @@ -309,8 +283,6 @@ def filter_candidates(candidate_record): required=True) parser.add_argument('--reference', type=str, help='reference fasta filename', required=True) - parser.add_argument('--dbsnp_to_filter', type=str, - help='dbsnp vcf.gz (will be used to filter candidate variants)', default=None) parser.add_argument('--good_ao', type=float, help='good alternate count (ignores maf)', default=10) parser.add_argument('--min_ao', type=float, @@ -341,7 +313,7 @@ def filter_candidates(candidate_record): try: output = filter_candidates((args.candidates_vcf, args.filtered_candidates_vcf, - args.reference, args.dbsnp_to_filter, args.min_dp, args.max_dp, + args.reference, args.min_dp, args.max_dp, args.good_ao, args.min_ao, args.snp_min_af, args.snp_min_bq, args.snp_min_ao, args.ins_min_af, args.del_min_af, diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 648a5b6..3b470e8 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -24,7 +24,7 @@ from utils import concatenate_vcfs, run_bedtools_cmd, bedtools_sort, bedtools_merge, bedtools_intersect, bedtools_slop, get_tmp_file, skip_empty, vcf_2_bed -def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp, +def process_split_region(tn, work, region, reference, mode, alignment_bam, scan_window_size, scan_maf, min_mapq, filtered_candidates_vcf, min_dp, max_dp, filter_duplicate, @@ -48,7 +48,7 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp for i, (raw_vcf, count_bed, split_region_bed) in enumerate(scan_outputs): filtered_vcf = os.path.join(os.path.dirname( os.path.realpath(raw_vcf)), "filtered_candidates.vcf") - map_args.append((raw_vcf, filtered_vcf, reference, dbsnp, min_dp, max_dp, good_ao, + map_args.append((raw_vcf, filtered_vcf, reference, min_dp, max_dp, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r)) try: @@ -283,7 +283,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_tumor_without_q, "filtered_candidates.vcf") tumor_outputs_without_q = process_split_region("tumor", work_tumor_without_q, region_bed, reference, mode, - tumor_bam, dbsnp, scan_window_size, scan_maf, min_mapq, + tumor_bam, scan_window_size, scan_maf, min_mapq, filtered_candidates_vcf_without_q, min_dp, max_dp, filter_duplicate, good_ao, min_ao, @@ -309,7 +309,7 
@@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, logger.info("Scan tumor bam (and extracting quality scores).") tumor_outputs = process_split_region("tumor", work_tumor, region_bed, reference, mode, - tumor_bam, dbsnp, scan_window_size, scan_maf, min_mapq, + tumor_bam, scan_window_size, scan_maf, min_mapq, filtered_candidates_vcf, min_dp, max_dp, filter_duplicate, good_ao, min_ao, @@ -338,7 +338,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, os.mkdir(work_normal) logger.info("Scan normal bam (and extracting quality scores).") normal_counts, _, _ = process_split_region("normal", work_normal, region_bed, reference, mode, normal_bam, - None, scan_window_size, 0.2, min_mapq, + scan_window_size, 0.2, min_mapq, None, min_dp, max_dp, filter_duplicate, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, @@ -361,24 +361,28 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, shutil.rmtree(work_dataset_split) os.mkdir(work_dataset_split) ensemble_bed_i = ensemble_beds[i] if ensemble_tsv else None - if add_extra_features: + if add_extra_features or (ensemble_tsv and not no_feature_recomp_for_ensemble): work_tumor_i = os.path.dirname(filtered_vcf) - extra_features_tsv = os.path.join( - work_tumor_i, "extra_features.tsv") - ex_tsvs = [extra_features_tsv] - if not os.path.exists(extra_features_tsv) or restart: - extend_features(filtered_vcf, - ensemble_beds[ - i] if (ensemble_tsv and no_feature_recomp_for_ensemble) else None, - None, - extra_features_tsv, - reference, tumor_bam, normal_bam, - min_mapq, snp_min_bq, - dbsnp, None, - no_seq_complexity, - window_extend, - max_cluster_size, - num_threads) + if add_extra_features: + extra_features_tsv = os.path.join( + work_tumor_i, "extra_features.tsv") + ex_tsvs = [extra_features_tsv] + if not os.path.exists(extra_features_tsv) or restart: + extend_features(filtered_vcf, + ensemble_beds[ + i] if (ensemble_tsv and no_feature_recomp_for_ensemble) else None, + None, + extra_features_tsv, + reference, tumor_bam, normal_bam, + min_mapq, snp_min_bq, + dbsnp, None, + no_seq_complexity, + window_extend, + max_cluster_size, + num_threads) + else: + ex_tsvs = [] + extra_features_tsv = None if ensemble_tsv and not no_feature_recomp_for_ensemble: extra_features_others_tsv = os.path.join( work_tumor_i, "extra_features_others.tsv") @@ -612,8 +616,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, help='normal bam', required=True) parser.add_argument('--work', type=str, help='work directory', required=True) - parser.add_argument('--dbsnp_to_filter', type=str, - help='dbsnp vcf.gz (will be used to filter candidate variants)', default=None) + parser.add_argument('--dbsnp', type=str, + help='dbsnp vcf.gz', default=None) parser.add_argument('--scan_window_size', type=int, help='window size to scan the variants', default=2000) parser.add_argument('--scan_maf', type=float, @@ -703,7 +707,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, try: preprocess(args.work, args.mode, args.reference, args.region_bed, args.tumor_bam, args.normal_bam, - args.dbsnp_to_filter, + args.dbsnp, args.scan_window_size, args.scan_maf, args.min_mapq, args.min_dp, args.max_dp, args.good_ao, args.min_ao, args.snp_min_af, args.snp_min_bq, args.snp_min_ao, args.ins_min_af, args.del_min_af, args.del_merge_min_af, diff --git a/neusomatic/python/read_callers_vcf.py b/neusomatic/python/read_callers_vcf.py index 997270e..1114358 100755 --- 
a/neusomatic/python/read_callers_vcf.py +++ b/neusomatic/python/read_callers_vcf.py @@ -213,11 +213,12 @@ def read_callers_vcf(reference, chrom, pos, _, ref, alts, _, filters, info = x[0:8] for ith_alt, alt in enumerate(alts.split(",")): if ref != alt: - mutect_classification, nlod, tlod, tandem, ecnt = get_mutect2_info( - filters, info, ith_alt) - var_id = "-".join([chrom, pos, ref, alt]) - mutect2_info[var_id] = [ - mutect_classification, nlod, tlod, tandem, ecnt] + if len(ref) == 1 or len(alt) == 1: + mutect_classification, nlod, tlod, tandem, ecnt = get_mutect2_info( + filters, info, ith_alt) + var_id = "-".join([chrom, pos, ref, alt]) + mutect2_info[var_id] = [ + mutect_classification, nlod, tlod, tandem, ecnt] i_f.close() strelka2_info = {} if strelka2_vcfs: diff --git a/test/NeuSomatic_ensemble.vcf b/test/NeuSomatic_ensemble.vcf index d62986b..82e77b0 100644 --- a/test/NeuSomatic_ensemble.vcf +++ b/test/NeuSomatic_ensemble.vcf @@ -14,11 +14,11 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE -22 21330787 . C T 26.9917 PASS SCORE=0.9980;DP=387;RO=298;AO=87;AF=0.226 GT:DP:RO:AO:AF 0/1:387:298:87:0.226 -22 21332122 . G A 28.5402 PASS SCORE=0.9986;DP=268;RO=209;AO=59;AF=0.2201 GT:DP:RO:AO:AF 0/1:268:209:59:0.2201 -22 21334924 . G C 17.6382 PASS SCORE=0.9828;DP=101;RO=78;AO=23;AF=0.2277 GT:DP:RO:AO:AF 0/1:101:78:23:0.2277 -22 21335259 . C A 19.7149 PASS SCORE=0.9893;DP=234;RO=190;AO=44;AF=0.188 GT:DP:RO:AO:AF 0/1:234:190:44:0.188 -22 21384516 . C T 27.9602 PASS SCORE=0.9984;DP=90;RO=64;AO=26;AF=0.2889 GT:DP:RO:AO:AF 0/1:90:64:26:0.2889 -22 21982892 . C T 21.4946 PASS SCORE=0.9929;DP=152;RO=109;AO=43;AF=0.2829 GT:DP:RO:AO:AF 0/1:152:109:43:0.2829 -22 21983260 . A G 31.5494 PASS SCORE=0.9993;DP=112;RO=70;AO=42;AF=0.375 GT:DP:RO:AO:AF 0/1:112:70:42:0.375 -22 21989959 . AAG A 33.0106 PASS SCORE=0.9995;DP=131;RO=99;AO=32;AF=0.2443 GT:DP:RO:AO:AF 0/1:131:99:32:0.2443 +22 21330787 . C T 33.9793 PASS SCORE=0.9996;DP=387;RO=298;AO=87;AF=0.226 GT:DP:RO:AO:AF 0/1:387:298:87:0.226 +22 21332122 . G A 35.2289 PASS SCORE=0.9997;DP=268;RO=209;AO=59;AF=0.2201 GT:DP:RO:AO:AF 0/1:268:209:59:0.2201 +22 21334924 . G C 24.4407 PASS SCORE=0.9964;DP=101;RO=78;AO=23;AF=0.2277 GT:DP:RO:AO:AF 0/1:101:78:23:0.2277 +22 21335259 . C A 29.2081 PASS SCORE=0.9988;DP=234;RO=190;AO=44;AF=0.188 GT:DP:RO:AO:AF 0/1:234:190:44:0.188 +22 21384516 . C T 36.9903 PASS SCORE=0.9998;DP=90;RO=64;AO=26;AF=0.2889 GT:DP:RO:AO:AF 0/1:90:64:26:0.2889 +22 21982892 . C T 33.0101 PASS SCORE=0.9995;DP=152;RO=109;AO=43;AF=0.2829 GT:DP:RO:AO:AF 0/1:152:109:43:0.2829 +22 21983260 . A G 36.9903 PASS SCORE=0.9998;DP=112;RO=70;AO=42;AF=0.375 GT:DP:RO:AO:AF 0/1:112:70:42:0.375 +22 21989959 . 
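# The updated QUAL values in this expected-output VCF track SCORE as a
# Phred-scaled probability, QUAL = -10*log10(1 - SCORE) (assuming this
# standard prob2phred-style conversion); a quick check against the rows:
import math

def phred(p):
    return -10 * math.log10(1 - p)

assert abs(phred(0.9996) - 33.98) < 0.01  # cf. the 21330787 row
assert abs(phred(0.9997) - 35.23) < 0.01  # cf. the 21332122 row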
AAG A 39.9993 PASS SCORE=0.9999;DP=131;RO=99;AO=32;AF=0.2443 GT:DP:RO:AO:AF 0/1:131:99:32:0.2443 From 2471f8e5e1983d7005671bff7a2d2b22e330c7ee Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 25 Jun 2020 11:16:25 -0700 Subject: [PATCH 62/89] fix resolve --- neusomatic/python/resolve_variants.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index cd170f7..e4359ba 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -358,6 +358,8 @@ def find_resolved_variants(input_record): inss_.extend(extract_ins(record)) aligned_pairs = np.array( record.get_aligned_pairs(matches_only=True)) + if len(aligned_pairs)==0: + continue near_pos = np.where((start <= aligned_pairs[:, 1]) & ( aligned_pairs[:, 1] <= end))[0] if len(near_pos) != 0: @@ -435,9 +437,10 @@ def find_resolved_variants(input_record): return out_variants_ except Exception as ex: + thread_logger.error("Error in {}".format(input_record)) thread_logger.error(traceback.format_exc()) thread_logger.error(ex) - return None + raise Exception def resolve_variants(input_bam, resolved_vcf, reference, target_vcf_file, From 544e2f23dfc9cb6b74487b5f3c776971eb0c5c1f Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 2 Jul 2020 13:20:22 -0700 Subject: [PATCH 63/89] small fix --- neusomatic/python/read_callers_vcf.py | 1 + neusomatic/python/scan_alignments.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/read_callers_vcf.py b/neusomatic/python/read_callers_vcf.py index 1114358..42c1428 100755 --- a/neusomatic/python/read_callers_vcf.py +++ b/neusomatic/python/read_callers_vcf.py @@ -415,6 +415,7 @@ def read_callers_vcf(reference, msi, msilen, shift3] chrom = "-".join(var_id.split("-")[:-3]) pos, ref, alt = var_id.split("-")[-3:] + ref, alt = ref.upper(), alt.upper() o_f.write( "\t".join([chrom, pos, ".", ref, alt] + list(map(lambda x: str(x).replace("nan", "0"), f))) + "\n") diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index b8aaf11..4bff295 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -41,12 +41,12 @@ def run_scan_alignments(record): os.mkdir(work) if merge_d_for_scan is not None: - split_region_file_=os.path.join(work,"merged_region.bed") + split_region_file_ = os.path.join(work, "merged_region.bed") tmp_ = bedtools_sort(split_region_file, run_logger=thread_logger) bedtools_merge( - tmp_, output_fn=split_region_file_ , args=" -d {}".format(merge_d_for_scan), run_logger=thread_logger) + tmp_, output_fn=split_region_file_, args=" -d {}".format(merge_d_for_scan), run_logger=thread_logger) else: - split_region_file_=split_region_file + split_region_file_ = split_region_file if os.path.getsize(split_region_file_) > 0: cmd = "{} --ref {} -b {} -L {} --out_vcf_file {}/candidates.vcf --out_count_file {}/count.bed \ @@ -90,7 +90,12 @@ def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam, split_len_ratio = 0.98 if not split_region_files: if regions_bed_file: - regions_bed = bedtools_sort(regions_bed_file, run_logger=logger) + regions_bed = get_tmp_file() + with open(regions_bed_file) as i_f, open(regions_bed, "w") as o_f: + for line in skip_empty(i_f): + chrom, st, en = line.strip().split()[0:3] + o_f.wirte("\t".join([chrom, st, en]) + "\n") + regions_bed = bedtools_sort(regions_bed, run_logger=logger) regions_bed = bedtools_merge( regions_bed, 
args=" -d 0", run_logger=logger) else: From 96d2091953c31980ee3c101136bcbb7db110290f Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 2 Jul 2020 13:21:22 -0700 Subject: [PATCH 64/89] small fix --- neusomatic/python/scan_alignments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index 4bff295..ae522ec 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -94,7 +94,7 @@ def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam, with open(regions_bed_file) as i_f, open(regions_bed, "w") as o_f: for line in skip_empty(i_f): chrom, st, en = line.strip().split()[0:3] - o_f.wirte("\t".join([chrom, st, en]) + "\n") + o_f.write("\t".join([chrom, st, en]) + "\n") regions_bed = bedtools_sort(regions_bed, run_logger=logger) regions_bed = bedtools_merge( regions_bed, args=" -d 0", run_logger=logger) From fd2a2f7c1237df75860dc7288f17fb2c3dd7cb9d Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sun, 19 Jul 2020 01:46:49 -0700 Subject: [PATCH 65/89] repeat extension --- .../python/extract_postprocess_targets.py | 137 +++++++++++++++--- neusomatic/python/postprocess.py | 7 +- neusomatic/python/preprocess.py | 8 +- 3 files changed, 131 insertions(+), 21 deletions(-) diff --git a/neusomatic/python/extract_postprocess_targets.py b/neusomatic/python/extract_postprocess_targets.py index be0089e..bdbe40d 100755 --- a/neusomatic/python/extract_postprocess_targets.py +++ b/neusomatic/python/extract_postprocess_targets.py @@ -9,12 +9,78 @@ import logging import pysam -from utils import skip_empty +from utils import skip_empty, get_tmp_file, bedtools_sort, bedtools_merge from defaults import VCF_HEADER from resolve_variants import push_left_var -def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): +def check_rep(ref_seq, left_right, w): + logger = logging.getLogger(check_rep.__name__) + if len(ref_seq) < 2 * w: + return False + if left_right == "left": + return ref_seq[0:w] == ref_seq[w:2 * w] + elif left_right == "right": + return ref_seq[-w:] == ref_seq[-2 * w:-w] + else: + logger.error("Wrong left/right value: {}".format(left_right)) + raise Exception + + +def extend_region_repeat(chrom, start, end, ref_fasta, + chrom_length, pad): + logger = logging.getLogger(extend_region_repeat.__name__) + new_start = start + new_end = end + w = 3 + while True: + changed = False + new_start = max(new_start - pad - w, 1) + ref_seq = ref_fasta.fetch( + chrom, new_start, new_end + 1).upper() + while True: + cnt_s = 0 + for rep_len in [1, 2, 3, 4]: + if cnt_s > 0: + continue + while check_rep(ref_seq, "left", rep_len) and new_start > rep_len: + new_start -= rep_len + ref_seq = ref_fasta.fetch( + chrom, new_start, new_end + 1).upper() + cnt_s += rep_len + changed = True + if cnt_s > 0: + continue + if cnt_s == 0: + break + if not changed: + break + while True: + changed = False + new_end = min(new_end + pad + w, chrom_length - 2) + ref_seq = ref_fasta.fetch( + chrom, new_start, new_end + 1).upper() + while True: + cnt_e = 0 + for rep_len in [1, 2, 3, 4]: + if cnt_e > 0: + continue + while check_rep(ref_seq, "right", rep_len) and new_end < chrom_length - rep_len - 1: + new_end += rep_len + ref_seq = ref_fasta.fetch( + chrom, new_start, new_end + 1).upper() + cnt_e += rep_len + changed = True + if cnt_e > 0: + continue + if cnt_e == 0: + break + if not changed: + break + return new_start, new_end + + +def extract_postprocess_targets(reference, 
input_vcf, min_len, max_dist, extend_repeats, pad): logger = logging.getLogger(extract_postprocess_targets.__name__) logger.info("--------------Extract Postprocessing Targets---------------") @@ -28,9 +94,9 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): record_sets = [] record_set = [] - with open(input_vcf) as i_f, open(out_vcf, "w") as o_f, open(redo_vcf, "w") as r_f, open(redo_bed, "w") as r_b: - r_f.write("{}\n".format(VCF_HEADER)) - r_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") + redo_vars = {} + redo_regions = {} + with open(input_vcf) as i_f, open(out_vcf, "w") as o_f: for line in skip_empty(i_f): if len(line) < 2: continue @@ -54,7 +120,6 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): record_set.append(record) continue - if record_set: record_sets.append(record_set) record_set = [record] @@ -76,12 +141,13 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): if list(filter(lambda x: len(x[2]) != len(x[3]), record_set)) or multi_allelic: for x in record_set: fields = x[-1].strip().split() - fields[2] = str(ii) - r_f.write("\t".join(fields) + "\n") - r_b.write("\t".join(map(str, [record_set[0][0], max(0, min(map(lambda x:x[1], record_set)) - pad), - max(map(lambda x:x[ - 1] + len(x[2]), record_set)) + pad, ii, - ])) + "\n") + # fields[2] = str(ii) + if ii not in redo_vars: + redo_vars[ii] = [] + redo_vars[ii].append(fields) + redo_regions[ii] = [record_set[0][0], max(0, + min(map(lambda x:x[1], record_set)) - pad), + max(map(lambda x:x[1] + len(x[2]), record_set)) + pad] else: for x in record_set: o_f.write(x[-1]) @@ -89,15 +155,49 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): elif record_set: if abs(len(record_set[0][2]) - len(record_set[0][3])) >= min_len: fields = record_set[0][-1].strip().split() - fields[2] = str(ii) - r_f.write("\t".join(fields) + "\n") + # fields[2] = str(ii) + if ii not in redo_vars: + redo_vars[ii] = [] + redo_vars[ii].append(fields) chrom_, pos_, ref_, alt_ = record_set[0][0:4] - r_b.write("\t".join( - map(str, [chrom_, max(0, pos_ - pad), pos_ + len(ref_) + pad, ii])) + "\n") + redo_regions[ii] = [chrom_, max( + 0, pos_ - pad), pos_ + len(ref_) + pad] else: o_f.write(record_set[0][-1]) + if extend_repeats: + chrom_lengths = dict( + zip(ref_fasta.references, ref_fasta.lengths)) + tmp_ = get_tmp_file() + with open(tmp_, "w") as o_f: + for ii in redo_regions: + chrom, st, en = redo_regions[ii] + st, en = extend_region_repeat( + chrom, st, en, ref_fasta, chrom_lengths[chrom], 0) + o_f.write("\t".join(list(map(str, [chrom, st, en, ii]))) + "\n") + tmp_=bedtools_sort(tmp_,run_logger=logger) + tmp_=bedtools_merge(tmp_,args="-c 4 -o collapse", run_logger=logger) + else: + tmp_ = get_tmp_file() + with open(tmp_, "w") as o_f: + for ii in redo_regions: + chrom, st, en = redo_regions[ii] + o_f.write("\t".join(list(map(str, [chrom, st, en, ii]))) + "\n") + j = 0 + with open(tmp_) as i_f, open(redo_vcf, "w") as r_f, open(redo_bed, "w") as r_b: + r_f.write("{}\n".format(VCF_HEADER)) + r_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") + for line in skip_empty(i_f): + chrom, st, en, i_s = line.strip().split() + for i in list(map(int, i_s.split(","))): + for fields in redo_vars[i]: + fields[2] = str(j) + r_f.write("\t".join(fields) + "\n") + r_b.write("\t".join(list(map(str, [chrom, st, en, j]))) + "\n") + j += 1 + + if __name__ == '__main__': FORMAT = '%(levelname)s %(asctime)-15s 
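# After bedtools_merge with "-c 4 -o collapse", overlapping redo regions
# carry a comma-joined list of the original variant-set ids in column 4,
# which the renumbering loop above folds into one fresh id per merged
# region; e.g. for a made-up merged BED record:
line = "22\t100\t180\t3,7"
chrom, st, en, i_s = line.strip().split()
assert list(map(int, i_s.split(","))) == [3, 7]  # two sets, one redo region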
%(name)-20s %(message)s' @@ -114,13 +214,16 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): help='minimum INDEL len to resolve', default=4) parser.add_argument('--max_dist', type=int, help='max distance to neighboring variant', default=5) + parser.add_argument('--extend_repeats', + help='extend resolve regions to repeat boundaries', + action='store_true') parser.add_argument( '--pad', type=int, help='padding to bed region for extracting reads', default=10) args = parser.parse_args() logger.info(args) try: extract_postprocess_targets( - args.reference, args.input_vcf, args.min_len, args.max_dist, args.pad) + args.reference, args.input_vcf, args.min_len, args.max_dist, args.extend_repeats, args.pad) except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index 32043a8..f5ed9a7 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -185,6 +185,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense lr_gap_open_penalty, lr_gap_ext_penalty, lr_max_realign_dp, lr_do_split, keep_duplicate, pass_threshold, lowqual_threshold, + extend_repeats, msa_binary, num_threads): logger = logging.getLogger(postprocess.__name__) @@ -211,7 +212,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense logger.info("Extract targets") postprocess_pad = 1 if not long_read else 10 extract_postprocess_targets( - reference, candidates_preds, min_len, postprocess_max_dist, postprocess_pad) + reference, candidates_preds, min_len, postprocess_max_dist, extend_repeats, postprocess_pad) no_resolve = os.path.join(work, "candidates_preds.no_resolve.vcf") target_vcf = os.path.join(work, "candidates_preds.resolve_target.vcf") @@ -329,6 +330,9 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense parser.add_argument('--keep_duplicate', help='Dont filter duplicate reads in analysis', action="store_true") + parser.add_argument('--extend_repeats', + help='extend resolve regions to repeat boundaries', + action='store_true') parser.add_argument('--msa_binary', type=str, help='MSA binary', default="../bin/msa") parser.add_argument('--num_threads', type=int, @@ -351,6 +355,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense args.lr_do_split, args.keep_duplicate, args.pass_threshold, args.lowqual_threshold, + args.extend_repeats, args.msa_binary, args.num_threads) except Exception as e: diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 3b470e8..e6b5559 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -210,6 +210,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, window_extend, max_cluster_size, merge_d_for_scan, + use_vscore, num_splits, num_threads, scan_alignments_binary,): @@ -257,7 +258,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, "The dbSNP file should be a tabix indexed file with .vcf.gz format. 
No {}.tbi file exists.".format(dbsnp)) zero_vscore = False - if not ensemble_tsv and add_extra_features: + if (not ensemble_tsv and add_extra_features) and not use_vscore: zero_vscore = True ensemble_bed = None @@ -693,8 +694,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--merge_d_for_scan', type=int, help='-d used to merge regions before scan', default=None) - parser.add_argument('--zero_vscore', - help='set VarScan2_Score to zero', + parser.add_argument('--use_vscore', + help='don\'t set VarScan2_Score to zero', action="store_true") parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) @@ -722,6 +723,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.window_extend, args.max_cluster_size, args.merge_d_for_scan, + args.use_vscore, args.num_splits, args.num_threads, args.scan_alignments_binary) From cca94c82837cab5867c5af1fd2330bd4e6fa1d58 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 24 Jul 2020 01:31:40 -0700 Subject: [PATCH 66/89] small fix --- neusomatic/python/resolve_variants.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index e4359ba..4e7c009 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -221,10 +221,14 @@ def resolve_group(ref_fasta, variants, vars_count): if len(gts) > 1: gts_count = {"0/1": 0, "0/0": 0} gts_score = {"0/1": 0, "0/0": 0} + nz = 0 for x in group_vars[pos]: if (x.gt != "0/0" or x.len >= 3) and x.cnt >= 0.4 * mx: gts_count[x.gt] += x.cnt gts_score[x.gt] += x.score + nz += 1 + if nz == 0: + continue priority = {"0/1": 2, "0/0": 1} sorted_gts = sorted(gts_count.keys(), key=lambda x: [ gts_count[x], gts_score[x], @@ -311,6 +315,7 @@ def resolve_group(ref_fasta, variants, vars_count): for v in good_vs: out_variants_.append( [v.chrom, v.pos, v.ref, v.alt, v.gt, v.score]) + out_variants_ = [x for x in out_variants_ if x[4] != "0/0"] return out_variants_ @@ -358,7 +363,7 @@ def find_resolved_variants(input_record): inss_.extend(extract_ins(record)) aligned_pairs = np.array( record.get_aligned_pairs(matches_only=True)) - if len(aligned_pairs)==0: + if len(aligned_pairs) == 0: continue near_pos = np.where((start <= aligned_pairs[:, 1]) & ( aligned_pairs[:, 1] <= end))[0] From b4d2bdf3ed39bd6be0c6d22646128f51ae208458 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 28 Jul 2020 23:12:14 -0700 Subject: [PATCH 67/89] improve cpu multi-thread call.py --- neusomatic/python/call.py | 262 ++++++++++++++++++++++++-------- neusomatic/python/dataloader.py | 25 +-- 2 files changed, 217 insertions(+), 70 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 52f3a69..79ea73f 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -25,7 +25,7 @@ from network import NeuSomaticNet from dataloader import NeuSomaticDataset, matrix_transform -from utils import get_chromosomes_order, prob2phred +from utils import get_chromosomes_order, prob2phred, skip_empty from merge_tsvs import merge_tsvs from defaults import VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES @@ -349,15 +349,21 @@ def pred_vcf_records(ref_file, final_preds, true_path, chroms, num_threads): map_args.append([path, true_path[path], final_preds[path], chroms, ref_file]) - pool = multiprocessing.Pool(num_threads) - try: - all_vcf_records = pool.map_async(pred_vcf_records_path, 
map_args).get() - pool.close() - except Exception as inst: - logger.error(inst) - pool.close() - traceback.print_exc() - raise Exception + if num_threads == 1: + all_vcf_records = [] + for w in map_args: + all_vcf_records.append(pred_vcf_records_path(w)) + else: + pool = multiprocessing.Pool(num_threads) + try: + all_vcf_records = pool.map_async( + pred_vcf_records_path, map_args).get() + pool.close() + except Exception as inst: + logger.error(inst) + pool.close() + traceback.print_exc() + raise Exception for o in all_vcf_records: if o is None: @@ -422,6 +428,78 @@ def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_thr lines.append(line) +def write_merged_vcf(output_vcfs, output_vcf, chroms_order): + logger = logging.getLogger(write_merged_vcf.__name__) + vcf_records = [] + for vcf in output_vcfs: + with open(vcf) as i_f: + for line in skip_empty(i_f): + x = line.strip().split() + vcf_records.append([x[0], int(x[1]), line]) + vcf_records = sorted(vcf_records, key=lambda x: [chroms_order[x[0]], x[1]]) + lines = [] + with open(output_vcf, "w") as ov: + for chrom_, pos_, line in vcf_records: + if line not in lines: + ov.write(line) + lines.append(line) + + +def single_thread_call(record): + thread_logger = logging.getLogger( + "{} ({})".format(single_thread_call.__name__, multiprocessing.current_process().name)) + try: + torch.set_num_threads(1) + net, candidate_files, max_load_candidates, data_transform, \ + coverage_thr, normalize_channels, zero_ann_cols, batch_size, \ + out_dir, model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, \ + pass_threshold, lowqual_threshold, i = record + + call_set = NeuSomaticDataset(roots=candidate_files, + max_load_candidates=max_load_candidates, + transform=data_transform, is_test=True, + num_threads=1, + coverage_thr=coverage_thr, + normalize_channels=normalize_channels, + zero_ann_cols=zero_ann_cols) + call_loader = torch.utils.data.DataLoader(call_set, + batch_size=batch_size, + shuffle=True, # pin_memory=True, + num_workers=0) + logger.info("N_dataset: {}".format(len(call_set))) + if len(call_set) == 0: + logger.warning( + "Skip {} with 0 candidates".format(candidate_file)) + return [], [] + + final_preds_, none_preds_, true_path_ = call_variants( + net, call_loader, out_dir, model_tag, use_cuda) + all_vcf_records = pred_vcf_records( + ref_file, final_preds_, true_path_, chroms, 1) + all_vcf_records_none = pred_vcf_records_none(none_preds_, chroms) + + all_vcf_records = dict(all_vcf_records) + all_vcf_records_none = dict(all_vcf_records_none) + + var_vcf_records = get_vcf_records(all_vcf_records) + vcf_records_none = get_vcf_records(all_vcf_records_none) + + output_vcf = "{}/pred_{}.vcf".format(tmp_preds_dir, i) + write_vcf(var_vcf_records, output_vcf, chroms_order, + pass_threshold, lowqual_threshold) + + logger.info("Prepare Non-Somatics VCF") + output_vcf_none = "{}/none_{}.vcf".format(tmp_preds_dir, i) + write_vcf(vcf_records_none, output_vcf_none, + chroms_order, pass_threshold, lowqual_threshold) + + return output_vcf, output_vcf_none + except Exception as ex: + thread_logger.error(traceback.format_exc()) + thread_logger.error(ex) + return None + + def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, batch_size, max_load_candidates, pass_threshold, lowqual_threshold, force_zero_ann_cols, @@ -562,6 +640,13 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, Ls = [] candidates_tsv_ = [] split_i = 0 + total_L = 0 + for candidate_file in candidates_tsv: + total_L 
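# pred_vcf_records and the new single_thread_call path above bypass the
# multiprocessing pool entirely when num_threads == 1, avoiding pickling
# and worker-startup overhead; the same dispatch pattern in isolation
# (work() and its inputs are placeholders, not NeuSomatic functions):
import multiprocessing

def work(x):
    return x * x

def run_all(inputs, num_threads):
    if num_threads == 1:
        return [work(x) for x in inputs]
    pool = multiprocessing.Pool(num_threads)
    try:
        results = pool.map_async(work, inputs).get()
        pool.close()
    except Exception:
        pool.close()
        raise
    return results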
+= len(pickle.load(open(candidate_file + ".idx", "rb"))) + logger.info("Total number of candidates: {}".format(total_L)) + if not use_cuda: + max_load_candidates = min( + max_load_candidates, 3 * total_L // num_threads) for candidate_file in candidates_tsv: idx = pickle.load(open(candidate_file + ".idx", "rb")) if len(idx) > max_load_candidates / 2: @@ -595,59 +680,114 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, candidate_files = [] all_vcf_records = [] all_vcf_records_none = [] - for i, (candidate_file, L) in enumerate(sorted(zip(candidates_tsv_, Ls), key=lambda x: x[1])): - current_L += L - candidate_files.append(candidate_file) - if current_L > max_load_candidates / 10 or i == len(candidates_tsv_) - 1: - logger.info("Run for candidate files: {}".format(candidate_files)) - call_set = NeuSomaticDataset(roots=candidate_files, - max_load_candidates=max_load_candidates, - transform=data_transform, is_test=True, - num_threads=num_threads, - coverage_thr=coverage_thr, - normalize_channels=normalize_channels, - zero_ann_cols=zero_ann_cols) - call_loader = torch.utils.data.DataLoader(call_set, - batch_size=batch_size, - shuffle=True, pin_memory=True, - num_workers=num_threads) - - current_L = 0 - candidate_files = [] - - logger.info("N_dataset: {}".format(len(call_set))) - if len(call_set) == 0: - logger.warning( - "Skip {} with 0 candidates".format(candidate_file)) - continue - - final_preds_, none_preds_, true_path_ = call_variants( - net, call_loader, out_dir, model_tag, use_cuda) - all_vcf_records.extend(pred_vcf_records( - ref_file, final_preds_, true_path_, chroms, num_threads)) - all_vcf_records_none.extend( - pred_vcf_records_none(none_preds_, chroms)) - - all_vcf_records = dict(all_vcf_records) - all_vcf_records_none = dict(all_vcf_records_none) - - if os.path.exists(new_split_tsvs_dir): + if use_cuda: + for i, (candidate_file, L) in enumerate(sorted(zip(candidates_tsv_, Ls), key=lambda x: x[1])): + current_L += L + candidate_files.append(candidate_file) + if current_L > max_load_candidates / 10 or i == len(candidates_tsv_) - 1: + logger.info( + "Run for candidate files: {}".format(candidate_files)) + call_set = NeuSomaticDataset(roots=candidate_files, + max_load_candidates=max_load_candidates, + transform=data_transform, is_test=True, + num_threads=num_threads, + coverage_thr=coverage_thr, + normalize_channels=normalize_channels, + zero_ann_cols=zero_ann_cols) + call_loader = torch.utils.data.DataLoader(call_set, + batch_size=batch_size, + shuffle=True, pin_memory=True, + num_workers=num_threads) + + current_L = 0 + candidate_files = [] + + logger.info("N_dataset: {}".format(len(call_set))) + if len(call_set) == 0: + logger.warning( + "Skip {} with 0 candidates".format(candidate_file)) + continue + + final_preds_, none_preds_, true_path_ = call_variants( + net, call_loader, out_dir, model_tag, use_cuda) + all_vcf_records.extend(pred_vcf_records( + ref_file, final_preds_, true_path_, chroms, num_threads)) + all_vcf_records_none.extend( + pred_vcf_records_none(none_preds_, chroms)) + all_vcf_records = dict(all_vcf_records) + all_vcf_records_none = dict(all_vcf_records_none) + + logger.info("Prepare Output VCF") + output_vcf = "{}/pred.vcf".format(out_dir) + var_vcf_records = get_vcf_records(all_vcf_records) + write_vcf(var_vcf_records, output_vcf, chroms_order, + pass_threshold, lowqual_threshold) + + logger.info("Prepare Non-Somatics VCF") + output_vcf_none = "{}/none.vcf".format(out_dir) + vcf_records_none = get_vcf_records(all_vcf_records_none) + 
write_vcf(vcf_records_none, output_vcf_none, + chroms_order, pass_threshold, lowqual_threshold) + else: + tmp_preds_dir = os.path.join(out_dir, "tmp_preds") + if os.path.exists(tmp_preds_dir): + logger.warning( + "Remove tmp_preds directory: {}".format(tmp_preds_dir)) + shutil.rmtree(tmp_preds_dir) + os.mkdir(tmp_preds_dir) + + map_args = [] + j = 0 + for i, (candidate_file, L) in enumerate(sorted(zip(candidates_tsv_, Ls), key=lambda x: x[1])): + current_L += L + candidate_files.append(candidate_file) + if current_L > max_load_candidates / 10 or i == len(candidates_tsv_) - 1: + logger.info( + "Run for candidate files: {}".format(candidate_files)) + + map_args.append([net, candidate_files, max_load_candidates, data_transform, + coverage_thr, normalize_channels, zero_ann_cols, batch_size, + out_dir, + model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, + pass_threshold, lowqual_threshold, j]) + j += 1 + current_L = 0 + candidate_files = [] + + pool = multiprocessing.Pool(num_threads) + try: + all_records = pool.map_async(single_thread_call, map_args).get() + pool.close() + except Exception as inst: + logger.error(inst) + pool.close() + traceback.print_exc() + raise Exception + + for o in all_records: + if o is None: + raise Exception("single_thread_call failed!") + + output_vcfs = [x[0] for x in all_records] + output_vcfs_none = [x[1] for x in all_records] + + logger.info("Prepare Output VCF") + output_vcf = "{}/pred.vcf".format(out_dir) + write_merged_vcf(output_vcfs, output_vcf, chroms_order) + + logger.info("Prepare Non-Somatics VCF") + output_vcf_none = "{}/none.vcf".format(out_dir) + write_merged_vcf(output_vcfs_none, output_vcf_none, chroms_order) + + if os.path.exists(tmp_preds_dir): + logger.warning( + "Remove tmp_preds directory: {}".format(tmp_preds_dir)) + shutil.rmtree(tmp_preds_dir) + + if os.path.exists(tmp_preds_dir): logger.warning( "Remove split candidates directory: {}".format(new_split_tsvs_dir)) shutil.rmtree(new_split_tsvs_dir) - - logger.info("Prepare Output VCF") - output_vcf = "{}/pred.vcf".format(out_dir) - var_vcf_records = get_vcf_records(all_vcf_records) - write_vcf(var_vcf_records, output_vcf, chroms_order, - pass_threshold, lowqual_threshold) - - logger.info("Prepare Non-Somatics VCF") - output_vcf_none = "{}/none.vcf".format(out_dir) - vcf_records_none = get_vcf_records(all_vcf_records_none) - write_vcf(vcf_records_none, output_vcf_none, - chroms_order, pass_threshold, lowqual_threshold) - if os.path.exists(matrices_dir): logger.warning("Remove matrices directory: {}".format(matrices_dir)) shutil.rmtree(matrices_dir) @@ -693,6 +833,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, default=[]) args = parser.parse_args() + logger.info(args) + use_cuda = torch.cuda.is_available() logger.info("use_cuda: {}".format(use_cuda)) diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py index c2450dd..76a3773 100755 --- a/neusomatic/python/dataloader.py +++ b/neusomatic/python/dataloader.py @@ -195,16 +195,21 @@ def __init__(self, roots, max_load_candidates, transform=None, if len(map_args) == 1: records_ = [extract_info_tsv(map_args[0])] else: - pool = multiprocessing.Pool(num_threads) - try: - records_ = pool.map_async( - extract_info_tsv, map_args).get() - pool.close() - except Exception as inst: - pool.close() - logger.error(inst) - traceback.print_exc() - raise Exception + if num_threads==1: + records_ = [] + for w in map_args: + records_.append(extract_info_tsv(w)) + else: + pool = 
multiprocessing.Pool(num_threads) + try: + records_ = pool.map_async( + extract_info_tsv, map_args).get() + pool.close() + except Exception as inst: + pool.close() + logger.error(inst) + traceback.print_exc() + raise Exception for o in records_: if o is None: From a35932764d64b4db96a2facf74419acf47eab67e Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 30 Jul 2020 00:43:48 -0700 Subject: [PATCH 68/89] small fix --- neusomatic/python/call.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 79ea73f..7835778 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -784,7 +784,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, "Remove tmp_preds directory: {}".format(tmp_preds_dir)) shutil.rmtree(tmp_preds_dir) - if os.path.exists(tmp_preds_dir): + if os.path.exists(new_split_tsvs_dir): logger.warning( "Remove split candidates directory: {}".format(new_split_tsvs_dir)) shutil.rmtree(new_split_tsvs_dir) From f6d3174a03b4904083b11adcb3df3e7cb0efe54d Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 30 Jul 2020 14:00:12 -0700 Subject: [PATCH 69/89] updated dockerfile --- docker/Dockerfile | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index f6bdbc9..af16a44 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,19 +2,19 @@ FROM ubuntu:18.04 ENV NEUSOMATIC_VERSION 0.3.0 ENV ZLIB_VERSION 1.2.11 -ENV NUMPY_VERSION 1.18.1 -ENV SCIPY_VERSION 1.4.1 -ENV IMAGEIO_VERSION 2.8.0 -ENV PILLOW_VERSION 7.1.2 -ENV PYTORCH_VERSION 1.4.0 -ENV TORCHVISION_VERSION 0.5.0 -ENV CUDATOOLKIT_VERSION 9.2 +ENV NUMPY_VERSION 1.18.5 +ENV SCIPY_VERSION 1.5.0 +ENV IMAGEIO_VERSION 2.9.0 +ENV PILLOW_VERSION 7.2.0 +ENV PYTORCH_VERSION 1.6.0 +ENV TORCHVISION_VERSION 0.7.0 +ENV CUDATOOLKIT_VERSION 10.1 ENV CMAKE_VERSION 3.14.0 ENV PYSAM_VERSION 0.15.3 ENV SAMTOOLS_VERSION 1.9 ENV TABIX_VERSION 0.2.6 ENV BEDTOOLS_VERSION 2.29.2 -ENV BIOPYTHON_VERSION 1.76 +ENV BIOPYTHON_VERSION 1.77 ENV FISHER_VERSION 0.1.9 ENV GCC_VERSION 5 @@ -22,14 +22,13 @@ RUN apt-get update && apt-get install -y --fix-missing \ build-essential zlib1g-dev curl less vim bzip2 RUN apt-get install -y --fix-missing git wget -RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -RUN bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b -RUN rm Miniconda3-latest-Linux-x86_64.sh +RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh +RUN bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -p /miniconda -b +RUN rm Miniconda3-py37_4.8.3-Linux-x86_64.sh ENV PATH=/miniconda/bin:${PATH} ENV LD_LIBRARY_PATH=/miniconda/lib:${LD_LIBRARY_PATH} RUN conda update -y conda - RUN conda install -y zlib=${ZLIB_VERSION} numpy=${NUMPY_VERSION} scipy=${SCIPY_VERSION} \ pillow=${PILLOW_VERSION} cmake=${CMAKE_VERSION} imageio=${IMAGEIO_VERSION} && conda clean -a RUN conda install -y fisher=${FISHER_VERSION} -c conda-forge && conda clean -a From accb759a7b1fc4f420c71b8630247f3b59c617ad Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 30 Jul 2020 14:03:03 -0700 Subject: [PATCH 70/89] updated docker test --- test/docker_test.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/docker_test.sh b/test/docker_test.sh index 4a8f34f..191f838 100755 --- a/test/docker_test.sh +++ b/test/docker_test.sh @@ -10,16 +10,16 @@ if [ ! 
-f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa ] then if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "cd /mnt/example/ && wget ftp://ftp.ensembl.org/pub/release-75//fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi - docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "cd /mnt/example/ && gunzip -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.fai ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "samtools faidx /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa" fi rm -rf work_standalone @@ -27,7 +27,7 @@ rm -rf work_standalone #Stand-alone NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -45,7 +45,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash --num_threads 1 \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:test /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_standalone/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -54,7 +54,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3 --num_threads 1 \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ @@ -66,7 +66,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash rm -rf /mnt/example/work_ensemble #Ensemble NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -85,7 +85,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash --ensemble_tsv /mnt/ensemble.tsv \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:test /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv 
/mnt/example/work_ensemble/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -95,7 +95,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3 --ensemble \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ From 9f5d876580053824ec0263d76ad7ee20c01a9757 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 18 Sep 2020 21:52:37 -0700 Subject: [PATCH 71/89] fix build --- build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/build.sh b/build.sh index c3fbf6d..e40035f 100755 --- a/build.sh +++ b/build.sh @@ -2,6 +2,9 @@ set -e DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -P)" +if [ -d ${DIR}/neusomatic/build ]; then + rm -rf ${DIR}/neusomatic/build +fi rm -rf $DIR/third_party/SeqLib/ $DIR/third_party/seqan/ pushd $DIR/neusomatic mkdir build From e500bf7ac727578e6b56b72f679e61c623e16df0 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 6 Oct 2020 13:47:34 -0700 Subject: [PATCH 72/89] small_fix --- build.sh | 1 + neusomatic/python/call.py | 32 ++++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/build.sh b/build.sh index e40035f..db9ba66 100755 --- a/build.sh +++ b/build.sh @@ -13,3 +13,4 @@ pushd $DIR/neusomatic make popd popd +rm -rf $DIR/third_party/SeqLib/ $DIR/third_party/seqan/ diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 7835778..dc7b379 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -408,7 +408,8 @@ def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_thr logger = logging.getLogger(write_vcf.__name__) vcf_records = list(filter(lambda x: len(x) > 0, vcf_records)) vcf_records = sorted(vcf_records, key=lambda x: [chroms_order[x[0]], x[1]]) - lines = [] + old_pos = "" + lines_old_pos = set([]) with open(output_vcf, "w") as ov: for chrom_, pos_, ref_, alt_, prob in vcf_records: if ref_ == alt_: @@ -423,9 +424,18 @@ def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_thr filter_, "SCORE={:.4f}".format( np.round(prob, 4)), "GT", "0/1"]) + "\n" - if line not in lines: + curr_pos = "-".join([chrom_, str(pos_)]) + emit = False + if old_pos != curr_pos: + old_pos = curr_pos + lines_old_pos = set([line]) + emit = True + else: + if line not in lines_old_pos: + emit = True + lines_old_pos.add(line) + if emit: ov.write(line) - lines.append(line) def write_merged_vcf(output_vcfs, output_vcf, chroms_order): @@ -437,12 +447,22 @@ def write_merged_vcf(output_vcfs, output_vcf, chroms_order): x = line.strip().split() vcf_records.append([x[0], int(x[1]), line]) vcf_records = sorted(vcf_records, key=lambda x: [chroms_order[x[0]], x[1]]) - lines = [] + old_pos = "" + lines_old_pos = set([]) with open(output_vcf, "w") as ov: for chrom_, pos_, line in vcf_records: - if line not in lines: + curr_pos = "-".join([chrom_, str(pos_)]) + emit = False + if old_pos != curr_pos: + old_pos = curr_pos + lines_old_pos = set([line]) + emit = True + else: + if line not in lines_old_pos: + emit = True + lines_old_pos.add(line) + if emit: ov.write(line) - lines.append(line) def single_thread_call(record): From e48e15d428a5db762d623914ca2b3b5a87199442 Mon Sep 17 00:00:00 2001 From: 
Sahraeian Date: Sat, 7 Nov 2020 00:18:13 -0800 Subject: [PATCH 73/89] force cov_thr --- neusomatic/python/call.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index dc7b379..5cf437a 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -523,6 +523,7 @@ def single_thread_call(record): def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, batch_size, max_load_candidates, pass_threshold, lowqual_threshold, force_zero_ann_cols, + force_cov_thr, use_cuda): logger = logging.getLogger(call_neusomatic.__name__) @@ -568,6 +569,12 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, "Override zero_ann_cols from force_zero_ann_cols: {}".format(force_zero_ann_cols)) zero_ann_cols = force_zero_ann_cols + if force_cov_thr is not None: + logger.info( + "Override coverage_thr from force_cov_thr: {}".format(force_cov_thr)) + coverage_thr = force_cov_thr + + logger.info("coverage_thr: {}".format(coverage_thr)) logger.info("normalize_channels: {}".format(normalize_channels)) logger.info("no_seq_complexity: {}".format(no_seq_complexity)) @@ -851,6 +858,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, --zero_ann_cols and pretrained setting.\ idx starts from 5th column in candidate.tsv file', default=[]) + parser.add_argument('--force_cov_thr', type=int, + help='Force maximum coverage threshold.', default=None) args = parser.parse_args() logger.info(args) @@ -864,6 +873,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, args.num_threads, args.batch_size, args.max_load_candidates, args.pass_threshold, args.lowqual_threshold, args.force_zero_ann_cols, + args.force_cov_thr, use_cuda) except Exception as e: logger.error(traceback.format_exc()) From a96a2367d1a53e2d4a54cd1c1be41f50704cb224 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sun, 8 Nov 2020 20:35:11 -0800 Subject: [PATCH 74/89] fix max_cov --- neusomatic/python/call.py | 20 ++++++++++---------- neusomatic/python/dataloader.py | 11 ++++++++--- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 5cf437a..f57c638 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -471,7 +471,7 @@ def single_thread_call(record): try: torch.set_num_threads(1) net, candidate_files, max_load_candidates, data_transform, \ - coverage_thr, normalize_channels, zero_ann_cols, batch_size, \ + coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, \ out_dir, model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, \ pass_threshold, lowqual_threshold, i = record @@ -480,6 +480,7 @@ def single_thread_call(record): transform=data_transform, is_test=True, num_threads=1, coverage_thr=coverage_thr, + max_cov=max_cov, normalize_channels=normalize_channels, zero_ann_cols=zero_ann_cols) call_loader = torch.utils.data.DataLoader(call_set, @@ -523,7 +524,7 @@ def single_thread_call(record): def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, batch_size, max_load_candidates, pass_threshold, lowqual_threshold, force_zero_ann_cols, - force_cov_thr, + max_cov, use_cuda): logger = logging.getLogger(call_neusomatic.__name__) @@ -569,11 +570,9 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, "Override zero_ann_cols from force_zero_ann_cols: {}".format(force_zero_ann_cols)) zero_ann_cols = force_zero_ann_cols - if 
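The `write_vcf`/`write_merged_vcf` rewrite in PATCH 72 above changes duplicate suppression from a single `lines` list, whose `line not in lines` test rescans every record written so far, to a set that is reset whenever the sorted records advance to a new chromosome-position key: membership tests become O(1) and memory is bounded by the deepest single position. A sketch of that pattern, assuming records arrive pre-sorted by (chrom, pos):

```
def dedup_sorted(records):
    # records: iterable of (chrom, pos, line) tuples, sorted by (chrom, pos).
    old_key = None
    seen_at_pos = set()
    for chrom, pos, line in records:
        key = (chrom, pos)
        if key != old_key:
            # New position: forget lines seen at the previous position.
            old_key = key
            seen_at_pos = {line}
            yield line
        elif line not in seen_at_pos:
            seen_at_pos.add(line)
            yield line
```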
force_cov_thr is not None: + if max_cov is not None: logger.info( - "Override coverage_thr from force_cov_thr: {}".format(force_cov_thr)) - coverage_thr = force_cov_thr - + "Set max_cov: {}".format(max_cov)) logger.info("coverage_thr: {}".format(coverage_thr)) logger.info("normalize_channels: {}".format(normalize_channels)) @@ -719,6 +718,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, transform=data_transform, is_test=True, num_threads=num_threads, coverage_thr=coverage_thr, + max_cov=max_cov, normalize_channels=normalize_channels, zero_ann_cols=zero_ann_cols) call_loader = torch.utils.data.DataLoader(call_set, @@ -773,7 +773,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, "Run for candidate files: {}".format(candidate_files)) map_args.append([net, candidate_files, max_load_candidates, data_transform, - coverage_thr, normalize_channels, zero_ann_cols, batch_size, + coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, out_dir, model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, pass_threshold, lowqual_threshold, j]) @@ -858,8 +858,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, --zero_ann_cols and pretrained setting.\ idx starts from 5th column in candidate.tsv file', default=[]) - parser.add_argument('--force_cov_thr', type=int, - help='Force maximum coverage threshold.', default=None) + parser.add_argument('--max_cov', type=int, + help='maximum coverage threshold.', default=None) args = parser.parse_args() logger.info(args) @@ -873,7 +873,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, args.num_threads, args.batch_size, args.max_load_candidates, args.pass_threshold, args.lowqual_threshold, args.force_zero_ann_cols, - args.force_cov_thr, + args.max_cov, use_cuda) except Exception as e: logger.error(traceback.format_exc()) diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py index 76a3773..71fe47b 100755 --- a/neusomatic/python/dataloader.py +++ b/neusomatic/python/dataloader.py @@ -130,6 +130,7 @@ def __init__(self, roots, max_load_candidates, transform=None, loader=candidate_loader_tsv, is_test=False, num_threads=1, disable_ensemble=False, data_augmentation=False, nclasses_t=4, nclasses_l=4, coverage_thr=100, + max_cov=None, normalize_channels=False, zero_ann_cols=[], max_opended_tsv=-1): @@ -195,7 +196,7 @@ def __init__(self, roots, max_load_candidates, transform=None, if len(map_args) == 1: records_ = [extract_info_tsv(map_args[0])] else: - if num_threads==1: + if num_threads == 1: records_ = [] for w in map_args: records_.append(extract_info_tsv(w)) @@ -237,6 +238,7 @@ def __init__(self, roots, max_load_candidates, transform=None, self.disable_ensemble = disable_ensemble self.data_augmentation = data_augmentation self.coverage_thr = coverage_thr + self.max_cov = max_cov def open_candidate_tsvs(self): for i, tsv in enumerate(self.tsvs): @@ -271,8 +273,8 @@ def __getitem__(self, index): if self.disable_ensemble: anns = [] - if self.zero_ann_cols and len(anns)>0: - anns=np.array(anns) + if self.zero_ann_cols and len(anns) > 0: + anns = np.array(anns) anns[self.zero_ann_cols] = 0 anns = anns.tolist() @@ -281,6 +283,9 @@ def __getitem__(self, index): ".") tumor_cov = int(tumor_cov) normal_cov = int(normal_cov) + if self.max_cov is not None: + tumor_cov = min(tumor_cov, self.max_cov) + normal_cov = min(normal_cov, self.max_cov) center = int(center) length = int(length) From 
d804742168d111182f222163deb8fa29c7e41259 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 5 Dec 2020 22:04:50 -0800 Subject: [PATCH 75/89] fixed matrices gradual delete --- neusomatic/python/call.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index f57c638..db1b55b 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -53,7 +53,7 @@ def get_type(ref, alt): return "SNP" -def call_variants(net, call_loader, out_dir, model_tag, use_cuda): +def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): logger = logging.getLogger(call_variants.__name__) net.eval() nclasses = len(VARTYPE_CLASSES) @@ -94,8 +94,8 @@ def call_variants(net, call_loader, out_dir, model_tag, use_cuda): path = path_.split("/")[-1] preds[i] = [VARTYPE_CLASSES[predicted[i]], pos_pred[i], len_pred[i]] if VARTYPE_CLASSES[predicted[i]] != "NONE": - file_name = "{}/matrices_{}/{}.png".format( - out_dir, model_tag, path) + file_name = "{}/matrices_{}/{}/{}.{}_{}.png".format( + out_dir, model_tag, run_i, path, iii, i) if not os.path.exists(file_name): imwrite(file_name, np.array( non_transformed_matrices[i, :, :, 0:3])) @@ -494,7 +494,7 @@ def single_thread_call(record): return [], [] final_preds_, none_preds_, true_path_ = call_variants( - net, call_loader, out_dir, model_tag, use_cuda) + net, call_loader, out_dir, model_tag, i, use_cuda) all_vcf_records = pred_vcf_records( ref_file, final_preds_, true_path_, chroms, 1) all_vcf_records_none = pred_vcf_records_none(none_preds_, chroms) @@ -513,6 +513,10 @@ def single_thread_call(record): output_vcf_none = "{}/none_{}.vcf".format(tmp_preds_dir, i) write_vcf(vcf_records_none, output_vcf_none, chroms_order, pass_threshold, lowqual_threshold) + matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, i) + if os.path.exists(matrices_dir_j): + logger.warning("Done with {}. Remove matrices directory {}: {}".format(i, i, matrices_dir_j)) + shutil.rmtree(matrices_dir_j) return output_vcf, output_vcf_none except Exception as ex: @@ -707,6 +711,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, all_vcf_records = [] all_vcf_records_none = [] if use_cuda: + run_i = -1 for i, (candidate_file, L) in enumerate(sorted(zip(candidates_tsv_, Ls), key=lambda x: x[1])): current_L += L candidate_files.append(candidate_file) @@ -728,19 +733,30 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, current_L = 0 candidate_files = [] - + run_i += 1 logger.info("N_dataset: {}".format(len(call_set))) if len(call_set) == 0: logger.warning( "Skip {} with 0 candidates".format(candidate_file)) continue + matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, run_i) + if os.path.exists(matrices_dir_j): + logger.warning("Remove matrices directory {}: {}".format(run_i, matrices_dir_j)) + shutil.rmtree(matrices_dir_j) + os.mkdir(matrices_dir_j) + final_preds_, none_preds_, true_path_ = call_variants( - net, call_loader, out_dir, model_tag, use_cuda) + net, call_loader, out_dir, model_tag, run_i, use_cuda) all_vcf_records.extend(pred_vcf_records( ref_file, final_preds_, true_path_, chroms, num_threads)) all_vcf_records_none.extend( pred_vcf_records_none(none_preds_, chroms)) + + if os.path.exists(matrices_dir_j): + logger.warning("Done with {}. 
Remove matrices directory {}: {}".format(run_i, run_i, matrices_dir_j)) + shutil.rmtree(matrices_dir_j) + all_vcf_records = dict(all_vcf_records) all_vcf_records_none = dict(all_vcf_records_none) @@ -772,6 +788,11 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info( "Run for candidate files: {}".format(candidate_files)) + matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, j) + if os.path.exists(matrices_dir_j): + logger.warning("Remove matrices directory {}: {}".format(j, matrices_dir_j)) + shutil.rmtree(matrices_dir_j) + os.mkdir(matrices_dir_j) map_args.append([net, candidate_files, max_load_candidates, data_transform, coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, out_dir, From 45fbcd6d6cbfed77260b6e17e296665f7cc5e209 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 29 Dec 2020 00:31:22 -0800 Subject: [PATCH 76/89] fix generate_dataset --- neusomatic/python/generate_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 53920c5..4881382 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1707,8 +1707,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be rlen = record_len[int(record[-1])] rcenter = record_center[int(record[-1])] ch_order = chroms_order[record[0]] - ann = anns[ - int(record[-1])] if ensemble_bed else [] + ann = list(anns[int(record[-1])] + ) if ensemble_bed else [] map_args_records.append((ref_file, tumor_count_bed, normal_count_bed, record, vartype, rlen, rcenter, ch_order, matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths)) if cnt >= is_end: @@ -1726,8 +1726,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be if is_current <= cnt < is_end: rcenter = record_center[int(record[-1])] ch_order = chroms_order[record[0]] - ann = anns[ - int(record[-1])] if ensemble_bed else [] + ann = list(anns[int(record[-1])] + ) if ensemble_bed else [] map_args_nones.append((ref_file, tumor_count_bed, normal_count_bed, record, "NONE", 0, rcenter, ch_order, matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths)) From cff3653d76e08177b1b800274f862a3ac3c45382 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 22 Jan 2021 12:25:58 -0800 Subject: [PATCH 77/89] fix ensemble rounding --- neusomatic/python/generate_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 4881382..97f2eac 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1564,7 +1564,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea if i_s: s = ensemble_data[:, np.array(i_s)] s = np.maximum(np.minimum(s, mx), mn) - s = (s - mn) / (mx - mn) + s = np.round((s - mn) / (mx - mn),6) ensemble_data[:, np.array(i_s)] = s ensemble_data = ensemble_data[:, selected_features] ensemble_data = ensemble_data.tolist() From 603d582f32bb3c8b9e58e987e4b9649ab1387200 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 22 Jan 2021 22:36:30 -0800 Subject: [PATCH 78/89] reduce disc I/O while calling --- neusomatic/python/call.py | 56 ++++++++------------------------------- 1 file changed, 11 insertions(+), 45 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index db1b55b..5505d59 
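The one-line change in PATCH 77 above rounds the min-max-scaled ensemble features to six decimals, presumably so the normalized values written to the candidate TSVs do not carry platform-dependent floating-point noise. The scaling itself clips each selected column to its known [mn, mx] range and maps it to [0, 1]; a small sketch of that normalization (assumes mx > mn element-wise):

```
import numpy as np


def scale_features(data, cols, mn, mx):
    # Clip to the known range, min-max scale to [0, 1], then round so the
    # serialized TSV text is reproducible from run to run.
    s = data[:, np.array(cols)]
    s = np.maximum(np.minimum(s, mx), mn)
    s = np.round((s - mn) / (mx - mn), 6)
    data[:, np.array(cols)] = s
    return data
```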
100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -15,7 +15,6 @@ import pysam import numpy as np -from imageio import imwrite, imread import torch from torch.autograd import Variable import torch.nn as nn @@ -59,7 +58,6 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): nclasses = len(VARTYPE_CLASSES) final_preds = {} none_preds = {} - true_path = {} final_preds = {} none_preds = {} @@ -94,12 +92,6 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): path = path_.split("/")[-1] preds[i] = [VARTYPE_CLASSES[predicted[i]], pos_pred[i], len_pred[i]] if VARTYPE_CLASSES[predicted[i]] != "NONE": - file_name = "{}/matrices_{}/{}/{}.{}_{}.png".format( - out_dir, model_tag, run_i, path, iii, i) - if not os.path.exists(file_name): - imwrite(file_name, np.array( - non_transformed_matrices[i, :, :, 0:3])) - true_path[path] = file_name final_preds[path] = [VARTYPE_CLASSES[predicted[i]], pos_pred[i], len_pred[i], list(map(lambda x: round(x, 4), F.softmax( outputs1[i, :], 0).data.cpu().numpy())), @@ -108,7 +100,8 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): list(map(lambda x: round(x, 4), outputs1.data.cpu()[i].numpy())), list(map(lambda x: round(x, 4), - outputs3.data.cpu()[i].numpy()))] + outputs3.data.cpu()[i].numpy())), + np.array(non_transformed_matrices[i, :, :, 0:3])] else: none_preds[path] = [VARTYPE_CLASSES[predicted[i]], pos_pred[i], len_pred[i], list(map(lambda x: round(x, 4), F.softmax( @@ -122,17 +115,17 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): if (iii % 10 == 0): logger.info("Called {} candidates in this batch.".format(j)) logger.info("Called {} candidates in this batch.".format(j)) - return final_preds, none_preds, true_path + return final_preds, none_preds def pred_vcf_records_path(record): - path, true_path_, pred_all, chroms, ref_file = record + path, pred_all, chroms, ref_file = record thread_logger = logging.getLogger( "{} ({})".format(pred_vcf_records_path.__name__, multiprocessing.current_process().name)) try: fasta_file = pysam.FastaFile(ref_file) ACGT = "ACGT" - I = imread(true_path_) / 255.0 + I = pred_all[-1] / 255.0 vcf_record = [] Ih, Iw, _ = I.shape zref_pos = np.where((np.argmax(I[:, :, 0], 0) == 0) & ( @@ -340,13 +333,13 @@ def pred_vcf_records_path(record): return None -def pred_vcf_records(ref_file, final_preds, true_path, chroms, num_threads): +def pred_vcf_records(ref_file, final_preds, chroms, num_threads): logger = logging.getLogger(pred_vcf_records.__name__) logger.info( "Prepare VCF records for predicted somatic variants in this batch.") map_args = [] for path in final_preds.keys(): - map_args.append([path, true_path[path], final_preds[path], + map_args.append([path, final_preds[path], chroms, ref_file]) if num_threads == 1: @@ -493,10 +486,10 @@ def single_thread_call(record): "Skip {} with 0 candidates".format(candidate_file)) return [], [] - final_preds_, none_preds_, true_path_ = call_variants( + final_preds_, none_preds_ = call_variants( net, call_loader, out_dir, model_tag, i, use_cuda) all_vcf_records = pred_vcf_records( - ref_file, final_preds_, true_path_, chroms, 1) + ref_file, final_preds_, chroms, 1) all_vcf_records_none = pred_vcf_records_none(none_preds_, chroms) all_vcf_records = dict(all_vcf_records) @@ -513,10 +506,6 @@ def single_thread_call(record): output_vcf_none = "{}/none_{}.vcf".format(tmp_preds_dir, i) write_vcf(vcf_records_none, output_vcf_none, chroms_order, pass_threshold, lowqual_threshold) - 
matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, i) - if os.path.exists(matrices_dir_j): - logger.warning("Done with {}. Remove matrices directory {}: {}".format(i, i, matrices_dir_j)) - shutil.rmtree(matrices_dir_j) return output_vcf, output_vcf_none except Exception as ex: @@ -655,11 +644,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, if not os.path.exists(out_dir): os.mkdir(out_dir) - matrices_dir = "{}/matrices_{}".format(out_dir, model_tag) - if os.path.exists(matrices_dir): - logger.warning("Remove matrices directory: {}".format(matrices_dir)) - shutil.rmtree(matrices_dir) - os.mkdir(matrices_dir) new_split_tsvs_dir = os.path.join(out_dir, "split_tsvs") if os.path.exists(new_split_tsvs_dir): @@ -740,23 +724,13 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, "Skip {} with 0 candidates".format(candidate_file)) continue - matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, run_i) - if os.path.exists(matrices_dir_j): - logger.warning("Remove matrices directory {}: {}".format(run_i, matrices_dir_j)) - shutil.rmtree(matrices_dir_j) - os.mkdir(matrices_dir_j) - - final_preds_, none_preds_, true_path_ = call_variants( + final_preds_, none_preds_ = call_variants( net, call_loader, out_dir, model_tag, run_i, use_cuda) all_vcf_records.extend(pred_vcf_records( - ref_file, final_preds_, true_path_, chroms, num_threads)) + ref_file, final_preds_, chroms, num_threads)) all_vcf_records_none.extend( pred_vcf_records_none(none_preds_, chroms)) - if os.path.exists(matrices_dir_j): - logger.warning("Done with {}. Remove matrices directory {}: {}".format(run_i, run_i, matrices_dir_j)) - shutil.rmtree(matrices_dir_j) - all_vcf_records = dict(all_vcf_records) all_vcf_records_none = dict(all_vcf_records_none) @@ -788,11 +762,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info( "Run for candidate files: {}".format(candidate_files)) - matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, j) - if os.path.exists(matrices_dir_j): - logger.warning("Remove matrices directory {}: {}".format(j, matrices_dir_j)) - shutil.rmtree(matrices_dir_j) - os.mkdir(matrices_dir_j) map_args.append([net, candidate_files, max_load_candidates, data_transform, coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, out_dir, @@ -836,9 +805,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.warning( "Remove split candidates directory: {}".format(new_split_tsvs_dir)) shutil.rmtree(new_split_tsvs_dir) - if os.path.exists(matrices_dir): - logger.warning("Remove matrices directory: {}".format(matrices_dir)) - shutil.rmtree(matrices_dir) logger.info("Calling is Done.") From d8738f05c608a57a102c7548f213999754480821 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 26 Jan 2021 09:30:31 -0800 Subject: [PATCH 79/89] Updated README --- README.md | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 18b2c6d..8764832 100644 --- a/README.md +++ b/README.md @@ -40,27 +40,29 @@ NeuSomatic first scans the genome to identify candidate variants and extract ali The binary for this step can be obtained at `neusomatic/bin` folder by running `./build.sh` (which requires cmake 3.13.2 and g++ 5.4.0). 
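PATCH 78 above removes a full disk round-trip during calling: PATCH 75 had already bounded disk usage by creating and deleting per-run `matrices` subdirectories, and PATCH 78 eliminates them altogether by carrying each candidate's channel slab inside the prediction record instead of writing it as a PNG with `imwrite` and re-reading it with `imread` in `pred_vcf_records_path` (hence the dropped `imageio` import). A toy version of the before/after handoff (hypothetical function names):

```
import numpy as np


def produce(preds, tag, fields, matrix):
    # After PATCH 78: append the (H, W, 3) slab to the record in memory,
    # rather than writing a PNG and remembering its path.
    preds[tag] = fields + [np.array(matrix[:, :, 0:3])]


def consume(preds, tag):
    *fields, image = preds[tag]
    # Consumed directly; no imread() and no matrices_<tag> cleanup needed.
    return fields, image
```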
Python 3.7 and the following Python packages must be installed: -* pytorch 1.1.0 -* torchvision 0.3.0 -* pybedtools 0.8.0 -* pysam 0.15.2 +* pytorch 1.6.0 +* torchvision 0.7.0 +* pysam 0.16.0.1 * zlib 1.2.11 -* numpy 1.15.4 -* scipy 1.2.0 -* imageio 2.5.0 -* biopython 1.73 +* numpy 1.18.1 +* scipy 1.4.1 +* pillow 7.2.0 +* imageio 2.8.0 +* biopython 1.77 +* fisher 0.1.9 It also depends on the following packages: -* cudatoolkit 9.0 (if you want to use GPU) +* cudatoolkit 10.1 (if you want to use GPU) * tabix 0.2.6 -* bedtools 2.27.1 +* bedtools 2.29.2 * samtools 1.9 -You can install these packages using [anaconda](https://www.anaconda.com/download)/[miniconda](https://conda.io/miniconda.html) : +You can install these packages using [anaconda](https://www.anaconda.com/download)/[miniconda](https://conda.io/miniconda.html) (for Python 3.7 on miniconda you can use [this link](https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh)): ``` -conda install zlib=1.2.11 numpy=1.15.4 scipy=1.2.0 cmake=3.13.2 imageio=2.5.0 -conda install pysam=0.15.2 pybedtools=0.8.0 samtools=1.9 tabix=0.2.6 bedtools=2.27.1 biopython=1.73 -c bioconda -conda install pytorch=1.1.0 torchvision=0.3.0 cudatoolkit=9.0 -c pytorch +conda install zlib=1.2.11 numpy=1.18.1 scipy=1.4.1 pillow=7.2.0 cmake=3.17.0 imageio=2.8.0 +conda install pysam=0.16.0.1 samtools=1.9 tabix=0.2.6 bedtools=2.29.2 biopython=1.77 -c bioconda +conda install pytorch=1.6.0 torchvision=0.7.0 cudatoolkit=10.1 -c pytorch +conda install -c conda-forge fisher=0.1.9 ``` Then you can export the conda paths as: ``` From e8f9f04db31f6cf89626148f321cb1d6046862e2 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 26 Jan 2021 11:25:27 -0800 Subject: [PATCH 80/89] fix README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8764832..83daf44 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ python preprocess.py \ --work work_train \ --truth_vcf truth.vcf \ --min_mapq 10 \ - --number_threads 10 \ + --num_threads 10 \ --scan_alignments_binary ../bin/scan_alignments ``` 2. Train network @@ -149,7 +149,7 @@ python preprocess.py \ --normal_bam normal.bam \ --work work_call \ --min_mapq 10 \ - --number_threads 10 \ + --num_threads 10 \ --scan_alignments_binary ../bin/scan_alignments ``` 2. 
Call variants @@ -280,7 +280,7 @@ do --reference GRCh38.fa --tumor_bam tumor.bam --normal_bam normal.bam \ --region_bed work/splits/region_${i}.bed \ --work work/work_${i} \ - --min_mapq 10 --number_threads 24 \ + --min_mapq 10 --num_threads 24 \ --scan_alignments_binary ../bin/scan_alignments" done ``` From 088c8458e816f82f00659eb5be59aa11b023f71b Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 4 Mar 2021 19:21:50 -0800 Subject: [PATCH 81/89] added uint16 as an option for input matrices --- neusomatic/python/call.py | 15 +++++++--- neusomatic/python/dataloader.py | 43 ++++++++++++++++++++------- neusomatic/python/defaults.py | 1 + neusomatic/python/generate_dataset.py | 30 ++++++++++++++----- neusomatic/python/preprocess.py | 13 ++++++-- neusomatic/python/train.py | 23 +++++++++++--- 6 files changed, 96 insertions(+), 29 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 5505d59..9130dca 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -466,7 +466,7 @@ def single_thread_call(record): net, candidate_files, max_load_candidates, data_transform, \ coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, \ out_dir, model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, \ - pass_threshold, lowqual_threshold, i = record + pass_threshold, lowqual_threshold, matrix_dtype, i = record call_set = NeuSomaticDataset(roots=candidate_files, max_load_candidates=max_load_candidates, @@ -475,7 +475,8 @@ def single_thread_call(record): coverage_thr=coverage_thr, max_cov=max_cov, normalize_channels=normalize_channels, - zero_ann_cols=zero_ann_cols) + zero_ann_cols=zero_ann_cols, + matrix_dtype=matrix_dtype) call_loader = torch.utils.data.DataLoader(call_set, batch_size=batch_size, shuffle=True, # pin_memory=True, @@ -557,6 +558,10 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, ensemble_custom_header = pretrained_dict["ensemble_custom_header"] else: ensemble_custom_header = False + if "matrix_dtype" in pretrained_dict: + matrix_dtype = pretrained_dict["matrix_dtype"] + else: + matrix_dtype = "uint8" if force_zero_ann_cols: logger.info( @@ -572,6 +577,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("no_seq_complexity: {}".format(no_seq_complexity)) logger.info("zero_ann_cols: {}".format(zero_ann_cols)) logger.info("ensemble_custom_header: {}".format(ensemble_custom_header)) + logger.info("matrix_dtype: {}".format(matrix_dtype)) if not ensemble_custom_header: expected_ens_fields = NUM_ENS_FEATURES @@ -709,7 +715,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, coverage_thr=coverage_thr, max_cov=max_cov, normalize_channels=normalize_channels, - zero_ann_cols=zero_ann_cols) + zero_ann_cols=zero_ann_cols, + matrix_dtype=matrix_dtype) call_loader = torch.utils.data.DataLoader(call_set, batch_size=batch_size, shuffle=True, pin_memory=True, @@ -766,7 +773,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, out_dir, model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, - pass_threshold, lowqual_threshold, j]) + pass_threshold, lowqual_threshold, matrix_dtype, j]) j += 1 current_L = 0 candidate_files = [] diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py index 71fe47b..3ae64ee 100755 --- a/neusomatic/python/dataloader.py +++ b/neusomatic/python/dataloader.py @@ -16,7 +16,7 @@ import 
resource from utils import skip_empty -from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES +from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES, MAT_DTYPES FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -36,11 +36,18 @@ def __call__(self, matrix): return matrix_ -def extract_zlib(zlib_compressed_im): - return np.fromstring(zlib.decompress(zlib_compressed_im), dtype="uint8").reshape((5, 32, 23)) +def extract_zlib(zlib_compressed_im, matrix_dtype): + if matrix_dtype == "uint8": + return np.fromstring(zlib.decompress(zlib_compressed_im), dtype="uint8").reshape((5, 32, 23)) + elif matrix_dtype == "uint16": + return np.fromstring(zlib.decompress(zlib_compressed_im), dtype="uint16").reshape((5, 32, 23)) + else: + logger.info( + "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES)) + raise Exception -def candidate_loader_tsv(tsv, open_tsv, idx, i): +def candidate_loader_tsv(tsv, open_tsv, idx, i, matrix_dtype): if open_tsv: i_f = open_tsv else: @@ -48,7 +55,7 @@ def candidate_loader_tsv(tsv, open_tsv, idx, i): i_f.seek(idx[i]) fields = i_f.read(idx[i + 1] - idx[i]).strip().split() tag = fields[2] - im = extract_zlib(base64.b64decode(fields[3])) + im = extract_zlib(base64.b64decode(fields[3]), matrix_dtype) if len(fields) > 4: anns = list(map(float, fields[4:])) else: @@ -60,7 +67,7 @@ def candidate_loader_tsv(tsv, open_tsv, idx, i): def extract_info_tsv(record): - i_b, tsv, idx, L, max_load_candidates, nclasses_t, nclasses_l = record + i_b, tsv, idx, L, max_load_candidates, nclasses_t, nclasses_l, matrix_dtype = record thread_logger = logging.getLogger( "{} ({})".format(extract_info_tsv.__name__, multiprocessing.current_process().name)) try: @@ -101,7 +108,8 @@ def extract_info_tsv(record): count_class_l[min(int(length), 3)] += 1 if ((cnt_var < max_load_candidates_var) and ("NONE" not in tag)) or ( (cnt_none < max_load_candidates_none) and ("NONE" in tag)): - im = extract_zlib(base64.b64decode(fields[3])) + im = extract_zlib(base64.b64decode( + fields[3]), matrix_dtype) label = TYPE_CLASS_DICT[tag.split(".")[4]] if len(fields) > 4: anns = list(map(float, fields[4:])) @@ -133,6 +141,7 @@ def __init__(self, roots, max_load_candidates, transform=None, max_cov=None, normalize_channels=False, zero_ann_cols=[], + matrix_dtype="uint8", max_opended_tsv=-1): soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) @@ -144,6 +153,7 @@ def __init__(self, roots, max_load_candidates, transform=None, self.max_opended_tsv = max_opended_tsv self.normalize_channels = normalize_channels self.zero_ann_cols = zero_ann_cols + self.matrix_dtype = matrix_dtype self.da_shift_p = 0.3 self.da_base_p = 0.05 self.da_rev_p = 0.1 @@ -190,7 +200,7 @@ def __init__(self, roots, max_load_candidates, transform=None, max_load_ = self.Ls[i_b] * max_load_candidates // \ total_L if total_L > 0 else 0 map_args.append([i_b, tsv, self.idxs[i_b], self.Ls[i_b], - max_load_, nclasses_t, nclasses_l]) + max_load_, nclasses_t, nclasses_l, self.matrix_dtype]) Ls_.append(self.Ls[i_b]) logger.info("Len's of tsv files in this batch: {}".format(Ls_)) if len(map_args) == 1: @@ -260,12 +270,14 @@ def __getitem__(self, index): self.open_tsvs[ int(multiprocessing.current_process()._identity[0] ) % self.num_threads][i_b], - self.idxs[i_b], i) + self.idxs[i_b], i, self.matrix_dtype) else: path, matrix, anns, label = candidate_loader_tsv(self.tsvs[i_b], self.open_tsvs[ 0][i_b], - self.idxs[i_b], i) + self.idxs[ + i_b], i, + self.matrix_dtype) else: path, 
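The `extract_zlib` change above is the heart of the new `matrix_dtype` option: each candidate matrix is stored in the TSV as a base64-wrapped zlib stream of raw bytes, so the reader must know whether those bytes are uint8 or uint16 before reshaping to (5, 32, 23). The chosen dtype is also saved in training checkpoints later in PATCH 81, with older checkpoints defaulting to uint8. A round-trip sketch (the writer side is inferred from this decoder; `np.frombuffer` is the non-deprecated equivalent of the `np.fromstring` call above):

```
import base64
import zlib

import numpy as np

SHAPE = (5, 32, 23)


def encode_matrix(mat):
    return base64.b64encode(zlib.compress(mat.tobytes())).decode()


def decode_matrix(field, matrix_dtype="uint8"):
    if matrix_dtype not in ("uint8", "uint16"):
        raise ValueError("unsupported matrix_dtype: {}".format(matrix_dtype))
    raw = zlib.decompress(base64.b64decode(field))
    return np.frombuffer(raw, dtype=matrix_dtype).reshape(SHAPE)


m = np.random.randint(0, 65536, SHAPE).astype("uint16")
assert (decode_matrix(encode_matrix(m), "uint16") == m).all()
```

With uint16 matrices, the normalization ceiling used throughout the pipeline (255 for uint8) becomes 65535; PATCH 82 below centralizes that choice as `max_norm`.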
matrix, anns, label = self.data[index] @@ -419,7 +431,16 @@ def __getitem__(self, index): orig_matrix_[:, :, 0:2] = orig_matrix[:, :, 0:2] orig_matrix_[:, orig_center, 2] = np.max(orig_matrix[:, :, 0]) orig_matrix = orig_matrix_ - non_transformed_matrix = np.array(orig_matrix).astype(np.uint8) + if self.matrix_dtype == "uint8": + non_transformed_matrix = np.array(orig_matrix).astype(np.uint8) + elif self.matrix_dtype == "uint16": + non_transformed_matrix = np.array( + orig_matrix).astype(np.uint16) + else: + logger.info( + "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES)) + raise Exception + else: non_transformed_matrix = [] diff --git a/neusomatic/python/defaults.py b/neusomatic/python/defaults.py index 4cf0d21..a959cae 100644 --- a/neusomatic/python/defaults.py +++ b/neusomatic/python/defaults.py @@ -3,3 +3,4 @@ VCF_HEADER = "##fileformat=VCFv4.2" TYPE_CLASS_DICT = {"DEL": 0, "INS": 1, "NONE": 2, "SNP": 3} VARTYPE_CLASSES = ['DEL', 'INS', 'NONE', 'SNP'] +MAT_DTYPES = ["uint8", "uint16"] diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 97f2eac..6b6f8a8 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -21,7 +21,7 @@ from split_bed import split_region from utils import concatenate_vcfs, get_chromosomes_order, run_bedtools_cmd, vcf_2_bed, bedtools_sort, bedtools_window, bedtools_intersect, bedtools_slop, get_tmp_file, skip_empty -from defaults import NUM_ENS_FEATURES, VCF_HEADER +from defaults import NUM_ENS_FEATURES, VCF_HEADER, MAT_DTYPES NUC_to_NUM_tabix = {"A": 1, "C": 2, "G": 3, "T": 4, "-": 0} @@ -571,7 +571,7 @@ def prepare_info_matrices_tabix(ref_file, tumor_count_bed, normal_count_bed, rec def prep_data_single_tabix(input_record): ref_file, tumor_count_bed, normal_count_bed, record, vartype, rlen, rcenter, ch_order, \ - matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths = input_record + matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths, matrix_dtype = input_record thread_logger = logging.getLogger( "{} ({})".format(prep_data_single_tabix.__name__, multiprocessing.current_process().name)) @@ -647,8 +647,16 @@ def prep_data_single_tabix(input_record): candidate_mat[:, :, 13 + (iii * 2) + 1] = candidate_mat[:, :, 13 + ( iii * 2) + 1] / (max(np.max(tag_normal_count_matrices[iii]), 100.0)) * 255 - candidate_mat = np.maximum(0, np.minimum( - candidate_mat, 255)).astype(np.uint8) + if matrix_dtype == "uint8": + candidate_mat = np.maximum(0, np.minimum( + candidate_mat, 255)).astype(np.uint8) + elif matrix_dtype == "uint16": + candidate_mat = np.maximum(0, np.minimum( + candidate_mat, 255)).astype(np.uint16) + else: + logger.info( + "Wrong matrix_dtype {}. 
Choices are {}".format(matrix_dtype, MAT_DTYPES)) + raise Exception tag = "{}.{}.{}.{}.{}.{}.{}.{}.{}".format(ch_order, pos, ref[0:55], alt[ 0:55], vartype, center, rlen, tumor_cov, normal_cov) candidate_mat = base64.b64encode( @@ -1564,7 +1572,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea if i_s: s = ensemble_data[:, np.array(i_s)] s = np.maximum(np.minimum(s, mx), mn) - s = np.round((s - mn) / (mx - mn),6) + s = np.round((s - mn) / (mx - mn), 6) ensemble_data[:, np.array(i_s)] = s ensemble_data = ensemble_data[:, selected_features] ensemble_data = ensemble_data.tolist() @@ -1585,6 +1593,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be ensemble_custom_header, no_seq_complexity, enforce_header, zero_vscore, + matrix_dtype, tsv_batch_size): logger = logging.getLogger(generate_dataset.__name__) @@ -1710,7 +1719,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be ann = list(anns[int(record[-1])] ) if ensemble_bed else [] map_args_records.append((ref_file, tumor_count_bed, normal_count_bed, record, vartype, rlen, rcenter, ch_order, - matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths)) + matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths, matrix_dtype)) if cnt >= is_end: break if cnt >= is_end: @@ -1730,7 +1739,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be ) if ensemble_bed else [] map_args_nones.append((ref_file, tumor_count_bed, normal_count_bed, record, "NONE", 0, rcenter, ch_order, - matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths)) + matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths, matrix_dtype)) if cnt >= is_end: break if cnt >= is_end: @@ -1851,6 +1860,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be parser.add_argument('--zero_vscore', help='set VarScan2_Score to zero', action="store_true") + parser.add_argument('--matrix_dtype', type=str, + help='matrix_dtype to be used to store matrix', default="uint8", + choices=MAT_DTYPES) args = parser.parse_args() logger.info(args) @@ -1873,7 +1885,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be tsv_batch_size = args.tsv_batch_size ensemble_custom_header = args.ensemble_custom_header enforce_header = args.enforce_header - zero_vscore = zero_vscore + zero_vscore = args.zero_vscore + matrix_dtype = args.matrix_dtype try: generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv, @@ -1881,6 +1894,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be ensemble_custom_header, no_seq_complexity, enforce_header, zero_vscore, + matrix_dtype, tsv_batch_size) except Exception as e: logger.error(traceback.format_exc()) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index e6b5559..5007d58 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -22,6 +22,7 @@ from scan_alignments import scan_alignments from extend_features import extend_features from utils import concatenate_vcfs, run_bedtools_cmd, bedtools_sort, bedtools_merge, bedtools_intersect, bedtools_slop, get_tmp_file, skip_empty, vcf_2_bed +from defaults import MAT_DTYPES def process_split_region(tn, work, region, 
reference, mode, alignment_bam, @@ -83,8 +84,9 @@ def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, regi matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, ensemble_custom_header, no_seq_complexity, - no_feature_recomp_for_ensemble, + no_feature_recomp_for_ensemble, zero_vscore, + matrix_dtype, tsv_batch_size): logger = logging.getLogger(generate_dataset_region.__name__) generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, @@ -93,6 +95,7 @@ def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, regi no_seq_complexity, no_feature_recomp_for_ensemble, zero_vscore, + matrix_dtype, tsv_batch_size) @@ -212,6 +215,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, merge_d_for_scan, use_vscore, num_splits, + matrix_dtype, num_threads, scan_alignments_binary,): logger = logging.getLogger(preprocess.__name__) @@ -588,8 +592,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, ensemble_bed_i, ensemble_custom_header, - no_seq_complexity, no_feature_recomp_for_ensemble, + no_seq_complexity, no_feature_recomp_for_ensemble, zero_vscore, + matrix_dtype, tsv_batch_size) shutil.rmtree(bed_tempdir) @@ -699,6 +704,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, action="store_true") parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) + parser.add_argument('--matrix_dtype', type=str, + help='matrix_dtype to be used to store matrix', default="uint8", + choices=MAT_DTYPES) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--scan_alignments_binary', type=str, @@ -725,6 +733,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.merge_d_for_scan, args.use_vscore, args.num_splits, + args.matrix_dtype, args.num_threads, args.scan_alignments_binary) except Exception as e: diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 45fbae2..d90184f 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -24,7 +24,7 @@ from network import NeuSomaticNet from dataloader import NeuSomaticDataset, matrix_transform from merge_tsvs import merge_tsvs -from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES +from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES, MAT_DTYPES import torch._utils try: @@ -207,6 +207,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo zero_ann_cols, force_zero_ann_cols, ensemble_custom_header, + matrix_dtype, use_cuda): logger = logging.getLogger(train_neusomatic.__name__) @@ -258,6 +259,10 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo ensemble_custom_header = pretrained_dict["ensemble_custom_header"] else: ensemble_custom_header = False + if "matrix_dtype" in pretrained_dict: + matrix_dtype = pretrained_dict["matrix_dtype"] + else: + matrix_dtype = "uint8" prev_epochs = sofar_epochs else: prev_epochs = 0 @@ -292,7 +297,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo elif len(x) == 4: break else: - raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) + raise Exception( + "Wrong number of fields in {}: {}".format(tsv, 
len(x))) num_channels = expected_ens_fields + \ NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES @@ -382,7 +388,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo transform=data_transform, is_test=False, num_threads=num_threads, coverage_thr=coverage_thr, normalize_channels=normalize_channels, - zero_ann_cols=zero_ann_cols) + zero_ann_cols=zero_ann_cols, + matrix_dtype=matrix_dtype) train_sets.append(train_set) none_indices = train_set.get_none_indices() var_indices = train_set.get_var_indices() @@ -416,7 +423,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo transform=data_transform, is_test=True, num_threads=num_threads, coverage_thr=coverage_thr, normalize_channels=normalize_channels, - zero_ann_cols=zero_ann_cols) + zero_ann_cols=zero_ann_cols, + matrix_dtype=matrix_dtype) validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=batch_size, shuffle=True, num_workers=num_threads, pin_memory=True) @@ -463,6 +471,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, "ensemble_custom_header": ensemble_custom_header, + "matrix_dtype": matrix_dtype, }, '{}/models/checkpoint_{}_epoch{}_.pth'.format(out_dir, tag, curr_epoch)) if len(train_sets) == 1: @@ -531,6 +540,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, "ensemble_custom_header": ensemble_custom_header, + "matrix_dtype": matrix_dtype, }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if validation_candidates_tsv: test(net, curr_epoch, validation_loader, use_cuda) @@ -552,6 +562,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, "ensemble_custom_header": ensemble_custom_header, + "matrix_dtype": matrix_dtype, }, '{}/models/checkpoint_{}_epoch{}.pth'.format( out_dir, tag, curr_epoch)) if validation_candidates_tsv: @@ -642,6 +653,9 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo help='Allow ensemble tsv to have custom header fields. 
(Features should be\ normalized between [0,1]', action="store_true") + parser.add_argument('--matrix_dtype', type=str, + help='matrix_dtype to be used to store matrix', default="uint8", + choices=MAT_DTYPES) args = parser.parse_args() logger.info(args) @@ -663,6 +677,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo args.zero_ann_cols, args.force_zero_ann_cols, args.ensemble_custom_header, + args.matrix_dtype, use_cuda) except Exception as e: logger.error(traceback.format_exc()) From 47ac4da42aafb312cbbd06f17d77c37dcbaed674 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 4 Mar 2021 23:09:22 -0800 Subject: [PATCH 82/89] fixed uint16 --- neusomatic/python/call.py | 20 +++++++++---- neusomatic/python/dataloader.py | 30 +++++++++---------- neusomatic/python/generate_dataset.py | 42 ++++++++++++++++----------- 3 files changed, 54 insertions(+), 38 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 9130dca..be1b40a 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -26,7 +26,7 @@ from dataloader import NeuSomaticDataset, matrix_transform from utils import get_chromosomes_order, prob2phred, skip_empty from merge_tsvs import merge_tsvs -from defaults import VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES +from defaults import VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES, MAT_DTYPES import torch._utils try: @@ -52,7 +52,7 @@ def get_type(ref, alt): return "SNP" -def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): +def call_variants(net, call_loader, out_dir, model_tag, run_i, matrix_dtype, use_cuda): logger = logging.getLogger(call_variants.__name__) net.eval() nclasses = len(VARTYPE_CLASSES) @@ -63,6 +63,14 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): none_preds = {} loader_ = call_loader + if matrix_dtype == "uint8": + max_norm = 255.0 + elif matrix_dtype == "uint16": + max_norm = 65535.0 + else: + logger.info( + "Wrong matrix_dtype {}. 
Choices are {}".format(matrix_dtype, MAT_DTYPES))
+
     iii = 0
     j = 0
     for data in loader_:
@@ -101,7 +109,7 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda):
                     outputs1.data.cpu()[i].numpy())),
                     list(map(lambda x: round(x, 4),
                              outputs3.data.cpu()[i].numpy())),
-                    np.array(non_transformed_matrices[i, :, :, 0:3])]
+                    np.array(non_transformed_matrices[i, :, :, 0:3]) / max_norm]
             else:
                 none_preds[path] = [VARTYPE_CLASSES[predicted[i]], pos_pred[i], len_pred[i],
                                     list(map(lambda x: round(x, 4), F.softmax(
@@ -125,7 +133,7 @@ def pred_vcf_records_path(record):
     try:
         fasta_file = pysam.FastaFile(ref_file)
         ACGT = "ACGT"
-        I = pred_all[-1] / 255.0
+        I = pred_all[-1]
         vcf_record = []
         Ih, Iw, _ = I.shape
         zref_pos = np.where((np.argmax(I[:, :, 0], 0) == 0) & (
@@ -488,7 +496,7 @@ def single_thread_call(record):
         return [], []
     final_preds_, none_preds_ = call_variants(
-        net, call_loader, out_dir, model_tag, i, use_cuda)
+        net, call_loader, out_dir, model_tag, i, matrix_dtype, use_cuda)
     all_vcf_records = pred_vcf_records(
         ref_file, final_preds_, chroms, 1)
     all_vcf_records_none = pred_vcf_records_none(none_preds_, chroms)
@@ -732,7 +740,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,
                 continue
             final_preds_, none_preds_ = call_variants(
-                net, call_loader, out_dir, model_tag, run_i, use_cuda)
+                net, call_loader, out_dir, model_tag, run_i, matrix_dtype, use_cuda)
             all_vcf_records.extend(pred_vcf_records(
                 ref_file, final_preds_, chroms, num_threads))
             all_vcf_records_none.extend(
diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py
index 3ae64ee..0fe5470 100755
--- a/neusomatic/python/dataloader.py
+++ b/neusomatic/python/dataloader.py
@@ -410,20 +410,28 @@ def __getitem__(self, index):
             tumor_cov *= r_cov
             normal_cov *= r_cov
+        if self.matrix_dtype == "uint8":
+            max_norm = 255.0
+        elif self.matrix_dtype == "uint16":
+            max_norm = 65535.0
+        else:
+            logger.info(
+                "Wrong matrix_dtype {}. Choices are {}".format(self.matrix_dtype, MAT_DTYPES))
+            raise Exception
+
         # add COV channel
         matrix_ = np.zeros((matrix.shape[0], matrix.shape[1], 26 + len(anns)))
         matrix_[:, :, 0:23] = matrix
         if self.normalize_channels:
-            matrix_[:, :, 3:23:2] *= (matrix_[:, :, 1:2] / 255.0)
-            matrix_[:, :, 4:23:2] *= (matrix_[:, :, 2:3] / 255.0)
+            matrix_[:, :, 3:23:2] *= (matrix_[:, :, 1:2] / max_norm)
+            matrix_[:, :, 4:23:2] *= (matrix_[:, :, 2:3] / max_norm)
         matrix = matrix_
         matrix[:, center, 23] = np.max(matrix[:, :, 0])
         matrix[:, :, 24] = (min(tumor_cov, self.coverage_thr) /
-                            float(self.coverage_thr)) * 255.0
+                            float(self.coverage_thr)) * max_norm
         matrix[:, :, 25] = (
-            min(normal_cov, self.coverage_thr) / float(self.coverage_thr)) * 255.0
+            min(normal_cov, self.coverage_thr) / float(self.coverage_thr)) * max_norm
         for i, a in enumerate(anns):
-            matrix[:, :, 26 + i] = a * 255.0
+            matrix[:, :, 26 + i] = a * max_norm
 
         if self.is_test:
             orig_matrix_ = np.zeros(
@@ -431,21 +439,13 @@ def __getitem__(self, index):
             orig_matrix_[:, :, 0:2] = orig_matrix[:, :, 0:2]
             orig_matrix_[:, orig_center, 2] = np.max(orig_matrix[:, :, 0])
             orig_matrix = orig_matrix_
-            if self.matrix_dtype == "uint8":
-                non_transformed_matrix = np.array(orig_matrix).astype(np.uint8)
-            elif self.matrix_dtype == "uint16":
-                non_transformed_matrix = np.array(
-                    orig_matrix).astype(np.uint16)
-            else:
-                logger.info(
-                    "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES))
-                raise Exception
+            non_transformed_matrix = np.array(orig_matrix)
         else:
             non_transformed_matrix = []
 
         matrix = torch.from_numpy(matrix.transpose((2, 0, 1)))
-        matrix = matrix.float().div(255)
+        matrix = matrix.float().div(max_norm)
 
         if self.transform is not None:
             matrix = self.transform(matrix)
diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 6b6f8a8..cfdc280 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -615,44 +615,52 @@ def prep_data_single_tabix(input_record):
     tumor_cov = int(round(max(np.sum(tumor_count_matrix, 0))))
     normal_cov = int(round(max(np.sum(normal_count_matrix, 0))))
+    if matrix_dtype == "uint8":
+        max_norm = 255.0
+    elif matrix_dtype == "uint16":
+        max_norm = 65535.0
+    else:
+        logger.info(
+            "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES))
+        raise Exception
+
     candidate_mat[:, :, 0] = candidate_mat[
-        :, :, 0] / (max(np.max(ref_count_matrix), np.max(tumor_count_matrix)) + 0.00001) * 255
+        :, :, 0] / (max(np.max(ref_count_matrix), np.max(tumor_count_matrix)) + 0.00001) * max_norm
     candidate_mat[:, :, 1] = candidate_mat[:, :, 1] / \
-        (np.max(tumor_count_matrix) + 0.00001) * 255
+        (np.max(tumor_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 2] = candidate_mat[:, :, 2] / \
-        (np.max(normal_count_matrix) + 0.00001) * 255
+        (np.max(normal_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 3] = candidate_mat[:, :, 3] / \
-        (max(np.max(bq_tumor_count_matrix), 41.0)) * 255
+        (max(np.max(bq_tumor_count_matrix), 41.0)) * max_norm
     candidate_mat[:, :, 4] = candidate_mat[:, :, 4] / \
-        (max(np.max(bq_normal_count_matrix), 41.0)) * 255
+        (max(np.max(bq_normal_count_matrix), 41.0)) * max_norm
     candidate_mat[:, :, 5] = candidate_mat[:, :, 5] / \
-        (max(np.max(mq_tumor_count_matrix), 70.0)) * 255
+        (max(np.max(mq_tumor_count_matrix), 70.0)) * max_norm
     candidate_mat[:, :, 6] = candidate_mat[:, :, 6] / \
-        (max(np.max(mq_normal_count_matrix), 70.0)) * 255
+        (max(np.max(mq_normal_count_matrix), 70.0)) * max_norm
     candidate_mat[:, :, 7] = candidate_mat[:, :, 7] / \
-        (np.max(tumor_count_matrix) + 0.00001) * 255
+        (np.max(tumor_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 8] = candidate_mat[:, :, 8] / \
-        (np.max(normal_count_matrix) + 0.00001) * 255
+        (np.max(normal_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 9] = candidate_mat[:, :, 9] / \
-        (np.max(tumor_count_matrix) + 0.00001) * 255
+        (np.max(tumor_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 10] = candidate_mat[:, :, 10] / \
-        (np.max(normal_count_matrix) + 0.00001) * 255
+        (np.max(normal_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 11] = candidate_mat[:, :, 11] / \
-        (np.max(tumor_count_matrix) + 0.00001) * 255
+        (np.max(tumor_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 12] = candidate_mat[:, :, 12] / \
-        (np.max(normal_count_matrix) + 0.00001) * 255
+        (np.max(normal_count_matrix) + 0.00001) * max_norm
     for iii in range(len(tag_tumor_count_matrices)):
         candidate_mat[:, :, 13 + (iii * 2)] = candidate_mat[:, :, 13 + (iii * 2)] / (
-            max(np.max(tag_tumor_count_matrices[iii]), 100.0)) * 255
+            max(np.max(tag_tumor_count_matrices[iii]), 100.0)) * max_norm
         candidate_mat[:, :, 13 + (iii * 2) + 1] = candidate_mat[:, :, 13 + (
-            iii * 2) + 1] / (max(np.max(tag_normal_count_matrices[iii]), 100.0)) * 255
+            iii * 2) + 1] / (max(np.max(tag_normal_count_matrices[iii]), 100.0)) * max_norm
 
     if matrix_dtype == "uint8":
         candidate_mat = np.maximum(0, np.minimum(
-            candidate_mat, 255)).astype(np.uint8)
+            candidate_mat, max_norm)).astype(np.uint8)
     elif matrix_dtype == "uint16":
         candidate_mat = np.maximum(0, np.minimum(
-            candidate_mat, 255)).astype(np.uint16)
+            candidate_mat, max_norm)).astype(np.uint16)
     else:
         logger.info(
             "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES))
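Note on the change above: PATCH 82 threads matrix_dtype through call.py, dataloader.py and generate_dataset.py so the normalization scale follows the storage dtype instead of a hard-coded 255. A minimal sketch of the intended encode/decode round trip (the helper name and toy array below are illustrative, not part of the patch):

    import numpy as np

    MAT_DTYPES = ["uint8", "uint16"]

    def full_scale(matrix_dtype):
        # the max_norm value used to encode channels at dataset-generation time
        # and to decode them at call time (matrix.float().div(max_norm))
        if matrix_dtype == "uint8":
            return 255.0
        if matrix_dtype == "uint16":
            return 65535.0
        raise Exception(
            "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES))

    fractions = np.array([0.0, 0.25, 1.0])            # normalized channel values
    encoded = (fractions * full_scale("uint16")).astype(np.uint16)
    decoded = encoded / full_scale("uint16")          # ~[0.0, 0.25, 1.0] up to rounding

Storing matrices as uint16 keeps 256x finer quantization than uint8, at the cost of doubling their size on disk.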
From d948d084a634b5b6a917e9898ab00c7be99bd6ab Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Sun, 7 Mar 2021 23:50:22 -0800
Subject: [PATCH 83/89] added report_all_alleles

---
 neusomatic/cpp/scan_alignments.cpp   | 157 ++++++++++++++++-----
 neusomatic/include/Options.h         |  14 +++
 neusomatic/python/preprocess.py      |  11 +-
 neusomatic/python/scan_alignments.py |  19 +++-
 4 files changed, 129 insertions(+), 72 deletions(-)

diff --git a/neusomatic/cpp/scan_alignments.cpp b/neusomatic/cpp/scan_alignments.cpp
index d019597..3f8c552 100644
--- a/neusomatic/cpp/scan_alignments.cpp
+++ b/neusomatic/cpp/scan_alignments.cpp
@@ -58,6 +58,7 @@ int main(int argc, char **argv) {
   float del_min_af=min_af;
   float snp_min_af=min_af;
   const bool calculate_qual_stat = opts.calculate_qual_stat();
+  const bool report_all_alleles = opts.report_all_alleles();
 
   //const std::map<char, int> empty_pileup_counts = {{'-', 0}, {'A', 0}, {'C', 0}, {'G', 0}, {'T', 0}};
   static const std::vector<char> nuc_code_char = {'A', 'C', 'G', 'T', '-', 'N'};
@@ -206,83 +207,109 @@ int main(int argc, char **argv) {
                    <<":"<
-          int major = -1;
-          int major_count = 0;
-          int minor = -1;
-          int minor_count = 0;
-          int minor2 = -1;
-          int minor2_count = 0;
-
-          for (int row = 0; row < cols[i].base_freq_.size(); ++row) {
-            if (cols[i].base_freq_[row] > major_count) {
-              minor2 = minor;
-              minor2_count = minor_count;
-              minor_count = major_count;
-              minor = major;
-              major_count = cols[i].base_freq_[row];
-              major = row;
-            } else if (cols[i].base_freq_[row] > minor_count) {
-              minor2 = minor;
-              minor2_count = minor_count;
-              minor_count = cols[i].base_freq_[row];
-              minor = row;
-            } else if (cols[i].base_freq_[row] > minor2_count) {
-              minor2_count = cols[i].base_freq_[row];
-              minor2 = row;
+          std::map<int, int> alt_counts;
+          auto ref_count = cols[i].base_freq_[ref_code];
+          auto var_code = ref_code;
+          int var_count = 0;
+          int dp = ref_count;
+          if (report_all_alleles){
+            for (int row = 0; row < cols[i].base_freq_.size(); ++row) {
+              auto alt_cnt = cols[i].base_freq_[row];
+              if (( row != ref_code) and (alt_cnt > 0)){
+                auto af = alt_cnt/float(alt_cnt+ref_count);
+                if ((alt_cnt >= ref_count) or ((row == 4 and af > del_min_af ) or
+                    (row != 4 and ref_base != '-' and af > snp_min_af ) or
+                    (ref_base =='-' and af > ins_min_af))){
+                  alt_counts.insert(std::pair<int, int>(row, alt_cnt));
+                  dp += alt_cnt;
+                }
+              }
+            }
+          }else{
+            int major = -1;
+            int major_count = 0;
+            int minor = -1;
+            int minor_count = 0;
+            int minor2 = -1;
+            int minor2_count = 0;
+
+            for (int row = 0; row < cols[i].base_freq_.size(); ++row) {
+              if (cols[i].base_freq_[row] > major_count) {
+                minor2 = minor;
+                minor2_count = minor_count;
+                minor_count = major_count;
+                minor = major;
+                major_count = cols[i].base_freq_[row];
+                major = row;
+              } else if (cols[i].base_freq_[row] > minor_count) {
+                minor2 = minor;
+                minor2_count = minor_count;
+                minor_count = cols[i].base_freq_[row];
+                minor = row;
+              } else if (cols[i].base_freq_[row] > minor2_count) {
+                minor2_count = cols[i].base_freq_[row];
+                minor2 = row;
+              }
+            }
-          if (minor != -1 and major != -1){
-            if (minor2 != -1 and ref_code == major and minor == 4 and ref_code != 4 ){
-              if (minor2_count>0.5*minor_count){
-                minor = minor2;
-                minor_count = minor2_count;
+            if (minor != -1 and major != -1){
+              if (minor2 != -1 and ref_code == major and minor == 4 and ref_code != 4 ){
+                if (minor2_count>0.5*minor_count){
+                  minor = minor2;
+                  minor_count = minor2_count;
+                }
               }
             }
+            auto af = minor_count/float(major_count+minor_count);
+            if (major != ref_code){
+              var_code = major;
+              var_count = major_count;
+            } else if (minor != ref_code and ( (minor == 4 and af > del_min_af ) or
+                (minor != 4 and ref_base != '-' and af > snp_min_af ) or
+                (ref_base =='-' and af > ins_min_af))){
+              var_code = minor;
+              var_count = minor_count;
+            }
+            if (var_count > 0) {
+              alt_counts.insert(std::pair<int, int>(var_code,var_count));
+              dp += var_count;
+            }
 }
-          auto ref_count = cols[i].base_freq_[ref_code];
-          auto var_code = ref_code;
-          int var_count = 0;
-          auto af = minor_count/float(major_count+minor_count);
-          if (major != ref_code){
-            var_code = major;
-            var_count = major_count;
-          } else if (minor != ref_code and ( (minor == 4 and af > del_min_af ) or
-              (minor != 4 and ref_base != '-' and af > snp_min_af ) or
-              (ref_base =='-' and af > ins_min_af))){
-            var_code = minor;
-            var_count = minor_count;
-          }
-
-          if (var_count > 0) {
-
-            auto record_info = "AF="+std::to_string((var_count)/float(var_count+ref_count))+";DP="+std::to_string(nrow)+";RO="+std::to_string(ref_count)+";AO="+std::to_string(var_count);
-            auto gtinfo = "0/1:"+std::to_string(nrow)+":"+std::to_string(ref_count)+":"+std::to_string(var_count);
+          // for(auto it = alt_counts.cbegin(); it != alt_counts.cend(); ++it)
+          // {
+          //   std::cout << it->first << " " << it->second << std::endl;
+          // }
+          for(auto it = alt_counts.cbegin(); it != alt_counts.cend(); ++it)
+          {
+            auto var_code_ = it->first;
+            auto var_count_ = it->second;
+            auto record_info = "AF="+std::to_string((var_count_)/float(dp))+";DP="+std::to_string(nrow)+";RO="+std::to_string(ref_count)+";AO="+std::to_string(var_count_);
+            auto gtinfo = "0/1:"+std::to_string(nrow)+":"+std::to_string(ref_count)+":"+std::to_string(var_count_);
             if (calculate_qual_stat){
               record_info += ";ST="+std::to_string(int(round(ref_count*(cols_strand[i].strand_mean[ref_code]/100))))+ \
-                ","+std::to_string(int(round(var_count*(cols_strand[i].strand_mean[var_code]/100))))+ \
+                ","+std::to_string(int(round(var_count_*(cols_strand[i].strand_mean[var_code_]/100))))+ \
                 ";LS="+std::to_string(lsc_counts)+\
                 ";RS="+std::to_string(rsc_counts)+\
-                ";NM="+std::to_string(int(round(cols_tag[i].tag_mean[var_code][0])))+\
-                ";AS="+std::to_string(int(round(cols_tag[i].tag_mean[var_code][1])))+ \
-                ";XS="+std::to_string(int(round(cols_tag[i].tag_mean[var_code][2])))+ \
-                ";PR="+std::to_string(int(round(cols_tag[i].tag_mean[var_code][3])))+ \
-                ";CL="+std::to_string(int(round(cols_tag[i].tag_mean[var_code][4])))+ \
-                ";MQ="+std::to_string(int(round(cols_mqual[i].mqual_mean[var_code])))+ \
-                ";BQ="+std::to_string(int(round(cols[i].bqual_mean[var_code])));
+                ";NM="+std::to_string(int(round(cols_tag[i].tag_mean[var_code_][0])))+\
+                ";AS="+std::to_string(int(round(cols_tag[i].tag_mean[var_code_][1])))+ \
+                ";XS="+std::to_string(int(round(cols_tag[i].tag_mean[var_code_][2])))+ \
+                ";PR="+std::to_string(int(round(cols_tag[i].tag_mean[var_code_][3])))+ \
+                ";CL="+std::to_string(int(round(cols_tag[i].tag_mean[var_code_][4])))+ \
+                ";MQ="+std::to_string(int(round(cols_mqual[i].mqual_mean[var_code_])))+ \
+                ";BQ="+std::to_string(int(round(cols[i].bqual_mean[var_code_])));
               gtinfo += ":"+std::to_string(int(round(ref_count*(cols_strand[i].strand_mean[ref_code]/100))))+","+ \
-                std::to_string(int(round(var_count*(cols_strand[i].strand_mean[var_code]/100))))+":"+\
+                std::to_string(int(round(var_count_*(cols_strand[i].strand_mean[var_code_]/100))))+":"+\
                 std::to_string(lsc_counts)+":"+\
                 std::to_string(rsc_counts)+":"+\
-                std::to_string(int(round(cols_tag[i].tag_mean[var_code][0])))+":"+\
-                std::to_string(int(round(cols_tag[i].tag_mean[var_code][1])))+":"+\
-                std::to_string(int(round(cols_tag[i].tag_mean[var_code][2])))+":"+\
-                std::to_string(int(round(cols_tag[i].tag_mean[var_code][3])))+":"+\
-                std::to_string(int(round(cols_tag[i].tag_mean[var_code][4])))+":"+\
-                std::to_string(int(round(cols_mqual[i].mqual_mean[var_code])))+":"+\
-                std::to_string(int(round(cols[i].bqual_mean[var_code])));
+                std::to_string(int(round(cols_tag[i].tag_mean[var_code_][0])))+":"+\
+                std::to_string(int(round(cols_tag[i].tag_mean[var_code_][1])))+":"+\
+                std::to_string(int(round(cols_tag[i].tag_mean[var_code_][2])))+":"+\
+                std::to_string(int(round(cols_tag[i].tag_mean[var_code_][3])))+":"+\
+                std::to_string(int(round(cols_tag[i].tag_mean[var_code_][4])))+":"+\
+                std::to_string(int(round(cols_mqual[i].mqual_mean[var_code_])))+":"+\
+                std::to_string(int(round(cols[i].bqual_mean[var_code_])));
             }
-          auto var_base = nuc_code_char[var_code];
+            auto var_base = nuc_code_char[var_code_];
             if (ref_base == '-') {ref_base = 'N';}
             if (var_base == '-') {var_base = 'N';}
             auto var_ref_pos=ginv.left() + cc.UngapPos(i);
@@ -304,7 +331,7 @@ int main(int argc, char **argv) {
             appendValue(record.genotypeInfos, gtinfo);
             vcf_writer.Write(record);
             if (opts.verbosity()>0){
-              std::cout<<"var: " << i << "," << var_ref_pos << ","<< ref_base << "," << var_base<<","<
 > 0:
             cmd = "{} --ref {} -b {} -L {} --out_vcf_file {}/candidates.vcf --out_count_file {}/count.bed \
-                --window_size {} --min_af {} --min_mapq {} --max_depth {} {}".format(
+                --window_size {} --min_af {} --min_mapq {} --max_depth {} {} {}".format(
                 scan_alignments_binary, reference, input_bam, split_region_file_,
-                work, work, window_size, maf, min_mapq, max_dp * window_size / 100.0, filter_duplicate_str)
+                work, work, window_size, maf, min_mapq, max_dp * window_size / 100.0, report_all_alleles_str, filter_duplicate_str)
             if calc_qual:
                 cmd += " --calculate_qual_stat"
             run_shell_command(cmd, stdout=os.path.join(work, "scan.out"),
@@ -80,7 +84,7 @@ def run_scan_alignments(record):
 
 def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam,
                     regions_bed_file, reference, num_splits,
-                    num_threads, window_size, maf, min_mapq, max_dp, filter_duplicate, restart=True,
+                    num_threads, window_size, maf, min_mapq, max_dp, report_all_alleles, filter_duplicate, restart=True,
                     split_region_files=[], calc_qual=True):
 
     logger = logging.getLogger(scan_alignments.__name__)
@@ -152,7 +156,7 @@ def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam,
                     shutil.rmtree(work_)
                 map_args.append((os.path.join(work, "work.{}".format(i)),
                                  reference, merge_d_for_scan, scan_alignments_binary, split_region_file,
-                                 input_bam, window_size, maf, min_mapq, max_dp, filter_duplicate, calc_qual))
+                                 input_bam, window_size, maf, min_mapq, max_dp, report_all_alleles, filter_duplicate, calc_qual))
                 not_done.append(i)
             else:
                 all_outputs[i] = [os.path.join(work, "work.{}".format(i), "candidates.vcf"),
@@ -209,6 +213,9 @@ def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam,
     parser.add_argument('--merge_d_for_scan', type=int,
                         help='-d used to merge regions before scan',
                         default=None)
+    parser.add_argument('--report_all_alleles',
+                        help='report all alleles per position',
+                        action="store_true")
     parser.add_argument('--num_splits', type=int,
                         help='number of region splits',
                         default=None)
     parser.add_argument('--num_threads', type=int,
@@ -220,7 +227,7 @@ def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam,
         outputs = scan_alignments(args.work, args.merge_d_for_scan, args.scan_alignments_binary, args.input_bam,
                                   args.regions_bed_file, args.reference, args.num_splits,
                                   args.num_threads, args.window_size, args.maf,
-                                  args.min_mapq, args.max_dp, args.filter_duplicate)
+                                  args.min_mapq, args.max_dp, args.report_all_alleles, args.filter_duplicate)
     except Exception as e:
         logger.error(traceback.format_exc())
         logger.error("Aborting!")
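With --report_all_alleles, the scan no longer reduces each pileup column to the single best non-reference allele: every allele whose count or allele frequency clears the type-specific threshold becomes its own candidate record, with AF recomputed against the per-column depth dp. A rough Python rendering of that selection loop (base_freq, ref_code and the single min_af below stand in for the C++ pileup counters and the per-type AF cutoffs):

    def collect_alt_alleles(base_freq, ref_code, min_af):
        # base_freq: pileup counts indexed by nucleotide code (A, C, G, T, -, N)
        ref_count = base_freq[ref_code]
        alt_counts = {}
        dp = ref_count
        for row, alt_cnt in enumerate(base_freq):
            if row == ref_code or alt_cnt == 0:
                continue
            af = alt_cnt / float(alt_cnt + ref_count)
            if alt_cnt >= ref_count or af > min_af:
                alt_counts[row] = alt_cnt          # keep every passing allele
                dp += alt_cnt
        return alt_counts, dp

    # e.g. reference A with two competing substitutions:
    collect_alt_alleles([12, 0, 5, 7, 0, 0], ref_code=0, min_af=0.05)
    # -> ({2: 5, 3: 7}, 24): both G and T are emitted, not just the majority allele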
From 798c880690d8133e4be54e9f5cdbd65734ab292d Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Mon, 8 Mar 2021 00:05:41 -0800
Subject: [PATCH 84/89] added strict_labeling

---
 neusomatic/python/generate_dataset.py | 20 +++++++++++++-------
 neusomatic/python/preprocess.py       |  8 ++++++++
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index cfdc280..ffce9da 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -794,7 +794,7 @@ def merge_records(fasta_file, records):
     return [str(chrom), pos_m + 1, ref2_, alt2_]
 
-def is_part_of(record1, record2):
+def is_part_of(record1, record2, strict_labeling):
     logger = logging.getLogger(is_part_of.__name__)
     chrom1, pos1, ref1, alt1 = record1[0:4]
     chrom2, pos2, ref2, alt2 = record2[0:4]
@@ -802,10 +802,10 @@ def is_part_of(record1, record2):
         return False
     vartype1 = get_type(ref1, alt1)
     vartype2 = get_type(ref2, alt2)
-    if vartype1 == "SNP" and vartype2 == "DEL":
+    if (not strict_labeling) and (vartype1 == "SNP" and vartype2 == "DEL"):
         if pos2 < pos1 < pos2 + len(ref2):
             return True
-    elif vartype2 == "SNP" and vartype1 == "DEL":
+    elif (not strict_labeling) and (vartype2 == "SNP" and vartype1 == "DEL"):
         if pos1 < pos2 < pos1 + len(ref1):
             return True
     elif vartype1 == vartype2:
@@ -877,7 +877,7 @@ def keep_in_region(input_file, region_bed,
 
 def find_records(input_record):
-    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, num_ens_features, work_index = input_record
+    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, num_ens_features, strict_labeling, work_index = input_record
     thread_logger = logging.getLogger(
         "{} ({})".format(find_records.__name__, multiprocessing.current_process().name))
     try:
@@ -1238,7 +1238,7 @@ def find_records(input_record):
             truth_record = truth_records[i]
             tr, eqs = push_lr(fasta_file, truth_record, 2)
             for eq in eqs:
-                if is_part_of(eq, record):
+                if is_part_of(eq, record, strict_labeling):
                     ref_t, alt_t = truth_record[2:4]
                     vartype_t = get_type(ref_t, alt_t)
                     record_center[j] = find_i_center(ref, alt)
@@ -1256,7 +1256,7 @@ def find_records(input_record):
                 ref_p, alt_p = records[p][2:4]
                 tr, eqs = push_lr(fasta_file, records[p], 2)
                 for eq in eqs:
-                    if is_part_of(eq, record):
+                    if is_part_of(eq, record, strict_labeling):
                         vartype = vtype[p]
                         record_center[j] = find_i_center(ref, alt)
                         record_len[j] = find_len(ref_p, alt_p)
@@ -1602,6 +1602,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
                      no_seq_complexity, enforce_header,
                      zero_vscore,
                      matrix_dtype,
+                     strict_labeling,
                      tsv_batch_size):
 
     logger = logging.getLogger(generate_dataset.__name__)
@@ -1671,7 +1672,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     map_args = []
     for i, split_region_file in enumerate(split_region_files):
         map_args.append((work, split_region_file, truth_vcf_file,
-                         tumor_pred_vcf_file, ref_file, ensemble_bed, num_ens_features, i))
+                         tumor_pred_vcf_file, ref_file, ensemble_bed, num_ens_features, strict_labeling, i))
     try:
         records_data = pool.map_async(find_records, map_args).get()
         pool.close()
@@ -1871,6 +1872,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     parser.add_argument('--matrix_dtype', type=str,
                         help='matrix_dtype to be used to store matrix',
                         default="uint8", choices=MAT_DTYPES)
+    parser.add_argument('--strict_labeling',
+                        help='strict labeling in train mode',
+                        action="store_true")
     args = parser.parse_args()
 
     logger.info(args)
@@ -1895,6 +1899,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     enforce_header = args.enforce_header
     zero_vscore = args.zero_vscore
     matrix_dtype = args.matrix_dtype
+    strict_labeling = args.strict_labeling
 
     try:
         generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file,
                          matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv,
@@ -1903,6 +1908,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
                          no_seq_complexity, enforce_header,
                          zero_vscore,
                          matrix_dtype,
+                         strict_labeling,
                          tsv_batch_size)
     except Exception as e:
         logger.error(traceback.format_exc())
diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py
index c54cb98..6c5860f 100755
--- a/neusomatic/python/preprocess.py
+++ b/neusomatic/python/preprocess.py
@@ -88,6 +88,7 @@ def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, regi
                             no_feature_recomp_for_ensemble,
                             zero_vscore,
                             matrix_dtype,
+                            strict_labeling,
                             tsv_batch_size):
     logger = logging.getLogger(generate_dataset_region.__name__)
     generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference,
@@ -97,6 +98,7 @@ def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, regi
                      no_feature_recomp_for_ensemble,
                      zero_vscore,
                      matrix_dtype,
+                     strict_labeling,
                      tsv_batch_size)
 
@@ -218,6 +220,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                num_splits,
                matrix_dtype,
                report_all_alleles,
+               strict_labeling,
               num_threads,
                scan_alignments_binary,):
     logger = logging.getLogger(preprocess.__name__)
@@ -600,6 +603,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                                     no_seq_complexity, no_feature_recomp_for_ensemble,
                                     zero_vscore,
                                     matrix_dtype,
+                                    strict_labeling,
                                     tsv_batch_size)
 
     shutil.rmtree(bed_tempdir)
@@ -715,6 +719,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
     parser.add_argument('--report_all_alleles',
                         help='report all alleles per position',
                         action="store_true")
+    parser.add_argument('--strict_labeling',
+                        help='strict labeling in train mode',
+                        action="store_true")
     parser.add_argument('--num_threads', type=int,
                         help='number of threads',
                         default=1)
     parser.add_argument('--scan_alignments_binary', type=str,
@@ -743,6 +750,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                    args.num_splits,
                    args.matrix_dtype,
                    args.report_all_alleles,
+                   args.strict_labeling,
                    args.num_threads,
                    args.scan_alignments_binary)
     except Exception as e:
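--strict_labeling disables the relaxed clause of is_part_of under which a SNP lying inside a deletion's reference span (or vice versa) is treated as part of the same event when labels are assigned. A toy version of just that relaxed clause, with get_type reduced to length comparisons (the function name and records are illustrative):

    def snp_inside_del(record1, record2):
        # relaxed matching: a SNP strictly inside a deletion's reference span
        chrom1, pos1, ref1, alt1 = record1[0:4]
        chrom2, pos2, ref2, alt2 = record2[0:4]
        if chrom1 != chrom2:
            return False
        is_snp = len(ref1) == 1 and len(alt1) == 1
        is_del = len(ref2) > len(alt2)
        return is_snp and is_del and (pos2 < pos1 < pos2 + len(ref2))

    snp_inside_del(["1", 105, "A", "T"], ["1", 100, "ACGTACGTAC", "A"])   # True
    # under --strict_labeling this pair would no longer be matched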
From cdfc0629ed113e1a76d80e8c28b6d4594289d129 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Mon, 8 Mar 2021 00:25:56 -0800
Subject: [PATCH 85/89] fixed strict_labeling

---
 neusomatic/python/generate_dataset.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index ffce9da..609c597 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -794,7 +794,7 @@ def merge_records(fasta_file, records):
     return [str(chrom), pos_m + 1, ref2_, alt2_]
 
-def is_part_of(record1, record2, strict_labeling):
+def is_part_of(record1, record2):
     logger = logging.getLogger(is_part_of.__name__)
     chrom1, pos1, ref1, alt1 = record1[0:4]
     chrom2, pos2, ref2, alt2 = record2[0:4]
@@ -802,10 +802,10 @@ def is_part_of(record1, record2, strict_labeling):
         return False
     vartype1 = get_type(ref1, alt1)
     vartype2 = get_type(ref2, alt2)
-    if (not strict_labeling) and (vartype1 == "SNP" and vartype2 == "DEL"):
+    if (vartype1 == "SNP" and vartype2 == "DEL"):
         if pos2 < pos1 < pos2 + len(ref2):
             return True
-    elif (not strict_labeling) and (vartype2 == "SNP" and vartype1 == "DEL"):
+    elif (vartype2 == "SNP" and vartype1 == "DEL"):
         if pos1 < pos2 < pos1 + len(ref1):
             return True
     elif vartype1 == vartype2:
@@ -1236,9 +1236,12 @@ def find_records(input_record):
             done = False
             for i in i_s:
                 truth_record = truth_records[i]
-                tr, eqs = push_lr(fasta_file, truth_record, 2)
+                if not strict_labeling:
+                    tr, eqs = push_lr(fasta_file, truth_record, 2)
+                else:
+                    tr, eqs = push_lr(fasta_file, truth_record, 0)
                 for eq in eqs:
-                    if is_part_of(eq, record, strict_labeling):
+                    if is_part_of(eq, record):
                         ref_t, alt_t = truth_record[2:4]
                         vartype_t = get_type(ref_t, alt_t)
                         record_center[j] = find_i_center(ref, alt)
@@ -1254,9 +1257,12 @@ def find_records(input_record):
                     perfect_idx)
                 for p in p_s:
                     ref_p, alt_p = records[p][2:4]
-                    tr, eqs = push_lr(fasta_file, records[p], 2)
+                    if not strict_labeling:
+                        tr, eqs = push_lr(fasta_file, truth_record, 2)
+                    else:
+                        tr, eqs = push_lr(fasta_file, truth_record, 0)
                     for eq in eqs:
-                        if is_part_of(eq, record, strict_labeling):
+                        if is_part_of(eq, record):
                             vartype = vtype[p]
                             record_center[j] = find_i_center(ref, alt)
                             record_len[j] = find_len(ref_p, alt_p)
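The fix above also restricts push_lr's search over equivalent indel spellings: the third argument controls shifting, and strict labeling passes 0 so only the record's own spelling can match, while relaxed labeling (2) also tries shifted equivalents. A toy illustration of why an indel in a repeat has several equivalent spellings (seq, coordinates and the helper are illustrative, 0-based):

    def equivalent_shifts(seq, pos, ref, alt, both):
        # enumerate left-shifted spellings of a one-base deletion in `seq`
        eqs = [(pos, ref, alt)]
        if not both:
            return eqs                      # strict: the given spelling only
        while pos > 0 and seq[pos - 1] == ref[-1]:
            pos -= 1
            ref = seq[pos] + ref[:-1]
            alt = seq[pos]
            eqs.append((pos, ref, alt))
        return eqs

    # deleting one A of the run in "ACAAAG" can be spelled at pos 3 or pos 2:
    equivalent_shifts("ACAAAG", 3, "AA", "A", both=True)   # [(3, 'AA', 'A'), (2, 'AA', 'A')]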
From 74a27df8fc15aab6894d9e8063eb1f0bb062cfb5 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Mon, 8 Mar 2021 01:26:39 -0800
Subject: [PATCH 86/89] fixed strict_labeling

---
 neusomatic/python/generate_dataset.py | 35 +++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 609c597..edaead7 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -744,6 +744,35 @@ def push_lr(fasta_file, record, left_right_both):
     return record, eqs
 
+def push_left(fasta_file, record):
+    logger = logging.getLogger(push_left.__name__)
+    record[0] = str(record[0])
+    if "," not in record[3]:
+        if record[2] != record[3]:
+            chrom, pos, ref, alt = record[0:4]
+            new_pos = pos
+            new_ref = ref
+            new_alt = alt
+            while(new_pos > 1):
+                l_base = fasta_file.fetch(
+                    (chrom), new_pos - 2, new_pos - 1).upper()
+                new_ref = l_base + new_ref
+                new_alt = l_base + new_alt
+                new_pos -= 1
+                while(len(new_alt) > 1 and len(new_ref) > 1):
+                    if new_alt[-1] == new_ref[-1]:
+                        new_alt = new_alt[:-1]
+                        new_ref = new_ref[:-1]
+                    else:
+                        break
+                if len(new_alt) > len(alt):
+                    new_ref = new_ref[1:]
+                    new_alt = new_alt[1:]
+                    new_pos += 1
+                    break
+            record = [chrom, new_pos, new_ref, new_alt] + record[4:]
+    return record
+
 def merge_records(fasta_file, records):
     logger = logging.getLogger(merge_records.__name__)
     if len(set(map(lambda x: x[0], records))) != 1:
@@ -1117,8 +1146,10 @@ def find_records(input_record):
                     record[3] = l_base + record[3]
                     record[4] = l_base + record[4]
                     pos -= 1
-                truth_records.append(
-                    [record[0], pos, record[3], record[4], str(i)])
+                tr = [record[0], pos, record[3], record[4], str(i)]
+                if strict_labeling:
+                    tr = push_left(fasta_file, tr)
+                truth_records.append(tr)
                 i += 1
 
     truth_bed = get_tmp_file()

From a2eed5ed83949d672ab150ce65c92fe5e43937d7 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 9 Mar 2021 21:08:43 -0800
Subject: [PATCH 87/89] small fix

---
 neusomatic/python/generate_dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index edaead7..4146247 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -1289,9 +1289,9 @@ def find_records(input_record):
             for p in p_s:
                 ref_p, alt_p = records[p][2:4]
                 if not strict_labeling:
-                    tr, eqs = push_lr(fasta_file, truth_record, 2)
+                    tr, eqs = push_lr(fasta_file, records[p], 2)
                 else:
-                    tr, eqs = push_lr(fasta_file, truth_record, 0)
+                    tr, eqs = push_lr(fasta_file, records[p], 0)
                 for eq in eqs:
                     if is_part_of(eq, record):
                         vartype = vtype[p]

From 399b15cc53c5d25c0c8bf1fe24df99188572daf6 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 9 Mar 2021 21:53:50 -0800
Subject: [PATCH 88/89] fixed strict_labeling

---
 neusomatic/python/generate_dataset.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 4146247..8646c6f 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -773,6 +773,7 @@ def push_left(fasta_file, record):
             record = [chrom, new_pos, new_ref, new_alt] + record[4:]
     return record
 
+
 def merge_records(fasta_file, records):
     logger = logging.getLogger(merge_records.__name__)
     if len(set(map(lambda x: x[0], records))) != 1:
@@ -1190,6 +1191,7 @@ def find_records(input_record):
         good_records = {"INS": [], "DEL": [], "SNP": []}
         vtype = {}
         record_len = {}
+        perfect_t_idx = set([])
         for i, js in map_truth_2_pred.items():
             truth_record = truth_records[i]
             for j in js:
@@ -1203,6 +1205,7 @@ def find_records(input_record):
                         record_len[j] = find_len(ref, alt)
                         good_records[vartype].append(j)
                         vtype[j] = vartype
+                        perfect_t_idx.add(i)
 
         good_records_idx = [i for w in list(good_records.values()) for i in w]
         remained_idx = sorted(set(range(len(records))) -
@@ -1250,6 +1253,7 @@ def find_records(input_record):
                         record_len[j] = find_len(ref, alt)
                         good_records[vartype].append(j)
                         vtype[j] = vartype
+                        perfect_t_idx |= set(t_i)
                         done_js.append(j)
                         done_js_.append(j)
                         done_is_.extend(t_i)
@@ -1266,6 +1270,8 @@ def find_records(input_record):
             i_s = map_pred_2_truth[j]
             done = False
             for i in i_s:
+                if strict_labeling and (i not in perfect_t_idx):
+                    continue
                 truth_record = truth_records[i]
                 if not strict_labeling:
                     tr, eqs = push_lr(fasta_file, truth_record, 2)
@@ -1342,7 +1348,7 @@ def find_records(input_record):
                 vartype = get_type(record[2], record[3])
                 pos, ref, alt = record[1:4]
                 rc = find_i_center(ref, alt)
-                if vartype_t == vartype and pos_t == pos:
+                if vartype_t == vartype and pos_t == pos and ((not strict_labeling) or vartype_t != "SNP"):
                     good_records[vartype_t].append(j)
                     vtype[j] = vartype_t
                     record_len[j] = find_len(ref_t, alt_t)
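Patches 86-88 make strict labeling operate on canonical truth records: push_left left-normalizes each truth variant once, up front, and the new perfect_t_idx set restricts partial matching to truth records that already produced an exact match elsewhere. In miniature, the gate added to find_records behaves like this (the index sets are illustrative):

    def candidate_truth_indices(i_s, perfect_t_idx, strict_labeling):
        # under strict labeling, only truth records that matched exactly
        # elsewhere may still label further overlapping candidates
        return [i for i in i_s if (not strict_labeling) or (i in perfect_t_idx)]

    candidate_truth_indices([0, 1, 2], {0, 2}, strict_labeling=True)    # [0, 2]
    candidate_truth_indices([0, 1, 2], {0, 2}, strict_labeling=False)   # [0, 1, 2]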
From 13c45b7e4b5bf1636945596c3889172dc011641c Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Wed, 10 Mar 2021 16:48:35 -0800
Subject: [PATCH 89/89] small fix for strict_labeling

---
 neusomatic/cpp/scan_alignments.cpp     |  2 +-
 neusomatic/python/filter_candidates.py |  2 +-
 neusomatic/python/generate_dataset.py  | 81 ++++++++++++++------------
 3 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/neusomatic/cpp/scan_alignments.cpp b/neusomatic/cpp/scan_alignments.cpp
index 3f8c552..d93adad 100644
--- a/neusomatic/cpp/scan_alignments.cpp
+++ b/neusomatic/cpp/scan_alignments.cpp
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
           auto var_code = ref_code;
           int var_count = 0;
           int dp = ref_count;
-          if (report_all_alleles){
+          if (report_all_alleles and ref_base != '-'){
             for (int row = 0; row < cols[i].base_freq_.size(); ++row) {
               auto alt_cnt = cols[i].base_freq_[row];
               if (( row != ref_code) and (alt_cnt > 0)){
diff --git a/neusomatic/python/filter_candidates.py b/neusomatic/python/filter_candidates.py
index 1fa395e..aa7a063 100755
--- a/neusomatic/python/filter_candidates.py
+++ b/neusomatic/python/filter_candidates.py
@@ -109,7 +109,7 @@ def filter_candidates(candidate_record):
             else:
                 ins = [ins[0][:-1]]
             good_records.extend(ins)
-        if dels and (ins or list(filter(lambda x: x[3] != "N" and x[2] != "N", rs))):
+        if dels and (ins or len(list(filter(lambda x: x[3] == "N" and x[2] != "N", rs))) == 0):
             # emit del
             if len(dels) == 1:
                 ro = dels[0][5]
diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 8646c6f..0753a8f 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -1357,50 +1357,55 @@ def find_records(input_record):
     good_records_idx = [i for w in list(good_records.values()) for i in w]
     remained_idx = sorted(set(range(len(records))) -
                           (set(good_records_idx) | set(none_records_ids)))
-    for i, js in map_truth_2_pred.items():
-        truth_record = truth_records[i]
+    if not strict_labeling:
+        for i, js in map_truth_2_pred.items():
+            truth_record = truth_records[i]
 
-        if set(js) & set(good_records_idx):
-            continue
-        pos_t, ref_t, alt_t = truth_record[1:4]
-        vartype_t = get_type(ref_t, alt_t)
-        rct = find_i_center(ref_t, alt_t)
-        for j in js:
-            if j not in remained_idx:
-                continue
-            record = records[j]
-            vartype = get_type(record[2], record[3])
-            pos, ref, alt = record[1:4]
-            rc = find_i_center(ref, alt)
-            if pos_t + rct[0] + rct[1] == pos + rc[0] + rc[1]:
-                if (vartype_t == "INS" and vartype == "SNP") or (vartype == "INS" and vartype_t == "SNP"):
+            if set(js) & set(good_records_idx):
+                continue
+            pos_t, ref_t, alt_t = truth_record[1:4]
+            vartype_t = get_type(ref_t, alt_t)
+            rct = find_i_center(ref_t, alt_t)
+            for j in js:
+                if j not in remained_idx:
+                    continue
+                record = records[j]
+                vartype = get_type(record[2], record[3])
+                pos, ref, alt = record[1:4]
+                rc = find_i_center(ref, alt)
+                if pos_t + rct[0] + rct[1] == pos + rc[0] + rc[1]:
+                    if (vartype_t == "INS" and vartype == "SNP") or (vartype == "INS" and vartype_t == "SNP"):
                         good_records[vartype_t].append(j)
                         vtype[j] = vartype_t
                         record_len[j] = find_len(ref_t, alt_t)
                         record_center[j] = rc
+        good_records_idx = [i for w in list(
+            good_records.values()) for i in w]
+        remained_idx = sorted(set(range(len(records))) -
+                              (set(good_records_idx) | set(none_records_ids)))
+
+    if not strict_labeling:
+        for i, js in map_truth_2_pred.items():
+            truth_record = truth_records[i]
+            if set(js) & set(good_records_idx):
+                continue
+            pos_t, ref_t, alt_t = truth_record[1:4]
+            vartype_t = get_type(ref_t, alt_t)
+            for j in js:
+                record = records[j]
+                pos, ref, alt = record[1:4]
+                vartype = get_type(record[2], record[3])
+                if (vartype == vartype_t) and vartype_t != "SNP" and abs(pos - pos_t) < 2:
                     good_records[vartype_t].append(j)
                     vtype[j] = vartype_t
+                    record_center[j] = find_i_center(ref, alt)
                     record_len[j] = find_len(ref_t, alt_t)
-                record_center[j] = rc
-    for i, js in map_truth_2_pred.items():
-        truth_record = truth_records[i]
-        if set(js) & set(good_records_idx):
-            continue
-        pos_t, ref_t, alt_t = truth_record[1:4]
-        vartype_t = get_type(ref_t, alt_t)
-        for j in js:
-            record = records[j]
-            pos, ref, alt = record[1:4]
-            vartype = get_type(record[2], record[3])
-            if (vartype == vartype_t) and vartype_t != "SNP" and abs(pos - pos_t) < 2:
-                good_records[vartype_t].append(j)
-                vtype[j] = vartype_t
-                record_center[j] = find_i_center(ref, alt)
-                record_len[j] = find_len(ref_t, alt_t)
-    good_records_idx = [i for w in list(good_records.values()) for i in w]
-    remained_idx = sorted(set(range(len(records))) -
-                          (set(good_records_idx) | set(none_records_ids)))
+        good_records_idx = [i for w in list(
+            good_records.values()) for i in w]
+        remained_idx = sorted(set(range(len(records))) -
+                              (set(good_records_idx) | set(none_records_ids)))
 
     for i, js in map_truth_2_pred.items():
         truth_record = truth_records[i]