From 9e14ec5ec2e2ad8fb9bc2668fdfa050c151f446e Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 6 Mar 2020 12:54:35 -0800 Subject: [PATCH 01/89] added new scripts for extending standalone features --- .../python/extend_stanalone_features.py | 248 +++++++ neusomatic/python/genomic_file_handlers.py | 623 ++++++++++++++++++ neusomatic/python/read_info_extractor.py | 290 ++++++++ neusomatic/python/sequencing_features.py | 269 ++++++++ 4 files changed, 1430 insertions(+) create mode 100755 neusomatic/python/extend_stanalone_features.py create mode 100644 neusomatic/python/genomic_file_handlers.py create mode 100644 neusomatic/python/read_info_extractor.py create mode 100644 neusomatic/python/sequencing_features.py diff --git a/neusomatic/python/extend_stanalone_features.py b/neusomatic/python/extend_stanalone_features.py new file mode 100755 index 0000000..c75748e --- /dev/null +++ b/neusomatic/python/extend_stanalone_features.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python +#------------------------------------------------------------------------- +# extend_standalone_features.py +# add extra features for standalone mode +#------------------------------------------------------------------------- +import argparse +import traceback +import logging +import multiprocessing +import os +import gzip + +import pysam +import numpy as np + +import sequencing_features +import genomic_file_handlers as genome + + +def extract_features(candidate_record): + work, reference, tumor_bam, normal_bam, chrom, pos, ref, alt, min_mapq, min_bq, dbsnp, cosmic = candidate_record + thread_logger = logging.getLogger( + "{} ({})".format(extend_standalone_features.__name__, multiprocessing.current_process().name)) + try: + thread_logger.info( + "---------------------Filter Candidates---------------------") + tbam = pysam.AlignmentFile(tumor_bam) + nbam = pysam.AlignmentFile(normal_bam) + ref_fa = pysam.FastaFile(reference) + + my_coordinate = [chrom, int(pos)] + nBamFeatures = sequencing_features.from_bam( + nbam, my_coordinate, ref, alt, min_mapq, min_bq) + tBamFeatures = sequencing_features.from_bam( + tbam, my_coordinate, ref, alt, min_mapq, min_bq) + + n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev'] + n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev'] + t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev'] + t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev'] + sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt) + + homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( + ref_fa, my_coordinate, ref, alt) + + indel_length = len(alt) - len(ref) + + CHROM = my_coordinate[0] + POS = my_coordinate[1] + REF = ref_base + ALT = first_alt + if_dbsnp = if_dbsnp + COMMON = if_common + if_COSMIC = if_cosmic + COSMIC_CNT = num_cases + Consistent_Mates = tBamFeatures['consistent_mates'] + Inconsistent_Mates = tBamFeatures['inconsistent_mates'] + N_DP = nBamFeatures['dp'] + nBAM_REF_MQ = '%g' % nBamFeatures['ref_mq'] + nBAM_ALT_MQ = '%g' % nBamFeatures['alt_mq'] + nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures['z_ranksums_mq'] + nBAM_REF_BQ = '%g' % nBamFeatures['ref_bq'] + nBAM_ALT_BQ = '%g' % nBamFeatures['alt_bq'] + nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures['z_ranksums_bq'] + nBAM_REF_NM = '%g' % nBamFeatures['ref_NM'] + nBAM_ALT_NM = '%g' % nBamFeatures['alt_NM'] + nBAM_NM_Diff = '%g' % nBamFeatures['NM_Diff'] + nBAM_REF_Concordant = nBamFeatures['ref_concordant_reads'] + nBAM_REF_Discordant = nBamFeatures['ref_discordant_reads'] + nBAM_ALT_Concordant = 
nBamFeatures['alt_concordant_reads'] + nBAM_ALT_Discordant = nBamFeatures['alt_discordant_reads'] + nBAM_Concordance_FET = rescale( + nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) + N_REF_FOR = nBamFeatures['ref_for'] + N_REF_REV = nBamFeatures['ref_rev'] + N_ALT_FOR = nBamFeatures['alt_for'] + N_ALT_REV = nBamFeatures['alt_rev'] + nBAM_StrandBias_FET = rescale( + nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) + nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures['z_ranksums_endpos'] + nBAM_REF_Clipped_Reads = nBamFeatures['ref_SC_reads'] + nBAM_ALT_Clipped_Reads = nBamFeatures['alt_SC_reads'] + nBAM_Clipping_FET = rescale( + nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) + nBAM_MQ0 = nBamFeatures['MQ0'] + nBAM_Other_Reads = nBamFeatures['noise_read_count'] + nBAM_Poor_Reads = nBamFeatures['poor_read_count'] + nBAM_REF_InDel_3bp = nBamFeatures['ref_indel_3bp'] + nBAM_REF_InDel_2bp = nBamFeatures['ref_indel_2bp'] + nBAM_REF_InDel_1bp = nBamFeatures['ref_indel_1bp'] + nBAM_ALT_InDel_3bp = nBamFeatures['alt_indel_3bp'] + nBAM_ALT_InDel_2bp = nBamFeatures['alt_indel_2bp'] + nBAM_ALT_InDel_1bp = nBamFeatures['alt_indel_1bp'] + SOR = sor + MaxHomopolymer_Length = homopolymer_length + SiteHomopolymer_Length = site_homopolymer_length + T_DP = tBamFeatures['dp'] + tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'] + tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'] + tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures['z_ranksums_mq'] + tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'] + tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'] + tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures['z_ranksums_bq'] + tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'] + tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'] + tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'] + tBAM_REF_Concordant = tBamFeatures['ref_concordant_reads'] + tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'] + tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'] + tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'] + tBAM_Concordance_FET = rescale( + tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) + T_REF_FOR = tBamFeatures['ref_for'] + T_REF_REV = tBamFeatures['ref_rev'] + T_ALT_FOR = tBamFeatures['alt_for'] + T_ALT_REV = tBamFeatures['alt_rev'] + tBAM_StrandBias_FET = rescale( + tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) + tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures['z_ranksums_endpos'] + tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'] + tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'] + tBAM_Clipping_FET = rescale( + tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) + tBAM_MQ0 = tBamFeatures['MQ0'] + tBAM_Other_Reads = tBamFeatures['noise_read_count'] + tBAM_Poor_Reads = tBamFeatures['poor_read_count'] + tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'] + tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'] + tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'] + tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'] + tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'] + tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'] + InDel_Length = indel_length + + # thread_logger.info(tBamFeatures) + # aaa + + return 0 + + except Exception as ex: + thread_logger.error(traceback.format_exc()) + thread_logger.error(ex) + return None + + +def extend_standalone_features(candidates_vcf, + reference, tumor_bam, normal_bam, + min_mapq, min_bq, + dbsnp, cosmic, + num_threads, + work): + + logger = logging.getLogger(extend_standalone_features.__name__) + + 
logger.info("----------------------Preprocessing------------------------") + if not os.path.exists(work): + os.mkdir(work) + + if not os.path.exists(tumor_bam): + logger.error("Aborting!") + raise Exception("No tumor BAM file {}".format(tumor_bam)) + if not os.path.exists(normal_bam): + logger.error("Aborting!") + raise Exception("No normal BAM file {}".format(normal_bam)) + if not os.path.exists(tumor_bam + ".bai"): + logger.error("Aborting!") + raise Exception( + "No tumor .bai index file {}".format(tumor_bam + ".bai")) + if not os.path.exists(normal_bam + ".bai"): + logger.error("Aborting!") + raise Exception( + "No normal .bai index file {}".format(normal_bam + ".bai")) + + if dbsnp: + with gzip.open(dbsnp,'rt') as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + continue + print(line) + aaa + pool = multiprocessing.Pool(num_threads) + map_args = [] + with open(candidates_vcf) as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + continue + chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] + map_args.append((work, reference, tumor_bam, normal_bam, + chrom, pos, ref, alt, min_mapq, min_bq, dbsnp, cosmic)) + try: + ext_features = pool.map_async(extract_features, map_args).get() + pool.close() + except Exception as inst: + logger.error(inst) + pool.close() + traceback.print_exc() + raise Exception + + +if __name__ == '__main__': + FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' + logging.basicConfig(level=logging.INFO, format=FORMAT) + logger = logging.getLogger(__name__) + + parser = argparse.ArgumentParser( + description='extract extra features for standalone mode') + parser.add_argument('--candidates_vcf', type=str, help='candidates vcf', + required=True) + parser.add_argument('--reference', type=str, help='reference fasta filename', + required=True) + parser.add_argument('--tumor_bam', type=str, + help='tumor bam', required=True) + parser.add_argument('--normal_bam', type=str, + help='normal bam', required=True) + parser.add_argument('--min_mapq', type=int, + help='minimum mapping quality', default=1) + parser.add_argument('--min_bq', type=float, + help='minimum base quality', default=5) + parser.add_argument('--dbsnp', type=str, + help='dbSNP vcf (to annotate candidate variants)', default=None) + parser.add_argument('--cosmic', type=str, + help='COSMIC vcf (to annotate candidate variants)', default=None) + parser.add_argument('--num_threads', type=int, + help='number of threads', default=1) + parser.add_argument('--work', type=str, + help='work directory', required=True) + args = parser.parse_args() + logger.info(args) + + try: + output = extend_standalone_features(args.candidates_vcf, + args.reference, args.tumor_bam, args.normal_bam, + args.min_mapq, args.min_bq, + args.dbsnp, args.cosmic, + args.num_threads, + args.work) + if output is None: + raise Exception("extend_standalone_features failed!") + except Exception as e: + logger.error(traceback.format_exc()) + logger.error("Aborting!") + logger.error( + "extend_standalone_features.py failure on arguments: {}".format(args)) + raise e diff --git a/neusomatic/python/genomic_file_handlers.py b/neusomatic/python/genomic_file_handlers.py new file mode 100644 index 0000000..cd19a26 --- /dev/null +++ b/neusomatic/python/genomic_file_handlers.py @@ -0,0 +1,623 @@ +#!/usr/bin/env python3 + +from pysam import AlignmentFile +import sys, os, gzip, re, math + +# The regular expression pattern for "chrXX 1234567" in both VarScan2 Output and VCF files: 
+pattern_major_chr_position = re.compile(r'^(?:chr)?(?:[1-9]|1[0-9]|2[0-2]|[XY]|MT?)\t[0-9]+\b') + +# More lenient pattern: +pattern_chr_position = re.compile(r'[^\t]+\t[0-9]+\b') +pattern_chrom = re.compile(r'(?:chr)?([1-9]|1[0-9]|2[0-2]|[XY]|MT?)\W') + + +# Valid Phred+33 quality strings: +valid_q = set() +[valid_q.add( chr(33+i) ) for i in range(42)]; + +nan = float('nan') +inf = float('inf') + +AA_3to1 = {"Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C", "Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I", "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P", "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V"} +AA_1to3 = {"A": "Ala", "R": "Arg", "N": "Asn", "D": "Asp", "C": "Cys", "E": "Glu", "Q": "Gln", "G": "Gly", "H": "His", "I": "Ile", "L": "Leu", "K": "Lys", "M": "Met", "F": "Phe", "P": "Pro", "S": "Ser", "T": "Thr", "W": "Trp", "Y": "Tyr", "V": "Val"} + + +### ### ### ### ### MAJOR CLASSES ### ### ### ### ### +class Vcf_line: + '''Each instance of this object is a line from the vcf file (no header).''' + + def __init__(self, vcf_line): + + '''Argument is a line in pileup file.''' + self.vcf_line = vcf_line.rstrip('\n') + + try: + self.chromosome, self.position, self.identifier, self.refbase, self.altbase, self.qual, self.filters, self.info, *self.has_samples = vcf_line.rstrip('\n').split('\t') + self.position = int(self.position) + + try: + self.field, *self.samples = self.has_samples + except ValueError: + self.field = self.samples = '' + + except ValueError: + self.chromosome = self.identifier = self.refbase = self.altbase = self.qual = self.filters = self.info = self.field = self.samples = '' + self.position = None + + + def get_info_items(self): + return self.info.split(';') + + + def get_info_value(self, variable): + + key_item = re.search(r'\b{}=([^;\s]+)([;\W]|$)'.format(variable), self.vcf_line) + + # The key has a value attached to it, e.g., VAR=1,2,3 + if key_item: + return key_item.groups()[0] + + # Perhaps it's simply a flag without "=" + else: + key_item = self.info.split(';') + return True if variable in key_item else False + + + def get_sample_variable(self): + return self.field.split(':') + + + def get_sample_item(self, idx=0, out_type='d'): + '''d to output a dictionary. l to output a tuple of lists''' + + if out_type.lower() == 'd': + return dict( zip(self.get_sample_variable(), self.samples[idx].split(':') ) ) + elif out_type.lower() == 'l': + return ( self.get_sample_variable(), self.samples[idx].split(':') ) + + + def get_sample_value(self, variable, idx=0): + + var2value = dict( zip( self.field.split(':'), self.samples[idx].split(':') )) + + try: + return var2value[variable] + except KeyError: + return None + + + + +class pysam_header: + ''' + Extract BAM header using pysam. + Only sample name (SM) so far. 
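+    Hypothetical usage sketch: pysam_header('tumor.bam').SM() returns a tuple
+    of the distinct SM tags found in the BAM's @RG header lines, e.g. ('TUMOR',).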
+ ''' + + def __init__(self, bam_file): + + bam = AlignmentFile(bam_file) + self.bam_header = bam.header + + + def SM(self): + '''Sample Name''' + + sample_name = set() + + for header_i in self.bam_header['RG']: + sample_name.add( header_i['SM'] ) + sample_name = tuple(sample_name) + + return sample_name + + +### ### ### ### ### MAJOR CLASSES OVER ### ### ### ### ### + + + + + + + + +### ### ### ### ### FUNCTIONS OF CONVENIENCE ### ### ### ### ### + +def skip_vcf_header(opened_file): + + line_i = opened_file.readline().rstrip() + while line_i.startswith('#'): + line_i = opened_file.readline().rstrip() + + return line_i + + +def faiordict2contigorder(file_name, file_format): + '''Takes either a .fai or .dict file, and return a contig order dictionary, i.e., chrom_seq['chr1'] == 0''' + + assert file_format in ('fai', 'dict') + + contig_sequence = [] + with open(file_name) as gfile: + + for line_i in gfile: + + if file_format == 'fai': + contig_match = re.match(r'([^\t]+)\t', line_i) + + elif file_format == 'dict': + if line_i.startswith('@SQ'): + contig_match = re.match(r'@SQ\tSN:([^\t]+)\tLN:', line_i) + + if contig_match: + contig_i = contig_match.groups()[0].split(' ')[0] # some .fai files have space after the contig for descriptions. + contig_sequence.append( contig_i ) + + chrom_seq = {} + for n,contig_i in enumerate(contig_sequence): + chrom_seq[contig_i] = n + + return chrom_seq + + + +def open_textfile(file_name): + + # See if the input file is a .gz file: + if file_name.lower().endswith('.gz'): + return gzip.open(file_name, 'rt') + + else: + return open(file_name) + + + +def open_bam_file(file_name): + + try: + return AlignmentFile(file_name, 'rb') + except ValueError: + return open(file_name) + + + + +def ascii2phred33(x): + '''Put in an ASCII string, return a Phred+33 score.''' + return ord(x)-33 + + +def phred33toascii(x): + '''Put in a Phred33 score, return the character.''' + return chr(x+33) + + +def p2phred(p, max_phred=inf): + '''Convert p-value to Phred-scale quality score.''' + + if p == 0: + Q = max_phred + + elif p == 1: + Q = 0 + + elif p<0 or p>1: + Q = nan + + elif p > 0: + Q = -10 * math.log10(p) + if Q > max_phred: + Q = max_phred + + elif math.isnan(p): + Q = nan + + return Q + + + +def phred2p(phred): + '''Convert Phred-scale quality score to p-value.''' + return 10**(-phred/10) + + +def findall_index(mylist, tolookfor): + '''Find all instances in a list that matches exactly thestring.''' + all_indices = [i for i,item in enumerate(mylist) if item == tolookfor] + return all_indices + + +def findall_index_regex(mylist, pattern): + '''Find all instances in a list that matches a regex pattern.''' + all_indices = [i for i,item in enumerate(mylist) if re.search(pattern, item)] + return all_indices + + +def count_repeating_bases(sequence): + + '''For a string, count the number of characters that appears in a row. + E.g., for string "ABBCCCDDDDAAAAAAA", the function returns 1, 2, 3, 4, 7, because there is 1 A, 2 B's, 3 C's, 4 D's, and then 7 A's. 
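+    Doctest-style sketch of that example:
+    >>> count_repeating_bases('ABBCCCDDDDAAAAAAA')
+    [1, 2, 3, 4, 7]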
+ ''' + counters = [] + previous_base = None + + for current_base in sequence: + + if current_base == previous_base: + counters[-1] += 1 + else: + counters.append(1) + + previous_base = current_base + + counters + + return counters + + + +def numeric_id(chr_i, pos_i, contig_seq): + + chr_i = contig_seq[chr_i] + numeric_chr_i = float(chr_i) * 1000000000000 + numeric_pos_i = float(pos_i) + + numeric_i = numeric_chr_i + numeric_pos_i + + return numeric_i + + + + + +# Define which chromosome coordinate is ahead for the following function: +chrom_sequence = [str(i) for i in range(1,23)] +chrom_sequence.append('X') +chrom_sequence.append('Y') +chrom_sequence.append('M') + +chrom_seq = {} +for n,contig_i in enumerate(chrom_sequence): + chrom_seq[contig_i] = n + +def whoisbehind(coord_0, coord_1, chrom_sequence): + ''' + coord_0 and coord_1 are two strings or two lists, specifying the chromosome, a (typically) tab, and then the location. + Return the index where the coordinate is behind. Return 10 if they are the same position. + ''' + + end_of_0 = end_of_1 = False + + if coord_0 == '' or coord_0==['',''] or coord_0==('','') or not coord_0: + end_of_0 = True + + if coord_1 == '' or coord_1==['',''] or coord_1==('','') or not coord_1: + end_of_1 = True + + if end_of_0 and end_of_1: + return 10 + + elif end_of_1: + return 0 + + elif end_of_0: + return 1 + + else: + + if isinstance(coord_0, str): + chrom0, position0 = coord_0.split() + elif isinstance(coord_0, list) or isinstance(coord_0, tuple): + chrom0, position0 = coord_0[0], coord_0[1] + + if isinstance(coord_1, str): + chrom1, position1 = coord_1.split() + elif isinstance(coord_1, list) or isinstance(coord_1, tuple): + chrom1, position1 = coord_1[0], coord_1[1] + + if isinstance(chrom_sequence, dict): + chrom0_position = chrom_sequence[chrom0] + chrom1_position = chrom_sequence[chrom1] + elif isinstance(chrom_sequence, list) or isinstance(chrom_sequence, tuple): + chrom0_position = chrom_sequence.index(chrom0) + chrom1_position = chrom_sequence.index(chrom1) + + if chrom0_position < chrom1_position: + return 0 # 1st coordinate is ahead + + elif chrom0_position > chrom1_position: + return 1 # 1st coordinate is ahead + + # Must be in the same chromosome + else: + + position0 = int(position0) + position1 = int(position1) + + if position0 < position1: + return 0 + + elif position0 > position1: + return 1 + + # Same chromosome, same position, then same coordinate: + elif position0 == position1: + return 10 + + + + +def vcf_header_modifier(infile_handle, addons=[], getlost=' '): + + '''addons = A list of INFO, FORMAT, ID, or Filter lines you want to add. 
+ getlost = a regex expression for the ID of INFO/FORMAT/FILTER that you want to get rid of.''' + + line_i = infile_handle.readline().rstrip() + + # First, write into the INFO and FORMAT what I want to add: + vcfheader_info_format_filter = [] + vcfheader_misc = [] + + for additions in addons: + vcfheader_info_format_filter.append(additions) + + while line_i.startswith('##'): + + if re.match(r'##fileformat=', line_i): + vcffileformat = line_i + + elif re.match(r'##(INFO|FORMAT|FILTER)', line_i): + + if not re.match(r'##(INFO|FORMAT|FILTER)= len(ref_base): + + inserted_sequence = variant_call[ len(ref_base):: ] + + ref_for,ref_rev,alt_for,alt_rev = base_calls[0], base_calls[1], base_calls[6].count(inserted_sequence.upper()), base_calls[7].count(inserted_sequence.lower()) + + # Deletion: + elif len(variant_call) < len(ref_base): + + deleted_sequence = ref_base[ len(variant_call):: ] + + ref_for,ref_rev,alt_for,alt_rev = base_calls[0], base_calls[1], base_calls[4].count(deleted_sequence.upper()), base_calls[5].count(deleted_sequence.lower()) + + else: + ref_for = ref_rev = alt_for = alt_rev = 0 + + return ref_for, ref_rev, alt_for, alt_rev + + + + +def rescale(x, original='fraction', rescale_to=None, max_phred=1001): + + if ( rescale_to == None ) or ( original.lower() == rescale_to.lower() ): + y = x if isinstance(x, int) else '%.2f' % x + + elif original.lower() == 'fraction' and rescale_to == 'phred': + y = genome.p2phred(x, max_phred=max_phred) + y = '%.2f' % y + + elif original.lower() == 'phred' and rescale_to == 'fraction': + y = genome.phred2p(x) + y = '%.2f' % y + + return y + + + + + +##### Stuff from VarDict: +def find_MSI(vcf_object): + + msi = vcf_object.get_info_value('MSI') + if msi: + msi = float(msi) + else: + msi = nan + return msi + + +def find_MSILEN(vcf_object): + + msilen = vcf_object.get_info_value('MSILEN') + if msilen: + msilen = float(msilen) + else: + msilen = nan + return msilen + + +def find_SHIFT3(vcf_object): + + shift3 = vcf_object.get_info_value('SHIFT3') + if shift3: + shift3 = float(shift3) + else: + shift3 = nan + return shift3 + + + +# MuTect2's Stuff: +def mutect2_nlod(vcf_object): + nlod = vcf_object.get_info_value('NLOD') + if nlod: + return float(nlod) + else: + return nan + + +def mutect2_tlod(vcf_object): + tlod = vcf_object.get_info_value('TLOD') + if tlod: + return float(tlod) + else: + return nan + + +def mutect2_STR(vcf_object): + if vcf_object.get_info_value('STR'): + return 1 + else: + return 0 + + +def mutect2_ECNT(vcf_object): + ecnt = vcf_object.get_info_value('ECNT') + if ecnt: + try: + ecnt = int( ecnt ) + except ValueError: + ecnt = nan + else: + ecnt = nan + + return ecnt diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py new file mode 100644 index 0000000..1135e1e --- /dev/null +++ b/neusomatic/python/sequencing_features.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python3 + +import sys, os, re, pysam +import scipy.stats as stats +import genomic_file_handlers as genome +from read_info_extractor import * + +nan = float('nan') + + +def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + + ''' + bam is the opened file handle of bam file + my_coordiate is a list or tuple of 0-based (contig, position) + ''' + + indel_length = len(first_alt) - len(ref_base) + reads = bam.fetch( my_coordinate[0], my_coordinate[1]-1, my_coordinate[1] ) + + ref_read_mq = [] + alt_read_mq = [] + ref_read_bq = [] + alt_read_bq = [] + ref_edit_distance = [] + alt_edit_distance = [] + + ref_concordant_reads = 
alt_concordant_reads = ref_discordant_reads = alt_discordant_reads = 0 + ref_for = ref_rev = alt_for = alt_rev = dp = 0 + ref_SC_reads = alt_SC_reads = ref_notSC_reads = alt_notSC_reads = 0 + MQ0 = 0 + + ref_pos_from_end = [] + alt_pos_from_end = [] + ref_flanking_indel = [] + alt_flanking_indel = [] + + noise_read_count = poor_read_count = 0 + + qname_collector = {} + + for read_i in reads: + if not read_i.is_unmapped and dedup_test(read_i): + + dp += 1 + + code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read(read_i, my_coordinate[1]-1 ) + + if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq: + poor_read_count += 1 + + if read_i.mapping_quality == 0: + MQ0 += 1 + + # Reference calls: + if code_i == 1 and base_call_i == ref_base[0]: + + try: + qname_collector[read_i.qname].append(0) + except KeyError: + qname_collector[read_i.qname] = [0] + + ref_read_mq.append( read_i.mapping_quality ) + ref_read_bq.append( read_i.query_qualities[ith_base] ) + + try: + ref_edit_distance.append( read_i.get_tag('NM') ) + except KeyError: + pass + + # Concordance + if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + ref_concordant_reads += 1 + elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + ref_discordant_reads += 1 + + # Orientation + if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + ref_for += 1 + elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + ref_rev += 1 + + # Soft-clipped reads? + if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: + ref_SC_reads += 1 + else: + ref_notSC_reads += 1 + + # Distance from the end of the read: + if ith_base != None: + ref_pos_from_end.append( min(ith_base, read_i.query_length-ith_base) ) + + # Flanking indels: + ref_flanking_indel.append( flanking_indel_i ) + + + # Alternate calls: + # SNV, or Deletion, or Insertion where I do not check for matching indel length + elif (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or \ + (indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or \ + (indel_length > 0 and code_i == 3): + + try: + qname_collector[read_i.qname].append(1) + except KeyError: + qname_collector[read_i.qname] = [1] + + alt_read_mq.append( read_i.mapping_quality ) + alt_read_bq.append( read_i.query_qualities[ith_base] ) + + try: + alt_edit_distance.append( read_i.get_tag('NM') ) + except KeyError: + pass + + # Concordance + if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + alt_concordant_reads += 1 + elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + alt_discordant_reads += 1 + + # Orientation + if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + alt_for += 1 + elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + alt_rev += 1 + + # Soft-clipped reads? 
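+                    # read_i.cigar is a list of (op, length) tuples; BAM op code
+                    # 4 is soft clip ('S'), the value cigar_soft_clip (imported
+                    # from read_info_extractor) is expected to hold, so checking
+                    # the first and last tuples flags reads clipped at either end.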
+ if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: + alt_SC_reads += 1 + else: + alt_notSC_reads += 1 + + # Distance from the end of the read: + if ith_base != None: + alt_pos_from_end.append( min(ith_base, read_i.query_length-ith_base) ) + + # Flanking indels: + alt_flanking_indel.append( flanking_indel_i ) + + + # Inconsistent read or 2nd alternate calls: + else: + + try: + qname_collector[read_i.qname].append(2) + except KeyError: + qname_collector[read_i.qname] = [2] + + noise_read_count += 1 + + # Done extracting info from tumor BAM. Now tally them: + ref_mq = mean(ref_read_mq) + alt_mq = mean(alt_read_mq) + z_ranksums_mq = stats.ranksums(alt_read_mq, ref_read_mq)[0] + + ref_bq = mean(ref_read_bq) + alt_bq = mean(alt_read_bq) + z_ranksums_bq = stats.ranksums(alt_read_bq, ref_read_bq)[0] + + ref_NM = mean(ref_edit_distance) + alt_NM = mean(alt_edit_distance) + z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0] + NM_Diff = alt_NM - ref_NM - abs(indel_length) + + concordance_fet = stats.fisher_exact(( (ref_concordant_reads, alt_concordant_reads), (ref_discordant_reads, alt_discordant_reads) ))[1] + strandbias_fet = stats.fisher_exact(( (ref_for, alt_for), (ref_rev, alt_rev) ))[1] + clipping_fet = stats.fisher_exact(( (ref_notSC_reads, alt_notSC_reads), (ref_SC_reads, alt_SC_reads) ))[1] + + z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] + + ref_indel_1bp = ref_flanking_indel.count(1) + ref_indel_2bp = ref_flanking_indel.count(2) + ref_indel_1bp + ref_indel_3bp = ref_flanking_indel.count(3) + ref_indel_2bp + ref_indel_1bp + alt_indel_1bp = alt_flanking_indel.count(1) + alt_indel_2bp = alt_flanking_indel.count(2) + alt_indel_1bp + alt_indel_3bp = alt_flanking_indel.count(3) + alt_indel_2bp + alt_indel_1bp + + consistent_mates = inconsistent_mates = 0 + for pairs_i in qname_collector: + + # Both are alternative calls: + if qname_collector[pairs_i] == [1,1]: + consistent_mates += 1 + + # One is alternate call but the other one is not: + elif len(qname_collector[pairs_i]) == 2 and 1 in qname_collector[pairs_i]: + inconsistent_mates += 1 + + return vars() + + + + + +def from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt): + + ''' + ref_fa is the opened reference fasta file handle + my_coordiate is a list or tuple of 0-based (contig, position) + ''' + + # Homopolymer eval (Make sure to modify for INDEL): + # The min and max is to prevent the +/- 20 bases from exceeding the ends of the reference sequence + lseq = ref_fa.fetch(my_coordinate[0], max(0, my_coordinate[1]-20), my_coordinate[1]) + rseq = ref_fa.fetch(my_coordinate[0], my_coordinate[1]+1, min(ref_fa.get_reference_length(my_coordinate[0])+1, my_coordinate[1]+21) ) + + # This is to get around buy in old version of pysam that reads the reference sequence in bytes instead of strings + lseq = lseq.decode() if isinstance(lseq, bytes) else lseq + rseq = rseq.decode() if isinstance(rseq, bytes) else rseq + + seq41_ref = lseq + ref_base + rseq + seq41_alt = lseq + first_alt + rseq + + ref_counts = genome.count_repeating_bases(seq41_ref) + alt_counts = genome.count_repeating_bases(seq41_alt) + + homopolymer_length = max( max(ref_counts), max(alt_counts) ) + + # Homopolymer spanning the variant site: + ref_c = 0 + alt_c = 0 + for i in rseq: + if i == ref_base: + ref_c += 1 + else: + break + + for i in lseq[::-1]: + if i == ref_base: + ref_c += 1 + else: + break + + for i in rseq: + if i == first_alt: + alt_c += 1 + else: + break + + for i in 
lseq[::-1]: + if i == first_alt: + alt_c += 1 + else: + break + + site_homopolymer_length = max( alt_c+1, ref_c+1 ) + + return homopolymer_length, site_homopolymer_length + + + + + +def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100): + + # Odds Ratio just like VarDict's output + sor_numerator = n_alt * t_ref + sor_denominator = n_ref * t_alt + if sor_numerator == 0 and sor_denominator == 0: + sor = nan + elif sor_denominator == 0: + sor = max_value + else: + sor = sor_numerator / sor_denominator + if sor >= max_value: + sor = max_value + + return sor From 8720927de67998a57bed44dd709726e1898593a6 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sun, 8 Mar 2020 20:14:09 -0700 Subject: [PATCH 02/89] fix for dbsnp --- .../python/extend_stanalone_features.py | 341 ++++++++++++------ 1 file changed, 224 insertions(+), 117 deletions(-) diff --git a/neusomatic/python/extend_stanalone_features.py b/neusomatic/python/extend_stanalone_features.py index c75748e..5f6fef7 100755 --- a/neusomatic/python/extend_stanalone_features.py +++ b/neusomatic/python/extend_stanalone_features.py @@ -15,127 +15,160 @@ import sequencing_features import genomic_file_handlers as genome +from read_info_extractor import rescale def extract_features(candidate_record): - work, reference, tumor_bam, normal_bam, chrom, pos, ref, alt, min_mapq, min_bq, dbsnp, cosmic = candidate_record + work, reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch = candidate_record thread_logger = logging.getLogger( "{} ({})".format(extend_standalone_features.__name__, multiprocessing.current_process().name)) try: - thread_logger.info( - "---------------------Filter Candidates---------------------") tbam = pysam.AlignmentFile(tumor_bam) nbam = pysam.AlignmentFile(normal_bam) ref_fa = pysam.FastaFile(reference) + if dbsnp: + dbsnp_tb = pysam.TabixFile(dbsnp) - my_coordinate = [chrom, int(pos)] - nBamFeatures = sequencing_features.from_bam( - nbam, my_coordinate, ref, alt, min_mapq, min_bq) - tBamFeatures = sequencing_features.from_bam( - tbam, my_coordinate, ref, alt, min_mapq, min_bq) - - n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev'] - n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev'] - t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev'] - t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev'] - sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt) - - homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( - ref_fa, my_coordinate, ref, alt) - - indel_length = len(alt) - len(ref) - - CHROM = my_coordinate[0] - POS = my_coordinate[1] - REF = ref_base - ALT = first_alt - if_dbsnp = if_dbsnp - COMMON = if_common - if_COSMIC = if_cosmic - COSMIC_CNT = num_cases - Consistent_Mates = tBamFeatures['consistent_mates'] - Inconsistent_Mates = tBamFeatures['inconsistent_mates'] - N_DP = nBamFeatures['dp'] - nBAM_REF_MQ = '%g' % nBamFeatures['ref_mq'] - nBAM_ALT_MQ = '%g' % nBamFeatures['alt_mq'] - nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures['z_ranksums_mq'] - nBAM_REF_BQ = '%g' % nBamFeatures['ref_bq'] - nBAM_ALT_BQ = '%g' % nBamFeatures['alt_bq'] - nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures['z_ranksums_bq'] - nBAM_REF_NM = '%g' % nBamFeatures['ref_NM'] - nBAM_ALT_NM = '%g' % nBamFeatures['alt_NM'] - nBAM_NM_Diff = '%g' % nBamFeatures['NM_Diff'] - nBAM_REF_Concordant = nBamFeatures['ref_concordant_reads'] - nBAM_REF_Discordant = nBamFeatures['ref_discordant_reads'] - nBAM_ALT_Concordant = nBamFeatures['alt_concordant_reads'] - nBAM_ALT_Discordant = 
nBamFeatures['alt_discordant_reads'] - nBAM_Concordance_FET = rescale( - nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) - N_REF_FOR = nBamFeatures['ref_for'] - N_REF_REV = nBamFeatures['ref_rev'] - N_ALT_FOR = nBamFeatures['alt_for'] - N_ALT_REV = nBamFeatures['alt_rev'] - nBAM_StrandBias_FET = rescale( - nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) - nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures['z_ranksums_endpos'] - nBAM_REF_Clipped_Reads = nBamFeatures['ref_SC_reads'] - nBAM_ALT_Clipped_Reads = nBamFeatures['alt_SC_reads'] - nBAM_Clipping_FET = rescale( - nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) - nBAM_MQ0 = nBamFeatures['MQ0'] - nBAM_Other_Reads = nBamFeatures['noise_read_count'] - nBAM_Poor_Reads = nBamFeatures['poor_read_count'] - nBAM_REF_InDel_3bp = nBamFeatures['ref_indel_3bp'] - nBAM_REF_InDel_2bp = nBamFeatures['ref_indel_2bp'] - nBAM_REF_InDel_1bp = nBamFeatures['ref_indel_1bp'] - nBAM_ALT_InDel_3bp = nBamFeatures['alt_indel_3bp'] - nBAM_ALT_InDel_2bp = nBamFeatures['alt_indel_2bp'] - nBAM_ALT_InDel_1bp = nBamFeatures['alt_indel_1bp'] - SOR = sor - MaxHomopolymer_Length = homopolymer_length - SiteHomopolymer_Length = site_homopolymer_length - T_DP = tBamFeatures['dp'] - tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'] - tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'] - tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures['z_ranksums_mq'] - tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'] - tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'] - tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures['z_ranksums_bq'] - tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'] - tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'] - tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'] - tBAM_REF_Concordant = tBamFeatures['ref_concordant_reads'] - tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'] - tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'] - tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'] - tBAM_Concordance_FET = rescale( - tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) - T_REF_FOR = tBamFeatures['ref_for'] - T_REF_REV = tBamFeatures['ref_rev'] - T_ALT_FOR = tBamFeatures['alt_for'] - T_ALT_REV = tBamFeatures['alt_rev'] - tBAM_StrandBias_FET = rescale( - tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) - tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures['z_ranksums_endpos'] - tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'] - tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'] - tBAM_Clipping_FET = rescale( - tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) - tBAM_MQ0 = tBamFeatures['MQ0'] - tBAM_Other_Reads = tBamFeatures['noise_read_count'] - tBAM_Poor_Reads = tBamFeatures['poor_read_count'] - tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'] - tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'] - tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'] - tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'] - tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'] - tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'] - InDel_Length = indel_length - - # thread_logger.info(tBamFeatures) - # aaa - - return 0 + ext_features = [] + for chrom, pos, ref, alt, if_cosmic, num_cosmic_cases in batch: + var_id = "-".join([chrom, pos, ref, alt]) + pos = int(pos) + my_coordinate = [chrom, pos] + nBamFeatures = sequencing_features.from_bam( + nbam, my_coordinate, ref, alt, min_mapq, min_bq) + tBamFeatures = sequencing_features.from_bam( + tbam, my_coordinate, ref, alt, min_mapq, min_bq) + + n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev'] + n_alt = 
nBamFeatures['alt_for'] + nBamFeatures['alt_rev'] + t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev'] + t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev'] + sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt) + + homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( + ref_fa, my_coordinate, ref, alt) + + indel_length = len(alt) - len(ref) + + if_dbsnp = 0 + if_common = 0 + if dbsnp: + region = "{}:{}-{}".format(chrom, pos, pos + 1) + dbsnp_vars = {} + for x in dbsnp_tb.fetch(region=region): + chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[ + 0:8] + for alt_ in alts_.split(","): + dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) + dbsnp_vars[dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 + if var_id in dbsnp_vars: + if_dbsnp = 1 + if_common = dbsnp_vars[var_id] + + p_scale = None + CHROM = my_coordinate[0] + POS = my_coordinate[1] + REF = ref + ALT = alt + if_dbsnp = if_dbsnp + COMMON = if_common + if_COSMIC = if_cosmic + COSMIC_CNT = num_cosmic_cases + Consistent_Mates = tBamFeatures['consistent_mates'] + Inconsistent_Mates = tBamFeatures['inconsistent_mates'] + N_DP = nBamFeatures['dp'] + nBAM_REF_MQ = '%g' % nBamFeatures['ref_mq'] + nBAM_ALT_MQ = '%g' % nBamFeatures['alt_mq'] + nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures['z_ranksums_mq'] + nBAM_REF_BQ = '%g' % nBamFeatures['ref_bq'] + nBAM_ALT_BQ = '%g' % nBamFeatures['alt_bq'] + nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures['z_ranksums_bq'] + nBAM_REF_NM = '%g' % nBamFeatures['ref_NM'] + nBAM_ALT_NM = '%g' % nBamFeatures['alt_NM'] + nBAM_NM_Diff = '%g' % nBamFeatures['NM_Diff'] + nBAM_REF_Concordant = nBamFeatures['ref_concordant_reads'] + nBAM_REF_Discordant = nBamFeatures['ref_discordant_reads'] + nBAM_ALT_Concordant = nBamFeatures['alt_concordant_reads'] + nBAM_ALT_Discordant = nBamFeatures['alt_discordant_reads'] + nBAM_Concordance_FET = rescale( + nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) + N_REF_FOR = nBamFeatures['ref_for'] + N_REF_REV = nBamFeatures['ref_rev'] + N_ALT_FOR = nBamFeatures['alt_for'] + N_ALT_REV = nBamFeatures['alt_rev'] + nBAM_StrandBias_FET = rescale( + nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) + nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures['z_ranksums_endpos'] + nBAM_REF_Clipped_Reads = nBamFeatures['ref_SC_reads'] + nBAM_ALT_Clipped_Reads = nBamFeatures['alt_SC_reads'] + nBAM_Clipping_FET = rescale( + nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) + nBAM_MQ0 = nBamFeatures['MQ0'] + nBAM_Other_Reads = nBamFeatures['noise_read_count'] + nBAM_Poor_Reads = nBamFeatures['poor_read_count'] + nBAM_REF_InDel_3bp = nBamFeatures['ref_indel_3bp'] + nBAM_REF_InDel_2bp = nBamFeatures['ref_indel_2bp'] + nBAM_REF_InDel_1bp = nBamFeatures['ref_indel_1bp'] + nBAM_ALT_InDel_3bp = nBamFeatures['alt_indel_3bp'] + nBAM_ALT_InDel_2bp = nBamFeatures['alt_indel_2bp'] + nBAM_ALT_InDel_1bp = nBamFeatures['alt_indel_1bp'] + SOR = sor + MaxHomopolymer_Length = homopolymer_length + SiteHomopolymer_Length = site_homopolymer_length + T_DP = tBamFeatures['dp'] + tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'] + tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'] + tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures['z_ranksums_mq'] + tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'] + tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'] + tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures['z_ranksums_bq'] + tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'] + tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'] + tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'] + tBAM_REF_Concordant 
= tBamFeatures['ref_concordant_reads'] + tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'] + tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'] + tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'] + tBAM_Concordance_FET = rescale( + tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) + T_REF_FOR = tBamFeatures['ref_for'] + T_REF_REV = tBamFeatures['ref_rev'] + T_ALT_FOR = tBamFeatures['alt_for'] + T_ALT_REV = tBamFeatures['alt_rev'] + tBAM_StrandBias_FET = rescale( + tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) + tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures['z_ranksums_endpos'] + tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'] + tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'] + tBAM_Clipping_FET = rescale( + tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) + tBAM_MQ0 = tBamFeatures['MQ0'] + tBAM_Other_Reads = tBamFeatures['noise_read_count'] + tBAM_Poor_Reads = tBamFeatures['poor_read_count'] + tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'] + tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'] + tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'] + tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'] + tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'] + tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'] + InDel_Length = indel_length + + ext_features.append([CHROM, POS, REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, + Consistent_Mates, Inconsistent_Mates, N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, + nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, + nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, + nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, + nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, + nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, + nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, + MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, + tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, + tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, + tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, + tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, + tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, + tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) + return ext_features except Exception as ex: thread_logger.error(traceback.format_exc()) @@ -152,7 +185,8 @@ def extend_standalone_features(candidates_vcf, logger = logging.getLogger(extend_standalone_features.__name__) - logger.info("----------------------Preprocessing------------------------") + logger.info( + "----------------------Extend Standalone Features------------------------") if not os.path.exists(work): os.mkdir(work) @@ -172,34 +206,107 @@ def extend_standalone_features(candidates_vcf, "No normal .bai index file {}".format(normal_bam + ".bai")) if dbsnp: - with gzip.open(dbsnp,'rt') as i_f: + if not os.path.exists(dbsnp): + logger.error("Aborting!") + raise Exception( + "No dbSNP file {}".format(dbsnp)) + + if dbsnp[-6:] != "vcf.gz": + logger.error("Aborting!") + raise Exception( + "The dbSNP file should be a tabix indexed file with .vcf.gz format") + if 
not os.path.exists(dbsnp + ".tbi"): + logger.error("Aborting!") + raise Exception( + "The dbSNP file should be a tabix indexed file with .vcf.gz format. No {}.tbi file exists.".format(dbsnp)) + + if cosmic: + cosmic_vars = {} + with open(cosmic) as i_f: for line in i_f: if not line.strip(): continue if line[0] == "#": continue - print(line) - aaa + x = line.strip().split("\t") + chrom, pos, _, ref, alts, _, _, info = x[0:8] + num_cases = info.split("CNT=")[1].split( + ";")[0] if "CNT=" in info else float('nan') + for alt in alts.split(","): + var_id = "-".join([chrom, pos, ref, alt]) + cosmic_vars[var_id] = num_cases + + n_variants = 0 + with open(candidates_vcf) as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + continue + n_variants += 1 + logger.info("Number of variants: {}".format(n_variants)) + split_len = n_variants // num_threads pool = multiprocessing.Pool(num_threads) map_args = [] with open(candidates_vcf) as i_f: + i = 0 + batch = [] for line in i_f: if not line.strip(): continue if line[0] == "#": continue + chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] - map_args.append((work, reference, tumor_bam, normal_bam, - chrom, pos, ref, alt, min_mapq, min_bq, dbsnp, cosmic)) + var_id = "-".join([chrom, pos, ref, alt]) + num_cosmic_cases = float('nan') + if_cosmic = 0 + if cosmic and var_id in cosmic_vars: + if_cosmic = 1 + num_cosmic_cases = cosmic_vars[var_id] + batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]) + i += 1 + if len(batch) >= split_len or i == n_variants: + map_args.append((work, reference, tumor_bam, normal_bam, + min_mapq, min_bq, dbsnp, batch)) + batch = [] + + header = ["CHROM", "POS", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT", + "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ", "nBAM_Z_Ranksums_MQ", + "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_NM_Diff", + "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", + "nBAM_Concordance_FET", "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_StrandBias_FET", + "nBAM_Z_Ranksums_EndPos", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET", + "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp", + "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp", "SOR", + "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ", "tBAM_Z_Ranksums_MQ", + "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_NM_Diff", + "tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", + "tBAM_Concordance_FET", "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET", + "tBAM_Z_Ranksums_EndPos", "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_Clipping_FET", + "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp", + "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"] + try: ext_features = pool.map_async(extract_features, map_args).get() pool.close() + output_tsv = os.path.join(work, "features.tsv") + with open(output_tsv, "w") as o_f: + o_f.write( + "\t".join(header) + "\n") + for features in ext_features: + for w in features: + o_f.write( + "\t".join(map(lambda x: str(x).replace("nan", "0"), w)) + "\n") except Exception as inst: logger.error(inst) 
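+        # Close the worker pool even on failure so child processes do not
+        # linger before the exception is re-raised below.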
pool.close() traceback.print_exc() raise Exception + logger.info("Done Extending Standalone Features.") + return ext_features + if __name__ == '__main__': FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' From 9cb9960fdfbc3e041b58ebb24a93e7dd5d8e739a Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 11 Mar 2020 17:00:13 -0700 Subject: [PATCH 03/89] fix_extract_ensemble --- ...analone_features.py => extend_features.py} | 107 +++++++++++------- neusomatic/python/generate_dataset.py | 43 ++++--- neusomatic/python/preprocess.py | 42 ++++++- 3 files changed, 130 insertions(+), 62 deletions(-) rename neusomatic/python/{extend_stanalone_features.py => extend_features.py} (77%) diff --git a/neusomatic/python/extend_stanalone_features.py b/neusomatic/python/extend_features.py similarity index 77% rename from neusomatic/python/extend_stanalone_features.py rename to neusomatic/python/extend_features.py index 5f6fef7..8b275de 100755 --- a/neusomatic/python/extend_stanalone_features.py +++ b/neusomatic/python/extend_features.py @@ -1,6 +1,6 @@ #!/usr/bin/env python #------------------------------------------------------------------------- -# extend_standalone_features.py +# extend_features.py # add extra features for standalone mode #------------------------------------------------------------------------- import argparse @@ -19,9 +19,9 @@ def extract_features(candidate_record): - work, reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch = candidate_record + reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch = candidate_record thread_logger = logging.getLogger( - "{} ({})".format(extend_standalone_features.__name__, multiprocessing.current_process().name)) + "{} ({})".format(extract_features.__name__, multiprocessing.current_process().name)) try: tbam = pysam.AlignmentFile(tumor_bam) nbam = pysam.AlignmentFile(normal_bam) @@ -43,7 +43,8 @@ def extract_features(candidate_record): n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev'] t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev'] t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev'] - sor = sequencing_features.somaticOddRatio(n_ref, n_alt, t_ref, t_alt) + sor = sequencing_features.somaticOddRatio( + n_ref, n_alt, t_ref, t_alt) homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( ref_fa, my_coordinate, ref, alt) @@ -60,7 +61,8 @@ def extract_features(candidate_record): 0:8] for alt_ in alts_.split(","): dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) - dbsnp_vars[dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 + dbsnp_vars[ + dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 if var_id in dbsnp_vars: if_dbsnp = 1 if_common = dbsnp_vars[var_id] @@ -153,21 +155,21 @@ def extract_features(candidate_record): tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'] InDel_Length = indel_length - ext_features.append([CHROM, POS, REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, - Consistent_Mates, Inconsistent_Mates, N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, - nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, - nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, - nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, - nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, - nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, - nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, 
nBAM_ALT_InDel_1bp, SOR, - MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, - tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, - tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, - tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, - tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, - tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, - tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) + ext_features.append([CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, + Consistent_Mates, Inconsistent_Mates, N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, + nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, + nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, + nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, + nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, + nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, + nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, + MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, + tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, + tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, + tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, + tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, + tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, + tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) return ext_features except Exception as ex: @@ -176,19 +178,18 @@ def extract_features(candidate_record): return None -def extend_standalone_features(candidates_vcf, - reference, tumor_bam, normal_bam, - min_mapq, min_bq, - dbsnp, cosmic, - num_threads, - work): +def extend_features(candidates_vcf, + exclude_variants, + output_tsv, + reference, tumor_bam, normal_bam, + min_mapq, min_bq, + dbsnp, cosmic, + num_threads): - logger = logging.getLogger(extend_standalone_features.__name__) + logger = logging.getLogger(extend_features.__name__) logger.info( "----------------------Extend Standalone Features------------------------") - if not os.path.exists(work): - os.mkdir(work) if not os.path.exists(tumor_bam): logger.error("Aborting!") @@ -236,6 +237,21 @@ def extend_standalone_features(candidates_vcf, var_id = "-".join([chrom, pos, ref, alt]) cosmic_vars[var_id] = num_cases + if exclude_variants: + exclude_vars = [] + with open(exclude_variants) as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + continue + if exclude_variants.split(".")[-1]=="tsv" and line[0:5]=="CHROM": + continue + x = line.strip().split("\t") + chrom, pos, _, ref, alt = x[0:5] + var_id = "-".join([chrom, pos, ref, alt]) + exclude_vars.append(var_id) + n_variants = 0 with open(candidates_vcf) as i_f: for line in i_f: @@ -259,6 +275,9 @@ def extend_standalone_features(candidates_vcf, chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] var_id = "-".join([chrom, pos, ref, alt]) + if exclude_variants: + if 
var_id in exclude_vars: + continue num_cosmic_cases = float('nan') if_cosmic = 0 if cosmic and var_id in cosmic_vars: @@ -267,11 +286,12 @@ def extend_standalone_features(candidates_vcf, batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]) i += 1 if len(batch) >= split_len or i == n_variants: - map_args.append((work, reference, tumor_bam, normal_bam, + map_args.append((reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch)) batch = [] - header = ["CHROM", "POS", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT", + logger.info("Number of batches: {}".format(len(map_args))) + header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT", "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ", "nBAM_Z_Ranksums_MQ", "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_NM_Diff", "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", @@ -290,7 +310,6 @@ def extend_standalone_features(candidates_vcf, try: ext_features = pool.map_async(extract_features, map_args).get() pool.close() - output_tsv = os.path.join(work, "features.tsv") with open(output_tsv, "w") as o_f: o_f.write( "\t".join(header) + "\n") @@ -317,6 +336,10 @@ def extend_standalone_features(candidates_vcf, description='extract extra features for standalone mode') parser.add_argument('--candidates_vcf', type=str, help='candidates vcf', required=True) + parser.add_argument('--exclude_variants', type=str, help='variants to exclude', + default=None) + parser.add_argument('--output_tsv', type=str, help='output features tsv', + required=True) parser.add_argument('--reference', type=str, help='reference fasta filename', required=True) parser.add_argument('--tumor_bam', type=str, @@ -333,23 +356,23 @@ def extend_standalone_features(candidates_vcf, help='COSMIC vcf (to annotate candidate variants)', default=None) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) - parser.add_argument('--work', type=str, - help='work directory', required=True) args = parser.parse_args() logger.info(args) try: - output = extend_standalone_features(args.candidates_vcf, - args.reference, args.tumor_bam, args.normal_bam, - args.min_mapq, args.min_bq, - args.dbsnp, args.cosmic, - args.num_threads, - args.work) + output = extend_features(args.candidates_vcf, + args.exclude_variants, + args.output_tsv, + args.reference, args.tumor_bam, args.normal_bam, + args.min_mapq, args.min_bq, + args.dbsnp, args.cosmic, + args.num_threads, + ) if output is None: - raise Exception("extend_standalone_features failed!") + raise Exception("extend_features failed!") except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") logger.error( - "extend_standalone_features.py failure on arguments: {}".format(args)) + "extend_features.py failure on arguments: {}".format(args)) raise e diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 01bd2d7..908b781 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1369,7 +1369,7 @@ def find_records(input_record): return None -def extract_ensemble(work, ensemble_tsv): +def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend): logger = logging.getLogger(extract_ensemble.__name__) ensemble_data = [] ensemble_pos = [] @@ -1399,15 +1399,23 @@ def extract_ensemble(work, ensemble_tsv): "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", 
"tBAM_REF_InDel_2bp", "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"] + callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", + "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS", + "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", + "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] + + n_vars = 0 with open(ensemble_tsv) as s_f: for line in s_f: if not line.strip(): continue if line[0:5] == "CHROM": header_pos = line.strip().split()[0:5] - header = line.strip().split()[5:105] + header_ = line.strip().split()[5:] + if is_extend: + header_ += callers_features header_en = list(filter( - lambda x: x[1] in expected_features, enumerate(line.strip().split()[5:]))) + lambda x: x[1] in expected_features, enumerate(header_))) header = list(map(lambda x: x[1], header_en)) if set(expected_features) - set(header): logger.error("The following features are missing from ensemble file: {}".format( @@ -1420,9 +1428,15 @@ def extract_ensemble(work, ensemble_tsv): fields = line.strip().split() fields[2] = str(int(fields[1]) + len(fields[3])) ensemble_pos.append(fields[0:5]) + features = fields[5:] + if is_extend: + features += ["0"] * len(callers_features) ensemble_data.append(list(map(lambda x: float( - x.replace("False", "0").replace("True", "1")), fields[5:]))) - ensemble_data = np.array(ensemble_data)[:, order_header] + x.replace("False", "0").replace("True", "1")), features))) + n_vars += 1 + if n_vars > 0: + ensemble_data = np.array(ensemble_data)[:, order_header] + header = np.array(header)[order_header].tolist() cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ "Consistent_Mates", "Inconsistent_Mates", "N_DP", @@ -1502,14 +1516,14 @@ def extract_ensemble(work, ensemble_tsv): ] selected_features = sorted([i for f in min_max_features for i in f[0]]) selected_features_tags = list(map(lambda x: header[x], selected_features)) - for i_s, mn, mx in min_max_features: - s = ensemble_data[:, np.array(i_s)] - s = np.maximum(np.minimum(s, mx), mn) - s = (s - mn) / (mx - mn) - ensemble_data[:, np.array(i_s)] = s - ensemble_data = ensemble_data[:, selected_features] - ensemble_data = ensemble_data.tolist() - ensemble_bed = os.path.join(work, "ensemble.bed") + if n_vars > 0: + for i_s, mn, mx in min_max_features: + s = ensemble_data[:, np.array(i_s)] + s = np.maximum(np.minimum(s, mx), mn) + s = (s - mn) / (mx - mn) + ensemble_data[:, np.array(i_s)] = s + ensemble_data = ensemble_data[:, selected_features] + ensemble_data = ensemble_data.tolist() with open(ensemble_bed, "w")as f_: f_.write( "#" + "\t".join(map(str, header_pos + selected_features_tags)) + "\n") @@ -1546,7 +1560,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be split_batch_size = 10000 if ensemble_tsv and not ensemble_bed: - ensemble_bed = extract_ensemble(work, ensemble_tsv) + ensemble_bed = os.path.join(work, "ensemble.bed") + extract_ensemble(ensemble_tsv, ensemble_bed, False) cmd = "bedtools intersect -a {} -b {} -u".format( tumor_pred_vcf_file, region_bed_file) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index b227def..a82ddf7 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -19,7 +19,8 @@ from filter_candidates import filter_candidates from generate_dataset import generate_dataset, extract_ensemble from scan_alignments 
import scan_alignments -from utils import concatenate_vcfs, run_bedtools_cmd +from extend_features import extend_features +from utils import concatenate_files, concatenate_vcfs, run_bedtools_cmd def split_dbsnp(record): @@ -196,10 +197,10 @@ def extract_candidate_split_regions( for line in f_: if not line.strip(): continue - if line[0]!="#": + if line[0] != "#": is_empty = False break - logger.info([filtered_vcf,is_empty]) + logger.info([filtered_vcf, is_empty]) if not is_empty: cmd = '''grep -v "#" {}'''.format(filtered_vcf) candidates_bed = run_bedtools_cmd(cmd, run_logger=logger) @@ -219,7 +220,6 @@ def extract_candidate_split_regions( prefix="tmpbed_", suffix=".bed", delete=False) candidates_bed = candidates_bed.name - if ensemble_beds: cmd = "cat {} {}".format( candidates_bed, @@ -255,6 +255,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, matrix_width, matrix_base_pad, min_ev_frac_per_col, ensemble_tsv, long_read, restart, first_do_without_qual, filter_duplicate, + add_extra_features, num_threads, scan_alignments_binary,): logger = logging.getLogger(preprocess.__name__) @@ -289,7 +290,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, ensemble_bed = os.path.join(work, "ensemble.bed") logger.info("Extract ensemble info.") if restart or not os.path.exists(ensemble_bed): - ensemble_bed = extract_ensemble(work, ensemble_tsv) + extract_ensemble(ensemble_tsv, ensemble_bed, False) merge_d_for_short_read = 100 candidates_split_regions = [] @@ -380,10 +381,35 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, if os.path.exists(work_dataset_split): shutil.rmtree(work_dataset_split) os.mkdir(work_dataset_split) + ensemble_bed_i = ensemble_beds[i] if ensemble_tsv else None + if add_extra_features: + extra_features_tsv = os.path.join( + work_dataset_split, "ex_features.tsv") + extra_features = extend_features(filtered_vcf, + ensemble_beds[ + i] if ensemble_tsv else None, + extra_features_tsv, + reference, tumor_bam, normal_bam, + min_mapq, snp_min_bq, + dbsnp, None, + num_threads) + extra_features_bed = os.path.join( + work_dataset_split, "ex_features.bed") + extract_ensemble(extra_features_tsv, extra_features_bed, True) + if ensemble_tsv: + merged_features_bed = os.path.join( + work_dataset_split, "merged_features.bed") + concatenate_files([extra_features_bed, ensemble_beds[ + i]], merged_features_bed, check_file_existence=True) + ensemble_bed_i = merged_features_bed + else: + ensemble_bed_i = extra_features_bed + + generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf, candidates_split_region, tumor_count, normal_count, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, - ensemble_beds[i] if ensemble_tsv else None, tsv_batch_size) + ensemble_bed_i, tsv_batch_size) shutil.rmtree(bed_tempdir) tempfile.tempdir = original_tempdir @@ -465,6 +491,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--filter_duplicate', help='filter duplicate reads when preparing pileup information', action="store_true") + parser.add_argument('--add_extra_features', + help='add extra input features', + action="store_true") parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--scan_alignments_binary', type=str, @@ -482,6 +511,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.truth_vcf, args.tsv_batch_size, 
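The is_empty probe above walks the filtered VCF until it finds a non-header record, since running grep and bedtools over a header-only VCF would feed empty BEDs downstream. The check, reduced to a helper (path is hypothetical):

    def vcf_is_empty(vcf_path):
        # True when the VCF holds only '#' header lines or blank lines.
        with open(vcf_path) as f:
            for line in f:
                if line.strip() and line[0] != "#":
                    return False
        return True

    # vcf_is_empty("work_tumor/filtered_candidates.vcf")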
args.matrix_width, args.matrix_base_pad, args.min_ev_frac_per_col, args.ensemble_tsv, args.long_read, args.restart, args.first_do_without_qual, args.filter_duplicate, + args.add_extra_features, args.num_threads, args.scan_alignments_binary) except Exception as e: From 58ccf62f48f5969b03d7e0988eb7143d3d065f4e Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 12 Mar 2020 14:23:35 -0700 Subject: [PATCH 04/89] fix dirnames --- neusomatic/python/preprocess.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index a82ddf7..3bfe814 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -383,8 +383,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, os.mkdir(work_dataset_split) ensemble_bed_i = ensemble_beds[i] if ensemble_tsv else None if add_extra_features: + work_tumor_i = os.dirname(filtered_vcf) extra_features_tsv = os.path.join( - work_dataset_split, "ex_features.tsv") + work_tumor_i, "extra_features.tsv") extra_features = extend_features(filtered_vcf, ensemble_beds[ i] if ensemble_tsv else None, @@ -394,7 +395,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, dbsnp, None, num_threads) extra_features_bed = os.path.join( - work_dataset_split, "ex_features.bed") + work_dataset_split, "extra_features.bed") extract_ensemble(extra_features_tsv, extra_features_bed, True) if ensemble_tsv: merged_features_bed = os.path.join( From cab5ff7772d8e8d1816e8a22ed405f7f37b4c37b Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 13 Mar 2020 19:26:46 -0700 Subject: [PATCH 05/89] small fix --- neusomatic/python/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index ef79035..e5b4bfc 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -345,7 +345,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, os.mkdir(work_dataset_split) ensemble_bed_i = ensemble_beds[i] if ensemble_tsv else None if add_extra_features: - work_tumor_i = os.dirname(filtered_vcf) + work_tumor_i = os.path.dirname(filtered_vcf) extra_features_tsv = os.path.join( work_tumor_i, "extra_features.tsv") extra_features = extend_features(filtered_vcf, From efa419098cfcc57915ae42cf5e9a7cf2ae5c0777 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 19 Mar 2020 16:39:20 -0700 Subject: [PATCH 06/89] fix features --- neusomatic/python/preprocess.py | 69 +++++++++++++++-------- neusomatic/python/read_info_extractor.py | 71 ------------------------ 2 files changed, 47 insertions(+), 93 deletions(-) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index e5b4bfc..a23a515 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -249,7 +249,6 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, raise Exception( "The dbSNP file should be a tabix indexed file with .vcf.gz format. 
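Note the two-step history here: [PATCH 04/89] introduces os.dirname(filtered_vcf), which raises AttributeError because the function lives in os.path, and [PATCH 05/89] below corrects it. For reference:

    import os

    path = "work_tumor_0/filtered_candidates.vcf"  # illustrative path
    # os.dirname(path)           # AttributeError: module 'os' has no attribute 'dirname'
    print(os.path.dirname(path))  # -> 'work_tumor_0'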
No {}.tbi file exists.".format(dbsnp)) - ensemble_bed = None if ensemble_tsv: ensemble_bed = os.path.join(work, "ensemble.bed") @@ -322,15 +321,15 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, os.mkdir(work_normal) logger.info("Scan normal bam (and extracting quality scores).") normal_counts, _, _ = process_split_region("normal", work_normal, region_bed, reference, mode, normal_bam, - None, scan_window_size, 0.2, min_mapq, - None, min_dp, max_dp, - filter_duplicate, - good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, - ins_min_af, del_min_af, del_merge_min_af, - ins_merge_min_af, merge_r, - scan_alignments_binary, restart, num_threads, - calc_qual=True, - regions=candidates_split_regions) + None, scan_window_size, 0.2, min_mapq, + None, min_dp, max_dp, + filter_duplicate, + good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, + ins_min_af, del_min_af, del_merge_min_af, + ins_merge_min_af, merge_r, + scan_alignments_binary, restart, num_threads, + calc_qual=True, + regions=candidates_split_regions) work_dataset = os.path.join(work, "dataset") if restart or not os.path.exists(work_dataset): @@ -348,27 +347,53 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_tumor_i = os.path.dirname(filtered_vcf) extra_features_tsv = os.path.join( work_tumor_i, "extra_features.tsv") - extra_features = extend_features(filtered_vcf, - ensemble_beds[ - i] if ensemble_tsv else None, - extra_features_tsv, - reference, tumor_bam, normal_bam, - min_mapq, snp_min_bq, - dbsnp, None, - num_threads) + if not os.path.exists(extra_features_tsv) or restart: + extend_features(filtered_vcf, + ensemble_beds[ + i] if ensemble_tsv else None, + extra_features_tsv, + reference, tumor_bam, normal_bam, + min_mapq, snp_min_bq, + dbsnp, None, + num_threads) extra_features_bed = os.path.join( work_dataset_split, "extra_features.bed") - extract_ensemble(extra_features_tsv, extra_features_bed, True) + if not os.path.exists(extra_features_bed) or restart: + extract_ensemble(extra_features_tsv, extra_features_bed, True) if ensemble_tsv: merged_features_bed = os.path.join( work_dataset_split, "merged_features.bed") - concatenate_files([extra_features_bed, ensemble_beds[ - i]], merged_features_bed, check_file_existence=True) + if not os.path.exists(merged_features_bed) or restart: + exclude_ens_variants = [] + with open(merged_features_bed, "w") as o_f: + with open(ensemble_beds[i]) as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + o_f.write(line) + continue + chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] + var_id = "-".join([chrom, pos, ref, alt]) + exclude_ens_variants.append(var_id) + o_f.write(line) + with open(extra_features_bed) as i_f: + for line in i_f: + if not line.strip(): + continue + if line[0] == "#": + continue + chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] + var_id = "-".join([chrom, pos, ref, alt]) + if var_id in exclude_ens_variants: + continue + o_f.write(line) + # concatenate_files([extra_features_bed, ensemble_beds[ + # i]], merged_features_bed, check_file_existence=True) ensemble_bed_i = merged_features_bed else: ensemble_bed_i = extra_features_bed - generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf, candidates_split_region, tumor_count, normal_count, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index 4dcec80..b5bf75d 100644 --- 
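[PATCH 06/89] wraps each expensive artifact in an `if not os.path.exists(...) or restart:` guard, which makes preprocessing resumable: files left by a previous run are reused unless a restart is forced. The idiom in isolation (names are illustrative):

    import os

    def run_step(output_path, restart, compute):
        # Recompute only when the artifact is missing or a restart is forced;
        # otherwise trust the file produced by an earlier run.
        if restart or not os.path.exists(output_path):
            result = compute()
            with open(output_path, "w") as f:
                f.write(result)
        return output_path

    run_step("extra_features.tsv", False, lambda: "CHROM\tPOS\n")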
a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -217,74 +217,3 @@ def rescale(x, original='fraction', rescale_to=None, max_phred=1001): return y - - - - -##### Stuff from VarDict: -def find_MSI(vcf_object): - - msi = vcf_object.get_info_value('MSI') - if msi: - msi = float(msi) - else: - msi = nan - return msi - - -def find_MSILEN(vcf_object): - - msilen = vcf_object.get_info_value('MSILEN') - if msilen: - msilen = float(msilen) - else: - msilen = nan - return msilen - - -def find_SHIFT3(vcf_object): - - shift3 = vcf_object.get_info_value('SHIFT3') - if shift3: - shift3 = float(shift3) - else: - shift3 = nan - return shift3 - - - -# MuTect2's Stuff: -def mutect2_nlod(vcf_object): - nlod = vcf_object.get_info_value('NLOD') - if nlod: - return float(nlod) - else: - return nan - - -def mutect2_tlod(vcf_object): - tlod = vcf_object.get_info_value('TLOD') - if tlod: - return float(tlod) - else: - return nan - - -def mutect2_STR(vcf_object): - if vcf_object.get_info_value('STR'): - return 1 - else: - return 0 - - -def mutect2_ECNT(vcf_object): - ecnt = vcf_object.get_info_value('ECNT') - if ecnt: - try: - ecnt = int( ecnt ) - except ValueError: - ecnt = nan - else: - ecnt = nan - - return ecnt From e4be780ed6e00f45b2445afe8ed3073c90b953db Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 10 Apr 2020 00:15:55 -0700 Subject: [PATCH 07/89] fix ensemble --- neusomatic/python/call.py | 15 ++++++++++----- neusomatic/python/train.py | 16 +++++++++++----- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 073d674..6f3c5fb 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -395,7 +395,6 @@ def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_thr def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, batch_size, max_load_candidates, pass_threshold, lowqual_threshold, - ensemble, use_cuda): logger = logging.getLogger(call_neusomatic.__name__) @@ -412,7 +411,17 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, vartype_classes = ['DEL', 'INS', 'NONE', 'SNP'] data_transform = matrix_transform((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + + ensemble = False + with open(candidates_tsv[0]) as i_f: + for line in i_f: + x = line.strip().split() + if len(x) == 97: + ensemble = True + break + num_channels = 119 if ensemble else 26 + logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: logger.info("GPU calling!") @@ -583,9 +592,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, help='output directory', required=True) parser.add_argument('--checkpoint', type=str, help='network model checkpoint path', required=True) - parser.add_argument('--ensemble', - help='Enable calling for ensemble mode', - action="store_true") parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--batch_size', type=int, @@ -607,7 +613,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, args.checkpoint, args.num_threads, args.batch_size, args.max_load_candidates, args.pass_threshold, args.lowqual_threshold, - args.ensemble, use_cuda) except Exception as e: logger.error(traceback.format_exc()) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 3cc5fac..35b1f1e 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -201,7 
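With [PATCH 07/89], call.py stops requiring the --ensemble flag and instead sniffs the mode from the width of the first data row in the candidates TSV: 97 columns means the 93 ensemble features are present (plus 4 bookkeeping columns), which selects 119 = 93 + 26 input channels instead of the 26 standalone ones. The detection, restated in the form the later cleanup ([PATCH 09/89]) settles on:

    NUM_ENS_FEATURES = 93  # ensemble feature columns
    NUM_ST_FEATURES = 26   # standalone input channels

    def detect_ensemble(tsv_path):
        # The first row's column count reveals whether ensemble
        # features were appended to the candidate matrix.
        with open(tsv_path) as f:
            n_cols = len(f.readline().strip().split())
        return n_cols == NUM_ENS_FEATURES + 4

    def num_channels(tsv_path):
        return (NUM_ENS_FEATURES + NUM_ST_FEATURES
                if detect_ensemble(tsv_path) else NUM_ST_FEATURES)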
+201,7 @@ def __len__(self): def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpoint, num_threads, batch_size, max_epochs, learning_rate, lr_drop_epochs, lr_drop_ratio, momentum, boost_none, none_count_scale, - max_load_candidates, coverage_thr, save_freq, ensemble, + max_load_candidates, coverage_thr, save_freq, merged_candidates_per_tsv, merged_max_num_tsvs, overwrite_merged_tsvs, train_split_len, normalize_channels, @@ -219,7 +219,17 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo torch.set_num_threads(num_threads) data_transform = matrix_transform((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + + ensemble = False + with open(candidates_tsv[0]) as i_f: + for line in i_f: + x=line.strip().split() + if len(x)==97: + ensemble=True + break + num_channels = 119 if ensemble else 26 + logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: logger.info("GPU training!") @@ -507,9 +517,6 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo help='pretrained network model checkpoint path', default=None) parser.add_argument('--validation_candidates_tsv', nargs="*", help=' validation candidate tsv files', default=[]) - parser.add_argument('--ensemble', - help='Enable training for ensemble mode', - action="store_true") parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--batch_size', type=int, @@ -568,7 +575,6 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo args.lr, args.lr_drop_epochs, args.lr_drop_ratio, args.momentum, args.boost_none, args.none_count_scale, args.max_load_candidates, args.coverage_thr, args.save_freq, - args.ensemble, args.merged_candidates_per_tsv, args.merged_max_num_tsvs, args.overwrite_merged_tsvs, args.train_split_len, args.normalize_channels, From 8fc85830d8a184b7d112fde190b51591c1ea6565 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 21 Apr 2020 17:12:04 -0700 Subject: [PATCH 08/89] backward compatiblity for call.py --- neusomatic/python/call.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 6f3c5fb..25b76f9 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -592,6 +592,9 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, help='output directory', required=True) parser.add_argument('--checkpoint', type=str, help='network model checkpoint path', required=True) + parser.add_argument('--ensemble', + help='Enable calling for ensemble mode', + action="store_true") parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--batch_size', type=int, From 8e8ec699ab38d26d42f160e222cbd74ab233f92b Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 28 Apr 2020 18:59:37 -0700 Subject: [PATCH 09/89] few fixes --- neusomatic/python/call.py | 12 ++++---- neusomatic/python/defaults.py | 5 ++-- neusomatic/python/extend_features.py | 28 ++++--------------- neusomatic/python/preprocess.py | 42 +++++++++++----------------- neusomatic/python/train.py | 13 ++++----- 5 files changed, 36 insertions(+), 64 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 0527245..6afb6f1 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -27,7 +27,7 @@ from dataloader import NeuSomaticDataset, matrix_transform from utils import get_chromosomes_order, prob2phred from 
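[PATCH 08/89] then restores --ensemble purely so existing wrappers that still pass the flag keep working; the value no longer drives behavior. A common way to keep such a flag while telling users it is now a no-op (an editorial sketch, not what the patch itself does):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--ensemble', action='store_true',
                        help='deprecated; ensemble mode is now auto-detected')
    args = parser.parse_args(['--ensemble'])
    if args.ensemble:
        # Accept the flag for backward compatibility, but only warn.
        print("warning: --ensemble is deprecated and has no effect")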
merge_tsvs import merge_tsvs -from defaults import VARTYPE_CLASSES +from defaults import VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES import torch._utils try: @@ -414,13 +414,11 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, ensemble = False with open(candidates_tsv[0]) as i_f: - for line in i_f: - x = line.strip().split() - if len(x) == 97: - ensemble = True - break + x=i_f.readline().strip().split() + if len(x) == NUM_ENS_FEATURES+4: + ensemble = True - num_channels = 119 if ensemble else 26 + num_channels = NUM_ENS_FEATURES+NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: diff --git a/neusomatic/python/defaults.py b/neusomatic/python/defaults.py index d249a61..97a90ee 100644 --- a/neusomatic/python/defaults.py +++ b/neusomatic/python/defaults.py @@ -1,4 +1,5 @@ -NUM_ENS_FEATURES=93 -VCF_HEADER="##fileformat=VCFv4.2" +NUM_ENS_FEATURES = 93 +NUM_ST_FEATURES = 26 +VCF_HEADER = "##fileformat=VCFv4.2" TYPE_CLASS_DICT = {"DEL": 0, "INS": 1, "NONE": 2, "SNP": 3} VARTYPE_CLASSES = ['DEL', 'INS', 'NONE', 'SNP'] \ No newline at end of file diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 8b275de..f5b2d3b 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -16,6 +16,7 @@ import sequencing_features import genomic_file_handlers as genome from read_info_extractor import rescale +from utils import skip_empty def extract_features(candidate_record): @@ -224,11 +225,7 @@ def extend_features(candidates_vcf, if cosmic: cosmic_vars = {} with open(cosmic) as i_f: - for line in i_f: - if not line.strip(): - continue - if line[0] == "#": - continue + for line in skip_empty(i_f): x = line.strip().split("\t") chrom, pos, _, ref, alts, _, _, info = x[0:8] num_cases = info.split("CNT=")[1].split( @@ -240,11 +237,7 @@ def extend_features(candidates_vcf, if exclude_variants: exclude_vars = [] with open(exclude_variants) as i_f: - for line in i_f: - if not line.strip(): - continue - if line[0] == "#": - continue + for line in skip_empty(i_f): if exclude_variants.split(".")[-1]=="tsv" and line[0:5]=="CHROM": continue x = line.strip().split("\t") @@ -254,25 +247,16 @@ def extend_features(candidates_vcf, n_variants = 0 with open(candidates_vcf) as i_f: - for line in i_f: - if not line.strip(): - continue - if line[0] == "#": - continue + for line in skip_empty(i_f): n_variants += 1 logger.info("Number of variants: {}".format(n_variants)) - split_len = n_variants // num_threads + split_len = (n_variants + num_threads - 1) // num_threads pool = multiprocessing.Pool(num_threads) map_args = [] with open(candidates_vcf) as i_f: i = 0 batch = [] - for line in i_f: - if not line.strip(): - continue - if line[0] == "#": - continue - + for line in skip_empty(i_f): chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] var_id = "-".join([chrom, pos, ref, alt]) if exclude_variants: diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 0c57e98..fb4a6e2 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -348,33 +348,23 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_dataset_split, "merged_features.bed") if not os.path.exists(merged_features_bed) or restart: exclude_ens_variants = [] - with open(merged_features_bed, "w") as o_f: - with open(ensemble_beds[i]) as i_f: - for line in i_f: - if not 
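[PATCH 09/89] also switches split_len to ceiling division. With floor division, the n_variants % num_threads leftover variants spill into an extra batch beyond the intended num_threads; the (n + k - 1) // k form caps the batch count at num_threads. Concretely:

    n_variants, num_threads = 10, 4
    floor_len = n_variants // num_threads                     # 2 -> five batches: 2,2,2,2,2
    ceil_len = (n_variants + num_threads - 1) // num_threads  # 3 -> four batches: 3,3,3,1
    print(floor_len, ceil_len)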
line.strip(): - continue - if line[0] == "#": - o_f.write(line) - continue - chrom, pos, _, ref, alt = line.strip().split("\t")[ - 0:5] - var_id = "-".join([chrom, pos, ref, alt]) - exclude_ens_variants.append(var_id) + with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: + for line in skip_empty(i_f_1, skip_header=False): + if line.startswith("#"): o_f.write(line) - with open(extra_features_bed) as i_f: - for line in i_f: - if not line.strip(): - continue - if line[0] == "#": - continue - chrom, pos, _, ref, alt = line.strip().split("\t")[ - 0:5] - var_id = "-".join([chrom, pos, ref, alt]) - if var_id in exclude_ens_variants: - continue - o_f.write(line) - # concatenate_files([extra_features_bed, ensemble_beds[ - # i]], merged_features_bed, check_file_existence=True) + continue + chrom, pos, _, ref, alt = line.strip().split("\t")[ + 0:5] + var_id = "-".join([chrom, pos, ref, alt]) + exclude_ens_variants.append(var_id) + o_f.write(line) + for line in skip_empty(i_f_2): + chrom, pos, _, ref, alt = line.strip().split("\t")[ + 0:5] + var_id = "-".join([chrom, pos, ref, alt]) + if var_id in exclude_ens_variants: + continue + o_f.write(line) ensemble_bed_i = merged_features_bed else: ensemble_bed_i = extra_features_bed diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 6bf6346..acc44c1 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -24,7 +24,7 @@ from network import NeuSomaticNet from dataloader import NeuSomaticDataset, matrix_transform from merge_tsvs import merge_tsvs -from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES +from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES import torch._utils try: @@ -220,13 +220,12 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo ensemble = False with open(candidates_tsv[0]) as i_f: - for line in i_f: - x=line.strip().split() - if len(x)==97: - ensemble=True - break + x=i_f.readline().strip().split() + if len(x) == NUM_ENS_FEATURES+4: + ensemble = True + + num_channels = NUM_ENS_FEATURES + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES - num_channels = 119 if ensemble else 26 logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: From 2a845091d50a1335dc2d194f3c9115541eae8bc8 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 28 Apr 2020 19:12:29 -0700 Subject: [PATCH 10/89] fix format --- neusomatic/python/call.py | 6 +- neusomatic/python/defaults.py | 2 +- neusomatic/python/extend_features.py | 2 +- .../python/extract_postprocess_targets.py | 1 + neusomatic/python/filter_candidates.py | 5 +- neusomatic/python/generate_dataset.py | 8 +- neusomatic/python/genomic_file_handlers.py | 191 ++++++++---------- neusomatic/python/long_read_indelrealign.py | 42 ++-- neusomatic/python/merge_post_vcfs.py | 1 + neusomatic/python/postprocess.py | 9 +- neusomatic/python/preprocess.py | 3 +- neusomatic/python/read_info_extractor.py | 110 +++++----- neusomatic/python/resolve_scores.py | 2 +- neusomatic/python/scan_alignments.py | 1 + neusomatic/python/sequencing_features.py | 190 ++++++++--------- neusomatic/python/train.py | 8 +- neusomatic/python/utils.py | 6 +- 17 files changed, 290 insertions(+), 297 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 6afb6f1..b6eb055 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -414,11 +414,11 @@ def call_neusomatic(candidates_tsv, 
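The merged_features.bed rewrite keeps every ensemble record and appends only those extra-feature records whose chrom-pos-ref-alt key has not been seen. The patch accumulates seen keys in a list, so each membership test is O(n); a set gives O(1) lookups when both files are large. A sketch under that assumption (file names illustrative):

    def merge_feature_beds(ensemble_bed, extra_bed, out_bed):
        seen = set()
        with open(out_bed, "w") as o_f:
            with open(ensemble_bed) as i_f:
                for line in i_f:
                    if not line.strip():
                        continue
                    if line[0] == "#":
                        o_f.write(line)  # keep header lines
                        continue
                    chrom, pos, _, ref, alt = line.strip().split("\t")[0:5]
                    seen.add("-".join([chrom, pos, ref, alt]))
                    o_f.write(line)
            with open(extra_bed) as i_f:
                for line in i_f:
                    if not line.strip() or line[0] == "#":
                        continue
                    chrom, pos, _, ref, alt = line.strip().split("\t")[0:5]
                    if "-".join([chrom, pos, ref, alt]) in seen:
                        continue  # the ensemble record takes precedence
                    o_f.write(line)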
ref_file, out_dir, checkpoint, num_threads, ensemble = False with open(candidates_tsv[0]) as i_f: - x=i_f.readline().strip().split() - if len(x) == NUM_ENS_FEATURES+4: + x = i_f.readline().strip().split() + if len(x) == NUM_ENS_FEATURES + 4: ensemble = True - num_channels = NUM_ENS_FEATURES+NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES + num_channels = NUM_ENS_FEATURES + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: diff --git a/neusomatic/python/defaults.py b/neusomatic/python/defaults.py index 97a90ee..4cf0d21 100644 --- a/neusomatic/python/defaults.py +++ b/neusomatic/python/defaults.py @@ -2,4 +2,4 @@ NUM_ST_FEATURES = 26 VCF_HEADER = "##fileformat=VCFv4.2" TYPE_CLASS_DICT = {"DEL": 0, "INS": 1, "NONE": 2, "SNP": 3} -VARTYPE_CLASSES = ['DEL', 'INS', 'NONE', 'SNP'] \ No newline at end of file +VARTYPE_CLASSES = ['DEL', 'INS', 'NONE', 'SNP'] diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index f5b2d3b..71afffd 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -238,7 +238,7 @@ def extend_features(candidates_vcf, exclude_vars = [] with open(exclude_variants) as i_f: for line in skip_empty(i_f): - if exclude_variants.split(".")[-1]=="tsv" and line[0:5]=="CHROM": + if exclude_variants.split(".")[-1] == "tsv" and line[0:5] == "CHROM": continue x = line.strip().split("\t") chrom, pos, _, ref, alt = x[0:5] diff --git a/neusomatic/python/extract_postprocess_targets.py b/neusomatic/python/extract_postprocess_targets.py index aa96857..c3dee50 100755 --- a/neusomatic/python/extract_postprocess_targets.py +++ b/neusomatic/python/extract_postprocess_targets.py @@ -11,6 +11,7 @@ from utils import skip_empty from defaults import VCF_HEADER + def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): logger = logging.getLogger(extract_postprocess_targets.__name__) diff --git a/neusomatic/python/filter_candidates.py b/neusomatic/python/filter_candidates.py index 39a1fdf..58fc628 100755 --- a/neusomatic/python/filter_candidates.py +++ b/neusomatic/python/filter_candidates.py @@ -276,14 +276,15 @@ def filter_candidates(candidate_record): for record in final_records: if dbsnp: chrom, pos, ref, alt = record[0:4] - var_id = "-".join(map(str,[chrom, pos, ref, alt])) + var_id = "-".join(map(str, [chrom, pos, ref, alt])) region = "{}:{}-{}".format(chrom, pos, pos + 1) dbsnp_vars = [] for x in dbsnp_tb.fetch(region=region): chrom_, pos_, _, ref_, alts_ = x.strip().split("\t")[ 0:5] for alt_ in alts_.split(","): - dbsnp_var_id = "-".join(map(str,[chrom_, pos_, ref_, alt_])) + dbsnp_var_id = "-".join(map(str, + [chrom_, pos_, ref_, alt_])) dbsnp_vars.append(dbsnp_var_id) if var_id in dbsnp_vars: continue diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index fe4570d..5daca75 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -25,6 +25,7 @@ NUC_to_NUM_tabix = {"A": 1, "C": 2, "G": 3, "T": 4, "-": 0} + def get_type(ref, alt): logger = logging.getLogger(get_type.__name__) len_diff = len(ref) - len(alt.split(",")[0]) @@ -862,11 +863,12 @@ def find_records(input_record): concatenate_vcfs( [split_pred_vcf_file, split_missed_ensemble_bed_file], split_pred_with_missed_file) - tmp_=get_tmp_file() - with open(split_pred_with_missed_file) as i_f, open(tmp_,"w") as o_f: + tmp_ = get_tmp_file() + with open(split_pred_with_missed_file) 
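The reformatted filter_candidates hunk above shows how dbSNP membership is tested: fetch the tabix-indexed VCF around each candidate position and compare chrom-pos-ref-alt keys, one per comma-separated ALT. A hedged sketch of that lookup (assumes a bgzipped dbSNP VCF with a .tbi index):

    import pysam

    def in_dbsnp(dbsnp_tb, chrom, pos, ref, alt):
        # dbsnp_tb is a pysam.TabixFile over a bgzipped, tabix-indexed VCF.
        var_id = "-".join(map(str, [chrom, pos, ref, alt]))
        region = "{}:{}-{}".format(chrom, pos, pos + 1)
        for rec in dbsnp_tb.fetch(region=region):
            chrom_, pos_, _, ref_, alts_ = rec.strip().split("\t")[0:5]
            for alt_ in alts_.split(","):
                if var_id == "-".join(map(str, [chrom_, pos_, ref_, alt_])):
                    return True
        return False

    # dbsnp_tb = pysam.TabixFile("dbsnp.vcf.gz")
    # in_dbsnp(dbsnp_tb, "1", 10177, "A", "AC")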
as i_f, open(tmp_, "w") as o_f: for line in skip_empty(i_f): x = line.strip().split("\t") - o_f.write("\t".join(list(map(str,[x[0],x[1],".",x[3],x[4],".",".",".",".","."])))+"\n") + o_f.write("\t".join( + list(map(str, [x[0], x[1], ".", x[3], x[4], ".", ".", ".", ".", "."]))) + "\n") bedtools_sort(tmp_, output_fn=split_pred_with_missed_file, run_logger=thread_logger) not_in_ensemble_bed = bedtools_window( diff --git a/neusomatic/python/genomic_file_handlers.py b/neusomatic/python/genomic_file_handlers.py index cd19a26..6e45a3a 100644 --- a/neusomatic/python/genomic_file_handlers.py +++ b/neusomatic/python/genomic_file_handlers.py @@ -1,10 +1,16 @@ #!/usr/bin/env python3 from pysam import AlignmentFile -import sys, os, gzip, re, math +import sys +import os +import gzip +import re +import math -# The regular expression pattern for "chrXX 1234567" in both VarScan2 Output and VCF files: -pattern_major_chr_position = re.compile(r'^(?:chr)?(?:[1-9]|1[0-9]|2[0-2]|[XY]|MT?)\t[0-9]+\b') +# The regular expression pattern for "chrXX 1234567" in both VarScan2 +# Output and VCF files: +pattern_major_chr_position = re.compile( + r'^(?:chr)?(?:[1-9]|1[0-9]|2[0-2]|[XY]|MT?)\t[0-9]+\b') # More lenient pattern: pattern_chr_position = re.compile(r'[^\t]+\t[0-9]+\b') @@ -13,13 +19,15 @@ # Valid Phred+33 quality strings: valid_q = set() -[valid_q.add( chr(33+i) ) for i in range(42)]; +[valid_q.add(chr(33 + i)) for i in range(42)] nan = float('nan') inf = float('inf') -AA_3to1 = {"Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C", "Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I", "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P", "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V"} -AA_1to3 = {"A": "Ala", "R": "Arg", "N": "Asn", "D": "Asp", "C": "Cys", "E": "Glu", "Q": "Gln", "G": "Gly", "H": "His", "I": "Ile", "L": "Leu", "K": "Lys", "M": "Met", "F": "Phe", "P": "Pro", "S": "Ser", "T": "Thr", "W": "Trp", "Y": "Tyr", "V": "Val"} +AA_3to1 = {"Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C", "Glu": "E", "Gln": "Q", "Gly": "G", "His": "H", "Ile": "I", + "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P", "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V"} +AA_1to3 = {"A": "Ala", "R": "Arg", "N": "Asn", "D": "Asp", "C": "Cys", "E": "Glu", "Q": "Gln", "G": "Gly", "H": "His", "I": "Ile", + "L": "Leu", "K": "Lys", "M": "Met", "F": "Phe", "P": "Pro", "S": "Ser", "T": "Thr", "W": "Trp", "Y": "Tyr", "V": "Val"} ### ### ### ### ### MAJOR CLASSES ### ### ### ### ### @@ -27,12 +35,12 @@ class Vcf_line: '''Each instance of this object is a line from the vcf file (no header).''' def __init__(self, vcf_line): - '''Argument is a line in pileup file.''' self.vcf_line = vcf_line.rstrip('\n') try: - self.chromosome, self.position, self.identifier, self.refbase, self.altbase, self.qual, self.filters, self.info, *self.has_samples = vcf_line.rstrip('\n').split('\t') + self.chromosome, self.position, self.identifier, self.refbase, self.altbase, self.qual, self.filters, self.info, * \ + self.has_samples = vcf_line.rstrip('\n').split('\t') self.position = int(self.position) try: @@ -44,14 +52,13 @@ def __init__(self, vcf_line): self.chromosome = self.identifier = self.refbase = self.altbase = self.qual = self.filters = self.info = self.field = self.samples = '' self.position = None - def get_info_items(self): return self.info.split(';') - def get_info_value(self, variable): - key_item = re.search(r'\b{}=([^;\s]+)([;\W]|$)'.format(variable), self.vcf_line) + key_item = 
re.search( + r'\b{}=([^;\s]+)([;\W]|$)'.format(variable), self.vcf_line) # The key has a value attached to it, e.g., VAR=1,2,3 if key_item: @@ -62,23 +69,21 @@ def get_info_value(self, variable): key_item = self.info.split(';') return True if variable in key_item else False - def get_sample_variable(self): return self.field.split(':') - def get_sample_item(self, idx=0, out_type='d'): '''d to output a dictionary. l to output a tuple of lists''' if out_type.lower() == 'd': - return dict( zip(self.get_sample_variable(), self.samples[idx].split(':') ) ) + return dict(zip(self.get_sample_variable(), self.samples[idx].split(':'))) elif out_type.lower() == 'l': - return ( self.get_sample_variable(), self.samples[idx].split(':') ) - + return (self.get_sample_variable(), self.samples[idx].split(':')) def get_sample_value(self, variable, idx=0): - var2value = dict( zip( self.field.split(':'), self.samples[idx].split(':') )) + var2value = dict(zip(self.field.split( + ':'), self.samples[idx].split(':'))) try: return var2value[variable] @@ -86,8 +91,6 @@ def get_sample_value(self, variable, idx=0): return None - - class pysam_header: ''' Extract BAM header using pysam. @@ -99,14 +102,13 @@ def __init__(self, bam_file): bam = AlignmentFile(bam_file) self.bam_header = bam.header - def SM(self): '''Sample Name''' sample_name = set() for header_i in self.bam_header['RG']: - sample_name.add( header_i['SM'] ) + sample_name.add(header_i['SM']) sample_name = tuple(sample_name) return sample_name @@ -115,20 +117,14 @@ def SM(self): ### ### ### ### ### MAJOR CLASSES OVER ### ### ### ### ### - - - - - - ### ### ### ### ### FUNCTIONS OF CONVENIENCE ### ### ### ### ### def skip_vcf_header(opened_file): - + line_i = opened_file.readline().rstrip() while line_i.startswith('#'): line_i = opened_file.readline().rstrip() - + return line_i @@ -139,9 +135,9 @@ def faiordict2contigorder(file_name, file_format): contig_sequence = [] with open(file_name) as gfile: - + for line_i in gfile: - + if file_format == 'fai': contig_match = re.match(r'([^\t]+)\t', line_i) @@ -150,17 +146,17 @@ def faiordict2contigorder(file_name, file_format): contig_match = re.match(r'@SQ\tSN:([^\t]+)\tLN:', line_i) if contig_match: - contig_i = contig_match.groups()[0].split(' ')[0] # some .fai files have space after the contig for descriptions. - contig_sequence.append( contig_i ) + # some .fai files have space after the contig for descriptions. 
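Vcf_line, reformatted above, is a light one-record VCF parser: get_info_value extracts KEY=VALUE pairs (or bare flags) from INFO by regex, and get_sample_value zips the FORMAT field against a sample column. A usage sketch, assuming neusomatic/python is importable and using a made-up record:

    from genomic_file_handlers import Vcf_line

    record = "chr1\t100\t.\tA\tG\t30\tPASS\tDP=55;SOMATIC\tGT:DP\t0/1:42"
    vcf_i = Vcf_line(record)
    print(vcf_i.get_info_value('DP'))       # '55'  (regex-extracted value)
    print(vcf_i.get_info_value('SOMATIC'))  # True  (bare flag, no value)
    print(vcf_i.get_sample_value('DP', 0))  # '42'  (FORMAT zipped to sample 0)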
+ contig_i = contig_match.groups()[0].split(' ')[0] + contig_sequence.append(contig_i) chrom_seq = {} - for n,contig_i in enumerate(contig_sequence): + for n, contig_i in enumerate(contig_sequence): chrom_seq[contig_i] = n return chrom_seq - def open_textfile(file_name): # See if the input file is a .gz file: @@ -171,7 +167,6 @@ def open_textfile(file_name): return open(file_name) - def open_bam_file(file_name): try: @@ -180,16 +175,14 @@ def open_bam_file(file_name): return open(file_name) - - def ascii2phred33(x): '''Put in an ASCII string, return a Phred+33 score.''' - return ord(x)-33 + return ord(x) - 33 def phred33toascii(x): '''Put in a Phred33 score, return the character.''' - return chr(x+33) + return chr(x + 33) def p2phred(p, max_phred=inf): @@ -201,7 +194,7 @@ def p2phred(p, max_phred=inf): elif p == 1: Q = 0 - elif p<0 or p>1: + elif p < 0 or p > 1: Q = nan elif p > 0: @@ -215,26 +208,25 @@ def p2phred(p, max_phred=inf): return Q - def phred2p(phred): '''Convert Phred-scale quality score to p-value.''' - return 10**(-phred/10) + return 10**(-phred / 10) def findall_index(mylist, tolookfor): '''Find all instances in a list that matches exactly thestring.''' - all_indices = [i for i,item in enumerate(mylist) if item == tolookfor] + all_indices = [i for i, item in enumerate(mylist) if item == tolookfor] return all_indices def findall_index_regex(mylist, pattern): '''Find all instances in a list that matches a regex pattern.''' - all_indices = [i for i,item in enumerate(mylist) if re.search(pattern, item)] + all_indices = [i for i, item in enumerate( + mylist) if re.search(pattern, item)] return all_indices def count_repeating_bases(sequence): - '''For a string, count the number of characters that appears in a row. E.g., for string "ABBCCCDDDDAAAAAAA", the function returns 1, 2, 3, 4, 7, because there is 1 A, 2 B's, 3 C's, 4 D's, and then 7 A's. ''' @@ -255,7 +247,6 @@ def count_repeating_bases(sequence): return counters - def numeric_id(chr_i, pos_i, contig_seq): chr_i = contig_seq[chr_i] @@ -267,19 +258,17 @@ def numeric_id(chr_i, pos_i, contig_seq): return numeric_i - - - # Define which chromosome coordinate is ahead for the following function: -chrom_sequence = [str(i) for i in range(1,23)] +chrom_sequence = [str(i) for i in range(1, 23)] chrom_sequence.append('X') chrom_sequence.append('Y') chrom_sequence.append('M') chrom_seq = {} -for n,contig_i in enumerate(chrom_sequence): +for n, contig_i in enumerate(chrom_sequence): chrom_seq[contig_i] = n + def whoisbehind(coord_0, coord_1, chrom_sequence): ''' coord_0 and coord_1 are two strings or two lists, specifying the chromosome, a (typically) tab, and then the location. @@ -288,10 +277,10 @@ def whoisbehind(coord_0, coord_1, chrom_sequence): end_of_0 = end_of_1 = False - if coord_0 == '' or coord_0==['',''] or coord_0==('','') or not coord_0: + if coord_0 == '' or coord_0 == ['', ''] or coord_0 == ('', '') or not coord_0: end_of_0 = True - if coord_1 == '' or coord_1==['',''] or coord_1==('','') or not coord_1: + if coord_1 == '' or coord_1 == ['', ''] or coord_1 == ('', '') or not coord_1: end_of_1 = True if end_of_0 and end_of_1: @@ -345,10 +334,7 @@ def whoisbehind(coord_0, coord_1, chrom_sequence): return 10 - - def vcf_header_modifier(infile_handle, addons=[], getlost=' '): - '''addons = A list of INFO, FORMAT, ID, or Filter lines you want to add. 
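p2phred and phred2p, reformatted below, implement the standard Phred relation Q = -10·log10(p) and p = 10^(-Q/10), with guards for p = 0, p = 1, out-of-range input, and a max_phred cap. The core round trip, for reference:

    import math

    def p2phred(p):
        return -10 * math.log10(p)  # the module adds the edge-case guards

    def phred2p(q):
        return 10 ** (-q / 10)

    print(p2phred(0.001))  # 30.0
    print(phred2p(30))     # 0.001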
getlost = a regex expression for the ID of INFO/FORMAT/FILTER that you want to get rid of.''' @@ -369,15 +355,14 @@ def vcf_header_modifier(infile_handle, addons=[], getlost=' '): elif re.match(r'##(INFO|FORMAT|FILTER)', line_i): if not re.match(r'##(INFO|FORMAT|FILTER)= 0, current_ref))) + x], filter(lambda x: x > 0, current_ref))) aa = "".join(map(lambda x: NUM_to_NUC[ - x], filter(lambda x: x > 0, current_alt))) - variants.append([current_bias, rr, aa, np.array(current_af)]) - done=False + x], filter(lambda x: x > 0, current_alt))) + variants.append( + [current_bias, rr, aa, np.array(current_af)]) + done = False current_ref = [] current_alt = [] current_af = [] @@ -958,13 +959,14 @@ def find_var(out_fasta_file, snp_min_af, del_min_af, ins_min_af, scale_maf, simp current_ref.append(r) current_alt.append(a) current_af.append(af) - is_ins = r==0 and a!=0 - is_del = r!=0 and a==0 + is_ins = r == 0 and a != 0 + is_del = r != 0 and a == 0 if r != 0: bias += 1 return variants + def TrimREFALT(ref, alt, pos): logger = logging.getLogger(TrimREFALT.__name__) alte = len(alt) @@ -1061,12 +1063,14 @@ def run_realignment(input_record): for var in vars_: pos_, ref_seq, alt_seq, afs = var if ref_seq != alt_seq: - ref, alt, pos = ref_seq, alt_seq, int(region.start) + 1 + pos_ + ref, alt, pos = ref_seq, alt_seq, int( + region.start) + 1 + pos_ if pos > 1: - num_add_before = min(40, pos-1) - before = ref_fasta.fetch(region.chrom, pos - num_add_before, pos-1).upper() + num_add_before = min(40, pos - 1) + before = ref_fasta.fetch( + region.chrom, pos - num_add_before, pos - 1).upper() print(before) - pos -= num_add_before-1 + pos -= num_add_before - 1 ref = before + ref alt = before + alt ref, alt, pos = TrimREFALT( diff --git a/neusomatic/python/merge_post_vcfs.py b/neusomatic/python/merge_post_vcfs.py index 014e56f..111df49 100755 --- a/neusomatic/python/merge_post_vcfs.py +++ b/neusomatic/python/merge_post_vcfs.py @@ -13,6 +13,7 @@ from utils import get_chromosomes_order, skip_empty from defaults import VCF_HEADER + def merge_post_vcfs(ref, resolved_vcf, no_resolve_vcf, out_vcf, pass_threshold, lowqual_threshold): diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index 6df0a17..1c456c8 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -215,8 +215,9 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense os.mkdir(work_lr_indel_realign) ra_resolved_vcf = os.path.join( work, "candidates_preds.ra_resolved.vcf") - not_resolved_bed = os.path.join(work, "candidates_preds.not_ra_resolved.bed") - long_read_indelrealign(work_lr_indel_realign, tumor_bam, None, ra_resolved_vcf, + not_resolved_bed = os.path.join( + work, "candidates_preds.not_ra_resolved.bed") + long_read_indelrealign(work_lr_indel_realign, tumor_bam, None, ra_resolved_vcf, not_resolved_bed, target_bed, reference, num_threads, lr_pad, lr_chunk_size, lr_chunk_scale, lr_snp_min_af, @@ -227,12 +228,12 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense msa_binary) resolve_scores(tumor_bam, ra_resolved_vcf, target_vcf, resolved_vcf) - not_resolved_vcf = os.path.join(work, "candidates_preds.not_ra_resolved.vcf") + not_resolved_vcf = os.path.join( + work, "candidates_preds.not_ra_resolved.vcf") cmd = "bedtools intersect -a {} -b {} -u".format( target_vcf, not_resolved_bed) run_bedtools_cmd(cmd, output_fn=not_resolved_vcf, run_logger=logger) - all_no_resolve = concatenate_files( [no_resolve, ensembled_preds, not_resolved_vcf], 
os.path.join(work, "no_resolve.vcf")) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index fb4a6e2..8cd40bb 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -152,7 +152,8 @@ def extract_candidate_split_regions( logger.info([filtered_vcf, is_empty]) if not is_empty: candidates_bed = get_tmp_file() - vcf_2_bed(filtered_vcf,candidates_bed, len_ref=True, keep_ref_alt=False) + vcf_2_bed(filtered_vcf, candidates_bed, + len_ref=True, keep_ref_alt=False) candidates_bed = bedtools_sort(candidates_bed, run_logger=logger) candidates_bed = bedtools_slop( diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index b5bf75d..e4b4ead 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -2,22 +2,24 @@ import re -cigar_aln_match = 0 -cigar_insertion = 1 -cigar_deletion = 2 -cigar_skip = 3 -cigar_soft_clip = 4 -cigar_hard_clip = 5 -cigar_padding = 6 -cigar_seq_match = 7 +cigar_aln_match = 0 +cigar_insertion = 1 +cigar_deletion = 2 +cigar_skip = 3 +cigar_soft_clip = 4 +cigar_hard_clip = 5 +cigar_padding = 6 +cigar_seq_match = 7 cigar_seq_mismatch = 8 nan = float('nan') inf = float('inf') -## Define functions: +# Define functions: ### PYSAM ### + + def position_of_aligned_read(read_i, target_position): ''' Return the base call of the target position, or if it's a start of insertion/deletion. @@ -42,7 +44,6 @@ def position_of_aligned_read(read_i, target_position): seq_i = align_i[0] break - # If the target position is aligned: try: if seq_i is not None: @@ -54,29 +55,34 @@ def position_of_aligned_read(read_i, target_position): if i != len(read_i.get_aligned_pairs()) - 1: indel_length = 0 - # If the next alignment is the next sequenced base, then the target is either a reference read of a SNP/SNV: - if read_i.get_aligned_pairs()[i+1][0] == seq_i+1 and read_i.get_aligned_pairs()[i+1][1] == target_position + 1: + # If the next alignment is the next sequenced base, then the + # target is either a reference read of a SNP/SNV: + if read_i.get_aligned_pairs()[i + 1][0] == seq_i + 1 and read_i.get_aligned_pairs()[i + 1][1] == target_position + 1: - code = 1 # Reference read for mismatch + code = 1 # Reference read for mismatch - # If the next reference position has no read position to it, it is DELETED in this read: - elif read_i.get_aligned_pairs()[i+1][0] == None and read_i.get_aligned_pairs()[i+1][1] == target_position + 1: + # If the next reference position has no read position to it, it + # is DELETED in this read: + elif read_i.get_aligned_pairs()[i + 1][0] == None and read_i.get_aligned_pairs()[i + 1][1] == target_position + 1: - code = 2 # Deletion + code = 2 # Deletion - for align_j in read_i.get_aligned_pairs()[ i+1:: ]: + for align_j in read_i.get_aligned_pairs()[i + 1::]: if align_j[0] == None: indel_length -= 1 else: break # Opposite of deletion, if the read position cannot be aligned to the reference, it can be an INSERTION. - # Insertions sometimes show up wit soft-clipping at the end, if the inserted sequence is "too long" to align on a single read. In this case, the inserted length derived here is but a lower limit of the real inserted length. - elif read_i.get_aligned_pairs()[i+1][0] == seq_i+1 and read_i.get_aligned_pairs()[i+1][1] == None: + # Insertions sometimes show up wit soft-clipping at the end, if + # the inserted sequence is "too long" to align on a single + # read. 
In this case, the inserted length derived here is but a + # lower limit of the real inserted length. + elif read_i.get_aligned_pairs()[i + 1][0] == seq_i + 1 and read_i.get_aligned_pairs()[i + 1][1] == None: - code = 3 # Insertion or soft-clipping + code = 3 # Insertion or soft-clipping - for align_j in read_i.get_aligned_pairs()[ i+1:: ]: + for align_j in read_i.get_aligned_pairs()[i + 1::]: if align_j[1] == None: indel_length += 1 else: @@ -85,13 +91,13 @@ def position_of_aligned_read(read_i, target_position): # If "i" is the final alignment, cannt exam for indel: else: code = 1 # Assuming no indel - indel_length = nan # Would be zero if certain no indel, but uncertain here + indel_length = nan # Would be zero if certain no indel, but uncertain here - # If the target position is deleted from the sequencing read (i.e., the deletion in this read occurs before the target position): + # If the target position is deleted from the sequencing read (i.e., the + # deletion in this read occurs before the target position): else: code = 0 base_at_target, indel_length, flanking_indel = None, None, None - # See if there is insertion/deletion within 5 bp of "i": if isinstance(indel_length, int): @@ -99,7 +105,7 @@ def position_of_aligned_read(read_i, target_position): left_side_start = seq_i right_side_start = seq_i + abs(indel_length) + 1 switch = 1 - for j in (3,2,1): + for j in (3, 2, 1): for indel_seeker_i in left_side_start, right_side_start: switch = switch * -1 @@ -109,8 +115,9 @@ def position_of_aligned_read(read_i, target_position): if 0 <= seq_j < len(read_i.get_aligned_pairs()): # If the reference position has no base aligned to it, it's a deletion. - # On the other hand, if the base has no reference base aligned to it, it's an insertion. - if read_i.get_aligned_pairs()[ seq_j ][1] == None or read_i.get_aligned_pairs()[ seq_j ][0] == None: + # On the other hand, if the base has no reference base + # aligned to it, it's an insertion. + if read_i.get_aligned_pairs()[seq_j][1] == None or read_i.get_aligned_pairs()[seq_j][0] == None: flanking_indel = j break else: @@ -123,8 +130,7 @@ def position_of_aligned_read(read_i, target_position): return None, None, None, None, None - -## Dedup test for BAM file +# Dedup test for BAM file def dedup_test(read_i, remove_dup_or_not=True): ''' Return False (i.e., remove the read) if the read is a duplicate and if the user specify that duplicates should be removed. 
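position_of_aligned_read classifies the target site by looking one step ahead in pysam's get_aligned_pairs() output, a list of (query_index, reference_position) tuples where None marks a gap: both indices advancing means a match or SNV (code 1), a missing query index means a deletion starts here (code 2), a missing reference position means an insertion or soft clip (code 3). That lookahead, reduced to a self-contained function over a hand-written pair list:

    def classify_site(aligned_pairs, target_ref_pos):
        # aligned_pairs mimics pysam's read.get_aligned_pairs() output.
        for i, (q, r) in enumerate(aligned_pairs):
            if r == target_ref_pos and q is not None:
                if i == len(aligned_pairs) - 1:
                    return 1                      # last pair: assume no indel
                next_q, next_r = aligned_pairs[i + 1]
                if next_q == q + 1 and next_r == target_ref_pos + 1:
                    return 1                      # reference call or SNV
                if next_q is None:
                    return 2                      # deletion starts here
                if next_r is None:
                    return 3                      # insertion / soft clip
        return 0                                  # target deleted in this read

    # A read with a 2-bp deletion right after reference position 101:
    pairs = [(0, 100), (1, 101), (None, 102), (None, 103), (2, 104)]
    print(classify_site(pairs, 101))  # 2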
@@ -136,35 +142,31 @@ def dedup_test(read_i, remove_dup_or_not=True): return True - ### END OF PYSAM ### # Useful to make BED region into an iterator of coordinates def genomic_coordinates(contig_i, start, end): - for pos_i in range(start, end+1): + for pos_i in range(start, end + 1): yield contig_i, pos_i +def mean(stuff): + return sum(stuff) / len(stuff) if stuff else nan -def mean(stuff): - return sum(stuff)/len(stuff) if stuff else nan - - - -##### Extract Indel DP4 info from pileup files: +# Extract Indel DP4 info from pileup files: def pileup_indel_DP4(pileup_object, indel_pattern): if pileup_object.reads: ref_for = pileup_object.reads.count('.') ref_rev = pileup_object.reads.count(',') - alt_for = pileup_object.reads.count( indel_pattern.upper() ) - alt_rev = pileup_object.reads.count( indel_pattern.lower() ) + alt_for = pileup_object.reads.count(indel_pattern.upper()) + alt_rev = pileup_object.reads.count(indel_pattern.lower()) - dp4 = ref_for, ref_rev, alt_for, alt_rev + dp4 = ref_for, ref_rev, alt_for, alt_rev else: - dp4 = nan,nan,nan,nan + dp4 = nan, nan, nan, nan return dp4 @@ -178,21 +180,24 @@ def pileup_DP4(pileup_object, ref_base, variant_call): # SNV if len(variant_call) == len(ref_base): - ref_for,ref_rev,alt_for,alt_rev = base_calls[0], base_calls[1], base_calls[2].count(variant_call.upper()), base_calls[3].count(variant_call.lower()) + ref_for, ref_rev, alt_for, alt_rev = base_calls[0], base_calls[1], base_calls[ + 2].count(variant_call.upper()), base_calls[3].count(variant_call.lower()) # Insertion: elif len(variant_call) > len(ref_base): - inserted_sequence = variant_call[ len(ref_base):: ] + inserted_sequence = variant_call[len(ref_base)::] - ref_for,ref_rev,alt_for,alt_rev = base_calls[0], base_calls[1], base_calls[6].count(inserted_sequence.upper()), base_calls[7].count(inserted_sequence.lower()) + ref_for, ref_rev, alt_for, alt_rev = base_calls[0], base_calls[1], base_calls[ + 6].count(inserted_sequence.upper()), base_calls[7].count(inserted_sequence.lower()) # Deletion: elif len(variant_call) < len(ref_base): - deleted_sequence = ref_base[ len(variant_call):: ] + deleted_sequence = ref_base[len(variant_call)::] - ref_for,ref_rev,alt_for,alt_rev = base_calls[0], base_calls[1], base_calls[4].count(deleted_sequence.upper()), base_calls[5].count(deleted_sequence.lower()) + ref_for, ref_rev, alt_for, alt_rev = base_calls[0], base_calls[1], base_calls[ + 4].count(deleted_sequence.upper()), base_calls[5].count(deleted_sequence.lower()) else: ref_for = ref_rev = alt_for = alt_rev = 0 @@ -200,20 +205,17 @@ def pileup_DP4(pileup_object, ref_base, variant_call): return ref_for, ref_rev, alt_for, alt_rev - - def rescale(x, original='fraction', rescale_to=None, max_phred=1001): - - if ( rescale_to == None ) or ( original.lower() == rescale_to.lower() ): + + if (rescale_to == None) or (original.lower() == rescale_to.lower()): y = x if isinstance(x, int) else '%.2f' % x - + elif original.lower() == 'fraction' and rescale_to == 'phred': y = genome.p2phred(x, max_phred=max_phred) y = '%.2f' % y - + elif original.lower() == 'phred' and rescale_to == 'fraction': y = genome.phred2p(x) y = '%.2f' % y - - return y + return y diff --git a/neusomatic/python/resolve_scores.py b/neusomatic/python/resolve_scores.py index 65ff505..f54c57c 100755 --- a/neusomatic/python/resolve_scores.py +++ b/neusomatic/python/resolve_scores.py @@ -24,7 +24,7 @@ def resolve_scores(input_bam, ra_vcf, target_vcf, output_vcf): final_intervals = read_tsv_file(tmp_) for x in final_intervals: - x[5] = 
str(np.round(-10*np.log10(0.25),4)) + x[5] = str(np.round(-10 * np.log10(0.25), 4)) tmp_ = bedtools_window( ra_vcf, target_vcf, args=" -w 5", run_logger=logger) diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index fe7e343..37b47f2 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -22,6 +22,7 @@ from utils import concatenate_files, run_shell_command, bedtools_sort, bedtools_merge, get_tmp_file, skip_empty from split_bed import split_region + def run_scan_alignments(record): work, reference, scan_alignments_binary, split_region_file, \ input_bam, window_size, maf, min_mapq, max_dp, filter_duplicate, calc_qual = record diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 1135e1e..3d7644a 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -1,57 +1,60 @@ #!/usr/bin/env python3 -import sys, os, re, pysam +import sys +import os +import re +import pysam import scipy.stats as stats import genomic_file_handlers as genome -from read_info_extractor import * +from read_info_extractor import * nan = float('nan') def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): - ''' bam is the opened file handle of bam file my_coordiate is a list or tuple of 0-based (contig, position) ''' - + indel_length = len(first_alt) - len(ref_base) - reads = bam.fetch( my_coordinate[0], my_coordinate[1]-1, my_coordinate[1] ) - + reads = bam.fetch(my_coordinate[0], my_coordinate[1] - 1, my_coordinate[1]) + ref_read_mq = [] alt_read_mq = [] ref_read_bq = [] alt_read_bq = [] ref_edit_distance = [] alt_edit_distance = [] - + ref_concordant_reads = alt_concordant_reads = ref_discordant_reads = alt_discordant_reads = 0 ref_for = ref_rev = alt_for = alt_rev = dp = 0 ref_SC_reads = alt_SC_reads = ref_notSC_reads = alt_notSC_reads = 0 MQ0 = 0 - + ref_pos_from_end = [] alt_pos_from_end = [] ref_flanking_indel = [] alt_flanking_indel = [] - - noise_read_count = poor_read_count = 0 - + + noise_read_count = poor_read_count = 0 + qname_collector = {} - + for read_i in reads: if not read_i.is_unmapped and dedup_test(read_i): - + dp += 1 - - code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read(read_i, my_coordinate[1]-1 ) - + + code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( + read_i, my_coordinate[1] - 1) + if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq: poor_read_count += 1 - + if read_i.mapping_quality == 0: MQ0 += 1 - + # Reference calls: if code_i == 1 and base_call_i == ref_base[0]: @@ -59,27 +62,27 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): qname_collector[read_i.qname].append(0) except KeyError: qname_collector[read_i.qname] = [0] - - ref_read_mq.append( read_i.mapping_quality ) - ref_read_bq.append( read_i.query_qualities[ith_base] ) - + + ref_read_mq.append(read_i.mapping_quality) + ref_read_bq.append(read_i.query_qualities[ith_base]) + try: - ref_edit_distance.append( read_i.get_tag('NM') ) + ref_edit_distance.append(read_i.get_tag('NM')) except KeyError: pass - + # Concordance - if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: ref_concordant_reads += 1 elif (not read_i.is_proper_pair) and 
read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: ref_discordant_reads += 1 - + # Orientation if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: ref_for += 1 - elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: ref_rev += 1 - + # Soft-clipped reads? if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: ref_SC_reads += 1 @@ -88,43 +91,44 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): # Distance from the end of the read: if ith_base != None: - ref_pos_from_end.append( min(ith_base, read_i.query_length-ith_base) ) - + ref_pos_from_end.append( + min(ith_base, read_i.query_length - ith_base)) + # Flanking indels: - ref_flanking_indel.append( flanking_indel_i ) + ref_flanking_indel.append(flanking_indel_i) - # Alternate calls: - # SNV, or Deletion, or Insertion where I do not check for matching indel length + # SNV, or Deletion, or Insertion where I do not check for matching + # indel length elif (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or \ (indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or \ - (indel_length > 0 and code_i == 3): + (indel_length > 0 and code_i == 3): try: qname_collector[read_i.qname].append(1) except KeyError: qname_collector[read_i.qname] = [1] - alt_read_mq.append( read_i.mapping_quality ) - alt_read_bq.append( read_i.query_qualities[ith_base] ) - + alt_read_mq.append(read_i.mapping_quality) + alt_read_bq.append(read_i.query_qualities[ith_base]) + try: - alt_edit_distance.append( read_i.get_tag('NM') ) + alt_edit_distance.append(read_i.get_tag('NM')) except KeyError: pass - + # Concordance - if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: alt_concordant_reads += 1 elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: alt_discordant_reads += 1 - + # Orientation if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: alt_for += 1 - elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: alt_rev += 1 - + # Soft-clipped reads? if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: alt_SC_reads += 1 @@ -133,56 +137,59 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): # Distance from the end of the read: if ith_base != None: - alt_pos_from_end.append( min(ith_base, read_i.query_length-ith_base) ) - + alt_pos_from_end.append( + min(ith_base, read_i.query_length - ith_base)) + # Flanking indels: - alt_flanking_indel.append( flanking_indel_i ) - - + alt_flanking_indel.append(flanking_indel_i) + # Inconsistent read or 2nd alternate calls: else: - + try: qname_collector[read_i.qname].append(2) except KeyError: qname_collector[read_i.qname] = [2] - + noise_read_count += 1 - + # Done extracting info from tumor BAM. 
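# Taken together, the branches above bucket each read at the candidate locus
# into ref / alt / noise. Hedged restatement as a standalone helper (codes per
# read_info_extractor: 1 = aligned base, 2 = deletion, 3 = insertion):
def classify_read(code_i, base_call_i, indel_length_i, ref_base, first_alt):
    indel_length = len(first_alt) - len(ref_base)
    if code_i == 1 and base_call_i == ref_base[0]:
        return "ref"
    if (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or \
       (indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or \
       (indel_length > 0 and code_i == 3):
        return "alt"
    return "noise"   # inconsistent read or a second alternate allele

assert classify_read(1, "A", 0, "A", "T") == "ref"
assert classify_read(3, "A", 2, "A", "AGG") == "alt"   # insertion length unchecked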
Now tally them: - ref_mq = mean(ref_read_mq) - alt_mq = mean(alt_read_mq) + ref_mq = mean(ref_read_mq) + alt_mq = mean(alt_read_mq) z_ranksums_mq = stats.ranksums(alt_read_mq, ref_read_mq)[0] - - ref_bq = mean(ref_read_bq) - alt_bq = mean(alt_read_bq) + + ref_bq = mean(ref_read_bq) + alt_bq = mean(alt_read_bq) z_ranksums_bq = stats.ranksums(alt_read_bq, ref_read_bq)[0] - - ref_NM = mean(ref_edit_distance) - alt_NM = mean(alt_edit_distance) + + ref_NM = mean(ref_edit_distance) + alt_NM = mean(alt_edit_distance) z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0] - NM_Diff = alt_NM - ref_NM - abs(indel_length) - - concordance_fet = stats.fisher_exact(( (ref_concordant_reads, alt_concordant_reads), (ref_discordant_reads, alt_discordant_reads) ))[1] - strandbias_fet = stats.fisher_exact(( (ref_for, alt_for), (ref_rev, alt_rev) ))[1] - clipping_fet = stats.fisher_exact(( (ref_notSC_reads, alt_notSC_reads), (ref_SC_reads, alt_SC_reads) ))[1] - + NM_Diff = alt_NM - ref_NM - abs(indel_length) + + concordance_fet = stats.fisher_exact( + ((ref_concordant_reads, alt_concordant_reads), (ref_discordant_reads, alt_discordant_reads)))[1] + strandbias_fet = stats.fisher_exact( + ((ref_for, alt_for), (ref_rev, alt_rev)))[1] + clipping_fet = stats.fisher_exact( + ((ref_notSC_reads, alt_notSC_reads), (ref_SC_reads, alt_SC_reads)))[1] + z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] - + ref_indel_1bp = ref_flanking_indel.count(1) ref_indel_2bp = ref_flanking_indel.count(2) + ref_indel_1bp ref_indel_3bp = ref_flanking_indel.count(3) + ref_indel_2bp + ref_indel_1bp alt_indel_1bp = alt_flanking_indel.count(1) alt_indel_2bp = alt_flanking_indel.count(2) + alt_indel_1bp alt_indel_3bp = alt_flanking_indel.count(3) + alt_indel_2bp + alt_indel_1bp - + consistent_mates = inconsistent_mates = 0 for pairs_i in qname_collector: - + # Both are alternative calls: - if qname_collector[pairs_i] == [1,1]: + if qname_collector[pairs_i] == [1, 1]: consistent_mates += 1 - + # One is alternate call but the other one is not: elif len(qname_collector[pairs_i]) == 2 and 1 in qname_collector[pairs_i]: inconsistent_mates += 1 @@ -190,33 +197,33 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): return vars() - - - def from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt): - ''' ref_fa is the opened reference fasta file handle my_coordiate is a list or tuple of 0-based (contig, position) ''' # Homopolymer eval (Make sure to modify for INDEL): - # The min and max is to prevent the +/- 20 bases from exceeding the ends of the reference sequence - lseq = ref_fa.fetch(my_coordinate[0], max(0, my_coordinate[1]-20), my_coordinate[1]) - rseq = ref_fa.fetch(my_coordinate[0], my_coordinate[1]+1, min(ref_fa.get_reference_length(my_coordinate[0])+1, my_coordinate[1]+21) ) - - # This is to get around buy in old version of pysam that reads the reference sequence in bytes instead of strings + # The min and max is to prevent the +/- 20 bases from exceeding the ends + # of the reference sequence + lseq = ref_fa.fetch(my_coordinate[0], max( + 0, my_coordinate[1] - 20), my_coordinate[1]) + rseq = ref_fa.fetch(my_coordinate[0], my_coordinate[ + 1] + 1, min(ref_fa.get_reference_length(my_coordinate[0]) + 1, my_coordinate[1] + 21)) + + # This is to get around buy in old version of pysam that reads the + # reference sequence in bytes instead of strings lseq = lseq.decode() if isinstance(lseq, bytes) else lseq rseq = rseq.decode() if isinstance(rseq, bytes) else rseq - - 
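# Toy numbers through the from_bam tallies above: alt support hugging read ends
# yields a negative rank-sum z, and a skewed DP4 strand table a small Fisher p:
import scipy.stats as stats

ref_pos_from_end = [45, 50, 38, 61, 47]
alt_pos_from_end = [3, 5, 2, 4]                      # alt calls near read ends
assert stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] < 0

ref_for, ref_rev, alt_for, alt_rev = 30, 28, 12, 1   # alt skewed to one strand
assert stats.fisher_exact(((ref_for, alt_for), (ref_rev, alt_rev)))[1] < 0.05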
seq41_ref = lseq + ref_base + rseq + + seq41_ref = lseq + ref_base + rseq seq41_alt = lseq + first_alt + rseq - + ref_counts = genome.count_repeating_bases(seq41_ref) alt_counts = genome.count_repeating_bases(seq41_alt) - - homopolymer_length = max( max(ref_counts), max(alt_counts) ) - + + homopolymer_length = max(max(ref_counts), max(alt_counts)) + # Homopolymer spanning the variant site: ref_c = 0 alt_c = 0 @@ -225,37 +232,34 @@ def from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt): ref_c += 1 else: break - + for i in lseq[::-1]: if i == ref_base: ref_c += 1 else: break - + for i in rseq: if i == first_alt: alt_c += 1 else: break - + for i in lseq[::-1]: if i == first_alt: alt_c += 1 else: break - site_homopolymer_length = max( alt_c+1, ref_c+1 ) + site_homopolymer_length = max(alt_c + 1, ref_c + 1) return homopolymer_length, site_homopolymer_length - - - def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100): # Odds Ratio just like VarDict's output - sor_numerator = n_alt * t_ref + sor_numerator = n_alt * t_ref sor_denominator = n_ref * t_alt if sor_numerator == 0 and sor_denominator == 0: sor = nan diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index acc44c1..ddd31b5 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -89,7 +89,7 @@ def test(net, epoch, validation_loader, use_cuda): (matrices, labels, _, var_len_s, _), (paths) = data paths_ = copy.deepcopy(paths) - del paths + del paths paths = paths_ matrices = Variable(matrices) @@ -220,12 +220,12 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo ensemble = False with open(candidates_tsv[0]) as i_f: - x=i_f.readline().strip().split() - if len(x) == NUM_ENS_FEATURES+4: + x = i_f.readline().strip().split() + if len(x) == NUM_ENS_FEATURES + 4: ensemble = True num_channels = NUM_ENS_FEATURES + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES - + logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: diff --git a/neusomatic/python/utils.py b/neusomatic/python/utils.py index d4e9d45..67ba79c 100755 --- a/neusomatic/python/utils.py +++ b/neusomatic/python/utils.py @@ -125,6 +125,7 @@ def write_tsv_file(tsv_file, records, sep='\t', add_fields=[]): for x in records: f_o.write(sep.join(map(str, x + add_fields)) + "\n") + def skip_empty(fh, skip_header=True): for line in fh: if skip_header and line.startswith("#"): @@ -133,6 +134,7 @@ def skip_empty(fh, skip_header=True): continue yield line + def read_tsv_file(tsv_file, sep='\t', fields=None): records = [] with open(tsv_file) as i_f: @@ -143,11 +145,12 @@ def read_tsv_file(tsv_file, sep='\t', fields=None): records.append(x) return records + def vcf_2_bed(vcf_file, bed_file, add_fields=[], len_ref=False, keep_ref_alt=True): with open(bed_file, "w") as f_o, open(vcf_file, "r") as f_i: for line in skip_empty(f_i): x = line.strip().split("\t") - len_=1 if not len_ref else len(x[3]) + len_ = 1 if not len_ref else len(x[3]) if keep_ref_alt: f_o.write( "\t".join(map(str, [x[0], int(x[1]), int(x[1]) + len_, x[3], x[4]] + add_fields)) + "\n") @@ -156,7 +159,6 @@ def vcf_2_bed(vcf_file, bed_file, add_fields=[], len_ref=False, keep_ref_alt=Tru "\t".join(map(str, [x[0], int(x[1]), int(x[1]) + len_] + add_fields)) + "\n") - def bedtools_sort(bed_file, args="", output_fn=None, run_logger=None): cmd = "bedtools sort -i {} {}".format(bed_file, args) if output_fn is None: From 5581ba46dd77a60835ed92b59bd923287f245144 Mon Sep 17 00:00:00 2001 From: Sahraeian 
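# Worked numbers through somaticOddRatio as defined in this module (VarDict-style
# SOR): numerator n_alt*t_ref, denominator n_ref*t_alt, capped at max_value.
def somatic_odd_ratio(n_ref, n_alt, t_ref, t_alt, max_value=100):
    num, den = n_alt * t_ref, n_ref * t_alt
    if num == 0 and den == 0:
        return float("nan")
    return max_value if den == 0 else min(num / den, max_value)

assert somatic_odd_ratio(40, 0, 25, 15) == 0.0   # alt only in tumor: somatic-like
assert somatic_odd_ratio(40, 5, 25, 0) == 100    # alt only in normal: capped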
Date: Tue, 28 Apr 2020 23:47:45 -0700 Subject: [PATCH 11/89] small fix --- neusomatic/python/postprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index 1c456c8..d0d934e 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -21,7 +21,7 @@ from extract_postprocess_targets import extract_postprocess_targets from merge_post_vcfs import merge_post_vcfs from resolve_variants import resolve_variants -from utils import concatenate_files, get_chromosomes_order, bedtools_window, skip_empty +from utils import concatenate_files, get_chromosomes_order, bedtools_window, run_bedtools_cmd, skip_empty from long_read_indelrealign import long_read_indelrealign from resolve_scores import resolve_scores from _version import __version__ From 115d81401a2dca202e475090b6b8afa4462a6cde Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 28 Apr 2020 23:52:10 -0700 Subject: [PATCH 12/89] fix for training loss --- neusomatic/python/train.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index ddd31b5..7b27fc9 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -451,9 +451,11 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo loss.backward() optimizer.step() - loss_s.append(loss.data) + loss_data = copy.deepcopy(loss.cpu().data) + del loss + loss_s.append(loss_data) - running_loss += loss.data + running_loss += loss_data if i_ % print_freq == print_freq - 1: logger.info('epoch: {}, iter: {:>7}, lr: {}, loss: {:.5f}'.format( n_epoch + prev_epochs, len(loss_s), From a3c4f3ea6e15fd60b67fb4fb0db4c08add714970 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 29 Apr 2020 02:07:43 -0700 Subject: [PATCH 13/89] fix for backward compatibility --- neusomatic/python/train.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 7b27fc9..1a0e125 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -518,6 +518,9 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo help=' validation candidate tsv files', default=[]) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) + parser.add_argument('--ensemble', + help='Enable training for ensemble mode', + action="store_true") parser.add_argument('--batch_size', type=int, help='batch size', default=1000) parser.add_argument('--max_epochs', type=int, From f9ee72564658414b703d06a7505afdea8c64897b Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 30 Apr 2020 08:47:17 -0700 Subject: [PATCH 14/89] fix train loss --- neusomatic/python/train.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 1a0e125..a841a26 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -451,11 +451,9 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo loss.backward() optimizer.step() - loss_data = copy.deepcopy(loss.cpu().data) - del loss - loss_s.append(loss_data) + loss_s.append(loss.data) - running_loss += loss_data + running_loss += loss.data if i_ % print_freq == print_freq - 1: logger.info('epoch: {}, iter: {:>7}, lr: {}, loss: {:.5f}'.format( n_epoch + prev_epochs, len(loss_s), From 26c4ca4156e48e7caec45f71fa51a41c12e5208f Mon Sep 17 00:00:00 2001 From: Sahraeian Date: 
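# Patches 12 and 14 above go back and forth on how to bank the per-iteration
# loss without pinning GPU memory or the autograd graph. A common alternative
# (a sketch, not what the patch does) is .item(), which detaches and copies
# the scalar off-device:
import torch

w = torch.randn(4, requires_grad=True)
loss_s, running_loss = [], 0.0
for _ in range(3):
    loss = (w * w).sum()
    loss.backward()
    v = loss.item()          # plain python float: no graph, no device tensor
    loss_s.append(v)
    running_loss += v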
Sat, 2 May 2020 22:53:53 -0700 Subject: [PATCH 15/89] fix extend_features --- neusomatic/python/extend_features.py | 3 +++ neusomatic/python/read_info_extractor.py | 20 ++++++++++---------- neusomatic/python/sequencing_features.py | 4 ++-- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 71afffd..91c8359 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -292,6 +292,9 @@ def extend_features(candidates_vcf, "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"] try: + # ext_features=[] + # for w in map_args: + # ext_features.append(extract_features(w)) ext_features = pool.map_async(extract_features, map_args).get() pool.close() with open(output_tsv, "w") as o_f: diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index e4b4ead..b6db804 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -36,8 +36,8 @@ def position_of_aligned_read(read_i, target_position): ''' flanking_deletion, flanking_insertion = nan, nan - - for i, align_i in enumerate(read_i.get_aligned_pairs()): + aligned_pairs=read_i.get_aligned_pairs() + for i, align_i in enumerate(aligned_pairs): # If find a match: if align_i[1] == target_position: @@ -52,22 +52,22 @@ def position_of_aligned_read(read_i, target_position): # Whether if it's a Deletion/Insertion depends on what happens after this position: # If the match (i.e., i, seq_i) is the final alignment, then you cannot know if it's an indel # if "i" is NOT the final alignment: - if i != len(read_i.get_aligned_pairs()) - 1: + if i != len(aligned_pairs) - 1: indel_length = 0 # If the next alignment is the next sequenced base, then the # target is either a reference read of a SNP/SNV: - if read_i.get_aligned_pairs()[i + 1][0] == seq_i + 1 and read_i.get_aligned_pairs()[i + 1][1] == target_position + 1: + if aligned_pairs[i + 1][0] == seq_i + 1 and aligned_pairs[i + 1][1] == target_position + 1: code = 1 # Reference read for mismatch # If the next reference position has no read position to it, it # is DELETED in this read: - elif read_i.get_aligned_pairs()[i + 1][0] == None and read_i.get_aligned_pairs()[i + 1][1] == target_position + 1: + elif aligned_pairs[i + 1][0] == None and aligned_pairs[i + 1][1] == target_position + 1: code = 2 # Deletion - for align_j in read_i.get_aligned_pairs()[i + 1::]: + for align_j in aligned_pairs[i + 1::]: if align_j[0] == None: indel_length -= 1 else: @@ -78,11 +78,11 @@ def position_of_aligned_read(read_i, target_position): # the inserted sequence is "too long" to align on a single # read. In this case, the inserted length derived here is but a # lower limit of the real inserted length. - elif read_i.get_aligned_pairs()[i + 1][0] == seq_i + 1 and read_i.get_aligned_pairs()[i + 1][1] == None: + elif aligned_pairs[i + 1][0] == seq_i + 1 and aligned_pairs[i + 1][1] == None: code = 3 # Insertion or soft-clipping - for align_j in read_i.get_aligned_pairs()[i + 1::]: + for align_j in aligned_pairs[i + 1::]: if align_j[1] == None: indel_length += 1 else: @@ -112,12 +112,12 @@ def position_of_aligned_read(read_i, target_position): displacement = j * switch seq_j = indel_seeker_i + displacement - if 0 <= seq_j < len(read_i.get_aligned_pairs()): + if 0 <= seq_j < len(aligned_pairs): # If the reference position has no base aligned to it, it's a deletion. 
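# The core of the read_info_extractor hunk above: pysam's get_aligned_pairs()
# materializes a fresh list on every call, and position_of_aligned_read called
# it inside loops. Toy stand-in (hypothetical Read class; pysam behaves
# analogously):
class Read:
    def __init__(self, pairs):
        self._pairs, self.calls = pairs, 0
    def get_aligned_pairs(self):
        self.calls += 1
        return list(self._pairs)                  # new list each call

read = Read([(0, 100), (1, 101), (2, 102)])
aligned_pairs = read.get_aligned_pairs()          # hoisted once...
hits = [i for i, p in enumerate(aligned_pairs) if p[1] == 101]
assert hits == [1] and read.calls == 1            # ...not once per lookup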
# On the other hand, if the base has no reference base # aligned to it, it's an insertion. - if read_i.get_aligned_pairs()[seq_j][1] == None or read_i.get_aligned_pairs()[seq_j][0] == None: + if aligned_pairs[seq_j][1] == None or aligned_pairs[seq_j][0] == None: flanking_indel = j break else: diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 3d7644a..b9adfcc 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -178,10 +178,10 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): ref_indel_1bp = ref_flanking_indel.count(1) ref_indel_2bp = ref_flanking_indel.count(2) + ref_indel_1bp - ref_indel_3bp = ref_flanking_indel.count(3) + ref_indel_2bp + ref_indel_1bp + ref_indel_3bp = ref_flanking_indel.count(3) + ref_indel_2bp alt_indel_1bp = alt_flanking_indel.count(1) alt_indel_2bp = alt_flanking_indel.count(2) + alt_indel_1bp - alt_indel_3bp = alt_flanking_indel.count(3) + alt_indel_2bp + alt_indel_1bp + alt_indel_3bp = alt_flanking_indel.count(3) + alt_indel_2bp consistent_mates = inconsistent_mates = 0 for pairs_i in qname_collector: From 757dfab3339d09ba29afbf112aa24aa38b7f6459 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 2 May 2020 23:07:54 -0700 Subject: [PATCH 16/89] improve efficiency in extend_features --- neusomatic/python/extend_features.py | 160 ++++++------- neusomatic/python/sequencing_features.py | 284 +++++++++-------------- 2 files changed, 192 insertions(+), 252 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 91c8359..5c813dc 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -35,17 +35,11 @@ def extract_features(candidate_record): var_id = "-".join([chrom, pos, ref, alt]) pos = int(pos) my_coordinate = [chrom, pos] - nBamFeatures = sequencing_features.from_bam( - nbam, my_coordinate, ref, alt, min_mapq, min_bq) - tBamFeatures = sequencing_features.from_bam( - tbam, my_coordinate, ref, alt, min_mapq, min_bq) + nBamFeatures = sequencing_features.AlignmentFeatures(nbam, my_coordinate, ref, alt, min_mapq, min_bq) + tBamFeatures = sequencing_features.AlignmentFeatures(tbam, my_coordinate, ref, alt, min_mapq, min_bq) - n_ref = nBamFeatures['ref_for'] + nBamFeatures['ref_rev'] - n_alt = nBamFeatures['alt_for'] + nBamFeatures['alt_rev'] - t_ref = tBamFeatures['ref_for'] + tBamFeatures['ref_rev'] - t_alt = tBamFeatures['alt_for'] + tBamFeatures['alt_rev'] - sor = sequencing_features.somaticOddRatio( - n_ref, n_alt, t_ref, t_alt) + sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, + tBamFeatures.nalt) homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( ref_fa, my_coordinate, ref, alt) @@ -58,12 +52,10 @@ def extract_features(candidate_record): region = "{}:{}-{}".format(chrom, pos, pos + 1) dbsnp_vars = {} for x in dbsnp_tb.fetch(region=region): - chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[ - 0:8] + chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[0:8] for alt_ in alts_.split(","): dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) - dbsnp_vars[ - dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 + dbsnp_vars[dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 if var_id in dbsnp_vars: if_dbsnp = 1 if_common = dbsnp_vars[var_id] @@ -77,83 +69,83 @@ def extract_features(candidate_record): COMMON = if_common if_COSMIC = if_cosmic 
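# The flanking-indel fix in the same patch above: the *_InDel_2bp counters are
# already cumulative (they include the 1bp hits), so the old extra
# "+ ref_indel_1bp" term double-counted. Worked numbers:
flank = [1, 1, 2, 3]            # nearest flanking indel per supporting read, bp
c1 = flank.count(1)             # 2 reads with an indel within 1bp
c2 = flank.count(2) + c1        # 3 reads within 2bp
c3 = flank.count(3) + c2        # 4 reads within 3bp (old formula gave 6)
assert (c1, c2, c3) == (2, 3, 4)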
COSMIC_CNT = num_cosmic_cases - Consistent_Mates = tBamFeatures['consistent_mates'] - Inconsistent_Mates = tBamFeatures['inconsistent_mates'] - N_DP = nBamFeatures['dp'] - nBAM_REF_MQ = '%g' % nBamFeatures['ref_mq'] - nBAM_ALT_MQ = '%g' % nBamFeatures['alt_mq'] - nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures['z_ranksums_mq'] - nBAM_REF_BQ = '%g' % nBamFeatures['ref_bq'] - nBAM_ALT_BQ = '%g' % nBamFeatures['alt_bq'] - nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures['z_ranksums_bq'] - nBAM_REF_NM = '%g' % nBamFeatures['ref_NM'] - nBAM_ALT_NM = '%g' % nBamFeatures['alt_NM'] - nBAM_NM_Diff = '%g' % nBamFeatures['NM_Diff'] - nBAM_REF_Concordant = nBamFeatures['ref_concordant_reads'] - nBAM_REF_Discordant = nBamFeatures['ref_discordant_reads'] - nBAM_ALT_Concordant = nBamFeatures['alt_concordant_reads'] - nBAM_ALT_Discordant = nBamFeatures['alt_discordant_reads'] + Consistent_Mates = tBamFeatures.consistent_mates + Inconsistent_Mates = tBamFeatures.inconsistent_mates + N_DP = nBamFeatures.dp + nBAM_REF_MQ = '%g' % nBamFeatures.ref_mq + nBAM_ALT_MQ = '%g' % nBamFeatures.alt_mq + nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures.z_ranksums_mq + nBAM_REF_BQ = '%g' % nBamFeatures.ref_bq + nBAM_ALT_BQ = '%g' % nBamFeatures.alt_bq + nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures.z_ranksums_bq + nBAM_REF_NM = '%g' % nBamFeatures.ref_NM + nBAM_ALT_NM = '%g' % nBamFeatures.alt_NM + nBAM_NM_Diff = '%g' % nBamFeatures.NM_Diff + nBAM_REF_Concordant = nBamFeatures.ref_concordant_reads + nBAM_REF_Discordant = nBamFeatures.ref_discordant_reads + nBAM_ALT_Concordant = nBamFeatures.alt_concordant_reads + nBAM_ALT_Discordant = nBamFeatures.alt_discordant_reads nBAM_Concordance_FET = rescale( - nBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) - N_REF_FOR = nBamFeatures['ref_for'] - N_REF_REV = nBamFeatures['ref_rev'] - N_ALT_FOR = nBamFeatures['alt_for'] - N_ALT_REV = nBamFeatures['alt_rev'] + nBamFeatures.concordance_fet, 'fraction', p_scale, 1001) + N_REF_FOR = nBamFeatures.ref_for + N_REF_REV = nBamFeatures.ref_rev + N_ALT_FOR = nBamFeatures.alt_for + N_ALT_REV = nBamFeatures.alt_rev nBAM_StrandBias_FET = rescale( - nBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) - nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures['z_ranksums_endpos'] - nBAM_REF_Clipped_Reads = nBamFeatures['ref_SC_reads'] - nBAM_ALT_Clipped_Reads = nBamFeatures['alt_SC_reads'] + nBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) + nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures.z_ranksums_endpos + nBAM_REF_Clipped_Reads = nBamFeatures.ref_SC_reads + nBAM_ALT_Clipped_Reads = nBamFeatures.alt_SC_reads nBAM_Clipping_FET = rescale( - nBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) - nBAM_MQ0 = nBamFeatures['MQ0'] - nBAM_Other_Reads = nBamFeatures['noise_read_count'] - nBAM_Poor_Reads = nBamFeatures['poor_read_count'] - nBAM_REF_InDel_3bp = nBamFeatures['ref_indel_3bp'] - nBAM_REF_InDel_2bp = nBamFeatures['ref_indel_2bp'] - nBAM_REF_InDel_1bp = nBamFeatures['ref_indel_1bp'] - nBAM_ALT_InDel_3bp = nBamFeatures['alt_indel_3bp'] - nBAM_ALT_InDel_2bp = nBamFeatures['alt_indel_2bp'] - nBAM_ALT_InDel_1bp = nBamFeatures['alt_indel_1bp'] + nBamFeatures.clipping_fet, 'fraction', p_scale, 1001) + nBAM_MQ0 = nBamFeatures.MQ0 + nBAM_Other_Reads = nBamFeatures.noise_read_count + nBAM_Poor_Reads = nBamFeatures.poor_read_count + nBAM_REF_InDel_3bp = nBamFeatures.ref_indel_3bp + nBAM_REF_InDel_2bp = nBamFeatures.ref_indel_2bp + nBAM_REF_InDel_1bp = nBamFeatures.ref_indel_1bp + nBAM_ALT_InDel_3bp = nBamFeatures.alt_indel_3bp + nBAM_ALT_InDel_2bp = 
nBamFeatures.alt_indel_2bp + nBAM_ALT_InDel_1bp = nBamFeatures.alt_indel_1bp SOR = sor MaxHomopolymer_Length = homopolymer_length SiteHomopolymer_Length = site_homopolymer_length - T_DP = tBamFeatures['dp'] - tBAM_REF_MQ = '%g' % tBamFeatures['ref_mq'] - tBAM_ALT_MQ = '%g' % tBamFeatures['alt_mq'] - tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures['z_ranksums_mq'] - tBAM_REF_BQ = '%g' % tBamFeatures['ref_bq'] - tBAM_ALT_BQ = '%g' % tBamFeatures['alt_bq'] - tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures['z_ranksums_bq'] - tBAM_REF_NM = '%g' % tBamFeatures['ref_NM'] - tBAM_ALT_NM = '%g' % tBamFeatures['alt_NM'] - tBAM_NM_Diff = '%g' % tBamFeatures['NM_Diff'] - tBAM_REF_Concordant = tBamFeatures['ref_concordant_reads'] - tBAM_REF_Discordant = tBamFeatures['ref_discordant_reads'] - tBAM_ALT_Concordant = tBamFeatures['alt_concordant_reads'] - tBAM_ALT_Discordant = tBamFeatures['alt_discordant_reads'] + T_DP = tBamFeatures.dp + tBAM_REF_MQ = '%g' % tBamFeatures.ref_mq + tBAM_ALT_MQ = '%g' % tBamFeatures.alt_mq + tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures.z_ranksums_mq + tBAM_REF_BQ = '%g' % tBamFeatures.ref_bq + tBAM_ALT_BQ = '%g' % tBamFeatures.alt_bq + tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures.z_ranksums_bq + tBAM_REF_NM = '%g' % tBamFeatures.ref_NM + tBAM_ALT_NM = '%g' % tBamFeatures.alt_NM + tBAM_NM_Diff = '%g' % tBamFeatures.NM_Diff + tBAM_REF_Concordant = tBamFeatures.ref_concordant_reads + tBAM_REF_Discordant = tBamFeatures.ref_discordant_reads + tBAM_ALT_Concordant = tBamFeatures.alt_concordant_reads + tBAM_ALT_Discordant = tBamFeatures.alt_discordant_reads tBAM_Concordance_FET = rescale( - tBamFeatures['concordance_fet'], 'fraction', p_scale, 1001) - T_REF_FOR = tBamFeatures['ref_for'] - T_REF_REV = tBamFeatures['ref_rev'] - T_ALT_FOR = tBamFeatures['alt_for'] - T_ALT_REV = tBamFeatures['alt_rev'] + tBamFeatures.concordance_fet, 'fraction', p_scale, 1001) + T_REF_FOR = tBamFeatures.ref_for + T_REF_REV = tBamFeatures.ref_rev + T_ALT_FOR = tBamFeatures.alt_for + T_ALT_REV = tBamFeatures.alt_rev tBAM_StrandBias_FET = rescale( - tBamFeatures['strandbias_fet'], 'fraction', p_scale, 1001) - tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures['z_ranksums_endpos'] - tBAM_REF_Clipped_Reads = tBamFeatures['ref_SC_reads'] - tBAM_ALT_Clipped_Reads = tBamFeatures['alt_SC_reads'] + tBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) + tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures.z_ranksums_endpos + tBAM_REF_Clipped_Reads = tBamFeatures.ref_SC_reads + tBAM_ALT_Clipped_Reads = tBamFeatures.alt_SC_reads tBAM_Clipping_FET = rescale( - tBamFeatures['clipping_fet'], 'fraction', p_scale, 1001) - tBAM_MQ0 = tBamFeatures['MQ0'] - tBAM_Other_Reads = tBamFeatures['noise_read_count'] - tBAM_Poor_Reads = tBamFeatures['poor_read_count'] - tBAM_REF_InDel_3bp = tBamFeatures['ref_indel_3bp'] - tBAM_REF_InDel_2bp = tBamFeatures['ref_indel_2bp'] - tBAM_REF_InDel_1bp = tBamFeatures['ref_indel_1bp'] - tBAM_ALT_InDel_3bp = tBamFeatures['alt_indel_3bp'] - tBAM_ALT_InDel_2bp = tBamFeatures['alt_indel_2bp'] - tBAM_ALT_InDel_1bp = tBamFeatures['alt_indel_1bp'] + tBamFeatures.clipping_fet, 'fraction', p_scale, 1001) + tBAM_MQ0 = tBamFeatures.MQ0 + tBAM_Other_Reads = tBamFeatures.noise_read_count + tBAM_Poor_Reads = tBamFeatures.poor_read_count + tBAM_REF_InDel_3bp = tBamFeatures.ref_indel_3bp + tBAM_REF_InDel_2bp = tBamFeatures.ref_indel_2bp + tBAM_REF_InDel_1bp = tBamFeatures.ref_indel_1bp + tBAM_ALT_InDel_3bp = tBamFeatures.alt_indel_3bp + tBAM_ALT_InDel_2bp = tBamFeatures.alt_indel_2bp + tBAM_ALT_InDel_1bp = tBamFeatures.alt_indel_1bp 
InDel_Length = indel_length ext_features.append([CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, @@ -295,11 +287,11 @@ def extend_features(candidates_vcf, # ext_features=[] # for w in map_args: # ext_features.append(extract_features(w)) + ext_features = pool.map_async(extract_features, map_args).get() pool.close() with open(output_tsv, "w") as o_f: - o_f.write( - "\t".join(header) + "\n") + o_f.write("\t".join(header) + "\n") for features in ext_features: for w in features: o_f.write( diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index b9adfcc..61723c0 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -7,42 +7,40 @@ import scipy.stats as stats import genomic_file_handlers as genome from read_info_extractor import * +from collections import defaultdict nan = float('nan') -def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): - ''' - bam is the opened file handle of bam file - my_coordiate is a list or tuple of 0-based (contig, position) - ''' - - indel_length = len(first_alt) - len(ref_base) - reads = bam.fetch(my_coordinate[0], my_coordinate[1] - 1, my_coordinate[1]) +class AlignmentFeatures: + def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + ''' + bam is the opened file handle of bam file + my_coordiate is a list or tuple of 0-based (contig, position) + ''' - ref_read_mq = [] - alt_read_mq = [] - ref_read_bq = [] - alt_read_bq = [] - ref_edit_distance = [] - alt_edit_distance = [] + indel_length = len(first_alt) - len(ref_base) + reads = bam.fetch(my_coordinate[0], my_coordinate[1] - 1, my_coordinate[1]) - ref_concordant_reads = alt_concordant_reads = ref_discordant_reads = alt_discordant_reads = 0 - ref_for = ref_rev = alt_for = alt_rev = dp = 0 - ref_SC_reads = alt_SC_reads = ref_notSC_reads = alt_notSC_reads = 0 - MQ0 = 0 + # index 0 for ref, 1 for alt + read_mq = [[], []] + read_bq = [[], []] + edit_distance = [[], []] + flanking_indel = [[], []] + pos_from_end = [[], []] + concordance_counts = [[0, 0], [0, 0]] + orientation_counts = [[0, 0], [0, 0]] + soft_clip_counts = [[0, 0], [0, 0]] + dp = 0 + MQ0 = 0 - ref_pos_from_end = [] - alt_pos_from_end = [] - ref_flanking_indel = [] - alt_flanking_indel = [] + noise_read_count = poor_read_count = 0 - noise_read_count = poor_read_count = 0 + qname_collector = defaultdict(list) - qname_collector = {} - - for read_i in reads: - if not read_i.is_unmapped and dedup_test(read_i): + for read_i in reads: + if read_i.is_unmapped or not dedup_test(read_i): + continue dp += 1 @@ -55,146 +53,98 @@ def from_bam(bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): if read_i.mapping_quality == 0: MQ0 += 1 - # Reference calls: - if code_i == 1 and base_call_i == ref_base[0]: - - try: - qname_collector[read_i.qname].append(0) - except KeyError: - qname_collector[read_i.qname] = [0] - - ref_read_mq.append(read_i.mapping_quality) - ref_read_bq.append(read_i.query_qualities[ith_base]) - - try: - ref_edit_distance.append(read_i.get_tag('NM')) - except KeyError: - pass - - # Concordance - if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - ref_concordant_reads += 1 - elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - ref_discordant_reads += 1 - - # Orientation - if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and 
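# Call-site sketch for the AlignmentFeatures class introduced in this hunk;
# the BAM path and locus below are placeholders, not values from the patch:
import pysam
import sequencing_features

tbam = pysam.AlignmentFile("tumor.bam")            # indexed BAM assumed
feats = sequencing_features.AlignmentFeatures(
    tbam, ["chr1", 12345], "A", "T", min_mq=20, min_bq=10)
print(feats.dp, feats.nalt, feats.strandbias_fet)  # attributes replace vars() keys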
read_i.query_qualities[ith_base] >= min_bq: - ref_for += 1 - elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - ref_rev += 1 - - # Soft-clipped reads? - if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: - ref_SC_reads += 1 - else: - ref_notSC_reads += 1 - - # Distance from the end of the read: - if ith_base != None: - ref_pos_from_end.append( - min(ith_base, read_i.query_length - ith_base)) - - # Flanking indels: - ref_flanking_indel.append(flanking_indel_i) - - # Alternate calls: - # SNV, or Deletion, or Insertion where I do not check for matching - # indel length - elif (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or \ - (indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or \ - (indel_length > 0 and code_i == 3): - - try: - qname_collector[read_i.qname].append(1) - except KeyError: - qname_collector[read_i.qname] = [1] - - alt_read_mq.append(read_i.mapping_quality) - alt_read_bq.append(read_i.query_qualities[ith_base]) - - try: - alt_edit_distance.append(read_i.get_tag('NM')) - except KeyError: - pass - - # Concordance - if read_i.is_proper_pair and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - alt_concordant_reads += 1 - elif (not read_i.is_proper_pair) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - alt_discordant_reads += 1 - - # Orientation - if (not read_i.is_reverse) and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - alt_for += 1 - elif read_i.is_reverse and read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - alt_rev += 1 - - # Soft-clipped reads? - if read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip: - alt_SC_reads += 1 - else: - alt_notSC_reads += 1 - - # Distance from the end of the read: - if ith_base != None: - alt_pos_from_end.append( - min(ith_base, read_i.query_length - ith_base)) - - # Flanking indels: - alt_flanking_indel.append(flanking_indel_i) - - # Inconsistent read or 2nd alternate calls: - else: - - try: - qname_collector[read_i.qname].append(2) - except KeyError: - qname_collector[read_i.qname] = [2] + is_ref_call = code_i == 1 and base_call_i == ref_base[0] + is_alt_call = (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or ( + indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or ( + indel_length > 0 and code_i == 3) + # inconsistent read or second alternate calls + if not (is_ref_call or is_alt_call): + qname_collector[read_i.qname].append(2) noise_read_count += 1 - - # Done extracting info from tumor BAM. 
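# The positional 2x2 tables built above feed fisher_exact exactly as the old
# explicit tuples did: rows {concordant, discordant} (likewise {for, rev} and
# {not-SC, SC}), columns {ref, alt}. Toy equivalence check:
import scipy.stats as stats

concordance_counts = [[37, 9], [4, 6]]   # [[ref_conc, alt_conc], [ref_disc, alt_disc]]
assert stats.fisher_exact(((37, 9), (4, 6)))[1] == stats.fisher_exact(concordance_counts)[1]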
Now tally them: - ref_mq = mean(ref_read_mq) - alt_mq = mean(alt_read_mq) - z_ranksums_mq = stats.ranksums(alt_read_mq, ref_read_mq)[0] - - ref_bq = mean(ref_read_bq) - alt_bq = mean(alt_read_bq) - z_ranksums_bq = stats.ranksums(alt_read_bq, ref_read_bq)[0] - - ref_NM = mean(ref_edit_distance) - alt_NM = mean(alt_edit_distance) - z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0] - NM_Diff = alt_NM - ref_NM - abs(indel_length) - - concordance_fet = stats.fisher_exact( - ((ref_concordant_reads, alt_concordant_reads), (ref_discordant_reads, alt_discordant_reads)))[1] - strandbias_fet = stats.fisher_exact( - ((ref_for, alt_for), (ref_rev, alt_rev)))[1] - clipping_fet = stats.fisher_exact( - ((ref_notSC_reads, alt_notSC_reads), (ref_SC_reads, alt_SC_reads)))[1] - - z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] - - ref_indel_1bp = ref_flanking_indel.count(1) - ref_indel_2bp = ref_flanking_indel.count(2) + ref_indel_1bp - ref_indel_3bp = ref_flanking_indel.count(3) + ref_indel_2bp - alt_indel_1bp = alt_flanking_indel.count(1) - alt_indel_2bp = alt_flanking_indel.count(2) + alt_indel_1bp - alt_indel_3bp = alt_flanking_indel.count(3) + alt_indel_2bp - - consistent_mates = inconsistent_mates = 0 - for pairs_i in qname_collector: - - # Both are alternative calls: - if qname_collector[pairs_i] == [1, 1]: - consistent_mates += 1 - - # One is alternate call but the other one is not: - elif len(qname_collector[pairs_i]) == 2 and 1 in qname_collector[pairs_i]: - inconsistent_mates += 1 - - return vars() + continue + + index = 1 if is_alt_call else 0 + + qname_collector[read_i.qname].append(index) + + read_mq[index].append(read_i.mapping_quality) + read_bq[index].append(read_i.query_qualities[ith_base]) + + try: + edit_distance[index].append(read_i.get_tag('NM')) + except KeyError: + pass + + if read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + concordance_counts[0 if read_i.is_proper_pair else 1][index] += 1 + orientation_counts[1 if read_i.is_reverse else 0][index] += 1 + + is_soft_clipped = read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip + soft_clip_counts[1 if is_soft_clipped else 0][index] += 1 + + # Distance from the end of the read: + if ith_base is not None: + pos_from_end[index].append(min(ith_base, read_i.query_length - ith_base)) + + flanking_indel[index].append(flanking_indel_i) + + # unpack to get the ref and alt values + ref_pos_from_end, alt_pos_from_end = pos_from_end + self.ref_concordant_reads, self.alt_concordant_reads = concordance_counts[0] + self.ref_discordant_reads, self.alt_discordant_reads = concordance_counts[1] + self.ref_for, self.alt_for = orientation_counts[0] + self.ref_rev, self.alt_rev = orientation_counts[1] + self.ref_notSC_reads, self.alt_notSC_reads = soft_clip_counts[0] + self.ref_SC_reads, self.alt_SC_reads = soft_clip_counts[1] + + # Done extracting info from BAM. 
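# The qname tally later in this hunk collapses to: only pairs with both mates
# seen at the locus vote on mate consistency (calls: 0 = ref, 1 = alt,
# 2 = noise). Toy run of that logic:
from collections import defaultdict

qname_collector = defaultdict(list)
qname_collector.update({"r1": [1, 1],    # both mates alt     -> consistent
                        "r2": [1, 0],    # alt + non-alt mate -> inconsistent
                        "r3": [1],       # mate absent        -> ignored
                        "r4": [0, 0]})   # no alt support     -> ignored
consistent = inconsistent = 0
for one_count in map(lambda x: x.count(1),
                     filter(lambda y: len(y) == 2, qname_collector.values())):
    if one_count == 2:
        consistent += 1
    elif one_count == 1:
        inconsistent += 1
assert (consistent, inconsistent) == (1, 1)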
Now tally them: + ref_read_mq, alt_read_mq = read_mq + self.ref_mq = mean(ref_read_mq) + self.alt_mq = mean(alt_read_mq) + self.z_ranksums_mq = stats.ranksums(alt_read_mq, ref_read_mq)[0] + + ref_read_bq, alt_read_bq = read_bq + self.ref_bq = mean(ref_read_bq) + self.alt_bq = mean(alt_read_bq) + self.z_ranksums_bq = stats.ranksums(alt_read_bq, ref_read_bq)[0] + + ref_edit_distance, alt_edit_distance = edit_distance + self.ref_NM = mean(ref_edit_distance) + self.alt_NM = mean(alt_edit_distance) + self.z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0] + self.NM_Diff = self.alt_NM - self.ref_NM - abs(indel_length) + + self.concordance_fet = stats.fisher_exact(concordance_counts)[1] + self.strandbias_fet = stats.fisher_exact(orientation_counts)[1] + self.clipping_fet = stats.fisher_exact(soft_clip_counts)[1] + + self.z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] + + ref_flanking_indel, alt_flanking_indel = flanking_indel + self.ref_indel_1bp = ref_flanking_indel.count(1) + self.ref_indel_2bp = ref_flanking_indel.count(2) + self.ref_indel_1bp + self.ref_indel_3bp = ref_flanking_indel.count(3) + self.ref_indel_2bp + self.alt_indel_1bp = alt_flanking_indel.count(1) + self.alt_indel_2bp = alt_flanking_indel.count(2) + self.alt_indel_1bp + self.alt_indel_3bp = alt_flanking_indel.count(3) + self.alt_indel_2bp + + self.consistent_mates = self.inconsistent_mates = 0 + for one_count in map(lambda x: x.count(1), filter(lambda y: len(y) == 2, qname_collector.values())): + # Both are alternative calls: + if one_count == 2: + self.consistent_mates += 1 + + # One is alternate call but the other one is not: + elif one_count == 1: + self.inconsistent_mates += 1 + + self.nref = self.ref_for + self.ref_rev + self.nalt = self.alt_for + self.alt_rev + self.dp = dp + self.MQ0 = MQ0 + self.noise_read_count = noise_read_count + self.poor_read_count = poor_read_count def from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt): @@ -266,8 +216,6 @@ def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100): elif sor_denominator == 0: sor = max_value else: - sor = sor_numerator / sor_denominator - if sor >= max_value: - sor = max_value + sor = min(sor_numerator / sor_denominator, max_value) return sor From 776ddcee0d9593a5f4ce78644a32f33626aa57a0 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sun, 3 May 2020 19:28:29 -0700 Subject: [PATCH 17/89] fix for extend_features --- neusomatic/python/extend_features.py | 6 +++++- neusomatic/python/generate_dataset.py | 2 +- neusomatic/python/preprocess.py | 13 +++++++++++-- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 5c813dc..ee40526 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -249,6 +249,7 @@ def extend_features(candidates_vcf, i = 0 batch = [] for line in skip_empty(i_f): + i += 1 chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] var_id = "-".join([chrom, pos, ref, alt]) if exclude_variants: @@ -260,11 +261,14 @@ def extend_features(candidates_vcf, if_cosmic = 1 num_cosmic_cases = cosmic_vars[var_id] batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]) - i += 1 if len(batch) >= split_len or i == n_variants: map_args.append((reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch)) batch = [] + if batch: + map_args.append((reference, tumor_bam, normal_bam, + min_mapq, min_bq, dbsnp, tumor_only, batch)) + logger.info("Number of batches: 
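# The batching fix above (count every record as it is read, then flush the
# remainder after the loop) as a standalone generator sketch:
def chunk(records, split_len):
    batch = []
    for rec in records:
        batch.append(rec)
        if len(batch) >= split_len:
            yield batch
            batch = []
    if batch:                      # the remainder the old loop could drop
        yield batch

assert [len(b) for b in chunk(range(10), 4)] == [4, 4, 2]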
{}".format(len(map_args))) header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT", diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 68cfde6..9ef58d0 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1388,7 +1388,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend): n_vars += 1 if n_vars > 0: ensemble_data = np.array(ensemble_data)[:, order_header] - header = np.array(header)[order_header].tolist() + header = np.array(header_)[order_header].tolist() cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ "Consistent_Mates", "Inconsistent_Mates", "N_DP", diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 8cd40bb..32ec235 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -94,7 +94,7 @@ def get_ensemble_region(record): region, reference + ".fai", args=" -b {}".format(matrix_base_pad + 3), run_logger=thread_logger) bedtools_intersect( - ensemble_bed, ensemble_bed_region_file_tmp, args=" -u", + ensemble_bed, ensemble_bed_region_file_tmp, args=" -u -header", output_fn=ensemble_bed_region_file, run_logger=thread_logger) return ensemble_bed_region_file @@ -349,17 +349,26 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_dataset_split, "merged_features.bed") if not os.path.exists(merged_features_bed) or restart: exclude_ens_variants = [] + header_line = "" with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: for line in skip_empty(i_f_1, skip_header=False): if line.startswith("#"): o_f.write(line) + if not header_line: + header_line = line + else: + assert(header_line == line) continue chrom, pos, _, ref, alt = line.strip().split("\t")[ 0:5] var_id = "-".join([chrom, pos, ref, alt]) exclude_ens_variants.append(var_id) o_f.write(line) - for line in skip_empty(i_f_2): + for line in skip_empty(i_f_2, skip_header=False): + if line.startswith("#"): + if header_line: + assert(header_line == line) + continue chrom, pos, _, ref, alt = line.strip().split("\t")[ 0:5] var_id = "-".join([chrom, pos, ref, alt]) From 15ff4a8061d466b89a60a3945a526da5a178cfe9 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 5 May 2020 12:10:39 -0700 Subject: [PATCH 18/89] added seq_complexity --- neusomatic/python/call.py | 59 ++++++++----- neusomatic/python/extend_features.py | 107 +++++++++++++++-------- neusomatic/python/generate_dataset.py | 82 ++++++++++------- neusomatic/python/preprocess.py | 15 +++- neusomatic/python/sequencing_features.py | 58 ++++++++++++ neusomatic/python/train.py | 77 ++++++++++------ 6 files changed, 275 insertions(+), 123 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index b6eb055..3a8229a 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -412,13 +412,38 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, data_transform = matrix_transform((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) + logger.info("Load pretrained model from checkpoint {}".format(checkpoint)) + pretrained_dict = torch.load( + checkpoint, map_location=lambda storage, loc: storage) + pretrained_state_dict = pretrained_dict["state_dict"] + model_tag = pretrained_dict["tag"] + logger.info("tag: {}".format(model_tag)) + coverage_thr = pretrained_dict["coverage_thr"] + if "normalize_channels" in pretrained_dict: + normalize_channels = 
pretrained_dict["normalize_channels"] + else: + normalize_channels = False + if "seq_complexity" in pretrained_dict: + seq_complexity = pretrained_dict["seq_complexity"] + else: + seq_complexity = False + + logger.info("coverage_thr: {}".format(coverage_thr)) + logger.info("normalize_channels: {}".format(normalize_channels)) + logger.info("seq_complexity: {}".format(seq_complexity)) + + + num_expected_ensemble = NUM_ENS_FEATURES + if seq_complexity: + num_expected_ensemble += 2 ensemble = False with open(candidates_tsv[0]) as i_f: x = i_f.readline().strip().split() - if len(x) == NUM_ENS_FEATURES + 4: + if len(x) == num_expected_ensemble + 4: ensemble = True + num_channels = num_expected_ensemble + \ + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES - num_channels = NUM_ENS_FEATURES + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: @@ -431,26 +456,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("We use {} GPUs!".format(torch.cuda.device_count())) net = nn.DataParallel(net) - if not os.path.exists(out_dir): - os.mkdir(out_dir) - logger.info("Load pretrained model from checkpoint {}".format(checkpoint)) - pretrained_dict = torch.load( - checkpoint, map_location=lambda storage, loc: storage) - pretrained_state_dict = pretrained_dict["state_dict"] - model_tag = pretrained_dict["tag"] - logger.info("tag: {}".format(model_tag)) - - matrices_dir = "{}/matrices_{}".format(out_dir, model_tag) - if os.path.exists(matrices_dir): - logger.warning("Remove matrices directory: {}".format(matrices_dir)) - shutil.rmtree(matrices_dir) - os.mkdir(matrices_dir) - coverage_thr = pretrained_dict["coverage_thr"] - if "normalize_channels" in pretrained_dict: - normalize_channels = pretrained_dict["normalize_channels"] - else: - normalize_channels = False - model_dict = net.state_dict() # 1. filter out unnecessary keys @@ -472,6 +477,16 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, # 3. 
load the new state dict net.load_state_dict(pretrained_state_dict) + + if not os.path.exists(out_dir): + os.mkdir(out_dir) + matrices_dir = "{}/matrices_{}".format(out_dir, model_tag) + if os.path.exists(matrices_dir): + logger.warning("Remove matrices directory: {}".format(matrices_dir)) + shutil.rmtree(matrices_dir) + os.mkdir(matrices_dir) + + new_split_tsvs_dir = os.path.join(out_dir, "split_tsvs") if os.path.exists(new_split_tsvs_dir): logger.warning( diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index ee40526..f440bda 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -20,7 +20,7 @@ def extract_features(candidate_record): - reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, batch = candidate_record + reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, seq_complexity, batch = candidate_record thread_logger = logging.getLogger( "{} ({})".format(extract_features.__name__, multiprocessing.current_process().name)) try: @@ -35,8 +35,10 @@ def extract_features(candidate_record): var_id = "-".join([chrom, pos, ref, alt]) pos = int(pos) my_coordinate = [chrom, pos] - nBamFeatures = sequencing_features.AlignmentFeatures(nbam, my_coordinate, ref, alt, min_mapq, min_bq) - tBamFeatures = sequencing_features.AlignmentFeatures(tbam, my_coordinate, ref, alt, min_mapq, min_bq) + nBamFeatures = sequencing_features.AlignmentFeatures( + nbam, my_coordinate, ref, alt, min_mapq, min_bq) + tBamFeatures = sequencing_features.AlignmentFeatures( + tbam, my_coordinate, ref, alt, min_mapq, min_bq) sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, tBamFeatures.nalt) @@ -46,16 +48,31 @@ def extract_features(candidate_record): indel_length = len(alt) - len(ref) + if seq_complexity: + seq_span_80bp = ref_fa.fetch(my_coordinate[0], max( + 0, my_coordinate[1] - 41), my_coordinate[1] + 40) + seq_left_80bp = ref_fa.fetch(my_coordinate[0], max( + 0, my_coordinate[1] - 81), my_coordinate[1]) + seq_right_80bp = ref_fa.fetch(my_coordinate[0], my_coordinate[ + 1], my_coordinate[1] + 81) + LC_spanning = sequencing_features.LC(seq_span_80bp) + LC_adjacent = min(sequencing_features.LC( + seq_left_80bp), sequencing_features.LC(seq_right_80bp)) + LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40) + LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40) + if_dbsnp = 0 if_common = 0 if dbsnp: region = "{}:{}-{}".format(chrom, pos, pos + 1) dbsnp_vars = {} for x in dbsnp_tb.fetch(region=region): - chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[0:8] + chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[ + 0:8] for alt_ in alts_.split(","): dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) - dbsnp_vars[dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 + dbsnp_vars[ + dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 if var_id in dbsnp_vars: if_dbsnp = 1 if_common = dbsnp_vars[var_id] @@ -71,6 +88,10 @@ def extract_features(candidate_record): COSMIC_CNT = num_cosmic_cases Consistent_Mates = tBamFeatures.consistent_mates Inconsistent_Mates = tBamFeatures.inconsistent_mates + if seq_complexity: + Seq_Complexity_Span = LC_spanning_phred + Seq_Complexity_Adj = LC_adjacent_phred + N_DP = nBamFeatures.dp nBAM_REF_MQ = '%g' % nBamFeatures.ref_mq nBAM_ALT_MQ = '%g' % nBamFeatures.alt_mq @@ -148,21 +169,26 @@ def extract_features(candidate_record): tBAM_ALT_InDel_1bp = tBamFeatures.alt_indel_1bp InDel_Length = indel_length - ext_features.append([CHROM, POS, 
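# LC() used above is added to sequencing_features elsewhere in this patch; a
# minimal stand-in, assuming the usual linguistic-complexity definition
# (distinct substrings observed over the maximum possible). p2phred(1 - LC, 40)
# then maps high complexity to a high, capped score:
def linguistic_complexity(seq):
    seq = seq.upper()
    observed = sum(len({seq[i:i + k] for i in range(len(seq) - k + 1)})
                   for k in range(1, len(seq) + 1))
    possible = sum(min(4 ** k, len(seq) - k + 1)
                   for k in range(1, len(seq) + 1))
    return observed / possible

assert linguistic_complexity("AAAAAAAA") < linguistic_complexity("ACGTACGT")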
".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, - Consistent_Mates, Inconsistent_Mates, N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, - nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, - nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, - nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, - nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, - nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, - nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, - MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, - tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, - tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, - tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, - tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, - tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, - tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) + features = [CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, + Consistent_Mates, Inconsistent_Mates] + if seq_complexity: + features.extend([Seq_Complexity_Span, Seq_Complexity_Adj]) + features.extend([N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, + nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, + nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, + nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, + nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, + nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, + nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, + MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, + tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, + tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, + tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, + tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, + tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, + tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) + + ext_features.append(features) return ext_features except Exception as ex: @@ -177,6 +203,7 @@ def extend_features(candidates_vcf, reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, cosmic, + seq_complexity, num_threads): logger = logging.getLogger(extend_features.__name__) @@ -263,29 +290,31 @@ def extend_features(candidates_vcf, batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]) if len(batch) >= split_len or i == n_variants: map_args.append((reference, tumor_bam, normal_bam, - min_mapq, min_bq, dbsnp, batch)) + min_mapq, min_bq, dbsnp, seq_complexity, batch)) batch = [] if batch: map_args.append((reference, tumor_bam, normal_bam, - min_mapq, min_bq, dbsnp, tumor_only, batch)) - + min_mapq, min_bq, dbsnp, seq_complexity, batch)) logger.info("Number of 
batches: {}".format(len(map_args)))
     header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
-              "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ", "nBAM_Z_Ranksums_MQ",
-              "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_NM_Diff",
-              "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant",
-              "nBAM_Concordance_FET", "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_StrandBias_FET",
-              "nBAM_Z_Ranksums_EndPos", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET",
-              "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp",
-              "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp", "SOR",
-              "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ", "tBAM_Z_Ranksums_MQ",
-              "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_NM_Diff",
-              "tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant",
-              "tBAM_Concordance_FET", "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET",
-              "tBAM_Z_Ranksums_EndPos", "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_Clipping_FET",
-              "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp",
-              "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"]
+              "Consistent_Mates", "Inconsistent_Mates"]
+    if seq_complexity:
+        header.extend(["Seq_Complexity_Span", "Seq_Complexity_Adj"])
+    header.extend(["N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ", "nBAM_Z_Ranksums_MQ",
+                   "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_NM_Diff",
+                   "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant",
+                   "nBAM_Concordance_FET", "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_StrandBias_FET",
+                   "nBAM_Z_Ranksums_EndPos", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET",
+                   "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp",
+                   "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp", "SOR",
+                   "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ", "tBAM_Z_Ranksums_MQ",
+                   "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_NM_Diff",
+                   "tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant",
+                   "tBAM_Concordance_FET", "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET",
+                   "tBAM_Z_Ranksums_EndPos", "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_Clipping_FET",
+                   "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp",
+                   "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"])
     try:
         # ext_features=[]
@@ -337,6 +366,9 @@ def extend_features(candidates_vcf,
                         help='dbSNP vcf (to annotate candidate variants)', default=None)
     parser.add_argument('--cosmic', type=str,
                         help='COSMIC vcf (to annotate candidate variants)', default=None)
+    parser.add_argument('--seq_complexity',
+                        help='Compute linguistic sequence complexity features',
+                        action="store_true")
     parser.add_argument('--num_threads', type=int,
                         help='number of threads', default=1)
     args = parser.parse_args()
@@ -349,6 +381,7 @@ def extend_features(candidates_vcf,
                                  args.reference, args.tumor_bam, args.normal_bam,
                                  args.min_mapq, args.min_bq,
                                  args.dbsnp, args.cosmic,
+                                 args.seq_complexity,
                                  args.num_threads,
                                  )
     if output is None:
diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 9ef58d0..19eb077 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -826,7 +826,7 @@ def find_len(ref, alt):


 def find_records(input_record):
-    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, work_index = input_record
+    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, seq_complexity, work_index = input_record
     thread_logger = logging.getLogger(
         "{} ({})".format(find_records.__name__, multiprocessing.current_process().name))
     try:
@@ -848,6 +848,9 @@ def find_records(input_record):
         split_in_ensemble_bed = os.path.join(
             work, "in_ensemble_{}.bed".format(work_index))

+        num_ens_features = NUM_ENS_FEATURES
+        if seq_complexity:
+            num_ens_features += 2
         bedtools_intersect(
             truth_vcf_file, split_bed, args=" -u", output_fn=split_truth_vcf_file, run_logger=thread_logger)
         bedtools_intersect(
@@ -897,7 +900,7 @@ def find_records(input_record):
                         r_ = [[chrom, pos, ref, alt]]
                     for rr in r_:
                         records.append(rr + [str(i)])
-                        anns[i] = [0] * NUM_ENS_FEATURES
+                        anns[i] = [0] * num_ens_features
                         i += 1
                 curren_pos_records = []
@@ -933,7 +936,7 @@ def find_records(input_record):
                     else:
                         r_ = [[chrom, pos, ref, alt]]

-                    ann = [0] * NUM_ENS_FEATURES
+                    ann = [0] * num_ens_features
                     if pos == ens_pos:
                         if ref == ens_ref and alt == ens_alt:
                             ann = record_[15:]
@@ -1323,7 +1326,7 @@ def find_records(input_record):
         return None


-def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):
+def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, is_extend):
     logger = logging.getLogger(extract_ensemble.__name__)
     ensemble_data = []
     ensemble_pos = []
@@ -1335,24 +1338,28 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):
                          "if_SomaticSniper", "if_VarDict", "MuSE_Tier", "if_LoFreq", "if_Scalpel", "if_Strelka",
                          "if_TNscope", "Strelka_Score", "Strelka_QSS", "Strelka_TQSS", "VarScan2_Score",
                          "SNVMix2_Score", "Sniper_Score", "VarDict_Score", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
-                         "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ",
-                         "nBAM_Z_Ranksums_MQ", "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM",
-                         "nBAM_ALT_NM", "nBAM_NM_Diff", "nBAM_REF_Concordant", "nBAM_REF_Discordant",
-                         "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", "nBAM_Concordance_FET", "N_REF_FOR", "N_REF_REV",
-                         "N_ALT_FOR", "N_ALT_REV", "nBAM_StrandBias_FET", "nBAM_Z_Ranksums_EndPos",
-                         "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET", "nBAM_MQ0",
-                         "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp",
-                         "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp",
-                         "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "SOR", "MSI", "MSILEN", "SHIFT3",
-                         "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ",
-                         "tBAM_Z_Ranksums_MQ", "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM",
-                         "tBAM_ALT_NM", "tBAM_NM_Diff", "tBAM_REF_Concordant", "tBAM_REF_Discordant",
-                         "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", "tBAM_Concordance_FET", "T_REF_FOR",
-                         "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET", "tBAM_Z_Ranksums_EndPos",
-                         "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_Clipping_FET", "tBAM_MQ0",
-                         "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp",
-                         "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp",
-                         "InDel_Length"]
+                         "Consistent_Mates", "Inconsistent_Mates"]
+    if seq_complexity:
+        expected_features += ["Seq_Complexity_Span", "Seq_Complexity_Adj"]
+
+    expected_features += ["N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ",
+                          "nBAM_Z_Ranksums_MQ", "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM",
+                          "nBAM_ALT_NM", "nBAM_NM_Diff", "nBAM_REF_Concordant", "nBAM_REF_Discordant",
+                          "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", "nBAM_Concordance_FET", "N_REF_FOR", "N_REF_REV",
+                          "N_ALT_FOR", "N_ALT_REV", "nBAM_StrandBias_FET", "nBAM_Z_Ranksums_EndPos",
+                          "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET", "nBAM_MQ0",
+                          "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp",
+                          "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp",
+                          "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "SOR", "MSI", "MSILEN", "SHIFT3",
+                          "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ",
+                          "tBAM_Z_Ranksums_MQ", "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM",
+                          "tBAM_ALT_NM", "tBAM_NM_Diff", "tBAM_REF_Concordant", "tBAM_REF_Discordant",
+                          "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", "tBAM_Concordance_FET", "T_REF_FOR",
+                          "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET", "tBAM_Z_Ranksums_EndPos",
+                          "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", "tBAM_Clipping_FET", "tBAM_MQ0",
+                          "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp",
+                          "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp",
+                          "InDel_Length"]
     callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier",
                         "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS",
                         "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score",
@@ -1370,8 +1377,9 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):
                 lambda x: x[1] in expected_features, enumerate(header_)))
             header = list(map(lambda x: x[1], header_en))
             if set(expected_features) - set(header):
-                logger.error("The following features are missing from ensemble file: {}".format(
-                    list(set(expected_features) - set(header))))
+                logger.error("The following features are missing from ensemble file {}: {}".format(
+                    ensemble_tsv,
+                    list(set(expected_features) - set(header))))
                 raise Exception
             order_header = []
             for f in expected_features:
@@ -1443,6 +1451,8 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):
         lambda x: x[1] in ["SiteHomopolymer_Length"], enumerate(header))))
     InDel_Length = list(map(lambda x: x[0], filter(
         lambda x: x[1] in ["InDel_Length"], enumerate(header))))
+    Seq_Complexity_ = list(map(lambda x: x[0], filter(
+        lambda x: x[1] in ["Seq_Complexity_Span", "Seq_Complexity_Adj"], enumerate(header))))

     min_max_features = [[cov_features, 0, 2 * COV],
                         [mq_features, 0, 70],
@@ -1466,14 +1476,18 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):
                         [SiteHomopolymer_Length, 0, 50],
                         [InDel_Length, -30, 30],
                         ]
+    if seq_complexity:
+        min_max_features.append([Seq_Complexity_, 0, 40])
+
     selected_features = sorted([i for f in min_max_features for i in f[0]])
     selected_features_tags = list(map(lambda x: header[x], selected_features))
     if n_vars > 0:
         for i_s, mn, mx in min_max_features:
-            s = ensemble_data[:, np.array(i_s)]
-            s = np.maximum(np.minimum(s, mx), mn)
-            s = (s - mn) / (mx - mn)
-            ensemble_data[:, np.array(i_s)] = s
+            if i_s:
+                s = ensemble_data[:, np.array(i_s)]
+                s = np.maximum(np.minimum(s, mx), mn)
+                s = (s - mn) / (mx - mn)
+                ensemble_data[:, np.array(i_s)] = s
         ensemble_data = ensemble_data[:, selected_features]
         ensemble_data = ensemble_data.tolist()
     with open(ensemble_bed, "w")as f_:
@@ -1486,7 +1500,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, is_extend):

 def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed,
                      normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                      min_cov, num_threads, ensemble_tsv,
-                     ensemble_bed, tsv_batch_size):
+                     ensemble_bed, seq_complexity, tsv_batch_size):
     logger = logging.getLogger(generate_dataset.__name__)

     logger.info("---------------------Generate Dataset----------------------")
@@ -1513,7 +1527,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     split_batch_size = 10000
     if ensemble_tsv and not ensemble_bed:
         ensemble_bed = os.path.join(work, "ensemble.bed")
-        extract_ensemble(ensemble_tsv, ensemble_bed, False)
+        extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, False)

     tmp_ = bedtools_intersect(
         tumor_pred_vcf_file, region_bed_file, args=" -u", run_logger=logger)
@@ -1540,7 +1554,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     map_args = []
     for i, split_region_file in enumerate(split_region_files):
         map_args.append((work, split_region_file, truth_vcf_file,
-                         tumor_pred_vcf_file, ref_file, ensemble_bed, i))
+                         tumor_pred_vcf_file, ref_file, ensemble_bed, seq_complexity, i))
     try:
         records_data = pool.map_async(find_records, map_args).get()
         pool.close()
@@ -1725,6 +1739,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
                         help='Ensemble annotation tsv file (only for short read)', default=None)
     parser.add_argument('--ensemble_bed', type=str,
                         help='Ensemble annotation bed file (only for short read)', default=None)
+    parser.add_argument('--seq_complexity',
+                        help='Compute linguistic sequence complexity features',
+                        action="store_true")
     args = parser.parse_args()
     logger.info(args)

@@ -1743,12 +1760,13 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     num_threads = args.num_threads
     ensemble_tsv = args.ensemble_tsv
     ensemble_bed = args.ensemble_bed
+    seq_complexity = args.seq_complexity
     tsv_batch_size = args.tsv_batch_size

     try:
         generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed,
                          normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                          min_cov, num_threads, ensemble_tsv,
-                         ensemble_bed, tsv_batch_size)
+                         ensemble_bed, seq_complexity, tsv_batch_size)
     except Exception as e:
         logger.error(traceback.format_exc())
         logger.error("Aborting!")
diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py
index 32ec235..d831b55 100755
--- a/neusomatic/python/preprocess.py
+++ b/neusomatic/python/preprocess.py
@@ -78,10 +78,11 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp

 def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference,
-                            matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, tsv_batch_size):
+                            matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, seq_complexity, tsv_batch_size):
     logger = logging.getLogger(generate_dataset_region.__name__)
     generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed,
                      normal_count_bed, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                      min_cov, num_threads, None, ensemble_bed,
+                     seq_complexity,
                      tsv_batch_size)

@@ -193,6 +194,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
               ensemble_tsv, long_read, restart, first_do_without_qual,
               filter_duplicate,
               add_extra_features,
+              seq_complexity,
               num_threads,
               scan_alignments_binary,):
     logger = logging.getLogger(preprocess.__name__)
@@ -237,7 +239,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
         ensemble_bed = os.path.join(work, "ensemble.bed")
         logger.info("Extract ensemble info.")
         if restart or not os.path.exists(ensemble_bed):
-            extract_ensemble(ensemble_tsv, ensemble_bed, False)
+            extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, False)

     merge_d_for_short_read = 100
     candidates_split_regions = []
@@ -338,12 +340,13 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                                     reference, tumor_bam, normal_bam,
                                     min_mapq, snp_min_bq,
                                     dbsnp, None,
+                                    seq_complexity,
                                     num_threads)
                 extra_features_bed = os.path.join(
                     work_dataset_split, "extra_features.bed")
                 if not os.path.exists(extra_features_bed) or restart:
                     extract_ensemble(extra_features_tsv,
-                                     extra_features_bed, True)
+                                     extra_features_bed, seq_complexity, True)
                 if ensemble_tsv:
                     merged_features_bed = os.path.join(
                         work_dataset_split, "merged_features.bed")
@@ -382,7 +385,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
             generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf,
                                     candidates_split_region, tumor_count, normal_count, reference,
                                     matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads,
-                                    ensemble_bed_i, tsv_batch_size)
+                                    ensemble_bed_i, seq_complexity, tsv_batch_size)

     shutil.rmtree(bed_tempdir)
     tempfile.tempdir = original_tempdir
@@ -467,6 +470,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
     parser.add_argument('--add_extra_features',
                         help='add extra input features',
                         action="store_true")
+    parser.add_argument('--seq_complexity',
+                        help='Compute linguistic sequence complexity features',
+                        action="store_true")
     parser.add_argument('--num_threads', type=int,
                         help='number of threads', default=1)
     parser.add_argument('--scan_alignments_binary', type=str,
@@ -485,6 +491,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                    args.ensemble_tsv, args.long_read, args.restart,
                    args.first_do_without_qual,
                    args.filter_duplicate,
                    args.add_extra_features,
+                   args.seq_complexity,
                    args.num_threads,
                    args.scan_alignments_binary)
     except Exception as e:
diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py
index 61723c0..599809c 100644
--- a/neusomatic/python/sequencing_features.py
+++ b/neusomatic/python/sequencing_features.py
@@ -219,3 +219,61 @@ def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100):
         sor = min(sor_numerator / sor_denominator, max_value)

     return sor
+
+def max_vocabularies(seq_length):
+    # According to:
+    # https://doi.org/10.1093/bioinformatics/18.5.679
+    # Assume 4 different nucleotides
+    counts = 0
+    k = 1
+    while k <= seq_length:
+
+        if 4**k < (seq_length - k + 1):
+            counts = counts + 4**k
+        else:
+            counts = counts + (seq_length-k+1 + 1) * (seq_length-k+1 - 1 + 1)/2
+            break
+
+        k += 1
+
+    return counts
+
+
+
+def LC(sequence):
+    # Calculate linguistic sequence complexity according to
+    # https://doi.org/10.1093/bioinformatics/18.5.679
+    # Assume 4 different nucleotides
+    sequence = sequence.upper()
+
+    if not 'N' in sequence:
+
+        number_of_subseqs = 0
+        seq_length = len(sequence)
+        max_number_of_subseqs = max_vocabularies(seq_length)
+
+        for i in range(1, seq_length+1):
+
+            #max_vocab_1 = 4**i
+            #max_vocab_2 = seq_length - i + 1
+            set_of_seq_n = set()
+
+            for n, nth_base in enumerate(sequence):
+
+                if n+i <= len(sequence):
+                    sub_seq = sequence[n:n+i]
+                    set_of_seq_n.add( sub_seq )
+
+                # All possible unique subseqs obtained. Break away and go no further.
+                #if ( max_vocab_1 >= max_vocab_2 ) and ( len(set_of_seq_n) == max_vocab_2 ):
+                #    break
+
+            num_uniq_subseqs = len(set_of_seq_n)
+            number_of_subseqs = number_of_subseqs + num_uniq_subseqs
+
+        lc = number_of_subseqs/max_number_of_subseqs
+
+    else:
+        lc = float('nan')
+
+    return lc
diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py
index a841a26..c58d271 100755
--- a/neusomatic/python/train.py
+++ b/neusomatic/python/train.py
@@ -203,6 +203,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                      merged_candidates_per_tsv, merged_max_num_tsvs, overwrite_merged_tsvs,
                      train_split_len,
                      normalize_channels,
+                     seq_complexity,
                      use_cuda):
     logger = logging.getLogger(train_neusomatic.__name__)

@@ -218,13 +219,50 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
     data_transform = matrix_transform((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

+    if checkpoint:
+        logger.info(
+            "Load pretrained model from checkpoint {}".format(checkpoint))
+        pretrained_dict = torch.load(
+            checkpoint, map_location=lambda storage, loc: storage)
+        pretrained_state_dict = pretrained_dict["state_dict"]
+        tag = pretrained_dict["tag"]
+        sofar_epochs = pretrained_dict["epoch"]
+        logger.info(
+            "sofar_epochs from pretrained checkpoint: {}".format(sofar_epochs))
+        coverage_thr = pretrained_dict["coverage_thr"]
+        logger.info(
+            "Override coverage_thr from pretrained checkpoint: {}".format(coverage_thr))
+        if "normalize_channels" in pretrained_dict:
+            normalize_channels = pretrained_dict["normalize_channels"]
+        else:
+            normalize_channels = False
+        logger.info(
+            "Override normalize_channels from pretrained checkpoint: {}".format(normalize_channels))
+        if "seq_complexity" in pretrained_dict:
+            seq_complexity = pretrained_dict["seq_complexity"]
+        else:
+            seq_complexity = False
+        logger.info(
+            "Override seq_complexity from pretrained checkpoint: {}".format(seq_complexity))
+        prev_epochs = sofar_epochs + 1
+    else:
+        prev_epochs = 0
+        time_now = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
+        tag = "neusomatic_{}".format(time_now)
+    logger.info("tag: {}".format(tag))
+
+    num_expected_ensemble = NUM_ENS_FEATURES
+    if seq_complexity:
+        num_expected_ensemble += 2
+
     ensemble = False
     with open(candidates_tsv[0]) as i_f:
         x = i_f.readline().strip().split()
-        if len(x) == NUM_ENS_FEATURES + 4:
+        if len(x) == num_expected_ensemble + 4:
             ensemble = True

-    num_channels = NUM_ENS_FEATURES + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES
+    num_channels = num_expected_ensemble + \
+        NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES
     logger.info("Number of channels: {}".format(num_channels))

     net = NeuSomaticNet(num_channels)
@@ -242,25 +280,6 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
         os.mkdir("{}/models/".format(out_dir))

     if checkpoint:
-        logger.info(
-            "Load pretrained model from checkpoint {}".format(checkpoint))
-        pretrained_dict = torch.load(
-            checkpoint, map_location=lambda storage, loc: storage)
-        pretrained_state_dict = pretrained_dict["state_dict"]
-        tag = pretrained_dict["tag"]
-        sofar_epochs = pretrained_dict["epoch"]
-        logger.info(
-            "sofar_epochs from pretrained checkpoint: {}".format(sofar_epochs))
-        coverage_thr = pretrained_dict["coverage_thr"]
-        logger.info(
-            "Override coverage_thr from pretrained checkpoint: {}".format(coverage_thr))
-        if "normalize_channels" in pretrained_dict:
-            normalize_channels = pretrained_dict["normalize_channels"]
-        else:
-            normalize_channels = False
-        logger.info(
-            "Override normalize_channels from pretrained checkpoint: {}".format(normalize_channels))
-        prev_epochs = sofar_epochs + 1
         model_dict = net.state_dict()
         # 1. filter out unnecessary keys
         # pretrained_state_dict = {
@@ -278,11 +297,6 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
         model_dict.update(pretrained_state_dict)
         # 3. load the new state dict
         net.load_state_dict(pretrained_state_dict)
-    else:
-        prev_epochs = 0
-        time_now = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
-        tag = "neusomatic_{}".format(time_now)
-        logger.info("tag: {}".format(tag))

     shuffle(candidates_tsv)
@@ -403,8 +417,9 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                     "tag": tag,
                     "epoch": curr_epoch,
                     "coverage_thr": coverage_thr,
-                    "normalize_channels": normalize_channels},
-                   '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch))
+                    "normalize_channels": normalize_channels,
+                    "seq_complexity": seq_complexity
+                    }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch))
         if len(train_sets) == 1:
             train_sets[0].open_candidate_tsvs()
@@ -469,6 +484,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                         "epoch": curr_epoch,
                         "coverage_thr": coverage_thr,
                         "normalize_channels": normalize_channels,
+                        "seq_complexity": seq_complexity,
                         }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch))
            if validation_candidates_tsv:
                 test(net, curr_epoch, validation_loader, use_cuda)
@@ -487,6 +503,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                 "epoch": curr_epoch,
                 "coverage_thr": coverage_thr,
                 "normalize_channels": normalize_channels,
+                "seq_complexity": seq_complexity,
                 }, '{}/models/checkpoint_{}_epoch{}.pth'.format(
         out_dir, tag, curr_epoch))
     if validation_candidates_tsv:
@@ -561,6 +578,9 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                         help='normalize BQ, MQ, and other bam-info channels by frequency of observed alleles. \
                         Will be overridden if pretrained model is provided',
                         action="store_true")
+    parser.add_argument('--seq_complexity',
+                        help='Compute linguistic sequence complexity features',
+                        action="store_true")
     args = parser.parse_args()

     logger.info(args)
@@ -578,6 +598,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
                          args.merged_candidates_per_tsv, args.merged_max_num_tsvs,
                          args.overwrite_merged_tsvs, args.train_split_len,
                          args.normalize_channels,
+                         args.seq_complexity,
                          use_cuda)
     except Exception as e:
         logger.error(traceback.format_exc())
From 2c4f45db6f75356cc9d19617faa03a5f31523007 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 5 May 2020 22:00:56 -0700
Subject: [PATCH 19/89] fix ensemble

---
 neusomatic/python/call.py             |   8 +-
 neusomatic/python/extend_features.py  |  51 ++++++++--
 neusomatic/python/generate_dataset.py |  14 ++-
 neusomatic/python/preprocess.py       | 137 ++++++++++++++++++++------
 neusomatic/python/train.py            |   8 +-
 5 files changed, 166 insertions(+), 52 deletions(-)

diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py
index 3a8229a..2e60f47 100755
--- a/neusomatic/python/call.py
+++ b/neusomatic/python/call.py
@@ -433,15 +433,15 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,

     logger.info("seq_complexity: {}".format(seq_complexity))

-    num_expected_ensemble = NUM_ENS_FEATURES
+    expected_ens_fields = NUM_ENS_FEATURES
     if seq_complexity:
-        num_expected_ensemble += 2
+        expected_ens_fields += 2
     ensemble = False
     with open(candidates_tsv[0]) as i_f:
         x = i_f.readline().strip().split()
-        if len(x) == num_expected_ensemble + 4:
+        if len(x) == expected_ens_fields + 4:
             ensemble = True
-    num_channels = num_expected_ensemble + \
+    num_channels = expected_ens_fields + \
         NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES
     logger.info("Number of channels: {}".format(num_channels))

diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py
index f440bda..43fcf8b 100755
--- a/neusomatic/python/extend_features.py
+++ b/neusomatic/python/extend_features.py
@@ -199,6 +199,7 @@ def extract_features(candidate_record):

 def extend_features(candidates_vcf,
                     exclude_variants,
+                    add_variants,
                     output_tsv,
                     reference, tumor_bam, normal_bam,
                     min_mapq, min_bq,
@@ -253,8 +254,8 @@ def extend_features(candidates_vcf,
                     var_id = "-".join([chrom, pos, ref, alt])
                     cosmic_vars[var_id] = num_cases

+    exclude_vars = set([])
     if exclude_variants:
-        exclude_vars = []
         with open(exclude_variants) as i_f:
             for line in skip_empty(i_f):
                 if exclude_variants.split(".")[-1] == "tsv" and line[0:5] == "CHROM":
@@ -262,7 +263,18 @@ def extend_features(candidates_vcf,
                 x = line.strip().split("\t")
                 chrom, pos, _, ref, alt = x[0:5]
                 var_id = "-".join([chrom, pos, ref, alt])
-                exclude_vars.append(var_id)
+                exclude_vars.add(var_id)
+
+    add_vars = set([])
+    if add_variants:
+        with open(add_variants) as i_f:
+            for line in skip_empty(i_f):
+                if add_variants.split(".")[-1] == "tsv" and line[0:5] == "CHROM":
+                    continue
+                x = line.strip().split("\t")
+                chrom, pos, _, ref, alt = x[0:5]
+                var_id = "-".join([chrom, pos, ref, alt])
+                add_vars.add(var_id)

     n_variants = 0
     with open(candidates_vcf) as i_f:
@@ -272,30 +284,46 @@ def extend_features(candidates_vcf,
     split_len = (n_variants + num_threads - 1) // num_threads
     pool = multiprocessing.Pool(num_threads)
     map_args = []
+    batch = []
     with open(candidates_vcf) as i_f:
-        i = 0
-        batch = []
         for line in skip_empty(i_f):
-            i += 1
             chrom, pos, _, ref, alt = line.strip().split("\t")[0:5]
             var_id = "-".join([chrom, pos, ref, alt])
             if exclude_variants:
                 if var_id in exclude_vars:
                     continue
+            if add_variants:
+                if var_id in add_vars:
+                    add_vars = add_vars - set([var_id])
             num_cosmic_cases = float('nan')
             if_cosmic = 0
             if cosmic and var_id in cosmic_vars:
                 if_cosmic = 1
                 num_cosmic_cases = cosmic_vars[var_id]
             batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
-            if len(batch) >= split_len or i == n_variants:
+            if len(batch) >= split_len:
                 map_args.append((reference, tumor_bam, normal_bam,
                                  min_mapq, min_bq, dbsnp, seq_complexity, batch))
                 batch = []
-    if batch:
-        map_args.append((reference, tumor_bam, normal_bam,
-                         min_mapq, min_bq, dbsnp, seq_complexity, batch))
-
+    if add_variants and len(add_vars)>0:
+        for var_id in add_vars-set(exclude_vars):
+            v = var_id.split("-")
+            pos, ref, alt = v[-3:]
+            chrom = "-".join(v[:-3])
+            num_cosmic_cases = float('nan')
+            if_cosmic = 0
+            if cosmic and var_id in cosmic_vars:
+                if_cosmic = 1
+                num_cosmic_cases = cosmic_vars[var_id]
+            batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
+            if len(batch) >= split_len:
+                map_args.append((reference, tumor_bam, normal_bam,
+                                 min_mapq, min_bq, dbsnp, seq_complexity, batch))
+                batch = []
+    if batch:
+        map_args.append((reference, tumor_bam, normal_bam,
+                         min_mapq, min_bq, dbsnp, seq_complexity, batch))
+
     logger.info("Number of batches: {}".format(len(map_args)))
     header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
               "Consistent_Mates", "Inconsistent_Mates"]
@@ -350,6 +378,8 @@ def extend_features(candidates_vcf,
                         required=True)
     parser.add_argument('--exclude_variants', type=str, help='variants to exclude',
                         default=None)
+    parser.add_argument('--add_variants', type=str, help='variants to add if they do not exist in the vcf. (Lower priority than --exclude_variants)',
+                        default=None)
     parser.add_argument('--output_tsv', type=str, help='output features tsv',
                         required=True)
     parser.add_argument('--reference', type=str, help='reference fasta filename',
@@ -377,6 +407,7 @@ def extend_features(candidates_vcf,
     try:
         output = extend_features(args.candidates_vcf,
                                  args.exclude_variants,
+                                 args.add_variants,
                                  args.output_tsv,
                                  args.reference, args.tumor_bam, args.normal_bam,
                                  args.min_mapq, args.min_bq,
diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 19eb077..870a999 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -1326,7 +1326,7 @@ def find_records(input_record):
         return None


-def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, is_extend):
+def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header, is_extend):
     logger = logging.getLogger(extract_ensemble.__name__)
     ensemble_data = []
     ensemble_pos = []
@@ -1376,6 +1376,9 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, is_extend):
             header_en = list(filter(
                 lambda x: x[1] in expected_features, enumerate(header_)))
             header = list(map(lambda x: x[1], header_en))
+            if not enforce_header:
+                expected_features = header
+
             if set(expected_features) - set(header):
                 logger.error("The following features are missing from ensemble file {}: {}".format(
                     ensemble_tsv,
@@ -1500,7 +1503,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, is_extend):

 def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed,
                      normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                      min_cov, num_threads, ensemble_tsv,
-                     ensemble_bed, seq_complexity, tsv_batch_size):
+                     ensemble_bed, seq_complexity, enforce_header, tsv_batch_size):
     logger = logging.getLogger(generate_dataset.__name__)

     logger.info("---------------------Generate Dataset----------------------")
@@ -1527,7 +1530,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     split_batch_size = 10000
     if ensemble_tsv and not ensemble_bed:
         ensemble_bed = os.path.join(work, "ensemble.bed")
-        extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, False)
+        extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header, False)

     tmp_ = bedtools_intersect(
         tumor_pred_vcf_file, region_bed_file, args=" -u", run_logger=logger)
@@ -1742,6 +1745,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     parser.add_argument('--seq_complexity',
                         help='Compute linguistic sequence complexity features',
                         action="store_true")
+    parser.add_argument('--enforce_header',
+                        help='Enforce header match for ensemble_tsv',
+                        action="store_true")
     args = parser.parse_args()
     logger.info(args)

@@ -1766,7 +1772,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     try:
         generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed,
                          normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                          min_cov, num_threads, ensemble_tsv,
-                         ensemble_bed, seq_complexity, tsv_batch_size)
+                         ensemble_bed, seq_complexity, enforce_header, tsv_batch_size)
     except Exception as e:
         logger.error(traceback.format_exc())
         logger.error("Aborting!")
diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py
index d831b55..487a024 100755
--- a/neusomatic/python/preprocess.py
+++ b/neusomatic/python/preprocess.py
@@ -15,6 +15,7 @@
 import logging
 import tempfile

+import numpy as np

 from filter_candidates import filter_candidates
 from generate_dataset import generate_dataset, extract_ensemble
@@ -78,11 +79,13 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp

 def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference,
-                            matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, seq_complexity, tsv_batch_size):
+                            matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, seq_complexity,
+                            no_feature_recomp_for_ensemble, tsv_batch_size):
     logger = logging.getLogger(generate_dataset_region.__name__)
     generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed,
                      normal_count_bed, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                      min_cov, num_threads, None, ensemble_bed, seq_complexity,
+                     no_feature_recomp_for_ensemble,
                      tsv_batch_size)

@@ -195,6 +198,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
               filter_duplicate,
               add_extra_features,
               seq_complexity,
+              no_feature_recomp_for_ensemble,
               num_threads,
               scan_alignments_binary,):
     logger = logging.getLogger(preprocess.__name__)
@@ -239,7 +243,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
         ensemble_bed = os.path.join(work, "ensemble.bed")
         logger.info("Extract ensemble info.")
         if restart or not os.path.exists(ensemble_bed):
-            extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, False)
+            extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, no_feature_recomp_for_ensemble, False)

     merge_d_for_short_read = 100
     candidates_split_regions = []
@@ -335,7 +339,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                 if not os.path.exists(extra_features_tsv) or restart:
                     extend_features(filtered_vcf,
                                     ensemble_beds[
-                                        i] if ensemble_tsv else None,
+                                        i] if (ensemble_tsv and no_feature_recomp_for_ensemble) else None,
+                                    ensemble_beds[
+                                        i] if (ensemble_tsv and not no_feature_recomp_for_ensemble) else None,
                                     extra_features_tsv,
                                     reference, tumor_bam, normal_bam,
                                     min_mapq, snp_min_bq,
@@ -346,38 +352,105 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                     work_dataset_split, "extra_features.bed")
                 if not os.path.exists(extra_features_bed) or restart:
                     extract_ensemble(extra_features_tsv,
-                                     extra_features_bed, seq_complexity, True)
+                                     extra_features_bed, seq_complexity, True, True)
                 if ensemble_tsv:
                     merged_features_bed = os.path.join(
                         work_dataset_split, "merged_features.bed")
                     if not os.path.exists(merged_features_bed) or restart:
                         exclude_ens_variants = []
                         header_line = ""
-                        with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2:
-                            for line in skip_empty(i_f_1, skip_header=False):
-                                if line.startswith("#"):
+                        if no_feature_recomp_for_ensemble:
+                            with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2:
+                                for line in skip_empty(i_f_1, skip_header=False):
+                                    if line.startswith("#"):
+                                        if not header_line:
+                                            header_line = line
+                                            o_f.write(line)
+                                        else:
+                                            if header_line != line:
+                                                logger.error(
+                                                    "{}!={}".format(header_line, line))
+                                                raise Exception
+                                        continue
+                                    chrom, pos, _, ref, alt = line.strip().split("\t")[
+                                        0:5]
+                                    var_id = "-".join([chrom, pos, ref, alt])
+                                    exclude_ens_variants.append(var_id)
+                                    o_f.write(line)
+                                for line in skip_empty(i_f_2, skip_header=False):
+                                    if line.startswith("#"):
+                                        if header_line != line:
+                                            logger.error(
+                                                "{}!={}".format(header_line, line))
+                                            raise Exception
+                                        continue
+                                    chrom, pos, _, ref, alt = line.strip().split("\t")[
+                                        0:5]
+                                    var_id = "-".join([chrom, pos, ref, alt])
+                                    if var_id in exclude_ens_variants:
+                                        continue
                                     o_f.write(line)
-                                    if not header_line:
-                                        header_line = line
-                                    else:
-                                        assert(header_line == line)
-                                    continue
-                                chrom, pos, _, ref, alt = line.strip().split("\t")[
-                                    0:5]
-                                var_id = "-".join([chrom, pos, ref, alt])
-                                exclude_ens_variants.append(var_id)
-                                o_f.write(line)
-                            for line in skip_empty(i_f_2, skip_header=False):
-                                if line.startswith("#"):
-                                    if header_line:
-                                        assert(header_line == line)
-                                    continue
-                                chrom, pos, _, ref, alt = line.strip().split("\t")[
-                                    0:5]
-                                var_id = "-".join([chrom, pos, ref, alt])
-                                if var_id in exclude_ens_variants:
-                                    continue
-                                o_f.write(line)
+                        else:
+                            callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier",
+                                                "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS",
+                                                "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score",
+                                                "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"]
+                            with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2:
+                                ens_variants_info = {}
+                                header_1_found = False
+                                header_2_found = False
+                                for line in skip_empty(i_f_1, skip_header=False):
+                                    if line.startswith("#"):
+                                        if not header_line:
+                                            header_line = line
+                                        else:
+                                            if header_line != line:
+                                                logger.error(
+                                                    "{}!={}".format(header_line, line))
+                                                raise Exception
+                                        header_ = line.strip().split()[5:]
+                                        header_caller = list(filter(
+                                            lambda x: x[1] in callers_features, enumerate(header_)))
+                                        header_caller_ = list(
+                                            map(lambda x: x[1], header_caller))
+                                        header_i = list(
+                                            map(lambda x: x[0], header_caller))
+                                        header_1_found = True
+                                        continue
+                                    assert header_1_found
+                                    fields = line.strip().split("\t")
+                                    chrom, pos, _, ref, alt = fields[0:5]
+                                    var_id = "-".join([chrom, pos, ref, alt])
+                                    ens_variants_info[var_id] = np.array(fields[5:])[
+                                        header_i]
+                                for line in skip_empty(i_f_2, skip_header=False):
+                                    if line.startswith("#"):
+                                        if header_line != line:
+                                            logger.error(
+                                                "{}!={}".format(header_line, line))
+                                        if not header_2_found:
+                                            header_2 = line.strip().split()[5:]
+                                            logger.info(header_2)
+                                            order_header = []
+                                            for f in header_caller_:
+                                                if f not in header_2:
+                                                    logger.info("Missing header field {}".format(f))
+                                                    raise Exception
+                                                order_header.append(header_2.index(f))
+                                            o_f.write(line)
+                                            header_2_found = True
+                                        continue
+                                    assert header_2_found
+                                    fields = line.strip().split("\t")
+                                    chrom, pos, _, ref, alt = fields[0:5]
+                                    var_id = "-".join([chrom, pos, ref, alt])
+                                    if var_id in ens_variants_info:
+                                        fields_ = np.array(fields[5:])
+                                        fields_[order_header] = ens_variants_info[
+                                            var_id]
+                                        fields[5:] = fields_.tolist()
+                                    o_f.write(
+                                        "\t".join(list(map(str, fields))) + "\n")
                     ensemble_bed_i = merged_features_bed
                 else:
                     ensemble_bed_i = extra_features_bed
@@ -385,7 +458,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
             generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf,
                                     candidates_split_region, tumor_count, normal_count, reference,
                                     matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads,
-                                    ensemble_bed_i, seq_complexity, tsv_batch_size)
+                                    ensemble_bed_i, seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size)

     shutil.rmtree(bed_tempdir)
     tempfile.tempdir = original_tempdir
@@ -471,7 +544,10 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                         help='add extra input features',
                         action="store_true")
     parser.add_argument('--seq_complexity',
-                        help='Compute linguistic sequence complexity features',
+                        help='Compute linguistic sequence complexity features',
+                        action="store_true")
+    parser.add_argument('--no_feature_recomp_for_ensemble',
+                        help='Do not recompute features for ensemble_tsv',
                         action="store_true")
     parser.add_argument('--num_threads', type=int,
                         help='number of threads', default=1)
@@ -492,6 +568,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                    args.filter_duplicate,
                    args.add_extra_features,
                    args.seq_complexity,
+                   args.no_feature_recomp_for_ensemble,
                    args.num_threads,
                    args.scan_alignments_binary)
     except Exception as e:
diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py
index c58d271..8ddc301 100755
--- a/neusomatic/python/train.py
+++ b/neusomatic/python/train.py
@@ -251,17 +251,17 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo
         tag = "neusomatic_{}".format(time_now)
     logger.info("tag: {}".format(tag))

-    num_expected_ensemble = NUM_ENS_FEATURES
+    expected_ens_fields = NUM_ENS_FEATURES
     if seq_complexity:
-        num_expected_ensemble += 2
+        expected_ens_fields += 2
     ensemble = False
     with open(candidates_tsv[0]) as i_f:
         x = i_f.readline().strip().split()
-        if len(x) == num_expected_ensemble + 4:
+        if len(x) == expected_ens_fields + 4:
             ensemble = True

-    num_channels = num_expected_ensemble + \
+    num_channels = expected_ens_fields + \
         NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES
     logger.info("Number of channels: {}".format(num_channels))
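For reference, a minimal self-contained sketch of the caller-column override performed in the merge branch of PATCH 19 above. The column layout, names, and values below are made up for illustration; dtype=object is used so numpy does not truncate the assigned strings to a fixed width:

import numpy as np

# Hypothetical header of the recomputed extra-features rows (after the 5
# coordinate columns): two BAM-derived features and one caller score.
header_2 = ["T_DP", "tBAM_REF_MQ", "Strelka_Score"]
callers_features = ["Strelka_Score"]

# order_header: positions of the caller fields inside the recomputed rows.
order_header = [header_2.index(f) for f in callers_features]   # -> [2]

# One recomputed row, plus the caller score preserved from the ensemble bed.
fields_ = np.array(["152", "60.0", "0"], dtype=object)
ens_variants_info = {"22-21982892-C-T": np.array(["0.9929"], dtype=object)}

# Overwrite only the caller columns; the recomputed BAM features are kept.
fields_[order_header] = ens_variants_info["22-21982892-C-T"]
print(fields_.tolist())   # ['152', '60.0', '0.9929']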
From e913e83f68fd6cc985123d92f1319c3a4cc284cb Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 5 May 2020 22:48:07 -0700
Subject: [PATCH 20/89] more efficient LC

---
 neusomatic/python/extend_features.py     | 20 +++++------
 neusomatic/python/sequencing_features.py | 43 +++++++++---------------
 2 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py
index 43fcf8b..23b57a9 100755
--- a/neusomatic/python/extend_features.py
+++ b/neusomatic/python/extend_features.py
@@ -55,9 +55,9 @@ def extract_features(candidate_record):
                     0, my_coordinate[1] - 81), my_coordinate[1])
                 seq_right_80bp = ref_fa.fetch(my_coordinate[0], my_coordinate[
                     1], my_coordinate[1] + 81)
-                LC_spanning = sequencing_features.LC(seq_span_80bp)
-                LC_adjacent = min(sequencing_features.LC(
-                    seq_left_80bp), sequencing_features.LC(seq_right_80bp))
+                LC_spanning = sequencing_features.subLC(seq_span_80bp, 20)
+                LC_adjacent = min(sequencing_features.subLC(
+                    seq_left_80bp, 20), sequencing_features.subLC(seq_right_80bp, 20))

                 LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40)
                 LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40)
@@ -305,8 +305,8 @@ def extend_features(candidates_vcf,
                     map_args.append((reference, tumor_bam, normal_bam,
                                      min_mapq, min_bq, dbsnp, seq_complexity, batch))
                     batch = []
-    if add_variants and len(add_vars)>0:
-        for var_id in add_vars-set(exclude_vars):
+    if add_variants and len(add_vars) > 0:
+        for var_id in add_vars - set(exclude_vars):
             v = var_id.split("-")
             pos, ref, alt = v[-3:]
             chrom = "-".join(v[:-3])
@@ -323,7 +323,7 @@ def extend_features(candidates_vcf,
     if batch:
         map_args.append((reference, tumor_bam, normal_bam,
                          min_mapq, min_bq, dbsnp, seq_complexity, batch))
-
+
     logger.info("Number of batches: {}".format(len(map_args)))
     header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
               "Consistent_Mates", "Inconsistent_Mates"]
@@ -345,11 +345,11 @@ def extend_features(candidates_vcf,
               "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"])
     try:
-        # ext_features=[]
-        # for w in map_args:
-        #     ext_features.append(extract_features(w))
+        ext_features=[]
+        for w in map_args:
+            ext_features.append(extract_features(w))

-        ext_features = pool.map_async(extract_features, map_args).get()
+        # ext_features = pool.map_async(extract_features, map_args).get()
         pool.close()
         with open(output_tsv, "w") as o_f:
             o_f.write("\t".join(header) + "\n")
diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py
index 599809c..c495681 100644
--- a/neusomatic/python/sequencing_features.py
+++ b/neusomatic/python/sequencing_features.py
@@ -220,18 +220,20 @@ def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100):
     return sor

-def max_vocabularies(seq_length):
+def max_sub_vocabularies(seq_length, max_subseq_length):
     # According to:
     # https://doi.org/10.1093/bioinformatics/18.5.679
-    # Assume 4 different nucleotides
+    # capping the length of sub_string as an input parameter
+    assert max_subseq_length <= seq_length
+
     counts = 0
     k = 1
-    while k <= seq_length:
+    while k <= max_subseq_length:

         if 4**k < (seq_length - k + 1):
             counts = counts + 4**k
         else:
-            counts = counts + (seq_length-k+1 + 1) * (seq_length-k+1 - 1 + 1)/2
+            counts = counts + (2*seq_length - k - max_subseq_length + 2) * (max_subseq_length - k + 1)/2
             break

         k += 1
@@ -239,38 +241,23 @@ def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100):
     return counts

-
-def LC(sequence):
+def subLC(sequence, max_substring_length=20):
     # Calculate linguistic sequence complexity according to
     # https://doi.org/10.1093/bioinformatics/18.5.679
-    # Assume 4 different nucleotides
+    # Cut off substring at a fixed length
     sequence = sequence.upper()

     if not 'N' in sequence:

         number_of_subseqs = 0
         seq_length = len(sequence)
-        max_number_of_subseqs = max_vocabularies(seq_length)
-
-        for i in range(1, seq_length+1):
-
-            #max_vocab_1 = 4**i
-            #max_vocab_2 = seq_length - i + 1
-            set_of_seq_n = set()
-
-            for n, nth_base in enumerate(sequence):
-
-                if n+i <= len(sequence):
-                    sub_seq = sequence[n:n+i]
-                    set_of_seq_n.add( sub_seq )
-
-                # All possible unique subseqs obtained. Break away and go no further.
-                #if ( max_vocab_1 >= max_vocab_2 ) and ( len(set_of_seq_n) == max_vocab_2 ):
-                #    break
-
-            num_uniq_subseqs = len(set_of_seq_n)
-            number_of_subseqs = number_of_subseqs + num_uniq_subseqs
-
+        max_number_of_subseqs = max_sub_vocabularies(seq_length, max_substring_length)
+
+        set_of_seq_n = set()
+        for i in range(1, min(max_substring_length+1, seq_length+1) ):
+            set_of_seq_n.update((sequence[n: n+i] for n in range(len(sequence) - i + 1)))
+
+        number_of_subseqs = len(set_of_seq_n)
         lc = number_of_subseqs/max_number_of_subseqs

     else:
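For reference, a self-contained sketch of the capped substring counting that subLC implements in the patch above. The example sequences are made up, and the min() guard is an extra assumption here so that the assert in max_sub_vocabularies holds for sequences shorter than the cap:

def max_sub_vocabularies(seq_length, max_subseq_length):
    # Theoretical maximum number of distinct substrings with length <= cap.
    counts, k = 0, 1
    while k <= max_subseq_length:
        if 4 ** k < (seq_length - k + 1):
            counts += 4 ** k
        else:
            counts += (2 * seq_length - k - max_subseq_length + 2) * (max_subseq_length - k + 1) / 2
            break
        k += 1
    return counts

def sub_lc(sequence, max_substring_length=20):
    # Distinct substrings (up to the cap) over the theoretical maximum.
    sequence = sequence.upper()
    if 'N' in sequence:
        return float('nan')
    seq_length = len(sequence)
    cap = min(max_substring_length, seq_length)   # guard for short sequences
    subseqs = set()
    for i in range(1, cap + 1):
        subseqs.update(sequence[n:n + i] for n in range(seq_length - i + 1))
    return len(subseqs) / max_sub_vocabularies(seq_length, cap)

print(sub_lc("ACGTACGTACGT"))   # repetitive sequence: 42/70 = 0.6
print(sub_lc("ACGTAGCTTGCA"))   # more varied sequence: 69/70, about 0.99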
From 2be51cc07115f6cee385b61ab018bcb38edb00d Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 5 May 2020 22:55:36 -0700
Subject: [PATCH 21/89] small fix

---
 neusomatic/python/extend_features.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py
index 23b57a9..c84261d 100755
--- a/neusomatic/python/extend_features.py
+++ b/neusomatic/python/extend_features.py
@@ -345,11 +345,7 @@ def extend_features(candidates_vcf,
               "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"])
     try:
-        ext_features=[]
-        for w in map_args:
-            ext_features.append(extract_features(w))
-
-        # ext_features = pool.map_async(extract_features, map_args).get()
+        ext_features = pool.map_async(extract_features, map_args).get()
         pool.close()
         with open(output_tsv, "w") as o_f:
             o_f.write("\t".join(header) + "\n")
From e9b83dad1d16ea42df3bce57de8bc120da7c822b Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 5 May 2020 23:04:01 -0700
Subject: [PATCH 22/89] filter duplicate by default

---
 neusomatic/python/postprocess.py | 10 ++++++----
 neusomatic/python/preprocess.py  | 10 ++++++----
 test/NeuSomatic_ensemble.vcf     | 16 ++++++++--------
 test/NeuSomatic_standalone.vcf   | 16 ++++++++--------
 4 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py
index d0d934e..b8d0b3a 100755
--- a/neusomatic/python/postprocess.py
+++ b/neusomatic/python/postprocess.py
@@ -168,7 +168,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense
                 lr_pad, lr_chunk_size, lr_chunk_scale, lr_snp_min_af, lr_ins_min_af, lr_del_min_af,
                 lr_match_score, lr_mismatch_penalty, lr_gap_open_penalty,
                 lr_gap_ext_penalty, lr_max_realign_dp, lr_do_split,
-                filter_duplicate,
+                keep_duplicate,
                 pass_threshold, lowqual_threshold,
                 msa_binary, num_threads):
     logger = logging.getLogger(postprocess.__name__)
@@ -177,6 +177,8 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense
     if not os.path.exists(work):
         os.mkdir(work)

+    filter_duplicate = not keep_duplicate
+
     original_tempdir = tempfile.tempdir
     bed_tempdir = os.path.join(work, "bed_tempdir_postprocess")
     if not os.path.exists(bed_tempdir):
@@ -310,8 +312,8 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense
     parser.add_argument('--lowqual_threshold', type=float,
                         help='SCORE for LowQual (PASS for lowqual_threshold <= score < pass_threshold)',
                         default=0.4)
-    parser.add_argument('--filter_duplicate',
-                        help='filter duplicate reads in analysis',
+    parser.add_argument('--keep_duplicate',
+                        help='Do not filter duplicate reads in analysis',
                         action="store_true")
     parser.add_argument('--msa_binary', type=str,
                         help='MSA binary', default="../bin/msa")
@@ -333,7 +335,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense
                     args.lr_gap_open_penalty, args.lr_gap_ext_penalty,
                     args.lr_max_realign_dp,
                     args.lr_do_split,
-                    args.filter_duplicate,
+                    args.keep_duplicate,
                     args.pass_threshold, args.lowqual_threshold,
                     args.msa_binary, args.num_threads)
diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py
index 487a024..dc39010 100755
--- a/neusomatic/python/preprocess.py
+++ b/neusomatic/python/preprocess.py
@@ -195,7 +195,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
               ins_merge_min_af, merge_r, truth_vcf, tsv_batch_size, matrix_width,
               matrix_base_pad, min_ev_frac_per_col,
               ensemble_tsv, long_read, restart, first_do_without_qual,
-              filter_duplicate,
+              keep_duplicate,
               add_extra_features,
               seq_complexity,
               no_feature_recomp_for_ensemble,
@@ -207,6 +207,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
     if restart or not os.path.exists(work):
         os.mkdir(work)

+    filter_duplicate = not keep_duplicate
+
     original_tempdir = tempfile.tempdir
     bed_tempdir = os.path.join(work, "bed_tempdir_preprocess")
     if not os.path.exists(bed_tempdir):
@@ -537,8 +539,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
     parser.add_argument('--first_do_without_qual',
                         help='Perform initial scan without calculating the quality stats',
                         action="store_true")
-    parser.add_argument('--filter_duplicate',
-                        help='filter duplicate reads when preparing pileup information',
+    parser.add_argument('--keep_duplicate',
+                        help='Do not filter duplicate reads when preparing pileup information',
                         action="store_true")
     parser.add_argument('--add_extra_features',
                         help='add extra input features',
@@ -565,7 +567,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                    args.ins_merge_min_af, args.merge_r, args.truth_vcf, args.tsv_batch_size,
                    args.matrix_width, args.matrix_base_pad, args.min_ev_frac_per_col,
                    args.ensemble_tsv, args.long_read, args.restart,
                    args.first_do_without_qual,
-                   args.filter_duplicate,
+                   args.keep_duplicate,
                    args.add_extra_features,
                    args.seq_complexity,
                    args.no_feature_recomp_for_ensemble,
diff --git a/test/NeuSomatic_ensemble.vcf b/test/NeuSomatic_ensemble.vcf
index 2302afe..e3a7d8b 100644
--- a/test/NeuSomatic_ensemble.vcf
+++ b/test/NeuSomatic_ensemble.vcf
@@ -14,11 +14,11 @@
 ##FORMAT=
 ##FORMAT=
 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
-22 21330787 . C T 26.9917 PASS SCORE=0.9980;DP=396;RO=306;AO=88;AF=0.2234 GT:DP:RO:AO:AF 0/1:396:306:88:0.2234
-22 21332122 . G A 28.5402 PASS SCORE=0.9986;DP=285;RO=223;AO=62;AF=0.2175 GT:DP:RO:AO:AF 0/1:285:223:62:0.2175
-22 21334924 . G C 17.5639 PASS SCORE=0.9825;DP=106;RO=83;AO=23;AF=0.217 GT:DP:RO:AO:AF 0/1:106:83:23:0.217
-22 21335259 . C A 19.7149 PASS SCORE=0.9893;DP=249;RO=200;AO=49;AF=0.1968 GT:DP:RO:AO:AF 0/1:249:200:49:0.1968
-22 21384516 . C T 27.6969 PASS SCORE=0.9983;DP=95;RO=68;AO=27;AF=0.2842 GT:DP:RO:AO:AF 0/1:95:68:27:0.2842
-22 21982892 . C T 21.5561 PASS SCORE=0.9930;DP=158;RO=113;AO=45;AF=0.2848 GT:DP:RO:AO:AF 0/1:158:113:45:0.2848
-22 21983260 . A G 31.5494 PASS SCORE=0.9993;DP=118;RO=74;AO=44;AF=0.3729 GT:DP:RO:AO:AF 0/1:118:74:44:0.3729
-22 21989959 . AAG A 33.0106 PASS SCORE=0.9995;DP=139;RO=107;AO=32;AF=0.2302 GT:DP:RO:AO:AF 0/1:139:107:32:0.2302
+22 21330787 . C T 26.9917 PASS SCORE=0.9980;DP=387;RO=298;AO=87;AF=0.226 GT:DP:RO:AO:AF 0/1:387:298:87:0.226
+22 21332122 . G A 28.5402 PASS SCORE=0.9986;DP=268;RO=209;AO=59;AF=0.2201 GT:DP:RO:AO:AF 0/1:268:209:59:0.2201
+22 21334924 . G C 17.6382 PASS SCORE=0.9828;DP=101;RO=78;AO=23;AF=0.2277 GT:DP:RO:AO:AF 0/1:101:78:23:0.2277
+22 21335259 . C A 19.7149 PASS SCORE=0.9893;DP=234;RO=190;AO=44;AF=0.188 GT:DP:RO:AO:AF 0/1:234:190:44:0.188
+22 21384516 . C T 27.9602 PASS SCORE=0.9984;DP=90;RO=64;AO=26;AF=0.2889 GT:DP:RO:AO:AF 0/1:90:64:26:0.2889
+22 21982892 . C T 21.4946 PASS SCORE=0.9929;DP=152;RO=109;AO=43;AF=0.2829 GT:DP:RO:AO:AF 0/1:152:109:43:0.2829
+22 21983260 . A G 31.5494 PASS SCORE=0.9993;DP=112;RO=70;AO=42;AF=0.375 GT:DP:RO:AO:AF 0/1:112:70:42:0.375
+22 21989959 . AAG A 33.0106 PASS SCORE=0.9995;DP=131;RO=99;AO=32;AF=0.2443 GT:DP:RO:AO:AF 0/1:131:99:32:0.2443
diff --git a/test/NeuSomatic_standalone.vcf b/test/NeuSomatic_standalone.vcf
index a7dd79f..bee861b 100644
--- a/test/NeuSomatic_standalone.vcf
+++ b/test/NeuSomatic_standalone.vcf
@@ -14,11 +14,11 @@
 ##FORMAT=
 ##FORMAT=
 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
-22 21330787 . C T 33.0111 PASS SCORE=0.9995;DP=396;RO=306;AO=88;AF=0.2234 GT:DP:RO:AO:AF 0/1:396:306:88:0.2234
-22 21332122 . G A 36.9903 PASS SCORE=0.9998;DP=285;RO=223;AO=62;AF=0.2175 GT:DP:RO:AO:AF 0/1:285:223:62:0.2175
-22 21334924 . G C 12.9061 PASS SCORE=0.9488;DP=106;RO=83;AO=23;AF=0.217 GT:DP:RO:AO:AF 0/1:106:83:23:0.217
-22 21335259 . C A 25.0876 PASS SCORE=0.9969;DP=249;RO=200;AO=49;AF=0.1968 GT:DP:RO:AO:AF 0/1:249:200:49:0.1968
-22 21384516 . C T 32.2191 PASS SCORE=0.9994;DP=95;RO=68;AO=27;AF=0.2842 GT:DP:RO:AO:AF 0/1:95:68:27:0.2842
-22 21982892 . C T 29.5872 PASS SCORE=0.9989;DP=158;RO=113;AO=45;AF=0.2848 GT:DP:RO:AO:AF 0/1:158:113:45:0.2848
-22 21983260 . A G 35.2289 PASS SCORE=0.9997;DP=118;RO=74;AO=44;AF=0.3729 GT:DP:RO:AO:AF 0/1:118:74:44:0.3729
-22 21989959 . AAG A 39.9993 PASS SCORE=0.9999;DP=139;RO=107;AO=32;AF=0.2302 GT:DP:RO:AO:AF 0/1:139:107:32:0.2302
+22 21330787 . C T 33.0111 PASS SCORE=0.9995;DP=387;RO=298;AO=87;AF=0.226 GT:DP:RO:AO:AF 0/1:387:298:87:0.226
+22 21332122 . G A 36.9903 PASS SCORE=0.9998;DP=268;RO=209;AO=59;AF=0.2201 GT:DP:RO:AO:AF 0/1:268:209:59:0.2201
+22 21334924 . G C 13.3787 PASS SCORE=0.9541;DP=101;RO=78;AO=23;AF=0.2277 GT:DP:RO:AO:AF 0/1:101:78:23:0.2277
+22 21335259 . C A 24.9497 PASS SCORE=0.9968;DP=234;RO=190;AO=44;AF=0.188 GT:DP:RO:AO:AF 0/1:234:190:44:0.188
+22 21384516 . C T 33.9800 PASS SCORE=0.9996;DP=90;RO=64;AO=26;AF=0.2889 GT:DP:RO:AO:AF 0/1:90:64:26:0.2889
+22 21982892 . C T 29.2094 PASS SCORE=0.9988;DP=152;RO=109;AO=43;AF=0.2829 GT:DP:RO:AO:AF 0/1:152:109:43:0.2829
+22 21983260 . A G 35.2289 PASS SCORE=0.9997;DP=112;RO=70;AO=42;AF=0.375 GT:DP:RO:AO:AF 0/1:112:70:42:0.375
+22 21989959 . AAG A 39.9993 PASS SCORE=0.9999;DP=131;RO=99;AO=32;AF=0.2443 GT:DP:RO:AO:AF 0/1:131:99:32:0.2443
From a54be6967fa4c82c382ed18ac426130f4400c021 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Thu, 7 May 2020 10:46:51 -0700
Subject: [PATCH 23/89] switched fisher test

---
 neusomatic/python/sequencing_features.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py
index c495681..679ab6d 100644
--- a/neusomatic/python/sequencing_features.py
+++ b/neusomatic/python/sequencing_features.py
@@ -8,17 +8,21 @@
 import genomic_file_handlers as genome
 from read_info_extractor import *
 from collections import defaultdict
+import fisher

 nan = float('nan')

+def fisher_exact_test(mat):
+    return fisher.pvalue(mat[0][0],mat[0][1],mat[1][0],mat[1][1]).two_tail
+

 class AlignmentFeatures:

     def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10):
         '''
         bam is the opened file handle of bam file
         my_coordiate is a list or tuple of 0-based (contig, position)
-        '''
-
+        '''
+
         indel_length = len(first_alt) - len(ref_base)

         reads = bam.fetch(my_coordinate[0], my_coordinate[1] - 1, my_coordinate[1])
@@ -115,9 +119,9 @@ class AlignmentFeatures:
         self.z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0]
         self.NM_Diff = self.alt_NM - self.ref_NM - abs(indel_length)

-        self.concordance_fet = stats.fisher_exact(concordance_counts)[1]
-        self.strandbias_fet = stats.fisher_exact(orientation_counts)[1]
-        self.clipping_fet = stats.fisher_exact(soft_clip_counts)[1]
+        self.concordance_fet = fisher_exact_test(concordance_counts)
+        self.strandbias_fet = fisher_exact_test(orientation_counts)
+        self.clipping_fet = fisher_exact_test(soft_clip_counts)

         self.z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0]

From 4f6bf57cfabf5f0eb9af2ef8d9a4cb5fd69daff8 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Sat, 9 May 2020 12:06:15 -0700
Subject: [PATCH 24/89] fix seq_complexity

---
 neusomatic/python/call.py                | 23 +++---
 neusomatic/python/extend_features.py     | 24 +++----
 neusomatic/python/generate_dataset.py    | 90 +++++++++++++-----------
 neusomatic/python/preprocess.py          | 48 +++++++++----
 neusomatic/python/sequencing_features.py |  2 +-
 neusomatic/python/train.py               | 36 +++++-----
 test/run_test.sh                         |  2 +
 7 files changed, 130 insertions(+), 95 deletions(-)

diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py
index 2e60f47..e8e8fcb 100755
--- a/neusomatic/python/call.py
+++ b/neusomatic/python/call.py
@@ -423,24 +423,29 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,
         normalize_channels = pretrained_dict["normalize_channels"]
     else:
         normalize_channels = False
-    if "seq_complexity" in pretrained_dict:
-        seq_complexity = pretrained_dict["seq_complexity"]
+    if "no_seq_complexity" in pretrained_dict:
+        no_seq_complexity = pretrained_dict["no_seq_complexity"]
     else:
-        seq_complexity = False
+        no_seq_complexity = True

     logger.info("coverage_thr: {}".format(coverage_thr))
     logger.info("normalize_channels: {}".format(normalize_channels))
-    logger.info("seq_complexity: {}".format(seq_complexity))
+    logger.info("no_seq_complexity: {}".format(no_seq_complexity))

     expected_ens_fields = NUM_ENS_FEATURES
-    if seq_complexity:
+    if not no_seq_complexity:
         expected_ens_fields += 2
+
+    logger.info("expected_ens_fields: {}".format(expected_ens_fields))
+
     ensemble = False
-    with open(candidates_tsv[0]) as i_f:
-        x = i_f.readline().strip().split()
-        if len(x) == expected_ens_fields + 4:
-            ensemble = True
+    for tsv in candidates_tsv:
+        with open(tsv) as i_f:
+            x = i_f.readline().strip().split()
+            if len(x) == expected_ens_fields + 4:
+                ensemble = True
+                break
     num_channels = expected_ens_fields + \
         NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES
     logger.info("Number of channels: {}".format(num_channels))

diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py
index c84261d..79743a7 100755
--- a/neusomatic/python/extend_features.py
+++ b/neusomatic/python/extend_features.py
@@ -20,7 +20,7 @@


 def extract_features(candidate_record):
-    reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, seq_complexity, batch = candidate_record
+    reference, tumor_bam, normal_bam, min_mapq, min_bq, dbsnp, no_seq_complexity, batch = candidate_record
     thread_logger = logging.getLogger(
         "{} ({})".format(extract_features.__name__, multiprocessing.current_process().name))
     try:
@@ -48,7 +48,7 @@ def extract_features(candidate_record):

             indel_length = len(alt) - len(ref)

-            if seq_complexity:
+            if not no_seq_complexity:
                 seq_span_80bp = ref_fa.fetch(my_coordinate[0], max(
                     0, my_coordinate[1] - 41), my_coordinate[1] + 40)
                 seq_left_80bp = ref_fa.fetch(my_coordinate[0], max(
@@ -88,7 +88,7 @@ def extract_features(candidate_record):
             Consistent_Mates = tBamFeatures.consistent_mates
             Inconsistent_Mates = tBamFeatures.inconsistent_mates
-            if seq_complexity:
+            if not no_seq_complexity:
                 Seq_Complexity_Span = LC_spanning_phred
                 Seq_Complexity_Adj = LC_adjacent_phred

@@ -171,7 +171,7 @@ def extract_features(candidate_record):
             features = [CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT,
                         Consistent_Mates, Inconsistent_Mates]
-            if seq_complexity:
+            if not no_seq_complexity:
                 features.extend([Seq_Complexity_Span, Seq_Complexity_Adj])
             features.extend([N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ,
                              nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff,
@@ -204,7 +204,7 @@ def extend_features(candidates_vcf,
                     reference, tumor_bam, normal_bam,
                     min_mapq, min_bq,
                     dbsnp, cosmic,
-                    seq_complexity,
+                    no_seq_complexity,
                     num_threads):

     logger = logging.getLogger(extend_features.__name__)
@@ -303,7 +303,7 @@ def extend_features(candidates_vcf,
             batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
             if len(batch) >= split_len:
                 map_args.append((reference, tumor_bam, normal_bam,
-                                 min_mapq, min_bq, dbsnp, seq_complexity, batch))
+                                 min_mapq, min_bq, dbsnp, no_seq_complexity, batch))
                 batch = []
@@ -318,16 +318,16 @@ def extend_features(candidates_vcf,
             batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
             if len(batch) >= split_len:
                 map_args.append((reference, tumor_bam, normal_bam,
-                                 min_mapq, min_bq, dbsnp, seq_complexity, batch))
+                                 min_mapq, min_bq, dbsnp, no_seq_complexity, batch))
                 batch = []
     if batch:
         map_args.append((reference, tumor_bam, normal_bam,
-                         min_mapq, min_bq, dbsnp, seq_complexity, batch))
+                         min_mapq, min_bq, dbsnp, no_seq_complexity, batch))

     logger.info("Number of batches: {}".format(len(map_args)))
     header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
               "Consistent_Mates", "Inconsistent_Mates"]
-    if seq_complexity:
+    if not no_seq_complexity:
         header.extend(["Seq_Complexity_Span", "Seq_Complexity_Adj"])
     header.extend(["N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ", "nBAM_Z_Ranksums_MQ",
                    "nBAM_REF_BQ", "nBAM_ALT_BQ", "nBAM_Z_Ranksums_BQ", "nBAM_REF_NM",
                    "nBAM_ALT_NM", "nBAM_NM_Diff",
@@ -392,8 +392,8 @@ def extend_features(candidates_vcf,
                         help='dbSNP vcf (to annotate candidate variants)', default=None)
     parser.add_argument('--cosmic', type=str,
                         help='COSMIC vcf (to annotate candidate variants)', default=None)
-    parser.add_argument('--seq_complexity',
-                        help='Compute linguistic sequence complexity features',
+    parser.add_argument('--no_seq_complexity',
+                        help='Do not compute linguistic sequence complexity features',
                         action="store_true")
     parser.add_argument('--num_threads', type=int,
                         help='number of threads', default=1)
@@ -408,7 +408,7 @@ def extend_features(candidates_vcf,
                                  args.reference, args.tumor_bam, args.normal_bam,
                                  args.min_mapq, args.min_bq,
                                  args.dbsnp, args.cosmic,
-                                 args.seq_complexity,
+                                 args.no_seq_complexity,
                                  args.num_threads,
                                  )
     if output is None:
diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 870a999..5f9c6b8 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -826,7 +826,7 @@ def find_len(ref, alt):


 def find_records(input_record):
-    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, seq_complexity, work_index = input_record
+    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, no_seq_complexity, work_index = input_record
     thread_logger = logging.getLogger(
         "{} ({})".format(find_records.__name__, multiprocessing.current_process().name))
     try:
@@ -849,7 +849,7 @@ def find_records(input_record):
             work, "in_ensemble_{}.bed".format(work_index))

         num_ens_features = NUM_ENS_FEATURES
-        if seq_complexity:
+        if not no_seq_complexity:
             num_ens_features += 2
         bedtools_intersect(
             truth_vcf_file, split_bed, args=" -u", output_fn=split_truth_vcf_file, run_logger=thread_logger)
@@ -1326,7 +1326,7 @@ def find_records(input_record):
         return None


-def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header, is_extend):
+def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_header, is_extend):
     logger = logging.getLogger(extract_ensemble.__name__)
     ensemble_data = []
     ensemble_pos = []
@@ -1339,7 +1339,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header,
                          "if_TNscope", "Strelka_Score", "Strelka_QSS", "Strelka_TQSS", "VarScan2_Score",
                          "SNVMix2_Score", "Sniper_Score", "VarDict_Score", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
                          "Consistent_Mates", "Inconsistent_Mates"]
-    if seq_complexity:
+    if not no_seq_complexity:
         expected_features += ["Seq_Complexity_Span", "Seq_Complexity_Adj"]

     expected_features += ["N_DP", "nBAM_REF_MQ", "nBAM_ALT_MQ",
@@ -1366,37 +1366,42 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header,
     n_vars = 0
-    with open(ensemble_tsv) as s_f:
-        for line in skip_empty(s_f):
-            if line.startswith("CHROM"):
-                header_pos = line.strip().split()[0:5]
-                header_ = line.strip().split()[5:]
+    all_headers = set([])
+    for ensemble_tsv in ensemble_tsvs:
+        with open(ensemble_tsv) as s_f:
+            for line in skip_empty(s_f):
+                if line.startswith("CHROM"):
+                    all_headers.add(line)
+                    header_pos = line.strip().split()[0:5]
+                    header_ = line.strip().split()[5:]
+                    if is_extend:
+                        header_ += callers_features
+                    header_en = list(filter(
+                        lambda x: x[1] in expected_features, enumerate(header_)))
+                    header = list(map(lambda x: x[1], header_en))
+                    if not enforce_header:
+                        expected_features = header
+
+                    if set(expected_features) - set(header):
+                        logger.error("The following features are missing from ensemble file {}: {}".format(
+                            ensemble_tsv,
+                            list(set(expected_features) - set(header))))
+                        raise Exception
+                    order_header = []
+                    for f in expected_features:
+                        order_header.append(header_en[header.index(f)][0])
+                    continue
-                if is_extend:
-                    header_ += callers_features
-                header_en = list(filter(
-                    lambda x: x[1] in expected_features, enumerate(header_)))
-                header = list(map(lambda x: x[1], header_en))
-                if not enforce_header:
-                    expected_features = header
-
-                if set(expected_features) - set(header):
-                    logger.error("The following features are missing from ensemble file {}: {}".format(
-                        ensemble_tsv,
-                        list(set(expected_features) - set(header))))
-                    raise Exception
-                order_header = []
-                for f in expected_features:
-                    order_header.append(header_en[header.index(f)][0])
-                continue
-            fields = line.strip().split()
-            fields[2] = str(int(fields[1]) + len(fields[3]))
-            ensemble_pos.append(fields[0:5])
-            features = fields[5:]
+                fields = line.strip().split()
+                fields[2] = str(int(fields[1]) + len(fields[3]))
+                ensemble_pos.append(fields[0:5])
+                features = fields[5:]
                 if is_extend:
-                features += ["0"] * len(callers_features)
-            ensemble_data.append(list(map(lambda x: float(
-                x.replace("False", "0").replace("True", "1")), features)))
-            n_vars += 1
+                    features += ["0"] * len(callers_features)
+                ensemble_data.append(list(map(lambda x: float(
+                    x.replace("False", "0").replace("True", "1")), features)))
+                n_vars += 1
+    if len(set(all_headers)) != 1:
+        raise(RuntimeError("inconsistent headers in {}".format(ensemble_tsvs)))
     if n_vars > 0:
         ensemble_data = np.array(ensemble_data)[:, order_header]
         header = np.array(header_)[order_header].tolist()
@@ -1479,7 +1484,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header,
                         [SiteHomopolymer_Length, 0, 50],
                         [InDel_Length, -30, 30],
                         ]
-    if seq_complexity:
+    if not no_seq_complexity:
         min_max_features.append([Seq_Complexity_, 0, 40])

     selected_features = sorted([i for f in min_max_features for i in f[0]])
@@ -1503,7 +1508,7 @@ def extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header,

 def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed,
                      normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col,
                      min_cov, num_threads, ensemble_tsv,
-                     ensemble_bed, seq_complexity, enforce_header, tsv_batch_size):
+                     ensemble_bed, no_seq_complexity, enforce_header, tsv_batch_size):
     logger = logging.getLogger(generate_dataset.__name__)

     logger.info("---------------------Generate Dataset----------------------")
@@ -1530,7 +1535,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     split_batch_size = 10000
     if ensemble_tsv and not ensemble_bed:
         ensemble_bed = os.path.join(work, "ensemble.bed")
-        extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, enforce_header, False)
+        extract_ensemble([ensemble_tsv], ensemble_bed,
+                         no_seq_complexity, enforce_header, False)

     tmp_ = bedtools_intersect(
         tumor_pred_vcf_file, region_bed_file, args=" -u", run_logger=logger)
@@ -1557,7 +1563,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     map_args = []
     for i, split_region_file in enumerate(split_region_files):
         map_args.append((work, split_region_file, truth_vcf_file,
-                         tumor_pred_vcf_file, ref_file, ensemble_bed, seq_complexity, i))
+                         tumor_pred_vcf_file, ref_file, ensemble_bed, no_seq_complexity, i))
     try:
         records_data = pool.map_async(find_records, map_args).get()
pool.close() @@ -1742,8 +1748,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be help='Ensemble annotation tsv file (only for short read)', default=None) parser.add_argument('--ensemble_bed', type=str, help='Ensemble annotation bed file (only for short read)', default=None) - parser.add_argument('--seq_complexity', - help='Compute linguistic sequence complexity features', + parser.add_argument('--no_seq_complexity', + help='Dont compute linguistic sequence complexity features', action="store_true") parser.add_argument('--enforce_header', help='Enforce header match for ensemble_tsv', @@ -1766,13 +1772,13 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be num_threads = args.num_threads ensemble_tsv = args.ensemble_tsv ensemble_bed = args.ensemble_bed - seq_complexity = args.seq_complexity + no_seq_complexity = args.no_seq_complexity tsv_batch_size = args.tsv_batch_size try: generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv, - ensemble_bed, seq_complexity, enforce_header, tsv_batch_size) + ensemble_bed, no_seq_complexity, enforce_header, tsv_batch_size) except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index dc39010..11410d1 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -79,12 +79,12 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, - matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, seq_complexity, + matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size): logger = logging.getLogger(generate_dataset_region.__name__) generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, None, ensemble_bed, - seq_complexity, + no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) @@ -197,7 +197,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, ensemble_tsv, long_read, restart, first_do_without_qual, keep_duplicate, add_extra_features, - seq_complexity, + no_seq_complexity, no_feature_recomp_for_ensemble, num_threads, scan_alignments_binary,): @@ -245,7 +245,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, ensemble_bed = os.path.join(work, "ensemble.bed") logger.info("Extract ensemble info.") if restart or not os.path.exists(ensemble_bed): - extract_ensemble(ensemble_tsv, ensemble_bed, seq_complexity, no_feature_recomp_for_ensemble, False) + extract_ensemble([ensemble_tsv], ensemble_bed, + no_seq_complexity, no_feature_recomp_for_ensemble, False) merge_d_for_short_read = 100 candidates_split_regions = [] @@ -338,23 +339,38 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_tumor_i = os.path.dirname(filtered_vcf) extra_features_tsv = os.path.join( work_tumor_i, "extra_features.tsv") + ex_tsvs = [extra_features_tsv] if not os.path.exists(extra_features_tsv) or restart: 
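The `restart or not os.path.exists(...)` guard recurring through preprocess.py makes each stage idempotent: expensive artifacts are rebuilt only when absent or when --restart forces it. Sketched on its own (build_fn is a hypothetical writer):

    import os

    def maybe_build(output_path, restart, build_fn):
        # recompute only if the artifact is missing or --restart was given
        if restart or not os.path.exists(output_path):
            build_fn(output_path)
        return output_path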
extend_features(filtered_vcf, ensemble_beds[ i] if (ensemble_tsv and no_feature_recomp_for_ensemble) else None, - ensemble_beds[ - i] if (ensemble_tsv and not no_feature_recomp_for_ensemble) else None, + None, extra_features_tsv, reference, tumor_bam, normal_bam, min_mapq, snp_min_bq, dbsnp, None, - seq_complexity, + no_seq_complexity, num_threads) + if ensemble_tsv and not no_feature_recomp_for_ensemble: + extra_features_others_tsv = os.path.join( + work_tumor_i, "extra_features_others.tsv") + ex_tsvs.append(extra_features_others_tsv) + if not os.path.exists(extra_features_others_tsv) or restart: + extend_features(ensemble_beds[i], + extra_features_tsv, + None, + extra_features_others_tsv, + reference, tumor_bam, normal_bam, + min_mapq, snp_min_bq, + dbsnp, None, + no_seq_complexity, + num_threads) + extra_features_bed = os.path.join( work_dataset_split, "extra_features.bed") if not os.path.exists(extra_features_bed) or restart: - extract_ensemble(extra_features_tsv, - extra_features_bed, seq_complexity, True, True) + extract_ensemble(ex_tsvs, + extra_features_bed, no_seq_complexity, True, True) if ensemble_tsv: merged_features_bed = os.path.join( work_dataset_split, "merged_features.bed") @@ -436,9 +452,11 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, order_header = [] for f in header_caller_: if f not in header_2: - logger.info("Missing header field {}".format(f)) + logger.info( + "Missing header field {}".format(f)) raise Exception - order_header.append(header_2.index(f)) + order_header.append( + header_2.index(f)) o_f.write(line) header_2_found = True @@ -460,7 +478,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf, candidates_split_region, tumor_count, normal_count, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, - ensemble_bed_i, seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) + ensemble_bed_i, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) shutil.rmtree(bed_tempdir) tempfile.tempdir = original_tempdir @@ -545,8 +563,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--add_extra_features', help='add extra input features', action="store_true") - parser.add_argument('--seq_complexity', - help='Compute linguistic sequence complexity features', + parser.add_argument('--no_seq_complexity', + help='Dont compute linguistic sequence complexity features', action="store_true") parser.add_argument('--no_feature_recomp_for_ensemble', help='Do not recompute features for ensemble_tsv', @@ -569,7 +587,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.ensemble_tsv, args.long_read, args.restart, args.first_do_without_qual, args.keep_duplicate, args.add_extra_features, - args.seq_complexity, + args.no_seq_complexity, args.no_feature_recomp_for_ensemble, args.num_threads, args.scan_alignments_binary) diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 679ab6d..7e90d1a 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -43,7 +43,7 @@ def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10) qname_collector = defaultdict(list) for read_i in reads: - if read_i.is_unmapped or not dedup_test(read_i): + if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: 
continue dp += 1 diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 8ddc301..966d7f4 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -203,7 +203,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo merged_candidates_per_tsv, merged_max_num_tsvs, overwrite_merged_tsvs, train_split_len, normalize_channels, - seq_complexity, + no_seq_complexity, use_cuda): logger = logging.getLogger(train_neusomatic.__name__) @@ -238,12 +238,12 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo normalize_channels = False logger.info( "Override normalize_channels from pretrained checkpoint: {}".format(normalize_channels)) - if "seq_complexity" in pretrained_dict: - seq_complexity = pretrained_dict["seq_complexity"] + if "no_seq_complexity" in pretrained_dict: + no_seq_complexity = pretrained_dict["no_seq_complexity"] else: - seq_complexity = False + no_seq_complexity = True logger.info( - "Override seq_complexity from pretrained checkpoint: {}".format(seq_complexity)) + "Override no_seq_complexity from pretrained checkpoint: {}".format(no_seq_complexity)) prev_epochs = sofar_epochs + 1 else: prev_epochs = 0 @@ -252,14 +252,18 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo logger.info("tag: {}".format(tag)) expected_ens_fields = NUM_ENS_FEATURES - if seq_complexity: + if not no_seq_complexity: expected_ens_fields += 2 + logger.info("expected_ens_fields: {}".format(expected_ens_fields)) + ensemble = False - with open(candidates_tsv[0]) as i_f: - x = i_f.readline().strip().split() - if len(x) == expected_ens_fields + 4: - ensemble = True + for tsv in candidates_tsv: + with open(tsv) as i_f: + x = i_f.readline().strip().split() + if len(x) == expected_ens_fields + 4: + ensemble = True + break num_channels = expected_ens_fields + \ NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES @@ -418,7 +422,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "epoch": curr_epoch, "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, - "seq_complexity": seq_complexity + "no_seq_complexity": no_seq_complexity }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if len(train_sets) == 1: @@ -484,7 +488,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "epoch": curr_epoch, "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, - "seq_complexity": seq_complexity, + "no_seq_complexity": no_seq_complexity, }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if validation_candidates_tsv: test(net, curr_epoch, validation_loader, use_cuda) @@ -503,7 +507,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "epoch": curr_epoch, "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, - "seq_complexity": seq_complexity, + "no_seq_complexity": no_seq_complexity, }, '{}/models/checkpoint_{}_epoch{}.pth'.format( out_dir, tag, curr_epoch)) if validation_candidates_tsv: @@ -578,8 +582,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo help='normalize BQ, MQ, and other bam-info channels by frequency of observed alleles. 
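Settings such as no_seq_complexity ride along inside the training checkpoint so that call time can reproduce the training configuration; a toy round-trip, assuming torch and a dict-shaped checkpoint like the one train.py saves (path is illustrative):

    import torch

    checkpoint = {"state_dict": {}, "coverage_thr": 100,
                  "normalize_channels": True, "no_seq_complexity": True}
    torch.save(checkpoint, "/tmp/ckpt.pth")

    loaded = torch.load("/tmp/ckpt.pth")
    # older checkpoints predate the field, hence the default
    print(loaded.get("no_seq_complexity", True))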
\ Will be overridden if pretrained model is provided', action="store_true") - parser.add_argument('--seq_complexity', - help='Compute linguistic sequence complexity features', + parser.add_argument('--no_seq_complexity', + help='Dont compute linguistic sequence complexity features', action="store_true") args = parser.parse_args() @@ -598,7 +602,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo args.merged_candidates_per_tsv, args.merged_max_num_tsvs, args.overwrite_merged_tsvs, args.train_split_len, args.normalize_channels, - args.seq_complexity, + args.no_seq_complexity, use_cuda) except Exception as e: logger.error(traceback.format_exc()) diff --git a/test/run_test.sh b/test/run_test.sh index 850132e..22ebbe8 100755 --- a/test/run_test.sh +++ b/test/run_test.sh @@ -36,6 +36,7 @@ python ${neusomatic_dir}/neusomatic/python/preprocess.py \ --ins_min_af 0.05 \ --del_min_af 0.05 \ --num_threads 1 \ + --no_seq_complexity \ --scan_alignments_binary ${neusomatic_dir}/neusomatic/bin/scan_alignments CUDA_VISIBLE_DEVICES= python ${neusomatic_dir}/neusomatic/python/call.py \ @@ -73,6 +74,7 @@ python ${neusomatic_dir}/neusomatic/python/preprocess.py \ --del_min_af 0.05 \ --num_threads 1 \ --ensemble_tsv ${test_dir}/ensemble.tsv \ + --no_seq_complexity \ --scan_alignments_binary ${neusomatic_dir}/neusomatic/bin/scan_alignments CUDA_VISIBLE_DEVICES= python ${neusomatic_dir}/neusomatic/python/call.py \ From dbe9c4af198086faf2d9f29de197cee970945457 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 9 May 2020 12:29:53 -0700 Subject: [PATCH 25/89] fix num fields --- neusomatic/python/call.py | 16 +++++++++++++--- neusomatic/python/train.py | 15 ++++++++++++--- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index e8e8fcb..a5d6468 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -439,13 +439,23 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("expected_ens_fields: {}".format(expected_ens_fields)) + expected_st_fields = 4 + + logger.info("expected_st_fields: {}".format(expected_st_fields)) + ensemble = False for tsv in candidates_tsv: with open(tsv) as i_f: x = i_f.readline().strip().split() - if len(x) == expected_ens_fields + 4: - ensemble = True - break + if x: + if len(x) == expected_ens_fields + 4: + ensemble = True + break + elif len(x) == 4: + break + else: + raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) + num_channels = expected_ens_fields + \ NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 966d7f4..9124cb9 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -257,13 +257,22 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo logger.info("expected_ens_fields: {}".format(expected_ens_fields)) + expected_st_fields = 4 + + logger.info("expected_st_fields: {}".format(expected_st_fields)) + ensemble = False for tsv in candidates_tsv: with open(tsv) as i_f: x = i_f.readline().strip().split() - if len(x) == expected_ens_fields + 4: - ensemble = True - break + if x: + if len(x) == expected_ens_fields + 4: + ensemble = True + break + elif len(x) == 4: + break + else: + raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) num_channels = expected_ens_fields + \ NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES From cb64681a5f6f248912966ce26f609600473e84c4 Mon Sep 
17 00:00:00 2001 From: Sahraeian Date: Sat, 9 May 2020 19:05:48 -0700 Subject: [PATCH 26/89] zero anns columns added --- neusomatic/python/call.py | 20 +++++++++++++++++- neusomatic/python/dataloader.py | 7 +++++++ neusomatic/python/train.py | 37 ++++++++++++++++++++++++++++++--- 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index a5d6468..18b3969 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -396,6 +396,7 @@ def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_thr def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, batch_size, max_load_candidates, pass_threshold, lowqual_threshold, + force_zero_ann_cols, use_cuda): logger = logging.getLogger(call_neusomatic.__name__) @@ -427,10 +428,20 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, no_seq_complexity = pretrained_dict["no_seq_complexity"] else: no_seq_complexity = True + if "zero_ann_cols" in pretrained_dict: + zero_ann_cols = pretrained_dict["zero_ann_cols"] + else: + zero_ann_cols = [] + + if force_zero_ann_cols: + logger.info( + "Override zero_ann_cols from force_zero_ann_cols: {}".format(force_zero_ann_cols)) + zero_ann_cols = force_zero_ann_cols logger.info("coverage_thr: {}".format(coverage_thr)) logger.info("normalize_channels: {}".format(normalize_channels)) logger.info("no_seq_complexity: {}".format(no_seq_complexity)) + logger.info("zero_ann_cols: {}".format(zero_ann_cols)) expected_ens_fields = NUM_ENS_FEATURES @@ -554,7 +565,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, transform=data_transform, is_test=True, num_threads=num_threads, coverage_thr=coverage_thr, - normalize_channels=normalize_channels) + normalize_channels=normalize_channels, + zero_ann_cols=zero_ann_cols) call_loader = torch.utils.data.DataLoader(call_set, batch_size=batch_size, shuffle=True, pin_memory=True, @@ -634,6 +646,11 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, parser.add_argument('--lowqual_threshold', type=float, help='SCORE for LowQual (PASS for lowqual_threshold <= score < pass_threshold)', default=0.4) + parser.add_argument('--force_zero_ann_cols', nargs="*", type=int, + help='force columns to be set to zero in the annotations. 
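What zero_ann_cols does to one loaded annotation vector, in isolation (toy numbers; indices are relative to the annotation columns, i.e., everything after the first five TSV fields):

    import numpy as np

    def zero_out(anns, zero_ann_cols):
        # mirrors the dataloader logic: blank selected annotation
        # columns so the network effectively ignores them
        if zero_ann_cols and len(anns) > 0:
            anns = np.array(anns, dtype=float)
            anns[zero_ann_cols] = 0
            anns = anns.tolist()
        return anns

    print(zero_out([0.3, 7.0, 1.0, 42.0], [1, 3]))  # [0.3, 0.0, 1.0, 0.0]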
Higher priority than \ + --zero_ann_cols and pretrained setting.\ + idx starts from 5th column in candidate.tsv file', + default=[]) args = parser.parse_args() use_cuda = torch.cuda.is_available() @@ -644,6 +661,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, args.checkpoint, args.num_threads, args.batch_size, args.max_load_candidates, args.pass_threshold, args.lowqual_threshold, + args.force_zero_ann_cols, use_cuda) except Exception as e: logger.error(traceback.format_exc()) diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py index 05ea953..c2450dd 100755 --- a/neusomatic/python/dataloader.py +++ b/neusomatic/python/dataloader.py @@ -131,6 +131,7 @@ def __init__(self, roots, max_load_candidates, transform=None, num_threads=1, disable_ensemble=False, data_augmentation=False, nclasses_t=4, nclasses_l=4, coverage_thr=100, normalize_channels=False, + zero_ann_cols=[], max_opended_tsv=-1): soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) @@ -141,6 +142,7 @@ def __init__(self, roots, max_load_candidates, transform=None, max_opended_tsv = min(max_opended_tsv, soft) self.max_opended_tsv = max_opended_tsv self.normalize_channels = normalize_channels + self.zero_ann_cols = zero_ann_cols self.da_shift_p = 0.3 self.da_base_p = 0.05 self.da_rev_p = 0.1 @@ -264,6 +266,11 @@ def __getitem__(self, index): if self.disable_ensemble: anns = [] + if self.zero_ann_cols and len(anns)>0: + anns=np.array(anns) + anns[self.zero_ann_cols] = 0 + anns = anns.tolist() + tag = path.split("/")[-1] _, _, _, _, vartype, center, length, tumor_cov, normal_cov = tag.split( ".") diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 9124cb9..0211443 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -204,6 +204,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo train_split_len, normalize_channels, no_seq_complexity, + zero_ann_cols, + force_zero_ann_cols, use_cuda): logger = logging.getLogger(train_neusomatic.__name__) @@ -244,6 +246,13 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo no_seq_complexity = True logger.info( "Override no_seq_complexity from pretrained checkpoint: {}".format(no_seq_complexity)) + if "zero_ann_cols" in pretrained_dict: + zero_ann_cols = pretrained_dict["zero_ann_cols"] + else: + zero_ann_cols = [] + if not force_zero_ann_cols: + logger.info( + "Override zero_ann_cols from pretrained checkpoint: {}".format(zero_ann_cols)) prev_epochs = sofar_epochs + 1 else: prev_epochs = 0 @@ -251,6 +260,12 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo tag = "neusomatic_{}".format(time_now) logger.info("tag: {}".format(tag)) + if force_zero_ann_cols: + zero_ann_cols = force_zero_ann_cols + logger.info( + "Override zero_ann_cols from force_zero_ann_cols: {}".format(force_zero_ann_cols)) + + expected_ens_fields = NUM_ENS_FEATURES if not no_seq_complexity: expected_ens_fields += 2 @@ -354,7 +369,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo max_load_candidates * len(tsvs) / float(len(candidates_tsv))), transform=data_transform, is_test=False, num_threads=num_threads, coverage_thr=coverage_thr, - normalize_channels=normalize_channels) + normalize_channels=normalize_channels, + zero_ann_cols=zero_ann_cols) train_sets.append(train_set) none_indices = train_set.get_none_indices() var_indices = train_set.get_var_indices() @@ -387,7 +403,8 @@ def 
train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo max_load_candidates=max_load_candidates, transform=data_transform, is_test=True, num_threads=num_threads, coverage_thr=coverage_thr, - normalize_channels=normalize_channels) + normalize_channels=normalize_channels, + zero_ann_cols=zero_ann_cols) validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=batch_size, shuffle=True, num_workers=num_threads, pin_memory=True) @@ -431,7 +448,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "epoch": curr_epoch, "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, - "no_seq_complexity": no_seq_complexity + "no_seq_complexity": no_seq_complexity, + "zero_ann_cols": zero_ann_cols, }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if len(train_sets) == 1: @@ -498,6 +516,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, + "zero_ann_cols": zero_ann_cols, }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if validation_candidates_tsv: test(net, curr_epoch, validation_loader, use_cuda) @@ -517,6 +536,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "coverage_thr": coverage_thr, "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, + "zero_ann_cols": zero_ann_cols, }, '{}/models/checkpoint_{}_epoch{}.pth'.format( out_dir, tag, curr_epoch)) if validation_candidates_tsv: @@ -594,6 +614,15 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo parser.add_argument('--no_seq_complexity', help='Dont compute linguistic sequence complexity features', action="store_true") + parser.add_argument('--zero_ann_cols', nargs="*", type=int, + help='columns to be set to zero in the annotations \ + idx starts from 5th column in candidate.tsv file', + default=[]) + parser.add_argument('--force_zero_ann_cols', nargs="*", type=int, + help='force columns to be set to zero in the annotations. Higher priority than \ + --zero_ann_cols and pretrained setting \ + idx starts from 5th column in candidate.tsv file', + default=[]) args = parser.parse_args() logger.info(args) @@ -612,6 +641,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo args.overwrite_merged_tsvs, args.train_split_len, args.normalize_channels, args.no_seq_complexity, + args.zero_ann_cols, + args.force_zero_ann_cols, use_cuda) except Exception as e: logger.error(traceback.format_exc()) From 97d16f6ca0c6423631c26dd377cfe4670d7ebf31 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sun, 10 May 2020 01:41:33 -0700 Subject: [PATCH 27/89] fix bug in read_info_extractor.py as in somaticseq --- neusomatic/python/read_info_extractor.py | 62 +++++++++++++----------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index b6db804..5c005f4 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -20,28 +20,28 @@ ### PYSAM ### -def position_of_aligned_read(read_i, target_position): +def position_of_aligned_read(read_i, target_position, win_size=3): ''' - Return the base call of the target position, or if it's a start of insertion/deletion. 
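The patch below replaces the old switch/displacement walk with two explicit scans over aligned_pairs, up to win_size steps to the left and right of the target: a pysam aligned pair with a None reference position is an insertion or soft-clip, one with a None query position is a deletion. The same walk on a toy aligned_pairs-style list of (query_pos, ref_pos) tuples:

    def nearest_flanking_indel(aligned_pairs, idx, win_size=3):
        # distance (1..win_size) from the aligned base at index `idx`
        # to the closest indel signature; inf if none in the window
        inf = float('inf')
        left = right = inf
        for step in range(1, win_size + 1):
            j = idx - step
            if j >= 0 and (aligned_pairs[j][0] is None or aligned_pairs[j][1] is None):
                left = step
                break
        for step in range(1, win_size + 1):
            j = idx + step
            if j < len(aligned_pairs) and (aligned_pairs[j][0] is None or aligned_pairs[j][1] is None):
                right = step
                break
        return min(left, right)

    pairs = [(0, 100), (1, 101), (2, 102), (None, 103), (3, 104)]
    print(nearest_flanking_indel(pairs, 1))  # 2: deletion two steps right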
+    Return the base call of the target position, and if it's a start of insertion/deletion.
     This target position follows pysam convension, i.e., 0-based.
     In VCF files, deletions/insertions occur AFTER the position.

     Return (Code, seq_i, base_at_target, indel_length, nearest insertion/deletion)
-    The first number in result is a code:
-        1) Match to reference, which is either a reference read or a SNV/SNP
-        2) Deletion after the target position
-        3) Insertion after the target position
-        0) The target position does not match to reference, and may be discarded for "reference/alternate" read count purposes, but can be kept for "inconsistent read" metrics.
+    The first number in result is a code:
+        1: Match to reference on CIGAR, which is either a reference read or a SNV (substitution counts as M in CIGAR)
+        2: Deletion after the target position
+        3: Insertion after the target position
+        0: The target position does not match to reference, and may be discarded for "reference/alternate" read count purposes, but can be kept for "inconsistent read" metrics.
     '''
     flanking_deletion, flanking_insertion = nan, nan
-    aligned_pairs=read_i.get_aligned_pairs()
+    aligned_pairs = read_i.get_aligned_pairs()
     for i, align_i in enumerate(aligned_pairs):
         # If find a match:
         if align_i[1] == target_position:
             seq_i = align_i[0]
+            idx_aligned_pair = i
             break
     # If the target position is aligned:
@@ -99,27 +99,33 @@ def position_of_aligned_read(read_i, target_position):
             code = 0
             base_at_target, indel_length, flanking_indel = None, None, None
-    # See if there is insertion/deletion within 5 bp of "i":
+    # See if there is insertion/deletion within 5 bp of "seq_i" on the query.
+    # seq_i is the i_th aligned base
     if isinstance(indel_length, int):
-        flanking_indel = inf
-        left_side_start = seq_i
-        right_side_start = seq_i + abs(indel_length) + 1
-        switch = 1
-        for j in (3, 2, 1):
-            for indel_seeker_i in left_side_start, right_side_start:
-
-                switch = switch * -1
-                displacement = j * switch
-                seq_j = indel_seeker_i + displacement
-
-                if 0 <= seq_j < len(aligned_pairs):
-
-                    # If the reference position has no base aligned to it, it's a deletion.
-                    # On the other hand, if the base has no reference base
-                    # aligned to it, it's an insertion.
-                    if aligned_pairs[seq_j][1] == None or aligned_pairs[seq_j][0] == None:
-                        flanking_indel = j
-                        break
+        right_indel_flanks = inf
+        left_indel_flanks = inf
+        left_side_start = idx_aligned_pair - 1
+        right_side_start = idx_aligned_pair + abs(indel_length) + 1
+
+        #(i, None) = Insertion (or Soft-clips), i.e., means the i_th base in the query is not aligned to a reference
+        #(None, coordinate) = Deletion, i.e., there is no base in it that aligns to this coordinate.
+        # If those two scenarios occur right after an aligned base, that
+        # base position is counted as an indel.
+ for step_right_i in range(min(win_size, len(aligned_pairs) - right_side_start - 1)): + j = right_side_start + step_right_i + + if (aligned_pairs[j + 1][1] == None or aligned_pairs[j + 1][0] == None): + right_indel_flanks = step_right_i + 1 + break + + for step_left_i in range(min(win_size, left_side_start)): + j = left_side_start - step_left_i + + if (aligned_pairs[j][1] == None or aligned_pairs[j][0] == None): + left_indel_flanks = step_left_i + 1 + break + flanking_indel = min(left_indel_flanks, right_indel_flanks) + else: flanking_indel = None From 018e87ac350806b3a4441255676179ad1525ca0a Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Mon, 11 May 2020 21:13:55 -0700 Subject: [PATCH 28/89] cluster variants for feature extraction --- neusomatic/python/extend_features.py | 382 ++++++++++++----------- neusomatic/python/preprocess.py | 7 + neusomatic/python/sequencing_features.py | 106 +++++-- 3 files changed, 285 insertions(+), 210 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 79743a7..a952c00 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -16,7 +16,7 @@ import sequencing_features import genomic_file_handlers as genome from read_info_extractor import rescale -from utils import skip_empty +from utils import skip_empty, get_chromosomes_order def extract_features(candidate_record): @@ -31,164 +31,167 @@ def extract_features(candidate_record): dbsnp_tb = pysam.TabixFile(dbsnp) ext_features = [] - for chrom, pos, ref, alt, if_cosmic, num_cosmic_cases in batch: - var_id = "-".join([chrom, pos, ref, alt]) - pos = int(pos) - my_coordinate = [chrom, pos] - nBamFeatures = sequencing_features.AlignmentFeatures( - nbam, my_coordinate, ref, alt, min_mapq, min_bq) - tBamFeatures = sequencing_features.AlignmentFeatures( - tbam, my_coordinate, ref, alt, min_mapq, min_bq) - - sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, - tBamFeatures.nalt) - - homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( - ref_fa, my_coordinate, ref, alt) - - indel_length = len(alt) - len(ref) - - if not no_seq_complexity: - seq_span_80bp = ref_fa.fetch(my_coordinate[0], max( - 0, my_coordinate[1] - 41), my_coordinate[1] + 40) - seq_left_80bp = ref_fa.fetch(my_coordinate[0], max( - 0, my_coordinate[1] - 81), my_coordinate[1]) - seq_right_80bp = ref_fa.fetch(my_coordinate[0], my_coordinate[ - 1], my_coordinate[1] + 81) - LC_spanning = sequencing_features.subLC(seq_span_80bp, 20) - LC_adjacent = min(sequencing_features.subLC( - seq_left_80bp, 20), sequencing_features.subLC(seq_right_80bp, 20)) - LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40) - LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40) - - if_dbsnp = 0 - if_common = 0 - if dbsnp: - region = "{}:{}-{}".format(chrom, pos, pos + 1) - dbsnp_vars = {} - for x in dbsnp_tb.fetch(region=region): - chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[ - 0:8] - for alt_ in alts_.split(","): - dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) - dbsnp_vars[ - dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 - if var_id in dbsnp_vars: - if_dbsnp = 1 - if_common = dbsnp_vars[var_id] - - p_scale = None - CHROM = my_coordinate[0] - POS = my_coordinate[1] - REF = ref - ALT = alt - if_dbsnp = if_dbsnp - COMMON = if_common - if_COSMIC = if_cosmic - COSMIC_CNT = num_cosmic_cases - Consistent_Mates = tBamFeatures.consistent_mates - Inconsistent_Mates = 
tBamFeatures.inconsistent_mates - if not no_seq_complexity: - Seq_Complexity_Span = LC_spanning_phred - Seq_Complexity_Adj = LC_adjacent_phred - - N_DP = nBamFeatures.dp - nBAM_REF_MQ = '%g' % nBamFeatures.ref_mq - nBAM_ALT_MQ = '%g' % nBamFeatures.alt_mq - nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures.z_ranksums_mq - nBAM_REF_BQ = '%g' % nBamFeatures.ref_bq - nBAM_ALT_BQ = '%g' % nBamFeatures.alt_bq - nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures.z_ranksums_bq - nBAM_REF_NM = '%g' % nBamFeatures.ref_NM - nBAM_ALT_NM = '%g' % nBamFeatures.alt_NM - nBAM_NM_Diff = '%g' % nBamFeatures.NM_Diff - nBAM_REF_Concordant = nBamFeatures.ref_concordant_reads - nBAM_REF_Discordant = nBamFeatures.ref_discordant_reads - nBAM_ALT_Concordant = nBamFeatures.alt_concordant_reads - nBAM_ALT_Discordant = nBamFeatures.alt_discordant_reads - nBAM_Concordance_FET = rescale( - nBamFeatures.concordance_fet, 'fraction', p_scale, 1001) - N_REF_FOR = nBamFeatures.ref_for - N_REF_REV = nBamFeatures.ref_rev - N_ALT_FOR = nBamFeatures.alt_for - N_ALT_REV = nBamFeatures.alt_rev - nBAM_StrandBias_FET = rescale( - nBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) - nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures.z_ranksums_endpos - nBAM_REF_Clipped_Reads = nBamFeatures.ref_SC_reads - nBAM_ALT_Clipped_Reads = nBamFeatures.alt_SC_reads - nBAM_Clipping_FET = rescale( - nBamFeatures.clipping_fet, 'fraction', p_scale, 1001) - nBAM_MQ0 = nBamFeatures.MQ0 - nBAM_Other_Reads = nBamFeatures.noise_read_count - nBAM_Poor_Reads = nBamFeatures.poor_read_count - nBAM_REF_InDel_3bp = nBamFeatures.ref_indel_3bp - nBAM_REF_InDel_2bp = nBamFeatures.ref_indel_2bp - nBAM_REF_InDel_1bp = nBamFeatures.ref_indel_1bp - nBAM_ALT_InDel_3bp = nBamFeatures.alt_indel_3bp - nBAM_ALT_InDel_2bp = nBamFeatures.alt_indel_2bp - nBAM_ALT_InDel_1bp = nBamFeatures.alt_indel_1bp - SOR = sor - MaxHomopolymer_Length = homopolymer_length - SiteHomopolymer_Length = site_homopolymer_length - T_DP = tBamFeatures.dp - tBAM_REF_MQ = '%g' % tBamFeatures.ref_mq - tBAM_ALT_MQ = '%g' % tBamFeatures.alt_mq - tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures.z_ranksums_mq - tBAM_REF_BQ = '%g' % tBamFeatures.ref_bq - tBAM_ALT_BQ = '%g' % tBamFeatures.alt_bq - tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures.z_ranksums_bq - tBAM_REF_NM = '%g' % tBamFeatures.ref_NM - tBAM_ALT_NM = '%g' % tBamFeatures.alt_NM - tBAM_NM_Diff = '%g' % tBamFeatures.NM_Diff - tBAM_REF_Concordant = tBamFeatures.ref_concordant_reads - tBAM_REF_Discordant = tBamFeatures.ref_discordant_reads - tBAM_ALT_Concordant = tBamFeatures.alt_concordant_reads - tBAM_ALT_Discordant = tBamFeatures.alt_discordant_reads - tBAM_Concordance_FET = rescale( - tBamFeatures.concordance_fet, 'fraction', p_scale, 1001) - T_REF_FOR = tBamFeatures.ref_for - T_REF_REV = tBamFeatures.ref_rev - T_ALT_FOR = tBamFeatures.alt_for - T_ALT_REV = tBamFeatures.alt_rev - tBAM_StrandBias_FET = rescale( - tBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) - tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures.z_ranksums_endpos - tBAM_REF_Clipped_Reads = tBamFeatures.ref_SC_reads - tBAM_ALT_Clipped_Reads = tBamFeatures.alt_SC_reads - tBAM_Clipping_FET = rescale( - tBamFeatures.clipping_fet, 'fraction', p_scale, 1001) - tBAM_MQ0 = tBamFeatures.MQ0 - tBAM_Other_Reads = tBamFeatures.noise_read_count - tBAM_Poor_Reads = tBamFeatures.poor_read_count - tBAM_REF_InDel_3bp = tBamFeatures.ref_indel_3bp - tBAM_REF_InDel_2bp = tBamFeatures.ref_indel_2bp - tBAM_REF_InDel_1bp = tBamFeatures.ref_indel_1bp - tBAM_ALT_InDel_3bp = tBamFeatures.alt_indel_3bp - tBAM_ALT_InDel_2bp = 
tBamFeatures.alt_indel_2bp - tBAM_ALT_InDel_1bp = tBamFeatures.alt_indel_1bp - InDel_Length = indel_length - - features = [CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, - Consistent_Mates, Inconsistent_Mates] - if not no_seq_complexity: - features.extend([Seq_Complexity_Span, Seq_Complexity_Adj]) - features.extend([N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, - nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, - nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, - nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, - nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, - nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, - nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, - MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, - tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, - tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, - tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, - tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, - tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, - tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) - - ext_features.append(features) + for nei_cluster in batch: + t_cluster_reads = sequencing_features.ClusterReads(tbam, nei_cluster) + n_cluster_reads = sequencing_features.ClusterReads(nbam, nei_cluster) + for var_i, [chrom, pos, ref, alt, if_cosmic, num_cosmic_cases] in enumerate(nei_cluster): + var_id = "-".join([chrom, str(pos), ref, alt]) + pos = int(pos) + my_coordinate = [chrom, pos] + nBamFeatures = sequencing_features.AlignmentFeatures( + n_cluster_reads.get_var_reads(var_i), my_coordinate, ref, alt, min_mapq, min_bq) + tBamFeatures = sequencing_features.AlignmentFeatures( + t_cluster_reads.get_var_reads(var_i), my_coordinate, ref, alt, min_mapq, min_bq) + + sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, + tBamFeatures.nalt) + + homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( + ref_fa, my_coordinate, ref, alt) + + indel_length = len(alt) - len(ref) + + if not no_seq_complexity: + seq_span_80bp = ref_fa.fetch(my_coordinate[0], max( + 0, my_coordinate[1] - 41), my_coordinate[1] + 40) + seq_left_80bp = ref_fa.fetch(my_coordinate[0], max( + 0, my_coordinate[1] - 81), my_coordinate[1]) + seq_right_80bp = ref_fa.fetch(my_coordinate[0], my_coordinate[ + 1], my_coordinate[1] + 81) + LC_spanning = sequencing_features.subLC(seq_span_80bp, 20) + LC_adjacent = min(sequencing_features.subLC( + seq_left_80bp, 20), sequencing_features.subLC(seq_right_80bp, 20)) + LC_spanning_phred = genome.p2phred(1 - LC_spanning, 40) + LC_adjacent_phred = genome.p2phred(1 - LC_adjacent, 40) + + if_dbsnp = 0 + if_common = 0 + if dbsnp: + region = "{}:{}-{}".format(chrom, pos, pos + 1) + dbsnp_vars = {} + for x in dbsnp_tb.fetch(region=region): + chrom_, pos_, _, ref_, alts_, _, _, info_ = x.strip().split("\t")[ + 0:8] + for alt_ in alts_.split(","): + dbsnp_var_id = "-".join([chrom_, pos_, ref_, alt_]) + dbsnp_vars[ + dbsnp_var_id] = 1 if "COMMON=1" in info_ else 0 + if var_id in 
dbsnp_vars: + if_dbsnp = 1 + if_common = dbsnp_vars[var_id] + + p_scale = None + CHROM = my_coordinate[0] + POS = my_coordinate[1] + REF = ref + ALT = alt + if_dbsnp = if_dbsnp + COMMON = if_common + if_COSMIC = if_cosmic + COSMIC_CNT = num_cosmic_cases + Consistent_Mates = tBamFeatures.consistent_mates + Inconsistent_Mates = tBamFeatures.inconsistent_mates + if not no_seq_complexity: + Seq_Complexity_Span = LC_spanning_phred + Seq_Complexity_Adj = LC_adjacent_phred + + N_DP = nBamFeatures.dp + nBAM_REF_MQ = '%g' % nBamFeatures.ref_mq + nBAM_ALT_MQ = '%g' % nBamFeatures.alt_mq + nBAM_Z_Ranksums_MQ = '%g' % nBamFeatures.z_ranksums_mq + nBAM_REF_BQ = '%g' % nBamFeatures.ref_bq + nBAM_ALT_BQ = '%g' % nBamFeatures.alt_bq + nBAM_Z_Ranksums_BQ = '%g' % nBamFeatures.z_ranksums_bq + nBAM_REF_NM = '%g' % nBamFeatures.ref_NM + nBAM_ALT_NM = '%g' % nBamFeatures.alt_NM + nBAM_NM_Diff = '%g' % nBamFeatures.NM_Diff + nBAM_REF_Concordant = nBamFeatures.ref_concordant_reads + nBAM_REF_Discordant = nBamFeatures.ref_discordant_reads + nBAM_ALT_Concordant = nBamFeatures.alt_concordant_reads + nBAM_ALT_Discordant = nBamFeatures.alt_discordant_reads + nBAM_Concordance_FET = rescale( + nBamFeatures.concordance_fet, 'fraction', p_scale, 1001) + N_REF_FOR = nBamFeatures.ref_for + N_REF_REV = nBamFeatures.ref_rev + N_ALT_FOR = nBamFeatures.alt_for + N_ALT_REV = nBamFeatures.alt_rev + nBAM_StrandBias_FET = rescale( + nBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) + nBAM_Z_Ranksums_EndPos = '%g' % nBamFeatures.z_ranksums_endpos + nBAM_REF_Clipped_Reads = nBamFeatures.ref_SC_reads + nBAM_ALT_Clipped_Reads = nBamFeatures.alt_SC_reads + nBAM_Clipping_FET = rescale( + nBamFeatures.clipping_fet, 'fraction', p_scale, 1001) + nBAM_MQ0 = nBamFeatures.MQ0 + nBAM_Other_Reads = nBamFeatures.noise_read_count + nBAM_Poor_Reads = nBamFeatures.poor_read_count + nBAM_REF_InDel_3bp = nBamFeatures.ref_indel_3bp + nBAM_REF_InDel_2bp = nBamFeatures.ref_indel_2bp + nBAM_REF_InDel_1bp = nBamFeatures.ref_indel_1bp + nBAM_ALT_InDel_3bp = nBamFeatures.alt_indel_3bp + nBAM_ALT_InDel_2bp = nBamFeatures.alt_indel_2bp + nBAM_ALT_InDel_1bp = nBamFeatures.alt_indel_1bp + SOR = sor + MaxHomopolymer_Length = homopolymer_length + SiteHomopolymer_Length = site_homopolymer_length + T_DP = tBamFeatures.dp + tBAM_REF_MQ = '%g' % tBamFeatures.ref_mq + tBAM_ALT_MQ = '%g' % tBamFeatures.alt_mq + tBAM_Z_Ranksums_MQ = '%g' % tBamFeatures.z_ranksums_mq + tBAM_REF_BQ = '%g' % tBamFeatures.ref_bq + tBAM_ALT_BQ = '%g' % tBamFeatures.alt_bq + tBAM_Z_Ranksums_BQ = '%g' % tBamFeatures.z_ranksums_bq + tBAM_REF_NM = '%g' % tBamFeatures.ref_NM + tBAM_ALT_NM = '%g' % tBamFeatures.alt_NM + tBAM_NM_Diff = '%g' % tBamFeatures.NM_Diff + tBAM_REF_Concordant = tBamFeatures.ref_concordant_reads + tBAM_REF_Discordant = tBamFeatures.ref_discordant_reads + tBAM_ALT_Concordant = tBamFeatures.alt_concordant_reads + tBAM_ALT_Discordant = tBamFeatures.alt_discordant_reads + tBAM_Concordance_FET = rescale( + tBamFeatures.concordance_fet, 'fraction', p_scale, 1001) + T_REF_FOR = tBamFeatures.ref_for + T_REF_REV = tBamFeatures.ref_rev + T_ALT_FOR = tBamFeatures.alt_for + T_ALT_REV = tBamFeatures.alt_rev + tBAM_StrandBias_FET = rescale( + tBamFeatures.strandbias_fet, 'fraction', p_scale, 1001) + tBAM_Z_Ranksums_EndPos = '%g' % tBamFeatures.z_ranksums_endpos + tBAM_REF_Clipped_Reads = tBamFeatures.ref_SC_reads + tBAM_ALT_Clipped_Reads = tBamFeatures.alt_SC_reads + tBAM_Clipping_FET = rescale( + tBamFeatures.clipping_fet, 'fraction', p_scale, 1001) + tBAM_MQ0 = tBamFeatures.MQ0 + 
tBAM_Other_Reads = tBamFeatures.noise_read_count + tBAM_Poor_Reads = tBamFeatures.poor_read_count + tBAM_REF_InDel_3bp = tBamFeatures.ref_indel_3bp + tBAM_REF_InDel_2bp = tBamFeatures.ref_indel_2bp + tBAM_REF_InDel_1bp = tBamFeatures.ref_indel_1bp + tBAM_ALT_InDel_3bp = tBamFeatures.alt_indel_3bp + tBAM_ALT_InDel_2bp = tBamFeatures.alt_indel_2bp + tBAM_ALT_InDel_1bp = tBamFeatures.alt_indel_1bp + InDel_Length = indel_length + + features = [CHROM, POS, ".", REF, ALT, if_dbsnp, COMMON, if_COSMIC, COSMIC_CNT, + Consistent_Mates, Inconsistent_Mates] + if not no_seq_complexity: + features.extend([Seq_Complexity_Span, Seq_Complexity_Adj]) + features.extend([N_DP, nBAM_REF_MQ, nBAM_ALT_MQ, nBAM_Z_Ranksums_MQ, + nBAM_REF_BQ, nBAM_ALT_BQ, nBAM_Z_Ranksums_BQ, nBAM_REF_NM, nBAM_ALT_NM, nBAM_NM_Diff, + nBAM_REF_Concordant, nBAM_REF_Discordant, nBAM_ALT_Concordant, nBAM_ALT_Discordant, + nBAM_Concordance_FET, N_REF_FOR, N_REF_REV, N_ALT_FOR, N_ALT_REV, nBAM_StrandBias_FET, + nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, + nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, + nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, + MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, + tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, + tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, + tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, + tBAM_Z_Ranksums_EndPos, tBAM_REF_Clipped_Reads, tBAM_ALT_Clipped_Reads, tBAM_Clipping_FET, + tBAM_MQ0, tBAM_Other_Reads, tBAM_Poor_Reads, tBAM_REF_InDel_3bp, tBAM_REF_InDel_2bp, + tBAM_REF_InDel_1bp, tBAM_ALT_InDel_3bp, tBAM_ALT_InDel_2bp, tBAM_ALT_InDel_1bp, InDel_Length]) + + ext_features.append(features) return ext_features except Exception as ex: @@ -205,6 +208,7 @@ def extend_features(candidates_vcf, min_mapq, min_bq, dbsnp, cosmic, no_seq_complexity, + window_extend, num_threads): logger = logging.getLogger(extend_features.__name__) @@ -242,6 +246,7 @@ def extend_features(candidates_vcf, raise Exception( "The dbSNP file should be a tabix indexed file with .vcf.gz format. 
No {}.tbi file exists.".format(dbsnp))

+    chrom_order = get_chromosomes_order(reference)
     if cosmic:
         cosmic_vars = {}
         with open(cosmic) as i_f:
@@ -276,15 +281,7 @@ def extend_features(candidates_vcf,
                     var_id = "-".join([chrom, pos, ref, alt])
                     add_vars.add(var_id)
-    n_variants = 0
-    with open(candidates_vcf) as i_f:
-        for line in skip_empty(i_f):
-            n_variants += 1
-    logger.info("Number of variants: {}".format(n_variants))
-    split_len = (n_variants + num_threads - 1) // num_threads
-    pool = multiprocessing.Pool(num_threads)
-    map_args = []
-    batch = []
+    all_variants=[]
     with open(candidates_vcf) as i_f:
         for line in skip_empty(i_f):
             chrom, pos, _, ref, alt = line.strip().split("\t")[0:5]
@@ -300,11 +297,8 @@ def extend_features(candidates_vcf,
             if cosmic and var_id in cosmic_vars:
                 if_cosmic = 1
                 num_cosmic_cases = cosmic_vars[var_id]
-            batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
-            if len(batch) >= split_len:
-                map_args.append((reference, tumor_bam, normal_bam,
-                                 min_mapq, min_bq, dbsnp, no_seq_complexity, batch))
-                batch = []
+            all_variants.append([chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases])
+
     if add_variants and len(add_vars) > 0:
         for var_id in add_vars - set(exclude_vars):
             v = var_id.split("-")
@@ -315,14 +309,40 @@ def extend_features(candidates_vcf,
             if cosmic and var_id in cosmic_vars:
                 if_cosmic = 1
                 num_cosmic_cases = cosmic_vars[var_id]
-            batch.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
-            if len(batch) >= split_len:
+            all_variants.append([chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases])
+
+    all_variants = sorted(all_variants,key=lambda x:[chrom_order[x[0]],x[1]])
+    n_variants = len(all_variants)
+    logger.info("Number of variants: {}".format(n_variants))
+    split_len = (n_variants + num_threads - 1) // num_threads
+    pool = multiprocessing.Pool(num_threads)
+    map_args = []
+    nei_cluster = []
+    batch = []
+    n_batch = 0
+    curr_pos = None
+    for i, [chrom, pos, ref, alt, if_cosmic, num_cosmic_cases] in enumerate(all_variants):
+        if curr_pos is None:
+            curr_pos = [chrom, pos]
+            nei_cluster = [[chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]]
+            continue
+        if chrom == curr_pos[0] and abs(curr_pos[1]-pos) < window_extend:
+            nei_cluster.append([chrom, pos, ref, alt, if_cosmic, num_cosmic_cases])
+        else:
+            batch.append(nei_cluster)
+            n_batch += len(nei_cluster)
+            curr_pos = [chrom, pos]
+            nei_cluster = [[chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]]
+        if n_batch >= split_len or i == n_variants-1:
+            if i == n_variants-1:
+                batch.append(nei_cluster)
+                curr_pos = None
+                nei_cluster = []
+            if batch:
                 map_args.append((reference, tumor_bam, normal_bam,
                                  min_mapq, min_bq, dbsnp, no_seq_complexity, batch))
-                batch = []
-    if batch:
-        map_args.append((reference, tumor_bam, normal_bam,
-                         min_mapq, min_bq, dbsnp, no_seq_complexity, batch))
+                batch = []
+    assert(n_variants == sum([len(y) for x in map_args for y in x[-1]]))

     logger.info("Number of batches: {}".format(len(map_args)))
     header = ["CHROM", "POS", "ID", "REF", "ALT", "if_dbsnp", "COMMON", "if_COSMIC", "COSMIC_CNT",
@@ -395,6 +415,9 @@ def extend_features(candidates_vcf,
     parser.add_argument('--no_seq_complexity',
                         help='Dont compute linguistic sequence complexity features',
                         action="store_true")
+    parser.add_argument('--window_extend', type=int,
+                        help='window size for extending input features (should be in the order of readlength)',
+                        default=1000)
     parser.add_argument('--num_threads', type=int,
                         help='number of threads', default=1)
     args = parser.parse_args()
@@ -409,6 +432,7 @@ def extend_features(candidates_vcf,
                                 args.min_mapq, args.min_bq,
                                 args.dbsnp, args.cosmic,
                                 args.no_seq_complexity,
+                                args.window_extend,
                                 args.num_threads,
                                 )
     if output is None:
diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py
index 11410d1..20e60c1 100755
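The clustering loop above groups position-sorted variants into neighborhoods no wider than window_extend, then packs whole neighborhoods into per-thread batches so nearby variants share one read fetch; reduced to its essence (toy positions on a single contig):

    def cluster_positions(positions, window=1000):
        # positions must be sorted; a variant joins the current cluster
        # when it lies within `window` of the cluster's anchor position
        clusters = [[positions[0]]]
        for pos in positions[1:]:
            if abs(pos - clusters[-1][0]) < window:
                clusters[-1].append(pos)
            else:
                clusters.append([pos])
        return clusters

    print(cluster_positions([100, 300, 950, 5000, 5100], window=1000))
    # [[100, 300, 950], [5000, 5100]]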
--- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -199,6 +199,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, add_extra_features, no_seq_complexity, no_feature_recomp_for_ensemble, + window_extend, num_threads, scan_alignments_binary,): logger = logging.getLogger(preprocess.__name__) @@ -350,6 +351,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, min_mapq, snp_min_bq, dbsnp, None, no_seq_complexity, + window_extend, num_threads) if ensemble_tsv and not no_feature_recomp_for_ensemble: extra_features_others_tsv = os.path.join( @@ -364,6 +366,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, min_mapq, snp_min_bq, dbsnp, None, no_seq_complexity, + window_extend, num_threads) extra_features_bed = os.path.join( @@ -569,6 +572,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--no_feature_recomp_for_ensemble', help='Do not recompute features for ensemble_tsv', action="store_true") + parser.add_argument('--window_extend', type=int, + help='window size for extending input features (should be in the order of readlength)', + default=1000) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--scan_alignments_binary', type=str, @@ -589,6 +595,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.add_extra_features, args.no_seq_complexity, args.no_feature_recomp_for_ensemble, + args.window_extend, args.num_threads, args.scan_alignments_binary) except Exception as e: diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 7e90d1a..ebd37db 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -9,22 +9,54 @@ from read_info_extractor import * from collections import defaultdict import fisher +import logging + +FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) nan = float('nan') + def fisher_exact_test(mat): - return fisher.pvalue(mat[0][0],mat[0][1],mat[1][0],mat[1][1]).two_tail + return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).two_tail + + +class ClusterReads: + def __init__(self, bam, variants): + self.chrom = variants[0][0] + self.min_pos = variants[0][1] + self.max_pos = variants[-1][1] + self.reads = [] + for read_i in bam.fetch(self.chrom, self.min_pos - 1, self.max_pos): + if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: + continue + self.reads.append(read_i) + + done_i = -1 + n = len(variants) + self.var_reads = [[] for i in range(len(variants))] + for i, read in enumerate(self.reads): + for j in range(done_i + 1, n): + pos = variants[j][1] + if read.reference_start > pos: + done_i += 1 + continue + if pos < read.reference_end: + self.var_reads[j].append(i) + def get_var_reads(self, var_index): + return [self.reads[i] for i in self.var_reads[var_index]] class AlignmentFeatures: - def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + + def __init__(self, reads, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): ''' bam is the opened file handle of bam file my_coordiate is a list or tuple of 0-based (contig, position) - ''' - + ''' + indel_length = len(first_alt) - len(ref_base) - reads = bam.fetch(my_coordinate[0], my_coordinate[1] - 1, 
my_coordinate[1]) # index 0 for ref, 1 for alt read_mq = [[], []] @@ -59,8 +91,8 @@ def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10) is_ref_call = code_i == 1 and base_call_i == ref_base[0] is_alt_call = (indel_length == 0 and code_i == 1 and base_call_i == first_alt) or ( - indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or ( - indel_length > 0 and code_i == 3) + indel_length < 0 and code_i == 2 and indel_length == indel_length_i) or ( + indel_length > 0 and code_i == 3) # inconsistent read or second alternate calls if not (is_ref_call or is_alt_call): @@ -81,22 +113,27 @@ def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10) pass if read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: - concordance_counts[0 if read_i.is_proper_pair else 1][index] += 1 + concordance_counts[ + 0 if read_i.is_proper_pair else 1][index] += 1 orientation_counts[1 if read_i.is_reverse else 0][index] += 1 - is_soft_clipped = read_i.cigar[0][0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip + is_soft_clipped = read_i.cigar[0][ + 0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip soft_clip_counts[1 if is_soft_clipped else 0][index] += 1 # Distance from the end of the read: if ith_base is not None: - pos_from_end[index].append(min(ith_base, read_i.query_length - ith_base)) + pos_from_end[index].append( + min(ith_base, read_i.query_length - ith_base)) flanking_indel[index].append(flanking_indel_i) # unpack to get the ref and alt values ref_pos_from_end, alt_pos_from_end = pos_from_end - self.ref_concordant_reads, self.alt_concordant_reads = concordance_counts[0] - self.ref_discordant_reads, self.alt_discordant_reads = concordance_counts[1] + self.ref_concordant_reads, self.alt_concordant_reads = concordance_counts[ + 0] + self.ref_discordant_reads, self.alt_discordant_reads = concordance_counts[ + 1] self.ref_for, self.alt_for = orientation_counts[0] self.ref_rev, self.alt_rev = orientation_counts[1] self.ref_notSC_reads, self.alt_notSC_reads = soft_clip_counts[0] @@ -116,14 +153,16 @@ def __init__(self, bam, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10) ref_edit_distance, alt_edit_distance = edit_distance self.ref_NM = mean(ref_edit_distance) self.alt_NM = mean(alt_edit_distance) - self.z_ranksums_NM = stats.ranksums(alt_edit_distance, ref_edit_distance)[0] + self.z_ranksums_NM = stats.ranksums( + alt_edit_distance, ref_edit_distance)[0] self.NM_Diff = self.alt_NM - self.ref_NM - abs(indel_length) self.concordance_fet = fisher_exact_test(concordance_counts) self.strandbias_fet = fisher_exact_test(orientation_counts) self.clipping_fet = fisher_exact_test(soft_clip_counts) - self.z_ranksums_endpos = stats.ranksums(alt_pos_from_end, ref_pos_from_end)[0] + self.z_ranksums_endpos = stats.ranksums( + alt_pos_from_end, ref_pos_from_end)[0] ref_flanking_indel, alt_flanking_indel = flanking_indel self.ref_indel_1bp = ref_flanking_indel.count(1) @@ -224,24 +263,27 @@ def somaticOddRatio(n_ref, n_alt, t_ref, t_alt, max_value=100): return sor + def max_sub_vocabularies(seq_length, max_subseq_length): # According to: # https://doi.org/10.1093/bioinformatics/18.5.679 # capping the length of sub_string as an input parameter assert max_subseq_length <= seq_length - + counts = 0 k = 1 while k <= max_subseq_length: - + if 4**k < (seq_length - k + 1): counts = counts + 4**k else: - counts = counts + (2*seq_length - k - max_subseq_length + 2) * (max_subseq_length - k + 1)/2 + counts = 
counts + \ + (2 * seq_length - k - max_subseq_length + 2) * \ + (max_subseq_length - k + 1) / 2 break - + k += 1 - + return counts @@ -250,20 +292,22 @@ def subLC(sequence, max_substring_length=20): # https://doi.org/10.1093/bioinformatics/18.5.679 # Cut off substring at a fixed length sequence = sequence.upper() - + if not 'N' in sequence: - - number_of_subseqs = 0 - seq_length = len(sequence) - max_number_of_subseqs = max_sub_vocabularies(seq_length, max_substring_length) - + + number_of_subseqs = 0 + seq_length = len(sequence) + max_number_of_subseqs = max_sub_vocabularies( + seq_length, max_substring_length) + set_of_seq_n = set() - for i in range(1, min(max_substring_length+1, seq_length+1) ): - set_of_seq_n.update((sequence[n: n+i] for n in range(len(sequence) - i + 1))) - - number_of_subseqs = len(set_of_seq_n) - lc = number_of_subseqs/max_number_of_subseqs - + for i in range(1, min(max_substring_length + 1, seq_length + 1)): + set_of_seq_n.update((sequence[n: n + i] + for n in range(len(sequence) - i + 1))) + + number_of_subseqs = len(set_of_seq_n) + lc = number_of_subseqs / max_number_of_subseqs + else: lc = float('nan') From 0e394ff9c428be7d1b7eacf521e7935114bd6961 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Mon, 11 May 2020 22:53:56 -0700 Subject: [PATCH 29/89] small fix --- neusomatic/python/extend_features.py | 5 ++++- neusomatic/python/sequencing_features.py | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index a952c00..338adf7 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -365,7 +365,10 @@ def extend_features(candidates_vcf, "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"]) try: - ext_features = pool.map_async(extract_features, map_args).get() + ext_features = [] + for w in map_args: + ext_features.append(extract_features(w)) + # ext_features = pool.map_async(extract_features, map_args).get() pool.close() with open(output_tsv, "w") as o_f: o_f.write("\t".join(header) + "\n") diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index ebd37db..f6c510b 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -39,10 +39,10 @@ def __init__(self, bam, variants): for i, read in enumerate(self.reads): for j in range(done_i + 1, n): pos = variants[j][1] - if read.reference_start > pos: + if read.reference_start >= pos: done_i += 1 continue - if pos < read.reference_end: + if pos <= read.reference_end: self.var_reads[j].append(i) def get_var_reads(self, var_index): return [self.reads[i] for i in self.var_reads[var_index]] From 3c061b37c49e66e8a2b7db693a227ed7b9915226 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 12 May 2020 01:18:19 -0700 Subject: [PATCH 30/89] record aligned_pairs --- neusomatic/python/extend_features.py | 8 +++--- neusomatic/python/read_info_extractor.py | 27 +++++++++++++++++--- neusomatic/python/sequencing_features.py | 32 ++++++++++++++++++------ 3 files changed, 50 insertions(+), 17 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 338adf7..66ca7ba 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -32,16 +32,14 @@ def extract_features(candidate_record): ext_features = [] for nei_cluster in batch: - t_cluster_reads = sequencing_features.ClusterReads(tbam, 
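# (Usage note, for orientation: each ClusterReads instance fetches every
#  read spanning a sorted cluster of nearby variants with one
#  bam.fetch(chrom, min_pos - 1, max_pos) call, then records which reads
#  overlap each variant. The comparison fix above makes the overlap test
#  agree with the variant's 0-based target position pos - 1: a read can
#  only cover it when reference_start < pos and reference_end >= pos.)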
nei_cluster) n_cluster_reads = sequencing_features.ClusterReads(nbam, nei_cluster) + t_cluster_reads = sequencing_features.ClusterReads(tbam, nei_cluster) for var_i, [chrom, pos, ref, alt, if_cosmic, num_cosmic_cases] in enumerate(nei_cluster): var_id = "-".join([chrom, str(pos), ref, alt]) pos = int(pos) my_coordinate = [chrom, pos] - nBamFeatures = sequencing_features.AlignmentFeatures( - n_cluster_reads.get_var_reads(var_i), my_coordinate, ref, alt, min_mapq, min_bq) - tBamFeatures = sequencing_features.AlignmentFeatures( - t_cluster_reads.get_var_reads(var_i), my_coordinate, ref, alt, min_mapq, min_bq) + nBamFeatures = n_cluster_reads.get_alignment_features(var_i, ref, alt, min_mapq, min_bq) + tBamFeatures = t_cluster_reads.get_alignment_features(var_i, ref, alt, min_mapq, min_bq) sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, tBamFeatures.nalt) diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index 5c005f4..95fd269 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -1,6 +1,12 @@ #!/usr/bin/env python3 import re +import logging +import numpy as np + +FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) cigar_aln_match = 0 cigar_insertion = 1 @@ -19,8 +25,7 @@ ### PYSAM ### - -def position_of_aligned_read(read_i, target_position, win_size=3): +def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3): ''' Return the base call of the target position, and if it's a start of insertion/deletion. This target position follows pysam convension, i.e., 0-based. @@ -33,9 +38,18 @@ def position_of_aligned_read(read_i, target_position, win_size=3): 3: Insertion after the target position 0: The target position does not match to reference, and may be discarded for "reference/alternate" read count purposes, but can be kept for "inconsistent read" metrics. 
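For intuition, a hypothetical illustration (values invented for this note): if aligned_pairs contains (..., (50, 1000), (51, 1001), ...), then querying target_position=1000 returns code 1 with ith_base=50, the read index aligned to the target; a next pair of (None, 1001) would instead signal a deletion starting after the target (code 2), and (51, None) an insertion (code 3).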
''' - flanking_deletion, flanking_insertion = nan, nan - aligned_pairs = read_i.get_aligned_pairs() + # i_match = np.where(aligned_pairs[:,1]==target_position)[0] + # if len(i_match)>0: + # # If find a match: + # seq_i=aligned_pairs[i_match[0],0] + # idx_aligned_pair = i_match[0] + # i = i_match[0] + # else: + # seq_i = None + # idx_aligned_pair = None + # i = aligned_pairs.shape[0]-1 + for i, align_i in enumerate(aligned_pairs): # If find a match: @@ -44,6 +58,11 @@ def position_of_aligned_read(read_i, target_position, win_size=3): idx_aligned_pair = i break + # logger.info([aligned_pairs.shape,i_match,seq_i,idx_aligned_pair,seq_i_,idx_aligned_pair_]) + # assert(i==i_) + # assert(seq_i==seq_i_) + # assert(idx_aligned_pair==idx_aligned_pair_) + # aaa # If the target position is aligned: try: if seq_i is not None: diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index f6c510b..24f944f 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -24,6 +24,7 @@ def fisher_exact_test(mat): class ClusterReads: def __init__(self, bam, variants): + self.variants = variants self.chrom = variants[0][0] self.min_pos = variants[0][1] self.max_pos = variants[-1][1] @@ -44,17 +45,33 @@ def __init__(self, bam, variants): continue if pos <= read.reference_end: self.var_reads[j].append(i) - def get_var_reads(self, var_index): - return [self.reads[i] for i in self.var_reads[var_index]] - + unused_reads = set(range(len(self.reads)))-set([i for j in self.var_reads for i in j]) + for i in unused_reads: + self.reads[i] = None + self.aligned_pairs = [] + for i, read in enumerate(self.reads): + if i not in unused_reads: + self.aligned_pairs.append(np.array(read.get_aligned_pairs())) + else: + self.aligned_pairs.append(None) -class AlignmentFeatures: - def __init__(self, reads, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + def get_alignment_features(self, var_index, ref_base, first_alt, min_mq=1, min_bq=10): ''' bam is the opened file handle of bam file my_coordiate is a list or tuple of 0-based (contig, position) ''' + my_coordinate = self.variants[var_index][0:2] + reads = [self.reads[i] for i in self.var_reads[var_index]] + aligned_pairs = [self.aligned_pairs[i] for i in self.var_reads[var_index]] + bamfeatures = AlignmentFeatures(reads, aligned_pairs, my_coordinate, ref_base, first_alt, min_mq, min_bq) + + return bamfeatures + + +class AlignmentFeatures: + + def __init__(self, reads, aligned_pairs, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): indel_length = len(first_alt) - len(ref_base) @@ -74,14 +91,13 @@ def __init__(self, reads, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=1 qname_collector = defaultdict(list) - for read_i in reads: + for read_i,aligned_pair in zip(reads,aligned_pairs): if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: continue - dp += 1 code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( - read_i, my_coordinate[1] - 1) + read_i, aligned_pair, my_coordinate[1] - 1) if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq: poor_read_count += 1 From 709b64b5fbf1a41eb6c3c5694262c7fbd6208e58 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 12 May 2020 11:06:11 -0700 Subject: [PATCH 31/89] small fix --- neusomatic/python/extend_features.py | 5 +---- neusomatic/python/read_info_extractor.py | 16 ---------------- neusomatic/python/sequencing_features.py | 2 +- 3 files changed, 2 
insertions(+), 21 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 66ca7ba..ba38f00 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -363,10 +363,7 @@ def extend_features(candidates_vcf, "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", "InDel_Length"]) try: - ext_features = [] - for w in map_args: - ext_features.append(extract_features(w)) - # ext_features = pool.map_async(extract_features, map_args).get() + ext_features = pool.map_async(extract_features, map_args).get() pool.close() with open(output_tsv, "w") as o_f: o_f.write("\t".join(header) + "\n") diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index 95fd269..40f38b7 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -39,17 +39,6 @@ def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3) 0: The target position does not match to reference, and may be discarded for "reference/alternate" read count purposes, but can be kept for "inconsistent read" metrics. ''' flanking_deletion, flanking_insertion = nan, nan - # i_match = np.where(aligned_pairs[:,1]==target_position)[0] - # if len(i_match)>0: - # # If find a match: - # seq_i=aligned_pairs[i_match[0],0] - # idx_aligned_pair = i_match[0] - # i = i_match[0] - # else: - # seq_i = None - # idx_aligned_pair = None - # i = aligned_pairs.shape[0]-1 - for i, align_i in enumerate(aligned_pairs): # If find a match: @@ -58,11 +47,6 @@ def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3) idx_aligned_pair = i break - # logger.info([aligned_pairs.shape,i_match,seq_i,idx_aligned_pair,seq_i_,idx_aligned_pair_]) - # assert(i==i_) - # assert(seq_i==seq_i_) - # assert(idx_aligned_pair==idx_aligned_pair_) - # aaa # If the target position is aligned: try: if seq_i is not None: diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 24f944f..b2d09da 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -51,7 +51,7 @@ def __init__(self, bam, variants): self.aligned_pairs = [] for i, read in enumerate(self.reads): if i not in unused_reads: - self.aligned_pairs.append(np.array(read.get_aligned_pairs())) + self.aligned_pairs.append(read.get_aligned_pairs()) else: self.aligned_pairs.append(None) From 12167e1acf7db67f351094aba4f173d46fd47f36 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 12 May 2020 19:15:52 -0700 Subject: [PATCH 32/89] more efficient read/ref pos match search --- neusomatic/python/extend_features.py | 14 +++--- neusomatic/python/read_info_extractor.py | 22 ++++----- neusomatic/python/sequencing_features.py | 57 ++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 25 deletions(-) diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index ba38f00..9e98551 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -323,14 +323,14 @@ def extend_features(candidates_vcf, if curr_pos is None: curr_pos = [chrom, pos] nei_cluster = [[chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]] - continue - if chrom == curr_pos[0] and abs(curr_pos[1]-pos)= split_len or i == n_variants-1: if i == n_variants-1: batch.append(nei_cluster) diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index 
40f38b7..cf71ed0 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -25,7 +25,7 @@ ### PYSAM ### -def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3): +def position_of_aligned_read(read_i, aligned_pairs, read_pos_for_ref_pos, target_position, win_size=3): ''' Return the base call of the target position, and if it's a start of insertion/deletion. This target position follows pysam convension, i.e., 0-based. @@ -39,14 +39,8 @@ def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3) 0: The target position does not match to reference, and may be discarded for "reference/alternate" read count purposes, but can be kept for "inconsistent read" metrics. ''' flanking_deletion, flanking_insertion = nan, nan - for i, align_i in enumerate(aligned_pairs): - - # If find a match: - if align_i[1] == target_position: - seq_i = align_i[0] - idx_aligned_pair = i - break + idx_aligned_pair, seq_i = read_pos_for_ref_pos #get_read_pos_for_ref_pos(read_i, target_position) # If the target position is aligned: try: if seq_i is not None: @@ -55,22 +49,22 @@ def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3) # Whether if it's a Deletion/Insertion depends on what happens after this position: # If the match (i.e., i, seq_i) is the final alignment, then you cannot know if it's an indel # if "i" is NOT the final alignment: - if i != len(aligned_pairs) - 1: + if idx_aligned_pair != len(aligned_pairs) - 1: indel_length = 0 # If the next alignment is the next sequenced base, then the # target is either a reference read of a SNP/SNV: - if aligned_pairs[i + 1][0] == seq_i + 1 and aligned_pairs[i + 1][1] == target_position + 1: + if aligned_pairs[idx_aligned_pair + 1][0] == seq_i + 1 and aligned_pairs[idx_aligned_pair + 1][1] == target_position + 1: code = 1 # Reference read for mismatch # If the next reference position has no read position to it, it # is DELETED in this read: - elif aligned_pairs[i + 1][0] == None and aligned_pairs[i + 1][1] == target_position + 1: + elif aligned_pairs[idx_aligned_pair + 1][0] == None and aligned_pairs[idx_aligned_pair + 1][1] == target_position + 1: code = 2 # Deletion - for align_j in aligned_pairs[i + 1::]: + for align_j in aligned_pairs[idx_aligned_pair + 1::]: if align_j[0] == None: indel_length -= 1 else: @@ -81,11 +75,11 @@ def position_of_aligned_read(read_i, aligned_pairs, target_position, win_size=3) # the inserted sequence is "too long" to align on a single # read. In this case, the inserted length derived here is but a # lower limit of the real inserted length. 
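# (Worked illustration with hypothetical pairs: after a match (50, 1000),
#  the run (None, 1001), (None, 1002), (51, 1003) gives code 2 and
#  indel_length = -2, two reference bases deleted; the run (51, None),
#  (52, None), (53, 1001) gives code 3 and indel_length = +2, or only a
#  lower bound on the true length when the read ends inside the insertion.)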
- elif aligned_pairs[i + 1][0] == seq_i + 1 and aligned_pairs[i + 1][1] == None: + elif aligned_pairs[idx_aligned_pair + 1][0] == seq_i + 1 and aligned_pairs[idx_aligned_pair + 1][1] == None: code = 3 # Insertion or soft-clipping - for align_j in aligned_pairs[i + 1::]: + for align_j in aligned_pairs[idx_aligned_pair + 1::]: if align_j[1] == None: indel_length += 1 else: diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index b2d09da..9fa0c56 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -21,6 +21,45 @@ def fisher_exact_test(mat): return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).two_tail +def get_read_pos_for_ref_pos(read, ref_pos_s): + cigartuples = read.cigartuples + pos_r = read.reference_start + current_i = 0 + output = {} + while current_i < len(ref_pos_s): + if pos_r > ref_pos_s[current_i] or not cigartuples: + output[ref_pos_s[current_i]]=[None, None] + current_i +=1 + else: + break + if current_i >= len(ref_pos_s): + return output + cigar_aligned = [cigar_aln_match, cigar_seq_match, cigar_seq_mismatch] + cigar_s = 1 if cigartuples[0][0] == cigar_soft_clip else 0 + cigar_e = (len(cigartuples) - 1) if cigartuples[-1][0] == cigar_soft_clip else len(cigartuples) + count = pos_q = cigartuples[0][1] if cigar_s == 1 else 0 + cigar_index = cigar_s + for op, length in cigartuples[cigar_s: cigar_e]: + is_aligned = op == 0 or op >= 7 + delta_r = length if (is_aligned or op == cigar_deletion) else 0 + delta_q = length if (is_aligned or op == cigar_insertion) else 0 + while current_i < len(ref_pos_s): + diff = ref_pos_s[current_i] - pos_r + if diff < delta_r: + output[ref_pos_s[current_i]]=[count + diff, (pos_q + diff) if delta_q else None] + current_i +=1 + else: + break + if current_i >= len(ref_pos_s): + return output + count += max(delta_r, delta_q) + pos_r += delta_r + pos_q += delta_q + cigar_index += 1 + while current_i < len(ref_pos_s): + output[ref_pos_s[current_i]]=[None, None] + current_i +=1 + return output class ClusterReads: def __init__(self, bam, variants): @@ -37,6 +76,7 @@ def __init__(self, bam, variants): done_i = -1 n = len(variants) self.var_reads = [[] for i in range(len(variants))] + self.read_vars = [[] for i in range(len(self.reads))] for i, read in enumerate(self.reads): for j in range(done_i + 1, n): pos = variants[j][1] @@ -45,6 +85,7 @@ def __init__(self, bam, variants): continue if pos <= read.reference_end: self.var_reads[j].append(i) + self.read_vars[i].append(j) unused_reads = set(range(len(self.reads)))-set([i for j in self.var_reads for i in j]) for i in unused_reads: self.reads[i] = None @@ -54,6 +95,13 @@ def __init__(self, bam, variants): self.aligned_pairs.append(read.get_aligned_pairs()) else: self.aligned_pairs.append(None) + self.read_pos_for_ref_pos = [] + for i, read in enumerate(self.reads): + if i not in unused_reads: + self.read_pos_for_ref_pos.append(get_read_pos_for_ref_pos(read, + [self.variants[j][1]-1 for j in self.read_vars[i]])) + else: + self.read_pos_for_ref_pos.append(None) def get_alignment_features(self, var_index, ref_base, first_alt, min_mq=1, min_bq=10): @@ -64,14 +112,15 @@ def get_alignment_features(self, var_index, ref_base, first_alt, min_mq=1, min_b my_coordinate = self.variants[var_index][0:2] reads = [self.reads[i] for i in self.var_reads[var_index]] aligned_pairs = [self.aligned_pairs[i] for i in self.var_reads[var_index]] - bamfeatures = AlignmentFeatures(reads, aligned_pairs, my_coordinate, ref_base, 
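# (How the new get_read_pos_for_ref_pos above works: it walks the CIGAR
#  once, advancing a reference cursor and a query cursor per operation,
#  and for each requested reference position emits the matching query
#  index, or None when the position is unaligned in this read.
#  Hypothetical example: reference_start=100 with CIGAR 50M maps reference
#  position 110 to query index 10, while CIGAR 10M5D40M maps position 112,
#  which falls inside the deletion, to None.)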
first_alt, min_mq, min_bq) + read_pos_for_ref_pos_s = [self.read_pos_for_ref_pos[i][my_coordinate[1]-1] for i in self.var_reads[var_index]] + bamfeatures = AlignmentFeatures(reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, ref_base, first_alt, min_mq, min_bq) return bamfeatures class AlignmentFeatures: - def __init__(self, reads, aligned_pairs, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + def __init__(self, reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): indel_length = len(first_alt) - len(ref_base) @@ -91,13 +140,13 @@ def __init__(self, reads, aligned_pairs, my_coordinate, ref_base, first_alt, min qname_collector = defaultdict(list) - for read_i,aligned_pair in zip(reads,aligned_pairs): + for read_i, aligned_pair, read_pos_for_ref_pos in zip(reads,aligned_pairs,read_pos_for_ref_pos_s): if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: continue dp += 1 code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( - read_i, aligned_pair, my_coordinate[1] - 1) + read_i, aligned_pair, read_pos_for_ref_pos, my_coordinate[1] - 1) if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq: poor_read_count += 1 From ff00be31ad2b5c6d2e2e63690625aaed16bdd9eb Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 12 May 2020 23:23:24 -0700 Subject: [PATCH 33/89] input num_splits --- neusomatic/python/preprocess.py | 14 +++++++++----- neusomatic/python/scan_alignments.py | 13 +++++++++---- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 20e60c1..100f52e 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -31,12 +31,12 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, - scan_alignments_binary, restart, num_threads, calc_qual, regions=[]): + scan_alignments_binary, restart, num_splits, num_threads, calc_qual, regions=[]): logger = logging.getLogger(process_split_region.__name__) logger.info("Scan bam.") scan_outputs = scan_alignments(work, scan_alignments_binary, alignment_bam, - region, reference, num_threads, scan_window_size, scan_maf, + region, reference, num_splits, num_threads, scan_window_size, scan_maf, min_mapq, max_dp, filter_duplicate, restart=restart, split_region_files=regions, calc_qual=calc_qual) if filtered_candidates_vcf: @@ -200,6 +200,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, no_seq_complexity, no_feature_recomp_for_ensemble, window_extend, + num_splits, num_threads, scan_alignments_binary,): logger = logging.getLogger(preprocess.__name__) @@ -268,7 +269,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, snp_min_af, -10000, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, - scan_alignments_binary, restart, num_threads, + scan_alignments_binary, restart, num_splits, num_threads, calc_qual=False) tumor_counts_without_q, split_regions, filtered_candidates_vcfs_without_q = tumor_outputs_without_q @@ -293,7 +294,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, - scan_alignments_binary, restart, num_threads, + 
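# (Context for the num_splits plumbing below: when --num_splits is left
#  unset, scan_alignments keeps its heuristic of roughly one split per
#  10 Mb, rounded to a multiple of num_threads. Worked example with
#  hypothetical inputs: total_len = 3.2e9 and num_threads = 8 gives
#  max(ceil((3.2e9 // 1e7) // 8) * 8, 8) = 320 region splits.)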
scan_alignments_binary, restart, num_splits, num_threads, calc_qual=True, regions=candidates_split_regions) tumor_counts, split_regions, filtered_candidates_vcfs = tumor_outputs @@ -320,7 +321,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, - scan_alignments_binary, restart, num_threads, + scan_alignments_binary, restart, num_splits, num_threads, calc_qual=True, regions=candidates_split_regions) @@ -575,6 +576,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--window_extend', type=int, help='window size for extending input features (should be in the order of readlength)', default=1000) + parser.add_argument('--num_splits', type=int, + help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--scan_alignments_binary', type=str, @@ -596,6 +599,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.no_seq_complexity, args.no_feature_recomp_for_ensemble, args.window_extend, + args.num_splits, args.num_threads, args.scan_alignments_binary) except Exception as e: diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index 37b47f2..5b703a0 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -70,7 +70,7 @@ def run_scan_alignments(record): def scan_alignments(work, scan_alignments_binary, input_bam, - regions_bed_file, reference, + regions_bed_file, reference, num_splits, num_threads, window_size, maf, min_mapq, max_dp, filter_duplicate, restart=True, split_region_files=[], calc_qual=True): @@ -115,8 +115,11 @@ def scan_alignments(work, scan_alignments_binary, input_bam, regions_bed_file = os.path.join(work, "all_regions.bed") shutil.move(regions_bed, regions_bed_file) - num_split = max(int(np.ceil((total_len // 10000000) // - num_threads) * num_threads), num_threads) + if num_splits is not None: + num_split = num_splits + else: + num_split = max(int(np.ceil((total_len // 10000000) // + num_threads) * num_threads), num_threads) split_region_files = split_region(work, regions_bed_file, num_split, min_region=window_size, max_region=1e20) else: @@ -189,6 +192,8 @@ def scan_alignments(work, scan_alignments_binary, input_bam, parser.add_argument('--filter_duplicate', help='filter duplicate reads when preparing pileup information', action="store_true") + parser.add_argument('--num_splits', type=int, + help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) args = parser.parse_args() @@ -196,7 +201,7 @@ def scan_alignments(work, scan_alignments_binary, input_bam, try: outputs = scan_alignments(args.work, args.scan_alignments_binary, args.input_bam, - args.regions_bed_file, args.reference, + args.regions_bed_file, args.reference, args.num_splits, args.num_threads, args.window_size, args.maf, args.min_mapq, args.max_dp, args.filter_duplicate) except Exception as e: From 366964e239e8249d536d5fd4dbbed2702b2bf94d Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 13 May 2020 10:30:10 -0700 Subject: [PATCH 34/89] max_cluster size added --- neusomatic/python/extend_features.py | 42 ++++++++++++++++++---------- neusomatic/python/preprocess.py | 7 +++++ 2 files changed, 35 insertions(+), 14 deletions(-) diff --git 
a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index 9e98551..b1606b5 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -32,14 +32,18 @@ def extract_features(candidate_record): ext_features = [] for nei_cluster in batch: - n_cluster_reads = sequencing_features.ClusterReads(nbam, nei_cluster) - t_cluster_reads = sequencing_features.ClusterReads(tbam, nei_cluster) + n_cluster_reads = sequencing_features.ClusterReads( + nbam, nei_cluster) + t_cluster_reads = sequencing_features.ClusterReads( + tbam, nei_cluster) for var_i, [chrom, pos, ref, alt, if_cosmic, num_cosmic_cases] in enumerate(nei_cluster): var_id = "-".join([chrom, str(pos), ref, alt]) pos = int(pos) my_coordinate = [chrom, pos] - nBamFeatures = n_cluster_reads.get_alignment_features(var_i, ref, alt, min_mapq, min_bq) - tBamFeatures = t_cluster_reads.get_alignment_features(var_i, ref, alt, min_mapq, min_bq) + nBamFeatures = n_cluster_reads.get_alignment_features( + var_i, ref, alt, min_mapq, min_bq) + tBamFeatures = t_cluster_reads.get_alignment_features( + var_i, ref, alt, min_mapq, min_bq) sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, tBamFeatures.nalt) @@ -207,6 +211,7 @@ def extend_features(candidates_vcf, dbsnp, cosmic, no_seq_complexity, window_extend, + max_cluster_size, num_threads): logger = logging.getLogger(extend_features.__name__) @@ -279,7 +284,7 @@ def extend_features(candidates_vcf, var_id = "-".join([chrom, pos, ref, alt]) add_vars.add(var_id) - all_variants=[] + all_variants = [] with open(candidates_vcf) as i_f: for line in skip_empty(i_f): chrom, pos, _, ref, alt = line.strip().split("\t")[0:5] @@ -295,7 +300,8 @@ def extend_features(candidates_vcf, if cosmic and var_id in cosmic_vars: if_cosmic = 1 num_cosmic_cases = cosmic_vars[var_id] - all_variants.append([chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases]) + all_variants.append( + [chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases]) if add_variants and len(add_vars) > 0: for var_id in add_vars - set(exclude_vars): @@ -307,9 +313,11 @@ def extend_features(candidates_vcf, if cosmic and var_id in cosmic_vars: if_cosmic = 1 num_cosmic_cases = cosmic_vars[var_id] - all_variants.append([chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases]) + all_variants.append( + [chrom, int(pos), ref, alt, if_cosmic, num_cosmic_cases]) - all_variants = sorted(all_variants,key=lambda x:[chrom_order[x[0]],x[1]]) + all_variants = sorted(all_variants, key=lambda x: [ + chrom_order[x[0]], x[1]]) n_variants = len(all_variants) logger.info("Number of variants: {}".format(n_variants)) split_len = (n_variants + num_threads - 1) // num_threads @@ -324,15 +332,17 @@ def extend_features(candidates_vcf, curr_pos = [chrom, pos] nei_cluster = [[chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]] else: - if chrom == curr_pos[0] and abs(curr_pos[1]-pos)= split_len or i == n_variants-1: - if i == n_variants-1: + nei_cluster = [ + [chrom, pos, ref, alt, if_cosmic, num_cosmic_cases]] + if n_batch >= split_len or i == n_variants - 1: + if i == n_variants - 1: batch.append(nei_cluster) curr_pos = None nei_cluster = [] @@ -415,7 +425,10 @@ def extend_features(candidates_vcf, action="store_true") parser.add_argument('--window_extend', type=int, help='window size for extending input features (should be in the order of readlength)', - default=1000) + default=1000) + parser.add_argument('--max_cluster_size', type=int, + help='max cluster size for extending input 
features (should be in the order of readlength)', + default=300) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) args = parser.parse_args() @@ -431,6 +444,7 @@ def extend_features(candidates_vcf, args.dbsnp, args.cosmic, args.no_seq_complexity, args.window_extend, + args.max_cluster_size, args.num_threads, ) if output is None: diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 100f52e..65d777e 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -200,6 +200,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, no_seq_complexity, no_feature_recomp_for_ensemble, window_extend, + max_cluster_size, num_splits, num_threads, scan_alignments_binary,): @@ -353,6 +354,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, dbsnp, None, no_seq_complexity, window_extend, + max_cluster_size, num_threads) if ensemble_tsv and not no_feature_recomp_for_ensemble: extra_features_others_tsv = os.path.join( @@ -368,6 +370,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, dbsnp, None, no_seq_complexity, window_extend, + max_cluster_size, num_threads) extra_features_bed = os.path.join( @@ -576,6 +579,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--window_extend', type=int, help='window size for extending input features (should be in the order of readlength)', default=1000) + parser.add_argument('--max_cluster_size', type=int, + help='max cluster size for extending input features (should be in the order of readlength)', + default=300) parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, @@ -599,6 +605,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.no_seq_complexity, args.no_feature_recomp_for_ensemble, args.window_extend, + args.max_cluster_size, args.num_splits, args.num_threads, args.scan_alignments_binary) From 839cc63e4bb2b6861847ad1913479ca2afa8128e Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 13 May 2020 13:29:22 -0700 Subject: [PATCH 35/89] better memory management for feature extraction --- neusomatic/python/read_info_extractor.py | 7 +- neusomatic/python/sequencing_features.py | 118 ++++++++++++----------- 2 files changed, 68 insertions(+), 57 deletions(-) diff --git a/neusomatic/python/read_info_extractor.py b/neusomatic/python/read_info_extractor.py index cf71ed0..b9ae3e7 100644 --- a/neusomatic/python/read_info_extractor.py +++ b/neusomatic/python/read_info_extractor.py @@ -25,7 +25,8 @@ ### PYSAM ### -def position_of_aligned_read(read_i, aligned_pairs, read_pos_for_ref_pos, target_position, win_size=3): + +def position_of_aligned_read(aligned_pairs, read_pos_for_ref_pos, target_position, win_size=3): ''' Return the base call of the target position, and if it's a start of insertion/deletion. This target position follows pysam convension, i.e., 0-based. 
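A note on the pattern this patch moves toward: anything derivable from the read alone is computed once per read and handed down, so the hot per-variant path no longer re-queries pysam. A minimal sketch of the caching idea, using a hypothetical cache_per_read_data helper over pysam-style reads (not the patch's own classes):

def cache_per_read_data(reads):
    # One pass per read: stash the expensive pysam lookups so every
    # variant the read overlaps can reuse them without recomputation.
    cache = {}
    for read in reads:
        cache[read.query_name] = {
            "aligned_pairs": read.get_aligned_pairs(),
            "mapping_quality": read.mapping_quality,
            "mean_bq": sum(read.query_qualities) / max(len(read.query_qualities), 1),
        }
    return cache

The AugmentedAlignedRead class introduced in this patch plays that role in the series itself.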
@@ -40,11 +41,11 @@ def position_of_aligned_read(read_i, aligned_pairs, read_pos_for_ref_pos, target ''' flanking_deletion, flanking_insertion = nan, nan - idx_aligned_pair, seq_i = read_pos_for_ref_pos #get_read_pos_for_ref_pos(read_i, target_position) + # get_read_pos_for_ref_pos(read_i, target_position) + idx_aligned_pair, seq_i, base_at_target, qual_at_target = read_pos_for_ref_pos # If the target position is aligned: try: if seq_i is not None: - base_at_target = read_i.seq[seq_i] # Whether if it's a Deletion/Insertion depends on what happens after this position: # If the match (i.e., i, seq_i) is the final alignment, then you cannot know if it's an indel diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 9fa0c56..b2fe8d9 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -21,6 +21,7 @@ def fisher_exact_test(mat): return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).two_tail + def get_read_pos_for_ref_pos(read, ref_pos_s): cigartuples = read.cigartuples pos_r = read.reference_start @@ -28,15 +29,16 @@ def get_read_pos_for_ref_pos(read, ref_pos_s): output = {} while current_i < len(ref_pos_s): if pos_r > ref_pos_s[current_i] or not cigartuples: - output[ref_pos_s[current_i]]=[None, None] - current_i +=1 + output[ref_pos_s[current_i]] = [None, None, None, None] + current_i += 1 else: break if current_i >= len(ref_pos_s): return output cigar_aligned = [cigar_aln_match, cigar_seq_match, cigar_seq_mismatch] cigar_s = 1 if cigartuples[0][0] == cigar_soft_clip else 0 - cigar_e = (len(cigartuples) - 1) if cigartuples[-1][0] == cigar_soft_clip else len(cigartuples) + cigar_e = (len(cigartuples) - + 1) if cigartuples[-1][0] == cigar_soft_clip else len(cigartuples) count = pos_q = cigartuples[0][1] if cigar_s == 1 else 0 cigar_index = cigar_s for op, length in cigartuples[cigar_s: cigar_e]: @@ -46,8 +48,13 @@ def get_read_pos_for_ref_pos(read, ref_pos_s): while current_i < len(ref_pos_s): diff = ref_pos_s[current_i] - pos_r if diff < delta_r: - output[ref_pos_s[current_i]]=[count + diff, (pos_q + diff) if delta_q else None] - current_i +=1 + output[ref_pos_s[current_i]] = [count + diff, (pos_q + diff) if delta_q else None, + read.seq[ + (pos_q + diff)] if delta_q else None, + read.query_qualities[ + (pos_q + diff)] if delta_q else None, + ] + current_i += 1 else: break if current_i >= len(ref_pos_s): @@ -57,70 +64,75 @@ def get_read_pos_for_ref_pos(read, ref_pos_s): pos_q += delta_q cigar_index += 1 while current_i < len(ref_pos_s): - output[ref_pos_s[current_i]]=[None, None] - current_i +=1 + output[ref_pos_s[current_i]] = [None, None, None, None] + current_i += 1 return output + +class AugmentedAlignedRead: + + def __init__(self, read, vars_pos): + self.qname = read.qname + self.vars_pos = vars_pos + self.read_pos_for_ref_pos = get_read_pos_for_ref_pos(read, vars_pos) + self.aligned_pairs = read.get_aligned_pairs() + self.mapping_quality = read.mapping_quality + self.mean_query_qualities = mean(read.query_qualities) + self.is_proper_pair = read.is_proper_pair + self.is_reverse = read.is_reverse + self.NM = read.get_tag('NM') + self.query_length = read.query_length + self.is_soft_clipped = read.cigar[0][ + 0] == cigar_soft_clip or read.cigar[-1][0] == cigar_soft_clip + + class ClusterReads: + def __init__(self, bam, variants): self.variants = variants self.chrom = variants[0][0] self.min_pos = variants[0][1] self.max_pos = variants[-1][1] self.reads = [] + n = len(variants) + 
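# (The sweep below relies on both reads and variants being
#  coordinate-sorted: done_j permanently skips variants positioned at or
#  before the current read's start, so pairing reads with the variants
#  they cover stays near-linear in cluster size instead of quadratic.)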
self.var_reads = [[] for i in range(len(variants))] + self.read_pos_for_ref_pos = [] + self.aligned_pairs = [] + done_j = -1 + i = 0 for read_i in bam.fetch(self.chrom, self.min_pos - 1, self.max_pos): if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: continue - self.reads.append(read_i) - - done_i = -1 - n = len(variants) - self.var_reads = [[] for i in range(len(variants))] - self.read_vars = [[] for i in range(len(self.reads))] - for i, read in enumerate(self.reads): - for j in range(done_i + 1, n): + read_vars = [] + for j in range(done_j + 1, n): pos = variants[j][1] - if read.reference_start >= pos: - done_i += 1 + if read_i.reference_start >= pos: + done_j += 1 continue - if pos <= read.reference_end: + if pos <= read_i.reference_end: self.var_reads[j].append(i) - self.read_vars[i].append(j) - unused_reads = set(range(len(self.reads)))-set([i for j in self.var_reads for i in j]) - for i in unused_reads: - self.reads[i] = None - self.aligned_pairs = [] - for i, read in enumerate(self.reads): - if i not in unused_reads: - self.aligned_pairs.append(read.get_aligned_pairs()) - else: - self.aligned_pairs.append(None) - self.read_pos_for_ref_pos = [] - for i, read in enumerate(self.reads): - if i not in unused_reads: - self.read_pos_for_ref_pos.append(get_read_pos_for_ref_pos(read, - [self.variants[j][1]-1 for j in self.read_vars[i]])) - else: - self.read_pos_for_ref_pos.append(None) - + read_vars.append(j) + if len(read_vars) > 0: + vars_pos = [self.variants[j][1] - 1 for j in read_vars] + self.reads.append(AugmentedAlignedRead(read_i, vars_pos)) + i += 1 def get_alignment_features(self, var_index, ref_base, first_alt, min_mq=1, min_bq=10): ''' bam is the opened file handle of bam file - my_coordiate is a list or tuple of 0-based (contig, position) + my_coordinate is a list or tuple of 0-based (contig, position) ''' my_coordinate = self.variants[var_index][0:2] reads = [self.reads[i] for i in self.var_reads[var_index]] - aligned_pairs = [self.aligned_pairs[i] for i in self.var_reads[var_index]] - read_pos_for_ref_pos_s = [self.read_pos_for_ref_pos[i][my_coordinate[1]-1] for i in self.var_reads[var_index]] - bamfeatures = AlignmentFeatures(reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, ref_base, first_alt, min_mq, min_bq) + bamfeatures = AlignmentFeatures( + reads, my_coordinate, ref_base, first_alt, min_mq, min_bq) return bamfeatures class AlignmentFeatures: - def __init__(self, reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): + def __init__(self, reads, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=10): indel_length = len(first_alt) - len(ref_base) @@ -140,15 +152,15 @@ def __init__(self, reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, qname_collector = defaultdict(list) - for read_i, aligned_pair, read_pos_for_ref_pos in zip(reads,aligned_pairs,read_pos_for_ref_pos_s): - if read_i.is_unmapped or not dedup_test(read_i) or read_i.seq is None: - continue + for read_i in reads: dp += 1 - + read_pos_for_ref_pos = read_i.read_pos_for_ref_pos[ + my_coordinate[1] - 1] code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( - read_i, aligned_pair, read_pos_for_ref_pos, my_coordinate[1] - 1) + read_i.aligned_pairs, read_pos_for_ref_pos, my_coordinate[1] - 1) + read_i_qual_ith_base = read_pos_for_ref_pos[3] - if read_i.mapping_quality < min_mq and mean(read_i.query_qualities) < min_bq: + if read_i.mapping_quality < min_mq and read_i.mean_query_qualities 
< min_bq: poor_read_count += 1 if read_i.mapping_quality == 0: @@ -170,21 +182,19 @@ def __init__(self, reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, qname_collector[read_i.qname].append(index) read_mq[index].append(read_i.mapping_quality) - read_bq[index].append(read_i.query_qualities[ith_base]) + read_bq[index].append(read_i_qual_ith_base) try: - edit_distance[index].append(read_i.get_tag('NM')) + edit_distance[index].append(read_i.NM) except KeyError: pass - if read_i.mapping_quality >= min_mq and read_i.query_qualities[ith_base] >= min_bq: + if read_i.mapping_quality >= min_mq and read_i_qual_ith_base >= min_bq: concordance_counts[ 0 if read_i.is_proper_pair else 1][index] += 1 orientation_counts[1 if read_i.is_reverse else 0][index] += 1 - is_soft_clipped = read_i.cigar[0][ - 0] == cigar_soft_clip or read_i.cigar[-1][0] == cigar_soft_clip - soft_clip_counts[1 if is_soft_clipped else 0][index] += 1 + soft_clip_counts[1 if read_i.is_soft_clipped else 0][index] += 1 # Distance from the end of the read: if ith_base is not None: @@ -258,7 +268,7 @@ def __init__(self, reads, aligned_pairs, read_pos_for_ref_pos_s, my_coordinate, def from_genome_reference(ref_fa, my_coordinate, ref_base, first_alt): ''' ref_fa is the opened reference fasta file handle - my_coordiate is a list or tuple of 0-based (contig, position) + my_coordinate is a list or tuple of 0-based (contig, position) ''' # Homopolymer eval (Make sure to modify for INDEL): From 050188311b99fdb420cea27057396b1d33e43cc7 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 13 May 2020 15:54:58 -0700 Subject: [PATCH 36/89] not to store aligned_pairs --- neusomatic/python/sequencing_features.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index b2fe8d9..8aa7bf8 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -75,7 +75,12 @@ def __init__(self, read, vars_pos): self.qname = read.qname self.vars_pos = vars_pos self.read_pos_for_ref_pos = get_read_pos_for_ref_pos(read, vars_pos) - self.aligned_pairs = read.get_aligned_pairs() + self.pos_of_aligned_read = {} + for pos in vars_pos: + code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( + read.get_aligned_pairs(), self.read_pos_for_ref_pos[pos], pos) + self.pos_of_aligned_read[pos] = [ + code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i] self.mapping_quality = read.mapping_quality self.mean_query_qualities = mean(read.query_qualities) self.is_proper_pair = read.is_proper_pair @@ -96,8 +101,6 @@ def __init__(self, bam, variants): self.reads = [] n = len(variants) self.var_reads = [[] for i in range(len(variants))] - self.read_pos_for_ref_pos = [] - self.aligned_pairs = [] done_j = -1 i = 0 for read_i in bam.fetch(self.chrom, self.min_pos - 1, self.max_pos): @@ -156,8 +159,8 @@ def __init__(self, reads, my_coordinate, ref_base, first_alt, min_mq=1, min_bq=1 dp += 1 read_pos_for_ref_pos = read_i.read_pos_for_ref_pos[ my_coordinate[1] - 1] - code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( - read_i.aligned_pairs, read_pos_for_ref_pos, my_coordinate[1] - 1) + code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = read_i.pos_of_aligned_read[ + my_coordinate[1] - 1] read_i_qual_ith_base = read_pos_for_ref_pos[3] if read_i.mapping_quality < min_mq and read_i.mean_query_qualities < min_bq: From 
90a68da63b131a8676f5d5360da6d44c32c3c13a Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 13 May 2020 16:17:14 -0700 Subject: [PATCH 37/89] small fix --- neusomatic/python/sequencing_features.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 8aa7bf8..265d0f8 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -76,9 +76,10 @@ def __init__(self, read, vars_pos): self.vars_pos = vars_pos self.read_pos_for_ref_pos = get_read_pos_for_ref_pos(read, vars_pos) self.pos_of_aligned_read = {} + aligned_pairs = read.get_aligned_pairs() for pos in vars_pos: code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i = position_of_aligned_read( - read.get_aligned_pairs(), self.read_pos_for_ref_pos[pos], pos) + aligned_pairs, self.read_pos_for_ref_pos[pos], pos) self.pos_of_aligned_read[pos] = [ code_i, ith_base, base_call_i, indel_length_i, flanking_indel_i] self.mapping_quality = read.mapping_quality From f83b6b520961d163ebc51ff96acaf3c7e609a069 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 14 May 2020 17:57:28 -0700 Subject: [PATCH 38/89] enable custom header --- neusomatic/python/generate_dataset.py | 7 +++++-- neusomatic/python/preprocess.py | 28 ++++++++++++++++++--------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 5f9c6b8..f9c5bf3 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1535,8 +1535,11 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be split_batch_size = 10000 if ensemble_tsv and not ensemble_bed: ensemble_bed = os.path.join(work, "ensemble.bed") - extract_ensemble([ensemble_tsv], ensemble_bed, - no_seq_complexity, enforce_header, False) + extract_ensemble(ensemble_tsvs=[ensemble_tsv], ensemble_bed=ensemble_bed, + no_seq_complexity=no_seq_complexity, enforce_header=enforce_header, + custom_header=ensemble_custom_header, + is_extend=False) + tmp_ = bedtools_intersect( tumor_pred_vcf_file, region_bed_file, args=" -u", run_logger=logger) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 65d777e..5b742dd 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -194,7 +194,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, truth_vcf, tsv_batch_size, matrix_width, matrix_base_pad, min_ev_frac_per_col, - ensemble_tsv, long_read, restart, first_do_without_qual, + ensemble_tsv, ensemble_custom_header, + long_read, restart, first_do_without_qual, keep_duplicate, add_extra_features, no_seq_complexity, @@ -248,9 +249,10 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, ensemble_bed = os.path.join(work, "ensemble.bed") logger.info("Extract ensemble info.") if restart or not os.path.exists(ensemble_bed): - extract_ensemble([ensemble_tsv], ensemble_bed, - no_seq_complexity, no_feature_recomp_for_ensemble, False) - + extract_ensemble(ensemble_tsvs=[ensemble_tsv], ensemble_bed=ensemble_bed, + no_seq_complexity=no_seq_complexity, enforce_header=no_feature_recomp_for_ensemble, + custom_header=ensemble_custom_header, + is_extend=False) merge_d_for_short_read = 100 candidates_split_regions = [] ensemble_beds = [] @@ -376,8 +378,12 @@ def preprocess(work, 
mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, extra_features_bed = os.path.join( work_dataset_split, "extra_features.bed") if not os.path.exists(extra_features_bed) or restart: - extract_ensemble(ex_tsvs, - extra_features_bed, no_seq_complexity, True, True) + extract_ensemble(ensemble_tsvs=ex_tsvs, + ensemble_bed=extra_features_bed, + no_seq_complexity=no_seq_complexity, + enforce_header=True, + custom_header=False, + is_extend=True) if ensemble_tsv: merged_features_bed = os.path.join( work_dataset_split, "merged_features.bed") @@ -555,6 +561,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, help='minimum frac cov per column to keep columm', default=0.06) parser.add_argument('--ensemble_tsv', type=str, help='Ensemble annotation tsv file (only for short read)', default=None) + parser.add_argument('--ensemble_custom_header', + help='Allow ensemble tsv to have custom header fields', + action="store_true") parser.add_argument('--long_read', help='Enable long_read (high error-rate sequence) indel realignment', action="store_true") @@ -578,10 +587,10 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, action="store_true") parser.add_argument('--window_extend', type=int, help='window size for extending input features (should be in the order of readlength)', - default=1000) + default=1000) parser.add_argument('--max_cluster_size', type=int, help='max cluster size for extending input features (should be in the order of readlength)', - default=300) + default=300) parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, @@ -599,7 +608,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.ins_min_af, args.del_min_af, args.del_merge_min_af, args.ins_merge_min_af, args.merge_r, args.truth_vcf, args.tsv_batch_size, args.matrix_width, args.matrix_base_pad, args.min_ev_frac_per_col, - args.ensemble_tsv, args.long_read, args.restart, args.first_do_without_qual, + args.ensemble_tsv, args.ensemble_custom_header, + args.long_read, args.restart, args.first_do_without_qual, args.keep_duplicate, args.add_extra_features, args.no_seq_complexity, From 49c809f82a8242e38d26d4e4f89c8ca9e9916ff5 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 14 May 2020 17:58:24 -0700 Subject: [PATCH 39/89] fixed a bug --- neusomatic/python/generate_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 5f9c6b8..235f6db 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -984,7 +984,7 @@ def find_records(input_record): else: r_ = [[chrom, pos, ref, alt]] - ann = [0] * NUM_ENS_FEATURES + ann = [0] * num_ens_features if pos == ens_pos: if ref == ens_ref and alt == ens_alt: ann = record_[15:] From 2f9905bbe8504468a9f40c09433e376c1d944aa1 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 14 May 2020 22:14:38 -0700 Subject: [PATCH 40/89] fix bug in region splitting --- neusomatic/python/generate_dataset.py | 45 +++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 235f6db..5e7eacb 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -825,6 +825,41 @@ def find_len(ref, alt): return max(len(ref_), len(alt_)) +def 
keep_in_region(input_file, region_bed, + output_fn): + logger = logging.getLogger(find_len.__name__) + i = 0 + tmp_ = get_tmp_file() + with open(input_file) as i_f, open(tmp_, "w") as o_f: + for line in skip_empty(i_f): + fields = line.strip().split() + chrom, start, end = fields[0:3] + o_f.write( + "\t".join([chrom, start, str(int(start) + 1), str(i)]) + "\n") + i += 1 + + good_i = set([]) + tmp_ = bedtools_intersect( + tmp_, region_bed, args=" -wa -wb", run_logger=logger) + with open(tmp_) as i_f: + for line in skip_empty(i_f): + fields = line.strip().split() + chrom, start, end, i_, chrom_, start_, end_ = fields[0:7] + assert(chrom == chrom_) + if start_ <= start <= end_: + good_i.add(int(i_)) + i = 0 + with open(input_file) as i_f, open(output_fn, "w") as o_f: + for line in skip_empty(i_f, skip_header=False): + if line.startswith("#"): + o_f.write(line) + continue + fields = line.strip().split() + if i in good_i: + o_f.write(line) + i += 1 + + def find_records(input_record): work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, no_seq_complexity, work_index = input_record thread_logger = logging.getLogger( @@ -853,11 +888,17 @@ def find_records(input_record): num_ens_features += 2 bedtools_intersect( truth_vcf_file, split_bed, args=" -u", output_fn=split_truth_vcf_file, run_logger=thread_logger) + tmp_ = get_tmp_file() bedtools_intersect( - pred_vcf_file, split_bed, args=" -u", output_fn=split_pred_vcf_file, run_logger=thread_logger) + pred_vcf_file, split_bed, args=" -u", output_fn=tmp_, run_logger=thread_logger) + keep_in_region(input_file=tmp_, region_bed=split_region_file, + output_fn=split_pred_vcf_file) if ensemble_bed: + tmp_ = get_tmp_file() bedtools_intersect( - ensemble_bed, split_bed, args=" -u", output_fn=split_ensemble_bed_file, run_logger=thread_logger) + ensemble_bed, split_bed, args=" -u", output_fn=tmp_, run_logger=thread_logger) + keep_in_region(input_file=tmp_, region_bed=split_region_file, + output_fn=split_ensemble_bed_file) tmp_ = bedtools_window( split_ensemble_bed_file, split_pred_vcf_file, args=" -w 5 -v", run_logger=thread_logger) From c4f24ace7a294c84a94d7719860ac7f66be6deb5 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 14 May 2020 22:37:42 -0700 Subject: [PATCH 41/89] small fix --- neusomatic/python/generate_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 5e7eacb..ee89e6b 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -839,8 +839,8 @@ def keep_in_region(input_file, region_bed, i += 1 good_i = set([]) - tmp_ = bedtools_intersect( - tmp_, region_bed, args=" -wa -wb", run_logger=logger) + tmp_ = bedtools_window( + tmp_, region_bed, args=" -w 1", run_logger=logger) with open(tmp_) as i_f: for line in skip_empty(i_f): fields = line.strip().split() From 1ce3935b8359fe2f0beb69d10427f872298bbbcc Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 14 May 2020 23:50:07 -0700 Subject: [PATCH 42/89] small_fix --- neusomatic/python/generate_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index ee89e6b..ca2afc5 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -827,7 +827,7 @@ def find_len(ref, alt): def keep_in_region(input_file, region_bed, output_fn): - logger = logging.getLogger(find_len.__name__) + logger = 
logging.getLogger(keep_in_region.__name__) i = 0 tmp_ = get_tmp_file() with open(input_file) as i_f, open(tmp_, "w") as o_f: @@ -846,7 +846,7 @@ def keep_in_region(input_file, region_bed, fields = line.strip().split() chrom, start, end, i_, chrom_, start_, end_ = fields[0:7] assert(chrom == chrom_) - if start_ <= start <= end_: + if int(start_) <= int(start) <= int(end_): good_i.add(int(i_)) i = 0 with open(input_file) as i_f, open(output_fn, "w") as o_f: From 8b29757ea55c431b0153efd2fb83ceeb312a0dbc Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 15 May 2020 11:58:27 -0700 Subject: [PATCH 43/89] small fix --- neusomatic/python/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 0211443..e488058 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -253,7 +253,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo if not force_zero_ann_cols: logger.info( "Override zero_ann_cols from pretrained checkpoint: {}".format(zero_ann_cols)) - prev_epochs = sofar_epochs + 1 + prev_epochs = sofar_epochs else: prev_epochs = 0 time_now = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S") @@ -450,7 +450,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, - }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) + }, '{}/models/checkpoint_{}_epoch{}_.pth'.format(out_dir, tag, curr_epoch)) if len(train_sets) == 1: train_sets[0].open_candidate_tsvs() From 7b0cb751957ab3f4dd9f5572dcc9689ae9646725 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 15 May 2020 15:07:37 -0700 Subject: [PATCH 44/89] small fix --- neusomatic/python/generate_dataset.py | 16 ++++++++++++---- neusomatic/python/preprocess.py | 9 +++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 95ccb1f..3bebdc8 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1367,7 +1367,9 @@ def find_records(input_record): return None -def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_header, is_extend): +def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_header, + ensemble_custom_header, + is_extend): logger = logging.getLogger(extract_ensemble.__name__) ensemble_data = [] ensemble_pos = [] @@ -1549,7 +1551,9 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv, - ensemble_bed, no_seq_complexity, enforce_header, tsv_batch_size): + ensemble_bed, + ensemble_custom_header, + no_seq_complexity, enforce_header, tsv_batch_size): logger = logging.getLogger(generate_dataset.__name__) logger.info("---------------------Generate Dataset----------------------") @@ -1581,7 +1585,6 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be custom_header=ensemble_custom_header, is_extend=False) - tmp_ = bedtools_intersect( tumor_pred_vcf_file, region_bed_file, args=" -u", run_logger=logger) len_candids = 0 @@ -1792,6 +1795,9 @@ def generate_dataset(work, truth_vcf_file, mode, 
tumor_pred_vcf_file, region_be help='Ensemble annotation tsv file (only for short read)', default=None) parser.add_argument('--ensemble_bed', type=str, help='Ensemble annotation bed file (only for short read)', default=None) + parser.add_argument('--ensemble_custom_header', + help='Allow ensemble tsv to have custom header fields', + action="store_true") parser.add_argument('--no_seq_complexity', help='Dont compute linguistic sequence complexity features', action="store_true") @@ -1822,7 +1828,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be try: generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv, - ensemble_bed, no_seq_complexity, enforce_header, tsv_batch_size) + ensemble_bed, + ensemble_custom_header, + no_seq_complexity, enforce_header, tsv_batch_size) except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 5b742dd..049e041 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -79,11 +79,14 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, - matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, no_seq_complexity, + matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, + ensemble_custom_header, + no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size): logger = logging.getLogger(generate_dataset_region.__name__) generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, None, ensemble_bed, + ensemble_custom_header, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) @@ -491,7 +494,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf, candidates_split_region, tumor_count, normal_count, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, - ensemble_bed_i, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) + ensemble_bed_i, + ensemble_custom_header, + no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) shutil.rmtree(bed_tempdir) tempfile.tempdir = original_tempdir From 7706c3b141bf4bdfabd966774284fd48dbbc1373 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 16 May 2020 00:47:15 -0700 Subject: [PATCH 45/89] enable custom heading --- neusomatic/python/call.py | 60 +++--- neusomatic/python/generate_dataset.py | 254 ++++++++++++++------------ neusomatic/python/postprocess.py | 44 +++-- neusomatic/python/preprocess.py | 3 +- neusomatic/python/train.py | 62 ++++--- 5 files changed, 244 insertions(+), 179 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 18b3969..2395947 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -432,6 +432,10 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, zero_ann_cols = pretrained_dict["zero_ann_cols"] else: zero_ann_cols = [] + if "ensemble_custom_header" in pretrained_dict: + 
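
Older checkpoints predate the ensemble_custom_header key, hence the membership test with a False fallback; dict.get expresses the same backward-compatible read. A minimal sketch (the checkpoint path here is hypothetical):

import torch

pretrained_dict = torch.load("checkpoint.pth", map_location="cpu")
ensemble_custom_header = pretrained_dict.get("ensemble_custom_header", False)
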
ensemble_custom_header = pretrained_dict["ensemble_custom_header"] + else: + ensemble_custom_header = False if force_zero_ann_cols: logger.info( @@ -442,33 +446,43 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("normalize_channels: {}".format(normalize_channels)) logger.info("no_seq_complexity: {}".format(no_seq_complexity)) logger.info("zero_ann_cols: {}".format(zero_ann_cols)) - + logger.info("ensemble_custom_header: {}".format(ensemble_custom_header)) - expected_ens_fields = NUM_ENS_FEATURES - if not no_seq_complexity: - expected_ens_fields += 2 - logger.info("expected_ens_fields: {}".format(expected_ens_fields)) - - expected_st_fields = 4 - - logger.info("expected_st_fields: {}".format(expected_st_fields)) + if ensemble_custom_header: + expected_ens_fields = NUM_ENS_FEATURES + if not no_seq_complexity: + expected_ens_fields += 2 + + logger.info("expected_ens_fields: {}".format(expected_ens_fields)) + + expected_st_fields = 4 + + logger.info("expected_st_fields: {}".format(expected_st_fields)) + + ensemble = False + for tsv in candidates_tsv: + with open(tsv) as i_f: + x = i_f.readline().strip().split() + if x: + if len(x) == expected_ens_fields + 4: + ensemble = True + break + elif len(x) == 4: + break + else: + raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) - ensemble = False - for tsv in candidates_tsv: - with open(tsv) as i_f: - x = i_f.readline().strip().split() - if x: - if len(x) == expected_ens_fields + 4: - ensemble = True - break - elif len(x) == 4: + num_channels = expected_ens_fields + \ + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES + else: + num_channels = 0 + for tsv in candidates_tsv: + with open(tsv) as i_f: + x = i_f.readline().strip().split() + if x: + num_channels = len(x) - 4 + NUM_ST_FEATURES break - else: - raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) - - num_channels = expected_ens_fields + \ - NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 3bebdc8..62649ac 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -861,7 +861,7 @@ def keep_in_region(input_file, region_bed, def find_records(input_record): - work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, no_seq_complexity, work_index = input_record + work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, num_ens_features, work_index = input_record thread_logger = logging.getLogger( "{} ({})".format(find_records.__name__, multiprocessing.current_process().name)) try: @@ -883,9 +883,6 @@ def find_records(input_record): split_in_ensemble_bed = os.path.join( work, "in_ensemble_{}.bed".format(work_index)) - num_ens_features = NUM_ENS_FEATURES - if not no_seq_complexity: - num_ens_features += 2 bedtools_intersect( truth_vcf_file, split_bed, args=" -u", output_fn=split_truth_vcf_file, run_logger=thread_logger) tmp_ = get_tmp_file() @@ -1368,7 +1365,7 @@ def find_records(input_record): def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_header, - ensemble_custom_header, + custom_header, is_extend): logger = logging.getLogger(extract_ensemble.__name__) ensemble_data = [] @@ -1417,31 +1414,39 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea all_headers.add(line) header_pos = 
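
In custom-header mode the network width can no longer be derived from the fixed ensemble feature list, so call.py reads it off the first non-empty candidate row: four leading columns are coordinates, and everything after them is an extra channel on top of the standalone ones. A sketch of that inference (26 is only a placeholder for the repo's NUM_ST_FEATURES constant):

NUM_ST_FEATURES = 26  # placeholder; the real constant is defined in the codebase

def infer_num_channels(candidates_tsv):
    with open(candidates_tsv) as i_f:
        for line in i_f:
            x = line.strip().split()
            if x:
                return len(x) - 4 + NUM_ST_FEATURES
    return NUM_ST_FEATURES  # no candidates: standalone channels only
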
line.strip().split()[0:5] header_ = line.strip().split()[5:] - if is_extend: - header_ += callers_features - header_en = list(filter( - lambda x: x[1] in expected_features, enumerate(header_))) - header = list(map(lambda x: x[1], header_en)) - if not enforce_header: - expected_features = header - - if set(expected_features) - set(header): - logger.error("The following features are missing from ensemble file {}: {}".format( - ensemble_tsv, - list(set(expected_features) - set(header)))) - raise Exception - order_header = [] - for f in expected_features: - order_header.append(header_en[header.index(f)][0]) + if not custom_header: + if is_extend: + header_ += callers_features + header_en = list(filter( + lambda x: x[1] in expected_features, enumerate(header_))) + header = list(map(lambda x: x[1], header_en)) + if not enforce_header: + expected_features = header + + if set(expected_features) - set(header): + logger.error("The following features are missing from ensemble file {}: {}".format( + ensemble_tsv, + list(set(expected_features) - set(header)))) + raise Exception + order_header = [] + for f in expected_features: + order_header.append(header_en[header.index(f)][0]) + else: + order_header=range(len(header_)) continue fields = line.strip().split() fields[2] = str(int(fields[1]) + len(fields[3])) ensemble_pos.append(fields[0:5]) features = fields[5:] - if is_extend: + if is_extend and not custom_header: features += ["0"] * len(callers_features) - ensemble_data.append(list(map(lambda x: float( - x.replace("False", "0").replace("True", "1")), features))) + features = list(map(lambda x: float( + x.replace("False", "0").replace("True", "1")), features)) + if custom_header: + if min(features)<0 or max(features)>1: + logger.info("In --ensemble_custom_header mode, feature values in ensemble.tsv should be normalized in [0,1]" ) + raise Exception + ensemble_data.append(features) n_vars += 1 if len(set(all_headers)) != 1: raise(RuntimeError("inconsistent headers in {}".format(ensemble_tsvs))) @@ -1449,98 +1454,102 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea ensemble_data = np.array(ensemble_data)[:, order_header] header = np.array(header_)[order_header].tolist() - cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ - "Consistent_Mates", "Inconsistent_Mates", "N_DP", - "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", - "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", - "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp", "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", - "nBAM_ALT_InDel_1bp", - "T_DP", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", - "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", - "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", - "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp", - "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", - ], enumerate(header)))) - mq_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ - "nBAM_REF_MQ", "nBAM_ALT_MQ", "tBAM_REF_MQ", "tBAM_ALT_MQ"], enumerate(header)))) - bq_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ - "nBAM_REF_BQ", "nBAM_ALT_BQ", "tBAM_REF_BQ", "tBAM_ALT_BQ"], enumerate(header)))) - nm_diff_features = 
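
order_header above is a column permutation built from header names, so the data matrix can be rearranged into the canonical feature order regardless of how columns appear in the file. In miniature:

header_ = ["B", "A", "C"]            # columns as they appear in the file
expected_features = ["A", "B"]       # canonical order the model expects
order_header = [header_.index(f) for f in expected_features]
assert order_header == [1, 0]
# np.array(ensemble_data)[:, order_header] then yields canonically ordered columns
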
list(map(lambda x: x[0], filter( - lambda x: x[1] in ["nBAM_NM_Diff", "tBAM_NM_Diff"], enumerate(header)))) - ranksum_features = list(map(lambda x: x[0], filter(lambda x: x[1] in ["nBAM_Z_Ranksums_MQ", "nBAM_Z_Ranksums_BQ", - "nBAM_Z_Ranksums_EndPos", "tBAM_Z_Ranksums_BQ", "tBAM_Z_Ranksums_MQ", "tBAM_Z_Ranksums_EndPos", ], enumerate(header)))) - zero_to_one_features = list(map(lambda x: x[0], filter(lambda x: x[1] in ["if_MuTect", "if_VarScan2", "if_SomaticSniper", "if_VarDict", - "MuSE_Tier", "if_Strelka"] + ["nBAM_Concordance_FET", "nBAM_StrandBias_FET", "nBAM_Clipping_FET", - "tBAM_Concordance_FET", "tBAM_StrandBias_FET", "tBAM_Clipping_FET"] + ["if_dbsnp", "COMMON"] + ["M2_STR"], enumerate(header)))) - stralka_scor = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["Strelka_Score"], enumerate(header)))) - stralka_qss = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["Strelka_QSS"], enumerate(header)))) - stralka_tqss = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["Strelka_TQSS"], enumerate(header)))) - varscan2_score = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["VarScan2_Score"], enumerate(header)))) - vardict_score = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["VarDict_Score"], enumerate(header)))) - m2_lod = list(map(lambda x: x[0], filter(lambda x: x[1] in [ - "M2_NLOD", "M2_TLOD"], enumerate(header)))) - sniper_score = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["Sniper_Score"], enumerate(header)))) - m2_ecent = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["M2_ECNT"], enumerate(header)))) - sor = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["SOR"], enumerate(header)))) - msi = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["MSI"], enumerate(header)))) - msilen = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["MSILEN"], enumerate(header)))) - shift3 = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["SHIFT3"], enumerate(header)))) - MaxHomopolymer_Length = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["MaxHomopolymer_Length"], enumerate(header)))) - SiteHomopolymer_Length = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["SiteHomopolymer_Length"], enumerate(header)))) - InDel_Length = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["InDel_Length"], enumerate(header)))) - Seq_Complexity_ = list(map(lambda x: x[0], filter( - lambda x: x[1] in ["Seq_Complexity_Span", "Seq_Complexity_Adj"], enumerate(header)))) - - min_max_features = [[cov_features, 0, 2 * COV], - [mq_features, 0, 70], - [bq_features, 0, 41], - [nm_diff_features, -2 * COV, 2 * COV], - [zero_to_one_features, 0, 1], - [ranksum_features, -30, 30], - [stralka_scor, 0, 40], - [stralka_qss, 0, 200], - [stralka_tqss, 0, 4], - [varscan2_score, 0, 60], - [vardict_score, 0, 120], - [m2_lod, 0, 100], - [sniper_score, 0, 120], - [m2_ecent, 0, 40], - [sor, 0, 100], - [msi, 0, 100], - [msilen, 0, 10], - [shift3, 0, 100], - [MaxHomopolymer_Length, 0, 50], - [SiteHomopolymer_Length, 0, 50], - [InDel_Length, -30, 30], - ] - if not no_seq_complexity: - min_max_features.append([Seq_Complexity_, 0, 40]) - - selected_features = sorted([i for f in min_max_features for i in f[0]]) - selected_features_tags = list(map(lambda x: header[x], selected_features)) - if n_vars > 0: - for i_s, mn, mx in min_max_features: - if i_s: - s = ensemble_data[:, np.array(i_s)] - s = np.maximum(np.minimum(s, mx), mn) - s = (s - mn) / (mx - mn) - ensemble_data[:, np.array(i_s)] = s - ensemble_data = ensemble_data[:, selected_features] - ensemble_data = 
ensemble_data.tolist() + if not custom_header: + cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "Consistent_Mates", "Inconsistent_Mates", "N_DP", + "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", + "N_REF_FOR", "N_REF_REV", "N_ALT_FOR", "N_ALT_REV", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", + "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp", "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", + "nBAM_ALT_InDel_1bp", + "T_DP", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", + "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", + "tBAM_REF_Clipped_Reads", "tBAM_ALT_Clipped_Reads", + "tBAM_MQ0", "tBAM_Other_Reads", "tBAM_Poor_Reads", "tBAM_REF_InDel_3bp", "tBAM_REF_InDel_2bp", + "tBAM_REF_InDel_1bp", "tBAM_ALT_InDel_3bp", "tBAM_ALT_InDel_2bp", "tBAM_ALT_InDel_1bp", + ], enumerate(header)))) + mq_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "nBAM_REF_MQ", "nBAM_ALT_MQ", "tBAM_REF_MQ", "tBAM_ALT_MQ"], enumerate(header)))) + bq_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "nBAM_REF_BQ", "nBAM_ALT_BQ", "tBAM_REF_BQ", "tBAM_ALT_BQ"], enumerate(header)))) + nm_diff_features = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["nBAM_NM_Diff", "tBAM_NM_Diff"], enumerate(header)))) + ranksum_features = list(map(lambda x: x[0], filter(lambda x: x[1] in ["nBAM_Z_Ranksums_MQ", "nBAM_Z_Ranksums_BQ", + "nBAM_Z_Ranksums_EndPos", "tBAM_Z_Ranksums_BQ", "tBAM_Z_Ranksums_MQ", "tBAM_Z_Ranksums_EndPos", ], enumerate(header)))) + zero_to_one_features = list(map(lambda x: x[0], filter(lambda x: x[1] in ["if_MuTect", "if_VarScan2", "if_SomaticSniper", "if_VarDict", + "MuSE_Tier", "if_Strelka"] + ["nBAM_Concordance_FET", "nBAM_StrandBias_FET", "nBAM_Clipping_FET", + "tBAM_Concordance_FET", "tBAM_StrandBias_FET", "tBAM_Clipping_FET"] + ["if_dbsnp", "COMMON"] + ["M2_STR"], enumerate(header)))) + stralka_scor = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Strelka_Score"], enumerate(header)))) + stralka_qss = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Strelka_QSS"], enumerate(header)))) + stralka_tqss = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Strelka_TQSS"], enumerate(header)))) + varscan2_score = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["VarScan2_Score"], enumerate(header)))) + vardict_score = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["VarDict_Score"], enumerate(header)))) + m2_lod = list(map(lambda x: x[0], filter(lambda x: x[1] in [ + "M2_NLOD", "M2_TLOD"], enumerate(header)))) + sniper_score = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Sniper_Score"], enumerate(header)))) + m2_ecent = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["M2_ECNT"], enumerate(header)))) + sor = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["SOR"], enumerate(header)))) + msi = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["MSI"], enumerate(header)))) + msilen = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["MSILEN"], enumerate(header)))) + shift3 = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["SHIFT3"], enumerate(header)))) + MaxHomopolymer_Length = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["MaxHomopolymer_Length"], enumerate(header)))) + SiteHomopolymer_Length = list(map(lambda x: x[0], filter( + lambda x: x[1] in 
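
Each of the feature groups above is located with the same map/filter/enumerate idiom; an equivalent, more explicit helper shows what it computes:

def indices_of(header, names):
    # indices of the header columns whose name is in `names`, preserving order
    wanted = set(names)
    return [i for i, h in enumerate(header) if h in wanted]

assert indices_of(["T_DP", "SOR", "MSI"], ["SOR"]) == [1]
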
["SiteHomopolymer_Length"], enumerate(header)))) + InDel_Length = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["InDel_Length"], enumerate(header)))) + Seq_Complexity_ = list(map(lambda x: x[0], filter( + lambda x: x[1] in ["Seq_Complexity_Span", "Seq_Complexity_Adj"], enumerate(header)))) + + min_max_features = [[cov_features, 0, 2 * COV], + [mq_features, 0, 70], + [bq_features, 0, 41], + [nm_diff_features, -2 * COV, 2 * COV], + [zero_to_one_features, 0, 1], + [ranksum_features, -30, 30], + [stralka_scor, 0, 40], + [stralka_qss, 0, 200], + [stralka_tqss, 0, 4], + [varscan2_score, 0, 60], + [vardict_score, 0, 120], + [m2_lod, 0, 100], + [sniper_score, 0, 120], + [m2_ecent, 0, 40], + [sor, 0, 100], + [msi, 0, 100], + [msilen, 0, 10], + [shift3, 0, 100], + [MaxHomopolymer_Length, 0, 50], + [SiteHomopolymer_Length, 0, 50], + [InDel_Length, -30, 30], + ] + if not no_seq_complexity: + min_max_features.append([Seq_Complexity_, 0, 40]) + + selected_features = sorted([i for f in min_max_features for i in f[0]]) + selected_features_tags = list(map(lambda x: header[x], selected_features)) + if n_vars > 0: + for i_s, mn, mx in min_max_features: + if i_s: + s = ensemble_data[:, np.array(i_s)] + s = np.maximum(np.minimum(s, mx), mn) + s = (s - mn) / (mx - mn) + ensemble_data[:, np.array(i_s)] = s + ensemble_data = ensemble_data[:, selected_features] + ensemble_data = ensemble_data.tolist() + else: + ensemble_data = ensemble_data.tolist() + selected_features_tags = header_ with open(ensemble_bed, "w")as f_: f_.write( "#" + "\t".join(map(str, header_pos + selected_features_tags)) + "\n") @@ -1606,11 +1615,22 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be fasta_file = pysam.Fastafile(ref_file) chrom_lengths = dict(zip(fasta_file.references, fasta_file.lengths)) + if not ensemble_custom_header: + num_ens_features = NUM_ENS_FEATURES + if not no_seq_complexity: + num_ens_features += 2 + else: + num_ens_features = 0 + with open(ensemble_bed) as i_f: + x = i_f.readline().strip().split() + if x: + num_ens_features = len(x) - 5 + pool = multiprocessing.Pool(num_threads) map_args = [] for i, split_region_file in enumerate(split_region_files): map_args.append((work, split_region_file, truth_vcf_file, - tumor_pred_vcf_file, ref_file, ensemble_bed, no_seq_complexity, i)) + tumor_pred_vcf_file, ref_file, ensemble_bed, num_ens_features, i)) try: records_data = pool.map_async(find_records, map_args).get() pool.close() diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index b8d0b3a..b62f9e7 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -33,6 +33,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, logger = logging.getLogger(add_vcf_info.__name__) ensemble_candids_vcf = None + use_ensemble_candids = False if ensemble_tsv: ensemble_candids_vcf = os.path.join(work, "ensemble_candids.vcf") with open(ensemble_tsv) as e_f, open(ensemble_candids_vcf, "w") as c_f: @@ -40,39 +41,48 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, c_f.write( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") for line in e_f: - if "T_REF_FOR" in line: + if "POS" in line: header = line.strip().split() chrom_id = header.index("CHROM") pos_id = header.index("POS") ref_id = header.index("REF") alt_id = header.index("ALT") - dp_id = header.index("T_DP") - ref_fw_id = header.index("T_REF_FOR") - ref_rv_id = header.index("T_REF_REV") - alt_fw_id = 
header.index("T_ALT_FOR") - alt_rv_id = header.index("T_ALT_REV") + if "T_DP" in line: + dp_id = header.index("T_DP") + ref_fw_id = header.index("T_REF_FOR") + ref_rv_id = header.index("T_REF_REV") + alt_fw_id = header.index("T_ALT_FOR") + alt_rv_id = header.index("T_ALT_REV") + use_ensemble_candids = True + else: + dp_id, ref_fw_id, ref_rv_id, alt_fw_id, alt_rv_id = None, None, None, None, None continue fields = line.strip().split() chrom = fields[chrom_id] pos = fields[pos_id] ref = fields[ref_id] alt = fields[alt_id] - dp = int(fields[dp_id]) - ro_fw = int(fields[ref_fw_id]) - ro_rv = int(fields[ref_rv_id]) - ao_fw = int(fields[alt_fw_id]) - ao_rv = int(fields[alt_rv_id]) - ro = ro_fw + ro_rv - ao = ao_fw + ao_rv - af = np.round(ao / float(ao + ro + 0.0001), 4) - c_f.write( - "\t".join(map(str, [chrom, pos, ".", ref, alt, ".", ".", ".", "GT:DP:RO:AO:AF", ":".join(map(str, ["0/1", dp, ro, ao, af]))])) + "\n") + if dp_id is not None: + dp = int(fields[dp_id]) + ro_fw = int(fields[ref_fw_id]) + ro_rv = int(fields[ref_rv_id]) + ao_fw = int(fields[alt_fw_id]) + ao_rv = int(fields[alt_rv_id]) + ro = ro_fw + ro_rv + ao = ao_fw + ao_rv + af = np.round(ao / float(ao + ro + 0.0001), 4) + c_f.write( + "\t".join(map(str, [chrom, pos, ".", ref, alt, ".", ".", ".", "GT:DP:RO:AO:AF", ":".join(map(str, ["0/1", dp, ro, ao, af]))])) + "\n") + else: + c_f.write( + "\t".join(map(str, [chrom, pos, ".", ref, alt, ".", ".", ".", ".", "."])) + "\n") + in_candidates = bedtools_window( merged_vcf, candidates_vcf, args=" -w 5", run_logger=logger) notin_candidates = bedtools_window( merged_vcf, candidates_vcf, args=" -w 5 -v", run_logger=logger) - if ensemble_tsv: + if ensemble_tsv and use_ensemble_candids: in_ensemble = bedtools_window( merged_vcf, ensemble_candids_vcf, args=" -w 5", run_logger=logger) notin_any = bedtools_window( diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 049e041..f69734e 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -567,7 +567,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--ensemble_tsv', type=str, help='Ensemble annotation tsv file (only for short read)', default=None) parser.add_argument('--ensemble_custom_header', - help='Allow ensemble tsv to have custom header fields', + help='Allow ensemble tsv to have custom header fields. 
(Features should be\ + normalized between [0,1]', action="store_true") parser.add_argument('--long_read', help='Enable long_read (high error-rate sequence) indel realignment', diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index e488058..45fbae2 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -206,6 +206,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo no_seq_complexity, zero_ann_cols, force_zero_ann_cols, + ensemble_custom_header, use_cuda): logger = logging.getLogger(train_neusomatic.__name__) @@ -253,6 +254,10 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo if not force_zero_ann_cols: logger.info( "Override zero_ann_cols from pretrained checkpoint: {}".format(zero_ann_cols)) + if "ensemble_custom_header" in pretrained_dict: + ensemble_custom_header = pretrained_dict["ensemble_custom_header"] + else: + ensemble_custom_header = False prev_epochs = sofar_epochs else: prev_epochs = 0 @@ -265,33 +270,40 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo logger.info( "Override zero_ann_cols from force_zero_ann_cols: {}".format(force_zero_ann_cols)) + if not ensemble_custom_header: + expected_ens_fields = NUM_ENS_FEATURES + if not no_seq_complexity: + expected_ens_fields += 2 - expected_ens_fields = NUM_ENS_FEATURES - if not no_seq_complexity: - expected_ens_fields += 2 + logger.info("expected_ens_fields: {}".format(expected_ens_fields)) - logger.info("expected_ens_fields: {}".format(expected_ens_fields)) + expected_st_fields = 4 - expected_st_fields = 4 + logger.info("expected_st_fields: {}".format(expected_st_fields)) - logger.info("expected_st_fields: {}".format(expected_st_fields)) + ensemble = False + for tsv in candidates_tsv: + with open(tsv) as i_f: + x = i_f.readline().strip().split() + if x: + if len(x) == expected_ens_fields + 4: + ensemble = True + break + elif len(x) == 4: + break + else: + raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) - ensemble = False - for tsv in candidates_tsv: - with open(tsv) as i_f: - x = i_f.readline().strip().split() - if x: - if len(x) == expected_ens_fields + 4: - ensemble = True - break - elif len(x) == 4: + num_channels = expected_ens_fields + \ + NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES + else: + num_channels = 0 + for tsv in candidates_tsv: + with open(tsv) as i_f: + x = i_f.readline().strip().split() + if x: + num_channels = len(x) - 4 + NUM_ST_FEATURES break - else: - raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) - - num_channels = expected_ens_fields + \ - NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES - logger.info("Number of channels: {}".format(num_channels)) net = NeuSomaticNet(num_channels) if use_cuda: @@ -450,6 +462,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, + "ensemble_custom_header": ensemble_custom_header, }, '{}/models/checkpoint_{}_epoch{}_.pth'.format(out_dir, tag, curr_epoch)) if len(train_sets) == 1: @@ -517,6 +530,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, + "ensemble_custom_header": ensemble_custom_header, }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if 
validation_candidates_tsv: test(net, curr_epoch, validation_loader, use_cuda) @@ -537,6 +551,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "normalize_channels": normalize_channels, "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, + "ensemble_custom_header": ensemble_custom_header, }, '{}/models/checkpoint_{}_epoch{}.pth'.format( out_dir, tag, curr_epoch)) if validation_candidates_tsv: @@ -623,6 +638,10 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo --zero_ann_cols and pretrained setting \ idx starts from 5th column in candidate.tsv file', default=[]) + parser.add_argument('--ensemble_custom_header', + help='Allow ensemble tsv to have custom header fields. (Features should be\ + normalized between [0,1]', + action="store_true") args = parser.parse_args() logger.info(args) @@ -643,6 +662,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo args.no_seq_complexity, args.zero_ann_cols, args.force_zero_ann_cols, + args.ensemble_custom_header, use_cuda) except Exception as e: logger.error(traceback.format_exc()) From 27f98c8db8cd3479d312373d45bfa269723b73e5 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 16 May 2020 05:04:07 -0700 Subject: [PATCH 46/89] small fix --- neusomatic/python/call.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 2395947..d814a1b 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -449,7 +449,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("ensemble_custom_header: {}".format(ensemble_custom_header)) - if ensemble_custom_header: + if not ensemble_custom_header: expected_ens_fields = NUM_ENS_FEATURES if not no_seq_complexity: expected_ens_fields += 2 From 0bc4655e2d4a1698a3bafe3b3b4754edb0e4fbe1 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 16 May 2020 17:50:43 -0700 Subject: [PATCH 47/89] small fix --- neusomatic/python/generate_dataset.py | 27 ++-- neusomatic/python/preprocess.py | 200 +++++++++++++++++--------- 2 files changed, 152 insertions(+), 75 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 62649ac..53f8ca8 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1405,6 +1405,9 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] + if is_extend and custom_header: + expected_features = list( + filter(lambda x: x not in callers_features, expected_features)) n_vars = 0 all_headers = set([]) for ensemble_tsv in ensemble_tsvs: @@ -1414,8 +1417,10 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea all_headers.add(line) header_pos = line.strip().split()[0:5] header_ = line.strip().split()[5:] - if not custom_header: - if is_extend: + if custom_header and not is_extend: + order_header = range(len(header_)) + else: + if is_extend and not custom_header: header_ += callers_features header_en = list(filter( lambda x: x[1] in expected_features, enumerate(header_))) @@ -1431,8 +1436,6 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea order_header = [] for f in expected_features: order_header.append(header_en[header.index(f)][0]) - else: - 
order_header=range(len(header_)) continue fields = line.strip().split() fields[2] = str(int(fields[1]) + len(fields[3])) @@ -1442,9 +1445,10 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea features += ["0"] * len(callers_features) features = list(map(lambda x: float( x.replace("False", "0").replace("True", "1")), features)) - if custom_header: - if min(features)<0 or max(features)>1: - logger.info("In --ensemble_custom_header mode, feature values in ensemble.tsv should be normalized in [0,1]" ) + if custom_header and not is_extend: + if min(features) < 0 or max(features) > 1: + logger.info( + "In --ensemble_custom_header mode, feature values in ensemble.tsv should be normalized in [0,1]") raise Exception ensemble_data.append(features) n_vars += 1 @@ -1454,7 +1458,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea ensemble_data = np.array(ensemble_data)[:, order_header] header = np.array(header_)[order_header].tolist() - if not custom_header: + if not custom_header or is_extend: cov_features = list(map(lambda x: x[0], filter(lambda x: x[1] in [ "Consistent_Mates", "Inconsistent_Mates", "N_DP", "nBAM_REF_NM", "nBAM_ALT_NM", "nBAM_REF_Concordant", "nBAM_REF_Discordant", "nBAM_ALT_Concordant", "nBAM_ALT_Discordant", @@ -1537,7 +1541,8 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea min_max_features.append([Seq_Complexity_, 0, 40]) selected_features = sorted([i for f in min_max_features for i in f[0]]) - selected_features_tags = list(map(lambda x: header[x], selected_features)) + selected_features_tags = list( + map(lambda x: header[x], selected_features)) if n_vars > 0: for i_s, mn, mx in min_max_features: if i_s: @@ -1548,7 +1553,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea ensemble_data = ensemble_data[:, selected_features] ensemble_data = ensemble_data.tolist() else: - ensemble_data = ensemble_data.tolist() + ensemble_data = ensemble_data.tolist() selected_features_tags = header_ with open(ensemble_bed, "w")as f_: f_.write( @@ -1624,7 +1629,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be with open(ensemble_bed) as i_f: x = i_f.readline().strip().split() if x: - num_ens_features = len(x) - 5 + num_ens_features = len(x) - 5 pool = multiprocessing.Pool(num_threads) map_args = [] diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index f69734e..6b3900f 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -79,7 +79,7 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, - matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, + matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, ensemble_custom_header, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size): @@ -236,6 +236,10 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, logger.error("Aborting!") raise Exception( "No normal .bai index file {}".format(normal_bam + ".bai")) + if no_feature_recomp_for_ensemble and ensemble_custom_header: + logger.error("Aborting!") + raise Exception( + "--ensemble_custom_header and --no_feature_recomp_for_ensemble are incompatible") if dbsnp: if dbsnp[-6:] != "vcf.gz": @@ -382,18 +386,18 @@ def 
preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_dataset_split, "extra_features.bed") if not os.path.exists(extra_features_bed) or restart: extract_ensemble(ensemble_tsvs=ex_tsvs, - ensemble_bed=extra_features_bed, - no_seq_complexity=no_seq_complexity, - enforce_header=True, - custom_header=False, + ensemble_bed=extra_features_bed, + no_seq_complexity=no_seq_complexity, + enforce_header=True, + custom_header=ensemble_custom_header, is_extend=True) if ensemble_tsv: merged_features_bed = os.path.join( work_dataset_split, "merged_features.bed") if not os.path.exists(merged_features_bed) or restart: exclude_ens_variants = [] - header_line = "" if no_feature_recomp_for_ensemble: + header_line = "" with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: for line in skip_empty(i_f_1, skip_header=False): if line.startswith("#"): @@ -425,68 +429,136 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, continue o_f.write(line) else: - callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", - "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS", - "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", - "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] - with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: - ens_variants_info = {} - header_1_found = False - header_2_found = False - for line in skip_empty(i_f_1, skip_header=False): - if line.startswith("#"): - if not header_line: - header_line = line - else: + if not ensemble_custom_header: + header_line = "" + callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", + "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS", + "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", + "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] + with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: + ens_variants_info = {} + header_1_found = False + header_2_found = False + for line in skip_empty(i_f_1, skip_header=False): + if line.startswith("#"): + if not header_line: + header_line = line + else: + if header_line != line: + logger.error( + "{}!={}".format(header_line, line)) + raise Exception + header_ = line.strip().split()[5:] + header_caller = list(filter( + lambda x: x[1] in callers_features, enumerate(header_))) + header_caller_ = list( + map(lambda x: x[1], header_caller)) + header_i = list( + map(lambda x: x[0], header_caller)) + header_1_found = True + continue + assert header_1_found + fields = line.strip().split("\t") + chrom, pos, _, ref, alt = fields[0:5] + var_id = "-".join([chrom, + pos, ref, alt]) + ens_variants_info[var_id] = np.array(fields[5:])[ + header_i] + for line in skip_empty(i_f_2, skip_header=False): + if line.startswith("#"): if header_line != line: logger.error( "{}!={}".format(header_line, line)) - raise Exception - header_ = line.strip().split()[5:] - header_caller = list(filter( - lambda x: x[1] in callers_features, enumerate(header_))) - header_caller_ = list( - map(lambda x: x[1], header_caller)) - header_i = list( - map(lambda x: x[0], header_caller)) - header_1_found = True - continue - assert header_1_found - fields = 
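
Both merge paths above key feature rows by a "chrom-pos-ref-alt" string so the two BEDs can be joined without assuming identical row order. In miniature:

import numpy as np

fields = ["chr1", "100", "101", "A", "T", "0.3", "0.7"]
chrom, pos, _, ref, alt = fields[0:5]
var_id = "-".join([chrom, pos, ref, alt])
assert var_id == "chr1-100-A-T"
ens_variants_info = {var_id: np.array(fields[5:])}
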
line.strip().split("\t") - chrom, pos, _, ref, alt = fields[0:5] - var_id = "-".join([chrom, pos, ref, alt]) - ens_variants_info[var_id] = np.array(fields[5:])[ - header_i] - for line in skip_empty(i_f_2, skip_header=False): - if line.startswith("#"): - if header_line != line: - logger.error( - "{}!={}".format(header_line, line)) - if not header_2_found: - header_2 = line.strip().split()[5:] - logger.info(header_2) - order_header = [] - for f in header_caller_: - if f not in header_2: - logger.info( - "Missing header field {}".format(f)) + if not header_2_found: + header_2 = line.strip().split()[ + 5:] + order_header = [] + for f in header_caller_: + if f not in header_2: + logger.info( + "Missing header field {}".format(f)) + raise Exception + order_header.append( + header_2.index(f)) + o_f.write(line) + header_2_found = True + + assert header_2_found + fields = line.strip().split("\t") + chrom, pos, _, ref, alt = fields[0:5] + var_id = "-".join([chrom, + pos, ref, alt]) + if var_id in ens_variants_info: + fields_ = np.array(fields[5:]) + fields_[order_header] = ens_variants_info[ + var_id] + fields[5:] = fields_.tolist() + o_f.write( + "\t".join(list(map(str, fields))) + "\n") + else: + header_line_1 = "" + header_line_2 = "" + with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: + ens_variants_info = {} + ex_variants_info = {} + header_1_found = False + header_2_found = False + for line in skip_empty(i_f_1, skip_header=False): + if line.startswith("#"): + if not header_line_1: + header_line_1 = line + else: + if header_line_1 != line: + logger.error( + "{}!={}".format(header_line_1, line)) raise Exception - order_header.append( - header_2.index(f)) - o_f.write(line) - header_2_found = True - - assert header_2_found - fields = line.strip().split("\t") - chrom, pos, _, ref, alt = fields[0:5] - var_id = "-".join([chrom, pos, ref, alt]) - if var_id in ens_variants_info: - fields_ = np.array(fields[5:]) - fields_[order_header] = ens_variants_info[ - var_id] - fields[5:] = fields_.tolist() + header_1 = line.strip().split()[5:] + header_1_found = True + continue + assert header_1_found + fields = line.strip().split("\t") + chrom, pos, _, ref, alt = fields[0:5] + var_id = "-".join([chrom, + pos, ref, alt]) + ens_variants_info[ + var_id] = np.array(fields[5:]) + for line in skip_empty(i_f_2, skip_header=False): + if line.startswith("#"): + if not header_line_2: + header_line_2 = line + else: + if header_line_2 != line: + logger.error( + "{}!={}".format(header_line_2, line)) + raise Exception + header_2 = line.strip().split()[5:] + header_2_found = True + continue + assert header_2_found + fields = line.strip().split("\t") + chrom, pos, _, ref, alt = fields[0:5] + var_id = "-".join([chrom, + pos, ref, alt]) + ex_variants_info[ + var_id] = np.array(fields[5:]) + header_mixed = [ + "#CHROM", "POS", "ID", "REF", "ALT"] + header_1 + header_2 o_f.write( - "\t".join(list(map(str, fields))) + "\n") + "\t".join(list(map(str, header_mixed))) + "\n") + for var_id in set(ens_variants_info.keys()) | set(ex_variants_info.keys()): + features = [0.0] * \ + (len(header_1) + len(header_2)) + if var_id in ens_variants_info: + features[0:len(header_1)] = ens_variants_info[ + var_id] + if var_id in ex_variants_info: + features[len(header_1):] = ex_variants_info[ + var_id] + chrom = "-".join(var_id.split("-") + [:-3]) + pos, ref, alt = var_id.split("-")[-3:] + o_f.write( + "\t".join(list(map(str, [chrom, pos, int(pos) + len(ref), ref, alt] + features))) + 
"\n") ensemble_bed_i = merged_features_bed else: ensemble_bed_i = extra_features_bed @@ -494,7 +566,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, generate_dataset_region(work_dataset_split, truth_vcf, mode, filtered_vcf, candidates_split_region, tumor_count, normal_count, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, - ensemble_bed_i, + ensemble_bed_i, ensemble_custom_header, no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) From 7deb7d6e963607eb920b78837d55e7af7f48954a Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 23 May 2020 16:20:40 -0700 Subject: [PATCH 48/89] small fix --- neusomatic/python/generate_dataset.py | 2 ++ neusomatic/python/resolve_variants.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 53f8ca8..042333e 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1849,6 +1849,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be ensemble_bed = args.ensemble_bed no_seq_complexity = args.no_seq_complexity tsv_batch_size = args.tsv_batch_size + ensemble_custom_header = args.ensemble_custom_header + enforce_header = args.enforce_header try: generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index 0a672c5..c2dbf09 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -61,7 +61,7 @@ def extract_ins(record): continue if C == CIGAR_INS: inss.append([record.reference_name, pos, pos + 1, - record.query[seq_pos:seq_pos + L]]) + record.seq[seq_pos:seq_pos + L]]) seq_pos += L else: if C != CIGAR_DEL: From 7897df8dc0e84c98158a2c1bc29d704aa83e23d5 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 28 May 2020 08:53:33 -0700 Subject: [PATCH 49/89] small fix --- neusomatic/python/split_bed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/split_bed.py b/neusomatic/python/split_bed.py index b9bb1b7..e1be65f 100755 --- a/neusomatic/python/split_bed.py +++ b/neusomatic/python/split_bed.py @@ -37,7 +37,7 @@ def split_region(work, region_bed_file, num_splits, max_region=1000000, min_regi shuffle(intervals) total_len = sum(map(lambda x: int(x[2]) - int(x[1]) + 1, intervals)) logger.info("Total length: {}".format(total_len)) - split_len = total_len // num_splits + split_len = max(total_len // num_splits, min_region) split_regions = [] current_regions = [] sofar_len = 0 From 8ac67a1885c801e9cd1ade685c1a537c96d86f5c Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 30 May 2020 21:57:57 -0700 Subject: [PATCH 50/89] fix ann --- neusomatic/python/generate_dataset.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 042333e..bf808e2 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -975,9 +975,11 @@ def find_records(input_record): r_ = [[chrom, pos, ref, alt]] ann = [0] * num_ens_features + var_match = False if pos == ens_pos: if ref == ens_ref and alt == ens_alt: ann = record_[15:] + var_match = True elif (len(ref) > len(alt) and len(ens_ref) > len(ens_alt) and (alt) == (ens_alt)): if ((len(ref) > len(ens_ref) and 
ref[0:len(ens_ref)] == ens_ref) or ( @@ -990,14 +992,18 @@ def find_records(input_record): ann = record_[15:] if ann: ann = list(map(float, ann)) - rrs.append([r_, ann]) + rrs.append([r_, ann, var_match]) + has_var_match = sum(map(lambda x: x[2], rrs)) + if has_var_match: + rrs = list( + filter(lambda x: x[2], rrs))[0:1] max_ann = max(map(lambda x: sum(x[1]), rrs)) if max_ann > 0: rrs = list( filter(lambda x: sum(x[1]) > 0, rrs)) elif max_ann == 0: rrs = rrs[0:1] - for r_, ann in rrs: + for r_, ann, _ in rrs: for rr in r_: records.append(rr + [str(i)]) anns[i] = ann @@ -1023,9 +1029,11 @@ def find_records(input_record): r_ = [[chrom, pos, ref, alt]] ann = [0] * num_ens_features + var_match = False if pos == ens_pos: if ref == ens_ref and alt == ens_alt: ann = record_[15:] + var_match = True elif (len(ref) > len(alt) and len(ens_ref) > len(ens_alt) and (alt) == (ens_alt)): if ((len(ref) > len(ens_ref) and ref[0:len(ens_ref)] == ens_ref) or ( @@ -1038,13 +1046,17 @@ def find_records(input_record): ann = record_[15:] if ann: ann = list(map(float, ann)) - rrs.append([r_, ann]) + rrs.append([r_, ann, var_match]) + has_var_match = sum(map(lambda x: x[2], rrs)) + if has_var_match: + rrs = list( + filter(lambda x: x[2], rrs))[0:1] max_ann = max(map(lambda x: sum(x[1]), rrs)) if max_ann > 0: rrs = list(filter(lambda x: sum(x[1]) > 0, rrs)) elif max_ann == 0: rrs = rrs[0:1] - for r_, ann in rrs: + for r_, ann, _ in rrs: for rr in r_: records.append(rr + [str(i)]) anns[i] = ann From 607abcdebf9943839a292188cfaf4b879ee6c4b4 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 5 Jun 2020 14:36:01 -0700 Subject: [PATCH 51/89] small fix --- neusomatic/python/long_read_indelrealign.py | 1 - neusomatic/python/postprocess.py | 9 ++++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/neusomatic/python/long_read_indelrealign.py b/neusomatic/python/long_read_indelrealign.py index 4a74eda..53c1635 100755 --- a/neusomatic/python/long_read_indelrealign.py +++ b/neusomatic/python/long_read_indelrealign.py @@ -1069,7 +1069,6 @@ def run_realignment(input_record): num_add_before = min(40, pos - 1) before = ref_fasta.fetch( region.chrom, pos - num_add_before, pos - 1).upper() - print(before) pos -= num_add_before - 1 ref = before + ref alt = before + alt diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index b62f9e7..e549a33 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -21,7 +21,7 @@ from extract_postprocess_targets import extract_postprocess_targets from merge_post_vcfs import merge_post_vcfs from resolve_variants import resolve_variants -from utils import concatenate_files, get_chromosomes_order, bedtools_window, run_bedtools_cmd, skip_empty +from utils import concatenate_files, get_chromosomes_order, bedtools_window, bedtools_intersect, skip_empty from long_read_indelrealign import long_read_indelrealign from resolve_scores import resolve_scores from _version import __version__ @@ -196,7 +196,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense tempfile.tempdir = bed_tempdir candidates_preds = os.path.join(work, "candidates_preds.vcf") - ensembled_preds = os.path.join(work, "ensembled_preds.vcf") + ensembled_preds = os.path.join(work, "ensemble_preds.vcf") bedtools_window( pred_vcf_file, candidates_vcf, args=" -w 5 -v", output_fn=ensembled_preds, run_logger=logger) @@ -242,9 +242,8 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense not_resolved_vcf = 
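
The var_match bookkeeping above changes annotation selection to prefer an exact chrom/pos/ref/alt hit: only when no ensemble row matches exactly does the overlap-based fallback apply. The same priority rule in miniature:

def pick_annotations(rrs):
    # rrs: list of [records, ann, var_match] triples as built in find_records
    if any(var_match for _, _, var_match in rrs):
        return [rr for rr in rrs if rr[2]][0:1]        # exact match wins
    max_ann = max(sum(ann) for _, ann, _ in rrs)
    if max_ann > 0:
        return [rr for rr in rrs if sum(rr[1]) > 0]    # annotated overlaps
    return rrs[0:1]                                    # nothing annotated: keep one
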
os.path.join( work, "candidates_preds.not_ra_resolved.vcf") - cmd = "bedtools intersect -a {} -b {} -u".format( - target_vcf, not_resolved_bed) - run_bedtools_cmd(cmd, output_fn=not_resolved_vcf, run_logger=logger) + bedtools_intersect(target_vcf, not_resolved_bed, args=" -u ", + output_fn=not_resolved_vcf, run_logger=logger) all_no_resolve = concatenate_files( [no_resolve, ensembled_preds, not_resolved_vcf], os.path.join(work, "no_resolve.vcf")) From 867ba5f37ac7429b770fc6014f14c119f1474c38 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Wed, 17 Jun 2020 21:59:53 -0700 Subject: [PATCH 52/89] added callers vcf to tsv --- neusomatic/python/extend_features.py | 13 +- neusomatic/python/generate_dataset.py | 20 +- neusomatic/python/preprocess.py | 18 +- neusomatic/python/read_callers_vcf.py | 477 +++++++++++++++++++++++ neusomatic/python/sequencing_features.py | 12 +- 5 files changed, 528 insertions(+), 12 deletions(-) create mode 100755 neusomatic/python/read_callers_vcf.py diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index b1606b5..bf4c118 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -48,6 +48,14 @@ def extract_features(candidate_record): sor = sequencing_features.somaticOddRatio(nBamFeatures.nref, nBamFeatures.nalt, tBamFeatures.nref, tBamFeatures.nalt) + try: + score_varscan2 = genome.p2phred(sequencing_features.fisher_exact_test( + ((tBamFeatures.nalt, nBamFeatures.nalt), + (tBamFeatures.nref, nBamFeatures.nref)), + alternative='greater')) + except ValueError: + score_varscan2 = nan + homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( ref_fa, my_coordinate, ref, alt) @@ -136,6 +144,7 @@ def extract_features(candidate_record): SOR = sor MaxHomopolymer_Length = homopolymer_length SiteHomopolymer_Length = site_homopolymer_length + score_varscan2 = rescale(score_varscan2, 'phred', p_scale, 1001) T_DP = tBamFeatures.dp tBAM_REF_MQ = '%g' % tBamFeatures.ref_mq tBAM_ALT_MQ = '%g' % tBamFeatures.alt_mq @@ -185,7 +194,7 @@ def extract_features(candidate_record): nBAM_Z_Ranksums_EndPos, nBAM_REF_Clipped_Reads, nBAM_ALT_Clipped_Reads, nBAM_Clipping_FET, nBAM_MQ0, nBAM_Other_Reads, nBAM_Poor_Reads, nBAM_REF_InDel_3bp, nBAM_REF_InDel_2bp, nBAM_REF_InDel_1bp, nBAM_ALT_InDel_3bp, nBAM_ALT_InDel_2bp, nBAM_ALT_InDel_1bp, SOR, - MaxHomopolymer_Length, SiteHomopolymer_Length, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, + MaxHomopolymer_Length, SiteHomopolymer_Length, score_varscan2, T_DP, tBAM_REF_MQ, tBAM_ALT_MQ, tBAM_Z_Ranksums_MQ, tBAM_REF_BQ, tBAM_ALT_BQ, tBAM_Z_Ranksums_BQ, tBAM_REF_NM, tBAM_ALT_NM, tBAM_NM_Diff, tBAM_REF_Concordant, tBAM_REF_Discordant, tBAM_ALT_Concordant, tBAM_ALT_Discordant, tBAM_Concordance_FET, T_REF_FOR, T_REF_REV, T_ALT_FOR, T_ALT_REV, tBAM_StrandBias_FET, @@ -364,7 +373,7 @@ def extend_features(candidates_vcf, "nBAM_Z_Ranksums_EndPos", "nBAM_REF_Clipped_Reads", "nBAM_ALT_Clipped_Reads", "nBAM_Clipping_FET", "nBAM_MQ0", "nBAM_Other_Reads", "nBAM_Poor_Reads", "nBAM_REF_InDel_3bp", "nBAM_REF_InDel_2bp", "nBAM_REF_InDel_1bp", "nBAM_ALT_InDel_3bp", "nBAM_ALT_InDel_2bp", "nBAM_ALT_InDel_1bp", "SOR", - "MaxHomopolymer_Length", "SiteHomopolymer_Length", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ", "tBAM_Z_Ranksums_MQ", + "MaxHomopolymer_Length", "SiteHomopolymer_Length", "VarScan2_Score", "T_DP", "tBAM_REF_MQ", "tBAM_ALT_MQ", "tBAM_Z_Ranksums_MQ", "tBAM_REF_BQ", "tBAM_ALT_BQ", "tBAM_Z_Ranksums_BQ", "tBAM_REF_NM", "tBAM_ALT_NM", "tBAM_NM_Diff", 
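
The VarScan2_Score added above phred-scales a one-sided Fisher exact p-value on the tumor/normal x alt/ref contingency table. A sketch using scipy directly (the patch itself goes through the repo's own fisher_exact_test and genome.p2phred helpers):

import math
from scipy.stats import fisher_exact

t_alt, n_alt, t_ref, n_ref = 20, 1, 30, 49
_, p = fisher_exact([[t_alt, n_alt], [t_ref, n_ref]], alternative='greater')
score_varscan2 = -10 * math.log10(p) if p > 0 else 100.0   # phred scale, capped
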
"tBAM_REF_Concordant", "tBAM_REF_Discordant", "tBAM_ALT_Concordant", "tBAM_ALT_Discordant", "tBAM_Concordance_FET", "T_REF_FOR", "T_REF_REV", "T_ALT_FOR", "T_ALT_REV", "tBAM_StrandBias_FET", diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index bf808e2..2991274 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1378,6 +1378,7 @@ def find_records(input_record): def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_header, custom_header, + zero_vscore, is_extend): logger = logging.getLogger(extract_ensemble.__name__) ensemble_data = [] @@ -1414,7 +1415,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea "InDel_Length"] callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS", - "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", + "Strelka_TQSS", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] if is_extend and custom_header: @@ -1527,6 +1528,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea Seq_Complexity_ = list(map(lambda x: x[0], filter( lambda x: x[1] in ["Seq_Complexity_Span", "Seq_Complexity_Adj"], enumerate(header)))) + max_varscan2_score = 0 if zero_vscore else 60 min_max_features = [[cov_features, 0, 2 * COV], [mq_features, 0, 70], [bq_features, 0, 41], @@ -1536,7 +1538,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea [stralka_scor, 0, 40], [stralka_qss, 0, 200], [stralka_tqss, 0, 4], - [varscan2_score, 0, 60], + [varscan2_score, 0, max_varscan2_score], [vardict_score, 0, 120], [m2_lod, 0, 100], [sniper_score, 0, 120], @@ -1579,7 +1581,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv, ensemble_bed, ensemble_custom_header, - no_seq_complexity, enforce_header, tsv_batch_size): + no_seq_complexity, enforce_header, + zero_vscore, + tsv_batch_size): logger = logging.getLogger(generate_dataset.__name__) logger.info("---------------------Generate Dataset----------------------") @@ -1609,6 +1613,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be extract_ensemble(ensemble_tsvs=[ensemble_tsv], ensemble_bed=ensemble_bed, no_seq_complexity=no_seq_complexity, enforce_header=enforce_header, custom_header=ensemble_custom_header, + zero_vscore=zero_vscore, is_extend=False) tmp_ = bedtools_intersect( @@ -1841,6 +1846,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be parser.add_argument('--enforce_header', help='Enforce header match for ensemble_tsv', action="store_true") + parser.add_argument('--zero_vscore', + help='set VarScan2_Score to zero', + action="store_true") args = parser.parse_args() logger.info(args) @@ -1863,13 +1871,15 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be tsv_batch_size = args.tsv_batch_size ensemble_custom_header = args.ensemble_custom_header enforce_header = args.enforce_header - + zero_vscore = zero_vscore try: generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col, 
min_cov, num_threads, ensemble_tsv, ensemble_bed, ensemble_custom_header, - no_seq_complexity, enforce_header, tsv_batch_size) + no_seq_complexity, enforce_header, + zero_vscore, + tsv_batch_size) except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 6b3900f..fbbe1ff 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -82,13 +82,16 @@ def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, regi matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, ensemble_custom_header, no_seq_complexity, - no_feature_recomp_for_ensemble, tsv_batch_size): + no_feature_recomp_for_ensemble, + zero_vscore, + tsv_batch_size): logger = logging.getLogger(generate_dataset_region.__name__) generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, None, ensemble_bed, ensemble_custom_header, no_seq_complexity, no_feature_recomp_for_ensemble, + zero_vscore, tsv_batch_size) @@ -251,6 +254,10 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, raise Exception( "The dbSNP file should be a tabix indexed file with .vcf.gz format. No {}.tbi file exists.".format(dbsnp)) + zero_vscore = False + if not ensemble_tsv and add_extra_features: + zero_vscore = True + ensemble_bed = None if ensemble_tsv: ensemble_bed = os.path.join(work, "ensemble.bed") @@ -259,7 +266,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, extract_ensemble(ensemble_tsvs=[ensemble_tsv], ensemble_bed=ensemble_bed, no_seq_complexity=no_seq_complexity, enforce_header=no_feature_recomp_for_ensemble, custom_header=ensemble_custom_header, + zero_vscore=zero_vscore, is_extend=False) + merge_d_for_short_read = 100 candidates_split_regions = [] ensemble_beds = [] @@ -390,6 +399,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, no_seq_complexity=no_seq_complexity, enforce_header=True, custom_header=ensemble_custom_header, + zero_vscore=zero_vscore, is_extend=True) if ensemble_tsv: merged_features_bed = os.path.join( @@ -433,7 +443,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, header_line = "" callers_features = ["if_MuTect", "if_VarScan2", "if_JointSNVMix2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", "if_LoFreq", "if_Scalpel", "if_Strelka", "if_TNscope", "Strelka_Score", "Strelka_QSS", - "Strelka_TQSS", "VarScan2_Score", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", + "Strelka_TQSS", "SNVMix2_Score", "Sniper_Score", "VarDict_Score", "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] with open(merged_features_bed, "w") as o_f, open(ensemble_beds[i]) as i_f_1, open(extra_features_bed) as i_f_2: ens_variants_info = {} @@ -568,7 +578,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, ensemble_bed_i, ensemble_custom_header, - no_seq_complexity, no_feature_recomp_for_ensemble, tsv_batch_size) + no_seq_complexity, no_feature_recomp_for_ensemble, + zero_vscore, + tsv_batch_size) shutil.rmtree(bed_tempdir) tempfile.tempdir = original_tempdir diff --git a/neusomatic/python/read_callers_vcf.py b/neusomatic/python/read_callers_vcf.py new file mode 100755 index 
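
zero_vscore above marks the case where extended features are computed without any ensemble TSV, so the VarScan2_Score column carries no real caller output; the two-statement form is equivalent to a single boolean expression, and downstream it selects the scaling range for the score:

zero_vscore = bool(not ensemble_tsv and add_extra_features)
max_varscan2_score = 0 if zero_vscore else 60   # as used in generate_dataset.py
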
0000000..997270e --- /dev/null +++ b/neusomatic/python/read_callers_vcf.py @@ -0,0 +1,477 @@ +#!/usr/bin/env python +#------------------------------------------------------------------------- +# read_callers_vcf.py +# read callers vcf files and generate ensemble tsv +#------------------------------------------------------------------------- +import argparse +import traceback +import logging +import re +import gzip + +import genomic_file_handlers as genome +from read_info_extractor import rescale +from utils import skip_empty, get_chromosomes_order + +import numpy as np + +# Normal/Tumor index in the Merged VCF file, or any other VCF file that +# puts NORMAL first. +idxN, idxT = 0, 1 +nan = float('nan') + + +def get_info_value(info_field, variable, ith_alt=None): + logger = logging.getLogger(get_info_value.__name__) + key_item = re.search( + r'\b{}=([^;\s]+)([;\W]|$)'.format(variable), info_field) + + # The key has a value attached to it, e.g., VAR=1,2,3 + if key_item: + if ith_alt is None: + return key_item.groups()[0] + else: + return key_item.groups()[0].split(",")[ith_alt] + + # Perhaps it's simply a flag without "=" + else: + key_item = info_field.split(';') + return True if variable in key_item else False + + +def get_sample_value(fields, samples, variable, idx=0): + + var2value = dict(zip(fields.split(':'), samples[idx].split(':'))) + try: + return var2value[variable] + except KeyError: + return None + + +def get_mutect2_info(filters, info, ith_alt): + + mutect_classification = 1 if (get_info_value(info, + 'SOMATIC') or 'PASS' in filters) else 0 + + # MuTect2 has some useful information: + nlod = get_info_value(info, 'NLOD', ith_alt) + nlod = float(nlod) if nlod else nan + + tlod = get_info_value(info, 'TLOD', ith_alt) + tlod = float(tlod) if tlod else nan + + tandem = 1 if get_info_value(info, 'STR') else 0 + + ecnt = get_info_value(info, 'ECNT') + if ecnt: + try: + ecnt = int(ecnt) + except ValueError: + ecnt = nan + else: + ecnt = nan + return mutect_classification, nlod, tlod, tandem, ecnt + + +def get_varscan2_info(info): + varscan_classification = 1 if get_info_value(info, + 'SOMATIC') else 0 + return varscan_classification + + +def get_somaticsniper_info(fields, samples, idxT): + somaticsniper_classification = 1 if get_sample_value(fields, samples, + 'SS', idxT) == '2' else 0 + if somaticsniper_classification == 1: + score_somaticsniper = get_sample_value(fields, samples, + 'SSC', idxT) + score_somaticsniper = int( + score_somaticsniper) if score_somaticsniper else nan + else: + score_somaticsniper = nan + + return somaticsniper_classification, score_somaticsniper + + +def get_vardict_info(filters, info, fields, samples): + + if (filters == 'PASS') and ('Somatic' in info): + vardict_classification = 1 + elif 'Somatic' in info: + vardict_filters = filters.split(';') + + disqualifying_filters = \ + ('d7' in vardict_filters or 'd5' in vardict_filters) or \ + ('DIFF0.2' in vardict_filters) or \ + ('LongAT' in vardict_filters) or \ + ('MAF0.05' in vardict_filters) or \ + ('MSI6' in vardict_filters) or \ + ('NM4' in vardict_filters or 'NM4.25' in vardict_filters) or \ + ('pSTD' in vardict_filters) or \ + ('SN1.5' in vardict_filters) or \ + ( 'P0.05' in vardict_filters and float(get_info_value(info, 'SSF') ) >= 0.15 ) or \ + (('v3' in vardict_filters or 'v4' in vardict_filters) + and int(get_sample_value(fields, samples, 'VD', 1)) < 3) + + no_bad_filter = not disqualifying_filters + filter_fail_times = len(vardict_filters) + + if no_bad_filter and filter_fail_times <= 2: + 
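# Usage sketch for get_info_value() above, traced through its regex/flag
# logic (INFO strings are illustrative, not from a real VCF):
#     get_info_value("NLOD=3.50;STR;ECNT=2", "NLOD")         -> "3.50"
#     get_info_value("NLOD=3.50;STR;ECNT=2", "STR")          -> True   (bare flag)
#     get_info_value("TLOD=4.1,7.9", "TLOD", ith_alt=1)      -> "7.9"  (per-ALT value)
#     get_info_value("NLOD=3.50", "TLOD")                    -> False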
vardict_classification = 0.5 + else: + vardict_classification = 0 + + else: + vardict_classification = 0 + + # Somatic Score: + score_vardict = get_info_value(info, 'SSF') + if score_vardict: + score_vardict = float(score_vardict) + score_vardict = genome.p2phred(score_vardict, max_phred=100) + score_vardict = rescale(score_vardict, 'phred', None, 1001) + else: + score_vardict = nan + + # MSI, MSILEN, and SHIFT3: + msi = get_info_value(info, 'MSI') + if msi: + msi = float(msi) + else: + msi = nan + msilen = get_info_value(info, 'MSILEN') + if msilen: + msilen = float(msilen) + else: + msilen = nan + shift3 = get_info_value(info, 'SHIFT3') + if shift3: + shift3 = float(shift3) + else: + shift3 = nan + + return vardict_classification, msi, msilen, shift3, score_vardict + + +def get_muse_info(filters): + if filters == 'PASS': + muse_classification = 1 + elif filters == 'Tier1': + muse_classification = 0.9 + elif filters == 'Tier2': + muse_classification = 0.8 + elif filters == 'Tier3': + muse_classification = 0.7 + elif filters == 'Tier4': + muse_classification = 0.6 + elif filters == 'Tier5': + muse_classification = 0.5 + else: + muse_classification = 0 + return muse_classification + + +def get_strelka2_info(filters, info): + strelka_classification = 1 if 'PASS' in filters else 0 + somatic_evs = get_info_value(info, 'SomaticEVS') + qss = get_info_value(info, 'QSS') + tqss = get_info_value(info, 'TQSS') + return strelka_classification, somatic_evs, qss, tqss + + +def open_textfile(file_name): + + # See if the input file is a .gz file: + if file_name.lower().endswith('.gz'): + return gzip.open(file_name, 'rt') + + else: + return open(file_name) + + +def read_callers_vcf(reference, + output_tsv, + mutect2_vcfs, + strelka2_vcfs, + varscan2_vcfs, + muse_vcfs, + vardict_vcfs, + somaticsniper_vcfs, + min_caller): + + logger = logging.getLogger(read_callers_vcf.__name__) + + logger.info( + "----------------------Read Callers VCF------------------------") + + mutect2_info = {} + if mutect2_vcfs: + for mutect2_vcf in mutect2_vcfs: + i_f = open_textfile(mutect2_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info = x[0:8] + for ith_alt, alt in enumerate(alts.split(",")): + if ref != alt: + mutect_classification, nlod, tlod, tandem, ecnt = get_mutect2_info( + filters, info, ith_alt) + var_id = "-".join([chrom, pos, ref, alt]) + mutect2_info[var_id] = [ + mutect_classification, nlod, tlod, tandem, ecnt] + i_f.close() + strelka2_info = {} + if strelka2_vcfs: + for strelka2_vcf in strelka2_vcfs: + i_f = open_textfile(strelka2_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info = x[0:8] + strelka_classification, somatic_evs, qss, tqss = get_strelka2_info( + filters, info) + for alt in alts.split(","): + if ref != alt: + var_id = "-".join([chrom, pos, ref, alt]) + strelka2_info[var_id] = [ + strelka_classification, somatic_evs, qss, tqss] + i_f.close() + vardict_info = {} + if vardict_vcfs: + for vardict_vcf in vardict_vcfs: + i_f = open_textfile(vardict_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info, fields = x[0:9] + samples = x[9:] + + # In the REF/ALT field, non-GCTA characters should be + # changed to N to fit the VCF standard: + ref = re.sub(r'[^GCTA]', 'N', ref, flags=re.I) + alts = re.sub(r'[^GCTA]', 'N', alts, flags=re.I) + + vardict_classification, msi, msilen, shift3, score_vardict = get_vardict_info( + filters, info, fields, 
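# Score-path sketch for score_vardict above (assuming the conventional Phred
# transform genome.p2phred(p) = -10 * log10(p), capped at max_phred): a
# VarDict SSF (somatic p-value) of 0.01 becomes Phred 20, and 1e-12 would cap
# at 100 before rescale() maps it onto the 'phred' output scale.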
samples) + for alt in alts.split(","): + if ref != alt: + if 'TYPE=SNV' in info or 'TYPE=Deletion' in info or 'TYPE=Insertion' in info: + var_id = "-".join([chrom, pos, ref, alt]) + vardict_info[var_id] = [ + vardict_classification, msi, msilen, shift3, score_vardict] + elif 'TYPE=Complex' in info and (len(ref) == len(alt)): + for i, (ref_i, alt_i) in enumerate(zip(ref, alt)): + if ref_i != alt_i: + var_id = "-".join([chrom, + str(int(pos) + i), ref_i, alt_i]) + vardict_info[var_id] = [ + vardict_classification, msi, msilen, shift3, score_vardict] + i_f.close() + varscan2_info = {} + if varscan2_vcfs: + for varscan2_vcf in varscan2_vcfs: + i_f = open_textfile(varscan2_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info = x[0:8] + varscan_classification = get_varscan2_info(info) + + # Replace the wrong "G/A" with the correct "G,A" in ALT + # column: + alts = alts.replace('/', ',') + + # multiple sequences in the REF, as is the case in + # VarScan2's indel output: + ref = re.sub(r'[^\w].*$', '', ref) + + # Get rid of non-compliant characters in the ALT column: + alts = re.sub(r'[^\w,.]', '', alts) + + # Eliminate duplicate entries in ALT: + alts = re.sub(r'(\w+),\1', r'\1', alts) + + # VarScan2 may output a line with the REF allele as "M" + if re.search(r'[^GCTAU]', ref, re.I): + continue + + for alt in alts.split(","): + if ref != alt: + var_id = "-".join([chrom, pos, ref, alt]) + varscan2_info[var_id] = varscan_classification + i_f.close() + + muse_info = {} + if muse_vcfs: + for muse_vcf in muse_vcfs: + i_f = open_textfile(muse_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info = x[0:8] + muse_classification = get_muse_info(filters) + for alt in alts.split(","): + if ref != alt: + var_id = "-".join([chrom, pos, ref, alt]) + muse_info[var_id] = muse_classification + i_f.close() + + somaticsniper_info = {} + if somaticsniper_vcfs: + for somaticsniper_vcf in somaticsniper_vcfs: + i_f = open_textfile(somaticsniper_vcf) + for line in skip_empty(i_f): + x = line.strip().split() + chrom, pos, _, ref, alts, _, filters, info, fields = x[0:9] + samples = x[9:] + ref = re.sub(r'[^GCTA]', 'N', ref, flags=re.I) + somaticsniper_classification, score_somaticsniper = get_somaticsniper_info( + fields, samples, idxT) + for alt in alts.split(","): + if ref != alt: + var_id = "-".join([chrom, pos, ref, alt]) + somaticsniper_info[var_id] = [ + somaticsniper_classification, score_somaticsniper] + i_f.close() + + features = {} + for var_id in (set(mutect2_info.keys()) | set(strelka2_info.keys()) | set(vardict_info.keys()) | + set(varscan2_info.keys()) | set(somaticsniper_info.keys()) | set(muse_info.keys())): + num_callers = 0 + if var_id in mutect2_info: + mutect_classification, nlod, tlod, tandem, ecnt = mutect2_info[ + var_id] + num_callers += mutect_classification + else: + mutect_classification = 0 + nlod = tlod = tandem = ecnt = nan + + if var_id in strelka2_info: + strelka_classification, somatic_evs, qss, tqss = strelka2_info[ + var_id] + num_callers += strelka_classification + else: + strelka_classification = 0 + somatic_evs = qss = tqss = nan + + if var_id in vardict_info: + vardict_classification, msi, msilen, shift3, score_vardict = vardict_info[ + var_id] + num_callers += vardict_classification + else: + vardict_classification = 0 + msi = msilen = shift3 = score_vardict = nan + + if var_id in varscan2_info: + varscan_classification = varscan2_info[var_id] + num_callers += varscan_classification +
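# Cleanup examples for the VarScan2 REF/ALT normalization above (each line
# traced through the corresponding replace()/re.sub(); inputs illustrative):
#     "G/A".replace('/', ',')              -> "G,A"
#     re.sub(r'(\w+),\1', r'\1', "GT,GT")  -> "GT"   (duplicate ALT removed)
#     re.sub(r'[^\w].*$', '', "TAA/T")     -> "TAA"  (keep first REF token)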
else: + varscan_classification = 0 + + if var_id in muse_info: + muse_classification = muse_info[var_id] + num_callers += muse_classification + else: + muse_classification = 0 + + if var_id in somaticsniper_info: + somaticsniper_classification, score_somaticsniper = somaticsniper_info[ + var_id] + num_callers += somaticsniper_classification + else: + somaticsniper_classification = 0 + score_somaticsniper = nan + + if num_callers >= min_caller: + features[var_id] = [mutect_classification, nlod, tlod, tandem, ecnt, + strelka_classification, somatic_evs, qss, tqss, + vardict_classification, msi, msilen, shift3, score_vardict, + varscan_classification, + muse_classification, + somaticsniper_classification, score_somaticsniper] + + chrom_order = get_chromosomes_order(reference) + ordered_vars = sorted(features.keys(), key=lambda x: [ + chrom_order["-".join(x.split("-")[:-3])], int(x.split("-")[1])]) + n_variants = len(ordered_vars) + logger.info("Number of variants: {}".format(n_variants)) + header = ["CHROM", "POS", "ID", "REF", "ALT", "if_MuTect", "if_VarScan2", "if_SomaticSniper", "if_VarDict", "MuSE_Tier", + "if_Strelka", "Strelka_Score", "Strelka_QSS", + "Strelka_TQSS", "Sniper_Score", "VarDict_Score", + "M2_NLOD", "M2_TLOD", "M2_STR", "M2_ECNT", "MSI", "MSILEN", "SHIFT3"] + + with open(output_tsv, "w") as o_f: + o_f.write("\t".join(header) + "\n") + for var_id in ordered_vars: + mutect_classification, nlod, tlod, tandem, ecnt, \ + strelka_classification, somatic_evs, qss, tqss, \ + vardict_classification, msi, msilen, shift3, score_vardict, \ + varscan_classification, \ + muse_classification, \ + somaticsniper_classification, score_somaticsniper = features[ + var_id] + + f = [mutect_classification, varscan_classification, somaticsniper_classification, + vardict_classification, muse_classification, strelka_classification, + somatic_evs, qss, tqss, + score_somaticsniper, score_vardict, + nlod, tlod, tandem, ecnt, + msi, msilen, shift3] + chrom = "-".join(var_id.split("-")[:-3]) + pos, ref, alt = var_id.split("-")[-3:] + o_f.write( + "\t".join([chrom, pos, ".", ref, alt] + list(map(lambda x: str(x).replace("nan", "0"), f))) + "\n") + + logger.info("Done Reading Callers' Features.") + return output_tsv + + +if __name__ == '__main__': + FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' + logging.basicConfig(level=logging.INFO, format=FORMAT) + logger = logging.getLogger(__name__) + + parser = argparse.ArgumentParser( + description='read callers VCF files and generate an ensemble TSV') + parser.add_argument('--reference', type=str, help='reference fasta filename', + required=True) + parser.add_argument('--output_tsv', type=str, help='output features tsv', + required=True) + parser.add_argument('--mutect2_vcfs', type=str, nargs="*", + help='MuTect2 VCFs', + default=None) + parser.add_argument('--strelka2_vcfs', type=str, nargs="*", + help='Strelka2 VCFs', + default=None) + parser.add_argument('--varscan2_vcfs', type=str, nargs="*", + help='VarScan2 VCFs', + default=None) + parser.add_argument('--muse_vcfs', type=str, nargs="*", + help='MuSE VCFs', + default=None) + parser.add_argument('--vardict_vcfs', type=str, nargs="*", + help='VarDict VCFs', + default=None) + parser.add_argument('--somaticsniper_vcfs', type=str, nargs="*", + help='SomaticSniper VCFs', + default=None) + parser.add_argument('--min_caller', type=float, + help='Minimum number of callers that must support a call (summed caller classifications)', + default=0.5) + args = parser.parse_args() + logger.info(args) + + try: + output = read_callers_vcf(args.reference, +
args.output_tsv, + args.mutect2_vcfs, + args.strelka2_vcfs, + args.varscan2_vcfs, + args.muse_vcfs, + args.vardict_vcfs, + args.somaticsniper_vcfs, + args.min_caller, + ) + if output is None: + raise Exception("read_callers_vcf failed!") + except Exception as e: + logger.error(traceback.format_exc()) + logger.error("Aborting!") + logger.error( + "read_callers_vcf.py failure on arguments: {}".format(args)) + raise e diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 265d0f8..6aafc1e 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -18,8 +18,16 @@ nan = float('nan') -def fisher_exact_test(mat): - return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).two_tail +def fisher_exact_test(mat, alternative="two-sided"): + if alternative == "two-sided": + return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).two_tail + elif alternative == "greater": + return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).right_tail + elif alternative == "less": + return fisher.pvalue(mat[0][0], mat[0][1], mat[1][0], mat[1][1]).left_tail + else: + logger.error("Wrong fisher_test alternative: {}".format(alternative)) + raise Exception def get_read_pos_for_ref_pos(read, ref_pos_s): From 635341ee55b63cf02dff34659184a66bad6d2569 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 18 Jun 2020 16:09:25 -0700 Subject: [PATCH 53/89] merge regions for scanning --- neusomatic/python/preprocess.py | 11 ++++++++++- neusomatic/python/scan_alignments.py | 24 ++++++++++++++++++------ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index fbbe1ff..ae1586c 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -31,11 +31,12 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, + merge_d_for_scan, scan_alignments_binary, restart, num_splits, num_threads, calc_qual, regions=[]): logger = logging.getLogger(process_split_region.__name__) logger.info("Scan bam.") - scan_outputs = scan_alignments(work, scan_alignments_binary, alignment_bam, + scan_outputs = scan_alignments(work, merge_d_for_scan, scan_alignments_binary, alignment_bam, region, reference, num_splits, num_threads, scan_window_size, scan_maf, min_mapq, max_dp, filter_duplicate, restart=restart, split_region_files=regions, calc_qual=calc_qual) @@ -208,6 +209,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, no_feature_recomp_for_ensemble, window_extend, max_cluster_size, + merge_d_for_scan, num_splits, num_threads, scan_alignments_binary,): @@ -288,6 +290,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, snp_min_af, -10000, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, + merge_d_for_scan, scan_alignments_binary, restart, num_splits, num_threads, calc_qual=False) tumor_counts_without_q, split_regions, filtered_candidates_vcfs_without_q = tumor_outputs_without_q @@ -313,6 +316,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, + merge_d_for_scan, scan_alignments_binary, restart, num_splits, num_threads, calc_qual=True, 
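# Usage sketch for the fisher_exact_test() change to sequencing_features.py
# above: `alternative` selects which tail of the same 2x2 Fisher exact test
# is returned (counts illustrative):
#     mat = [[8, 2], [1, 9]]
#     fisher_exact_test(mat)                         # two-sided p-value
#     fisher_exact_test(mat, alternative="greater")  # right tail
#     fisher_exact_test(mat, alternative="less")     # left tail
# Note the error branch assumes a module-level `logger` exists in that file.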
regions=candidates_split_regions) @@ -340,6 +344,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r, + merge_d_for_scan, scan_alignments_binary, restart, num_splits, num_threads, calc_qual=True, regions=candidates_split_regions) @@ -681,6 +686,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--max_cluster_size', type=int, help='max cluster size for extending input features (should be in the order of readlength)', default=300) + parser.add_argument('--merge_d_for_scan', type=int, + help='-d used to merge regions before scan', + default=None) parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, @@ -706,6 +714,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.no_feature_recomp_for_ensemble, args.window_extend, args.max_cluster_size, + args.merge_d_for_scan, args.num_splits, args.num_threads, args.scan_alignments_binary) diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index 5b703a0..b8aaf11 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -24,7 +24,7 @@ def run_scan_alignments(record): - work, reference, scan_alignments_binary, split_region_file, \ + work, reference, merge_d_for_scan, scan_alignments_binary, split_region_file, \ input_bam, window_size, maf, min_mapq, max_dp, filter_duplicate, calc_qual = record if filter_duplicate: @@ -39,10 +39,19 @@ def run_scan_alignments(record): raise IOError("File not found: {}".format(scan_alignments_binary)) if not os.path.exists(work): os.mkdir(work) - if os.path.getsize(split_region_file) > 0: + + if merge_d_for_scan is not None: + split_region_file_=os.path.join(work,"merged_region.bed") + tmp_ = bedtools_sort(split_region_file, run_logger=thread_logger) + bedtools_merge( + tmp_, output_fn=split_region_file_ , args=" -d {}".format(merge_d_for_scan), run_logger=thread_logger) + else: + split_region_file_=split_region_file + + if os.path.getsize(split_region_file_) > 0: cmd = "{} --ref {} -b {} -L {} --out_vcf_file {}/candidates.vcf --out_count_file {}/count.bed \ --window_size {} --min_af {} --min_mapq {} --max_depth {} {}".format( - scan_alignments_binary, reference, input_bam, split_region_file, + scan_alignments_binary, reference, input_bam, split_region_file_, work, work, window_size, maf, min_mapq, max_dp * window_size / 100.0, filter_duplicate_str) if calc_qual: cmd += " --calculate_qual_stat" @@ -69,7 +78,7 @@ def run_scan_alignments(record): return None -def scan_alignments(work, scan_alignments_binary, input_bam, +def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam, regions_bed_file, reference, num_splits, num_threads, window_size, maf, min_mapq, max_dp, filter_duplicate, restart=True, split_region_files=[], calc_qual=True): @@ -137,7 +146,7 @@ def scan_alignments(work, scan_alignments_binary, input_bam, if os.path.exists(work_): shutil.rmtree(work_) map_args.append((os.path.join(work, "work.{}".format(i)), - reference, scan_alignments_binary, split_region_file, + reference, merge_d_for_scan, scan_alignments_binary, split_region_file, input_bam, window_size, maf, min_mapq, max_dp, filter_duplicate, calc_qual)) not_done.append(i) else: @@ -192,6 +201,9 @@ def scan_alignments(work, 
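# Sketch of the merge_d_for_scan pre-merge above, assuming standard
# `bedtools merge -d N` semantics (intervals separated by at most N bp are
# joined). With -d 100:
#     chr1  100  200    ->  chr1  100  450   (gap of 50 <= 100: merged with
#     chr1  250  450                          the next interval)
#     chr1  600  700    ->  chr1  600  700   (gap of 150 > 100: kept separate)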
scan_alignments_binary, input_bam, parser.add_argument('--filter_duplicate', help='filter duplicate reads when preparing pileup information', action="store_true") + parser.add_argument('--merge_d_for_scan', type=int, + help='-d used to merge regions before scan', + default=None) parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, @@ -200,7 +212,7 @@ def scan_alignments(work, scan_alignments_binary, input_bam, logger.info(args) try: - outputs = scan_alignments(args.work, args.scan_alignments_binary, args.input_bam, + outputs = scan_alignments(args.work, args.merge_d_for_scan, args.scan_alignments_binary, args.input_bam, args.regions_bed_file, args.reference, args.num_splits, args.num_threads, args.window_size, args.maf, args.min_mapq, args.max_dp, args.filter_duplicate) From a52d0c21029d03828d9c3ae3c0eea6a72f719471 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 18 Jun 2020 17:02:12 -0700 Subject: [PATCH 54/89] bug fixes for call/post --- neusomatic/python/call.py | 98 ++-- .../python/extract_postprocess_targets.py | 36 +- neusomatic/python/postprocess.py | 6 +- neusomatic/python/resolve_variants.py | 437 ++++++++++++++---- 4 files changed, 435 insertions(+), 142 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index d814a1b..6b5a8b9 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -146,6 +146,7 @@ def pred_vcf_records_path(record): chrom, pos, ref, alt, _, center, _, _, _ = path.split( ".") + ref, alt = ref.upper(), alt.upper() center = int(center) pos = int(pos) @@ -212,61 +213,82 @@ def pred_vcf_records_path(record): for i in nzref_pos: col_2_pos[i] = cnt cnt += 1 + if vartype_candidate == "INS" and anchor[1] == 0 and 0 not in col_2_pos: + col_2_pos[0] = -1 + nzref_pos = np.array([0] + list(nzref_pos)) if anchor[1] not in col_2_pos: - # print "NNN",path,pred - return vcf_record + if I[0, anchor[1], 0] > 0 and vartype_candidate == "INS" and type_pred == "INS": + ins_no_zref_pos = True + else: + # thread_logger.info(["NNN", path, pred]) + return vcf_record + if not ins_no_zref_pos: + b = (anchor[0] - col_2_pos[anchor[1]]) + for i in nzref_pos: + col_2_pos[i] += b + pos_2_col = {v: k for k, v in col_2_pos.items()} - b = (anchor[0] - col_2_pos[anchor[1]]) - for i in nzref_pos: - col_2_pos[i] += b - pos_2_col = {v: k for k, v in col_2_pos.items()} + if type_pred == "SNP" and len(ref) - len(alt) > 1 and abs(center_pred - center) < center_dist_roundback: + thread_logger.info(["TBC", path, nzref_pos]) if abs(center_pred - center) < too_far_center: if type_pred == "SNP": - pos_ = col_2_pos[center_] - ref_ = "" - alt_ = "" - for i in range(len_pred): - nzp = nzref_pos[nzref_pos >= (center_ + i)] - if len(nzp) > 0: - center__ = nzp[np.argmin(abs(nzp - (center_ + i)))] - rb = np.argmax(I[1:, center__, 0]) - ref_ += ACGT[rb] - II = I.copy() - II[rb + 1, center__, 1] = 0 - alt_ += ACGT[np.argmax(II[1:, center__, 1])] - if sum(I[1:, center__, 1]) == 0: - break - if not ref_: - # print "SSS",path,pred - return vcf_record + if abs(center_pred - center) < center_dist_roundback and len_pred == 1 and len(ref) == 1 and len(alt) == 1: + pos_, ref_, alt_ = pos, ref.upper(), alt.upper() + else: + pos_ = col_2_pos[center_] + ref_ = "" + alt_ = "" + for i in range(len_pred): + nzp = nzref_pos[nzref_pos >= (center_ + i)] + if len(nzp) > 0: + center__ = nzp[np.argmin(abs(nzp - (center_ + i)))] + rb = np.argmax(I[1:, center__, 0]) + ref_ += ACGT[rb] + II = I.copy() + 
II[rb + 1, center__, 1] = 0 + if max(II[1:, center__, 1]) == 0 and center__ == center and ref == ref_ and len(alt) == 1: + alt_ = alt + else: + alt_ += ACGT[np.argmax(II[1:, center__, 1])] + if sum(I[1:, center__, 1]) == 0: + break + if not ref_: + # thread_logger.info(["SSS", path, pred]) + return vcf_record elif type_pred == "INS": if ins_no_zref_pos: pos_, ref_, alt_ = pos, ref.upper(), alt.upper() else: - pos_ = -1 + pos_ = -2 i_ = center_ - 1 - for i_ in range(center_ - 1, 0, -1): + for i_ in range(center_ - 1, -2, -1): if i_ in nzref_pos: pos_ = col_2_pos[i_] break - if pos_ == -1: + if pos_ == -2: # print "PPP-1",path,pred return vcf_record - if (sum(I[1:, i_, 1]) == 0): - # path,pred,i_,nzref_pos,col_2_pos,I[1:,i_,1],true_path[path] - return vcf_record - ref_ = ACGT[np.argmax(I[1:, i_, 0])] - alt_ = ref_ + len_pred_=len_pred if len_pred == 3: len_pred = max(len(alt) - len(ref), len_pred) - for i in range(i_ + 1, Iw): - if i in zref_pos: - alt_ += ACGT[np.argmax(I[1:, i, 1])] - else: - break - if (len(alt_) - len(ref_)) >= len_pred: - break + if (sum(I[1:, i_, 1]) == 0): + # thread_logger.info(["PPP-2", path, pred]) + return vcf_record + if len_pred == len(alt) - len(ref) and pos_ == pos: + pos_, ref_, alt_ = pos, ref.upper(), alt.upper() + else: + ref_ = ACGT[np.argmax(I[1:, i_, 0])] + alt_ = ref_ + for i in range(i_ + 1, Iw): + if i in zref_pos: + alt_ += ACGT[np.argmax(I[1:, i, 1])] + else: + break + if (len(alt_) - len(ref_)) >= len_pred: + break + if len_pred_ == 3 and (len(alt_) - len(ref_)) < len_pred and pos_ == pos: + pos_, ref_, alt_ = pos, ref.upper(), alt.upper() elif type_pred == "DEL": pos_ = col_2_pos[center_] - 1 if pos_ not in pos_2_col: diff --git a/neusomatic/python/extract_postprocess_targets.py b/neusomatic/python/extract_postprocess_targets.py index c3dee50..be0089e 100755 --- a/neusomatic/python/extract_postprocess_targets.py +++ b/neusomatic/python/extract_postprocess_targets.py @@ -7,16 +7,20 @@ import argparse import traceback import logging +import pysam from utils import skip_empty from defaults import VCF_HEADER +from resolve_variants import push_left_var -def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): +def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): logger = logging.getLogger(extract_postprocess_targets.__name__) logger.info("--------------Extract Postprocessing Targets---------------") + ref_fasta = pysam.FastaFile(reference) + base_name = ".".join(input_vcf.split(".")[:-1]) out_vcf = "{}.no_resolve.vcf".format(base_name) redo_vcf = "{}.resolve_target.vcf".format(base_name) @@ -37,10 +41,20 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): if not record_set: record_set.append(record) continue - if len(list(filter(lambda x: (chrom == x[0] and (abs(min(x[1] + len(x[2]), pos + len(ref)) - max(x[1], pos)) <= max_dist)), record_set))) > 0: + chrom_, pos_, ref_, alt_ = push_left_var( + ref_fasta, chrom, pos, ref, alt) + if len(list(filter(lambda x: (chrom == x[0] and + (min(abs(x[1] + len(x[2]) - (pos + len(ref))), + abs(x[1] - pos), + abs(min(x[1] + len(x[2]), pos + len(ref)) - max(x[1], pos))) <= max_dist)), record_set))) > 0 or len( + list(filter(lambda x: (chrom_ == x[0] and + (min(abs(x[1] + len(x[2]) - (pos_ + len(ref_))), + abs(x[1] - pos_), + abs(min(x[1] + len(x[2]), pos_ + len(ref_)) - max(x[1], pos_))) <= max_dist)), record_set))) > 0: record_set.append(record) continue + if record_set: record_sets.append(record_set) record_set = [record] @@ -48,7 +62,18 @@ def 
extract_postprocess_targets(input_vcf, min_len, max_dist, pad): for ii, record_set in enumerate(record_sets): if len(record_set) > 1: - if list(filter(lambda x: len(x[2]) != len(x[3]), record_set)): + varid_pos = {} + for chrom, pos, ref, alt, _, _ in record_set: + if pos not in varid_pos: + varid_pos[pos] = set([]) + vid = "-".join([ref, alt]) + varid_pos[pos].add(vid) + multi_allelic = False + for vid in varid_pos: + if len(varid_pos[vid]) > 1: + multi_allelic = True + + if list(filter(lambda x: len(x[2]) != len(x[3]), record_set)) or multi_allelic: for x in record_set: fields = x[-1].strip().split() fields[2] = str(ii) @@ -60,6 +85,7 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): else: for x in record_set: o_f.write(x[-1]) + elif record_set: if abs(len(record_set[0][2]) - len(record_set[0][3])) >= min_len: fields = record_set[0][-1].strip().split() @@ -80,6 +106,8 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): parser = argparse.ArgumentParser( description='infer genotype by ao and ro counts') + parser.add_argument('--reference', type=str, + help='reference fasta filename', required=True) parser.add_argument('--input_vcf', type=str, help='input vcf', required=True) parser.add_argument('--min_len', type=int, @@ -92,7 +120,7 @@ def extract_postprocess_targets(input_vcf, min_len, max_dist, pad): logger.info(args) try: extract_postprocess_targets( - args.input_vcf, args.min_len, args.max_dist, args.pad) + args.reference, args.input_vcf, args.min_len, args.max_dist, args.pad) except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index e549a33..53a7dd8 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -110,13 +110,15 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, dp, ro, ao = list(map(int, info[1:4])) af = float(info[4]) is_same = x[1] == x[11] and x[3] == x[13] and x[4] == x[14] + is_same = 0 if is_same else 1 is_same_type = np.sign( len(x[3]) - len(x[13])) == np.sign(len(x[4]) - len(x[14])) + is_same_type = 0 if is_same_type else 1 dist = abs(int(x[1]) - int(x[11])) len_diff = abs( (len(x[3]) - len(x[13])) - (len(x[4]) - len(x[14]))) tags_info[tag].append( - [~is_same, ~is_same_type, dist, len_diff, s_e, dp, ro, ao, af]) + [is_same, is_same_type, dist, len_diff, s_e, dp, ro, ao, af]) fina_info_tag = {} for tag, hits in tags_info.items(): hits = sorted(hits, key=lambda x: x[0:5]) @@ -206,7 +208,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense logger.info("Extract targets") postprocess_pad = 1 if not long_read else 10 extract_postprocess_targets( - candidates_preds, min_len, postprocess_max_dist, postprocess_pad) + reference, candidates_preds, min_len, postprocess_max_dist, postprocess_pad) no_resolve = os.path.join(work, "candidates_preds.no_resolve.vcf") target_vcf = os.path.join(work, "candidates_preds.resolve_target.vcf") diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index c2dbf09..a73325c 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -33,16 +33,25 @@ NUC_to_NUM = {"A": 1, "C": 2, "G": 3, "T": 4, "-": 0, "N": 5} NUM_to_NUC = {1: "A", 2: "C", 3: "G", 4: "T", 0: "-", 5: "N"} +max_indel = 100 + def extract_del(record): logger = logging.getLogger(extract_del.__name__) dels = [] pos = record.pos - for C, L in record.cigartuples: + 
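# Editorial sketch of the extract_del()/extract_ins() change starting here:
# indels adjacent to a leading/trailing soft or hard clip are now skipped and
# indel length is capped at max_indel = 100 bp. For example, with 0-based
# pos = record.pos:
#     CIGAR 5S10M3D20M -> one deletion [reference_name, pos + 10, pos + 13]
#     CIGAR 5S3D30M    -> no deletion (the D directly follows the soft clip)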
cigartuples = record.cigartuples + first_sc = 1 if cigartuples[0][0] in [ + CIGAR_SOFTCLIP, CIGAR_HARDCLIP] else 0 + last_sc = 1 if cigartuples[-1][0] in [CIGAR_SOFTCLIP, + CIGAR_HARDCLIP] else 0 + for i, (C, L) in enumerate(cigartuples): if C in [CIGAR_SOFTCLIP, CIGAR_HARDCLIP, CIGAR_INS]: continue if C == CIGAR_DEL: - dels.append([record.reference_name, pos, pos + L]) + if i > first_sc and i < len(cigartuples) - 1 - last_sc: + L_ = min(L, max_indel) + dels.append([record.reference_name, pos, pos + L_]) pos += L return dels @@ -53,15 +62,25 @@ def extract_ins(record): inss = [] pos = record.pos seq_pos = 0 - for C, L in record.cigartuples: + cigartuples = record.cigartuples + first_sc = 1 if cigartuples[0][0] in [ + CIGAR_SOFTCLIP, CIGAR_HARDCLIP] else 0 + last_sc = 1 if cigartuples[-1][0] in [CIGAR_SOFTCLIP, + CIGAR_HARDCLIP] else 0 + for i, (C, L) in enumerate(cigartuples): if C == CIGAR_SOFTCLIP: seq_pos += L continue elif C == CIGAR_HARDCLIP: continue if C == CIGAR_INS: - inss.append([record.reference_name, pos, pos + 1, - record.seq[seq_pos:seq_pos + L]]) + if not record.seq[seq_pos:seq_pos + L]: + logger.info([str(record).split("\t"), seq_pos, + L, len(record.seq), len(record.seq)]) + if i > first_sc and i < len(cigartuples) - 1 - last_sc: + L_ = min(L, max_indel) + inss.append([record.reference_name, pos, pos + 1, + record.seq[seq_pos:seq_pos + L_]]) seq_pos += L else: if C != CIGAR_DEL: @@ -69,93 +88,304 @@ def extract_ins(record): pos += L return inss +def push_left_var(ref_fasta, chrom, pos, ref, alt): + logger = logging.getLogger(push_left_var.__name__) + pos = int(pos) + while ref[-1] == alt[-1] and pos > 1: + prev_base = ref_fasta.fetch(chrom, pos - 2, pos - 1) + pos -= 1 + ref = prev_base + ref[:-1] + alt = prev_base + alt[:-1] + while ref[0] == alt[0] and len(ref) == len(alt) and len(ref) > 1: + pos += 1 + ref = ref[1:] + alt = alt[1:] + return [chrom, pos, ref, alt] + + +class Variant: + + def __init__(self, chrom, pos, ref, alt, gt, score, cnt, vtype): + self.chrom = chrom + self.pos = int(pos) + self.ref = ref + self.alt = alt + self.gt = gt + self.score = float(score) + self.cnt = float(cnt) if cnt is not None else None + self.vtype = vtype + self.processed = False + + def push_left(self, ref_fasta): + _, self.pos, self.ref, self.alt = push_left_var( + ref_fasta, self.chrom, self.pos, self.ref, self.alt) + + def var_str(self): + return "-".join(map(str, [self.chrom, self.pos, self.ref, self.alt, self.vtype])) + + def var_pos_vt_str(self): + return "-".join(map(str, [self.chrom, self.pos, self.vtype])) + + def var_gt_str(self): + return "-".join(map(str, [self.chrom, self.pos, self.ref, self.alt, self.gt, self.vtype])) + + def __str__(self): + return "-".join(map(str, [self.chrom, self.pos, self.ref, self.alt, self.gt, + self.score, self.cnt, self.vtype])) + + +def resolve_group(ref_fasta, variants, vars_count): + logger = logging.getLogger(resolve_group.__name__) + chrom = variants[0].chrom + vars_count_ = {} + for var_str in vars_count: + pos, ref, alt, vtype = var_str.split("-")[-4:] + pos = int(pos) + v = Variant(chrom, pos, ref, alt, "0/0", + 0, vars_count[var_str], vtype) + v.push_left(ref_fasta) + s = v.var_str() + if s not in vars_count_: + vars_count_[s] = 0 + vars_count_[s] += vars_count[var_str] + vars_count = vars_count_ + + group_vars = {} + processed = [] + for v in variants: + if v.pos not in group_vars: + group_vars[v.pos] = [] + var_str = v.var_str() + if var_str not in vars_count: + vars_count[var_str] = 0 + cnt = vars_count[var_str] + v.cnt = cnt + 
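# Worked example for push_left_var() above, on a hypothetical reference
# spelling "GAAAC" at chr1:1-5 (1-based): the right-shifted deletion
# (chr1, 3, "AA", "A") walks left one base at a time while the trailing bases
# of REF and ALT match, ending at the left-aligned (chr1, 1, "GA", "G"). This
# normalization lets resolve_group() pool counts for the same indel reported
# at different offsets by different reads.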
processed.append(var_str) + group_vars[v.pos].append(v) + + for var_str in vars_count: + if var_str not in processed: + pos, ref, alt, vtype = var_str.split("-")[-4:] + pos = int(pos) + if pos not in group_vars: + group_vars[pos] = [] + v = Variant(chrom, pos, ref, alt, "0/0", + 0, vars_count[var_str], vtype) + group_vars[pos].append(v) + for pos in group_vars: + var_ = {} + for v in group_vars[pos]: + var_id = v.var_gt_str() + if var_id not in var_: + var_[var_id] = [] + var_[var_id].append(v) + group_vars[pos] = [] + for var_id in var_: + group_vars[pos].append(sorted(var_[var_id], key=lambda x: x.score, reverse=True + )[0]) + + out_variants_ = [] + max_target = [ + v.cnt for pos in group_vars for v in group_vars[pos] if v.score > 0] + if len(max_target) == 0: + # logger.info( + # "No non-zero COUNT with non-zero SCORE: {}".format(list(str(x) for x in group_vars[pos]))) + return [] + + max_count = max(max_target) + for pos in group_vars.keys(): + if max(map(lambda x: x.cnt, group_vars[pos]) + ) < 0.2 * max_count: + continue + mx = max(map(lambda x: x.cnt, group_vars[pos])) + gts = [x.gt for x in group_vars[pos]] + gts = set(gts) - set(["0/0"]) + if len(gts) == 0: + continue + if len(gts) > 1: + gts_count = {"0/1": 0, "0/0": 0} + gts_score = {"0/1": 0, "0/0": 0} + for x in group_vars[pos]: + if x.gt != "0/0" and x.cnt >= 0.4 * mx: + gts_count[x.gt] += x.cnt + gts_score[x.gt] += x.score + priority = {"0/1": 2, "0/0": 1} + sorted_gts = sorted(gts_count.keys(), key=lambda x: [ + gts_count[x], gts_score[x], + priority[x]], reverse=True) + gt = sorted_gts[0] + else: + gt = list(gts)[0] + + all_vars = sorted(group_vars[pos], key=lambda x: [ + x.cnt, x.score, x.gt != "0/0"], reverse=True) + + vtypes = set([x.vtype for x in group_vars[pos] + if x.gt != "0/0" and x.cnt >= 0.4 * mx]) + if not vtypes: + vtypes = set([x.vtype for x in group_vars[pos] + if x.gt != "0/0"]) + all_vars = list( + filter(lambda x: x.vtype in vtypes, all_vars)) + if not all_vars: + logger.info( + "No vars: {}".format(list(str(x) for x in group_vars[pos]))) + logger.info( + "No vars: {}".format([[list(str(x) for x in group_vars[pos_])]for pos_ in group_vars])) + raise Exception + score = max([v.score for v in all_vars]) + v = all_vars[0] + out_variants_.append( + [v.chrom, v.pos, v.ref, v.alt, gt, score, v.cnt]) + + + vars_gt = {} + for chrom_, pos_, ref_, alt_, gt_, score_, cnt_ in out_variants_: + if gt_ not in vars_gt: + vars_gt[gt_] = [] + vars_gt[gt_].append( + Variant(chrom_, pos_, ref_, alt_, gt_, score_, cnt_, "")) + vars_gt = {gt_: sorted(vars_gt[gt_], key=lambda x: [ + x.cnt, x.score], reverse=True) for gt_ in vars_gt} + out_variants_ = [] + for gt_ in vars_gt: + v0 = vars_gt[gt_][0] + good_vs = [v0] + for v in vars_gt[gt_][1:]: + keep=True + for g_v in good_vs: + if min(v.pos + len(v.ref), g_v.pos + len(g_v.ref)) > max(v.pos, g_v.pos): + keep=False + break + if keep: + good_vs.append(v) + for v in good_vs: + out_variants_.append( + [v.chrom, v.pos, v.ref, v.alt, v.gt, v.score]) + return out_variants_ + + def find_resolved_variants(input_record): chrom, start, end, variants, input_bam, filter_duplicate, reference = input_record thread_logger = logging.getLogger( "{} ({})".format(find_resolved_variants.__name__, multiprocessing.current_process().name)) try: - ref = pysam.FastaFile(reference) - out_variants = [] + ref_fasta = pysam.FastaFile(reference) + variants_ = [] + for x in variants: + pos = int(x[1]) + ref = x[3] + alt = x[4] + gt = x[9].split(":")[0] + score = x[5] + vtype = x[-1] + v = Variant(chrom, pos, 
ref, alt, gt, score, None, vtype) + v.push_left(ref_fasta) + variants_.append(v) + variants = variants_ start, end = list(map(int, [start, end])) region = [chrom, start, end] - vartypes = list(map(lambda x: x[-1], variants)) - scores = list(map(lambda x: x[5], variants)) - if len(set(vartypes)) > 1: - out_variants.extend( - list(map(lambda x: [x[0], int(x[1]), x[3], x[4], x[9].split(":")[0], x[5]], variants))) - else: - vartype = vartypes[0] - score = max(scores) - if vartype == "DEL": - dels = [] - with pysam.AlignmentFile(input_bam) as samfile: - for record in samfile.fetch(chrom, start, end): - if not record.is_duplicate or not filter_duplicate: - if record.cigarstring and "D" in record.cigarstring: - dels.extend(extract_del(record)) - dels = list(filter(lambda x: ( - start <= x[1] <= end) or start <= x[2] <= end, dels)) - if dels: - del_strs = list( - map(lambda x: "---".join(map(str, x[0:3])), dels)) - uniq_dels = list(set(del_strs)) - uniq_dels_count = {} - for del_ in uniq_dels: - uniq_dels_count[del_] = del_strs.count(del_) - max_count = max(uniq_dels_count.values()) - for del_ in uniq_dels: - if uniq_dels_count[del_] <= max_count * 0.5: - del uniq_dels_count[del_] - new_bed = get_tmp_file() - with open(new_bed, "w") as f_o: - for k in uniq_dels_count.keys(): - x = k.split("---") - f_o.write( - "\t".join(map(str, x + [".", "."])) + "\n") - new_bed = bedtools_sort(new_bed, run_logger=thread_logger) - new_bed = bedtools_merge( - new_bed, args=" -c 1 -o count", run_logger=thread_logger) - vs = read_tsv_file(new_bed, fields=range(4)) - vs = list(map(lambda x: [x[0], int(x[1]), ref.fetch(x[0], int( - x[1]) - 1, int(x[2])).upper(), ref.fetch(x[0], int(x[1]) - 1, int(x[1])).upper(), "0/1", score], vs)) - out_variants.extend(vs) - elif vartype == "INS": - intervals = [] - inss = [] - with pysam.AlignmentFile(input_bam) as samfile: - for record in samfile.fetch(chrom, start, end): - if not record.is_duplicate or not filter_duplicate: - if record.cigarstring and "I" in record.cigarstring: - inss.extend(extract_ins(record)) - inss = list(filter(lambda x: ( - start <= x[1] <= end) or start <= x[2] <= end, inss)) - if inss: - ins_strs = list( - map(lambda x: "---".join(map(str, x[0:4])), inss)) - uniq_inss = list(set(ins_strs)) - uniq_inss_count = {} - for ins_ in uniq_inss: - uniq_inss_count[ins_] = ins_strs.count(ins_) - max_ins, max_count = sorted( - uniq_inss_count.items(), key=lambda x: x[1])[-1] - max_pos = int(max_ins.split("---")[1]) - for ins_ in uniq_inss: - if uniq_inss_count[ins_] <= max_count * 0.5 or 0 < abs(int(ins_.split("---")[1]) - max_pos) < 4: - del uniq_inss_count[ins_] - - new_bed = get_tmp_file() - with open(new_bed, "w") as f_o: - for k in uniq_inss_count.keys(): - x = k.split("---") - f_o.write( - "\t".join(map(str, x + [".", "."])) + "\n") - new_bed = bedtools_sort(new_bed, run_logger=thread_logger) - vs = read_tsv_file(new_bed, fields=range(4)) - vs = list(map(lambda x: [x[0], int(x[1]), ref.fetch(x[0], int( - x[1]) - 1, int(x[1])).upper(), ref.fetch(x[0], int(x[1]) - 1, int(x[1])).upper() + x[3], "0/1", score], vs)) - out_variants.extend(vs) - return out_variants + vartypes = list(map(lambda x: x.vtype, variants)) + scores = list(map(lambda x: x.score, variants)) + dels = [] + inss = [] + snps = [] + vars_count = {} + with pysam.AlignmentFile(i_bam) as samfile: + cov = 0 + dels_ = [] + inss_ = [] + snps_ = [] + for record in samfile.fetch(chrom, start, end): + if record.is_unmapped: + continue + if record.seq is None: + continue + if not record.is_duplicate or not 
filter_duplicate: + cov += 1 + if record.cigarstring and "D" in record.cigarstring: + dels_.extend(extract_del(record)) + if record.cigarstring and "I" in record.cigarstring: + inss_.extend(extract_ins(record)) + aligned_pairs = np.array( + record.get_aligned_pairs(matches_only=True)) + near_pos = np.where((start <= aligned_pairs[:, 1]) & ( + aligned_pairs[:, 1] <= end))[0] + if len(near_pos) != 0: + for pos_i in near_pos: + seq_pos, ref_pos = aligned_pairs[pos_i, :] + if seq_pos is not None: + ref_snp = ref_fasta.fetch( + chrom, ref_pos, ref_pos + 1).upper() + alt_snp = record.seq[seq_pos] + if alt_snp != ref_snp: + snps_.append( + [chrom, ref_pos + 1, ref_snp, alt_snp]) + + dels.extend([x + [1.0 / (cov)] for x in dels_]) + inss.extend([x + [1.0 / (cov)] for x in inss_]) + snps.extend([x + [1.0 / (cov)] for x in snps_]) + + dels = list(filter(lambda x: ( + start <= x[1] <= end) or start <= x[2] <= end, dels)) + if dels: + del_strs = [] + cnt_ = {} + for x in dels: + chrom, st, en, cnt = x + del_str = "---".join(map(str, [chrom, st, en])) + if del_str not in cnt_: + cnt_[del_str] = 0 + cnt_[del_str] += cnt + del_strs.append(del_str) + + uniq_dels = list(set(del_strs)) + for del_ in uniq_dels: + st, en = map(int, del_.split("---")[1:3]) + del_str = "-".join(list(map(str, [chrom, int(st), ref_fasta.fetch(chrom, st - 1, en).upper(), + ref_fasta.fetch(chrom, st - 1, st).upper(), "DEL"]))) + vars_count[del_str] = np.round(cnt_[del_], 4) + inss = list(filter(lambda x: ( + start <= x[1] <= end) or start <= x[2] <= end, inss)) + if inss: + cnt_ = {} + ins_strs = [] + for x in inss: + chrom, st, en, bases, cnt = x + ins_str = "---".join(map(str, [chrom, st, en, bases])) + if ins_str not in cnt_: + cnt_[ins_str] = 0 + cnt_[ins_str] += cnt + ins_strs.append(ins_str) + uniq_inss = list(set(ins_strs)) + for ins_ in uniq_inss: + st, en, bases = ins_.split("---")[1:4] + st, en = map(int, [st, en]) + ins_str = "-".join(list(map(str, [chrom, int(st), ref_fasta.fetch(chrom, st - 1, st).upper(), + ref_fasta.fetch(chrom, st - 1, st).upper() + bases, "INS"]))) + vars_count[ins_str] = np.round(cnt_[ins_], 4) + + if snps: + cnt_ = {} + snp_strs = [] + for x in snps: + chrom, st, ref_, alt_, cnt = x + snp_str = "---".join(map(str, [chrom, st, ref_, alt_])) + if snp_str not in cnt_: + cnt_[snp_str] = 0 + cnt_[snp_str] += cnt + snp_strs.append(snp_str) + uniq_snps = list(set(snp_strs)) + for snp_ in uniq_snps: + st, ref_, alt_ = snp_.split("---")[1:4] + snp_str = "-".join(list(map(str, [chrom, st, ref_, + alt_, "SNP"]))) + vars_count[snp_str] = np.round(cnt_[snp_], 4) + + out_variants_ = resolve_group(ref_fasta, variants, vars_count) + return out_variants_ + + except Exception as ex: thread_logger.error(traceback.format_exc()) thread_logger.error(ex) return None @@ -191,20 +421,26 @@ def resolve_variants(input_bam, resolved_vcf, reference, target_vcf_file, map_args.append([chrom, start, end, variants[id_], input_bam, filter_duplicate, reference]) - pool = multiprocessing.Pool(num_threads) - try: - out_variants_list = pool.map_async( - find_resolved_variants, map_args).get() - pool.close() - except Exception as inst: - logger.error(inst) - pool.close() - traceback.print_exc() - raise Exception - - for o in out_variants_list: - if o is None: - raise Exception("resolve_variants failed!") + if num_threads > 1: + try: + n_per_batch = min(10 * num_threads, len(map_args)) + out_variants_list = [] + i = 0 + while i < len(map_args): + pool = multiprocessing.Pool(num_threads) + batch_i_s = i + batch_i_e = min(i + n_per_batch,
len(map_args)) + out_variants_list.extend(pool.map_async( + find_resolved_variants, map_args[batch_i_s:batch_i_e]).get()) + i = batch_i_e + pool.close() + except Exception as inst: + logger.error(inst) + pool.close() + traceback.print_exc() + raise Exception + else: + out_variants_list = [find_resolved_variants(w) for w in map_args] out_variants = [x for xs in out_variants_list for x in xs] chroms_order = get_chromosomes_order(bam=input_bam) @@ -214,8 +450,13 @@ def resolve_variants(input_bam, resolved_vcf, reference, target_vcf_file, with open(resolved_vcf, "w") as o_f: o_f.write("{}\n".format(VCF_HEADER)) o_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") + done_id = set([]) for chrom, pos, ref, alt, gt, phred_score in out_variants: if ref != alt: + id_ = "-".join(list(map(str, [chrom, pos, ref, alt]))) + if id_ in done_id: + continue + done_id.add(id_) phred_score = float(phred_score) prob = np.round(1 - (10**(-phred_score / 10)), 4) o_f.write("\t".join([chrom, str(pos), ".", ref, From 2094b3e9c1b95612ee745180abcce98c85aaef92 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 18 Jun 2020 20:47:25 -0700 Subject: [PATCH 55/89] fix_bugs --- docker/Dockerfile | 2 +- neusomatic/python/call.py | 26 +++++++++++++++----------- neusomatic/python/resolve_variants.py | 5 ++++- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 847941b..fc28553 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:16.04 -ENV NEUSOMATIC_VERSION 0.2.1 +ENV NEUSOMATIC_VERSION 0.3.0 ENV ZLIB_VERSION 1.2.11 ENV NUMPY_VERSION 1.15.4 ENV SCIPY_VERSION 1.2.0 diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 6b5a8b9..52f3a69 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -146,7 +146,7 @@ def pred_vcf_records_path(record): chrom, pos, ref, alt, _, center, _, _, _ = path.split( ".") - ref, alt = ref.upper(), alt.upper() + ref, alt = ref.upper(), alt.upper() center = int(center) pos = int(pos) @@ -247,8 +247,14 @@ def pred_vcf_records_path(record): ref_ += ACGT[rb] II = I.copy() II[rb + 1, center__, 1] = 0 - if max(II[1:, center__, 1]) == 0 and center__ == center and ref == ref_ and len(alt) == 1: - alt_ = alt + if max(II[1:, center__, 1]) == 0: + if abs(center_pred - center) < center_dist_roundback * 3 and len_pred == 1: + pos_, ref_, alt_ = pos, ref.upper(), alt.upper() + break + else: + ref_ = "" + alt_ = "" + break else: alt_ += ACGT[np.argmax(II[1:, center__, 1])] if sum(I[1:, center__, 1]) == 0: @@ -269,7 +275,7 @@ def pred_vcf_records_path(record): if pos_ == -2: # print "PPP-1",path,pred return vcf_record - len_pred_=len_pred + len_pred_ = len_pred if len_pred == 3: len_pred = max(len(alt) - len(ref), len_pred) if (sum(I[1:, i_, 1]) == 0): @@ -469,13 +475,12 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("no_seq_complexity: {}".format(no_seq_complexity)) logger.info("zero_ann_cols: {}".format(zero_ann_cols)) logger.info("ensemble_custom_header: {}".format(ensemble_custom_header)) - - - if not ensemble_custom_header: + + if not ensemble_custom_header: expected_ens_fields = NUM_ENS_FEATURES if not no_seq_complexity: expected_ens_fields += 2 - + logger.info("expected_ens_fields: {}".format(expected_ens_fields)) expected_st_fields = 4 @@ -493,7 +498,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, elif len(x) == 4: break else: - raise Exception("Wrong number of fields in {}: 
{}".format(tsv, len(x))) + raise Exception( + "Wrong number of fields in {}: {}".format(tsv, len(x))) num_channels = expected_ens_fields + \ NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES @@ -539,7 +545,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, # 3. load the new state dict net.load_state_dict(pretrained_state_dict) - if not os.path.exists(out_dir): os.mkdir(out_dir) matrices_dir = "{}/matrices_{}".format(out_dir, model_tag) @@ -548,7 +553,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, shutil.rmtree(matrices_dir) os.mkdir(matrices_dir) - new_split_tsvs_dir = os.path.join(out_dir, "split_tsvs") if os.path.exists(new_split_tsvs_dir): logger.warning( diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index a73325c..e08cf5f 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -192,6 +192,9 @@ def resolve_group(ref_fasta, variants, vars_count): # "No non-zero COUNT with non-zero SCORE: {}".format(list(str(x) for x in group_vars[pos]))) return [] + + # logger.info(list([pos, [str(y) for y in x]] for pos,x in group_vars.items())) + max_count = max(max_target) for pos in group_vars.keys(): if max(map(lambda x: x.cnt, group_vars[pos]) @@ -292,7 +295,7 @@ def find_resolved_variants(input_record): inss = [] snps = [] vars_count = {} - with pysam.AlignmentFile(i_bam) as samfile: + with pysam.AlignmentFile(input_bam) as samfile: cov = 0 dels_ = [] inss_ = [] From 30ac6cf2f5e76d339f5f3c0912b2dabdde8571f3 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 19 Jun 2020 01:29:07 -0700 Subject: [PATCH 56/89] updated versions to 0.3.0 --- docker/Dockerfile | 33 +++++++++++++++++---------------- neusomatic/python/_version.py | 2 +- test/NeuSomatic_standalone.vcf | 2 +- test/docker_test.sh | 18 +++++++++--------- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index fc28553..223b8a5 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,19 +2,20 @@ FROM ubuntu:16.04 ENV NEUSOMATIC_VERSION 0.3.0 ENV ZLIB_VERSION 1.2.11 -ENV NUMPY_VERSION 1.15.4 -ENV SCIPY_VERSION 1.2.0 -ENV IMAGEIO_VERSION 2.5.0 -ENV PYTORCH_VERSION 1.1.0 -ENV TORCHVISION_VERSION 0.3.0 -ENV CUDATOOLKIT_VERSION 9.0 -ENV CMAKE_VERSION 3.13.2 -ENV PYBEDTOOLS_VERSION 0.8.0 -ENV PYSAM_VERSION 0.15.2 +ENV NUMPY_VERSION 1.18.5 +ENV SCIPY_VERSION 1.4.1 +ENV IMAGEIO_VERSION 2.8.0 +ENV PILLOW_VERSION 2.8.0 +ENV PYTORCH_VERSION 1.4.0 +ENV TORCHVISION_VERSION 0.5.0 +ENV CUDATOOLKIT_VERSION 9.2 +ENV CMAKE_VERSION 3.17.0 +ENV PYSAM_VERSION 0.15.3 ENV SAMTOOLS_VERSION 1.9 ENV TABIX_VERSION 0.2.6 -ENV BEDTOOLS_VERSION 2.27.1 -ENV BIOPYTHON_VERSION 1.72 +ENV BEDTOOLS_VERSION 2.29.2 +ENV BIOPYTHON_VERSION 1.76 +ENV FISHER_VERSION 0.1.9 ENV GCC_VERSION 5 RUN apt-get update && apt-get install -y --fix-missing \ @@ -30,9 +31,9 @@ RUN conda update -y conda RUN conda install -y zlib=${ZLIB_VERSION} numpy=${NUMPY_VERSION} scipy=${SCIPY_VERSION} \ - imageio=${IMAGEIO_VERSION} && conda clean -a -RUN conda install -y cmake=${CMAKE_VERSION} -c conda-forge && conda clean -a -RUN conda install -y pysam=${PYSAM_VERSION} pybedtools=${PYBEDTOOLS_VERSION} \ + pillow=${PILLOW_VERSION} cmake=${CMAKE_VERSION} imageio=${IMAGEIO_VERSION} && conda clean -a +RUN conda install -y fisher=${FISHER_VERSION} -c conda-forge && conda clean -a +RUN conda install -y pysam=${PYSAM_VERSION} \ samtools=${SAMTOOLS_VERSION} tabix=${TABIX_VERSION} \ 
bedtools=${BEDTOOLS_VERSION} \ biopython=${BIOPYTHON_VERSION} -c bioconda && conda clean -a @@ -42,7 +43,7 @@ RUN conda install -y pytorch=${PYTORCH_VERSION} \ RUN apt-get install -y --fix-missing gcc-${GCC_VERSION} g++-${GCC_VERSION} -ADD https://github.com/bioinform/neusomatic/archive/v${NEUSOMATIC_VERSION}.tar.gz /opt/v${NEUSOMATIC_VERSION}.tar.gz -RUN cd /opt/ && tar -xzvf v${NEUSOMATIC_VERSION}.tar.gz && mv neusomatic-${NEUSOMATIC_VERSION} neusomatic && rm /opt/v${NEUSOMATIC_VERSION}.tar.gz +ADD https://github.com/bioinform/neusomatic/archive/extended_standalone.tar.gz /opt/extended_standalone.tar.gz +RUN cd /opt/ && tar -xzvf extended_standalone.tar.gz && mv neusomatic-extended_standalone neusomatic && rm /opt/extended_standalone.tar.gz RUN cd /opt/neusomatic/ && ./build.sh ENV PATH=/opt/neusomatic/neusomatic/bin:/opt/neusomatic/neusomatic/python/:${PATH} diff --git a/neusomatic/python/_version.py b/neusomatic/python/_version.py index 3ced358..493f741 100755 --- a/neusomatic/python/_version.py +++ b/neusomatic/python/_version.py @@ -1 +1 @@ -__version__ = "0.2.1" +__version__ = "0.3.0" diff --git a/test/NeuSomatic_standalone.vcf b/test/NeuSomatic_standalone.vcf index bee861b..b054942 100644 --- a/test/NeuSomatic_standalone.vcf +++ b/test/NeuSomatic_standalone.vcf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.2 -##NeuSomatic Version=0.2.1 +##NeuSomatic Version=0.3.0 ##INFO= ##INFO= ##INFO= diff --git a/test/docker_test.sh b/test/docker_test.sh index 6118711..fcbead1 100755 --- a/test/docker_test.sh +++ b/test/docker_test.sh @@ -10,16 +10,16 @@ if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa ] then if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "cd /mnt/example/ && wget ftp://ftp.ensembl.org/pub/release-75//fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "cd /mnt/example/ && gunzip -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi if [ ! 
-f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.fai ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "samtools faidx /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa" fi rm -rf work_standalone @@ -27,7 +27,7 @@ rm -rf work_standalone #Stand-alone NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -45,7 +45,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 --num_threads 1 \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_standalone/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -54,7 +54,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neu --num_threads 1 \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ @@ -66,7 +66,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 rm -rf /mnt/example/work_ensemble #Ensemble NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -85,7 +85,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 --ensemble_tsv /mnt/ensemble.tsv \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_ensemble/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -95,7 +95,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neu --ensemble \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.2.1 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ From c2deecb66ff20de1f22c8cbdfc5de0ed0628783e Mon Sep 17 00:00:00 
2001 From: Sahraeian Date: Fri, 19 Jun 2020 12:27:44 -0700 Subject: [PATCH 57/89] small fix --- docker/Dockerfile | 10 +++++----- neusomatic/python/extend_features.py | 2 +- neusomatic/python/generate_dataset.py | 6 ++++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 223b8a5..51696af 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,15 +1,15 @@ -FROM ubuntu:16.04 +FROM ubuntu:18.04 ENV NEUSOMATIC_VERSION 0.3.0 ENV ZLIB_VERSION 1.2.11 -ENV NUMPY_VERSION 1.18.5 +ENV NUMPY_VERSION 1.18.1 ENV SCIPY_VERSION 1.4.1 ENV IMAGEIO_VERSION 2.8.0 -ENV PILLOW_VERSION 2.8.0 +ENV PILLOW_VERSION 7.1.2 ENV PYTORCH_VERSION 1.4.0 ENV TORCHVISION_VERSION 0.5.0 ENV CUDATOOLKIT_VERSION 9.2 -ENV CMAKE_VERSION 3.17.0 +ENV CMAKE_VERSION 3.14.0 ENV PYSAM_VERSION 0.15.3 ENV SAMTOOLS_VERSION 1.9 ENV TABIX_VERSION 0.2.6 @@ -22,7 +22,7 @@ RUN apt-get update && apt-get install -y --fix-missing \ build-essential zlib1g-dev curl less vim bzip2 RUN apt-get install -y --fix-missing git wget -RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh +RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh RUN bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b RUN rm Miniconda3-latest-Linux-x86_64.sh ENV PATH=/miniconda/bin:${PATH} diff --git a/neusomatic/python/extend_features.py b/neusomatic/python/extend_features.py index bf4c118..5d4b482 100755 --- a/neusomatic/python/extend_features.py +++ b/neusomatic/python/extend_features.py @@ -54,7 +54,7 @@ def extract_features(candidate_record): (tBamFeatures.nref, nBamFeatures.nref)), alternative='greater')) except ValueError: - score_varscan2 = nan + score_varscan2 = float('nan') homopolymer_length, site_homopolymer_length = sequencing_features.from_genome_reference( ref_fa, my_coordinate, ref, alt) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 2991274..6148fbb 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1528,7 +1528,6 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea Seq_Complexity_ = list(map(lambda x: x[0], filter( lambda x: x[1] in ["Seq_Complexity_Span", "Seq_Complexity_Adj"], enumerate(header)))) - max_varscan2_score = 0 if zero_vscore else 60 min_max_features = [[cov_features, 0, 2 * COV], [mq_features, 0, 70], [bq_features, 0, 41], @@ -1538,7 +1537,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea [stralka_scor, 0, 40], [stralka_qss, 0, 200], [stralka_tqss, 0, 4], - [varscan2_score, 0, max_varscan2_score], + [varscan2_score, 0, 60], [vardict_score, 0, 120], [m2_lod, 0, 100], [sniper_score, 0, 120], @@ -1554,6 +1553,9 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea if not no_seq_complexity: min_max_features.append([Seq_Complexity_, 0, 40]) + if zero_vscore: + ensemble_data[:,np.array(varscan2_score)] = 0 + selected_features = sorted([i for f in min_max_features for i in f[0]]) selected_features_tags = list( map(lambda x: header[x], selected_features)) From bad8004e9d797b1cbe796c121a1f97dbd33b8655 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 19 Jun 2020 12:31:00 -0700 Subject: [PATCH 58/89] small fix --- neusomatic/python/preprocess.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index ae1586c..648a5b6 100755 --- a/neusomatic/python/preprocess.py +++ 
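# A side note on the float('nan') change in the extend_features.py hunk of
# the previous patch: a bare `nan` name raises NameError unless it was
# imported (e.g. from numpy), while float('nan') is always available.
# Minimal illustration:
score_varscan2 = float('nan')
assert score_varscan2 != score_varscan2  # NaN is the only value unequal to itself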
b/neusomatic/python/preprocess.py @@ -689,6 +689,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--merge_d_for_scan', type=int, help='-d used to merge regions before scan', default=None) + parser.add_argument('--zero_vscore', + help='set VarScan2_Score to zero', + action="store_true") parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) parser.add_argument('--num_threads', type=int, From 97e4864ccd967cdd4746822c7581592d9da97d3f Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 19 Jun 2020 22:59:22 -0700 Subject: [PATCH 59/89] fix test --- docker/Dockerfile | 1 + test/NeuSomatic_ensemble.vcf | 2 +- test/docker_test.sh | 18 +++++++++--------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 51696af..f6bdbc9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -43,6 +43,7 @@ RUN conda install -y pytorch=${PYTORCH_VERSION} \ RUN apt-get install -y --fix-missing gcc-${GCC_VERSION} g++-${GCC_VERSION} + ADD https://github.com/bioinform/neusomatic/archive/extended_standalone.tar.gz /opt/extended_standalone.tar.gz RUN cd /opt/ && tar -xzvf extended_standalone.tar.gz && mv neusomatic-extended_standalone neusomatic && rm /opt/extended_standalone.tar.gz RUN cd /opt/neusomatic/ && ./build.sh diff --git a/test/NeuSomatic_ensemble.vcf b/test/NeuSomatic_ensemble.vcf index e3a7d8b..d62986b 100644 --- a/test/NeuSomatic_ensemble.vcf +++ b/test/NeuSomatic_ensemble.vcf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.2 -##NeuSomatic Version=0.2.1 +##NeuSomatic Version=0.3.0 ##INFO= ##INFO= ##INFO= diff --git a/test/docker_test.sh b/test/docker_test.sh index fcbead1..4a8f34f 100755 --- a/test/docker_test.sh +++ b/test/docker_test.sh @@ -10,16 +10,16 @@ if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa ] then if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "cd /mnt/example/ && wget ftp://ftp.ensembl.org/pub/release-75//fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "cd /mnt/example/ && gunzip -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi if [ ! 
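# How the new --zero_vscore flag ties into the generate_dataset.py hunk of
# the previous patch: a minimal sketch of zeroing one feature column by its
# header tag. The header and matrix here are illustrative stand-ins, not
# real NeuSomatic data:
import numpy as np

header = ["CHROM", "POS", "VarScan2_Score", "VarDict_Score"]
ensemble_data = np.array([[22.0, 100.0, 37.5, 80.0],
                          [22.0, 200.0, 12.0, 55.0]])
varscan2_score = [i for i, tag in enumerate(header) if tag == "VarScan2_Score"]
zero_vscore = True
if zero_vscore and len(ensemble_data) > 0:
    ensemble_data[:, np.array(varscan2_score)] = 0
assert ensemble_data[:, 2].sum() == 0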
-f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.fai ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "samtools faidx /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa" fi rm -rf work_standalone @@ -27,7 +27,7 @@ rm -rf work_standalone #Stand-alone NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -45,7 +45,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 --num_threads 1 \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3.0 /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_standalone/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -54,7 +54,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neu --num_threads 1 \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ @@ -66,7 +66,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 rm -rf /mnt/example/work_ensemble #Ensemble NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -85,7 +85,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 --ensemble_tsv /mnt/ensemble.tsv \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3.0 /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_ensemble/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -95,7 +95,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neu --ensemble \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ From fb6ea214ece6b743fb2c070a670c1eafbc702c50 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 23 Jun 2020 17:08:43 -0700 Subject: [PATCH 
60/89] fix in resolve variants --- neusomatic/python/generate_dataset.py | 4 +- neusomatic/python/postprocess.py | 3 + neusomatic/python/resolve_variants.py | 76 ++++++++++++++++++------ neusomatic/python/sequencing_features.py | 2 +- 4 files changed, 64 insertions(+), 21 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 6148fbb..53920c5 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1553,8 +1553,8 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea if not no_seq_complexity: min_max_features.append([Seq_Complexity_, 0, 40]) - if zero_vscore: - ensemble_data[:,np.array(varscan2_score)] = 0 + if zero_vscore and n_vars > 0: + ensemble_data[:, np.array(varscan2_score)] = 0 selected_features = sorted([i for f in min_max_features for i in f[0]]) selected_features_tags = list( diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index 53a7dd8..32043a8 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -40,6 +40,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, c_f.write("{}\n".format(VCF_HEADER)) c_f.write( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") + ensemble_header_found = False for line in e_f: if "POS" in line: header = line.strip().split() @@ -47,6 +48,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, pos_id = header.index("POS") ref_id = header.index("REF") alt_id = header.index("ALT") + ensemble_header_found = True if "T_DP" in line: dp_id = header.index("T_DP") ref_fw_id = header.index("T_REF_FOR") @@ -57,6 +59,7 @@ def add_vcf_info(work, reference, merged_vcf, candidates_vcf, ensemble_tsv, else: dp_id, ref_fw_id, ref_rv_id, alt_fw_id, alt_rv_id = None, None, None, None, None continue + assert ensemble_header_found fields = line.strip().split() chrom = fields[chrom_id] pos = fields[pos_id] diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index e08cf5f..cd170f7 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -88,6 +88,17 @@ def extract_ins(record): pos += L return inss + +def find_vtype(ref, alt): + if len(alt) < len(ref): + vtype = "DEL" + elif len(alt) > len(ref): + vtype = "INS" + else: + vtype = "SNP" + return vtype + + def push_left_var(ref_fasta, chrom, pos, ref, alt): logger = logging.getLogger(push_left_var.__name__) pos = int(pos) @@ -114,6 +125,7 @@ def __init__(self, chrom, pos, ref, alt, gt, score, cnt, vtype): self.score = float(score) self.cnt = float(cnt) if cnt is not None else None self.vtype = vtype + self.len = abs(len(alt) - len(ref)) self.processed = False def push_left(self, ref_fasta): @@ -185,15 +197,15 @@ def resolve_group(ref_fasta, variants, vars_count): )[0]) out_variants_ = [] - max_target = [ - v.cnt for pos in group_vars for v in group_vars[pos] if v.score > 0] + max_target = [v.cnt for pos in group_vars for v in group_vars[ + pos] if v.score > 0 or v.len >= 3] if len(max_target) == 0: # logger.info( # "No non-zero COUNT with non-zero SCORE: {}".format(list(str(x) for x in group_vars[pos]))) return [] - - # logger.info(list([pos, [str(y) for y in x]] for pos,x in group_vars.items())) + # logger.info(list([pos, [str(y) for y in x]] + # for pos, x in group_vars.items())) max_count = max(max_target) for pos in group_vars.keys(): @@ -202,14 +214,15 @@ def resolve_group(ref_fasta, 
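# The new find_vtype helper above classifies a variant purely by allele
# lengths; three concrete cases (alleles are made up):
assert find_vtype("A", "T") == "SNP"    # equal lengths
assert find_vtype("AAG", "A") == "DEL"  # alt shorter than ref
assert find_vtype("A", "ACC") == "INS"  # alt longer than ref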
variants, vars_count): continue mx = max(map(lambda x: x.cnt, group_vars[pos])) gts = [x.gt for x in group_vars[pos]] - gts = set(gts) - set(["0/0"]) + gts = set([x.gt for x in group_vars[pos] + if x.gt != "0/0" or x.len >= 3]) if len(gts) == 0: continue if len(gts) > 1: gts_count = {"0/1": 0, "0/0": 0} gts_score = {"0/1": 0, "0/0": 0} for x in group_vars[pos]: - if x.gt != "0/0" and x.cnt >= 0.4 * mx: + if (x.gt != "0/0" or x.len >= 3) and x.cnt >= 0.4 * mx: gts_count[x.gt] += x.cnt gts_score[x.gt] += x.score priority = {"0/1": 2, "0/0": 1} @@ -222,12 +235,11 @@ def resolve_group(ref_fasta, variants, vars_count): all_vars = sorted(group_vars[pos], key=lambda x: [ x.cnt, x.score, x.gt != "0/0"], reverse=True) - vtypes = set([x.vtype for x in group_vars[pos] - if x.gt != "0/0" and x.cnt >= 0.4 * mx]) + if (x.gt != "0/0" or x.len >= 3) and x.cnt >= 0.4 * mx]) if not vtypes: vtypes = set([x.vtype for x in group_vars[pos] - if x.gt != "0/0"]) + if (x.gt != "0/0" or x.len >= 3)]) all_vars = list( filter(lambda x: x.vtype in vtypes, all_vars)) if not all_vars: @@ -237,10 +249,44 @@ def resolve_group(ref_fasta, variants, vars_count): "No vars: {}".format([[list(str(x) for x in group_vars[pos_])]for pos_ in group_vars])) raise Exception score = max([v.score for v in all_vars]) + if gt == "0/0": + nz_vars = [x for x in all_vars if x.gt != + "0/0" and x.vtype == all_vars[0].vtype] + if nz_vars: + nz_vars = sorted(nz_vars, key=lambda x: [ + x.score], reverse=True)[0] + gt = nz_vars.gt v = all_vars[0] out_variants_.append( [v.chrom, v.pos, v.ref, v.alt, gt, score, v.cnt]) + if len(out_variants_) == 1 and out_variants_[0][4] == "0/0" and abs(len(out_variants_[0][2]) - len(out_variants_[0][3])) >= 3: + chrom_, pos_, ref_, alt_, gt_, score_, cnt_ = out_variants_[0] + vtype = find_vtype(ref_, alt_) + resolve_candids = [] + for pos in group_vars.keys(): + for y in group_vars[pos]: + if y.vtype == vtype and y.gt != "0/0": + resolve_candids.append(y) + if resolve_candids: + resolve_candids = sorted(resolve_candids, key=lambda x: [ + x.score], reverse=True)[0] + out_variants_ = [[chrom_, pos_, ref_, alt_, + resolve_candids.gt, resolve_candids.score, cnt_]] + + if len(out_variants_) > 1 and "0/0" in [x[4] for x in out_variants_]: + nz_vars = [x for x in out_variants_ if x[4] != "0/0"] + if nz_vars: + nz_vtypes = [find_vtype(x[2], x[3]) for x in nz_vars] + out_variants__ = [] + for x in out_variants_: + if x[4] != "0/0": + out_variants__.append(x) + else: + vtype = find_vtype(x[2], x[3]) + if vtype not in nz_vtypes: + out_variants__.append(x) + out_variants_ = out_variants__ vars_gt = {} for chrom_, pos_, ref_, alt_, gt_, score_, cnt_ in out_variants_: @@ -255,10 +301,10 @@ def resolve_group(ref_fasta, variants, vars_count): v0 = vars_gt[gt_][0] good_vs = [v0] for v in vars_gt[gt_][1:]: - keep=True + keep = True for g_v in good_vs: if min(v.pos + len(v.ref), g_v.pos + len(g_v.ref)) > max(v.pos, g_v.pos): - keep=False + keep = False break if keep: good_vs.append(v) @@ -268,7 +314,6 @@ def resolve_group(ref_fasta, variants, vars_count): return out_variants_ - def find_resolved_variants(input_record): chrom, start, end, variants, input_bam, filter_duplicate, reference = input_record thread_logger = logging.getLogger( @@ -406,12 +451,7 @@ def resolve_variants(input_bam, resolved_vcf, reference, target_vcf_file, for line in skip_empty(tv_f): fields = line.strip().split() id_ = int(fields[2]) - if len(fields[4]) < len(fields[3]): - vartype = "DEL" - elif len(fields[4]) > len(fields[3]): - vartype = "INS" - else: - 
vartype = "SNP" + vartype = find_vtype(fields[3], fields[4]) if id_ not in variants: variants[id_] = [] variants[id_].append(fields + [vartype]) diff --git a/neusomatic/python/sequencing_features.py b/neusomatic/python/sequencing_features.py index 6aafc1e..364f76b 100644 --- a/neusomatic/python/sequencing_features.py +++ b/neusomatic/python/sequencing_features.py @@ -385,7 +385,7 @@ def subLC(sequence, max_substring_length=20): number_of_subseqs = 0 seq_length = len(sequence) max_number_of_subseqs = max_sub_vocabularies( - seq_length, max_substring_length) + seq_length, min(seq_length, max_substring_length)) set_of_seq_n = set() for i in range(1, min(max_substring_length + 1, seq_length + 1)): From 1bf40b46ab37eeb9ee28d87062f4f3329d959d33 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 25 Jun 2020 00:03:33 -0700 Subject: [PATCH 61/89] ensemble with internal features --- neusomatic/python/filter_candidates.py | 32 +-------------- neusomatic/python/preprocess.py | 54 ++++++++++++++------------ neusomatic/python/read_callers_vcf.py | 11 +++--- test/NeuSomatic_ensemble.vcf | 16 ++++---- 4 files changed, 45 insertions(+), 68 deletions(-) diff --git a/neusomatic/python/filter_candidates.py b/neusomatic/python/filter_candidates.py index 58fc628..1fa395e 100755 --- a/neusomatic/python/filter_candidates.py +++ b/neusomatic/python/filter_candidates.py @@ -17,7 +17,7 @@ def filter_candidates(candidate_record): - candidates_vcf, filtered_candidates_vcf, reference, dbsnp, min_dp, max_dp, good_ao, \ + candidates_vcf, filtered_candidates_vcf, reference, min_dp, max_dp, good_ao, \ min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, \ del_merge_min_af, ins_merge_min_af, merge_r = candidate_record thread_logger = logging.getLogger( @@ -26,16 +26,6 @@ def filter_candidates(candidate_record): thread_logger.info( "---------------------Filter Candidates---------------------") - if dbsnp: - if not dbsnp.endswith("vcf.gz"): - thread_logger.error("Aborting!") - raise Exception( - "The dbSNP file should be a tabix indexed file with .vcf.gz format") - if not os.path.exists(dbsnp + ".tbi"): - thread_logger.error("Aborting!") - raise Exception( - "The dbSNP file should be a tabix indexed file with .vcf.gz format. 
No {}.tbi file exists.".format(dbsnp)) - records = {} with open(candidates_vcf) as v_f: for line in skip_empty(v_f): @@ -267,27 +257,11 @@ def filter_candidates(candidate_record): "GT:DP:RO:AO:AF", "0/1:{}:{}:{}:{}".format(dp, ro, ao, af)]) final_records.append([chrom, pos - 1, ref, alt, line]) final_records = sorted(final_records, key=lambda x: x[0:2]) - if dbsnp: - dbsnp_tb = pysam.TabixFile(dbsnp) with open(filtered_candidates_vcf, "w") as o_f: o_f.write("{}\n".format(VCF_HEADER)) o_f.write( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") for record in final_records: - if dbsnp: - chrom, pos, ref, alt = record[0:4] - var_id = "-".join(map(str, [chrom, pos, ref, alt])) - region = "{}:{}-{}".format(chrom, pos, pos + 1) - dbsnp_vars = [] - for x in dbsnp_tb.fetch(region=region): - chrom_, pos_, _, ref_, alts_ = x.strip().split("\t")[ - 0:5] - for alt_ in alts_.split(","): - dbsnp_var_id = "-".join(map(str, - [chrom_, pos_, ref_, alt_])) - dbsnp_vars.append(dbsnp_var_id) - if var_id in dbsnp_vars: - continue o_f.write(record[-1] + "\n") return filtered_candidates_vcf @@ -309,8 +283,6 @@ def filter_candidates(candidate_record): required=True) parser.add_argument('--reference', type=str, help='reference fasta filename', required=True) - parser.add_argument('--dbsnp_to_filter', type=str, - help='dbsnp vcf.gz (will be used to filter candidate variants)', default=None) parser.add_argument('--good_ao', type=float, help='good alternate count (ignores maf)', default=10) parser.add_argument('--min_ao', type=float, @@ -341,7 +313,7 @@ def filter_candidates(candidate_record): try: output = filter_candidates((args.candidates_vcf, args.filtered_candidates_vcf, - args.reference, args.dbsnp_to_filter, args.min_dp, args.max_dp, + args.reference, args.min_dp, args.max_dp, args.good_ao, args.min_ao, args.snp_min_af, args.snp_min_bq, args.snp_min_ao, args.ins_min_af, args.del_min_af, diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 648a5b6..3b470e8 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -24,7 +24,7 @@ from utils import concatenate_vcfs, run_bedtools_cmd, bedtools_sort, bedtools_merge, bedtools_intersect, bedtools_slop, get_tmp_file, skip_empty, vcf_2_bed -def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp, +def process_split_region(tn, work, region, reference, mode, alignment_bam, scan_window_size, scan_maf, min_mapq, filtered_candidates_vcf, min_dp, max_dp, filter_duplicate, @@ -48,7 +48,7 @@ def process_split_region(tn, work, region, reference, mode, alignment_bam, dbsnp for i, (raw_vcf, count_bed, split_region_bed) in enumerate(scan_outputs): filtered_vcf = os.path.join(os.path.dirname( os.path.realpath(raw_vcf)), "filtered_candidates.vcf") - map_args.append((raw_vcf, filtered_vcf, reference, dbsnp, min_dp, max_dp, good_ao, + map_args.append((raw_vcf, filtered_vcf, reference, min_dp, max_dp, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, ins_min_af, del_min_af, del_merge_min_af, ins_merge_min_af, merge_r)) try: @@ -283,7 +283,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, work_tumor_without_q, "filtered_candidates.vcf") tumor_outputs_without_q = process_split_region("tumor", work_tumor_without_q, region_bed, reference, mode, - tumor_bam, dbsnp, scan_window_size, scan_maf, min_mapq, + tumor_bam, scan_window_size, scan_maf, min_mapq, filtered_candidates_vcf_without_q, min_dp, max_dp, filter_duplicate, good_ao, min_ao, @@ -309,7 +309,7 
@@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, logger.info("Scan tumor bam (and extracting quality scores).") tumor_outputs = process_split_region("tumor", work_tumor, region_bed, reference, mode, - tumor_bam, dbsnp, scan_window_size, scan_maf, min_mapq, + tumor_bam, scan_window_size, scan_maf, min_mapq, filtered_candidates_vcf, min_dp, max_dp, filter_duplicate, good_ao, min_ao, @@ -338,7 +338,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, os.mkdir(work_normal) logger.info("Scan normal bam (and extracting quality scores).") normal_counts, _, _ = process_split_region("normal", work_normal, region_bed, reference, mode, normal_bam, - None, scan_window_size, 0.2, min_mapq, + scan_window_size, 0.2, min_mapq, None, min_dp, max_dp, filter_duplicate, good_ao, min_ao, snp_min_af, snp_min_bq, snp_min_ao, @@ -361,24 +361,28 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, shutil.rmtree(work_dataset_split) os.mkdir(work_dataset_split) ensemble_bed_i = ensemble_beds[i] if ensemble_tsv else None - if add_extra_features: + if add_extra_features or (ensemble_tsv and not no_feature_recomp_for_ensemble): work_tumor_i = os.path.dirname(filtered_vcf) - extra_features_tsv = os.path.join( - work_tumor_i, "extra_features.tsv") - ex_tsvs = [extra_features_tsv] - if not os.path.exists(extra_features_tsv) or restart: - extend_features(filtered_vcf, - ensemble_beds[ - i] if (ensemble_tsv and no_feature_recomp_for_ensemble) else None, - None, - extra_features_tsv, - reference, tumor_bam, normal_bam, - min_mapq, snp_min_bq, - dbsnp, None, - no_seq_complexity, - window_extend, - max_cluster_size, - num_threads) + if add_extra_features: + extra_features_tsv = os.path.join( + work_tumor_i, "extra_features.tsv") + ex_tsvs = [extra_features_tsv] + if not os.path.exists(extra_features_tsv) or restart: + extend_features(filtered_vcf, + ensemble_beds[ + i] if (ensemble_tsv and no_feature_recomp_for_ensemble) else None, + None, + extra_features_tsv, + reference, tumor_bam, normal_bam, + min_mapq, snp_min_bq, + dbsnp, None, + no_seq_complexity, + window_extend, + max_cluster_size, + num_threads) + else: + ex_tsvs = [] + extra_features_tsv = None if ensemble_tsv and not no_feature_recomp_for_ensemble: extra_features_others_tsv = os.path.join( work_tumor_i, "extra_features_others.tsv") @@ -612,8 +616,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, help='normal bam', required=True) parser.add_argument('--work', type=str, help='work directory', required=True) - parser.add_argument('--dbsnp_to_filter', type=str, - help='dbsnp vcf.gz (will be used to filter candidate variants)', default=None) + parser.add_argument('--dbsnp', type=str, + help='dbsnp vcf.gz', default=None) parser.add_argument('--scan_window_size', type=int, help='window size to scan the variants', default=2000) parser.add_argument('--scan_maf', type=float, @@ -703,7 +707,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, try: preprocess(args.work, args.mode, args.reference, args.region_bed, args.tumor_bam, args.normal_bam, - args.dbsnp_to_filter, + args.dbsnp, args.scan_window_size, args.scan_maf, args.min_mapq, args.min_dp, args.max_dp, args.good_ao, args.min_ao, args.snp_min_af, args.snp_min_bq, args.snp_min_ao, args.ins_min_af, args.del_min_af, args.del_merge_min_af, diff --git a/neusomatic/python/read_callers_vcf.py b/neusomatic/python/read_callers_vcf.py index 997270e..1114358 100755 --- 
a/neusomatic/python/read_callers_vcf.py +++ b/neusomatic/python/read_callers_vcf.py @@ -213,11 +213,12 @@ def read_callers_vcf(reference, chrom, pos, _, ref, alts, _, filters, info = x[0:8] for ith_alt, alt in enumerate(alts.split(",")): if ref != alt: - mutect_classification, nlod, tlod, tandem, ecnt = get_mutect2_info( - filters, info, ith_alt) - var_id = "-".join([chrom, pos, ref, alt]) - mutect2_info[var_id] = [ - mutect_classification, nlod, tlod, tandem, ecnt] + if len(ref) == 1 or len(alt) == 1: + mutect_classification, nlod, tlod, tandem, ecnt = get_mutect2_info( + filters, info, ith_alt) + var_id = "-".join([chrom, pos, ref, alt]) + mutect2_info[var_id] = [ + mutect_classification, nlod, tlod, tandem, ecnt] i_f.close() strelka2_info = {} if strelka2_vcfs: diff --git a/test/NeuSomatic_ensemble.vcf b/test/NeuSomatic_ensemble.vcf index d62986b..82e77b0 100644 --- a/test/NeuSomatic_ensemble.vcf +++ b/test/NeuSomatic_ensemble.vcf @@ -14,11 +14,11 @@ ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE -22 21330787 . C T 26.9917 PASS SCORE=0.9980;DP=387;RO=298;AO=87;AF=0.226 GT:DP:RO:AO:AF 0/1:387:298:87:0.226 -22 21332122 . G A 28.5402 PASS SCORE=0.9986;DP=268;RO=209;AO=59;AF=0.2201 GT:DP:RO:AO:AF 0/1:268:209:59:0.2201 -22 21334924 . G C 17.6382 PASS SCORE=0.9828;DP=101;RO=78;AO=23;AF=0.2277 GT:DP:RO:AO:AF 0/1:101:78:23:0.2277 -22 21335259 . C A 19.7149 PASS SCORE=0.9893;DP=234;RO=190;AO=44;AF=0.188 GT:DP:RO:AO:AF 0/1:234:190:44:0.188 -22 21384516 . C T 27.9602 PASS SCORE=0.9984;DP=90;RO=64;AO=26;AF=0.2889 GT:DP:RO:AO:AF 0/1:90:64:26:0.2889 -22 21982892 . C T 21.4946 PASS SCORE=0.9929;DP=152;RO=109;AO=43;AF=0.2829 GT:DP:RO:AO:AF 0/1:152:109:43:0.2829 -22 21983260 . A G 31.5494 PASS SCORE=0.9993;DP=112;RO=70;AO=42;AF=0.375 GT:DP:RO:AO:AF 0/1:112:70:42:0.375 -22 21989959 . AAG A 33.0106 PASS SCORE=0.9995;DP=131;RO=99;AO=32;AF=0.2443 GT:DP:RO:AO:AF 0/1:131:99:32:0.2443 +22 21330787 . C T 33.9793 PASS SCORE=0.9996;DP=387;RO=298;AO=87;AF=0.226 GT:DP:RO:AO:AF 0/1:387:298:87:0.226 +22 21332122 . G A 35.2289 PASS SCORE=0.9997;DP=268;RO=209;AO=59;AF=0.2201 GT:DP:RO:AO:AF 0/1:268:209:59:0.2201 +22 21334924 . G C 24.4407 PASS SCORE=0.9964;DP=101;RO=78;AO=23;AF=0.2277 GT:DP:RO:AO:AF 0/1:101:78:23:0.2277 +22 21335259 . C A 29.2081 PASS SCORE=0.9988;DP=234;RO=190;AO=44;AF=0.188 GT:DP:RO:AO:AF 0/1:234:190:44:0.188 +22 21384516 . C T 36.9903 PASS SCORE=0.9998;DP=90;RO=64;AO=26;AF=0.2889 GT:DP:RO:AO:AF 0/1:90:64:26:0.2889 +22 21982892 . C T 33.0101 PASS SCORE=0.9995;DP=152;RO=109;AO=43;AF=0.2829 GT:DP:RO:AO:AF 0/1:152:109:43:0.2829 +22 21983260 . A G 36.9903 PASS SCORE=0.9998;DP=112;RO=70;AO=42;AF=0.375 GT:DP:RO:AO:AF 0/1:112:70:42:0.375 +22 21989959 . 
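# The updated QUAL values in this expected-output VCF track SCORE as a
# Phred-scaled probability, QUAL = -10*log10(1 - SCORE) (assuming this
# standard prob2phred-style conversion); a quick check against the rows:
import math

def phred(p):
    return -10 * math.log10(1 - p)

assert abs(phred(0.9996) - 33.98) < 0.01  # cf. the 21330787 row
assert abs(phred(0.9997) - 35.23) < 0.01  # cf. the 21332122 row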
AAG A 39.9993 PASS SCORE=0.9999;DP=131;RO=99;AO=32;AF=0.2443 GT:DP:RO:AO:AF 0/1:131:99:32:0.2443 From 2471f8e5e1983d7005671bff7a2d2b22e330c7ee Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 25 Jun 2020 11:16:25 -0700 Subject: [PATCH 62/89] fix resolve --- neusomatic/python/resolve_variants.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index cd170f7..e4359ba 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -358,6 +358,8 @@ def find_resolved_variants(input_record): inss_.extend(extract_ins(record)) aligned_pairs = np.array( record.get_aligned_pairs(matches_only=True)) + if len(aligned_pairs)==0: + continue near_pos = np.where((start <= aligned_pairs[:, 1]) & ( aligned_pairs[:, 1] <= end))[0] if len(near_pos) != 0: @@ -435,9 +437,10 @@ def find_resolved_variants(input_record): return out_variants_ except Exception as ex: + thread_logger.error("Error in {}".format(input_record)) thread_logger.error(traceback.format_exc()) thread_logger.error(ex) - return None + raise Exception def resolve_variants(input_bam, resolved_vcf, reference, target_vcf_file, From 544e2f23dfc9cb6b74487b5f3c776971eb0c5c1f Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 2 Jul 2020 13:20:22 -0700 Subject: [PATCH 63/89] small fix --- neusomatic/python/read_callers_vcf.py | 1 + neusomatic/python/scan_alignments.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/read_callers_vcf.py b/neusomatic/python/read_callers_vcf.py index 1114358..42c1428 100755 --- a/neusomatic/python/read_callers_vcf.py +++ b/neusomatic/python/read_callers_vcf.py @@ -415,6 +415,7 @@ def read_callers_vcf(reference, msi, msilen, shift3] chrom = "-".join(var_id.split("-")[:-3]) pos, ref, alt = var_id.split("-")[-3:] + ref, alt = ref.upper(), alt.upper() o_f.write( "\t".join([chrom, pos, ".", ref, alt] + list(map(lambda x: str(x).replace("nan", "0"), f))) + "\n") diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index b8aaf11..4bff295 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -41,12 +41,12 @@ def run_scan_alignments(record): os.mkdir(work) if merge_d_for_scan is not None: - split_region_file_=os.path.join(work,"merged_region.bed") + split_region_file_ = os.path.join(work, "merged_region.bed") tmp_ = bedtools_sort(split_region_file, run_logger=thread_logger) bedtools_merge( - tmp_, output_fn=split_region_file_ , args=" -d {}".format(merge_d_for_scan), run_logger=thread_logger) + tmp_, output_fn=split_region_file_, args=" -d {}".format(merge_d_for_scan), run_logger=thread_logger) else: - split_region_file_=split_region_file + split_region_file_ = split_region_file if os.path.getsize(split_region_file_) > 0: cmd = "{} --ref {} -b {} -L {} --out_vcf_file {}/candidates.vcf --out_count_file {}/count.bed \ @@ -90,7 +90,12 @@ def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam, split_len_ratio = 0.98 if not split_region_files: if regions_bed_file: - regions_bed = bedtools_sort(regions_bed_file, run_logger=logger) + regions_bed = get_tmp_file() + with open(regions_bed_file) as i_f, open(regions_bed, "w") as o_f: + for line in skip_empty(i_f): + chrom, st, en = line.strip().split()[0:3] + o_f.wirte("\t".join([chrom, st, en]) + "\n") + regions_bed = bedtools_sort(regions_bed, run_logger=logger) regions_bed = bedtools_merge( regions_bed, 
args=" -d 0", run_logger=logger) else: From 96d2091953c31980ee3c101136bcbb7db110290f Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 2 Jul 2020 13:21:22 -0700 Subject: [PATCH 64/89] small fix --- neusomatic/python/scan_alignments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/scan_alignments.py b/neusomatic/python/scan_alignments.py index 4bff295..ae522ec 100755 --- a/neusomatic/python/scan_alignments.py +++ b/neusomatic/python/scan_alignments.py @@ -94,7 +94,7 @@ def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam, with open(regions_bed_file) as i_f, open(regions_bed, "w") as o_f: for line in skip_empty(i_f): chrom, st, en = line.strip().split()[0:3] - o_f.wirte("\t".join([chrom, st, en]) + "\n") + o_f.write("\t".join([chrom, st, en]) + "\n") regions_bed = bedtools_sort(regions_bed, run_logger=logger) regions_bed = bedtools_merge( regions_bed, args=" -d 0", run_logger=logger) From fd2a2f7c1237df75860dc7288f17fb2c3dd7cb9d Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sun, 19 Jul 2020 01:46:49 -0700 Subject: [PATCH 65/89] repeat extension --- .../python/extract_postprocess_targets.py | 137 +++++++++++++++--- neusomatic/python/postprocess.py | 7 +- neusomatic/python/preprocess.py | 8 +- 3 files changed, 131 insertions(+), 21 deletions(-) diff --git a/neusomatic/python/extract_postprocess_targets.py b/neusomatic/python/extract_postprocess_targets.py index be0089e..bdbe40d 100755 --- a/neusomatic/python/extract_postprocess_targets.py +++ b/neusomatic/python/extract_postprocess_targets.py @@ -9,12 +9,78 @@ import logging import pysam -from utils import skip_empty +from utils import skip_empty, get_tmp_file, bedtools_sort, bedtools_merge from defaults import VCF_HEADER from resolve_variants import push_left_var -def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): +def check_rep(ref_seq, left_right, w): + logger = logging.getLogger(check_rep.__name__) + if len(ref_seq) < 2 * w: + return False + if left_right == "left": + return ref_seq[0:w] == ref_seq[w:2 * w] + elif left_right == "right": + return ref_seq[-w:] == ref_seq[-2 * w:-w] + else: + logger.error("Wrong left/right value: {}".format(left_right)) + raise Exception + + +def extend_region_repeat(chrom, start, end, ref_fasta, + chrom_length, pad): + logger = logging.getLogger(extend_region_repeat.__name__) + new_start = start + new_end = end + w = 3 + while True: + changed = False + new_start = max(new_start - pad - w, 1) + ref_seq = ref_fasta.fetch( + chrom, new_start, new_end + 1).upper() + while True: + cnt_s = 0 + for rep_len in [1, 2, 3, 4]: + if cnt_s > 0: + continue + while check_rep(ref_seq, "left", rep_len) and new_start > rep_len: + new_start -= rep_len + ref_seq = ref_fasta.fetch( + chrom, new_start, new_end + 1).upper() + cnt_s += rep_len + changed = True + if cnt_s > 0: + continue + if cnt_s == 0: + break + if not changed: + break + while True: + changed = False + new_end = min(new_end + pad + w, chrom_length - 2) + ref_seq = ref_fasta.fetch( + chrom, new_start, new_end + 1).upper() + while True: + cnt_e = 0 + for rep_len in [1, 2, 3, 4]: + if cnt_e > 0: + continue + while check_rep(ref_seq, "right", rep_len) and new_end < chrom_length - rep_len - 1: + new_end += rep_len + ref_seq = ref_fasta.fetch( + chrom, new_start, new_end + 1).upper() + cnt_e += rep_len + changed = True + if cnt_e > 0: + continue + if cnt_e == 0: + break + if not changed: + break + return new_start, new_end + + +def extract_postprocess_targets(reference, 
input_vcf, min_len, max_dist, extend_repeats, pad): logger = logging.getLogger(extract_postprocess_targets.__name__) logger.info("--------------Extract Postprocessing Targets---------------") @@ -28,9 +94,9 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): record_sets = [] record_set = [] - with open(input_vcf) as i_f, open(out_vcf, "w") as o_f, open(redo_vcf, "w") as r_f, open(redo_bed, "w") as r_b: - r_f.write("{}\n".format(VCF_HEADER)) - r_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") + redo_vars = {} + redo_regions = {} + with open(input_vcf) as i_f, open(out_vcf, "w") as o_f: for line in skip_empty(i_f): if len(line) < 2: continue @@ -54,7 +120,6 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): record_set.append(record) continue - if record_set: record_sets.append(record_set) record_set = [record] @@ -76,12 +141,13 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): if list(filter(lambda x: len(x[2]) != len(x[3]), record_set)) or multi_allelic: for x in record_set: fields = x[-1].strip().split() - fields[2] = str(ii) - r_f.write("\t".join(fields) + "\n") - r_b.write("\t".join(map(str, [record_set[0][0], max(0, min(map(lambda x:x[1], record_set)) - pad), - max(map(lambda x:x[ - 1] + len(x[2]), record_set)) + pad, ii, - ])) + "\n") + # fields[2] = str(ii) + if ii not in redo_vars: + redo_vars[ii] = [] + redo_vars[ii].append(fields) + redo_regions[ii] = [record_set[0][0], max(0, + min(map(lambda x:x[1], record_set)) - pad), + max(map(lambda x:x[1] + len(x[2]), record_set)) + pad] else: for x in record_set: o_f.write(x[-1]) @@ -89,15 +155,49 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): elif record_set: if abs(len(record_set[0][2]) - len(record_set[0][3])) >= min_len: fields = record_set[0][-1].strip().split() - fields[2] = str(ii) - r_f.write("\t".join(fields) + "\n") + # fields[2] = str(ii) + if ii not in redo_vars: + redo_vars[ii] = [] + redo_vars[ii].append(fields) chrom_, pos_, ref_, alt_ = record_set[0][0:4] - r_b.write("\t".join( - map(str, [chrom_, max(0, pos_ - pad), pos_ + len(ref_) + pad, ii])) + "\n") + redo_regions[ii] = [chrom_, max( + 0, pos_ - pad), pos_ + len(ref_) + pad] else: o_f.write(record_set[0][-1]) + if extend_repeats: + chrom_lengths = dict( + zip(ref_fasta.references, ref_fasta.lengths)) + tmp_ = get_tmp_file() + with open(tmp_, "w") as o_f: + for ii in redo_regions: + chrom, st, en = redo_regions[ii] + st, en = extend_region_repeat( + chrom, st, en, ref_fasta, chrom_lengths[chrom], 0) + o_f.write("\t".join(list(map(str, [chrom, st, en, ii]))) + "\n") + tmp_=bedtools_sort(tmp_,run_logger=logger) + tmp_=bedtools_merge(tmp_,args="-c 4 -o collapse", run_logger=logger) + else: + tmp_ = get_tmp_file() + with open(tmp_, "w") as o_f: + for ii in redo_regions: + chrom, st, en = redo_regions[ii] + o_f.write("\t".join(list(map(str, [chrom, st, en, ii]))) + "\n") + j = 0 + with open(tmp_) as i_f, open(redo_vcf, "w") as r_f, open(redo_bed, "w") as r_b: + r_f.write("{}\n".format(VCF_HEADER)) + r_f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n") + for line in skip_empty(i_f): + chrom, st, en, i_s = line.strip().split() + for i in list(map(int, i_s.split(","))): + for fields in redo_vars[i]: + fields[2] = str(j) + r_f.write("\t".join(fields) + "\n") + r_b.write("\t".join(list(map(str, [chrom, st, en, j]))) + "\n") + j += 1 + + if __name__ == '__main__': FORMAT = '%(levelname)s %(asctime)-15s 
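# After bedtools_merge with "-c 4 -o collapse", overlapping redo regions
# carry a comma-joined list of the original variant-set ids in column 4,
# which the renumbering loop above folds into one fresh id per merged
# region; e.g. for a made-up merged BED record:
line = "22\t100\t180\t3,7"
chrom, st, en, i_s = line.strip().split()
assert list(map(int, i_s.split(","))) == [3, 7]  # two sets, one redo region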
%(name)-20s %(message)s' @@ -114,13 +214,16 @@ def extract_postprocess_targets(reference, input_vcf, min_len, max_dist, pad): help='minimum INDEL len to resolve', default=4) parser.add_argument('--max_dist', type=int, help='max distance to neighboring variant', default=5) + parser.add_argument('--extend_repeats', + help='extend resolve regions to repeat boundaries', + action='store_true') parser.add_argument( '--pad', type=int, help='padding to bed region for extracting reads', default=10) args = parser.parse_args() logger.info(args) try: extract_postprocess_targets( - args.reference, args.input_vcf, args.min_len, args.max_dist, args.pad) + args.reference, args.input_vcf, args.min_len, args.max_dist, args.extend_repeats, args.pad) except Exception as e: logger.error(traceback.format_exc()) logger.error("Aborting!") diff --git a/neusomatic/python/postprocess.py b/neusomatic/python/postprocess.py index 32043a8..f5ed9a7 100755 --- a/neusomatic/python/postprocess.py +++ b/neusomatic/python/postprocess.py @@ -185,6 +185,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense lr_gap_open_penalty, lr_gap_ext_penalty, lr_max_realign_dp, lr_do_split, keep_duplicate, pass_threshold, lowqual_threshold, + extend_repeats, msa_binary, num_threads): logger = logging.getLogger(postprocess.__name__) @@ -211,7 +212,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense logger.info("Extract targets") postprocess_pad = 1 if not long_read else 10 extract_postprocess_targets( - reference, candidates_preds, min_len, postprocess_max_dist, postprocess_pad) + reference, candidates_preds, min_len, postprocess_max_dist, extend_repeats, postprocess_pad) no_resolve = os.path.join(work, "candidates_preds.no_resolve.vcf") target_vcf = os.path.join(work, "candidates_preds.resolve_target.vcf") @@ -329,6 +330,9 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense parser.add_argument('--keep_duplicate', help='Dont filter duplicate reads in analysis', action="store_true") + parser.add_argument('--extend_repeats', + help='extend resolve regions to repeat boundaries', + action='store_true') parser.add_argument('--msa_binary', type=str, help='MSA binary', default="../bin/msa") parser.add_argument('--num_threads', type=int, @@ -351,6 +355,7 @@ def postprocess(work, reference, pred_vcf_file, output_vcf, candidates_vcf, ense args.lr_do_split, args.keep_duplicate, args.pass_threshold, args.lowqual_threshold, + args.extend_repeats, args.msa_binary, args.num_threads) except Exception as e: diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index 3b470e8..e6b5559 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -210,6 +210,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, window_extend, max_cluster_size, merge_d_for_scan, + use_vscore, num_splits, num_threads, scan_alignments_binary,): @@ -257,7 +258,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, "The dbSNP file should be a tabix indexed file with .vcf.gz format. 
No {}.tbi file exists.".format(dbsnp)) zero_vscore = False - if not ensemble_tsv and add_extra_features: + if (not ensemble_tsv and add_extra_features) and not use_vscore: zero_vscore = True ensemble_bed = None @@ -693,8 +694,8 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, parser.add_argument('--merge_d_for_scan', type=int, help='-d used to merge regions before scan', default=None) - parser.add_argument('--zero_vscore', - help='set VarScan2_Score to zero', + parser.add_argument('--use_vscore', + help='don\'t set VarScan2_Score to zero', action="store_true") parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) @@ -722,6 +723,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.window_extend, args.max_cluster_size, args.merge_d_for_scan, + args.use_vscore, args.num_splits, args.num_threads, args.scan_alignments_binary) From cca94c82837cab5867c5af1fd2330bd4e6fa1d58 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 24 Jul 2020 01:31:40 -0700 Subject: [PATCH 66/89] small fix --- neusomatic/python/resolve_variants.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/neusomatic/python/resolve_variants.py b/neusomatic/python/resolve_variants.py index e4359ba..4e7c009 100755 --- a/neusomatic/python/resolve_variants.py +++ b/neusomatic/python/resolve_variants.py @@ -221,10 +221,14 @@ def resolve_group(ref_fasta, variants, vars_count): if len(gts) > 1: gts_count = {"0/1": 0, "0/0": 0} gts_score = {"0/1": 0, "0/0": 0} + nz = 0 for x in group_vars[pos]: if (x.gt != "0/0" or x.len >= 3) and x.cnt >= 0.4 * mx: gts_count[x.gt] += x.cnt gts_score[x.gt] += x.score + nz += 1 + if nz == 0: + continue priority = {"0/1": 2, "0/0": 1} sorted_gts = sorted(gts_count.keys(), key=lambda x: [ gts_count[x], gts_score[x], @@ -311,6 +315,7 @@ def resolve_group(ref_fasta, variants, vars_count): for v in good_vs: out_variants_.append( [v.chrom, v.pos, v.ref, v.alt, v.gt, v.score]) + out_variants_ = [x for x in out_variants_ if x[4] != "0/0"] return out_variants_ @@ -358,7 +363,7 @@ def find_resolved_variants(input_record): inss_.extend(extract_ins(record)) aligned_pairs = np.array( record.get_aligned_pairs(matches_only=True)) - if len(aligned_pairs)==0: + if len(aligned_pairs) == 0: continue near_pos = np.where((start <= aligned_pairs[:, 1]) & ( aligned_pairs[:, 1] <= end))[0] From b4d2bdf3ed39bd6be0c6d22646128f51ae208458 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 28 Jul 2020 23:12:14 -0700 Subject: [PATCH 67/89] improve cpu multi-thread call.py --- neusomatic/python/call.py | 262 ++++++++++++++++++++++++-------- neusomatic/python/dataloader.py | 25 +-- 2 files changed, 217 insertions(+), 70 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 52f3a69..79ea73f 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -25,7 +25,7 @@ from network import NeuSomaticNet from dataloader import NeuSomaticDataset, matrix_transform -from utils import get_chromosomes_order, prob2phred +from utils import get_chromosomes_order, prob2phred, skip_empty from merge_tsvs import merge_tsvs from defaults import VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES @@ -349,15 +349,21 @@ def pred_vcf_records(ref_file, final_preds, true_path, chroms, num_threads): map_args.append([path, true_path[path], final_preds[path], chroms, ref_file]) - pool = multiprocessing.Pool(num_threads) - try: - all_vcf_records = pool.map_async(pred_vcf_records_path, 
map_args).get() - pool.close() - except Exception as inst: - logger.error(inst) - pool.close() - traceback.print_exc() - raise Exception + if num_threads == 1: + all_vcf_records = [] + for w in map_args: + all_vcf_records.append(pred_vcf_records_path(w)) + else: + pool = multiprocessing.Pool(num_threads) + try: + all_vcf_records = pool.map_async( + pred_vcf_records_path, map_args).get() + pool.close() + except Exception as inst: + logger.error(inst) + pool.close() + traceback.print_exc() + raise Exception for o in all_vcf_records: if o is None: @@ -422,6 +428,78 @@ def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_thr lines.append(line) +def write_merged_vcf(output_vcfs, output_vcf, chroms_order): + logger = logging.getLogger(write_merged_vcf.__name__) + vcf_records = [] + for vcf in output_vcfs: + with open(vcf) as i_f: + for line in skip_empty(i_f): + x = line.strip().split() + vcf_records.append([x[0], int(x[1]), line]) + vcf_records = sorted(vcf_records, key=lambda x: [chroms_order[x[0]], x[1]]) + lines = [] + with open(output_vcf, "w") as ov: + for chrom_, pos_, line in vcf_records: + if line not in lines: + ov.write(line) + lines.append(line) + + +def single_thread_call(record): + thread_logger = logging.getLogger( + "{} ({})".format(single_thread_call.__name__, multiprocessing.current_process().name)) + try: + torch.set_num_threads(1) + net, candidate_files, max_load_candidates, data_transform, \ + coverage_thr, normalize_channels, zero_ann_cols, batch_size, \ + out_dir, model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, \ + pass_threshold, lowqual_threshold, i = record + + call_set = NeuSomaticDataset(roots=candidate_files, + max_load_candidates=max_load_candidates, + transform=data_transform, is_test=True, + num_threads=1, + coverage_thr=coverage_thr, + normalize_channels=normalize_channels, + zero_ann_cols=zero_ann_cols) + call_loader = torch.utils.data.DataLoader(call_set, + batch_size=batch_size, + shuffle=True, # pin_memory=True, + num_workers=0) + logger.info("N_dataset: {}".format(len(call_set))) + if len(call_set) == 0: + logger.warning( + "Skip {} with 0 candidates".format(candidate_file)) + return [], [] + + final_preds_, none_preds_, true_path_ = call_variants( + net, call_loader, out_dir, model_tag, use_cuda) + all_vcf_records = pred_vcf_records( + ref_file, final_preds_, true_path_, chroms, 1) + all_vcf_records_none = pred_vcf_records_none(none_preds_, chroms) + + all_vcf_records = dict(all_vcf_records) + all_vcf_records_none = dict(all_vcf_records_none) + + var_vcf_records = get_vcf_records(all_vcf_records) + vcf_records_none = get_vcf_records(all_vcf_records_none) + + output_vcf = "{}/pred_{}.vcf".format(tmp_preds_dir, i) + write_vcf(var_vcf_records, output_vcf, chroms_order, + pass_threshold, lowqual_threshold) + + logger.info("Prepare Non-Somatics VCF") + output_vcf_none = "{}/none_{}.vcf".format(tmp_preds_dir, i) + write_vcf(vcf_records_none, output_vcf_none, + chroms_order, pass_threshold, lowqual_threshold) + + return output_vcf, output_vcf_none + except Exception as ex: + thread_logger.error(traceback.format_exc()) + thread_logger.error(ex) + return None + + def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, batch_size, max_load_candidates, pass_threshold, lowqual_threshold, force_zero_ann_cols, @@ -562,6 +640,13 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, Ls = [] candidates_tsv_ = [] split_i = 0 + total_L = 0 + for candidate_file in candidates_tsv: + total_L 
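# pred_vcf_records and the new single_thread_call path above bypass the
# multiprocessing pool entirely when num_threads == 1, avoiding pickling
# and worker-startup overhead; the same dispatch pattern in isolation
# (work() and its inputs are placeholders, not NeuSomatic functions):
import multiprocessing

def work(x):
    return x * x

def run_all(inputs, num_threads):
    if num_threads == 1:
        return [work(x) for x in inputs]
    pool = multiprocessing.Pool(num_threads)
    try:
        results = pool.map_async(work, inputs).get()
        pool.close()
    except Exception:
        pool.close()
        raise
    return results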
+= len(pickle.load(open(candidate_file + ".idx", "rb"))) + logger.info("Total number of candidates: {}".format(total_L)) + if not use_cuda: + max_load_candidates = min( + max_load_candidates, 3 * total_L // num_threads) for candidate_file in candidates_tsv: idx = pickle.load(open(candidate_file + ".idx", "rb")) if len(idx) > max_load_candidates / 2: @@ -595,59 +680,114 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, candidate_files = [] all_vcf_records = [] all_vcf_records_none = [] - for i, (candidate_file, L) in enumerate(sorted(zip(candidates_tsv_, Ls), key=lambda x: x[1])): - current_L += L - candidate_files.append(candidate_file) - if current_L > max_load_candidates / 10 or i == len(candidates_tsv_) - 1: - logger.info("Run for candidate files: {}".format(candidate_files)) - call_set = NeuSomaticDataset(roots=candidate_files, - max_load_candidates=max_load_candidates, - transform=data_transform, is_test=True, - num_threads=num_threads, - coverage_thr=coverage_thr, - normalize_channels=normalize_channels, - zero_ann_cols=zero_ann_cols) - call_loader = torch.utils.data.DataLoader(call_set, - batch_size=batch_size, - shuffle=True, pin_memory=True, - num_workers=num_threads) - - current_L = 0 - candidate_files = [] - - logger.info("N_dataset: {}".format(len(call_set))) - if len(call_set) == 0: - logger.warning( - "Skip {} with 0 candidates".format(candidate_file)) - continue - - final_preds_, none_preds_, true_path_ = call_variants( - net, call_loader, out_dir, model_tag, use_cuda) - all_vcf_records.extend(pred_vcf_records( - ref_file, final_preds_, true_path_, chroms, num_threads)) - all_vcf_records_none.extend( - pred_vcf_records_none(none_preds_, chroms)) - - all_vcf_records = dict(all_vcf_records) - all_vcf_records_none = dict(all_vcf_records_none) - - if os.path.exists(new_split_tsvs_dir): + if use_cuda: + for i, (candidate_file, L) in enumerate(sorted(zip(candidates_tsv_, Ls), key=lambda x: x[1])): + current_L += L + candidate_files.append(candidate_file) + if current_L > max_load_candidates / 10 or i == len(candidates_tsv_) - 1: + logger.info( + "Run for candidate files: {}".format(candidate_files)) + call_set = NeuSomaticDataset(roots=candidate_files, + max_load_candidates=max_load_candidates, + transform=data_transform, is_test=True, + num_threads=num_threads, + coverage_thr=coverage_thr, + normalize_channels=normalize_channels, + zero_ann_cols=zero_ann_cols) + call_loader = torch.utils.data.DataLoader(call_set, + batch_size=batch_size, + shuffle=True, pin_memory=True, + num_workers=num_threads) + + current_L = 0 + candidate_files = [] + + logger.info("N_dataset: {}".format(len(call_set))) + if len(call_set) == 0: + logger.warning( + "Skip {} with 0 candidates".format(candidate_file)) + continue + + final_preds_, none_preds_, true_path_ = call_variants( + net, call_loader, out_dir, model_tag, use_cuda) + all_vcf_records.extend(pred_vcf_records( + ref_file, final_preds_, true_path_, chroms, num_threads)) + all_vcf_records_none.extend( + pred_vcf_records_none(none_preds_, chroms)) + all_vcf_records = dict(all_vcf_records) + all_vcf_records_none = dict(all_vcf_records_none) + + logger.info("Prepare Output VCF") + output_vcf = "{}/pred.vcf".format(out_dir) + var_vcf_records = get_vcf_records(all_vcf_records) + write_vcf(var_vcf_records, output_vcf, chroms_order, + pass_threshold, lowqual_threshold) + + logger.info("Prepare Non-Somatics VCF") + output_vcf_none = "{}/none.vcf".format(out_dir) + vcf_records_none = get_vcf_records(all_vcf_records_none) + 
write_vcf(vcf_records_none, output_vcf_none, + chroms_order, pass_threshold, lowqual_threshold) + else: + tmp_preds_dir = os.path.join(out_dir, "tmp_preds") + if os.path.exists(tmp_preds_dir): + logger.warning( + "Remove tmp_preds directory: {}".format(tmp_preds_dir)) + shutil.rmtree(tmp_preds_dir) + os.mkdir(tmp_preds_dir) + + map_args = [] + j = 0 + for i, (candidate_file, L) in enumerate(sorted(zip(candidates_tsv_, Ls), key=lambda x: x[1])): + current_L += L + candidate_files.append(candidate_file) + if current_L > max_load_candidates / 10 or i == len(candidates_tsv_) - 1: + logger.info( + "Run for candidate files: {}".format(candidate_files)) + + map_args.append([net, candidate_files, max_load_candidates, data_transform, + coverage_thr, normalize_channels, zero_ann_cols, batch_size, + out_dir, + model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, + pass_threshold, lowqual_threshold, j]) + j += 1 + current_L = 0 + candidate_files = [] + + pool = multiprocessing.Pool(num_threads) + try: + all_records = pool.map_async(single_thread_call, map_args).get() + pool.close() + except Exception as inst: + logger.error(inst) + pool.close() + traceback.print_exc() + raise Exception + + for o in all_records: + if o is None: + raise Exception("single_thread_call failed!") + + output_vcfs = [x[0] for x in all_records] + output_vcfs_none = [x[1] for x in all_records] + + logger.info("Prepare Output VCF") + output_vcf = "{}/pred.vcf".format(out_dir) + write_merged_vcf(output_vcfs, output_vcf, chroms_order) + + logger.info("Prepare Non-Somatics VCF") + output_vcf_none = "{}/none.vcf".format(out_dir) + write_merged_vcf(output_vcfs_none, output_vcf_none, chroms_order) + + if os.path.exists(tmp_preds_dir): + logger.warning( + "Remove tmp_preds directory: {}".format(tmp_preds_dir)) + shutil.rmtree(tmp_preds_dir) + + if os.path.exists(tmp_preds_dir): logger.warning( "Remove split candidates directory: {}".format(new_split_tsvs_dir)) shutil.rmtree(new_split_tsvs_dir) - - logger.info("Prepare Output VCF") - output_vcf = "{}/pred.vcf".format(out_dir) - var_vcf_records = get_vcf_records(all_vcf_records) - write_vcf(var_vcf_records, output_vcf, chroms_order, - pass_threshold, lowqual_threshold) - - logger.info("Prepare Non-Somatics VCF") - output_vcf_none = "{}/none.vcf".format(out_dir) - vcf_records_none = get_vcf_records(all_vcf_records_none) - write_vcf(vcf_records_none, output_vcf_none, - chroms_order, pass_threshold, lowqual_threshold) - if os.path.exists(matrices_dir): logger.warning("Remove matrices directory: {}".format(matrices_dir)) shutil.rmtree(matrices_dir) @@ -693,6 +833,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, default=[]) args = parser.parse_args() + logger.info(args) + use_cuda = torch.cuda.is_available() logger.info("use_cuda: {}".format(use_cuda)) diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py index c2450dd..76a3773 100755 --- a/neusomatic/python/dataloader.py +++ b/neusomatic/python/dataloader.py @@ -195,16 +195,21 @@ def __init__(self, roots, max_load_candidates, transform=None, if len(map_args) == 1: records_ = [extract_info_tsv(map_args[0])] else: - pool = multiprocessing.Pool(num_threads) - try: - records_ = pool.map_async( - extract_info_tsv, map_args).get() - pool.close() - except Exception as inst: - pool.close() - logger.error(inst) - traceback.print_exc() - raise Exception + if num_threads==1: + records_ = [] + for w in map_args: + records_.append(extract_info_tsv(w)) + else: + pool = 
multiprocessing.Pool(num_threads) + try: + records_ = pool.map_async( + extract_info_tsv, map_args).get() + pool.close() + except Exception as inst: + pool.close() + logger.error(inst) + traceback.print_exc() + raise Exception for o in records_: if o is None: From a35932764d64b4db96a2facf74419acf47eab67e Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 30 Jul 2020 00:43:48 -0700 Subject: [PATCH 68/89] small fix --- neusomatic/python/call.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 79ea73f..7835778 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -784,7 +784,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, "Remove tmp_preds directory: {}".format(tmp_preds_dir)) shutil.rmtree(tmp_preds_dir) - if os.path.exists(tmp_preds_dir): + if os.path.exists(new_split_tsvs_dir): logger.warning( "Remove split candidates directory: {}".format(new_split_tsvs_dir)) shutil.rmtree(new_split_tsvs_dir) From f6d3174a03b4904083b11adcb3df3e7cb0efe54d Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 30 Jul 2020 14:00:12 -0700 Subject: [PATCH 69/89] updated dockerfile --- docker/Dockerfile | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index f6bdbc9..af16a44 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,19 +2,19 @@ FROM ubuntu:18.04 ENV NEUSOMATIC_VERSION 0.3.0 ENV ZLIB_VERSION 1.2.11 -ENV NUMPY_VERSION 1.18.1 -ENV SCIPY_VERSION 1.4.1 -ENV IMAGEIO_VERSION 2.8.0 -ENV PILLOW_VERSION 7.1.2 -ENV PYTORCH_VERSION 1.4.0 -ENV TORCHVISION_VERSION 0.5.0 -ENV CUDATOOLKIT_VERSION 9.2 +ENV NUMPY_VERSION 1.18.5 +ENV SCIPY_VERSION 1.5.0 +ENV IMAGEIO_VERSION 2.9.0 +ENV PILLOW_VERSION 7.2.0 +ENV PYTORCH_VERSION 1.6.0 +ENV TORCHVISION_VERSION 0.7.0 +ENV CUDATOOLKIT_VERSION 10.1 ENV CMAKE_VERSION 3.14.0 ENV PYSAM_VERSION 0.15.3 ENV SAMTOOLS_VERSION 1.9 ENV TABIX_VERSION 0.2.6 ENV BEDTOOLS_VERSION 2.29.2 -ENV BIOPYTHON_VERSION 1.76 +ENV BIOPYTHON_VERSION 1.77 ENV FISHER_VERSION 0.1.9 ENV GCC_VERSION 5 @@ -22,14 +22,13 @@ RUN apt-get update && apt-get install -y --fix-missing \ build-essential zlib1g-dev curl less vim bzip2 RUN apt-get install -y --fix-missing git wget -RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -RUN bash Miniconda3-latest-Linux-x86_64.sh -p /miniconda -b -RUN rm Miniconda3-latest-Linux-x86_64.sh +RUN curl -LO https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh +RUN bash Miniconda3-py37_4.8.3-Linux-x86_64.sh -p /miniconda -b +RUN rm Miniconda3-py37_4.8.3-Linux-x86_64.sh ENV PATH=/miniconda/bin:${PATH} ENV LD_LIBRARY_PATH=/miniconda/lib:${LD_LIBRARY_PATH} RUN conda update -y conda - RUN conda install -y zlib=${ZLIB_VERSION} numpy=${NUMPY_VERSION} scipy=${SCIPY_VERSION} \ pillow=${PILLOW_VERSION} cmake=${CMAKE_VERSION} imageio=${IMAGEIO_VERSION} && conda clean -a RUN conda install -y fisher=${FISHER_VERSION} -c conda-forge && conda clean -a From accb759a7b1fc4f420c71b8630247f3b59c617ad Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 30 Jul 2020 14:03:03 -0700 Subject: [PATCH 70/89] updated docker test --- test/docker_test.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/docker_test.sh b/test/docker_test.sh index 4a8f34f..191f838 100755 --- a/test/docker_test.sh +++ b/test/docker_test.sh @@ -10,16 +10,16 @@ if [ ! 
-f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa ] then if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "cd /mnt/example/ && wget ftp://ftp.ensembl.org/pub/release-75//fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi - docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "cd /mnt/example/ && gunzip -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.gz" fi if [ ! -f Homo_sapiens.GRCh37.75.dna.chromosome.22.fa.fai ] then - docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ + docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "samtools faidx /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa" fi rm -rf work_standalone @@ -27,7 +27,7 @@ rm -rf work_standalone #Stand-alone NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -45,7 +45,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash --num_threads 1 \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:test /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv /mnt/example/work_standalone/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -54,7 +54,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3 --num_threads 1 \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ @@ -66,7 +66,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash rm -rf /mnt/example/work_ensemble #Ensemble NeuSomatic test -docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/preprocess.py \ --mode call \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -85,7 +85,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash --ensemble_tsv /mnt/ensemble.tsv \ --scan_alignments_binary /opt/neusomatic/neusomatic/bin/scan_alignments" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G msahraeian/neusomatic:test /bin/bash -c \ "CUDA_VISIBLE_DEVICES= python /opt/neusomatic/neusomatic/python/call.py \ --candidates_tsv 
/mnt/example/work_ensemble/dataset/*/candidates*.tsv \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ @@ -95,7 +95,7 @@ docker run -v ${test_dir}:/mnt -u $UID --memory 30G --shm-size 8G neusomatic:0.3 --ensemble \ --batch_size 100" -docker run -v ${test_dir}:/mnt -u $UID --memory 30G neusomatic:0.3.0 /bin/bash -c \ +docker run -v ${test_dir}:/mnt -u $UID --memory 30G msahraeian/neusomatic:test /bin/bash -c \ "python /opt/neusomatic/neusomatic/python/postprocess.py \ --reference /mnt/example/Homo_sapiens.GRCh37.75.dna.chromosome.22.fa \ --tumor_bam /mnt/tumor.bam \ From 9f5d876580053824ec0263d76ad7ee20c01a9757 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 18 Sep 2020 21:52:37 -0700 Subject: [PATCH 71/89] fix build --- build.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/build.sh b/build.sh index c3fbf6d..e40035f 100755 --- a/build.sh +++ b/build.sh @@ -2,6 +2,9 @@ set -e DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd -P)" +if [ -d ${DIR}/neusomatic/build ]; then + rm -rf ${DIR}/neusomatic/build +fi rm -rf $DIR/third_party/SeqLib/ $DIR/third_party/seqan/ pushd $DIR/neusomatic mkdir build From e500bf7ac727578e6b56b72f679e61c623e16df0 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 6 Oct 2020 13:47:34 -0700 Subject: [PATCH 72/89] small_fix --- build.sh | 1 + neusomatic/python/call.py | 32 ++++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/build.sh b/build.sh index e40035f..db9ba66 100755 --- a/build.sh +++ b/build.sh @@ -13,3 +13,4 @@ pushd $DIR/neusomatic make popd popd +rm -rf $DIR/third_party/SeqLib/ $DIR/third_party/seqan/ diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 7835778..dc7b379 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -408,7 +408,8 @@ def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_thr logger = logging.getLogger(write_vcf.__name__) vcf_records = list(filter(lambda x: len(x) > 0, vcf_records)) vcf_records = sorted(vcf_records, key=lambda x: [chroms_order[x[0]], x[1]]) - lines = [] + old_pos = "" + lines_old_pos = set([]) with open(output_vcf, "w") as ov: for chrom_, pos_, ref_, alt_, prob in vcf_records: if ref_ == alt_: @@ -423,9 +424,18 @@ def write_vcf(vcf_records, output_vcf, chroms_order, pass_threshold, lowqual_thr filter_, "SCORE={:.4f}".format( np.round(prob, 4)), "GT", "0/1"]) + "\n" - if line not in lines: + curr_pos = "-".join([chrom_, str(pos_)]) + emit = False + if old_pos != curr_pos: + old_pos = curr_pos + lines_old_pos = set([line]) + emit = True + else: + if line not in lines_old_pos: + emit = True + lines_old_pos.add(line) + if emit: ov.write(line) - lines.append(line) def write_merged_vcf(output_vcfs, output_vcf, chroms_order): @@ -437,12 +447,22 @@ def write_merged_vcf(output_vcfs, output_vcf, chroms_order): x = line.strip().split() vcf_records.append([x[0], int(x[1]), line]) vcf_records = sorted(vcf_records, key=lambda x: [chroms_order[x[0]], x[1]]) - lines = [] + old_pos = "" + lines_old_pos = set([]) with open(output_vcf, "w") as ov: for chrom_, pos_, line in vcf_records: - if line not in lines: + curr_pos = "-".join([chrom_, str(pos_)]) + emit = False + if old_pos != curr_pos: + old_pos = curr_pos + lines_old_pos = set([line]) + emit = True + else: + if line not in lines_old_pos: + emit = True + lines_old_pos.add(line) + if emit: ov.write(line) - lines.append(line) def single_thread_call(record): From e48e15d428a5db762d623914ca2b3b5a87199442 Mon Sep 17 00:00:00 2001 From: 
Sahraeian Date: Sat, 7 Nov 2020 00:18:13 -0800 Subject: [PATCH 73/89] force cov_thr --- neusomatic/python/call.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index dc7b379..5cf437a 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -523,6 +523,7 @@ def single_thread_call(record): def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, batch_size, max_load_candidates, pass_threshold, lowqual_threshold, force_zero_ann_cols, + force_cov_thr, use_cuda): logger = logging.getLogger(call_neusomatic.__name__) @@ -568,6 +569,12 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, "Override zero_ann_cols from force_zero_ann_cols: {}".format(force_zero_ann_cols)) zero_ann_cols = force_zero_ann_cols + if force_cov_thr is not None: + logger.info( + "Override coverage_thr from force_cov_thr: {}".format(force_cov_thr)) + coverage_thr = force_cov_thr + + logger.info("coverage_thr: {}".format(coverage_thr)) logger.info("normalize_channels: {}".format(normalize_channels)) logger.info("no_seq_complexity: {}".format(no_seq_complexity)) @@ -851,6 +858,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, --zero_ann_cols and pretrained setting.\ idx starts from 5th column in candidate.tsv file', default=[]) + parser.add_argument('--force_cov_thr', type=int, + help='Force maximum coverage threshold.', default=None) args = parser.parse_args() logger.info(args) @@ -864,6 +873,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, args.num_threads, args.batch_size, args.max_load_candidates, args.pass_threshold, args.lowqual_threshold, args.force_zero_ann_cols, + args.force_cov_thr, use_cuda) except Exception as e: logger.error(traceback.format_exc()) From a96a2367d1a53e2d4a54cd1c1be41f50704cb224 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sun, 8 Nov 2020 20:35:11 -0800 Subject: [PATCH 74/89] fix max_cov --- neusomatic/python/call.py | 20 ++++++++++---------- neusomatic/python/dataloader.py | 11 ++++++++--- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 5cf437a..f57c638 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -471,7 +471,7 @@ def single_thread_call(record): try: torch.set_num_threads(1) net, candidate_files, max_load_candidates, data_transform, \ - coverage_thr, normalize_channels, zero_ann_cols, batch_size, \ + coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, \ out_dir, model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, \ pass_threshold, lowqual_threshold, i = record @@ -480,6 +480,7 @@ def single_thread_call(record): transform=data_transform, is_test=True, num_threads=1, coverage_thr=coverage_thr, + max_cov=max_cov, normalize_channels=normalize_channels, zero_ann_cols=zero_ann_cols) call_loader = torch.utils.data.DataLoader(call_set, @@ -523,7 +524,7 @@ def single_thread_call(record): def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, batch_size, max_load_candidates, pass_threshold, lowqual_threshold, force_zero_ann_cols, - force_cov_thr, + max_cov, use_cuda): logger = logging.getLogger(call_neusomatic.__name__) @@ -569,11 +570,9 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, "Override zero_ann_cols from force_zero_ann_cols: {}".format(force_zero_ann_cols)) zero_ann_cols = force_zero_ann_cols - if 
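The `write_vcf`/`write_merged_vcf` rewrite in PATCH 72 above changes duplicate suppression from a single `lines` list, whose `line not in lines` test rescans every record written so far, to a set that is reset whenever the sorted records advance to a new chromosome-position key: membership tests become O(1) and memory is bounded by the deepest single position. A sketch of that pattern, assuming records arrive pre-sorted by (chrom, pos):

```
def dedup_sorted(records):
    # records: iterable of (chrom, pos, line) tuples, sorted by (chrom, pos).
    old_key = None
    seen_at_pos = set()
    for chrom, pos, line in records:
        key = (chrom, pos)
        if key != old_key:
            # New position: forget lines seen at the previous position.
            old_key = key
            seen_at_pos = {line}
            yield line
        elif line not in seen_at_pos:
            seen_at_pos.add(line)
            yield line
```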
force_cov_thr is not None: + if max_cov is not None: logger.info( - "Override coverage_thr from force_cov_thr: {}".format(force_cov_thr)) - coverage_thr = force_cov_thr - + "Set max_cov: {}".format(max_cov)) logger.info("coverage_thr: {}".format(coverage_thr)) logger.info("normalize_channels: {}".format(normalize_channels)) @@ -719,6 +718,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, transform=data_transform, is_test=True, num_threads=num_threads, coverage_thr=coverage_thr, + max_cov=max_cov, normalize_channels=normalize_channels, zero_ann_cols=zero_ann_cols) call_loader = torch.utils.data.DataLoader(call_set, @@ -773,7 +773,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, "Run for candidate files: {}".format(candidate_files)) map_args.append([net, candidate_files, max_load_candidates, data_transform, - coverage_thr, normalize_channels, zero_ann_cols, batch_size, + coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, out_dir, model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, pass_threshold, lowqual_threshold, j]) @@ -858,8 +858,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, --zero_ann_cols and pretrained setting.\ idx starts from 5th column in candidate.tsv file', default=[]) - parser.add_argument('--force_cov_thr', type=int, - help='Force maximum coverage threshold.', default=None) + parser.add_argument('--max_cov', type=int, + help='maximum coverage threshold.', default=None) args = parser.parse_args() logger.info(args) @@ -873,7 +873,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, args.num_threads, args.batch_size, args.max_load_candidates, args.pass_threshold, args.lowqual_threshold, args.force_zero_ann_cols, - args.force_cov_thr, + args.max_cov, use_cuda) except Exception as e: logger.error(traceback.format_exc()) diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py index 76a3773..71fe47b 100755 --- a/neusomatic/python/dataloader.py +++ b/neusomatic/python/dataloader.py @@ -130,6 +130,7 @@ def __init__(self, roots, max_load_candidates, transform=None, loader=candidate_loader_tsv, is_test=False, num_threads=1, disable_ensemble=False, data_augmentation=False, nclasses_t=4, nclasses_l=4, coverage_thr=100, + max_cov=None, normalize_channels=False, zero_ann_cols=[], max_opended_tsv=-1): @@ -195,7 +196,7 @@ def __init__(self, roots, max_load_candidates, transform=None, if len(map_args) == 1: records_ = [extract_info_tsv(map_args[0])] else: - if num_threads==1: + if num_threads == 1: records_ = [] for w in map_args: records_.append(extract_info_tsv(w)) @@ -237,6 +238,7 @@ def __init__(self, roots, max_load_candidates, transform=None, self.disable_ensemble = disable_ensemble self.data_augmentation = data_augmentation self.coverage_thr = coverage_thr + self.max_cov = max_cov def open_candidate_tsvs(self): for i, tsv in enumerate(self.tsvs): @@ -271,8 +273,8 @@ def __getitem__(self, index): if self.disable_ensemble: anns = [] - if self.zero_ann_cols and len(anns)>0: - anns=np.array(anns) + if self.zero_ann_cols and len(anns) > 0: + anns = np.array(anns) anns[self.zero_ann_cols] = 0 anns = anns.tolist() @@ -281,6 +283,9 @@ def __getitem__(self, index): ".") tumor_cov = int(tumor_cov) normal_cov = int(normal_cov) + if self.max_cov is not None: + tumor_cov = min(tumor_cov, self.max_cov) + normal_cov = min(normal_cov, self.max_cov) center = int(center) length = int(length) From 
d804742168d111182f222163deb8fa29c7e41259 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Sat, 5 Dec 2020 22:04:50 -0800 Subject: [PATCH 75/89] fixed matrices gradual delete --- neusomatic/python/call.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index f57c638..db1b55b 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -53,7 +53,7 @@ def get_type(ref, alt): return "SNP" -def call_variants(net, call_loader, out_dir, model_tag, use_cuda): +def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): logger = logging.getLogger(call_variants.__name__) net.eval() nclasses = len(VARTYPE_CLASSES) @@ -94,8 +94,8 @@ def call_variants(net, call_loader, out_dir, model_tag, use_cuda): path = path_.split("/")[-1] preds[i] = [VARTYPE_CLASSES[predicted[i]], pos_pred[i], len_pred[i]] if VARTYPE_CLASSES[predicted[i]] != "NONE": - file_name = "{}/matrices_{}/{}.png".format( - out_dir, model_tag, path) + file_name = "{}/matrices_{}/{}/{}.{}_{}.png".format( + out_dir, model_tag, run_i, path, iii, i) if not os.path.exists(file_name): imwrite(file_name, np.array( non_transformed_matrices[i, :, :, 0:3])) @@ -494,7 +494,7 @@ def single_thread_call(record): return [], [] final_preds_, none_preds_, true_path_ = call_variants( - net, call_loader, out_dir, model_tag, use_cuda) + net, call_loader, out_dir, model_tag, i, use_cuda) all_vcf_records = pred_vcf_records( ref_file, final_preds_, true_path_, chroms, 1) all_vcf_records_none = pred_vcf_records_none(none_preds_, chroms) @@ -513,6 +513,10 @@ def single_thread_call(record): output_vcf_none = "{}/none_{}.vcf".format(tmp_preds_dir, i) write_vcf(vcf_records_none, output_vcf_none, chroms_order, pass_threshold, lowqual_threshold) + matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, i) + if os.path.exists(matrices_dir_j): + logger.warning("Done with {}. Remove matrices directory {}: {}".format(i, i, matrices_dir_j)) + shutil.rmtree(matrices_dir_j) return output_vcf, output_vcf_none except Exception as ex: @@ -707,6 +711,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, all_vcf_records = [] all_vcf_records_none = [] if use_cuda: + run_i = -1 for i, (candidate_file, L) in enumerate(sorted(zip(candidates_tsv_, Ls), key=lambda x: x[1])): current_L += L candidate_files.append(candidate_file) @@ -728,19 +733,30 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, current_L = 0 candidate_files = [] - + run_i += 1 logger.info("N_dataset: {}".format(len(call_set))) if len(call_set) == 0: logger.warning( "Skip {} with 0 candidates".format(candidate_file)) continue + matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, run_i) + if os.path.exists(matrices_dir_j): + logger.warning("Remove matrices directory {}: {}".format(run_i, matrices_dir_j)) + shutil.rmtree(matrices_dir_j) + os.mkdir(matrices_dir_j) + final_preds_, none_preds_, true_path_ = call_variants( - net, call_loader, out_dir, model_tag, use_cuda) + net, call_loader, out_dir, model_tag, run_i, use_cuda) all_vcf_records.extend(pred_vcf_records( ref_file, final_preds_, true_path_, chroms, num_threads)) all_vcf_records_none.extend( pred_vcf_records_none(none_preds_, chroms)) + + if os.path.exists(matrices_dir_j): + logger.warning("Done with {}. 
Remove matrices directory {}: {}".format(run_i, run_i, matrices_dir_j)) + shutil.rmtree(matrices_dir_j) + all_vcf_records = dict(all_vcf_records) all_vcf_records_none = dict(all_vcf_records_none) @@ -772,6 +788,11 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info( "Run for candidate files: {}".format(candidate_files)) + matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, j) + if os.path.exists(matrices_dir_j): + logger.warning("Remove matrices directory {}: {}".format(j, matrices_dir_j)) + shutil.rmtree(matrices_dir_j) + os.mkdir(matrices_dir_j) map_args.append([net, candidate_files, max_load_candidates, data_transform, coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, out_dir, From 45fbcd6d6cbfed77260b6e17e296665f7cc5e209 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 29 Dec 2020 00:31:22 -0800 Subject: [PATCH 76/89] fix generate_dataset --- neusomatic/python/generate_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 53920c5..4881382 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1707,8 +1707,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be rlen = record_len[int(record[-1])] rcenter = record_center[int(record[-1])] ch_order = chroms_order[record[0]] - ann = anns[ - int(record[-1])] if ensemble_bed else [] + ann = list(anns[int(record[-1])] + ) if ensemble_bed else [] map_args_records.append((ref_file, tumor_count_bed, normal_count_bed, record, vartype, rlen, rcenter, ch_order, matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths)) if cnt >= is_end: @@ -1726,8 +1726,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be if is_current <= cnt < is_end: rcenter = record_center[int(record[-1])] ch_order = chroms_order[record[0]] - ann = anns[ - int(record[-1])] if ensemble_bed else [] + ann = list(anns[int(record[-1])] + ) if ensemble_bed else [] map_args_nones.append((ref_file, tumor_count_bed, normal_count_bed, record, "NONE", 0, rcenter, ch_order, matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths)) From cff3653d76e08177b1b800274f862a3ac3c45382 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 22 Jan 2021 12:25:58 -0800 Subject: [PATCH 77/89] fix ensemble rounding --- neusomatic/python/generate_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 4881382..97f2eac 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -1564,7 +1564,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea if i_s: s = ensemble_data[:, np.array(i_s)] s = np.maximum(np.minimum(s, mx), mn) - s = (s - mn) / (mx - mn) + s = np.round((s - mn) / (mx - mn),6) ensemble_data[:, np.array(i_s)] = s ensemble_data = ensemble_data[:, selected_features] ensemble_data = ensemble_data.tolist() From 603d582f32bb3c8b9e58e987e4b9649ab1387200 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Fri, 22 Jan 2021 22:36:30 -0800 Subject: [PATCH 78/89] reduce disc I/O while calling --- neusomatic/python/call.py | 56 ++++++++------------------------------- 1 file changed, 11 insertions(+), 45 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index db1b55b..5505d59 
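The one-line change in PATCH 77 above rounds the min-max-scaled ensemble features to six decimals, presumably so the normalized values written to the candidate TSVs do not carry platform-dependent floating-point noise. The scaling itself clips each selected column to its known [mn, mx] range and maps it to [0, 1]; a small sketch of that normalization (assumes mx > mn element-wise):

```
import numpy as np


def scale_features(data, cols, mn, mx):
    # Clip to the known range, min-max scale to [0, 1], then round so the
    # serialized TSV text is reproducible from run to run.
    s = data[:, np.array(cols)]
    s = np.maximum(np.minimum(s, mx), mn)
    s = np.round((s - mn) / (mx - mn), 6)
    data[:, np.array(cols)] = s
    return data
```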
100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -15,7 +15,6 @@ import pysam import numpy as np -from imageio import imwrite, imread import torch from torch.autograd import Variable import torch.nn as nn @@ -59,7 +58,6 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): nclasses = len(VARTYPE_CLASSES) final_preds = {} none_preds = {} - true_path = {} final_preds = {} none_preds = {} @@ -94,12 +92,6 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): path = path_.split("/")[-1] preds[i] = [VARTYPE_CLASSES[predicted[i]], pos_pred[i], len_pred[i]] if VARTYPE_CLASSES[predicted[i]] != "NONE": - file_name = "{}/matrices_{}/{}/{}.{}_{}.png".format( - out_dir, model_tag, run_i, path, iii, i) - if not os.path.exists(file_name): - imwrite(file_name, np.array( - non_transformed_matrices[i, :, :, 0:3])) - true_path[path] = file_name final_preds[path] = [VARTYPE_CLASSES[predicted[i]], pos_pred[i], len_pred[i], list(map(lambda x: round(x, 4), F.softmax( outputs1[i, :], 0).data.cpu().numpy())), @@ -108,7 +100,8 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): list(map(lambda x: round(x, 4), outputs1.data.cpu()[i].numpy())), list(map(lambda x: round(x, 4), - outputs3.data.cpu()[i].numpy()))] + outputs3.data.cpu()[i].numpy())), + np.array(non_transformed_matrices[i, :, :, 0:3])] else: none_preds[path] = [VARTYPE_CLASSES[predicted[i]], pos_pred[i], len_pred[i], list(map(lambda x: round(x, 4), F.softmax( @@ -122,17 +115,17 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): if (iii % 10 == 0): logger.info("Called {} candidates in this batch.".format(j)) logger.info("Called {} candidates in this batch.".format(j)) - return final_preds, none_preds, true_path + return final_preds, none_preds def pred_vcf_records_path(record): - path, true_path_, pred_all, chroms, ref_file = record + path, pred_all, chroms, ref_file = record thread_logger = logging.getLogger( "{} ({})".format(pred_vcf_records_path.__name__, multiprocessing.current_process().name)) try: fasta_file = pysam.FastaFile(ref_file) ACGT = "ACGT" - I = imread(true_path_) / 255.0 + I = pred_all[-1] / 255.0 vcf_record = [] Ih, Iw, _ = I.shape zref_pos = np.where((np.argmax(I[:, :, 0], 0) == 0) & ( @@ -340,13 +333,13 @@ def pred_vcf_records_path(record): return None -def pred_vcf_records(ref_file, final_preds, true_path, chroms, num_threads): +def pred_vcf_records(ref_file, final_preds, chroms, num_threads): logger = logging.getLogger(pred_vcf_records.__name__) logger.info( "Prepare VCF records for predicted somatic variants in this batch.") map_args = [] for path in final_preds.keys(): - map_args.append([path, true_path[path], final_preds[path], + map_args.append([path, final_preds[path], chroms, ref_file]) if num_threads == 1: @@ -493,10 +486,10 @@ def single_thread_call(record): "Skip {} with 0 candidates".format(candidate_file)) return [], [] - final_preds_, none_preds_, true_path_ = call_variants( + final_preds_, none_preds_ = call_variants( net, call_loader, out_dir, model_tag, i, use_cuda) all_vcf_records = pred_vcf_records( - ref_file, final_preds_, true_path_, chroms, 1) + ref_file, final_preds_, chroms, 1) all_vcf_records_none = pred_vcf_records_none(none_preds_, chroms) all_vcf_records = dict(all_vcf_records) @@ -513,10 +506,6 @@ def single_thread_call(record): output_vcf_none = "{}/none_{}.vcf".format(tmp_preds_dir, i) write_vcf(vcf_records_none, output_vcf_none, chroms_order, pass_threshold, lowqual_threshold) - 
matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, i) - if os.path.exists(matrices_dir_j): - logger.warning("Done with {}. Remove matrices directory {}: {}".format(i, i, matrices_dir_j)) - shutil.rmtree(matrices_dir_j) return output_vcf, output_vcf_none except Exception as ex: @@ -655,11 +644,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, if not os.path.exists(out_dir): os.mkdir(out_dir) - matrices_dir = "{}/matrices_{}".format(out_dir, model_tag) - if os.path.exists(matrices_dir): - logger.warning("Remove matrices directory: {}".format(matrices_dir)) - shutil.rmtree(matrices_dir) - os.mkdir(matrices_dir) new_split_tsvs_dir = os.path.join(out_dir, "split_tsvs") if os.path.exists(new_split_tsvs_dir): @@ -740,23 +724,13 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, "Skip {} with 0 candidates".format(candidate_file)) continue - matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, run_i) - if os.path.exists(matrices_dir_j): - logger.warning("Remove matrices directory {}: {}".format(run_i, matrices_dir_j)) - shutil.rmtree(matrices_dir_j) - os.mkdir(matrices_dir_j) - - final_preds_, none_preds_, true_path_ = call_variants( + final_preds_, none_preds_ = call_variants( net, call_loader, out_dir, model_tag, run_i, use_cuda) all_vcf_records.extend(pred_vcf_records( - ref_file, final_preds_, true_path_, chroms, num_threads)) + ref_file, final_preds_, chroms, num_threads)) all_vcf_records_none.extend( pred_vcf_records_none(none_preds_, chroms)) - if os.path.exists(matrices_dir_j): - logger.warning("Done with {}. Remove matrices directory {}: {}".format(run_i, run_i, matrices_dir_j)) - shutil.rmtree(matrices_dir_j) - all_vcf_records = dict(all_vcf_records) all_vcf_records_none = dict(all_vcf_records_none) @@ -788,11 +762,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info( "Run for candidate files: {}".format(candidate_files)) - matrices_dir_j = "{}/matrices_{}/{}".format(out_dir, model_tag, j) - if os.path.exists(matrices_dir_j): - logger.warning("Remove matrices directory {}: {}".format(j, matrices_dir_j)) - shutil.rmtree(matrices_dir_j) - os.mkdir(matrices_dir_j) map_args.append([net, candidate_files, max_load_candidates, data_transform, coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, out_dir, @@ -836,9 +805,6 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.warning( "Remove split candidates directory: {}".format(new_split_tsvs_dir)) shutil.rmtree(new_split_tsvs_dir) - if os.path.exists(matrices_dir): - logger.warning("Remove matrices directory: {}".format(matrices_dir)) - shutil.rmtree(matrices_dir) logger.info("Calling is Done.") From d8738f05c608a57a102c7548f213999754480821 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 26 Jan 2021 09:30:31 -0800 Subject: [PATCH 79/89] Updated README --- README.md | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 18b2c6d..8764832 100644 --- a/README.md +++ b/README.md @@ -40,27 +40,29 @@ NeuSomatic first scans the genome to identify candidate variants and extract ali The binary for this step can be obtained at `neusomatic/bin` folder by running `./build.sh` (which requires cmake 3.13.2 and g++ 5.4.0). 
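PATCH 78 above removes a full disk round-trip during calling: PATCH 75 had already bounded disk usage by creating and deleting per-run `matrices` subdirectories, and PATCH 78 eliminates them altogether by carrying each candidate's channel slab inside the prediction record instead of writing it as a PNG with `imwrite` and re-reading it with `imread` in `pred_vcf_records_path` (hence the dropped `imageio` import). A toy version of the before/after handoff (hypothetical function names):

```
import numpy as np


def produce(preds, tag, fields, matrix):
    # After PATCH 78: append the (H, W, 3) slab to the record in memory,
    # rather than writing a PNG and remembering its path.
    preds[tag] = fields + [np.array(matrix[:, :, 0:3])]


def consume(preds, tag):
    *fields, image = preds[tag]
    # Consumed directly; no imread() and no matrices_<tag> cleanup needed.
    return fields, image
```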
Python 3.7 and the following Python packages must be installed: -* pytorch 1.1.0 -* torchvision 0.3.0 -* pybedtools 0.8.0 -* pysam 0.15.2 +* pytorch 1.6.0 +* torchvision 0.7.0 +* pysam 0.16.0.1 * zlib 1.2.11 -* numpy 1.15.4 -* scipy 1.2.0 -* imageio 2.5.0 -* biopython 1.73 +* numpy 1.18.1 +* scipy 1.4.1 +* pillow 7.2.0 +* imageio 2.8.0 +* biopython 1.77 +* fisher 0.1.9 It also depends on the following packages: -* cudatoolkit 9.0 (if you want to use GPU) +* cudatoolkit 10.1 (if you want to use GPU) * tabix 0.2.6 -* bedtools 2.27.1 +* bedtools 2.29.2 * samtools 1.9 -You can install these packages using [anaconda](https://www.anaconda.com/download)/[miniconda](https://conda.io/miniconda.html) : +You can install these packages using [anaconda](https://www.anaconda.com/download)/[miniconda](https://conda.io/miniconda.html) (for Python 3.7 on miniconda you can use [this link](https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.3-Linux-x86_64.sh)): ``` -conda install zlib=1.2.11 numpy=1.15.4 scipy=1.2.0 cmake=3.13.2 imageio=2.5.0 -conda install pysam=0.15.2 pybedtools=0.8.0 samtools=1.9 tabix=0.2.6 bedtools=2.27.1 biopython=1.73 -c bioconda -conda install pytorch=1.1.0 torchvision=0.3.0 cudatoolkit=9.0 -c pytorch +conda install zlib=1.2.11 numpy=1.18.1 scipy=1.4.1 pillow=7.2.0 cmake=3.17.0 imageio=2.8.0 +conda install pysam=0.16.0.1 samtools=1.9 tabix=0.2.6 bedtools=2.29.2 biopython=1.77 -c bioconda +conda install pytorch=1.6.0 torchvision=0.7.0 cudatoolkit=10.1 -c pytorch +conda install -c conda-forge fisher=0.1.9 ``` Then you can export the conda paths as: ``` From e8f9f04db31f6cf89626148f321cb1d6046862e2 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Tue, 26 Jan 2021 11:25:27 -0800 Subject: [PATCH 80/89] fix README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8764832..83daf44 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ python preprocess.py \ --work work_train \ --truth_vcf truth.vcf \ --min_mapq 10 \ - --number_threads 10 \ + --num_threads 10 \ --scan_alignments_binary ../bin/scan_alignments ``` 2. Train network @@ -149,7 +149,7 @@ python preprocess.py \ --normal_bam normal.bam \ --work work_call \ --min_mapq 10 \ - --number_threads 10 \ + --num_threads 10 \ --scan_alignments_binary ../bin/scan_alignments ``` 2. 
Call variants @@ -280,7 +280,7 @@ do --reference GRCh38.fa --tumor_bam tumor.bam --normal_bam normal.bam \ --region_bed work/splits/region_${i}.bed \ --work work/work_${i} \ - --min_mapq 10 --number_threads 24 \ + --min_mapq 10 --num_threads 24 \ --scan_alignments_binary ../bin/scan_alignments" done ``` From 088c8458e816f82f00659eb5be59aa11b023f71b Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 4 Mar 2021 19:21:50 -0800 Subject: [PATCH 81/89] added uint16 as an option for input matrices --- neusomatic/python/call.py | 15 +++++++--- neusomatic/python/dataloader.py | 43 ++++++++++++++++++++------- neusomatic/python/defaults.py | 1 + neusomatic/python/generate_dataset.py | 30 ++++++++++++++----- neusomatic/python/preprocess.py | 13 ++++++-- neusomatic/python/train.py | 23 +++++++++++--- 6 files changed, 96 insertions(+), 29 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 5505d59..9130dca 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -466,7 +466,7 @@ def single_thread_call(record): net, candidate_files, max_load_candidates, data_transform, \ coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, \ out_dir, model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, \ - pass_threshold, lowqual_threshold, i = record + pass_threshold, lowqual_threshold, matrix_dtype, i = record call_set = NeuSomaticDataset(roots=candidate_files, max_load_candidates=max_load_candidates, @@ -475,7 +475,8 @@ def single_thread_call(record): coverage_thr=coverage_thr, max_cov=max_cov, normalize_channels=normalize_channels, - zero_ann_cols=zero_ann_cols) + zero_ann_cols=zero_ann_cols, + matrix_dtype=matrix_dtype) call_loader = torch.utils.data.DataLoader(call_set, batch_size=batch_size, shuffle=True, # pin_memory=True, @@ -557,6 +558,10 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, ensemble_custom_header = pretrained_dict["ensemble_custom_header"] else: ensemble_custom_header = False + if "matrix_dtype" in pretrained_dict: + matrix_dtype = pretrained_dict["matrix_dtype"] + else: + matrix_dtype = "uint8" if force_zero_ann_cols: logger.info( @@ -572,6 +577,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, logger.info("no_seq_complexity: {}".format(no_seq_complexity)) logger.info("zero_ann_cols: {}".format(zero_ann_cols)) logger.info("ensemble_custom_header: {}".format(ensemble_custom_header)) + logger.info("matrix_dtype: {}".format(matrix_dtype)) if not ensemble_custom_header: expected_ens_fields = NUM_ENS_FEATURES @@ -709,7 +715,8 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, coverage_thr=coverage_thr, max_cov=max_cov, normalize_channels=normalize_channels, - zero_ann_cols=zero_ann_cols) + zero_ann_cols=zero_ann_cols, + matrix_dtype=matrix_dtype) call_loader = torch.utils.data.DataLoader(call_set, batch_size=batch_size, shuffle=True, pin_memory=True, @@ -766,7 +773,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads, coverage_thr, max_cov, normalize_channels, zero_ann_cols, batch_size, out_dir, model_tag, ref_file, chroms, tmp_preds_dir, chroms_order, - pass_threshold, lowqual_threshold, j]) + pass_threshold, lowqual_threshold, matrix_dtype, j]) j += 1 current_L = 0 candidate_files = [] diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py index 71fe47b..3ae64ee 100755 --- a/neusomatic/python/dataloader.py +++ b/neusomatic/python/dataloader.py @@ -16,7 +16,7 @@ import 
resource from utils import skip_empty -from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES +from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES, MAT_DTYPES FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' logging.basicConfig(level=logging.INFO, format=FORMAT) @@ -36,11 +36,18 @@ def __call__(self, matrix): return matrix_ -def extract_zlib(zlib_compressed_im): - return np.fromstring(zlib.decompress(zlib_compressed_im), dtype="uint8").reshape((5, 32, 23)) +def extract_zlib(zlib_compressed_im, matrix_dtype): + if matrix_dtype == "uint8": + return np.fromstring(zlib.decompress(zlib_compressed_im), dtype="uint8").reshape((5, 32, 23)) + elif matrix_dtype == "uint16": + return np.fromstring(zlib.decompress(zlib_compressed_im), dtype="uint16").reshape((5, 32, 23)) + else: + logger.info( + "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES)) + raise Exception -def candidate_loader_tsv(tsv, open_tsv, idx, i): +def candidate_loader_tsv(tsv, open_tsv, idx, i, matrix_dtype): if open_tsv: i_f = open_tsv else: @@ -48,7 +55,7 @@ def candidate_loader_tsv(tsv, open_tsv, idx, i): i_f.seek(idx[i]) fields = i_f.read(idx[i + 1] - idx[i]).strip().split() tag = fields[2] - im = extract_zlib(base64.b64decode(fields[3])) + im = extract_zlib(base64.b64decode(fields[3]), matrix_dtype) if len(fields) > 4: anns = list(map(float, fields[4:])) else: @@ -60,7 +67,7 @@ def candidate_loader_tsv(tsv, open_tsv, idx, i): def extract_info_tsv(record): - i_b, tsv, idx, L, max_load_candidates, nclasses_t, nclasses_l = record + i_b, tsv, idx, L, max_load_candidates, nclasses_t, nclasses_l, matrix_dtype = record thread_logger = logging.getLogger( "{} ({})".format(extract_info_tsv.__name__, multiprocessing.current_process().name)) try: @@ -101,7 +108,8 @@ def extract_info_tsv(record): count_class_l[min(int(length), 3)] += 1 if ((cnt_var < max_load_candidates_var) and ("NONE" not in tag)) or ( (cnt_none < max_load_candidates_none) and ("NONE" in tag)): - im = extract_zlib(base64.b64decode(fields[3])) + im = extract_zlib(base64.b64decode( + fields[3]), matrix_dtype) label = TYPE_CLASS_DICT[tag.split(".")[4]] if len(fields) > 4: anns = list(map(float, fields[4:])) @@ -133,6 +141,7 @@ def __init__(self, roots, max_load_candidates, transform=None, max_cov=None, normalize_channels=False, zero_ann_cols=[], + matrix_dtype="uint8", max_opended_tsv=-1): soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) @@ -144,6 +153,7 @@ def __init__(self, roots, max_load_candidates, transform=None, self.max_opended_tsv = max_opended_tsv self.normalize_channels = normalize_channels self.zero_ann_cols = zero_ann_cols + self.matrix_dtype = matrix_dtype self.da_shift_p = 0.3 self.da_base_p = 0.05 self.da_rev_p = 0.1 @@ -190,7 +200,7 @@ def __init__(self, roots, max_load_candidates, transform=None, max_load_ = self.Ls[i_b] * max_load_candidates // \ total_L if total_L > 0 else 0 map_args.append([i_b, tsv, self.idxs[i_b], self.Ls[i_b], - max_load_, nclasses_t, nclasses_l]) + max_load_, nclasses_t, nclasses_l, self.matrix_dtype]) Ls_.append(self.Ls[i_b]) logger.info("Len's of tsv files in this batch: {}".format(Ls_)) if len(map_args) == 1: @@ -260,12 +270,14 @@ def __getitem__(self, index): self.open_tsvs[ int(multiprocessing.current_process()._identity[0] ) % self.num_threads][i_b], - self.idxs[i_b], i) + self.idxs[i_b], i, self.matrix_dtype) else: path, matrix, anns, label = candidate_loader_tsv(self.tsvs[i_b], self.open_tsvs[ 0][i_b], - self.idxs[i_b], i) + self.idxs[ + i_b], i, + self.matrix_dtype) else: path, 
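The `extract_zlib` change above is the heart of the new `matrix_dtype` option: each candidate matrix is stored in the TSV as a base64-wrapped zlib stream of raw bytes, so the reader must know whether those bytes are uint8 or uint16 before reshaping to (5, 32, 23). The chosen dtype is also saved in training checkpoints later in PATCH 81, with older checkpoints defaulting to uint8. A round-trip sketch (the writer side is inferred from this decoder; `np.frombuffer` is the non-deprecated equivalent of the `np.fromstring` call above):

```
import base64
import zlib

import numpy as np

SHAPE = (5, 32, 23)


def encode_matrix(mat):
    return base64.b64encode(zlib.compress(mat.tobytes())).decode()


def decode_matrix(field, matrix_dtype="uint8"):
    if matrix_dtype not in ("uint8", "uint16"):
        raise ValueError("unsupported matrix_dtype: {}".format(matrix_dtype))
    raw = zlib.decompress(base64.b64decode(field))
    return np.frombuffer(raw, dtype=matrix_dtype).reshape(SHAPE)


m = np.random.randint(0, 65536, SHAPE).astype("uint16")
assert (decode_matrix(encode_matrix(m), "uint16") == m).all()
```

With uint16 matrices, the normalization ceiling used throughout the pipeline (255 for uint8) becomes 65535; PATCH 82 below centralizes that choice as `max_norm`.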
matrix, anns, label = self.data[index] @@ -419,7 +431,16 @@ def __getitem__(self, index): orig_matrix_[:, :, 0:2] = orig_matrix[:, :, 0:2] orig_matrix_[:, orig_center, 2] = np.max(orig_matrix[:, :, 0]) orig_matrix = orig_matrix_ - non_transformed_matrix = np.array(orig_matrix).astype(np.uint8) + if self.matrix_dtype == "uint8": + non_transformed_matrix = np.array(orig_matrix).astype(np.uint8) + elif self.matrix_dtype == "uint16": + non_transformed_matrix = np.array( + orig_matrix).astype(np.uint16) + else: + logger.info( + "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES)) + raise Exception + else: non_transformed_matrix = [] diff --git a/neusomatic/python/defaults.py b/neusomatic/python/defaults.py index 4cf0d21..a959cae 100644 --- a/neusomatic/python/defaults.py +++ b/neusomatic/python/defaults.py @@ -3,3 +3,4 @@ VCF_HEADER = "##fileformat=VCFv4.2" TYPE_CLASS_DICT = {"DEL": 0, "INS": 1, "NONE": 2, "SNP": 3} VARTYPE_CLASSES = ['DEL', 'INS', 'NONE', 'SNP'] +MAT_DTYPES = ["uint8", "uint16"] diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py index 97f2eac..6b6f8a8 100755 --- a/neusomatic/python/generate_dataset.py +++ b/neusomatic/python/generate_dataset.py @@ -21,7 +21,7 @@ from split_bed import split_region from utils import concatenate_vcfs, get_chromosomes_order, run_bedtools_cmd, vcf_2_bed, bedtools_sort, bedtools_window, bedtools_intersect, bedtools_slop, get_tmp_file, skip_empty -from defaults import NUM_ENS_FEATURES, VCF_HEADER +from defaults import NUM_ENS_FEATURES, VCF_HEADER, MAT_DTYPES NUC_to_NUM_tabix = {"A": 1, "C": 2, "G": 3, "T": 4, "-": 0} @@ -571,7 +571,7 @@ def prepare_info_matrices_tabix(ref_file, tumor_count_bed, normal_count_bed, rec def prep_data_single_tabix(input_record): ref_file, tumor_count_bed, normal_count_bed, record, vartype, rlen, rcenter, ch_order, \ - matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths = input_record + matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths, matrix_dtype = input_record thread_logger = logging.getLogger( "{} ({})".format(prep_data_single_tabix.__name__, multiprocessing.current_process().name)) @@ -647,8 +647,16 @@ def prep_data_single_tabix(input_record): candidate_mat[:, :, 13 + (iii * 2) + 1] = candidate_mat[:, :, 13 + ( iii * 2) + 1] / (max(np.max(tag_normal_count_matrices[iii]), 100.0)) * 255 - candidate_mat = np.maximum(0, np.minimum( - candidate_mat, 255)).astype(np.uint8) + if matrix_dtype == "uint8": + candidate_mat = np.maximum(0, np.minimum( + candidate_mat, 255)).astype(np.uint8) + elif matrix_dtype == "uint16": + candidate_mat = np.maximum(0, np.minimum( + candidate_mat, 255)).astype(np.uint16) + else: + logger.info( + "Wrong matrix_dtype {}. 
Choices are {}".format(matrix_dtype, MAT_DTYPES)) + raise Exception tag = "{}.{}.{}.{}.{}.{}.{}.{}.{}".format(ch_order, pos, ref[0:55], alt[ 0:55], vartype, center, rlen, tumor_cov, normal_cov) candidate_mat = base64.b64encode( @@ -1564,7 +1572,7 @@ def extract_ensemble(ensemble_tsvs, ensemble_bed, no_seq_complexity, enforce_hea if i_s: s = ensemble_data[:, np.array(i_s)] s = np.maximum(np.minimum(s, mx), mn) - s = np.round((s - mn) / (mx - mn),6) + s = np.round((s - mn) / (mx - mn), 6) ensemble_data[:, np.array(i_s)] = s ensemble_data = ensemble_data[:, selected_features] ensemble_data = ensemble_data.tolist() @@ -1585,6 +1593,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be ensemble_custom_header, no_seq_complexity, enforce_header, zero_vscore, + matrix_dtype, tsv_batch_size): logger = logging.getLogger(generate_dataset.__name__) @@ -1710,7 +1719,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be ann = list(anns[int(record[-1])] ) if ensemble_bed else [] map_args_records.append((ref_file, tumor_count_bed, normal_count_bed, record, vartype, rlen, rcenter, ch_order, - matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths)) + matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths, matrix_dtype)) if cnt >= is_end: break if cnt >= is_end: @@ -1730,7 +1739,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be ) if ensemble_bed else [] map_args_nones.append((ref_file, tumor_count_bed, normal_count_bed, record, "NONE", 0, rcenter, ch_order, - matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths)) + matrix_base_pad, matrix_width, min_ev_frac_per_col, min_cov, ann, chrom_lengths, matrix_dtype)) if cnt >= is_end: break if cnt >= is_end: @@ -1851,6 +1860,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be parser.add_argument('--zero_vscore', help='set VarScan2_Score to zero', action="store_true") + parser.add_argument('--matrix_dtype', type=str, + help='matrix_dtype to be used to store matrix', default="uint8", + choices=MAT_DTYPES) args = parser.parse_args() logger.info(args) @@ -1873,7 +1885,8 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be tsv_batch_size = args.tsv_batch_size ensemble_custom_header = args.ensemble_custom_header enforce_header = args.enforce_header - zero_vscore = zero_vscore + zero_vscore = args.zero_vscore + matrix_dtype = args.matrix_dtype try: generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv, @@ -1881,6 +1894,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be ensemble_custom_header, no_seq_complexity, enforce_header, zero_vscore, + matrix_dtype, tsv_batch_size) except Exception as e: logger.error(traceback.format_exc()) diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py index e6b5559..5007d58 100755 --- a/neusomatic/python/preprocess.py +++ b/neusomatic/python/preprocess.py @@ -22,6 +22,7 @@ from scan_alignments import scan_alignments from extend_features import extend_features from utils import concatenate_vcfs, run_bedtools_cmd, bedtools_sort, bedtools_merge, bedtools_intersect, bedtools_slop, get_tmp_file, skip_empty, vcf_2_bed +from defaults import MAT_DTYPES def process_split_region(tn, work, region, 
reference, mode, alignment_bam, @@ -83,8 +84,9 @@ def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, regi matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_bed, ensemble_custom_header, no_seq_complexity, - no_feature_recomp_for_ensemble, + no_feature_recomp_for_ensemble, zero_vscore, + matrix_dtype, tsv_batch_size): logger = logging.getLogger(generate_dataset_region.__name__) generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference, @@ -93,6 +95,7 @@ def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, regi no_seq_complexity, no_feature_recomp_for_ensemble, zero_vscore, + matrix_dtype, tsv_batch_size) @@ -212,6 +215,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, merge_d_for_scan, use_vscore, num_splits, + matrix_dtype, num_threads, scan_alignments_binary,): logger = logging.getLogger(preprocess.__name__) @@ -588,8 +592,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, matrix_width, matrix_base_pad, min_ev_frac_per_col, min_dp, num_threads, ensemble_bed_i, ensemble_custom_header, - no_seq_complexity, no_feature_recomp_for_ensemble, + no_seq_complexity, no_feature_recomp_for_ensemble, zero_vscore, + matrix_dtype, tsv_batch_size) shutil.rmtree(bed_tempdir) @@ -699,6 +704,9 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, action="store_true") parser.add_argument('--num_splits', type=int, help='number of region splits', default=None) + parser.add_argument('--matrix_dtype', type=str, + help='matrix_dtype to be used to store matrix', default="uint8", + choices=MAT_DTYPES) parser.add_argument('--num_threads', type=int, help='number of threads', default=1) parser.add_argument('--scan_alignments_binary', type=str, @@ -725,6 +733,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp, args.merge_d_for_scan, args.use_vscore, args.num_splits, + args.matrix_dtype, args.num_threads, args.scan_alignments_binary) except Exception as e: diff --git a/neusomatic/python/train.py b/neusomatic/python/train.py index 45fbae2..d90184f 100755 --- a/neusomatic/python/train.py +++ b/neusomatic/python/train.py @@ -24,7 +24,7 @@ from network import NeuSomaticNet from dataloader import NeuSomaticDataset, matrix_transform from merge_tsvs import merge_tsvs -from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES +from defaults import TYPE_CLASS_DICT, VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES, MAT_DTYPES import torch._utils try: @@ -207,6 +207,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo zero_ann_cols, force_zero_ann_cols, ensemble_custom_header, + matrix_dtype, use_cuda): logger = logging.getLogger(train_neusomatic.__name__) @@ -258,6 +259,10 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo ensemble_custom_header = pretrained_dict["ensemble_custom_header"] else: ensemble_custom_header = False + if "matrix_dtype" in pretrained_dict: + matrix_dtype = pretrained_dict["matrix_dtype"] + else: + matrix_dtype = "uint8" prev_epochs = sofar_epochs else: prev_epochs = 0 @@ -292,7 +297,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo elif len(x) == 4: break else: - raise Exception("Wrong number of fields in {}: {}".format(tsv, len(x))) + raise Exception( + "Wrong number of fields in {}: {}".format(tsv, 
len(x))) num_channels = expected_ens_fields + \ NUM_ST_FEATURES if ensemble else NUM_ST_FEATURES @@ -382,7 +388,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo transform=data_transform, is_test=False, num_threads=num_threads, coverage_thr=coverage_thr, normalize_channels=normalize_channels, - zero_ann_cols=zero_ann_cols) + zero_ann_cols=zero_ann_cols, + matrix_dtype=matrix_dtype) train_sets.append(train_set) none_indices = train_set.get_none_indices() var_indices = train_set.get_var_indices() @@ -416,7 +423,8 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo transform=data_transform, is_test=True, num_threads=num_threads, coverage_thr=coverage_thr, normalize_channels=normalize_channels, - zero_ann_cols=zero_ann_cols) + zero_ann_cols=zero_ann_cols, + matrix_dtype=matrix_dtype) validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=batch_size, shuffle=True, num_workers=num_threads, pin_memory=True) @@ -463,6 +471,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, "ensemble_custom_header": ensemble_custom_header, + "matrix_dtype": matrix_dtype, }, '{}/models/checkpoint_{}_epoch{}_.pth'.format(out_dir, tag, curr_epoch)) if len(train_sets) == 1: @@ -531,6 +540,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, "ensemble_custom_header": ensemble_custom_header, + "matrix_dtype": matrix_dtype, }, '{}/models/checkpoint_{}_epoch{}.pth'.format(out_dir, tag, curr_epoch)) if validation_candidates_tsv: test(net, curr_epoch, validation_loader, use_cuda) @@ -552,6 +562,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo "no_seq_complexity": no_seq_complexity, "zero_ann_cols": zero_ann_cols, "ensemble_custom_header": ensemble_custom_header, + "matrix_dtype": matrix_dtype, }, '{}/models/checkpoint_{}_epoch{}.pth'.format( out_dir, tag, curr_epoch)) if validation_candidates_tsv: @@ -642,6 +653,9 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo help='Allow ensemble tsv to have custom header fields. 
(Features should be\ normalized between [0,1]', action="store_true") + parser.add_argument('--matrix_dtype', type=str, + help='matrix_dtype to be used to store matrix', default="uint8", + choices=MAT_DTYPES) args = parser.parse_args() logger.info(args) @@ -663,6 +677,7 @@ def train_neusomatic(candidates_tsv, validation_candidates_tsv, out_dir, checkpo args.zero_ann_cols, args.force_zero_ann_cols, args.ensemble_custom_header, + args.matrix_dtype, use_cuda) except Exception as e: logger.error(traceback.format_exc()) From 47ac4da42aafb312cbbd06f17d77c37dcbaed674 Mon Sep 17 00:00:00 2001 From: Sahraeian Date: Thu, 4 Mar 2021 23:09:22 -0800 Subject: [PATCH 82/89] fixed uint16 --- neusomatic/python/call.py | 20 +++++++++---- neusomatic/python/dataloader.py | 30 +++++++++---------- neusomatic/python/generate_dataset.py | 42 ++++++++++++++++----------- 3 files changed, 54 insertions(+), 38 deletions(-) diff --git a/neusomatic/python/call.py b/neusomatic/python/call.py index 9130dca..be1b40a 100755 --- a/neusomatic/python/call.py +++ b/neusomatic/python/call.py @@ -26,7 +26,7 @@ from dataloader import NeuSomaticDataset, matrix_transform from utils import get_chromosomes_order, prob2phred, skip_empty from merge_tsvs import merge_tsvs -from defaults import VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES +from defaults import VARTYPE_CLASSES, NUM_ENS_FEATURES, NUM_ST_FEATURES, MAT_DTYPES import torch._utils try: @@ -52,7 +52,7 @@ def get_type(ref, alt): return "SNP" -def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): +def call_variants(net, call_loader, out_dir, model_tag, run_i, matrix_dtype, use_cuda): logger = logging.getLogger(call_variants.__name__) net.eval() nclasses = len(VARTYPE_CLASSES) @@ -63,6 +63,14 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda): none_preds = {} loader_ = call_loader + if matrix_dtype == "uint8": + max_norm = 255.0 + elif matrix_dtype == "uint16": + max_norm = 65535.0 + else: + logger.info( + "Wrong matrix_dtype {}. 
Choices are {}".format(matrix_dtype, MAT_DTYPES))
+
     iii = 0
     j = 0
     for data in loader_:
@@ -101,7 +109,7 @@ def call_variants(net, call_loader, out_dir, model_tag, run_i, use_cuda):
                     outputs1.data.cpu()[i].numpy())),
                     list(map(lambda x: round(x, 4),
                              outputs3.data.cpu()[i].numpy())),
-                    np.array(non_transformed_matrices[i, :, :, 0:3])]
+                    np.array(non_transformed_matrices[i, :, :, 0:3]) / max_norm]
             else:
                 none_preds[path] = [VARTYPE_CLASSES[predicted[i]], pos_pred[i], len_pred[i],
                                     list(map(lambda x: round(x, 4), F.softmax(
@@ -125,7 +133,7 @@ def pred_vcf_records_path(record):
     try:
         fasta_file = pysam.FastaFile(ref_file)
         ACGT = "ACGT"
-        I = pred_all[-1] / 255.0
+        I = pred_all[-1]
         vcf_record = []
         Ih, Iw, _ = I.shape
         zref_pos = np.where((np.argmax(I[:, :, 0], 0) == 0) & (
@@ -488,7 +496,7 @@ def single_thread_call(record):
         return [], []
     final_preds_, none_preds_ = call_variants(
-        net, call_loader, out_dir, model_tag, i, use_cuda)
+        net, call_loader, out_dir, model_tag, i, matrix_dtype, use_cuda)
     all_vcf_records = pred_vcf_records(
         ref_file, final_preds_, chroms, 1)
     all_vcf_records_none = pred_vcf_records_none(none_preds_, chroms)
@@ -732,7 +740,7 @@ def call_neusomatic(candidates_tsv, ref_file, out_dir, checkpoint, num_threads,
                 continue
             final_preds_, none_preds_ = call_variants(
-                net, call_loader, out_dir, model_tag, run_i, use_cuda)
+                net, call_loader, out_dir, model_tag, run_i, matrix_dtype, use_cuda)
             all_vcf_records.extend(pred_vcf_records(
                 ref_file, final_preds_, chroms, num_threads))
             all_vcf_records_none.extend(
diff --git a/neusomatic/python/dataloader.py b/neusomatic/python/dataloader.py
index 3ae64ee..0fe5470 100755
--- a/neusomatic/python/dataloader.py
+++ b/neusomatic/python/dataloader.py
@@ -410,20 +410,28 @@ def __getitem__(self, index):
             tumor_cov *= r_cov
             normal_cov *= r_cov
+        if self.matrix_dtype == "uint8":
+            max_norm = 255.0
+        elif self.matrix_dtype == "uint16":
+            max_norm = 65535.0
+        else:
+            logger.info(
+                "Wrong matrix_dtype {}. Choices are {}".format(self.matrix_dtype, MAT_DTYPES))
+            raise Exception
+
         # add COV channel
         matrix_ = np.zeros((matrix.shape[0], matrix.shape[1], 26 + len(anns)))
         matrix_[:, :, 0:23] = matrix
         if self.normalize_channels:
-            matrix_[:, :, 3:23:2] *= (matrix_[:, :, 1:2] / 255.0)
-            matrix_[:, :, 4:23:2] *= (matrix_[:, :, 2:3] / 255.0)
+            matrix_[:, :, 3:23:2] *= (matrix_[:, :, 1:2] / max_norm)
+            matrix_[:, :, 4:23:2] *= (matrix_[:, :, 2:3] / max_norm)
         matrix = matrix_
         matrix[:, center, 23] = np.max(matrix[:, :, 0])
         matrix[:, :, 24] = (min(tumor_cov, self.coverage_thr) /
-                            float(self.coverage_thr)) * 255.0
+                            float(self.coverage_thr)) * max_norm
         matrix[:, :, 25] = (
-            min(normal_cov, self.coverage_thr) / float(self.coverage_thr)) * 255.0
+            min(normal_cov, self.coverage_thr) / float(self.coverage_thr)) * max_norm
         for i, a in enumerate(anns):
-            matrix[:, :, 26 + i] = a * 255.0
+            matrix[:, :, 26 + i] = a * max_norm
 
         if self.is_test:
             orig_matrix_ = np.zeros(
@@ -431,21 +439,13 @@ def __getitem__(self, index):
             orig_matrix_[:, :, 0:2] = orig_matrix[:, :, 0:2]
             orig_matrix_[:, orig_center, 2] = np.max(orig_matrix[:, :, 0])
             orig_matrix = orig_matrix_
-            if self.matrix_dtype == "uint8":
-                non_transformed_matrix = np.array(orig_matrix).astype(np.uint8)
-            elif self.matrix_dtype == "uint16":
-                non_transformed_matrix = np.array(
-                    orig_matrix).astype(np.uint16)
-            else:
-                logger.info(
-                    "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES))
-                raise Exception
+            non_transformed_matrix = np.array(orig_matrix)
         else:
             non_transformed_matrix = []
 
         matrix = torch.from_numpy(matrix.transpose((2, 0, 1)))
-        matrix = matrix.float().div(255)
+        matrix = matrix.float().div(max_norm)
 
         if self.transform is not None:
             matrix = self.transform(matrix)
diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 6b6f8a8..cfdc280 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -615,44 +615,52 @@ def prep_data_single_tabix(input_record):
     tumor_cov = int(round(max(np.sum(tumor_count_matrix, 0))))
     normal_cov = int(round(max(np.sum(normal_count_matrix, 0))))
+    if matrix_dtype == "uint8":
+        max_norm = 255.0
+    elif matrix_dtype == "uint16":
+        max_norm = 65535.0
+    else:
+        logger.info(
+            "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES))
+        raise Exception
+
     candidate_mat[:, :, 0] = candidate_mat[
-        :, :, 0] / (max(np.max(ref_count_matrix), np.max(tumor_count_matrix)) + 0.00001) * 255
+        :, :, 0] / (max(np.max(ref_count_matrix), np.max(tumor_count_matrix)) + 0.00001) * max_norm
     candidate_mat[:, :, 1] = candidate_mat[:, :, 1] / \
-        (np.max(tumor_count_matrix) + 0.00001) * 255
+        (np.max(tumor_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 2] = candidate_mat[:, :, 2] / \
-        (np.max(normal_count_matrix) + 0.00001) * 255
+        (np.max(normal_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 3] = candidate_mat[:, :, 3] / \
-        (max(np.max(bq_tumor_count_matrix), 41.0)) * 255
+        (max(np.max(bq_tumor_count_matrix), 41.0)) * max_norm
     candidate_mat[:, :, 4] = candidate_mat[:, :, 4] / \
-        (max(np.max(bq_normal_count_matrix), 41.0)) * 255
+        (max(np.max(bq_normal_count_matrix), 41.0)) * max_norm
     candidate_mat[:, :, 5] = candidate_mat[:, :, 5] / \
-        (max(np.max(mq_tumor_count_matrix), 70.0)) * 255
+        (max(np.max(mq_tumor_count_matrix), 70.0)) * max_norm
     candidate_mat[:, :, 6] = candidate_mat[:, :, 6] / \
-        (max(np.max(mq_normal_count_matrix), 70.0)) * 255
+        (max(np.max(mq_normal_count_matrix), 70.0)) * max_norm
     candidate_mat[:, :, 7] = candidate_mat[:, :, 7] / \
-        (np.max(tumor_count_matrix) + 0.00001) * 255
+        (np.max(tumor_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 8] = candidate_mat[:, :, 8] / \
-        (np.max(normal_count_matrix) + 0.00001) * 255
+        (np.max(normal_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 9] = candidate_mat[:, :, 9] / \
-        (np.max(tumor_count_matrix) + 0.00001) * 255
+        (np.max(tumor_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 10] = candidate_mat[:, :, 10] / \
-        (np.max(normal_count_matrix) + 0.00001) * 255
+        (np.max(normal_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 11] = candidate_mat[:, :, 11] / \
-        (np.max(tumor_count_matrix) + 0.00001) * 255
+        (np.max(tumor_count_matrix) + 0.00001) * max_norm
     candidate_mat[:, :, 12] = candidate_mat[:, :, 12] / \
-        (np.max(normal_count_matrix) + 0.00001) * 255
+        (np.max(normal_count_matrix) + 0.00001) * max_norm
     for iii in range(len(tag_tumor_count_matrices)):
         candidate_mat[:, :, 13 + (iii * 2)] = candidate_mat[:, :, 13 + (iii * 2)] / (
-            max(np.max(tag_tumor_count_matrices[iii]), 100.0)) * 255
+            max(np.max(tag_tumor_count_matrices[iii]), 100.0)) * max_norm
         candidate_mat[:, :, 13 + (iii * 2) + 1] = candidate_mat[:, :, 13 + (
-            iii * 2) + 1] / (max(np.max(tag_normal_count_matrices[iii]), 100.0)) * 255
+            iii * 2) + 1] / (max(np.max(tag_normal_count_matrices[iii]), 100.0)) * max_norm
 
     if matrix_dtype == "uint8":
         candidate_mat = np.maximum(0, np.minimum(
-            candidate_mat, 255)).astype(np.uint8)
+            candidate_mat, max_norm)).astype(np.uint8)
     elif matrix_dtype == "uint16":
         candidate_mat = np.maximum(0, np.minimum(
-            candidate_mat, 255)).astype(np.uint16)
+            candidate_mat, max_norm)).astype(np.uint16)
     else:
         logger.info(
             "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES))
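Note on the change above: PATCH 82 threads matrix_dtype through call.py, dataloader.py and generate_dataset.py so the normalization scale follows the storage dtype instead of a hard-coded 255. A minimal sketch of the intended encode/decode round trip (the helper name and toy array below are illustrative, not part of the patch):

    import numpy as np

    MAT_DTYPES = ["uint8", "uint16"]

    def full_scale(matrix_dtype):
        # the max_norm value used to encode channels at dataset-generation time
        # and to decode them at call time (matrix.float().div(max_norm))
        if matrix_dtype == "uint8":
            return 255.0
        if matrix_dtype == "uint16":
            return 65535.0
        raise Exception(
            "Wrong matrix_dtype {}. Choices are {}".format(matrix_dtype, MAT_DTYPES))

    fractions = np.array([0.0, 0.25, 1.0])            # normalized channel values
    encoded = (fractions * full_scale("uint16")).astype(np.uint16)
    decoded = encoded / full_scale("uint16")          # ~[0.0, 0.25, 1.0] up to rounding

Storing matrices as uint16 keeps 256x finer quantization than uint8, at the cost of doubling their size on disk.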
From d948d084a634b5b6a917e9898ab00c7be99bd6ab Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Sun, 7 Mar 2021 23:50:22 -0800
Subject: [PATCH 83/89] added report_all_alleles

---
 neusomatic/cpp/scan_alignments.cpp   | 157 ++++++++++++++++-----
 neusomatic/include/Options.h         |  14 +++
 neusomatic/python/preprocess.py      |  11 +-
 neusomatic/python/scan_alignments.py |  19 +++-
 4 files changed, 129 insertions(+), 72 deletions(-)

diff --git a/neusomatic/cpp/scan_alignments.cpp b/neusomatic/cpp/scan_alignments.cpp
index d019597..3f8c552 100644
--- a/neusomatic/cpp/scan_alignments.cpp
+++ b/neusomatic/cpp/scan_alignments.cpp
@@ -58,6 +58,7 @@ int main(int argc, char **argv) {
   float del_min_af=min_af;
   float snp_min_af=min_af;
   const bool calculate_qual_stat = opts.calculate_qual_stat();
+  const bool report_all_alleles = opts.report_all_alleles();
 
   //const std::map<char, int> empty_pileup_counts = {{'-', 0}, {'A', 0}, {'C', 0}, {'G', 0}, {'T', 0}};
   static const std::vector<char> nuc_code_char = {'A', 'C', 'G', 'T', '-', 'N'};
@@ -206,83 +207,109 @@ int main(int argc, char **argv) {
                    <<":"<
-          int major = -1;
-          int major_count = 0;
-          int minor = -1;
-          int minor_count = 0;
-          int minor2 = -1;
-          int minor2_count = 0;
-
-          for (int row = 0; row < cols[i].base_freq_.size(); ++row) {
-            if (cols[i].base_freq_[row] > major_count) {
-              minor2 = minor;
-              minor2_count = minor_count;
-              minor_count = major_count;
-              minor = major;
-              major_count = cols[i].base_freq_[row];
-              major = row;
-            } else if (cols[i].base_freq_[row] > minor_count) {
-              minor2 = minor;
-              minor2_count = minor_count;
-              minor_count = cols[i].base_freq_[row];
-              minor = row;
-            } else if (cols[i].base_freq_[row] > minor2_count) {
-              minor2_count = cols[i].base_freq_[row];
-              minor2 = row;
+          std::map<int, int> alt_counts;
+          auto ref_count = cols[i].base_freq_[ref_code];
+          auto var_code = ref_code;
+          int var_count = 0;
+          int dp = ref_count;
+          if (report_all_alleles){
+            for (int row = 0; row < cols[i].base_freq_.size(); ++row) {
+              auto alt_cnt = cols[i].base_freq_[row];
+              if (( row != ref_code) and (alt_cnt > 0)){
+                auto af = alt_cnt/float(alt_cnt+ref_count);
+                if ((alt_cnt >= ref_count) or ((row == 4 and af > del_min_af ) or
+                    (row != 4 and ref_base != '-' and af > snp_min_af ) or
+                    (ref_base =='-' and af > ins_min_af))){
+                  alt_counts.insert(std::pair<int, int>(row, alt_cnt));
+                  dp += alt_cnt;
+                }
+              }
+            }
+          }else{
+            int major = -1;
+            int major_count = 0;
+            int minor = -1;
+            int minor_count = 0;
+            int minor2 = -1;
+            int minor2_count = 0;
+
+            for (int row = 0; row < cols[i].base_freq_.size(); ++row) {
+              if (cols[i].base_freq_[row] > major_count) {
+                minor2 = minor;
+                minor2_count = minor_count;
+                minor_count = major_count;
+                minor = major;
+                major_count = cols[i].base_freq_[row];
+                major = row;
+              } else if (cols[i].base_freq_[row] > minor_count) {
+                minor2 = minor;
+                minor2_count = minor_count;
+                minor_count = cols[i].base_freq_[row];
+                minor = row;
+              } else if (cols[i].base_freq_[row] > minor2_count) {
+                minor2_count = cols[i].base_freq_[row];
+                minor2 = row;
+              }
+            }
-          if (minor != -1 and major != -1){
-            if (minor2 != -1 and ref_code == major and minor == 4 and ref_code != 4 ){
-              if (minor2_count>0.5*minor_count){
-                minor = minor2;
-                minor_count = minor2_count;
+            if (minor != -1 and major != -1){
+              if (minor2 != -1 and ref_code == major and minor == 4 and ref_code != 4 ){
+                if (minor2_count>0.5*minor_count){
+                  minor = minor2;
+                  minor_count = minor2_count;
+                }
               }
             }
+            auto af = minor_count/float(major_count+minor_count);
+            if (major != ref_code){
+              var_code = major;
+              var_count = major_count;
+            } else if (minor != ref_code and ( (minor == 4 and af > del_min_af ) or
+                (minor != 4 and ref_base != '-' and af > snp_min_af ) or
+                (ref_base =='-' and af > ins_min_af))){
+              var_code = minor;
+              var_count = minor_count;
+            }
+            if (var_count > 0) {
+              alt_counts.insert(std::pair<int, int>(var_code,var_count));
+              dp += var_count;
+            }
 }
-          auto ref_count = cols[i].base_freq_[ref_code];
-          auto var_code = ref_code;
-          int var_count = 0;
-          auto af = minor_count/float(major_count+minor_count);
-          if (major != ref_code){
-            var_code = major;
-            var_count = major_count;
-          } else if (minor != ref_code and ( (minor == 4 and af > del_min_af ) or
-              (minor != 4 and ref_base != '-' and af > snp_min_af ) or
-              (ref_base =='-' and af > ins_min_af))){
-            var_code = minor;
-            var_count = minor_count;
-          }
-
-          if (var_count > 0) {
-
-            auto record_info = "AF="+std::to_string((var_count)/float(var_count+ref_count))+";DP="+std::to_string(nrow)+";RO="+std::to_string(ref_count)+";AO="+std::to_string(var_count);
-            auto gtinfo = "0/1:"+std::to_string(nrow)+":"+std::to_string(ref_count)+":"+std::to_string(var_count);
+          // for(auto it = alt_counts.cbegin(); it != alt_counts.cend(); ++it)
+          // {
+          //   std::cout << it->first << " " << it->second << std::endl;
+          // }
+          for(auto it = alt_counts.cbegin(); it != alt_counts.cend(); ++it)
+          {
+            auto var_code_ = it->first;
+            auto var_count_ = it->second;
+            auto record_info = "AF="+std::to_string((var_count_)/float(dp))+";DP="+std::to_string(nrow)+";RO="+std::to_string(ref_count)+";AO="+std::to_string(var_count_);
+            auto gtinfo = "0/1:"+std::to_string(nrow)+":"+std::to_string(ref_count)+":"+std::to_string(var_count_);
             if (calculate_qual_stat){
               record_info += ";ST="+std::to_string(int(round(ref_count*(cols_strand[i].strand_mean[ref_code]/100))))+ \
-                ","+std::to_string(int(round(var_count*(cols_strand[i].strand_mean[var_code]/100))))+ \
+                ","+std::to_string(int(round(var_count_*(cols_strand[i].strand_mean[var_code_]/100))))+ \
                 ";LS="+std::to_string(lsc_counts)+\
                 ";RS="+std::to_string(rsc_counts)+\
-                ";NM="+std::to_string(int(round(cols_tag[i].tag_mean[var_code][0])))+\
-                ";AS="+std::to_string(int(round(cols_tag[i].tag_mean[var_code][1])))+ \
-                ";XS="+std::to_string(int(round(cols_tag[i].tag_mean[var_code][2])))+ \
-                ";PR="+std::to_string(int(round(cols_tag[i].tag_mean[var_code][3])))+ \
-                ";CL="+std::to_string(int(round(cols_tag[i].tag_mean[var_code][4])))+ \
-                ";MQ="+std::to_string(int(round(cols_mqual[i].mqual_mean[var_code])))+ \
-                ";BQ="+std::to_string(int(round(cols[i].bqual_mean[var_code])));
+                ";NM="+std::to_string(int(round(cols_tag[i].tag_mean[var_code_][0])))+\
+                ";AS="+std::to_string(int(round(cols_tag[i].tag_mean[var_code_][1])))+ \
+                ";XS="+std::to_string(int(round(cols_tag[i].tag_mean[var_code_][2])))+ \
+                ";PR="+std::to_string(int(round(cols_tag[i].tag_mean[var_code_][3])))+ \
+                ";CL="+std::to_string(int(round(cols_tag[i].tag_mean[var_code_][4])))+ \
+                ";MQ="+std::to_string(int(round(cols_mqual[i].mqual_mean[var_code_])))+ \
+                ";BQ="+std::to_string(int(round(cols[i].bqual_mean[var_code_])));
               gtinfo += ":"+std::to_string(int(round(ref_count*(cols_strand[i].strand_mean[ref_code]/100))))+","+ \
-                std::to_string(int(round(var_count*(cols_strand[i].strand_mean[var_code]/100))))+":"+\
+                std::to_string(int(round(var_count_*(cols_strand[i].strand_mean[var_code_]/100))))+":"+\
                 std::to_string(lsc_counts)+":"+\
                 std::to_string(rsc_counts)+":"+\
-                std::to_string(int(round(cols_tag[i].tag_mean[var_code][0])))+":"+\
-                std::to_string(int(round(cols_tag[i].tag_mean[var_code][1])))+":"+\
-                std::to_string(int(round(cols_tag[i].tag_mean[var_code][2])))+":"+\
-                std::to_string(int(round(cols_tag[i].tag_mean[var_code][3])))+":"+\
-                std::to_string(int(round(cols_tag[i].tag_mean[var_code][4])))+":"+\
-                std::to_string(int(round(cols_mqual[i].mqual_mean[var_code])))+":"+\
-                std::to_string(int(round(cols[i].bqual_mean[var_code])));
+                std::to_string(int(round(cols_tag[i].tag_mean[var_code_][0])))+":"+\
+                std::to_string(int(round(cols_tag[i].tag_mean[var_code_][1])))+":"+\
+                std::to_string(int(round(cols_tag[i].tag_mean[var_code_][2])))+":"+\
+                std::to_string(int(round(cols_tag[i].tag_mean[var_code_][3])))+":"+\
+                std::to_string(int(round(cols_tag[i].tag_mean[var_code_][4])))+":"+\
+                std::to_string(int(round(cols_mqual[i].mqual_mean[var_code_])))+":"+\
+                std::to_string(int(round(cols[i].bqual_mean[var_code_])));
             }
-          auto var_base = nuc_code_char[var_code];
+            auto var_base = nuc_code_char[var_code_];
             if (ref_base == '-') {ref_base = 'N';}
             if (var_base == '-') {var_base = 'N';}
             auto var_ref_pos=ginv.left() + cc.UngapPos(i);
@@ -304,7 +331,7 @@ int main(int argc, char **argv) {
             appendValue(record.genotypeInfos, gtinfo);
             vcf_writer.Write(record);
             if (opts.verbosity()>0){
-              std::cout<<"var: " << i << "," << var_ref_pos << ","<< ref_base << "," << var_base<<","<
 > 0:
             cmd = "{} --ref {} -b {} -L {} --out_vcf_file {}/candidates.vcf --out_count_file {}/count.bed \
-                --window_size {} --min_af {} --min_mapq {} --max_depth {} {}".format(
+                --window_size {} --min_af {} --min_mapq {} --max_depth {} {} {}".format(
                 scan_alignments_binary, reference, input_bam, split_region_file_,
-                work, work, window_size, maf, min_mapq, max_dp * window_size / 100.0, filter_duplicate_str)
+                work, work, window_size, maf, min_mapq, max_dp * window_size / 100.0, report_all_alleles_str, filter_duplicate_str)
             if calc_qual:
                 cmd += " --calculate_qual_stat"
             run_shell_command(cmd, stdout=os.path.join(work, "scan.out"),
@@ -80,7 +84,7 @@ def run_scan_alignments(record):
 
 def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam,
                     regions_bed_file, reference, num_splits,
-                    num_threads, window_size, maf, min_mapq, max_dp, filter_duplicate, restart=True,
+                    num_threads, window_size, maf, min_mapq, max_dp, report_all_alleles, filter_duplicate, restart=True,
                     split_region_files=[], calc_qual=True):
 
     logger = logging.getLogger(scan_alignments.__name__)
@@ -152,7 +156,7 @@ def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam,
                     shutil.rmtree(work_)
                 map_args.append((os.path.join(work, "work.{}".format(i)),
                                  reference, merge_d_for_scan, scan_alignments_binary, split_region_file,
-                                 input_bam, window_size, maf, min_mapq, max_dp, filter_duplicate, calc_qual))
+                                 input_bam, window_size, maf, min_mapq, max_dp, report_all_alleles, filter_duplicate, calc_qual))
                 not_done.append(i)
             else:
                 all_outputs[i] = [os.path.join(work, "work.{}".format(i), "candidates.vcf"),
@@ -209,6 +213,9 @@ def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam,
     parser.add_argument('--merge_d_for_scan', type=int,
                         help='-d used to merge regions before scan',
                         default=None)
+    parser.add_argument('--report_all_alleles',
+                        help='report all alleles per position',
+                        action="store_true")
     parser.add_argument('--num_splits', type=int,
                         help='number of region splits',
                         default=None)
     parser.add_argument('--num_threads', type=int,
@@ -220,7 +227,7 @@ def scan_alignments(work, merge_d_for_scan, scan_alignments_binary, input_bam,
         outputs = scan_alignments(args.work, args.merge_d_for_scan, args.scan_alignments_binary, args.input_bam,
                                   args.regions_bed_file, args.reference, args.num_splits,
                                   args.num_threads, args.window_size, args.maf,
-                                  args.min_mapq, args.max_dp, args.filter_duplicate)
+                                  args.min_mapq, args.max_dp, args.report_all_alleles, args.filter_duplicate)
     except Exception as e:
         logger.error(traceback.format_exc())
         logger.error("Aborting!")
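With --report_all_alleles, the scan no longer reduces each pileup column to the single best non-reference allele: every allele whose count or allele frequency clears the type-specific threshold becomes its own candidate record, with AF recomputed against the per-column depth dp. A rough Python rendering of that selection loop (base_freq, ref_code and the single min_af below stand in for the C++ pileup counters and the per-type AF cutoffs):

    def collect_alt_alleles(base_freq, ref_code, min_af):
        # base_freq: pileup counts indexed by nucleotide code (A, C, G, T, -, N)
        ref_count = base_freq[ref_code]
        alt_counts = {}
        dp = ref_count
        for row, alt_cnt in enumerate(base_freq):
            if row == ref_code or alt_cnt == 0:
                continue
            af = alt_cnt / float(alt_cnt + ref_count)
            if alt_cnt >= ref_count or af > min_af:
                alt_counts[row] = alt_cnt          # keep every passing allele
                dp += alt_cnt
        return alt_counts, dp

    # e.g. reference A with two competing substitutions:
    collect_alt_alleles([12, 0, 5, 7, 0, 0], ref_code=0, min_af=0.05)
    # -> ({2: 5, 3: 7}, 24): both G and T are emitted, not just the majority allele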
From 798c880690d8133e4be54e9f5cdbd65734ab292d Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Mon, 8 Mar 2021 00:05:41 -0800
Subject: [PATCH 84/89] added strict_labeling

---
 neusomatic/python/generate_dataset.py | 20 +++++++++++++-------
 neusomatic/python/preprocess.py       |  8 ++++++++
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index cfdc280..ffce9da 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -794,7 +794,7 @@ def merge_records(fasta_file, records):
     return [str(chrom), pos_m + 1, ref2_, alt2_]
 
-def is_part_of(record1, record2):
+def is_part_of(record1, record2, strict_labeling):
     logger = logging.getLogger(is_part_of.__name__)
     chrom1, pos1, ref1, alt1 = record1[0:4]
     chrom2, pos2, ref2, alt2 = record2[0:4]
@@ -802,10 +802,10 @@ def is_part_of(record1, record2):
         return False
     vartype1 = get_type(ref1, alt1)
     vartype2 = get_type(ref2, alt2)
-    if vartype1 == "SNP" and vartype2 == "DEL":
+    if (not strict_labeling) and (vartype1 == "SNP" and vartype2 == "DEL"):
         if pos2 < pos1 < pos2 + len(ref2):
             return True
-    elif vartype2 == "SNP" and vartype1 == "DEL":
+    elif (not strict_labeling) and (vartype2 == "SNP" and vartype1 == "DEL"):
         if pos1 < pos2 < pos1 + len(ref1):
             return True
     elif vartype1 == vartype2:
@@ -877,7 +877,7 @@ def keep_in_region(input_file, region_bed,
 
 def find_records(input_record):
-    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, num_ens_features, work_index = input_record
+    work, split_region_file, truth_vcf_file, pred_vcf_file, ref_file, ensemble_bed, num_ens_features, strict_labeling, work_index = input_record
     thread_logger = logging.getLogger(
         "{} ({})".format(find_records.__name__, multiprocessing.current_process().name))
     try:
@@ -1238,7 +1238,7 @@ def find_records(input_record):
             truth_record = truth_records[i]
             tr, eqs = push_lr(fasta_file, truth_record, 2)
             for eq in eqs:
-                if is_part_of(eq, record):
+                if is_part_of(eq, record, strict_labeling):
                     ref_t, alt_t = truth_record[2:4]
                     vartype_t = get_type(ref_t, alt_t)
                     record_center[j] = find_i_center(ref, alt)
@@ -1256,7 +1256,7 @@ def find_records(input_record):
                 ref_p, alt_p = records[p][2:4]
                 tr, eqs = push_lr(fasta_file, records[p], 2)
                 for eq in eqs:
-                    if is_part_of(eq, record):
+                    if is_part_of(eq, record, strict_labeling):
                         vartype = vtype[p]
                         record_center[j] = find_i_center(ref, alt)
                         record_len[j] = find_len(ref_p, alt_p)
@@ -1602,6 +1602,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
                      no_seq_complexity, enforce_header,
                      zero_vscore,
                      matrix_dtype,
+                     strict_labeling,
                      tsv_batch_size):
 
     logger = logging.getLogger(generate_dataset.__name__)
@@ -1671,7 +1672,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     map_args = []
     for i, split_region_file in enumerate(split_region_files):
         map_args.append((work, split_region_file, truth_vcf_file,
-                         tumor_pred_vcf_file, ref_file, ensemble_bed, num_ens_features, i))
+                         tumor_pred_vcf_file, ref_file, ensemble_bed, num_ens_features, strict_labeling, i))
     try:
         records_data = pool.map_async(find_records, map_args).get()
         pool.close()
@@ -1871,6 +1872,9 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     parser.add_argument('--matrix_dtype', type=str,
                         help='matrix_dtype to be used to store matrix',
                         default="uint8", choices=MAT_DTYPES)
+    parser.add_argument('--strict_labeling',
+                        help='strict labeling in train mode',
+                        action="store_true")
     args = parser.parse_args()
 
     logger.info(args)
@@ -1895,6 +1899,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
     enforce_header = args.enforce_header
     zero_vscore = args.zero_vscore
     matrix_dtype = args.matrix_dtype
+    strict_labeling = args.strict_labeling
 
     try:
         generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_bed_file, tumor_count_bed, normal_count_bed, ref_file,
                          matrix_width, matrix_base_pad, min_ev_frac_per_col, min_cov, num_threads, ensemble_tsv,
@@ -1903,6 +1908,7 @@ def generate_dataset(work, truth_vcf_file, mode, tumor_pred_vcf_file, region_be
                          no_seq_complexity, enforce_header,
                          zero_vscore,
                          matrix_dtype,
+                         strict_labeling,
                          tsv_batch_size)
     except Exception as e:
         logger.error(traceback.format_exc())
diff --git a/neusomatic/python/preprocess.py b/neusomatic/python/preprocess.py
index c54cb98..6c5860f 100755
--- a/neusomatic/python/preprocess.py
+++ b/neusomatic/python/preprocess.py
@@ -88,6 +88,7 @@ def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, regi
                             no_feature_recomp_for_ensemble,
                             zero_vscore,
                             matrix_dtype,
+                            strict_labeling,
                             tsv_batch_size):
     logger = logging.getLogger(generate_dataset_region.__name__)
     generate_dataset(work, truth_vcf, mode, filtered_candidates_vcf, region, tumor_count_bed, normal_count_bed, reference,
@@ -97,6 +98,7 @@ def generate_dataset_region(work, truth_vcf, mode, filtered_candidates_vcf, regi
                      no_feature_recomp_for_ensemble,
                      zero_vscore,
                      matrix_dtype,
+                     strict_labeling,
                      tsv_batch_size)
 
@@ -218,6 +220,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                num_splits,
                matrix_dtype,
                report_all_alleles,
+               strict_labeling,
               num_threads,
                scan_alignments_binary,):
     logger = logging.getLogger(preprocess.__name__)
@@ -600,6 +603,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                                     no_seq_complexity, no_feature_recomp_for_ensemble,
                                     zero_vscore,
                                     matrix_dtype,
+                                    strict_labeling,
                                     tsv_batch_size)
 
     shutil.rmtree(bed_tempdir)
@@ -715,6 +719,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
     parser.add_argument('--report_all_alleles',
                         help='report all alleles per position',
                         action="store_true")
+    parser.add_argument('--strict_labeling',
+                        help='strict labeling in train mode',
+                        action="store_true")
     parser.add_argument('--num_threads', type=int,
                         help='number of threads',
                         default=1)
     parser.add_argument('--scan_alignments_binary', type=str,
@@ -743,6 +750,7 @@ def preprocess(work, mode, reference, region_bed, tumor_bam, normal_bam, dbsnp,
                    args.num_splits,
                    args.matrix_dtype,
                    args.report_all_alleles,
+                   args.strict_labeling,
                    args.num_threads,
                    args.scan_alignments_binary)
     except Exception as e:
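--strict_labeling disables the relaxed clause of is_part_of under which a SNP lying inside a deletion's reference span (or vice versa) is treated as part of the same event when labels are assigned. A toy version of just that relaxed clause, with get_type reduced to length comparisons (the function name and records are illustrative):

    def snp_inside_del(record1, record2):
        # relaxed matching: a SNP strictly inside a deletion's reference span
        chrom1, pos1, ref1, alt1 = record1[0:4]
        chrom2, pos2, ref2, alt2 = record2[0:4]
        if chrom1 != chrom2:
            return False
        is_snp = len(ref1) == 1 and len(alt1) == 1
        is_del = len(ref2) > len(alt2)
        return is_snp and is_del and (pos2 < pos1 < pos2 + len(ref2))

    snp_inside_del(["1", 105, "A", "T"], ["1", 100, "ACGTACGTAC", "A"])   # True
    # under --strict_labeling this pair would no longer be matched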
From cdfc0629ed113e1a76d80e8c28b6d4594289d129 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Mon, 8 Mar 2021 00:25:56 -0800
Subject: [PATCH 85/89] fixed strict_labeling

---
 neusomatic/python/generate_dataset.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index ffce9da..609c597 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -794,7 +794,7 @@ def merge_records(fasta_file, records):
     return [str(chrom), pos_m + 1, ref2_, alt2_]
 
-def is_part_of(record1, record2, strict_labeling):
+def is_part_of(record1, record2):
     logger = logging.getLogger(is_part_of.__name__)
     chrom1, pos1, ref1, alt1 = record1[0:4]
     chrom2, pos2, ref2, alt2 = record2[0:4]
@@ -802,10 +802,10 @@ def is_part_of(record1, record2, strict_labeling):
         return False
     vartype1 = get_type(ref1, alt1)
     vartype2 = get_type(ref2, alt2)
-    if (not strict_labeling) and (vartype1 == "SNP" and vartype2 == "DEL"):
+    if (vartype1 == "SNP" and vartype2 == "DEL"):
         if pos2 < pos1 < pos2 + len(ref2):
             return True
-    elif (not strict_labeling) and (vartype2 == "SNP" and vartype1 == "DEL"):
+    elif (vartype2 == "SNP" and vartype1 == "DEL"):
         if pos1 < pos2 < pos1 + len(ref1):
             return True
     elif vartype1 == vartype2:
@@ -1236,9 +1236,12 @@ def find_records(input_record):
             done = False
             for i in i_s:
                 truth_record = truth_records[i]
-                tr, eqs = push_lr(fasta_file, truth_record, 2)
+                if not strict_labeling:
+                    tr, eqs = push_lr(fasta_file, truth_record, 2)
+                else:
+                    tr, eqs = push_lr(fasta_file, truth_record, 0)
                 for eq in eqs:
-                    if is_part_of(eq, record, strict_labeling):
+                    if is_part_of(eq, record):
                         ref_t, alt_t = truth_record[2:4]
                         vartype_t = get_type(ref_t, alt_t)
                         record_center[j] = find_i_center(ref, alt)
@@ -1254,9 +1257,12 @@ def find_records(input_record):
                     perfect_idx)
                 for p in p_s:
                     ref_p, alt_p = records[p][2:4]
-                    tr, eqs = push_lr(fasta_file, records[p], 2)
+                    if not strict_labeling:
+                        tr, eqs = push_lr(fasta_file, truth_record, 2)
+                    else:
+                        tr, eqs = push_lr(fasta_file, truth_record, 0)
                     for eq in eqs:
-                        if is_part_of(eq, record, strict_labeling):
+                        if is_part_of(eq, record):
                             vartype = vtype[p]
                             record_center[j] = find_i_center(ref, alt)
                             record_len[j] = find_len(ref_p, alt_p)
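The fix above also restricts push_lr's search over equivalent indel spellings: the third argument controls shifting, and strict labeling passes 0 so only the record's own spelling can match, while relaxed labeling (2) also tries shifted equivalents. A toy illustration of why an indel in a repeat has several equivalent spellings (seq, coordinates and the helper are illustrative, 0-based):

    def equivalent_shifts(seq, pos, ref, alt, both):
        # enumerate left-shifted spellings of a one-base deletion in `seq`
        eqs = [(pos, ref, alt)]
        if not both:
            return eqs                      # strict: the given spelling only
        while pos > 0 and seq[pos - 1] == ref[-1]:
            pos -= 1
            ref = seq[pos] + ref[:-1]
            alt = seq[pos]
            eqs.append((pos, ref, alt))
        return eqs

    # deleting one A of the run in "ACAAAG" can be spelled at pos 3 or pos 2:
    equivalent_shifts("ACAAAG", 3, "AA", "A", both=True)   # [(3, 'AA', 'A'), (2, 'AA', 'A')]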
From 74a27df8fc15aab6894d9e8063eb1f0bb062cfb5 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Mon, 8 Mar 2021 01:26:39 -0800
Subject: [PATCH 86/89] fixed strict_labeling

---
 neusomatic/python/generate_dataset.py | 35 +++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 609c597..edaead7 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -744,6 +744,35 @@ def push_lr(fasta_file, record, left_right_both):
     return record, eqs
 
+def push_left(fasta_file, record):
+    logger = logging.getLogger(push_left.__name__)
+    record[0] = str(record[0])
+    if "," not in record[3]:
+        if record[2] != record[3]:
+            chrom, pos, ref, alt = record[0:4]
+            new_pos = pos
+            new_ref = ref
+            new_alt = alt
+            while(new_pos > 1):
+                l_base = fasta_file.fetch(
+                    (chrom), new_pos - 2, new_pos - 1).upper()
+                new_ref = l_base + new_ref
+                new_alt = l_base + new_alt
+                new_pos -= 1
+                while(len(new_alt) > 1 and len(new_ref) > 1):
+                    if new_alt[-1] == new_ref[-1]:
+                        new_alt = new_alt[:-1]
+                        new_ref = new_ref[:-1]
+                    else:
+                        break
+                if len(new_alt) > len(alt):
+                    new_ref = new_ref[1:]
+                    new_alt = new_alt[1:]
+                    new_pos += 1
+                    break
+            record = [chrom, new_pos, new_ref, new_alt] + record[4:]
+    return record
+
 def merge_records(fasta_file, records):
     logger = logging.getLogger(merge_records.__name__)
     if len(set(map(lambda x: x[0], records))) != 1:
@@ -1117,8 +1146,10 @@ def find_records(input_record):
                     record[3] = l_base + record[3]
                     record[4] = l_base + record[4]
                     pos -= 1
-                truth_records.append(
-                    [record[0], pos, record[3], record[4], str(i)])
+                tr = [record[0], pos, record[3], record[4], str(i)]
+                if strict_labeling:
+                    tr = push_left(fasta_file, tr)
+                truth_records.append(tr)
                 i += 1
 
     truth_bed = get_tmp_file()

From a2eed5ed83949d672ab150ce65c92fe5e43937d7 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 9 Mar 2021 21:08:43 -0800
Subject: [PATCH 87/89] small fix

---
 neusomatic/python/generate_dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index edaead7..4146247 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -1289,9 +1289,9 @@ def find_records(input_record):
             for p in p_s:
                 ref_p, alt_p = records[p][2:4]
                 if not strict_labeling:
-                    tr, eqs = push_lr(fasta_file, truth_record, 2)
+                    tr, eqs = push_lr(fasta_file, records[p], 2)
                 else:
-                    tr, eqs = push_lr(fasta_file, truth_record, 0)
+                    tr, eqs = push_lr(fasta_file, records[p], 0)
                 for eq in eqs:
                     if is_part_of(eq, record):
                         vartype = vtype[p]

From 399b15cc53c5d25c0c8bf1fe24df99188572daf6 Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Tue, 9 Mar 2021 21:53:50 -0800
Subject: [PATCH 88/89] fixed strict_labeling

---
 neusomatic/python/generate_dataset.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 4146247..8646c6f 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -773,6 +773,7 @@ def push_left(fasta_file, record):
             record = [chrom, new_pos, new_ref, new_alt] + record[4:]
     return record
 
+
 def merge_records(fasta_file, records):
     logger = logging.getLogger(merge_records.__name__)
     if len(set(map(lambda x: x[0], records))) != 1:
@@ -1190,6 +1191,7 @@ def find_records(input_record):
         good_records = {"INS": [], "DEL": [], "SNP": []}
         vtype = {}
         record_len = {}
+        perfect_t_idx = set([])
         for i, js in map_truth_2_pred.items():
             truth_record = truth_records[i]
             for j in js:
@@ -1203,6 +1205,7 @@ def find_records(input_record):
                         record_len[j] = find_len(ref, alt)
                         good_records[vartype].append(j)
                         vtype[j] = vartype
+                        perfect_t_idx.add(i)
 
         good_records_idx = [i for w in list(good_records.values()) for i in w]
         remained_idx = sorted(set(range(len(records))) -
@@ -1250,6 +1253,7 @@ def find_records(input_record):
                         record_len[j] = find_len(ref, alt)
                         good_records[vartype].append(j)
                         vtype[j] = vartype
+                        perfect_t_idx |= set(t_i)
                         done_js.append(j)
                         done_js_.append(j)
                         done_is_.extend(t_i)
@@ -1266,6 +1270,8 @@ def find_records(input_record):
             i_s = map_pred_2_truth[j]
             done = False
             for i in i_s:
+                if strict_labeling and (i not in perfect_t_idx):
+                    continue
                 truth_record = truth_records[i]
                 if not strict_labeling:
                     tr, eqs = push_lr(fasta_file, truth_record, 2)
@@ -1342,7 +1348,7 @@ def find_records(input_record):
                 vartype = get_type(record[2], record[3])
                 pos, ref, alt = record[1:4]
                 rc = find_i_center(ref, alt)
-                if vartype_t == vartype and pos_t == pos:
+                if vartype_t == vartype and pos_t == pos and ((not strict_labeling) or vartype_t != "SNP"):
                     good_records[vartype_t].append(j)
                     vtype[j] = vartype_t
                     record_len[j] = find_len(ref_t, alt_t)
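Patches 86-88 make strict labeling operate on canonical truth records: push_left left-normalizes each truth variant once, up front, and the new perfect_t_idx set restricts partial matching to truth records that already produced an exact match elsewhere. In miniature, the gate added to find_records behaves like this (the index sets are illustrative):

    def candidate_truth_indices(i_s, perfect_t_idx, strict_labeling):
        # under strict labeling, only truth records that matched exactly
        # elsewhere may still label further overlapping candidates
        return [i for i in i_s if (not strict_labeling) or (i in perfect_t_idx)]

    candidate_truth_indices([0, 1, 2], {0, 2}, strict_labeling=True)    # [0, 2]
    candidate_truth_indices([0, 1, 2], {0, 2}, strict_labeling=False)   # [0, 1, 2]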
From 13c45b7e4b5bf1636945596c3889172dc011641c Mon Sep 17 00:00:00 2001
From: Sahraeian
Date: Wed, 10 Mar 2021 16:48:35 -0800
Subject: [PATCH 89/89] small fix for strict_labeling

---
 neusomatic/cpp/scan_alignments.cpp     |  2 +-
 neusomatic/python/filter_candidates.py |  2 +-
 neusomatic/python/generate_dataset.py  | 81 ++++++++++++++------------
 3 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/neusomatic/cpp/scan_alignments.cpp b/neusomatic/cpp/scan_alignments.cpp
index 3f8c552..d93adad 100644
--- a/neusomatic/cpp/scan_alignments.cpp
+++ b/neusomatic/cpp/scan_alignments.cpp
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
           auto var_code = ref_code;
           int var_count = 0;
           int dp = ref_count;
-          if (report_all_alleles){
+          if (report_all_alleles and ref_base != '-'){
             for (int row = 0; row < cols[i].base_freq_.size(); ++row) {
               auto alt_cnt = cols[i].base_freq_[row];
               if (( row != ref_code) and (alt_cnt > 0)){
diff --git a/neusomatic/python/filter_candidates.py b/neusomatic/python/filter_candidates.py
index 1fa395e..aa7a063 100755
--- a/neusomatic/python/filter_candidates.py
+++ b/neusomatic/python/filter_candidates.py
@@ -109,7 +109,7 @@ def filter_candidates(candidate_record):
             else:
                 ins = [ins[0][:-1]]
             good_records.extend(ins)
-        if dels and (ins or list(filter(lambda x: x[3] != "N" and x[2] != "N", rs))):
+        if dels and (ins or len(list(filter(lambda x: x[3] == "N" and x[2] != "N", rs))) == 0):
             # emit del
             if len(dels) == 1:
                 ro = dels[0][5]
diff --git a/neusomatic/python/generate_dataset.py b/neusomatic/python/generate_dataset.py
index 8646c6f..0753a8f 100755
--- a/neusomatic/python/generate_dataset.py
+++ b/neusomatic/python/generate_dataset.py
@@ -1357,50 +1357,55 @@ def find_records(input_record):
     good_records_idx = [i for w in list(good_records.values()) for i in w]
     remained_idx = sorted(set(range(len(records))) -
                           (set(good_records_idx) | set(none_records_ids)))
-    for i, js in map_truth_2_pred.items():
-        truth_record = truth_records[i]
+    if not strict_labeling:
+        for i, js in map_truth_2_pred.items():
+            truth_record = truth_records[i]
 
-        if set(js) & set(good_records_idx):
-            continue
-        pos_t, ref_t, alt_t = truth_record[1:4]
-        vartype_t = get_type(ref_t, alt_t)
-        rct = find_i_center(ref_t, alt_t)
-        for j in js:
-            if j not in remained_idx:
-                continue
-            record = records[j]
-            vartype = get_type(record[2], record[3])
-            pos, ref, alt = record[1:4]
-            rc = find_i_center(ref, alt)
-            if pos_t + rct[0] + rct[1] == pos + rc[0] + rc[1]:
-                if (vartype_t == "INS" and vartype == "SNP") or (vartype == "INS" and vartype_t == "SNP"):
+            if set(js) & set(good_records_idx):
+                continue
+            pos_t, ref_t, alt_t = truth_record[1:4]
+            vartype_t = get_type(ref_t, alt_t)
+            rct = find_i_center(ref_t, alt_t)
+            for j in js:
+                if j not in remained_idx:
+                    continue
+                record = records[j]
+                vartype = get_type(record[2], record[3])
+                pos, ref, alt = record[1:4]
+                rc = find_i_center(ref, alt)
+                if pos_t + rct[0] + rct[1] == pos + rc[0] + rc[1]:
+                    if (vartype_t == "INS" and vartype == "SNP") or (vartype == "INS" and vartype_t == "SNP"):
                         good_records[vartype_t].append(j)
                         vtype[j] = vartype_t
                         record_len[j] = find_len(ref_t, alt_t)
                         record_center[j] = rc
+        good_records_idx = [i for w in list(
+            good_records.values()) for i in w]
+        remained_idx = sorted(set(range(len(records))) -
+                              (set(good_records_idx) | set(none_records_ids)))
+
+    if not strict_labeling:
+        for i, js in map_truth_2_pred.items():
+            truth_record = truth_records[i]
+            if set(js) & set(good_records_idx):
+                continue
+            pos_t, ref_t, alt_t = truth_record[1:4]
+            vartype_t = get_type(ref_t, alt_t)
+            for j in js:
+                record = records[j]
+                pos, ref, alt = record[1:4]
+                vartype = get_type(record[2], record[3])
+                if (vartype == vartype_t) and vartype_t != "SNP" and abs(pos - pos_t) < 2:
                     good_records[vartype_t].append(j)
                     vtype[j] = vartype_t
+                    record_center[j] = find_i_center(ref, alt)
                     record_len[j] = find_len(ref_t, alt_t)
-                record_center[j] = rc
-    for i, js in map_truth_2_pred.items():
-        truth_record = truth_records[i]
-        if set(js) & set(good_records_idx):
-            continue
-        pos_t, ref_t, alt_t = truth_record[1:4]
-        vartype_t = get_type(ref_t, alt_t)
-        for j in js:
-            record = records[j]
-            pos, ref, alt = record[1:4]
-            vartype = get_type(record[2], record[3])
-            if (vartype == vartype_t) and vartype_t != "SNP" and abs(pos - pos_t) < 2:
-                good_records[vartype_t].append(j)
-                vtype[j] = vartype_t
-                record_center[j] = find_i_center(ref, alt)
-                record_len[j] = find_len(ref_t, alt_t)
-    good_records_idx = [i for w in list(good_records.values()) for i in w]
-    remained_idx = sorted(set(range(len(records))) -
-                          (set(good_records_idx) | set(none_records_ids)))
+        good_records_idx = [i for w in list(
+            good_records.values()) for i in w]
+        remained_idx = sorted(set(range(len(records))) -
+                              (set(good_records_idx) | set(none_records_ids)))
 
     for i, js in map_truth_2_pred.items():
         truth_record = truth_records[i]