diff --git a/rnaindel/analysis/alignment_feature_calculator.py b/rnaindel/analysis/alignment_feature_calculator.py index f1b8137..de8b85c 100755 --- a/rnaindel/analysis/alignment_feature_calculator.py +++ b/rnaindel/analysis/alignment_feature_calculator.py @@ -58,7 +58,7 @@ def alignment_features(df, bam, mapq, downsample_threshold=1000): df = df[df["alt_count"] > 1] - return df + return df.reset_index(drop=True) def _wrapper(row, bam, mapq, downsample_threshold): @@ -347,8 +347,7 @@ def infer_del_seq_from_data(non_target_reads, target_deletion): lt_seq, rt_seq = split(non_target_read, target_deletion, is_for_ref=False) if len(rt_seq) > del_len: inferred = rt_seq[:del_len] - if inferred != del_seq: - non_ref_del_seq.append(inferred) + non_ref_del_seq.append(inferred) if non_ref_del_seq: return most_common(non_ref_del_seq) diff --git a/rnaindel/analysis/classifier.py b/rnaindel/analysis/classifier.py index 4192f87..ef1a71d 100755 --- a/rnaindel/analysis/classifier.py +++ b/rnaindel/analysis/classifier.py @@ -11,6 +11,7 @@ warnings.filterwarnings("ignore") + def classify(df, model_dir, num_of_processes): """ Makes prediction Args: @@ -44,7 +45,7 @@ def calculate_proba(df, model_dir, num_of_processes): sni_features = feature_dict["single_nucleotide_indels"] mni_features = feature_dict["multi_nucleotide_indels"] - + # to keep the original row order df["order"] = df.index df_sni, df_mni = split_by_indel_size(df) @@ -76,7 +77,7 @@ def calculate_proba(df, model_dir, num_of_processes): dfp_mni = pd.DataFrame(data=mni_proba) dfp_mni.columns = header else: - dfp_mni= pd.DataFrame(columns=header) + dfp_mni = pd.DataFrame(columns=header) df_mni = pd.concat([df_mni, dfp_mni], axis=1) @@ -112,13 +113,16 @@ def make_feature_dict(model_dir): model_dir (str): path to data directory where "features.txt" is located. Returns: feature_dict (dict): {indel_class : [feture names]} - """ + """ features = os.path.join(model_dir, "features.txt") f = open(features) - feature_dict = {line.split("\t")[0]: line.rstrip().split("\t")[1].split(";") for line in f} + feature_dict = { + line.split("\t")[0]: line.rstrip().split("\t")[1].split(";") for line in f + } f.close() return feature_dict + def predict(model, data, features): """ Calculate prediction probabaility Args: diff --git a/rnaindel/analysis/coding_indel.py b/rnaindel/analysis/coding_indel.py index bc818dd..80f8d4d 100755 --- a/rnaindel/analysis/coding_indel.py +++ b/rnaindel/analysis/coding_indel.py @@ -1,5 +1,6 @@ from .sequence_properties import exists_stop_codon + def annotate_coding_info(indel, coding_gene_db): """Generate coding indel objects @@ -27,9 +28,9 @@ def annotate_coding_info(indel, coding_gene_db): indel_type_db = 1 if indel_type == "I" else 0 # conversion for coding_gene_db style pos = pos + 1 # conversion for bambino style margin = 11 # allow a margin of 11bp for splice region - + chrom = chrom if chrom.startswith("chr") else "chr" + chrom - + try: candidate_genes = coding_gene_db.fetch(chrom, pos - margin, pos + margin) except: diff --git a/rnaindel/analysis/outlier.py b/rnaindel/analysis/outlier.py index 7d73e71..d816f8c 100755 --- a/rnaindel/analysis/outlier.py +++ b/rnaindel/analysis/outlier.py @@ -6,6 +6,7 @@ warnings.simplefilter("ignore") import pickle + def outlier_analysis(df, model_dir): _df = df[df["is_rescurable_homopolymer"]].reset_index(drop=True) diff --git a/rnaindel/defaultcaller/defaultcaller.py b/rnaindel/defaultcaller/defaultcaller.py index fcca6c9..01b318f 100755 --- a/rnaindel/defaultcaller/defaultcaller.py +++ b/rnaindel/defaultcaller/defaultcaller.py @@ -6,8 +6,9 @@ CANONICALS = [str(i) for i in range(1, 23)] + ["X", "Y"] + def callindel(bam, fasta, tmp_dir, heap_memory, region, num_of_processes): - + # Add Bambino home dir to CLASSPATH bambino_home = os.path.dirname(os.path.realpath(__file__)) try: @@ -29,40 +30,47 @@ def callindel(bam, fasta, tmp_dir, heap_memory, region, num_of_processes): heap_memory, fasta, bam ) ) - + if region: - + outfile = os.path.join(tmp_dir, "outfile.txt") - tmp = region.split(":") + tmp = region.split(":") chrom = tmp[0] tmp2 = tmp[1].split("-") start, stop = tmp2[0], tmp2[1] - cmd_str = base_cmd_str + " -chr {} -start {} -end {} -of {}".format(chrom, start, stop, outfile) - + cmd_str = base_cmd_str + " -chr {} -start {} -end {} -of {}".format( + chrom, start, stop, outfile + ) + stdout, stderr, return_code = run_shell_command(cmd_str) else: if num_of_processes > 1: - cmds_by_chrom = [base_cmd_str + " -chr {} -of {}".format(chrom, os.path.join(tmp_dir, "chr{}.txt").format(chrom)) for chrom in CANONICALS] + cmds_by_chrom = [ + base_cmd_str + + " -chr {} -of {}".format( + chrom, os.path.join(tmp_dir, "chr{}.txt").format(chrom) + ) + for chrom in CANONICALS + ] pool = Pool(num_of_processes) - + caller_returns = pool.map(run_shell_command, cmds_by_chrom) else: - + outfile = os.path.join(tmp_dir, "outfile.txt") cmd_str = base_cmd_str + " -of {}".format(outfile) stdout, stderr, return_code = run_shell_command(cmd_str) - - #check_caller_return(stdout, stderr, return_code) + # check_caller_return(stdout, stderr, return_code) - #if return_code != 0 or not os.path.isfile(output_file): + # if return_code != 0 or not os.path.isfile(output_file): # print("Failed while calling indels.", file=sys.stderr) # print(stderr, file=sys.stderr) # sys.exit(return_code) - #else: + # else: # if os.stat(output_file).st_size == 0: # print( # "No variants called. Check if the input reference FASTA file is the same file used for mapping.", @@ -93,8 +101,8 @@ def run_shell_command(command_string): return stdout, stderr, return_code -#def check_caller_return(stdout, stderr, return_code): -# +# def check_caller_return(stdout, stderr, return_code): +# # if return_code != 0 : # print("Failed while calling indels.", file=sys.stderr) # print(stderr, file=sys.stderr) @@ -111,9 +119,7 @@ def run_shell_command(command_string): # - def check_file(file_path, file_name): if not os.path.isfile(file_path): sys.exit("Error: {} Not Found.".format(file_name)) return file_path - diff --git a/rnaindel/rnaindel.py b/rnaindel/rnaindel.py index 3d24581..123c26c 100755 --- a/rnaindel/rnaindel.py +++ b/rnaindel/rnaindel.py @@ -55,5 +55,6 @@ def Train(self): def CountOccurrence(self): oc.count() + if __name__ == "__main__": sys.exit(main()) diff --git a/rnaindel/training/__init__.py b/rnaindel/training/__init__.py index a834071..6e4433d 100755 --- a/rnaindel/training/__init__.py +++ b/rnaindel/training/__init__.py @@ -1,7 +1,8 @@ from .trainer import * -#from .downsampler import * -#from .feature_selector import * -#from .parameter_tuner import * -#from .model_updater import * -#from .result_reporter import * -#from .model_selection import * + +# from .downsampler import * +# from .feature_selector import * +# from .parameter_tuner import * +# from .model_updater import * +# from .result_reporter import * +# from .model_selection import * diff --git a/rnaindel/training/downsampler.py b/rnaindel/training/downsampler.py index 977f24a..d8da6d2 100755 --- a/rnaindel/training/downsampler.py +++ b/rnaindel/training/downsampler.py @@ -10,13 +10,7 @@ def downsampler( - df, - k, - indel_class, - beta, - num_of_processes, - downsample_ratio, - max_features="auto", + df, k, indel_class, beta, num_of_processes, downsample_ratio, max_features="auto", ): """Optimize downsampling ratio optimizing F beta: somatic:germline:artifact = 1:1:x @@ -41,13 +35,13 @@ def downsampler( # somatic:germline:artifact = 1:1:x for 1 <= x <= 20 result_dict_lst = [] - + search_range = ( range(downsample_ratio, downsample_ratio + 1) if downsample_ratio else range(1, 21) ) - + for x in search_range: # k-fold CrossValidation stats = perform_k_fold_cv( diff --git a/rnaindel/training/model_updater.py b/rnaindel/training/model_updater.py index 57511d7..4da6f5d 100755 --- a/rnaindel/training/model_updater.py +++ b/rnaindel/training/model_updater.py @@ -74,7 +74,7 @@ def update_coverage_info(df, indel_class, model_dir): fr = open(coveragefile, "r") data = [line.rstrip() for line in fr.readlines()] fr.close() - + fw = open(coveragefile, "w") for line in data: if line.startswith(indel_class): @@ -82,11 +82,11 @@ def update_coverage_info(df, indel_class, model_dir): df = df[df["indel_size"] == 1] else: df = df[df["indel_size"] > 1] - df["cov"] = df.apply(lambda x: (x["ref_count"] + x["alt_count"]), axis=1) - coverage_quantile = int(df["cov"].quantile(.9)) + df["cov"] = df.apply(lambda x: (x["ref_count"] + x["alt_count"]), axis=1) + coverage_quantile = int(df["cov"].quantile(0.9)) class_to_be_updated = line.split("\t")[0] newline = class_to_be_updated + "\t" + str(coverage_quantile) fw.write(newline + "\n") else: fw.write(line + "\n") - fw.close() + fw.close() diff --git a/rnaindel/training/parameter_tuner.py b/rnaindel/training/parameter_tuner.py index 97d636c..46cdea7 100755 --- a/rnaindel/training/parameter_tuner.py +++ b/rnaindel/training/parameter_tuner.py @@ -10,14 +10,7 @@ def tuner( - df, - k, - indel_class, - artifact_ratio, - features, - beta, - num_of_processes, - auto_param, + df, k, indel_class, artifact_ratio, features, beta, num_of_processes, auto_param, ): """Optimize the maximum number of features considered in sklearn random forest model diff --git a/rnaindel/training/trainer.py b/rnaindel/training/trainer.py index a201258..9d621a2 100755 --- a/rnaindel/training/trainer.py +++ b/rnaindel/training/trainer.py @@ -12,6 +12,7 @@ from .result_reporter import reporter from .homopolyer_trainer import train_homolopolymer + def train(): subcommand = "Train" @@ -21,7 +22,7 @@ def train(): data_dir = args.data_dir.rstrip("/") df = input_validator(args.training_data, indel_class) - + if indel_class in ["s", "m"]: # downsampling @@ -60,7 +61,9 @@ def train(): # update models model_dir = "{}/models".format(data_dir) - updater(df, args.indel_class, artifact_ratio, feature_lst, max_features, model_dir) + updater( + df, args.indel_class, artifact_ratio, feature_lst, max_features, model_dir + ) # make report reporter( @@ -86,8 +89,10 @@ def train(): else "multi-nucleotide indels" ) - print("rnaindel training for " + msg + " completed successfully.", file=sys.stdout) - + print( + "rnaindel training for " + msg + " completed successfully.", file=sys.stdout + ) + else: model_dir = "{}/outliers".format(data_dir) train_homolopolymer(df, model_dir)