Commit v.3.0.1

rawagiha committed Aug 5, 2021
1 parent 27a1b39 commit 6d689b5
Showing 11 changed files with 62 additions and 57 deletions.
5 changes: 2 additions & 3 deletions rnaindel/analysis/alignment_feature_calculator.py
@@ -58,7 +58,7 @@ def alignment_features(df, bam, mapq, downsample_threshold=1000):

     df = df[df["alt_count"] > 1]

-    return df
+    return df.reset_index(drop=True)


 def _wrapper(row, bam, mapq, downsample_threshold):
@@ -347,8 +347,7 @@ def infer_del_seq_from_data(non_target_reads, target_deletion):
         lt_seq, rt_seq = split(non_target_read, target_deletion, is_for_ref=False)
         if len(rt_seq) > del_len:
             inferred = rt_seq[:del_len]
-            if inferred != del_seq:
-                non_ref_del_seq.append(inferred)
+            non_ref_del_seq.append(inferred)

     if non_ref_del_seq:
         return most_common(non_ref_del_seq)
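
The second hunk above removes the guard that skipped inferred sequences equal to the called deletion, so those now vote in the majority pick as well. A minimal sketch of the resulting tally, assuming most_common is a simple Counter-based majority vote (toy sequences, not rnaindel's actual helper):

    from collections import Counter

    def most_common(seqs):
        # majority vote over the inferred deletion sequences
        return Counter(seqs).most_common(1)[0][0]

    # hypothetical inferred sequences collected from non-target reads;
    # after this commit, entries equal to the called del_seq are appended too
    non_ref_del_seq = ["ACG", "ACG", "ACT"]
    print(most_common(non_ref_del_seq))  # -> ACG
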
12 changes: 8 additions & 4 deletions rnaindel/analysis/classifier.py
@@ -11,6 +11,7 @@

 warnings.filterwarnings("ignore")

+
 def classify(df, model_dir, num_of_processes):
     """ Makes prediction
     Args:
@@ -44,7 +45,7 @@ def calculate_proba(df, model_dir, num_of_processes):

     sni_features = feature_dict["single_nucleotide_indels"]
     mni_features = feature_dict["multi_nucleotide_indels"]
-
+
     # to keep the original row order
     df["order"] = df.index
     df_sni, df_mni = split_by_indel_size(df)
@@ -76,7 +77,7 @@ def calculate_proba(df, model_dir, num_of_processes):
         dfp_mni = pd.DataFrame(data=mni_proba)
         dfp_mni.columns = header
     else:
-        dfp_mni= pd.DataFrame(columns=header)
+        dfp_mni = pd.DataFrame(columns=header)

     df_mni = pd.concat([df_mni, dfp_mni], axis=1)

@@ -112,13 +113,16 @@ def make_feature_dict(model_dir):
         model_dir (str): path to data directory where "features.txt" is located.
     Returns:
         feature_dict (dict): {indel_class : [feture names]}
-    """
+    """
     features = os.path.join(model_dir, "features.txt")
     f = open(features)
-    feature_dict = {line.split("\t")[0]: line.rstrip().split("\t")[1].split(";") for line in f}
+    feature_dict = {
+        line.split("\t")[0]: line.rstrip().split("\t")[1].split(";") for line in f
+    }
     f.close()
     return feature_dict


+
 def predict(model, data, features):
     """ Calculate prediction probabaility
     Args:
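
For reference, the reformatted comprehension in make_feature_dict reads a tab-separated features.txt that maps an indel class to a semicolon-joined feature list. A self-contained sketch of that parse; the file contents below are illustrative, not the shipped ones:

    import io

    # hypothetical features.txt: <indel_class>\t<feature1;feature2;...>
    features_txt = (
        "single_nucleotide_indels\tindel_size;alt_count\n"
        "multi_nucleotide_indels\tindel_size;is_inframe\n"
    )

    f = io.StringIO(features_txt)
    feature_dict = {
        line.split("\t")[0]: line.rstrip().split("\t")[1].split(";") for line in f
    }
    f.close()
    print(feature_dict["single_nucleotide_indels"])  # -> ['indel_size', 'alt_count']
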
5 changes: 3 additions & 2 deletions rnaindel/analysis/coding_indel.py
@@ -1,5 +1,6 @@
 from .sequence_properties import exists_stop_codon

+
 def annotate_coding_info(indel, coding_gene_db):
     """Generate coding indel objects
@@ -27,9 +28,9 @@ def annotate_coding_info(indel, coding_gene_db):
     indel_type_db = 1 if indel_type == "I" else 0  # conversion for coding_gene_db style
     pos = pos + 1  # conversion for bambino style
     margin = 11  # allow a margin of 11bp for splice region
-
+
     chrom = chrom if chrom.startswith("chr") else "chr" + chrom
-
+
     try:
         candidate_genes = coding_gene_db.fetch(chrom, pos - margin, pos + margin)
     except:
1 change: 1 addition & 0 deletions rnaindel/analysis/outlier.py
@@ -6,6 +6,7 @@
 warnings.simplefilter("ignore")
 import pickle

+
 def outlier_analysis(df, model_dir):

     _df = df[df["is_rescurable_homopolymer"]].reset_index(drop=True)
40 changes: 23 additions & 17 deletions rnaindel/defaultcaller/defaultcaller.py
@@ -6,8 +6,9 @@

 CANONICALS = [str(i) for i in range(1, 23)] + ["X", "Y"]

+
 def callindel(bam, fasta, tmp_dir, heap_memory, region, num_of_processes):

     # Add Bambino home dir to CLASSPATH
     bambino_home = os.path.dirname(os.path.realpath(__file__))
     try:
@@ -29,40 +30,47 @@ def callindel(bam, fasta, tmp_dir, heap_memory, region, num_of_processes):
             heap_memory, fasta, bam
         )
     )

     if region:

         outfile = os.path.join(tmp_dir, "outfile.txt")

-        tmp = region.split(":")
+        tmp = region.split(":")
         chrom = tmp[0]
         tmp2 = tmp[1].split("-")
         start, stop = tmp2[0], tmp2[1]
-        cmd_str = base_cmd_str + " -chr {} -start {} -end {} -of {}".format(chrom, start, stop, outfile)
+
+        cmd_str = base_cmd_str + " -chr {} -start {} -end {} -of {}".format(
+            chrom, start, stop, outfile
+        )

         stdout, stderr, return_code = run_shell_command(cmd_str)

     else:
         if num_of_processes > 1:
-            cmds_by_chrom = [base_cmd_str + " -chr {} -of {}".format(chrom, os.path.join(tmp_dir, "chr{}.txt").format(chrom)) for chrom in CANONICALS]
+            cmds_by_chrom = [
+                base_cmd_str
+                + " -chr {} -of {}".format(
+                    chrom, os.path.join(tmp_dir, "chr{}.txt").format(chrom)
+                )
+                for chrom in CANONICALS
+            ]
             pool = Pool(num_of_processes)

             caller_returns = pool.map(run_shell_command, cmds_by_chrom)
         else:

             outfile = os.path.join(tmp_dir, "outfile.txt")

             cmd_str = base_cmd_str + " -of {}".format(outfile)
             stdout, stderr, return_code = run_shell_command(cmd_str)

-    #check_caller_return(stdout, stderr, return_code)
+    # check_caller_return(stdout, stderr, return_code)

-    #if return_code != 0 or not os.path.isfile(output_file):
+    # if return_code != 0 or not os.path.isfile(output_file):
     #    print("Failed while calling indels.", file=sys.stderr)
     #    print(stderr, file=sys.stderr)
     #    sys.exit(return_code)
-    #else:
+    # else:
     #    if os.stat(output_file).st_size == 0:
     #        print(
     #            "No variants called. Check if the input reference FASTA file is the same file used for mapping.",
@@ -93,8 +101,8 @@ def run_shell_command(command_string):
     return stdout, stderr, return_code


-#def check_caller_return(stdout, stderr, return_code):
-#
+# def check_caller_return(stdout, stderr, return_code):
+#
 #    if return_code != 0 :
 #        print("Failed while calling indels.", file=sys.stderr)
 #        print(stderr, file=sys.stderr)
@@ -111,9 +119,7 @@ def run_shell_command(command_string):
 #

-
-

 def check_file(file_path, file_name):
     if not os.path.isfile(file_path):
         sys.exit("Error: {} Not Found.".format(file_name))
     return file_path
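
The region branch reformatted above assumes a samtools-style chrom:start-stop string and feeds the pieces to the caller's -chr/-start/-end flags. A standalone sketch of that split, with a hypothetical region and a placeholder command stem:

    base_cmd_str = "java -Xmx6000m ..."  # stands in for the command built in callindel
    region = "chr7:55242415-55242513"  # hypothetical region string

    tmp = region.split(":")
    chrom = tmp[0]
    tmp2 = tmp[1].split("-")
    start, stop = tmp2[0], tmp2[1]

    cmd_str = base_cmd_str + " -chr {} -start {} -end {} -of {}".format(
        chrom, start, stop, "outfile.txt"
    )
    print(cmd_str)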

1 change: 1 addition & 0 deletions rnaindel/rnaindel.py
@@ -55,5 +55,6 @@ def Train(self):
     def CountOccurrence(self):
         oc.count()

+
 if __name__ == "__main__":
     sys.exit(main())
13 changes: 7 additions & 6 deletions rnaindel/training/__init__.py
@@ -1,7 +1,8 @@
 from .trainer import *
-#from .downsampler import *
-#from .feature_selector import *
-#from .parameter_tuner import *
-#from .model_updater import *
-#from .result_reporter import *
-#from .model_selection import *
+
+# from .downsampler import *
+# from .feature_selector import *
+# from .parameter_tuner import *
+# from .model_updater import *
+# from .result_reporter import *
+# from .model_selection import *
12 changes: 3 additions & 9 deletions rnaindel/training/downsampler.py
@@ -10,13 +10,7 @@


 def downsampler(
-    df,
-    k,
-    indel_class,
-    beta,
-    num_of_processes,
-    downsample_ratio,
-    max_features="auto",
+    df, k, indel_class, beta, num_of_processes, downsample_ratio, max_features="auto",
 ):
     """Optimize downsampling ratio optimizing F beta: somatic:germline:artifact = 1:1:x
@@ -41,13 +35,13 @@ def downsampler(

     # somatic:germline:artifact = 1:1:x for 1 <= x <= 20
     result_dict_lst = []
-
+
     search_range = (
         range(downsample_ratio, downsample_ratio + 1)
         if downsample_ratio
         else range(1, 21)
     )
-
+
     for x in search_range:
         # k-fold CrossValidation
         stats = perform_k_fold_cv(
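
The search_range expression above either pins the scan to a user-supplied artifact ratio or sweeps x = 1..20. A small standalone illustration of the same ternary, with made-up inputs:

    def make_search_range(downsample_ratio):
        # a given ratio yields a single-value range; a falsy one scans 1..20
        return (
            range(downsample_ratio, downsample_ratio + 1)
            if downsample_ratio
            else range(1, 21)
        )

    print(list(make_search_range(5)))     # -> [5]
    print(list(make_search_range(None)))  # -> [1, 2, ..., 20]
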
8 changes: 4 additions & 4 deletions rnaindel/training/model_updater.py
@@ -74,19 +74,19 @@ def update_coverage_info(df, indel_class, model_dir):
     fr = open(coveragefile, "r")
     data = [line.rstrip() for line in fr.readlines()]
     fr.close()
-
+
     fw = open(coveragefile, "w")
     for line in data:
         if line.startswith(indel_class):
             if indel_class == "s":
                 df = df[df["indel_size"] == 1]
             else:
                 df = df[df["indel_size"] > 1]
-            df["cov"] = df.apply(lambda x: (x["ref_count"] + x["alt_count"]), axis=1)
-            coverage_quantile = int(df["cov"].quantile(.9))
+            df["cov"] = df.apply(lambda x: (x["ref_count"] + x["alt_count"]), axis=1)
+            coverage_quantile = int(df["cov"].quantile(0.9))
             class_to_be_updated = line.split("\t")[0]
             newline = class_to_be_updated + "\t" + str(coverage_quantile)
             fw.write(newline + "\n")
         else:
             fw.write(line + "\n")
-    fw.close()
+    fw.close()
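
The rewritten lines compute per-variant coverage as ref_count + alt_count and store its 90th percentile (the quantile(.9) -> quantile(0.9) change is cosmetic). A pandas sketch on made-up counts:

    import pandas as pd

    # hypothetical training rows
    df = pd.DataFrame({"ref_count": [10, 40, 5, 80], "alt_count": [2, 10, 3, 20]})

    df["cov"] = df.apply(lambda x: (x["ref_count"] + x["alt_count"]), axis=1)
    coverage_quantile = int(df["cov"].quantile(0.9))
    print(coverage_quantile)  # int of the 90th-percentile coverage -> 85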
9 changes: 1 addition & 8 deletions rnaindel/training/parameter_tuner.py
@@ -10,14 +10,7 @@


 def tuner(
-    df,
-    k,
-    indel_class,
-    artifact_ratio,
-    features,
-    beta,
-    num_of_processes,
-    auto_param,
+    df, k, indel_class, artifact_ratio, features, beta, num_of_processes, auto_param,
 ):
     """Optimize the maximum number of features considered in sklearn random forest model
13 changes: 9 additions & 4 deletions rnaindel/training/trainer.py
@@ -12,6 +12,7 @@
 from .result_reporter import reporter
 from .homopolyer_trainer import train_homolopolymer

+
 def train():

     subcommand = "Train"
@@ -21,7 +22,7 @@ def train():
     data_dir = args.data_dir.rstrip("/")

     df = input_validator(args.training_data, indel_class)
-
+
     if indel_class in ["s", "m"]:

         # downsampling
@@ -60,7 +61,9 @@ def train():

     # update models
     model_dir = "{}/models".format(data_dir)
-    updater(df, args.indel_class, artifact_ratio, feature_lst, max_features, model_dir)
+    updater(
+        df, args.indel_class, artifact_ratio, feature_lst, max_features, model_dir
+    )

     # make report
     reporter(
@@ -86,8 +89,10 @@ def train():
             else "multi-nucleotide indels"
         )

-        print("rnaindel training for " + msg + " completed successfully.", file=sys.stdout)
-
+        print(
+            "rnaindel training for " + msg + " completed successfully.", file=sys.stdout
+        )
+
     else:
         model_dir = "{}/outliers".format(data_dir)
         train_homolopolymer(df, model_dir)
