Commit v.3.0.1

rawagiha committed Aug 5, 2021
1 parent 27a1b39 commit 6d689b5
Showing 11 changed files with 62 additions and 57 deletions.
5 changes: 2 additions & 3 deletions rnaindel/analysis/alignment_feature_calculator.py
@@ -58,7 +58,7 @@ def alignment_features(df, bam, mapq, downsample_threshold=1000):

     df = df[df["alt_count"] > 1]

-    return df
+    return df.reset_index(drop=True)


 def _wrapper(row, bam, mapq, downsample_threshold):
@@ -347,8 +347,7 @@ def infer_del_seq_from_data(non_target_reads, target_deletion):
         lt_seq, rt_seq = split(non_target_read, target_deletion, is_for_ref=False)
         if len(rt_seq) > del_len:
             inferred = rt_seq[:del_len]
-            if inferred != del_seq:
-                non_ref_del_seq.append(inferred)
+            non_ref_del_seq.append(inferred)

     if non_ref_del_seq:
         return most_common(non_ref_del_seq)
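
The second hunk above removes the guard that skipped inferred sequences equal to the called deletion, so those now vote in the majority pick as well. A minimal sketch of the resulting tally, assuming most_common is a simple Counter-based majority vote (toy sequences, not rnaindel's actual helper):

    from collections import Counter

    def most_common(seqs):
        # majority vote over the inferred deletion sequences
        return Counter(seqs).most_common(1)[0][0]

    # hypothetical inferred sequences collected from non-target reads;
    # after this commit, entries equal to the called del_seq are appended too
    non_ref_del_seq = ["ACG", "ACG", "ACT"]
    print(most_common(non_ref_del_seq))  # -> ACG
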
12 changes: 8 additions & 4 deletions rnaindel/analysis/classifier.py
@@ -11,6 +11,7 @@

 warnings.filterwarnings("ignore")

+
 def classify(df, model_dir, num_of_processes):
     """ Makes prediction
     Args:
@@ -44,7 +45,7 @@ def calculate_proba(df, model_dir, num_of_processes):

     sni_features = feature_dict["single_nucleotide_indels"]
     mni_features = feature_dict["multi_nucleotide_indels"]
-
+
     # to keep the original row order
     df["order"] = df.index
     df_sni, df_mni = split_by_indel_size(df)
@@ -76,7 +77,7 @@ def calculate_proba(df, model_dir, num_of_processes):
         dfp_mni = pd.DataFrame(data=mni_proba)
         dfp_mni.columns = header
     else:
-        dfp_mni= pd.DataFrame(columns=header)
+        dfp_mni = pd.DataFrame(columns=header)

     df_mni = pd.concat([df_mni, dfp_mni], axis=1)

@@ -112,13 +113,16 @@ def make_feature_dict(model_dir):
         model_dir (str): path to data directory where "features.txt" is located.
     Returns:
         feature_dict (dict): {indel_class : [feture names]}
-    """
+    """
     features = os.path.join(model_dir, "features.txt")
     f = open(features)
-    feature_dict = {line.split("\t")[0]: line.rstrip().split("\t")[1].split(";") for line in f}
+    feature_dict = {
+        line.split("\t")[0]: line.rstrip().split("\t")[1].split(";") for line in f
+    }
     f.close()
     return feature_dict


+
 def predict(model, data, features):
     """ Calculate prediction probabaility
     Args:
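
For reference, the reformatted comprehension in make_feature_dict reads a tab-separated features.txt that maps an indel class to a semicolon-joined feature list. A self-contained sketch of that parse; the file contents below are illustrative, not the shipped ones:

    import io

    # hypothetical features.txt: <indel_class>\t<feature1;feature2;...>
    features_txt = (
        "single_nucleotide_indels\tindel_size;alt_count\n"
        "multi_nucleotide_indels\tindel_size;is_inframe\n"
    )

    f = io.StringIO(features_txt)
    feature_dict = {
        line.split("\t")[0]: line.rstrip().split("\t")[1].split(";") for line in f
    }
    f.close()
    print(feature_dict["single_nucleotide_indels"])  # -> ['indel_size', 'alt_count']
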
5 changes: 3 additions & 2 deletions rnaindel/analysis/coding_indel.py
@@ -1,5 +1,6 @@
 from .sequence_properties import exists_stop_codon

+
 def annotate_coding_info(indel, coding_gene_db):
     """Generate coding indel objects
@@ -27,9 +28,9 @@ def annotate_coding_info(indel, coding_gene_db):
     indel_type_db = 1 if indel_type == "I" else 0  # conversion for coding_gene_db style
     pos = pos + 1  # conversion for bambino style
     margin = 11  # allow a margin of 11bp for splice region
-
+
     chrom = chrom if chrom.startswith("chr") else "chr" + chrom
-
+
     try:
         candidate_genes = coding_gene_db.fetch(chrom, pos - margin, pos + margin)
     except:
1 change: 1 addition & 0 deletions rnaindel/analysis/outlier.py
@@ -6,6 +6,7 @@
 warnings.simplefilter("ignore")
 import pickle

+
 def outlier_analysis(df, model_dir):

     _df = df[df["is_rescurable_homopolymer"]].reset_index(drop=True)
40 changes: 23 additions & 17 deletions rnaindel/defaultcaller/defaultcaller.py
@@ -6,8 +6,9 @@

 CANONICALS = [str(i) for i in range(1, 23)] + ["X", "Y"]

+
 def callindel(bam, fasta, tmp_dir, heap_memory, region, num_of_processes):

     # Add Bambino home dir to CLASSPATH
     bambino_home = os.path.dirname(os.path.realpath(__file__))
     try:
@@ -29,40 +30,47 @@ def callindel(bam, fasta, tmp_dir, heap_memory, region, num_of_processes):
             heap_memory, fasta, bam
         )
     )

     if region:

         outfile = os.path.join(tmp_dir, "outfile.txt")

-        tmp = region.split(":")
+        tmp = region.split(":")
         chrom = tmp[0]
         tmp2 = tmp[1].split("-")
         start, stop = tmp2[0], tmp2[1]
-        cmd_str = base_cmd_str + " -chr {} -start {} -end {} -of {}".format(chrom, start, stop, outfile)
+
+        cmd_str = base_cmd_str + " -chr {} -start {} -end {} -of {}".format(
+            chrom, start, stop, outfile
+        )

         stdout, stderr, return_code = run_shell_command(cmd_str)

     else:
         if num_of_processes > 1:
-            cmds_by_chrom = [base_cmd_str + " -chr {} -of {}".format(chrom, os.path.join(tmp_dir, "chr{}.txt").format(chrom)) for chrom in CANONICALS]
+            cmds_by_chrom = [
+                base_cmd_str
+                + " -chr {} -of {}".format(
+                    chrom, os.path.join(tmp_dir, "chr{}.txt").format(chrom)
+                )
+                for chrom in CANONICALS
+            ]
             pool = Pool(num_of_processes)

             caller_returns = pool.map(run_shell_command, cmds_by_chrom)
         else:

             outfile = os.path.join(tmp_dir, "outfile.txt")

             cmd_str = base_cmd_str + " -of {}".format(outfile)
             stdout, stderr, return_code = run_shell_command(cmd_str)

-    #check_caller_return(stdout, stderr, return_code)
+    # check_caller_return(stdout, stderr, return_code)

-    #if return_code != 0 or not os.path.isfile(output_file):
+    # if return_code != 0 or not os.path.isfile(output_file):
     #    print("Failed while calling indels.", file=sys.stderr)
     #    print(stderr, file=sys.stderr)
     #    sys.exit(return_code)
-    #else:
+    # else:
     #    if os.stat(output_file).st_size == 0:
     #        print(
     #            "No variants called. Check if the input reference FASTA file is the same file used for mapping.",
@@ -93,8 +101,8 @@ def run_shell_command(command_string):
     return stdout, stderr, return_code


-#def check_caller_return(stdout, stderr, return_code):
-#
+# def check_caller_return(stdout, stderr, return_code):
+#
 #    if return_code != 0 :
 #        print("Failed while calling indels.", file=sys.stderr)
 #        print(stderr, file=sys.stderr)
@@ -111,9 +119,7 @@ def run_shell_command(command_string):
 #

-
-

 def check_file(file_path, file_name):
     if not os.path.isfile(file_path):
         sys.exit("Error: {} Not Found.".format(file_name))
     return file_path
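
The region branch reformatted above assumes a samtools-style chrom:start-stop string and feeds the pieces to the caller's -chr/-start/-end flags. A standalone sketch of that split, with a hypothetical region and a placeholder command stem:

    base_cmd_str = "java -Xmx6000m ..."  # stands in for the command built in callindel
    region = "chr7:55242415-55242513"  # hypothetical region string

    tmp = region.split(":")
    chrom = tmp[0]
    tmp2 = tmp[1].split("-")
    start, stop = tmp2[0], tmp2[1]

    cmd_str = base_cmd_str + " -chr {} -start {} -end {} -of {}".format(
        chrom, start, stop, "outfile.txt"
    )
    print(cmd_str)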

1 change: 1 addition & 0 deletions rnaindel/rnaindel.py
@@ -55,5 +55,6 @@ def Train(self):
     def CountOccurrence(self):
         oc.count()

+
 if __name__ == "__main__":
     sys.exit(main())
13 changes: 7 additions & 6 deletions rnaindel/training/__init__.py
@@ -1,7 +1,8 @@
 from .trainer import *
-#from .downsampler import *
-#from .feature_selector import *
-#from .parameter_tuner import *
-#from .model_updater import *
-#from .result_reporter import *
-#from .model_selection import *
+
+# from .downsampler import *
+# from .feature_selector import *
+# from .parameter_tuner import *
+# from .model_updater import *
+# from .result_reporter import *
+# from .model_selection import *
12 changes: 3 additions & 9 deletions rnaindel/training/downsampler.py
@@ -10,13 +10,7 @@


 def downsampler(
-    df,
-    k,
-    indel_class,
-    beta,
-    num_of_processes,
-    downsample_ratio,
-    max_features="auto",
+    df, k, indel_class, beta, num_of_processes, downsample_ratio, max_features="auto",
 ):
     """Optimize downsampling ratio optimizing F beta: somatic:germline:artifact = 1:1:x
@@ -41,13 +35,13 @@ def downsampler(

     # somatic:germline:artifact = 1:1:x for 1 <= x <= 20
     result_dict_lst = []
-
+
     search_range = (
         range(downsample_ratio, downsample_ratio + 1)
         if downsample_ratio
         else range(1, 21)
     )
-
+
     for x in search_range:
         # k-fold CrossValidation
         stats = perform_k_fold_cv(
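
The search_range expression above either pins the scan to a user-supplied artifact ratio or sweeps x = 1..20. A small standalone illustration of the same ternary, with made-up inputs:

    def make_search_range(downsample_ratio):
        # a given ratio yields a single-value range; a falsy one scans 1..20
        return (
            range(downsample_ratio, downsample_ratio + 1)
            if downsample_ratio
            else range(1, 21)
        )

    print(list(make_search_range(5)))     # -> [5]
    print(list(make_search_range(None)))  # -> [1, 2, ..., 20]
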
8 changes: 4 additions & 4 deletions rnaindel/training/model_updater.py
@@ -74,19 +74,19 @@ def update_coverage_info(df, indel_class, model_dir):
     fr = open(coveragefile, "r")
     data = [line.rstrip() for line in fr.readlines()]
     fr.close()
-
+
     fw = open(coveragefile, "w")
     for line in data:
         if line.startswith(indel_class):
             if indel_class == "s":
                 df = df[df["indel_size"] == 1]
             else:
                 df = df[df["indel_size"] > 1]
-            df["cov"] = df.apply(lambda x: (x["ref_count"] + x["alt_count"]), axis=1)
-            coverage_quantile = int(df["cov"].quantile(.9))
+            df["cov"] = df.apply(lambda x: (x["ref_count"] + x["alt_count"]), axis=1)
+            coverage_quantile = int(df["cov"].quantile(0.9))
             class_to_be_updated = line.split("\t")[0]
             newline = class_to_be_updated + "\t" + str(coverage_quantile)
             fw.write(newline + "\n")
         else:
             fw.write(line + "\n")
-    fw.close()
+    fw.close()
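
The rewritten lines compute per-variant coverage as ref_count + alt_count and store its 90th percentile (the quantile(.9) -> quantile(0.9) change is cosmetic). A pandas sketch on made-up counts:

    import pandas as pd

    # hypothetical training rows
    df = pd.DataFrame({"ref_count": [10, 40, 5, 80], "alt_count": [2, 10, 3, 20]})

    df["cov"] = df.apply(lambda x: (x["ref_count"] + x["alt_count"]), axis=1)
    coverage_quantile = int(df["cov"].quantile(0.9))
    print(coverage_quantile)  # int of the 90th-percentile coverage -> 85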
9 changes: 1 addition & 8 deletions rnaindel/training/parameter_tuner.py
@@ -10,14 +10,7 @@


 def tuner(
-    df,
-    k,
-    indel_class,
-    artifact_ratio,
-    features,
-    beta,
-    num_of_processes,
-    auto_param,
+    df, k, indel_class, artifact_ratio, features, beta, num_of_processes, auto_param,
 ):
     """Optimize the maximum number of features considered in sklearn random forest model
13 changes: 9 additions & 4 deletions rnaindel/training/trainer.py
@@ -12,6 +12,7 @@
 from .result_reporter import reporter
 from .homopolyer_trainer import train_homolopolymer

+
 def train():

     subcommand = "Train"
@@ -21,7 +22,7 @@ def train():
     data_dir = args.data_dir.rstrip("/")

     df = input_validator(args.training_data, indel_class)
-
+
     if indel_class in ["s", "m"]:

         # downsampling
@@ -60,7 +61,9 @@ def train():

     # update models
     model_dir = "{}/models".format(data_dir)
-    updater(df, args.indel_class, artifact_ratio, feature_lst, max_features, model_dir)
+    updater(
+        df, args.indel_class, artifact_ratio, feature_lst, max_features, model_dir
+    )

     # make report
     reporter(
@@ -86,8 +89,10 @@ def train():
             else "multi-nucleotide indels"
         )

-        print("rnaindel training for " + msg + " completed successfully.", file=sys.stdout)
-
+        print(
+            "rnaindel training for " + msg + " completed successfully.", file=sys.stdout
+        )
+
     else:
         model_dir = "{}/outliers".format(data_dir)
         train_homolopolymer(df, model_dir)
