-
Notifications
You must be signed in to change notification settings - Fork 0
/
raxml_for_each_partition_multithreading.py
149 lines (106 loc) · 4.58 KB
/
raxml_for_each_partition_multithreading.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# raxml_for_each_partition_multithreading.py
# Mathias Scharmann
# 2018-02-21
"""
Input:
- aligment in phylip format
- raxml partition format file
Output:
- a directory with trees estimated separately for each partition!
module load gdc
module load python/2.7
PATH=$PATH:/cluster/project/gdc/people/schamath/tools
"""
import sys, os
import subprocess
###
def is_non_zero_file(fpath):
return os.path.isfile(fpath) and os.path.getsize(fpath) > 0
def read_phylip(INFILE):
indict = {}
with open(INFILE, "r") as infile:
infile.readline() # remove header
for line in infile:
fields = line.strip("\n").split( )
indict[fields[0]] = fields[1]
return indict
def write_phylip_clean (cl_dict, outfile):
with open(outfile, "w") as OUTFILE:
ntaxa = len(cl_dict.keys())
len_align = len(cl_dict[cl_dict.keys()[0]]) # gets value of first element in dictionary -> this OK since all seqs have same length
header = str(ntaxa) + " " + str(len_align) + "\n"
OUTFILE.write(header)
for sample, seq in cl_dict.items():
out_line = sample + " " + seq + "\n"
OUTFILE.write(out_line)
OUTFILE.close()
def split_alignment_and_do_raxml ( alignment_file, partitions_file, outdir, parallel_processes ):
dryrun = False
master_aln = read_phylip( alignment_file )
partitions_dict = {}
with open(partitions_file, "r") as INFILE:
for line in INFILE:
name = line.split(",")[1].split("=")[0]
start_idx = int(line.split("=")[1].split("-")[0]) -1 # zero-based offset for python idxes
end_idx = int(line.split("=")[1].split("-")[1] ) #+ 1 # string/list slicing in python: right end idx NOT included in slice
partitions_dict[name] = [start_idx, end_idx]
cnt = 0
processes = set()
max_processes = parallel_processes
num_cores_per_process = num_cores = 2
for name, idxes in partitions_dict.items():
cnt += 1
print cnt
outdict = {tax: master_aln[tax][partitions_dict[name][0] : partitions_dict[name][1]] for tax in master_aln.keys() }
# drop taxa that have only undetermined characters "-", otherwise RAxML will complain
clean_dict = {k:v for k,v in outdict.items() if set(v) != set("-") }
# sometimes, alignment is slightly incorrect and contains ONLY missing data for a locus; catch these errors and continue. Also, gene trees aren't informative if there is not at least 4 tips on them.
if len(clean_dict.keys()) >= 4:
if not dryrun:
# also, don't run again if there is already a treefile with content in it:
if not is_non_zero_file("./" + outdir + "/" + name + ".raxml.tre"):
write_phylip_clean (clean_dict , outdir + "/" + name + ".phy")
raxml_run_name = "run_" + str(cnt) + "_"
cmd = " ".join ( ["cd", outdir,";", "raxml","-T",str(num_cores),"-p","12345","-s",\
name + ".phy" ,"-n", raxml_run_name ,"-m GTRCAT", "--silent" , "--no-bfgs", ";",\
"rm", name + ".phy", "RAxML_parsimonyTree." + raxml_run_name,\
"RAxML_result." + raxml_run_name,\
"RAxML_log." + raxml_run_name,\
"RAxML_info." + raxml_run_name,\
name + ".phy.reduced",\
";", "mv", "RAxML_bestTree." + raxml_run_name, name + ".raxml.tre"] )
# print cmd
processes.add(subprocess.Popen([cmd, raxml_run_name], shell = True))
if len(processes) >= max_processes:
os.wait()
processes.difference_update([ p for p in processes if p.poll() is not None])
else: # report which partitions could not be used to make a gene tree:
with open("failing_partitions_report.txt", "a") as FPR:
FPR.write(name + "\t" + "less_than_4_taxa\n")
def report_outcome (partitions_file, outdir):
partitions = set([])
with open(partitions_file, "r") as INFILE:
for line in INFILE:
name = line.split(",")[1].split("=")[0]
partitions.add(name)
outdir_trees = [x.strip(".raxml.tre") for x in os.listdir(outdir) if x.endswith(".raxml.tre")]
missing_trees = partitions.difference(outdir_trees)
with open("failing_partitions_report.txt", "a") as FPR:
FPR.write("\n".join(sorted(missing_trees)) + "\n")
########## main
if len(sys.argv) != 5:
print """usage: python raxml_for_each_partition_multithreading.py alignment_file partitions_file outdir total_cores\n
RAxML will be called on 2 cores for each parallel job"""
exit()
alignment_file = sys.argv[1]
partitions_file = sys.argv[2]
outdir = sys.argv[3]
try:
os.mkdir( outdir )
except OSError:
None
total_cores = int( sys.argv[4] )
num_cores_per_process = 2
parallel_processes = total_cores / num_cores_per_process
split_alignment_and_do_raxml ( alignment_file, partitions_file, outdir, parallel_processes )
report_outcome (partitions_file, outdir)