forked from madeleinevlt/Fold_U
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fold_u
executable file
·183 lines (154 loc) · 7.55 KB
/
fold_u
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This program is the second step (downstream) of a protein structure prediction project.
This step consists of threading a query sequence on different given templates.
Our project is part of the Meet-U 2018-2019 competition. Meet-U is a collaborative pedagogical
and research initiative between several Universities of Paris area. The course is intended to
Master students (2nd year) in Bioinformatics. For more details, please refer to :
http://www.meet-u.org/
Usage:
./fold_u QUERY_FASTA UNIREF_DB [--nb_pdb NUM] [--nb_psiblast NUM] [--cpu NUM] [--dope FILE]
[--metafold FILE] [--benchmark FILE] [--output PATH]
Arguments:
QUERY_FASTA Path to the QUERY fasta sequence to be predicted.
UNIREF_DB Path to Uniref database.
Options:
-h, --help Show this
-p NUM, --nb_pdb NUM Number of pdb to create
[default: 10]
-t NUM, --nb_psiblast Round number for PSI-BLAST
[default: 3]
-c NUM, --cpu NUM Number of cpus to use for parallelisation. By default
using all available (0).
[default: 0]
-d FILE, --dope FILE Path to the dope.par file
[default: data/dope.par]
-m FILE, --metafold FILE Path to the metafold.list file
[default: data/metafold.list]
-b FILE, --benchmark FILE Path to the benchmark.list file
[default: data/benchmark.list]
-o PATH, --output PATH Path to the directory containing
the result files (scores and pdb)
[default: ./results]
"""
__authors__ = "Gabriel Cretin, Hélène Kabbech, Franz-Arnold Ake, Tom Gutman and Flora Mikaeloff"
# Standard-library modules
import os
import subprocess
import sys
from datetime import datetime
from functools import partial
from multiprocessing import Pool, cpu_count
# Third-party modules
from docopt import docopt
from schema import Schema, And, Use, SchemaError
from tqdm import tqdm
# Local modules
import src.parsing as parsing
from src.alignment import process, clean_modeller_outputs
from src.score import Score
DIST_RANGE = [5, 15]
def check_args(arguments=None):
    """
    Check and validate the types of inputs parsed by docopt from the command line.

    Args:
        arguments (dict, optional): docopt argument dictionary to validate.
            Defaults to the module-level ARGUMENTS built in the __main__ block,
            preserving the original no-argument call.

    Raises:
        SystemExit: If any argument fails schema validation; exits with the
            schema error message.
    """
    if arguments is None:
        # Backward-compatible fallback to the global created by docopt.
        arguments = ARGUMENTS
    schema = Schema({
        'QUERY_FASTA': Use(open, error='QUERY_FASTA should be readable'),
        '--nb_pdb': And(Use(int), lambda n: 1 <= n <= 100,
                        error='--nb_pdb=NUM should be integer 1 <= N <= 100'),
        '--nb_psiblast': And(Use(int), lambda n: 1 <= n <= 10,
                             error='--nb_psiblast should be integer 1 <= N <= 10'),
        # 0 is a sentinel meaning "use all available cpus" (handled in __main__),
        # hence the lower bound of 0 rather than 1.
        '--cpu': And(Use(int), lambda n: 0 <= n <= cpu_count(),
                     error='--cpu=NUM should be integer 0 <= N <= ' + str(cpu_count())),
        '--dope': Use(open, error='dope file should be readable'),
        '--metafold': Use(open, error='METAFOLD_FILE should be readable'),
        '--benchmark': Use(open, error='BENCHMARK_FILE should be readable'),
        # The output PATH is created (if it does not exist) at the end of the
        # program, so it is not validated here.
        object: object})
    try:
        schema.validate(arguments)
    except SchemaError as err:
        # Invalid CLI input is unrecoverable: print the schema error and stop.
        sys.exit(err)
if __name__ == "__main__":
    START_TIME = datetime.now()
    ### Parse command line
    ######################
    ARGUMENTS = docopt(__doc__, version='fold_u 3.0')
    # Check the types and ranges of the command line arguments parsed by docopt
    check_args()
    # Path to the FASTA sequence of the Query
    FASTA = ARGUMENTS["QUERY_FASTA"]
    # Query name: the file name without directory nor extension
    QUERY_NAME = os.path.splitext(os.path.basename(FASTA))[0]
    # Path to the query folder: the full path without the extension.
    # os.path.splitext strips only the final extension, so a path such as
    # "./data/query.fasta" is handled correctly — the previous
    # FASTA.split(".")[0] returned "" for any path containing an earlier dot.
    QUERY_PATH = os.path.splitext(FASTA)[0]
    # Uniref database
    UNIREF = ARGUMENTS["UNIREF_DB"]
    # Create the first n pdb templates
    NB_PDB = int(ARGUMENTS["--nb_pdb"])
    # Round number for PSI-BLAST (cast to int for consistency with NB_PDB)
    NB_PSIBLAST = int(ARGUMENTS["--nb_psiblast"])
    # Number of cpus for parallelisation; 0 means "use all available"
    NB_PROC = cpu_count() if int(ARGUMENTS["--cpu"]) == 0 else int(ARGUMENTS["--cpu"])
    # METAFOLD file
    METAFOLD_FILE = ARGUMENTS["--metafold"]
    # DOPE file
    DOPE_FILE = ARGUMENTS["--dope"]
    # BENCHMARK file
    BENCHMARK_FILE = ARGUMENTS["--benchmark"]
    # OUTPUT file
    OUTPUT_PATH = ARGUMENTS["--output"]

    ### Run Upstream step from SALUT program (Team 1)
    ################################################
    # profile-profile comparison of a query sequence and template with known structures.
    # check=True makes a failed upstream step abort immediately instead of
    # letting the later parsing steps crash on missing output files.
    print("Run Upstream step (SALUT program): profile-profile comparison :")
    subprocess.run(["./bin/salut_1.0/salut2.sh", FASTA, str(NB_PSIBLAST), UNIREF],
                   check=True)
    # Output files generated by the upstream step
    FOLDREC_FILE = QUERY_PATH + ".foldrec"
    ALN_FILE = QUERY_PATH + ".mfasta"

    ### Run CCMpred
    ###############
    print("Run CCMpred")
    subprocess.run(["./scripts/run_ccmpred.py", ALN_FILE], check=True)
    # Output files generated by CCMpred
    CLUSTAL_FILE = QUERY_PATH + ".clustal"
    CCMPRED_FILE = QUERY_PATH + ".mat"

    ### Parse data files
    ####################
    # Parse Metafold file
    print("Parsing metafold (" + METAFOLD_FILE + ")")
    METAFOLD_DICT = parsing.parse_metafold(METAFOLD_FILE)
    # Parse Foldrec file
    print("Parsing foldrec (" + FOLDREC_FILE + ")")
    ALIGNMENT_DICT = parsing.parse_foldrec(FOLDREC_FILE, METAFOLD_DICT)
    # Set the benchmark of the query
    print("Parsing benchmark.list (" + BENCHMARK_FILE + ")")
    parsing.parse_benchmark(QUERY_NAME, BENCHMARK_FILE, ALIGNMENT_DICT)
    # Parse DOPE file
    print("Parsing DOPE (" + DOPE_FILE + ")")
    DOPE_DICT = parsing.parse_dope(DOPE_FILE)
    # Parse CCMPRED result file
    print("Parsing CCMPRED result file (" + CCMPRED_FILE + ")")
    INDEX_LIST = parsing.get_index_list(CLUSTAL_FILE)
    TOP_COUPLINGS_DICT = parsing.parse_ccmpred_result(CCMPRED_FILE, INDEX_LIST)

    ### Threading & Scores calculations
    ###################################
    # Parallelization of the main loop
    print("\n\n" + str(cpu_count()) + " cpus detected, using " + str(NB_PROC))
    print("Run Downstream step")
    print("Processing alignments ...\n")
    with Pool(processes=NB_PROC) as pool:
        FUNC = partial(process, DIST_RANGE, DOPE_DICT, OUTPUT_PATH, INDEX_LIST,
                       TOP_COUPLINGS_DICT)
        # tqdm module enables an ETA progress bar for each alignment processed.
        # imap_unordered can smooth things out by yielding faster-calculated
        # values ahead of slower-calculated values.
        RESULTS = Score([res_ali for res_ali in tqdm(pool.imap_unordered(FUNC,
                                                                         ALIGNMENT_DICT.values()),
                                                     total=len(ALIGNMENT_DICT))])
    # remove useless files generated by MODELLER
    clean_modeller_outputs(OUTPUT_PATH + "/modeller/")

    ### Results : Score and top N PDB files
    #######################################
    RESULTS.write_score(OUTPUT_PATH, NB_PDB, ALIGNMENT_DICT)
    print("\nThe program ended successfully !\nThe results are stored in " + OUTPUT_PATH)
    # datetime subtraction yields a timedelta ("H:MM:SS"), not a number of
    # seconds, so the message no longer claims a unit.
    print("\nTotal runtime: {}".format(datetime.now() - START_TIME))