-
Notifications
You must be signed in to change notification settings - Fork 1
/
NLGenomeSweeper
executable file
·302 lines (267 loc) · 13.4 KB
/
NLGenomeSweeper
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
#!/usr/bin/env python3
import subprocess
import sys
import argparse
import os
import shutil
#import pandas as pd
description = """
NLGenomeSweeper
v1.2.4
Written by Nicholas Toda
Description: Searches a genome for NBS-LRR genes based on the presence
of the NB-ARC domain using the consensus sequence of the Pfam HMM profile
(PF00931) and can be used with custon domain profiles built for a species
of interest or related species for greater power.
Details on how to use the program can be found by running the command "NLGenomeSweeper --help"
For full details see: https://github.com/ntoda03/NLGenomeSweeper
Please report any problems to the github page.
"""
'''
Definition of global names for files and paths
'''
# Main output directory
nlg_folder = 'NLGenomeSweeper/'
# Folder created in output dir for candidate search results
search_folder = '01_candidate_identification/'
# subfolder within the search folder for searching proteins with hmmer
protein_search_folder = 'proteins_pfam_search/'
# subfolder within the search folder for searching proteins with a custom hmm profile
protein_custom_folder = 'proteins_custom_search/'
# subfolder within the search folder for searching the genome with blast
genome_search_folder = 'genome_search/'
# the name of the species specific consensus file generated when provided with protein file
species_consensus = 'customhmmerprofile.fa'
# Folder created in output dir for domain identification results
domain_folder = '02_domain_identification/'
# Location of the Pfam NB-ARC hmm profile and consensus within the program directory
nbarc_hmm = "data/pfam_NB-ARC.hmm"
nbarc_fa = "data/combined_consensus.fa"
profiler_file='data/Vitis_vinifera_NB-ARC_consensus.NLG_based.fa'
# Output file names
annotated_candidates = 'Annotated_candidates.bed'
unannotated_candidates = 'All_candidates.bed'
annotated_gff = 'All_candidates.gff3'
consensus_file ='Species_specific_consensus.fa'
filter_candidates ='Filtered_candidates.bed'
# We'll look to merge domains across introns smaller than this
max_intron_len = 1000
# Append the contents of an input file to an output file
def append_file(input_file, output_file):
infile = open(input_file, 'r')
appendFile = open(output_file,'a')
appendFile.write('\n')
appendFile.write(infile.read())
appendFile.close()
##########################################################################
#
# DEPRACATED - no protein search done, only the genome search is used
#
# identify_candidates_proteins
#
# Arguments:
# program_dir - directory of this program
# protein - protein file to search (fasta format)
# gff - gff3 annotation of the genome
# hmm - custom NB-ARC hmm profile to also use for search (or None)
# outdir - output directory
#
# Output:
# outdir/Annotated_candidates.bed - a bed file of identified candidates
#
# This function calls bash scripts to search a protein file in fasta
# format for NB-ARC domain containing proteins using hmmer. The search is
# always done using the pfam NB-ARC domain hmm profile and additionally
# a custom NB-ARC profile can be optimally used.
#
##########################################################################
def identify_candidates_proteins(program_dir, protein, gff, hmm, outdir):
# Searching using the pfam NB-ARC hmm profile
os.mkdir(outdir + search_folder + protein_search_folder)
subprocess.run('{}/scripts/hmmer_candidates.sh {} {} {} {} {}'.format(program_dir, protein, gff,
program_dir + nbarc_hmm,
outdir + search_folder + protein_search_folder, program_dir), shell=True, check=True)
shutil.copy(outdir + search_folder + protein_search_folder + annotated_candidates,
outdir + annotated_candidates)
## Also searching using a custom hmm profile
if hmm:
os.mkdir(outdir + search_folder + protein_custom_folder)
subprocess.run('{}/scripts/hmmer_candidates.sh {} {} {} {} {}'.format(program_dir,protein,gff,
hmm,outdir + search_folder + protein_custom_folder, program_dir), shell=True)
# Concatenate to the pfam search results
append_file(outdir + search_folder + protein_custom_folder + annotated_candidates,
outdir + annotated_candidates)
##########################################################################
#
# identify_candidates_genome
#
# Arguments:
# program_dir - directory of this program
# genome - genome file to search (fasta format)
# consensus - consensus protein sequence(s) of NB-ARC domain in fasta format
# outdir - output directory
# cores - number of cores to use
#
# Output:
# outdir/Unannotated_candidates.bed - a bed file of identified candidates
# This function calls bash scripts to search a genome file in fasta
# format for NB-ARC domain containing sequences using tblast. The search is
# always done using the pfam NB-ARC domain consensus sequence and additionally
# a custom NB-ARC consensus protein sequence can be used. The custom file
# can contain multiple sequences and it is strongly recommended to create
# multiple consensus sequence for each type of NBS-LRR gene (TNL, CNL, NL).
#
##########################################################################
def identify_candidates_genome(program_dir, genome, consensus, outdir, cores, intron_size):
os.mkdir(outdir + search_folder + genome_search_folder)
shutil.copy(program_dir + nbarc_fa,
outdir + search_folder + genome_search_folder + 'pfam_NB-ARC.fa')
shutil.copy(program_dir + profiler_file,
outdir + search_folder + genome_search_folder + 'profiler_file.fa')
# Also searching using custom consensus sequence(s), add to query file
if consensus:
append_file(consensus, outdir + search_folder + genome_search_folder + 'pfam_NB-ARC.fa')
# DEPRACATED: Protein search has been removed.
# Also searched a protein file so a species specific NB-ARC consensus sequence was generated
#if os.path.isfile(outdir + annotated_candidates + protein_search_folder + species_consensus):
# append_file(outdir + annotated_candidates + protein_search_folder + species_consensus,
# outdir + search_folder + genome_search_folder + 'pfam_NB-ARC.fa')
# Also searched a protein file using a custom hmm so a second species specific NB-ARC
# consensus sequence was generated
#if os.path.isfile(outdir + annotated_candidates + protein_custom_folder + species_consensus):
# append_file(outdir + annotated_candidates + protein_custom_folder + species_consensus,
# outdir + search_folder + genome_search_folder + 'pfam_NB-ARC.fa')
# Already ran protein based search, mask genome to avoid duplication
#if os.path.isfile(outdir + annotated_candidates):
# subprocess.run('bedtools maskfasta -fi {} -fo {} -bed {}'.format(
# genome, outdir + search_folder + genome_search_folder + 'genome.masked.fa',
# outdir + annotated_candidates), shell=True)
# genome = outdir + search_folder + genome_search_folder + 'genome.masked.fa'
# # Remember to also include the annotated candidates for extracting sequences
# shutil.copy(outdir + annotated_candidates,
# outdir + search_folder + genome_search_folder + 'candiates_sites.bed')
# Run the genome based search
subprocess.run('{}/scripts/genome_search.sh {} {} {} {} {}'.format(program_dir,genome,
outdir + search_folder + genome_search_folder, program_dir, cores, intron_size), shell=True, check=True)
#shutil.copy(outdir + search_folder + genome_search_folder + unannotated_candidates,
# outdir + unannotated_candidates)
shutil.copy(outdir + search_folder + genome_search_folder + consensus_file,
outdir + consensus_file)
shutil.copy(outdir + search_folder + genome_search_folder + 'Candidate_sites.with_flanking.fa',
outdir + domain_folder + 'Candidate_sites.with_flanking.fa')
shutil.copy(outdir + search_folder + genome_search_folder + unannotated_candidates,
outdir + domain_folder + unannotated_candidates)
##########################################################################
#
# run_interproscan
#
# Arguments:
# program_dir - directory of this program
# outdir - output directory
# cores - number of cores to use
#
# Output:
# outdir/All_candidates_regions.annotated.gff3 - domain annotation of candidate sites
# This function calls a bash script to search the identified NBS-LRR genes
# candidate sequences for ORFs and domains using interproscan. 10 kb of flanking
# sequence on both sides are also searched. This returns an annotation file
# in gff3 format that can be viewed directly in a genome browser that contains
# relevant information on the ORFs and domains at the candidate sites.
#
##########################################################################
def run_interproscan(program_dir, outdir, t):
subprocess.run('{}/scripts/region_interproscan.sh {} {} {}'.format(program_dir, outdir + domain_folder, t, program_dir),
shell=True, check=True)
# File management handled by script now
#shutil.copy(outdir + domain_folder + annotated_gff,
# outdir + annotated_gff)
#shutil.copy(outdir + domain_folder + unannotated_candidates,
# outdir + unannotated_candidates)
#shutil.copy(outdir + domain_folder + filter_candidates,
# outdir + filter_candidates)
##########################################################################
#
# main
#
##########################################################################
def main(argv):
# Find current and program directory
program_dir = os.path.dirname(os.path.realpath(__file__)).replace(' ', '\ ')
current_dir = os.getcwd()
# Parse command line arguments.
parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
required_group = parser.add_argument_group('Required arguments')
required_group.add_argument("-genome", metavar="<fasta file>", required=True, type=str,
help="A genome sequence in fasta format to search.")
input_group = parser.add_argument_group('Input arguments')
# DEPRACATED - no protein search is available
# input_group.add_argument("-protein", metavar="<fasta file>", type=str, default = None,
# help="Protein sequences in fasta format.")
# input_group.add_argument("-gff", metavar="<gff3 file>", type=str, default = None,
# help="gff3 annotation of the genome. Required if searching protein sequences.")
input_group.add_argument("-consensus", metavar="<fasta file>", type=str, default = None,
help="Also search the genome using a custom NB-ARC consensus sequence(s) (optional)." +
"It is not recommended to use this option since species specific consensuses will be created.")
# DEPRACATED - no protein search is available
# input_group.add_argument("-hmm", metavar="<file>", type=str, default = None,
# help="Also search the protein file using a custom NB-ARC HMM.")
output_group = parser.add_argument_group('Output arguments')
output_group.add_argument("-overwrite", metavar="[T/F]", default='F', type=str,
help="Whether to overwrite output files if they already exist. [Default F]")
output_group.add_argument("-outdir", metavar="<path>", default=current_dir, type=str,
help="Path where output directory NLGenomeSweeper will be created. [Default ./]")
run_group = parser.add_argument_group('Program control')
run_group.add_argument("-t", metavar="<threads>", type=int, default = 1,
help="The number of threads to use. [Default 1]")
# This gives the user control over the max intron size allowed. Not active for simplicity
#advanced_group = parser.add_argument_group('Advanced parameters')
#advanced_group.add_argument("-intron", metavar="<max intron size>", type=int, default = 1000,
# help="Maximum allowed size of intron within the NB-ARC domain. [Default 1000 bp]")
args = parser.parse_args()
# Error handling.
if not os.path.isfile(args.genome):
parser.print_help()
raise Exception( 'ERROR: genome file ' + args.genome + ' does not exist!')
# DEPRACATED - no protein search is available
# if args.protein and not os.path.isfile(args.protein):
# parser.print_help()
# raise Exception( 'ERROR: protein file ' + args.protein + ' does not exist!')
# if args.protein and not args.gff:
# parser.print_help()
# raise Exception( 'ERROR: gff file must be supplied when searching proteins!')
# if args.gff and not os.path.isfile(args.gff):
# parser.print_help()
# raise Exception( 'ERROR: gff file ' + args.gff + ' does not exist!')
if args.consensus and not os.path.isfile(args.consensus):
parser.print_help()
raise Exception( 'ERROR: consensus file ' + args.consensus + ' does not exist!')
# DEPRACATED - no protein search is available
# if args.hmm and not os.path.isfile(args.hmm):
# parser.print_help()
# raise Exception( 'ERROR: HMM file ' + args.hmm + ' does not exist!')
if not args.outdir.endswith('/'):
args.outdir += '/'
if not program_dir.endswith('/'):
program_dir += '/'
if os.path.exists(args.outdir + nlg_folder) and (args.overwrite == "F"):
parser.print_help()
raise Exception( 'ERROR: program output already exists and program is not set to overwrite!')
else:
if os.path.exists(args.outdir + nlg_folder ):
shutil.rmtree(args.outdir + nlg_folder )
os.mkdir(args.outdir + nlg_folder)
os.mkdir(args.outdir + nlg_folder + search_folder)
os.mkdir(args.outdir + nlg_folder + domain_folder)
print(description)
sys.stdout.flush()
# DEPRACATED - no protein search is available
# If a protein file was given to search
## if args.protein:
# Search protein file using hmmer
## identify_candidates_proteins(program_dir,args.protein, args.gff, args.hmm, args.outdir )
# Search the genome using blast
identify_candidates_genome(program_dir, args.genome, args.consensus, args.outdir + nlg_folder, args.t, max_intron_len)
# Define ORFs and domains using interproscan
run_interproscan(program_dir, args.outdir + nlg_folder, args.t)
if __name__ == "__main__":
main(sys.argv[1:])