Skip to content

Commit

Permalink
great improvements, no new functionality
Browse files Browse the repository at this point in the history
- identifies BLAST databases by calling blastdbcmd
 instead of looking for *.nhr and *.phr
- implemented benchmarking directly in run.rb
- better documentation of config files
  • Loading branch information
averissimo committed May 4, 2016
1 parent 0b65963 commit fb46491
Show file tree
Hide file tree
Showing 5 changed files with 236 additions and 121 deletions.
13 changes: 7 additions & 6 deletions config/config_blast.rb
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,14 @@ def process_config
# set existing dbs
logger.info("loads databases (from directory '#{@store.db.parent}'): ")
if @store.db.list.nil? || @store.db.list.empty?
@store.db.list = []
Dir[File.join(@store.db.parent, '*.nhr'),
File.join(@store.db.parent, '*.phr')].each do |filename|
next unless File.file? filename
no_ext = File.basename(filename, File.extname(filename))
@store.db.list << no_ext.gsub(/\.[0-9]+$/, '')
list_ary = []
Open3.popen3("blastdbcmd -list #{@store.db.parent}") do |_i, o, _e, _t|
o.each_line("\n") do |line|
pair = line.split(/ (Nucleotide|Protein)\n/)
list_ary << File.basename(pair[0]).gsub(/\.[0-9]+$/, '')
end
end
@store.db.list = list_ary.uniq
end
if @store.db.list.nil? || @store.db.list.empty?
msg = "No blast dbs found in #{@store.db.parent}."
Expand Down
46 changes: 34 additions & 12 deletions config/default.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,38 @@
#
#
# separate each transcriptome into an individual folder
# this will create many folders in the output directory
# each ending with the database name
separate_db: false
#
#
# number of threads to use
# only valid with separate_db option
use_threads: 1
#
#
# Identity thresholds used to filter results;
# values range between 0 and 1 (i.e. 0% and 100%)
identity:
# minimum range for identity
min: .40
# maximum range for identity
max: 1
#
#
# Filtering options
#
prune_identical:
# use_worst: will pick the result with highest identity if false
# and the lowest identity if true
use_worst: false
# first: the initial filtering step will remove all identical sseqid and
# pick the best/worst (depending on use_worst option)
first: sseqid
# list: list of subsequent filtering columns, works the same as first
list:
- nt_aligned_seq
#
# Configuration for finding longest ORF
# that requires:
# - start codon: sequence that indicates where ORF begins
Expand Down Expand Up @@ -33,18 +67,6 @@ prune_identical:
#
codon_table: 1
#
# output directory options:
# - dir: output directory's name
# - extension: extension for blast files
output:
dir: output
extension: .out
intermediate: intermediate
blast_results: blast_results
fastas: fasta_files
#
annotation_dir: "db_and_queries/annotation"
#
debug:
level: "info"
file: output/log.txt
Expand Down
104 changes: 2 additions & 102 deletions script.rb
Original file line number Diff line number Diff line change
@@ -1,103 +1,3 @@
require_relative 'src/blastn'
require_relative 'src/tblastn'
require_relative 'src/tblastx'
require_relative 'src/blastp'
require_relative 'src/download'

require 'configatron'
require_relative 'src/run.rb'
#
#
# Runs the BLAST pipeline for the configuration file named on the command
# line (ARGV[0]), falling back to 'user.yml' when no argument is given.
#
# When the 'separate_db' option is set, one temporary configuration file is
# written per database into a 'tmp' folder next to the configuration file and
# the pipeline is executed once per database; otherwise the configuration file
# is used as is for a single run.
#
# @return [Array] the list of processed items (database names, or [-1] for
#   a single run over the unmodified configuration)
# @raise [RuntimeError] when the configured engine is not recognized
def run_user_config
  # configuration
  config_path = File.expand_path(ARGV.empty? ? 'user.yml' : ARGV[0])
  config = YAML.load_file(config_path)
  config_parent = File.dirname(config_path)
  #
  if config['separate_db']
    list_db = Array(config['db']['list'])
    if list_db.empty?
      patterns = %w[*.nhr *.phr].map do |glob|
        File.expand_path(File.join(config['db']['parent'], glob),
                         config_parent)
      end
      found = Dir[*patterns].map do |item|
        no_ext = File.basename(item, File.extname(item))
        # multi-volume databases (name.00, name.01, ...) share one name
        no_ext.gsub(/\.[0-9]+$/, '')
      end
      list_db = found.uniq
    end
    # needs to make directories relative to tmp folder
    config['output']['dir'] = File.join('..', config['output']['dir'])
    config['db']['parent'] = File.join('..', config['db']['parent'])
    config['debug']['file'] = File.join('..', config['debug']['file'])
    config['query']['parent'] = File.join('..', config['query']['parent'])
    config['annotation_dir'] = File.join('..', config['annotation_dir'])
  else
    list_db = [-1]
  end
  #
  list_db.each do |item|
    single_run = item == -1
    new_config = nil
    begin
      if single_run
        # use the resolved path: ARGV[0] is nil when defaulting to user.yml
        new_config = config_path
      else
        # create a temporary folder named tmp that holds the
        # individual config files generated
        tmp_path = File.expand_path('tmp', config_parent)
        Dir.mkdir(tmp_path) unless Dir.exist? tmp_path
        # output folder will be named with database as suffix
        forced = config['force_folder']
        prefix = if forced.nil? || forced.strip == ''
                   Time.now.strftime('%Y_%m_%d-%H_%M_%S') +
                     '-' + srand.to_s[3..6]
                 else
                   forced
                 end
        output_folder = "#{prefix}-#{item}"
        # add .yml to config name
        new_config = File.join(tmp_path, output_folder + '.config.yml')
        # write a copy of the configuration restricted to a single db;
        # the shared config is left untouched so names never accumulate
        single_db = config.merge(
          'force_folder' => output_folder,
          'db' => config['db'].merge('list' => [item])
        )
        File.open(new_config, 'wb') { |fw| fw.write YAML.dump(single_db) }
      end
      #
      b = case config['engine']
          when 'tblastn' then TBlastn.new(new_config)
          when 'blastn' then Blastn.new new_config
          when 'tblastx' then TBlastx.new new_config
          when 'blastp' then Blastp.new new_config
          else
            fail "Cannot recognize engine: #{config['engine']}. Please check" \
                 ' documentation for implemented engines'
          end
      # download taxdb from ncbi
      ExternalData.download(b.store.db.parent, true)
      # blast folders
      b.blast_folders
      # generate report.csv
      b.gen_report_from_output
      # prune results
      b.prune_results
      #
      b.write_fasta
    ensure
      # remove temporary file, even when one of the steps above failed
      if !single_run && new_config && File.exist?(new_config)
        File.delete(new_config)
      end
    end
  end
end
#
run_user_config
run_user_config((ARGV.empty? ? 'user.yml' : ARGV[0]))
183 changes: 183 additions & 0 deletions src/run.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
require_relative 'blastn'
require_relative 'tblastn'
require_relative 'tblastx'
require_relative 'blastp'
require_relative 'download'

require 'configatron'
require 'benchmark'
#
#
# Runs the BLAST pipeline described by a YAML configuration file.
#
# Without the 'separate_db' option the configuration file is handed to
# run_blast unchanged. With 'separate_db', one temporary configuration file is
# written per database into a 'tmp' folder next to the configuration file, and
# the databases are processed by 'use_threads' worker threads (at least one).
# Errors raised while processing one item are printed and do not stop the
# remaining items.
#
# @param my_config [String] path to the YAML configuration file
# @param benchmark [Object, nil] forwarded untouched to run_blast
# @return [Array<Thread>] the worker threads, after all of them finished
def run_user_config(my_config, benchmark = nil)
  # configuration
  config_path = File.expand_path(my_config)
  config_parent = File.dirname(config_path)
  config = YAML.load_file(config_path) || {}
  # temporary folder that holds the individual config files generated
  tmp_path = File.expand_path('tmp', config_parent)
  #
  list_db = Queue.new
  #
  if config['separate_db']
    db_conf = (config['db'] ||= {})
    names = Array(db_conf['list'])
    if names.empty?
      # run command blastdbcmd to search for BLAST databases in directory
      names = blast_db_names(File.expand_path(db_conf['parent'],
                                              config_parent))
    end
    names.each { |name| list_db << name }
    # generated configs live in the tmp folder, so paths must not stay
    # relative to the original configuration file
    absolutize_config_paths(config, config_parent)
    # created once here, so worker threads never race on Dir.mkdir
    Dir.mkdir(tmp_path) unless names.empty? || Dir.exist?(tmp_path)
  else
    list_db << -1
  end
  #
  # if separate folders are used then use the same time for all
  base_time = Time.now.strftime('%Y_%m_%d-%H_%M_%S')
  # must be at least one thread
  thread_count = [config['use_threads'].to_i, 1].max
  config['use_threads'] = thread_count
  #
  workers = Array.new(thread_count) do
    Thread.new do
      loop do
        # a non-blocking pop cannot hang when another worker takes the
        # last item between an emptiness check and the pop
        begin
          item = list_db.pop(true)
        rescue ThreadError
          break
        end
        #
        single_run = item == -1
        new_config = nil
        begin
          new_config = if single_run
                         # the resolved path of my_config, not ARGV[0]
                         config_path
                       else
                         write_single_db_config(config, item,
                                                base_time, tmp_path)
                       end
          run_blast(new_config, config['engine'], benchmark)
        rescue StandardError => e
          puts e.to_s
        ensure
          # remove temporary file
          if !single_run && new_config && File.exist?(new_config)
            File.delete(new_config)
          end
        end
      end
    end
  end
  # wait for all threads to finish
  workers.each(&:join)
end

# Lists the BLAST database names that `blastdbcmd -list` reports for a folder.
# Volume suffixes such as '.00' are stripped and duplicates are removed.
def blast_db_names(db_parent)
  # argument-array form: no shell is involved, so paths with spaces or shell
  # metacharacters are passed through verbatim; stderr is drained as well
  stdout, = Open3.capture3('blastdbcmd', '-list', db_parent)
  names = stdout.each_line.map do |line|
    entry = line.chomp.sub(/ (Nucleotide|Protein)\z/, '')
    next if entry.strip.empty?
    File.basename(entry).sub(/\.[0-9]+\z/, '')
  end
  names.compact.uniq
end

# Rewrites the path options present in +config+ as absolute paths resolved
# against +base_dir+; options missing from the file are left alone.
def absolutize_config_paths(config, base_dir)
  [%w[output dir], %w[db parent],
   %w[debug file], %w[query parent]].each do |section, key|
    entry = config[section]
    next unless entry.is_a?(Hash) && entry[key]
    entry[key] = File.expand_path(entry[key], base_dir)
  end
  return unless config['annotation_dir']
  config['annotation_dir'] =
    File.expand_path(config['annotation_dir'], base_dir)
end

# Writes a copy of +config+ restricted to the single database +item+ into
# +tmp_path+ and returns the path of the file written. The copy is private to
# the caller, so concurrent workers never see each other's changes.
def write_single_db_config(config, item, base_time, tmp_path)
  db_config = Marshal.load(Marshal.dump(config))
  # output folder will be named with database as suffix
  forced = db_config['force_folder']
  prefix = if forced.to_s.strip.empty?
             "#{base_time}-#{format('%04d', Random.rand(10_000))}"
           else
             forced
           end
  output_folder = "#{prefix}_#{item}"
  # force only a single db
  (db_config['db'] ||= {})['list'] = [item]
  db_config['force_folder'] = output_folder
  # one log file per database when several threads are logging
  debug = db_config['debug']
  if db_config['use_threads'].to_i > 1 && debug.is_a?(Hash) && debug['file']
    debug['file'] = "#{debug['file']}.thread.#{item}"
  end
  # add .yml to config name
  path = File.join(tmp_path, "#{output_folder}.config.yml")
  File.open(path, 'wb') { |fw| fw.write YAML.dump(db_config) }
  path
end

# Builds the wrapper object for the requested BLAST engine from a
# configuration file and runs every pipeline step on it: blast, report
# generation, pruning and fasta writing.
#
# When +benchmark+ is given, the steps are timed with Benchmark.bm and the
# timings, together with database and query sizes, are logged to
# 'log.benchmark.txt' inside the output directory of the wrapper.
#
# @param new_config [String] path to the configuration file for this run
# @param engine [String] one of 'tblastn', 'blastn', 'tblastx' or 'blastp'
# @param benchmark [Integer, nil] nil for a normal run; otherwise passed as
#   the first argument of Benchmark.bm (the label width)
# @raise [RuntimeError] when +engine+ is not one of the implemented engines
def run_blast(new_config, engine, benchmark = nil)
  #
  b = case engine
      when 'tblastn' then TBlastn.new new_config
      when 'blastn' then Blastn.new new_config
      when 'tblastx' then TBlastx.new new_config
      when 'blastp' then Blastp.new new_config
      else
        # report the engine that was actually passed in
        fail "Cannot recognize engine: #{engine}. Please check" \
             ' documentation for implemented engines'
      end
  #
  # download taxdb from ncbi
  ExternalData.download(b.store.db.parent, true)
  # either run a normal run or with benchmarks
  if benchmark.nil?
    # blast folders
    b.blast_folders
    # generate report.csv
    b.gen_report_from_output
    # prune results
    b.prune_results
    #
    b.write_fasta
  else
    logger = Logger.new \
      "#{b.store.output.dir}/log.benchmark.txt"
    #
    logger.info 'Starting Benchmark'
    #
    bm = Benchmark.bm(benchmark, 'total:', 'average:') do |x|
      tb = x.report('blast:') { b.blast_folders } # blast folders
      #
      tp = x.report('proc.:') do
        b.gen_report_from_output # generate report.csv
        b.prune_results          # find redundant and unnecessary results
        b.write_fasta            # write fasta files
      end
      # extra rows reported under the 'total:' and 'average:' captions
      [tb + tp, (tb + tp) / 2]
    end
    db_info = b.db_information
    query_info = b.query_information
    db_bases = db_info.inject(0) { |sum, db_el| sum + db_el[:bases] }
    db_seqs = db_info.inject(0) { |sum, db_el| sum + db_el[:sequences] }
    logger.info "  bases in DBs: #{db_bases}"
    logger.info "  sequences in DBs: #{db_seqs}"
    logger.info "  bases in queries: #{query_info[:base]}"
    logger.info "  sequences in queries: #{query_info[:sequences]}"
    logger.info '          user     system      total        real'
    bm.each do |bm_el|
      logger.info "  #{bm_el.label} #{bm_el.format}".gsub(/\n|\r/, '')
    end
  end
end
11 changes: 10 additions & 1 deletion user.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,16 @@ engine: tblastn
# separate each transcriptome into an individual folder
# this will create many folders in the output directory
# each ending with the database name
separate_db: false
separate_db: true
#
#
# number of threads to use
# only valid with separate_db option
use_threads: 5
#
#
debug:
file: output/log.txt
#
#
# Opts are engine specific, the default in user.yml
Expand Down

0 comments on commit fb46491

Please sign in to comment.