From fb464917feba48d41b616b21678b630d49dbdada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Ver=C3=ADssimo?= Date: Wed, 4 May 2016 15:33:45 +0100 Subject: [PATCH] great improvements, no new functionality - identifies BLAST databases by calling blastdbcmd instead of looking for *.nhr and *.phr - implemented benchmarking directly in run.rb - better documentation of config files --- config/config_blast.rb | 13 +-- config/default.yml | 46 ++++++++--- script.rb | 104 +---------------------- src/run.rb | 183 +++++++++++++++++++++++++++++++++++++++++ user.yml | 11 ++- 5 files changed, 236 insertions(+), 121 deletions(-) create mode 100644 src/run.rb diff --git a/config/config_blast.rb b/config/config_blast.rb index e6ec47c..3948bf1 100644 --- a/config/config_blast.rb +++ b/config/config_blast.rb @@ -152,13 +152,14 @@ def process_config # set existing dbs logger.info("loads databases (from directory '#{@store.db.parent}'): ") if @store.db.list.nil? || @store.db.list.empty? - @store.db.list = [] - Dir[File.join(@store.db.parent, '*.nhr'), - File.join(@store.db.parent, '*.phr')].each do |filename| - next unless File.file? filename - no_ext = File.basename(filename, File.extname(filename)) - @store.db.list << no_ext.gsub(/\.[0-9]+$/, '') + list_ary = [] + Open3.popen3("blastdbcmd -list #{@store.db.parent}") do |_i, o, _e, _t| + o.each_line("\n") do |line| + pair = line.split(/ (Nucleotide|Protein)\n/) + list_ary << File.basename(pair[0]).gsub(/\.[0-9]+$/, '') + end end + @store.db.list = list_ary.uniq end if @store.db.list.nil? || @store.db.list.empty? msg = "No blast dbs found in #{@store.db.parent}." diff --git a/config/default.yml b/config/default.yml index 8e42a5a..a4259bf 100644 --- a/config/default.yml +++ b/config/default.yml @@ -1,4 +1,38 @@ # +# +# separate each transcriptome in a individual folder +# this will create many folders in the output directory +# each ending with the database name +separate_db: false +# +# +# number of threads to use +# only valid with separate_db option +use_threads: 1 +# +# +# Threshold that should filter out results by identity +# values can take between 0 and 1 (i.e. 0% and 100%) +identity: + # minimum range for identity + min: .40 + # maximum range for identity + max: 1 +# +# +# Filtering options +# +prune_identical: + # use_worst: will pick the result with highest identity if false + # and the lowest identity if true + use_worst: false + # first: the initial filtering step will remove all identical sseqid and + # pick the best/worst (depending on use_worst option) + first: sseqid + # list: list of subsequent filtering columns, works the same as first + list: + - nt_aligned_seq +# # Configuration for finding longest ORF # that requires: # - start codon: sequence that indicates where ORF begins @@ -33,18 +67,6 @@ prune_identical: # codon_table: 1 # -# output directory options: -# - dir: output directory's name -# - ext: extension for blast files -output: - dir: output - extension: .out - intermediate: intermediate - blast_results: blast_results - fastas: fasta_files -# -annotation_dir: "db_and_queries/annotation" -# debug: level: "info" file: output/log.txt diff --git a/script.rb b/script.rb index a4c5ab5..904cfb6 100644 --- a/script.rb +++ b/script.rb @@ -1,103 +1,3 @@ -require_relative 'src/blastn' -require_relative 'src/tblastn' -require_relative 'src/tblastx' -require_relative 'src/blastp' -require_relative 'src/download' - -require 'configatron' +require_relative 'src/run.rb' # -# -def run_user_config - # configuration - config_path = File.expand_path((ARGV.empty? ? 'user.yml' : ARGV[0])) - config = YAML.load_file(config_path) - config_parent = File.dirname(config_path) - # - b = nil - # - if config['separate_db'] - if config['db']['list'].nil? || config['db']['list'].empty? - list_db = [] - # - Dir[File.expand_path(File.join(config['db']['parent'], '*.nhr'), - config_parent), - File.expand_path(File.join(config['db']['parent'], '*.phr'), - config_parent)].each do |item| - no_ext = File.basename(item, File.extname(item)) - list_db << no_ext.gsub(/\.[0-9]+$/, '') - end - else - list_db = config['db']['list'] - end - # needs to make directories relative to tmp folder - config['output']['dir'] = File.join('..', config['output']['dir']) - config['db']['parent'] = File.join('..', config['db']['parent']) - config['debug']['file'] = File.join('..', config['debug']['file']) - config['query']['parent'] = File.join('..', config['query']['parent']) - config['annotation_dir'] = File.join('..', config['annotation_dir']) - else - list_db = [-1] - end - # - # - list_db.each do |item| - - if item == -1 - new_config = ARGV[0] - else - # create a temporary older named tmp that holds the - # individual config files generated - tmp_path = File.expand_path('tmp', config_parent) - Dir.mkdir(tmp_path) unless Dir.exist? tmp_path - # output folder will be named with database as suffix - if config['force_folder'].nil? || config['force_folder'].strip == '' - output_folder = Time.now.strftime('%Y_%m_%d-%H_%M_%S') + - '-' + srand.to_s[3..6] - else - output_folder = config['force_folder'] - end - # keep original to reset it, otherwise it will concatenate all - output_folder_original = config['force_folder'] - # set output folder for this db - output_folder += '-' + item - # add .yml to config name - new_config = File.join(tmp_path, output_folder + '.config.yml') - # write change configuration to file, forcing only a single db - File.open(new_config, 'wb') do |fw| - config['db']['list'] = [item] - config['force_folder'] = output_folder - fw.write YAML.dump(config) - end - # reset name of folder to original - config['force_folder'] = output_folder_original - end - # - case config['engine'] - when 'tblastn' - b = TBlastn.new(new_config) - when 'blastn' - b = Blastn.new new_config - when 'tblastx' - b = TBlastx.new new_config - when 'blastp' - b = Blastp.new new_config - else - fail "Cannot recognize engine: #{config['engine']}. Please check" \ - ' documentation for implemented engines' - end - # download taxdb from ncbi - ExternalData.download(b.store.db.parent, TRUE) - # blast folders - b.blast_folders - # generate report.csv - b.gen_report_from_output - # prune results - b.prune_results - # - b.write_fasta - # remove temporary file - File.delete(new_config) unless item == -1 - end -end -# -run_user_config +run_user_config((ARGV.empty? ? 'user.yml' : ARGV[0])) diff --git a/src/run.rb b/src/run.rb new file mode 100644 index 0000000..6ba2bb3 --- /dev/null +++ b/src/run.rb @@ -0,0 +1,183 @@ +require_relative 'blastn' +require_relative 'tblastn' +require_relative 'tblastx' +require_relative 'blastp' +require_relative 'download' + +require 'configatron' +require 'benchmark' +# +# +def run_user_config(my_config, benchmark = nil) + # configuration + config_path = File.expand_path(my_config) + config_parent = File.dirname(config_path) + config = YAML.load_file(config_path) + # + list_ary = [] + list_db = Queue.new + # + if config['separate_db'] + if config['db']['list'].nil? || config['db']['list'].empty? + # + db_parent = File.expand_path(config['db']['parent'], config_parent) + # + # run command blsatdbcmd to search for BLAST databases + # in directory + Open3.popen3("blastdbcmd -list #{db_parent}") do |_i, o, _e, _t| + o.each_line("\n") do |line| + pair = line.split(/ (Nucleotide|Protein)\n/) + list_ary << File.basename(pair[0]).gsub(/\.[0-9]+$/, '') + end + end + list_ary = list_ary.uniq + else + config['db']['list'].each do |el| + list_ary << el + end + end + list_ary.each do |el| + list_db << el + end + # needs to make directories relative to tmp folder + relative_dir = proc do |path| + File.expand_path(File.join(path), config_parent) + end + config['output']['dir'] = relative_dir.call(config['output']['dir']) + config['db']['parent'] = relative_dir.call(config['db']['parent']) + config['debug']['file'] = relative_dir.call(config['debug']['file']) + config['query']['parent'] = relative_dir.call(config['query']['parent']) + config['annotation_dir'] = relative_dir.call(config['annotation_dir']) + else + list_db << -1 + end + # + # if separte folder then use same time for all + base_time = Time.now.strftime('%Y_%m_%d-%H_%M_%S') + # array to store threads id + threads = [] + # must be at least one thread + config['use_threads'] = 1 \ + if config['use_threads'].nil? || config['use_threads'] < 1 + # + config['use_threads'].times do + threads << Thread.new do + loop do + # stop if list_db is empty + Thread.exit if list_db.empty? + # + item = list_db.pop + # + if item == -1 + new_config = ARGV[0] + else + # create a temporary older named tmp that holds the + # individual config files generated + tmp_path = File.expand_path('tmp', config_parent) + Dir.mkdir(tmp_path) unless Dir.exist? tmp_path + # output folder will be named with database as suffix + if config['force_folder'].nil? || config['force_folder'].strip == '' + output_folder = base_time + + '-' + srand.to_s[3..6] + else + output_folder = config['force_folder'] + end + # keep original to reset it, otherwise it will concatenate all + output_folder_original = config['force_folder'] + debug_file_original = config['debug']['file'] + # set output folder for this db + output_folder += '_' + item + # add .yml to config name + new_config = File.join tmp_path, "#{output_folder}.config.yml" + # write change configuration to file, forcing only a single db + File.open(new_config, 'wb') do |fw| + config['db']['list'] = [item] + config['force_folder'] = output_folder + + if config['use_threads'] > 1 + config['debug']['file'] = \ + debug_file_original + '.thread.' + item + end + fw.write YAML.dump(config) + end + # reset name of folder to original + config['debug']['file'] = debug_file_original + config['force_folder'] = output_folder_original + end + # + begin + run_blast(new_config, config['engine'], benchmark) + rescue StandardError => e + puts e.to_s + end + # remove temporary file + File.delete(new_config) unless item == -1 + end + end + end + # wait for all threads to finish + threads.map(&:join) +end + +def run_blast(new_config, engine, benchmark = nil) + # + case engine + when 'tblastn' + b = TBlastn.new new_config + when 'blastn' + b = Blastn.new new_config + when 'tblastx' + b = TBlastx.new new_config + when 'blastp' + b = Blastp.new new_config + else + fail "Cannot recognize engine: #{config['engine']}. Please check" \ + ' documentation for implemented engines' + end + # + # download taxdb from ncbi + ExternalData.download(b.store.db.parent, TRUE) + # either run a normal run or with benchmarks + if benchmark.nil? + # blast folders + b.blast_folders + # generate report.csv + b.gen_report_from_output + # prune results + b.prune_results + # + b.write_fasta + else + logger = Logger.new \ + "#{b.store.output.dir}/log.benchmark.txt" + # + logger.info 'Starting Benchmark' + # + bm = Benchmark.bm(benchmark, 'total:', 'average:') do |x| + tb = x.report('blast:') { b.blast_folders } # blast folders + # + tp = x.report('proc.:') do + b.gen_report_from_output # generate report.csv + b.prune_results # find redundand and unecessary results + b.write_fasta # write fasta files + end + [tb + tp, (tb + tp) / 2] + end + db_info = b.db_information + query_info = b.query_information + db_bases = 0 + db_seqs = 0 + db_info.each do |db_el| + db_bases += db_el[:bases] + db_seqs += db_el[:sequences] + end + logger.info " bases in DBs: #{db_bases}" + logger.info " sequences in DBs: #{db_seqs}" + logger.info " bases in queries: #{query_info[:base]}" + logger.info " sequences in queries: #{query_info[:sequences]}" + logger.info ' user system total real' + bm.each do |bm_el| + logger.info " #{bm_el.label} #{bm_el.format}".gsub(/\n|\r/, '') + end + end +end diff --git a/user.yml b/user.yml index 7ceeed4..3631714 100644 --- a/user.yml +++ b/user.yml @@ -11,7 +11,16 @@ engine: tblastn # separate each transcriptome in a individual folder # this will create many folders in the output directory # each ending with the database name -separate_db: false +separate_db: true +# +# +# number of threads to use +# only valid with separate_db option +use_threads: 5 +# +# +debug: + file: output/log.txt # # # Opts are engine specific, the default in user.yml