[MRG] multigather using index as query (#1090)
* enable multigather from index

* add test for multigather with sbt query

* better test sbt usage with multigather, init lca testing

* upd for proper query, query-from-file usage

* add test for incorrect usage. better error msg would probably be good here

* use md5sum as filename if query.filename not properly set

* minor spacing etc

* add better error handling to load_file_list_of_signatures

* adjust test to work in py27

* fix errant decode

Co-authored-by: C. Titus Brown <[email protected]>
bluegenes and ctb authored Jul 14, 2020
1 parent 17123ba commit 8f5a55b
Showing 3 changed files with 341 additions and 94 deletions.
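The heart of the change is visible in the commands.py diff below: multigather now loads queries with load_file_as_signatures(), which yields every signature an input file holds, instead of load_query_signature(), which expects exactly one. That is what lets an SBT or LCA index act as the query collection. A minimal sketch of the new loading pattern, assuming sourmash 3.x; the file names and k-mer parameters here are illustrative, not taken from the PR:

    from sourmash import sourmash_args

    # illustrative inputs: a plain signature file plus an SBT index used as a query collection
    inp_files = ["one-genome.sig", "queries.sbt.zip"]

    # peek at one signature just to learn ksize/moltype before loading the search databases
    first_query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0],
                                                                  ksize=31,
                                                                  select_moltype='DNA')))

    # every signature in every input file then becomes its own gather query
    for queryfile in inp_files:
        for query in sourmash_args.load_file_as_signatures(queryfile,
                                                           ksize=31,
                                                           select_moltype='DNA'):
            print(query.name(), query.md5sum())

This is roughly what `sourmash multigather --query-from-file <list> --db <databases>` now does internally for each entry in the list.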
191 changes: 99 additions & 92 deletions sourmash/commands.py
@@ -721,10 +721,9 @@ def multigather(args):
         more_files = sourmash_args.load_file_list_of_signatures(args.query_from_file)
         inp_files.extend(more_files)

-    query = sourmash_args.load_query_signature(inp_files[0],
-                                               ksize=args.ksize,
-                                               select_moltype=moltype)
     # set up the search databases
+    # need a query to get ksize, moltype for db loading
+    query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0], ksize=args.ksize, select_moltype=moltype)))

     databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
                                                 args.traverse_directory)

@@ -733,108 +732,116 @@ def multigather(args):
         sys.exit(-1)

     # run gather on all the queries.
+    n=0
     for queryfile in inp_files:
-        # load the query signature & figure out all the things
-        query = sourmash_args.load_query_signature(queryfile,
-                                                   ksize=args.ksize,
-                                                   select_moltype=moltype)
-        notify('loaded query: {}... (k={}, {})', query.name()[:30],
-               query.minhash.ksize,
-               sourmash_args.get_moltype(query))
+        # load the query signature(s) & figure out all the things
+        for query in sourmash_args.load_file_as_signatures(queryfile,
+                                                           ksize=args.ksize,
+                                                           select_moltype=moltype):
+            notify('loaded query: {}... (k={}, {})', query.name()[:30],
+                   query.minhash.ksize,
+                   sourmash_args.get_moltype(query))

-        # verify signature was computed right.
-        if query.minhash.max_hash == 0:
-            error('query signature needs to be created with --scaled; skipping')
-            continue
+            # verify signature was computed right.
+            if query.minhash.max_hash == 0:
+                error('query signature needs to be created with --scaled; skipping')
+                continue

-        # downsample if requested
-        if args.scaled:
-            notify('downsampling query from scaled={} to {}',
-                   query.minhash.scaled, int(args.scaled))
-            query.minhash = query.minhash.downsample_scaled(args.scaled)
+            # downsample if requested
+            if args.scaled:
+                notify('downsampling query from scaled={} to {}',
+                       query.minhash.scaled, int(args.scaled))
+                query.minhash = query.minhash.downsample_scaled(args.scaled)

-        # empty?
-        if not len(query.minhash):
-            error('no query hashes!? skipping to next..')
-            continue
+            # empty?
+            if not len(query.minhash):
+                error('no query hashes!? skipping to next..')
+                continue

-        found = []
-        weighted_missed = 1
-        for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
-            if not len(found):   # first result? print header.
-                if query.minhash.track_abundance and not args.ignore_abundance:
-                    print_results("")
-                    print_results("overlap     p_query p_match avg_abund")
-                    print_results("---------   ------- ------- ---------")
-                else:
-                    print_results("")
-                    print_results("overlap     p_query p_match")
-                    print_results("---------   ------- -------")
+            found = []
+            weighted_missed = 1
+            for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
+                if not len(found):   # first result? print header.
+                    if query.minhash.track_abundance and not args.ignore_abundance:
+                        print_results("")
+                        print_results("overlap     p_query p_match avg_abund")
+                        print_results("---------   ------- ------- ---------")
+                    else:
+                        print_results("")
+                        print_results("overlap     p_query p_match")
+                        print_results("---------   ------- -------")


-            # print interim result & save in a list for later use
-            pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
-            pct_genome = '{:.1f}%'.format(result.f_match*100)
-            average_abund ='{:.1f}'.format(result.average_abund)
-            name = result.match._display_name(40)
+                # print interim result & save in a list for later use
+                pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
+                pct_genome = '{:.1f}%'.format(result.f_match*100)
+                average_abund ='{:.1f}'.format(result.average_abund)
+                name = result.match._display_name(40)

-            if query.minhash.track_abundance and not args.ignore_abundance:
-                print_results('{:9}   {:>7} {:>7} {:>9}    {}',
-                              format_bp(result.intersect_bp), pct_query, pct_genome,
-                              average_abund, name)
-            else:
-                print_results('{:9}   {:>7} {:>7}    {}',
-                              format_bp(result.intersect_bp), pct_query, pct_genome,
-                              name)
-            found.append(result)
+                if query.minhash.track_abundance and not args.ignore_abundance:
+                    print_results('{:9}   {:>7} {:>7} {:>9}    {}',
+                                  format_bp(result.intersect_bp), pct_query, pct_genome,
+                                  average_abund, name)
+                else:
+                    print_results('{:9}   {:>7} {:>7}    {}',
+                                  format_bp(result.intersect_bp), pct_query, pct_genome,
+                                  name)
+                found.append(result)


-        # basic reporting
-        print_results('\nfound {} matches total;', len(found))
+            # basic reporting
+            print_results('\nfound {} matches total;', len(found))

-        print_results('the recovered matches hit {:.1f}% of the query',
-                      (1 - weighted_missed) * 100)
-        print_results('')
+            print_results('the recovered matches hit {:.1f}% of the query',
+                          (1 - weighted_missed) * 100)
+            print_results('')

-        if not found:
-            notify('nothing found... skipping.')
-            continue
+            if not found:
+                notify('nothing found... skipping.')
+                continue

-        output_base = os.path.basename(queryfile)
-        output_csv = output_base + '.csv'
+            query_filename = query.filename
+            if not query_filename:
+                # use md5sum if query.filename not properly set
+                query_filename = query.md5sum()
+
+            output_base = os.path.basename(query_filename)
+            output_csv = output_base + '.csv'

-        fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
-                      'f_unique_to_query', 'f_unique_weighted',
-                      'average_abund', 'median_abund', 'std_abund', 'name',
-                      'filename', 'md5', 'f_match_orig']
-        with open(output_csv, 'wt') as fp:
-            w = csv.DictWriter(fp, fieldnames=fieldnames)
-            w.writeheader()
-            for result in found:
-                d = dict(result._asdict())
-                del d['match']      # actual signature not in CSV.
-                w.writerow(d)
+            fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
+                          'f_unique_to_query', 'f_unique_weighted',
+                          'average_abund', 'median_abund', 'std_abund', 'name',
+                          'filename', 'md5', 'f_match_orig']
+            with open(output_csv, 'wt') as fp:
+                w = csv.DictWriter(fp, fieldnames=fieldnames)
+                w.writeheader()
+                for result in found:
+                    d = dict(result._asdict())
+                    del d['match']      # actual signature not output to CSV!
+                    w.writerow(d)

-        output_matches = output_base + '.matches.sig'
-        with open(output_matches, 'wt') as fp:
-            outname = output_matches
-            notify('saving all matches to "{}"', outname)
-            sig.save_signatures([ r.match for r in found ], fp)
+            output_matches = output_base + '.matches.sig'
+            with open(output_matches, 'wt') as fp:
+                outname = output_matches
+                notify('saving all matches to "{}"', outname)
+                sig.save_signatures([ r.match for r in found ], fp)

-        output_unassigned = output_base + '.unassigned.sig'
-        with open(output_unassigned, 'wt') as fp:
-            if not found:
-                notify('nothing found - entire query signature unassigned.')
-            elif not len(query.minhash):
-                notify('no unassigned hashes! not saving.')
-            else:
-                notify('saving unassigned hashes to "{}"', output_unassigned)
+            output_unassigned = output_base + '.unassigned.sig'
+            with open(output_unassigned, 'wt') as fp:
+                if not found:
+                    notify('nothing found - entire query signature unassigned.')
+                elif not len(query.minhash):
+                    notify('no unassigned hashes! not saving.')
+                else:
+                    notify('saving unassigned hashes to "{}"', output_unassigned)

-                e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash)
-                e.add_many(next_query.minhash.get_mins())
-                sig.save_signatures([ sig.SourmashSignature(e) ], fp)
+                    e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash)
+                    e.add_many(next_query.minhash.get_mins())
+                    sig.save_signatures([ sig.SourmashSignature(e) ], fp)
+            n += 1

         # fini, next query!
+    notify('\nconducted gather searches on {} signatures', n)


 def watch(args):
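One subtlety in the rewritten loop above: signatures pulled out of an index often have an empty filename attribute, so the old os.path.basename(queryfile) naming would collide or come out empty once several queries share one input file. The per-query fallback added in the diff can be summarized by this small helper; it is a restatement of the inline logic for clarity, not a function that exists in sourmash:

    import os

    def per_query_output_base(query):
        """Choose a distinct output prefix for one gather query (sketch, not sourmash API)."""
        query_filename = query.filename
        if not query_filename:
            # use md5sum if query.filename is not properly set,
            # e.g. for signatures loaded from an SBT/LCA index
            query_filename = query.md5sum()
        return os.path.basename(query_filename)

    # multigather then writes <base>.csv, <base>.matches.sig and <base>.unassigned.sig per query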
9 changes: 7 additions & 2 deletions sourmash/sourmash_args.py
@@ -503,8 +503,13 @@ def load_file_as_signatures(filename, select_moltype=None, ksize=None,

 def load_file_list_of_signatures(filename):
     "Load a list-of-files text file."
-    with open(filename, 'rt') as fp:
-        file_list = [ x.rstrip('\r\n') for x in fp ]
+    try:
+        with open(filename, 'rt') as fp:
+            file_list = [ x.rstrip('\r\n') for x in fp ]
+    except OSError:
+        raise ValueError("cannot open file '{}'".format(filename))
+    except UnicodeDecodeError:
+        raise ValueError("cannot parse file '{}' as list of filenames".format(filename))

     return file_list
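The commit message mentions a test for incorrect usage of --query-from-file; the exception conversion above makes that straightforward to exercise. A hypothetical pytest sketch (not the PR's actual test, and assuming a UTF-8 locale so the binary file triggers the decode error):

    import pytest
    from sourmash import sourmash_args

    def test_load_file_list_of_signatures_bad_inputs(tmp_path):
        # nonexistent path: OSError is converted into ValueError
        with pytest.raises(ValueError):
            sourmash_args.load_file_list_of_signatures(str(tmp_path / "no-such-list.txt"))

        # binary file passed by mistake (e.g. an .sbt.zip): UnicodeDecodeError
        # is also converted into ValueError
        bogus = tmp_path / "not-a-list"
        bogus.write_bytes(b"\xff\xfe\x80\x81")
        with pytest.raises(ValueError):
            sourmash_args.load_file_list_of_signatures(str(bogus))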
