Skip to content

Commit

Permalink
[MRG] fix two bugs in gather --output-unassigned (#1156) (#1160)
Browse files Browse the repository at this point in the history
* add test for no unassigned hashes to output with gather

* add test for gather --output-unassigned with protein query

* fix two bugs in gather --output-unassigned

* Update tests/test_sourmash.py

Co-authored-by: Olga Botvinnik <[email protected]>

Co-authored-by: Olga Botvinnik <[email protected]>

Co-authored-by: Olga Botvinnik <[email protected]>
  • Loading branch information
ctb and olgabot authored Aug 11, 2020
1 parent 5092774 commit 111b46e
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 12 deletions.
15 changes: 3 additions & 12 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,22 +694,13 @@ def gather(args):
sig.save_signatures([ r.match for r in found ], fp)

if args.output_unassigned:
if not len(query.minhash):
notify('no unassigned hashes! not saving.')
if not len(next_query.minhash):
notify('no unassigned hashes to save with --output-unassigned!')
else:
notify('saving unassigned hashes to "{}"', args.output_unassigned)

with_abundance = next_query.minhash.track_abundance
e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash,
track_abundance=with_abundance)
if with_abundance:
abunds = next_query.minhash.hashes
e.set_abundances(abunds)
else:
e.add_many(next_query.minhash.hashes)

with FileOutput(args.output_unassigned, 'wt') as fp:
sig.save_signatures([ sig.SourmashSignature(e) ], fp)
sig.save_signatures([ next_query ], fp)


def multigather(args):
Expand Down
51 changes: 51 additions & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3216,6 +3216,35 @@ def test_gather_metagenome_output_unassigned():
'NC_011294.1' in out))


def test_gather_metagenome_output_unassigned_none():
    """Check `gather --output-unassigned` when nothing is left unassigned.

    Runs gather for the combined signature against every GCF_* signature
    with a threshold of 0, then verifies that no unassigned-hashes file
    was written and that the explanatory notice is present in the `err`
    text returned by the script runner.
    """
    with utils.TempDirectory() as location:
        query_sig = utils.get_test_data('gather/combined.sig')
        testdata_sigs = glob.glob(utils.get_test_data('gather/GCF_*.sig'))

        # Assemble the full command line up front; it is split on single
        # spaces when handed to the script runner.
        cmd = ('gather {} {} -k 21'.format(query_sig, " ".join(testdata_sigs))
               + ' --output-unassigned=unassigned.sig'
               + ' --threshold=0')
        _, out, err = utils.runscript('sourmash', cmd.split(' '),
                                      in_directory=location)

        print(out)
        print(err)

        # NOTE(review): the table rows below are matched verbatim,
        # including their spacing -- confirm against real gather output.
        expected_in_out = (
            'found 12 matches total',
            'the recovered matches hit 100.0% of the query',
            '4.9 Mbp 33.2% 100.0%',
            'NC_003198.1 Salmonella enterica subsp...',
            '4.5 Mbp 0.1% 0.4%',
            'NC_004631.1 Salmonella enterica subsp...',
        )
        for fragment in expected_in_out:
            assert fragment in out

        # With every hash assigned, no output file should exist and the
        # user should be told why.
        unassigned_path = os.path.join(location, 'unassigned.sig')
        assert not os.path.exists(unassigned_path)
        assert 'no unassigned hashes to save with --output-unassigned!' in err


@utils.in_tempdir
def test_gather_metagenome_output_unassigned_nomatches(c):
# test --output-unassigned when there are no matches
Expand All @@ -3234,6 +3263,28 @@ def test_gather_metagenome_output_unassigned_nomatches(c):
assert x.minhash == y.minhash


@utils.in_tempdir
def test_gather_metagenome_output_unassigned_nomatches_protein(c):
    """Check `gather --output-unassigned` for a protein query with no matches.

    When gather finds zero matches, the signature saved as unassigned is
    expected to have a minhash equal to the query's and to report the
    "protein" molecule type.
    """
    query_sig = utils.get_test_data(
        'prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
    against_sig = utils.get_test_data(
        'prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig')

    c.run_sourmash('gather', query_sig, against_sig,
                   '--output-unassigned', 'foo.sig')
    gather_out = c.last_result.out
    print(gather_out)
    assert 'found 0 matches total;' in gather_out

    # Describe the saved signature so its details show up in the test log.
    c.run_sourmash('sig', 'describe', c.output('foo.sig'))
    print(c.last_result.out)

    original = sourmash.load_one_signature(query_sig, ksize=57)
    unassigned = sourmash.load_one_signature(c.output('foo.sig'))

    assert original.minhash == unassigned.minhash
    assert unassigned.minhash.moltype == "protein"


def test_gather_metagenome_downsample():
# downsample w/scaled of 100,000
with utils.TempDirectory() as location:
Expand Down

0 comments on commit 111b46e

Please sign in to comment.