[MRG] multigather using index as query (#1090)
* enable multigather from index

* add test for multigather with sbt query

* better test sbt usage with multigather, init lca testing

* upd for proper query, query-from-file usage

* add test for incorrect usage. better error msg would probably be good here

* use md5sum as filename if query.filename not properly set

* minor spacing etc

* add better error handling to load_file_list_of_signatures

* adjust test to work in py27

* fix errant decode

Co-authored-by: C. Titus Brown <[email protected]>
bluegenes and ctb authored Jul 14, 2020
1 parent 17123ba commit 8f5a55b
Showing 3 changed files with 341 additions and 94 deletions.
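The heart of the change is visible in the commands.py diff below: multigather now loads queries with load_file_as_signatures(), which yields every signature an input file holds, instead of load_query_signature(), which expects exactly one. That is what lets an SBT or LCA index act as the query collection. A minimal sketch of the new loading pattern, assuming sourmash 3.x; the file names and k-mer parameters here are illustrative, not taken from the PR:

    from sourmash import sourmash_args

    # illustrative inputs: a plain signature file plus an SBT index used as a query collection
    inp_files = ["one-genome.sig", "queries.sbt.zip"]

    # peek at one signature just to learn ksize/moltype before loading the search databases
    first_query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0],
                                                                  ksize=31,
                                                                  select_moltype='DNA')))

    # every signature in every input file then becomes its own gather query
    for queryfile in inp_files:
        for query in sourmash_args.load_file_as_signatures(queryfile,
                                                           ksize=31,
                                                           select_moltype='DNA'):
            print(query.name(), query.md5sum())

This is roughly what `sourmash multigather --query-from-file <list> --db <databases>` now does internally for each entry in the list.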
191 changes: 99 additions & 92 deletions sourmash/commands.py
@@ -721,10 +721,9 @@ def multigather(args):
         more_files = sourmash_args.load_file_list_of_signatures(args.query_from_file)
         inp_files.extend(more_files)

-    query = sourmash_args.load_query_signature(inp_files[0],
-                                               ksize=args.ksize,
-                                               select_moltype=moltype)
     # set up the search databases
+    # need a query to get ksize, moltype for db loading
+    query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0], ksize=args.ksize, select_moltype=moltype)))

     databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
                                                 args.traverse_directory)

@@ -733,108 +732,116 @@ def multigather(args):
         sys.exit(-1)

     # run gather on all the queries.
+    n=0
     for queryfile in inp_files:
-        # load the query signature & figure out all the things
-        query = sourmash_args.load_query_signature(queryfile,
-                                                   ksize=args.ksize,
-                                                   select_moltype=moltype)
-        notify('loaded query: {}... (k={}, {})', query.name()[:30],
-               query.minhash.ksize,
-               sourmash_args.get_moltype(query))
+        # load the query signature(s) & figure out all the things
+        for query in sourmash_args.load_file_as_signatures(queryfile,
+                                                           ksize=args.ksize,
+                                                           select_moltype=moltype):
+            notify('loaded query: {}... (k={}, {})', query.name()[:30],
+                   query.minhash.ksize,
+                   sourmash_args.get_moltype(query))

-        # verify signature was computed right.
-        if query.minhash.max_hash == 0:
-            error('query signature needs to be created with --scaled; skipping')
-            continue
+            # verify signature was computed right.
+            if query.minhash.max_hash == 0:
+                error('query signature needs to be created with --scaled; skipping')
+                continue

-        # downsample if requested
-        if args.scaled:
-            notify('downsampling query from scaled={} to {}',
-                   query.minhash.scaled, int(args.scaled))
-            query.minhash = query.minhash.downsample_scaled(args.scaled)
+            # downsample if requested
+            if args.scaled:
+                notify('downsampling query from scaled={} to {}',
+                       query.minhash.scaled, int(args.scaled))
+                query.minhash = query.minhash.downsample_scaled(args.scaled)

-        # empty?
-        if not len(query.minhash):
-            error('no query hashes!? skipping to next..')
-            continue
+            # empty?
+            if not len(query.minhash):
+                error('no query hashes!? skipping to next..')
+                continue

-        found = []
-        weighted_missed = 1
-        for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
-            if not len(found):   # first result? print header.
-                if query.minhash.track_abundance and not args.ignore_abundance:
-                    print_results("")
-                    print_results("overlap     p_query p_match avg_abund")
-                    print_results("---------   ------- ------- ---------")
-                else:
-                    print_results("")
-                    print_results("overlap     p_query p_match")
-                    print_results("---------   ------- -------")
+            found = []
+            weighted_missed = 1
+            for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
+                if not len(found):   # first result? print header.
+                    if query.minhash.track_abundance and not args.ignore_abundance:
+                        print_results("")
+                        print_results("overlap     p_query p_match avg_abund")
+                        print_results("---------   ------- ------- ---------")
+                    else:
+                        print_results("")
+                        print_results("overlap     p_query p_match")
+                        print_results("---------   ------- -------")


-            # print interim result & save in a list for later use
-            pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
-            pct_genome = '{:.1f}%'.format(result.f_match*100)
-            average_abund ='{:.1f}'.format(result.average_abund)
-            name = result.match._display_name(40)
+                # print interim result & save in a list for later use
+                pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
+                pct_genome = '{:.1f}%'.format(result.f_match*100)
+                average_abund ='{:.1f}'.format(result.average_abund)
+                name = result.match._display_name(40)

-            if query.minhash.track_abundance and not args.ignore_abundance:
-                print_results('{:9}   {:>7} {:>7} {:>9}    {}',
-                              format_bp(result.intersect_bp), pct_query, pct_genome,
-                              average_abund, name)
-            else:
-                print_results('{:9}   {:>7} {:>7}    {}',
-                              format_bp(result.intersect_bp), pct_query, pct_genome,
-                              name)
-            found.append(result)
+                if query.minhash.track_abundance and not args.ignore_abundance:
+                    print_results('{:9}   {:>7} {:>7} {:>9}    {}',
+                                  format_bp(result.intersect_bp), pct_query, pct_genome,
+                                  average_abund, name)
+                else:
+                    print_results('{:9}   {:>7} {:>7}    {}',
+                                  format_bp(result.intersect_bp), pct_query, pct_genome,
+                                  name)
+                found.append(result)


-        # basic reporting
-        print_results('\nfound {} matches total;', len(found))
+            # basic reporting
+            print_results('\nfound {} matches total;', len(found))

-        print_results('the recovered matches hit {:.1f}% of the query',
-                      (1 - weighted_missed) * 100)
-        print_results('')
+            print_results('the recovered matches hit {:.1f}% of the query',
+                          (1 - weighted_missed) * 100)
+            print_results('')

-        if not found:
-            notify('nothing found... skipping.')
-            continue
+            if not found:
+                notify('nothing found... skipping.')
+                continue

-        output_base = os.path.basename(queryfile)
-        output_csv = output_base + '.csv'
+            query_filename = query.filename
+            if not query_filename:
+                # use md5sum if query.filename not properly set
+                query_filename = query.md5sum()
+
+            output_base = os.path.basename(query_filename)
+            output_csv = output_base + '.csv'

-        fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
-                      'f_unique_to_query', 'f_unique_weighted',
-                      'average_abund', 'median_abund', 'std_abund', 'name',
-                      'filename', 'md5', 'f_match_orig']
-        with open(output_csv, 'wt') as fp:
-            w = csv.DictWriter(fp, fieldnames=fieldnames)
-            w.writeheader()
-            for result in found:
-                d = dict(result._asdict())
-                del d['match']      # actual signature not in CSV.
-                w.writerow(d)
+            fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
+                          'f_unique_to_query', 'f_unique_weighted',
+                          'average_abund', 'median_abund', 'std_abund', 'name',
+                          'filename', 'md5', 'f_match_orig']
+            with open(output_csv, 'wt') as fp:
+                w = csv.DictWriter(fp, fieldnames=fieldnames)
+                w.writeheader()
+                for result in found:
+                    d = dict(result._asdict())
+                    del d['match']      # actual signature not output to CSV!
+                    w.writerow(d)

-        output_matches = output_base + '.matches.sig'
-        with open(output_matches, 'wt') as fp:
-            outname = output_matches
-            notify('saving all matches to "{}"', outname)
-            sig.save_signatures([ r.match for r in found ], fp)
+            output_matches = output_base + '.matches.sig'
+            with open(output_matches, 'wt') as fp:
+                outname = output_matches
+                notify('saving all matches to "{}"', outname)
+                sig.save_signatures([ r.match for r in found ], fp)

-        output_unassigned = output_base + '.unassigned.sig'
-        with open(output_unassigned, 'wt') as fp:
-            if not found:
-                notify('nothing found - entire query signature unassigned.')
-            elif not len(query.minhash):
-                notify('no unassigned hashes! not saving.')
-            else:
-                notify('saving unassigned hashes to "{}"', output_unassigned)
+            output_unassigned = output_base + '.unassigned.sig'
+            with open(output_unassigned, 'wt') as fp:
+                if not found:
+                    notify('nothing found - entire query signature unassigned.')
+                elif not len(query.minhash):
+                    notify('no unassigned hashes! not saving.')
+                else:
+                    notify('saving unassigned hashes to "{}"', output_unassigned)

-                e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash)
-                e.add_many(next_query.minhash.get_mins())
-                sig.save_signatures([ sig.SourmashSignature(e) ], fp)
+                    e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash)
+                    e.add_many(next_query.minhash.get_mins())
+                    sig.save_signatures([ sig.SourmashSignature(e) ], fp)
+            n += 1

         # fini, next query!
+    notify('\nconducted gather searches on {} signatures', n)


 def watch(args):
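One subtlety in the rewritten loop above: signatures pulled out of an index often have an empty filename attribute, so the old os.path.basename(queryfile) naming would collide or come out empty once several queries share one input file. The per-query fallback added in the diff can be summarized by this small helper; it is a restatement of the inline logic for clarity, not a function that exists in sourmash:

    import os

    def per_query_output_base(query):
        """Choose a distinct output prefix for one gather query (sketch, not sourmash API)."""
        query_filename = query.filename
        if not query_filename:
            # use md5sum if query.filename is not properly set,
            # e.g. for signatures loaded from an SBT/LCA index
            query_filename = query.md5sum()
        return os.path.basename(query_filename)

    # multigather then writes <base>.csv, <base>.matches.sig and <base>.unassigned.sig per query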
9 changes: 7 additions & 2 deletions sourmash/sourmash_args.py
@@ -503,8 +503,13 @@ def load_file_as_signatures(filename, select_moltype=None, ksize=None,

 def load_file_list_of_signatures(filename):
     "Load a list-of-files text file."
-    with open(filename, 'rt') as fp:
-        file_list = [ x.rstrip('\r\n') for x in fp ]
+    try:
+        with open(filename, 'rt') as fp:
+            file_list = [ x.rstrip('\r\n') for x in fp ]
+    except OSError:
+        raise ValueError("cannot open file '{}'".format(filename))
+    except UnicodeDecodeError:
+        raise ValueError("cannot parse file '{}' as list of filenames".format(filename))

     return file_list
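The commit message mentions a test for incorrect usage of --query-from-file; the exception conversion above makes that straightforward to exercise. A hypothetical pytest sketch (not the PR's actual test, and assuming a UTF-8 locale so the binary file triggers the decode error):

    import pytest
    from sourmash import sourmash_args

    def test_load_file_list_of_signatures_bad_inputs(tmp_path):
        # nonexistent path: OSError is converted into ValueError
        with pytest.raises(ValueError):
            sourmash_args.load_file_list_of_signatures(str(tmp_path / "no-such-list.txt"))

        # binary file passed by mistake (e.g. an .sbt.zip): UnicodeDecodeError
        # is also converted into ValueError
        bogus = tmp_path / "not-a-list"
        bogus.write_bytes(b"\xff\xfe\x80\x81")
        with pytest.raises(ValueError):
            sourmash_args.load_file_list_of_signatures(str(bogus))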
