Merge branch 'master' of github.com:biocore/redbiom
wasade committed Dec 20, 2022
2 parents 54ab86d + 727824d commit 8f0d140
Showing 15 changed files with 258 additions and 38 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
@@ -13,7 +13,7 @@ jobs:
matrix:
# based roughly on https://github.com/conda-incubator/setup-miniconda#example-1-basic-usage
# via biocore/empress
python-version: ["3.6", "3.7", "3.8", "3.9"]
python-version: ["3.7", "3.8", "3.9", "3.10"]

steps:
- name: Check out repository code
@@ -28,7 +28,7 @@ jobs:
- name: Install
shell: bash -l {0}
run: |
- conda create --yes -n test-env python=${{ matrix.python-version }} requests pandas click redis h5py nose cython
+ conda create --yes -n test-env python=${{ matrix.python-version }} requests pandas click redis h5py nose cython future
conda activate test-env
conda install -c conda-forge --yes scikit-bio biom-format
pip install flake8 nltk msgpack
1 change: 1 addition & 0 deletions Makefile
@@ -39,6 +39,7 @@ test: test_db
/bin/bash test.sh
nosetests
/bin/bash test_failures.sh # this blows away the db
+ /bin/bash test_external.sh # relies on public redbiom instance

test_bulk: test_db_bulk
/bin/bash test.sh
10 changes: 5 additions & 5 deletions README.md
@@ -358,18 +358,18 @@ Ambiguities can arise if the same sample was processed multiple times as might h

### Load some data (i.e., if you are running your own server)

- To make use of this cache, we need to load things. Loading can be done in parallel. First, we'll load up metadata. This will create keys in Redis which describe all of the columns associated with a sample (e.g., `metadata:categories:<sample_id>`), hash buckets for each category and sample combination (e.g., `metadata:category:<category_name>` as the hash and `<sample_id>` as the field), a set of all known categories (e.g., `metadata:categories-represented`), and a set of all known sample IDs (e.g., `metadata:samples-represented`):
+ To make use of this cache, we need to load things. Loading can be done in parallel. First, we need to set the server to be writable.

+ $ redbiom admin scripts-writable

+ Next, we'll load up metadata. This will create keys in Redis which describe all of the columns associated with a sample (e.g., `metadata:categories:<sample_id>`), hash buckets for each category and sample combination (e.g., `metadata:category:<category_name>` as the hash and `<sample_id>` as the field), a set of all known categories (e.g., `metadata:categories-represented`), and a set of all known sample IDs (e.g., `metadata:samples-represented`):

$ redbiom admin load-sample-metadata --metadata path/to/qiime/compat/mapping.txt
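
Once loaded, the keys described above can be spot-checked directly. A minimal sketch, assuming direct redis-cli access to the backing Redis instance (the category `ph` and sample ID `10317.000001` are hypothetical):

$ redis-cli SMEMBERS metadata:samples-represented
$ redis-cli SMEMBERS metadata:categories-represented
$ redis-cli HGET metadata:category:ph 10317.000001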

redbiom supports one-to-many mappings between sample metadata and actual sample data. This is done because there may be multiple types of processing performed on the same data (e.g., different nucleotide trims), or a physical sample may have been run through multiple protocols (e.g., 16S, WGS, etc.). So before we load any data, we need to create a context into which the data will be placed. The following action will add an entry into the `state:contexts` hash bucket keyed by `name` and valued by `description`:

$ redbiom admin create-context --name deblur-100nt --description "16S V4 Caporaso et al data deblurred at 100nt"

Next, we'll load up associations from every single feature in a BIOM table to all the samples it's found in. This will create Redis sets which can be accessed using keys of the form `<context_name>:samples:<feature_id>`. Note that we specify the context we're loading into.

$ redbiom admin load-features --context deblur-100nt --table /path/to/biom/table.biom

Last, let's load up all of the BIOM table data. We'll only store the non-zero values, and we'll encode the sample data into something simple so that it goes in as just a string to Redis. Important: we only support storing count data right now, not floating point. The keys created are of the form `<context_name>:sample:<redbiom_id>`. To reduce space, we reindex the feature IDs, as things like sOTUs tend to have very long names. The mapping is stable over all tables loaded (i.e., the same feature has the same index) and is stored under `<context_name>:feature-index`. Because we need to update the index, this operation cannot be done in parallel; however, the code is set up with a Redis-based mutex, so it's okay to queue up multiple loads.

$ redbiom load-sample-data --context deblur-100nt --table /path/to/biom/table.biom
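
To verify the context and per-feature sets after loading, a hypothetical redis-cli session (substitute a real feature ID for `<feature_id>`; direct Redis access is assumed):

$ redis-cli HGETALL state:contexts
$ redis-cli SMEMBERS deblur-100nt:samples:<feature_id>
$ redis-cli EXISTS deblur-100nt:feature-index
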
8 changes: 5 additions & 3 deletions redbiom/_requests.py
@@ -104,10 +104,12 @@ def make_script_exec(config):
config = redbiom.get_config()

def f(sha, *args):
- payload = [config['hostname'], 'EVALSHA', sha]
+ payload = [sha]
  payload.extend([str(a) for a in args])
- url = '/'.join(payload)
- return json.loads(_parse_validate_request(s.get(url), 'EVALSHA'))
+ payload = '/'.join(payload)
+ data = _format_request(None, 'EVALSHA', payload)
+ req = s.post(config['hostname'], data=data)
+ return json.loads(req.json()['EVALSHA'])

return f

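For context on the change above: script execution now issues an HTTP POST with the EVALSHA command in the request body, where the old path issued a GET with the command encoded in the URL. A rough curl equivalent, assuming the REST layer is Webdis on `http://127.0.0.1:7379` and that `<sha>` is a previously loaded script digest (both assumptions):

$ curl -s -X POST http://127.0.0.1:7379 -d 'EVALSHA/<sha>/0'

The response is a JSON object keyed by the command name, which is why the new code reads `req.json()['EVALSHA']`.
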
11 changes: 8 additions & 3 deletions redbiom/admin.py
@@ -251,8 +251,13 @@ def create_context(name, description):

config = redbiom.get_config()
post = redbiom._requests.make_post(config)
- post('state', 'HSET', "contexts/%s/%s" % (name, description))
- post(name, 'HSET', "state/db-version/%s" % redbiom.__db_version__)
+ try:
+     post('state', 'HSET', "contexts/%s/%s" % (name, description))
+     post(name, 'HSET', "state/db-version/%s" % redbiom.__db_version__)
+ except:  # noqa
+     import sys
+     print("Unable to create context: %s" % name, file=sys.stderr)
+     raise
ScriptManager.load_scripts()


@@ -566,7 +571,7 @@ def load_sample_metadata(md, tag=None):

# subset to only the novel IDs
represented = get('metadata', 'SMEMBERS', 'samples-represented')
- md = md.loc[set(md.index) - set(represented)]
+ md = md.loc[list(set(md.index) - set(represented))]
if len(md) == 0:
return 0

141 changes: 131 additions & 10 deletions redbiom/commands/fetch.py
@@ -140,18 +140,28 @@ def fetch_sample_metadata(from_, samples, all_columns, context, output,
help=("Resolve ambiguities that may be present in the samples "
"which can arise from, for example, technical "
"replicates."))
- @click.option('--skip-taxonomy', is_flag=True, default=False, required=False,
-               help=("Do not resolve taxonomy on fetch. Setting this flag can "
-                     "reduce the time required to complete a fetch"))
+ @click.option('--fetch-taxonomy', is_flag=True, default=False, required=False,
+               help=("Resolve taxonomy on fetch. Setting this flag increases "
+                     "the time required to complete a fetch. Note that Deblur "
+                     "contexts do not cache taxonomy."))
+ @click.option('--retain-artifact-id', is_flag=True, default=False,
+               required=False,
+               help=("If using --resolve-ambiguities=most-reads, set this flag "
+                     "to retain the artifact ID of the sample kept"))
@click.argument('features', nargs=-1)
def fetch_samples_from_obserations(features, exact, from_, output,
context, md5, resolve_ambiguities,
- skip_taxonomy):
+ fetch_taxonomy, retain_artifact_id):
"""Fetch sample data containing features."""
+ if retain_artifact_id and resolve_ambiguities != 'most-reads':
+     raise ValueError('--retain-artifact-id only impacts a most-reads '
+                      'ambiguity resolution')

import redbiom.util
iterable = redbiom.util.from_or_nargs(from_, features)

import redbiom.fetch
+ skip_taxonomy = not fetch_taxonomy
tab, map_ = redbiom.fetch.data_from_features(context, iterable, exact,
skip_taxonomy=skip_taxonomy)

@@ -163,7 +173,8 @@ def fetch_samples_from_obserations(features, exact, from_, output,
if resolve_ambiguities == 'merge':
tab = redbiom.fetch._ambiguity_merge(tab, map_)
elif resolve_ambiguities == 'most-reads':
- tab = redbiom.fetch._ambiguity_keep_most_reads(tab, map_)
+ tab = redbiom.fetch._ambiguity_keep_most_reads(tab, map_,
+                                                retain_artifact_id)

import h5py
with h5py.File(output, 'w') as fp:
@@ -189,17 +200,28 @@ def fetch_samples_from_obserations(features, exact, from_, output,
help=("Resolve ambiguities that may be present in the samples "
"which can arise from, for example, technical "
"replicates."))
- @click.option('--skip-taxonomy', is_flag=True, default=False, required=False,
-               help=("Do not resolve taxonomy on fetch. Setting this flag can "
-                     "reduce the time required to complete a fetch"))
+ @click.option('--fetch-taxonomy', is_flag=True, default=False, required=False,
+               help=("Resolve taxonomy on fetch. Setting this flag increases "
+                     "the time required to complete a fetch. Note that Deblur "
+                     "contexts do not cache taxonomy."))
+ @click.option('--retain-artifact-id', is_flag=True, default=False,
+               required=False,
+               help=("If using --resolve-ambiguities=most-reads, set this flag "
+                     "to retain the artifact ID of the sample kept"))
@click.argument('samples', nargs=-1)
def fetch_samples_from_samples(samples, from_, output, context, md5,
- resolve_ambiguities, skip_taxonomy):
+ resolve_ambiguities, fetch_taxonomy,
+ retain_artifact_id):
"""Fetch sample data."""
+ if retain_artifact_id and resolve_ambiguities != 'most-reads':
+     raise ValueError('--retain-artifact-id only impacts a most-reads '
+                      'ambiguity resolution')

import redbiom.util
iterable = redbiom.util.from_or_nargs(from_, samples)

import redbiom.fetch
+ skip_taxonomy = not fetch_taxonomy
table, ambig = redbiom.fetch.data_from_samples(context, iterable,
skip_taxonomy=skip_taxonomy)

@@ -211,14 +233,113 @@ def fetch_samples_from_samples(samples, from_, output, context, md5,
if resolve_ambiguities == 'merge':
table = redbiom.fetch._ambiguity_merge(table, ambig)
elif resolve_ambiguities == 'most-reads':
- table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig)
+ table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig,
+                                                  retain_artifact_id)

import h5py
with h5py.File(output, 'w') as fp:
table.to_hdf5(fp, 'redbiom')
_write_ambig(ambig, output)


+ @fetch.command(name='qiita-study')
+ @click.option('--study-id', type=int, required=True, help='The study to fetch')
+ @click.option('--context', required=True, type=str, default=None,
+               help="The context to fetch from.")
+ @click.option('--resolve-ambiguities', required=False,
+               type=click.Choice(['merge', 'most-reads']), default=None,
+               help=("Resolve ambiguities that may be present in the samples "
+                     "which can arise from, for example, technical "
+                     "replicates."))
+ @click.option('--fetch-taxonomy', is_flag=True, default=False, required=False,
+               help=("Resolve taxonomy on fetch. Setting this flag increases "
+                     "the time required to complete a fetch. Note that Deblur "
+                     "contexts do not cache taxonomy."))
+ @click.option('--retain-artifact-id', is_flag=True, default=False,
+               required=False,
+               help=("If using --resolve-ambiguities=most-reads, set this flag "
+                     "to retain the artifact ID of the sample kept"))
+ @click.option('--remove-blanks', is_flag=True, default=False, required=False,
+               help=("If True, remove samples with 'blank' in their name based "
+                     "on case insensitive substring match"))
+ @click.option('--output-basename', type=str, required=True,
+               help='The output file basename to use.')
+ @click.option('--md5', required=False, type=bool,
+               help="Calculate and use MD5 for the features. This will also "
+                    "save a tsv file with the original feature name and the md5",
+               default=False)
+ def qiita_study(study_id, context, resolve_ambiguities, fetch_taxonomy,
+                 retain_artifact_id, remove_blanks, output_basename, md5):
+     """Fetch sample data from a Qiita study."""
+     if retain_artifact_id and resolve_ambiguities != 'most-reads':
+         raise ValueError('--retain-artifact-id only impacts a most-reads '
+                          'ambiguity resolution')
+     import redbiom.search
+     query = "where qiita_study_id==%d" % study_id
+     samples = list(redbiom.search.metadata_full(query, categories=False))
+
+     import redbiom.fetch
+     skip_taxonomy = not fetch_taxonomy
+     table, ambig = redbiom.fetch.data_from_samples(context, samples,
+                                                    skip_taxonomy=skip_taxonomy)
+
+     if remove_blanks:
+         keep = {i for i in table.ids() if 'blank' not in i.lower()}
+         table = table.filter(keep).remove_empty()
+         ambig = {k: v for k, v in ambig.items() if k in keep}
+
+     if md5:
+         import redbiom.util
+         table, new_ids = redbiom.util.convert_biom_ids_to_md5(table)
+         with open(output_basename + 'md5mapping.tsv', 'w') as f:
+             f.write('\n'.join(['\t'.join(x) for x in new_ids.items()]))
+
+     if resolve_ambiguities == 'merge':
+         table = redbiom.fetch._ambiguity_merge(table, ambig)
+     elif resolve_ambiguities == 'most-reads':
+         table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig,
+                                                          retain_artifact_id)
+
+     import pandas as pd
+     md, map_ = redbiom.fetch.sample_metadata(samples, context=context,
+                                              common=False, tagged=False)
+
+     if resolve_ambiguities in ('merge', 'most-reads'):
+         if resolve_ambiguities == 'most-reads' and retain_artifact_id:
+             pass
+         else:
+             md.set_index('#SampleID', inplace=True)
+
+             # a temporary key to use when resolving ambiguities
+             # that will be removed before writing the metadata
+             key = "__@@AMBIGUITY@@__"
+
+             # add ambiguity information into the frame
+             ambigs = pd.Series(map_)
+             ambigs = ambigs.loc[md.index]
+             md[key] = ambigs
+
+             # remove duplicated unambiguous identifiers
+             md = md[~md[key].duplicated()]
+
+             # remove our index, and replace the entries with the ambiguous
+             # names
+             md.reset_index(inplace=True)
+             md['#SampleID'] = md[key]
+
+             # cleanup
+             md.drop(columns=key, inplace=True)
+
+     md.set_index('#SampleID', inplace=True)
+     md = md.loc[list(table.ids())]
+     md.to_csv(output_basename + '.tsv', sep='\t', header=True, index=True,
+               encoding='utf-8')
+
+     import h5py
+     with h5py.File(output_basename + '.biom', 'w') as fp:
+         table.to_hdf5(fp, 'redbiom')
+     _write_ambig(ambig, output_basename)


def _write_ambig(map_, output):
from collections import defaultdict
ambig = defaultdict(list)
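A usage sketch for the new subcommand (the study ID and context name are hypothetical):

$ redbiom fetch qiita-study --study-id 10317 --context deblur-100nt --remove-blanks --resolve-ambiguities merge --output-basename study10317

Per the code above, this writes study10317.biom and study10317.tsv, plus the ambiguity map emitted by `_write_ambig`; passing `--md5 True` would also write study10317md5mapping.tsv.
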
10 changes: 8 additions & 2 deletions redbiom/commands/search.py
@@ -117,8 +117,14 @@ def search_metadata(query, categories):
$ redbiom search metadata --categories "ph - water"
"""
import redbiom.search
- for i in redbiom.search.metadata_full(query, categories):
-     click.echo(i)
+ import sys
+
+ try:
+     for i in redbiom.search.metadata_full(query, categories):
+         click.echo(i)
+ except (TypeError, SyntaxError):
+     click.echo("The search query appears to be malformed.")
+     sys.exit(1)


@search.command(name='taxon')
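A quick sketch of the new error path; which malformed inputs actually raise TypeError or SyntaxError depends on the metadata query grammar, so the example query is illustrative:

$ redbiom search metadata 'where (('
The search query appears to be malformed.
$ echo $?
1
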
5 changes: 5 additions & 0 deletions redbiom/commands/summarize.py
@@ -205,6 +205,11 @@ def taxonomy(from_, context, normalize_ranks, features):
lineages = redbiom.fetch.taxon_ancestors(context, ids,
normalize=normalize_ranks)

+ if not lineages:
+     import sys
+     click.echo("No taxonomy information found.")
+     sys.exit(0)

import skbio
tree = skbio.TreeNode.from_taxonomy([(i, l)
for i, l in zip(ids, lineages)])
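
With this guard, summarizing a context that caches no taxonomy (e.g., a Deblur context, per the help text above) now exits cleanly instead of failing on an empty lineage list; an illustrative session with a hypothetical context and feature:

$ redbiom summarize taxonomy --context deblur-100nt <feature_id>
No taxonomy information found.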