diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 01e7543..6f5e750 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,7 @@ jobs: matrix: # based roughly on https://github.com/conda-incubator/setup-miniconda#example-1-basic-usage # via biocore/empress - python-version: ["3.6", "3.7", "3.8", "3.9"] + python-version: ["3.7", "3.8", "3.9", "3.10"] steps: - name: Check out repository code @@ -28,7 +28,7 @@ jobs: - name: Install shell: bash -l {0} run: | - conda create --yes -n test-env python=${{ matrix.python-version }} requests pandas click redis h5py nose cython + conda create --yes -n test-env python=${{ matrix.python-version }} requests pandas click redis h5py nose cython future conda activate test-env conda install -c conda-forge --yes scikit-bio biom-format pip install flake8 nltk msgpack diff --git a/Makefile b/Makefile index d54b2eb..b52e4e2 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,7 @@ test: test_db /bin/bash test.sh nosetests /bin/bash test_failures.sh # this blows away the db + /bin/bash test_external.sh # relies on public redbiom instance test_bulk: test_db_bulk /bin/bash test.sh diff --git a/README.md b/README.md index 7c6ab7c..ce2b836 100644 --- a/README.md +++ b/README.md @@ -358,7 +358,11 @@ Ambiguities can arise if the same sample was processed multiple times as might h ### Load some data (i.e., if you are running your own server) -To make use of this cache, we need to load things. Loading can be done in parallel. First, we'll load up metadata. This will create keys in Redis which describe all of the columns associated with a sample (e.g., `metadata:categories:`, hash buckets for each category and sample combination (e.g., `metadata:category:` as the hash and `` as the field), a set of all known categories (e.g., `metadata:categories-represented`), and a set of all known sample IDs (e.g., `metadata:samples-represented`): +To make use of this cache, we need to load things. 
Loading can be done in parallel. First, we need to set the server to be writable. + + $ redbiom admin scripts-writable + +Next, we'll load up metadata. This will create keys in Redis which describe all of the columns associated with a sample (e.g., `metadata:categories:`, hash buckets for each category and sample combination (e.g., `metadata:category:` as the hash and `` as the field), a set of all known categories (e.g., `metadata:categories-represented`), and a set of all known sample IDs (e.g., `metadata:samples-represented`): $ redbiom admin load-sample-metadata --metadata path/to/qiime/compat/mapping.txt @@ -366,10 +370,6 @@ redbiom supports one to many mappings between sample metadata and actual sample $ redbiom admin create-context --name deblur-100nt --description "16S V4 Caporaso et al data deblurred at 100nt" -Next, we'll load up associations between every single feature in a BIOM table to all the samples its found in. This will create Redis sets and can be accessed using keys of the form `:samples:`. Note that we specify the context we're loading into. - - $ redbiom admin load-features --context deblur-100nt --table /path/to/biom/table.biom - Last, let's load up all of the BIOM table data. We'll only store the non-zero values, and we'll encode the sample data into something simple so that it goes in as just a string to Redis. Important: we only support storing count data right now, not floating point. The keys created are of the form `:sample:`. To reduce space, we reindex the feature IDs as things like sOTUs tend to be very long in name. The mapping is stable over all tables loaded (ie the same feature has the same index), and is stored under `:feature-index`. Because we need to update the index, this operation cannot be done in parallel however the code is setup with a redis-based mutex so it's okay to queue up multiple loads. 
$ redbiom load-sample-data --context deblur-100nt --table /path/to/biom/table.biom diff --git a/redbiom/_requests.py b/redbiom/_requests.py index 4bedee2..916287b 100644 --- a/redbiom/_requests.py +++ b/redbiom/_requests.py @@ -104,10 +104,12 @@ def make_script_exec(config): config = redbiom.get_config() def f(sha, *args): - payload = [config['hostname'], 'EVALSHA', sha] + payload = [sha] payload.extend([str(a) for a in args]) - url = '/'.join(payload) - return json.loads(_parse_validate_request(s.get(url), 'EVALSHA')) + payload = '/'.join(payload) + data = _format_request(None, 'EVALSHA', payload) + req = s.post(config['hostname'], data=data) + return json.loads(req.json()['EVALSHA']) return f diff --git a/redbiom/admin.py b/redbiom/admin.py index 59db9c6..080a520 100644 --- a/redbiom/admin.py +++ b/redbiom/admin.py @@ -251,8 +251,13 @@ def create_context(name, description): config = redbiom.get_config() post = redbiom._requests.make_post(config) - post('state', 'HSET', "contexts/%s/%s" % (name, description)) - post(name, 'HSET', "state/db-version/%s" % redbiom.__db_version__) + try: + post('state', 'HSET', "contexts/%s/%s" % (name, description)) + post(name, 'HSET', "state/db-version/%s" % redbiom.__db_version__) + except: # noqa + import sys + print("Unable to create context: %s" % name, file=sys.stderr) + raise ScriptManager.load_scripts() @@ -566,7 +571,7 @@ def load_sample_metadata(md, tag=None): # subset to only the novel IDs represented = get('metadata', 'SMEMBERS', 'samples-represented') - md = md.loc[set(md.index) - set(represented)] + md = md.loc[list(set(md.index) - set(represented))] if len(md) == 0: return 0 diff --git a/redbiom/commands/fetch.py b/redbiom/commands/fetch.py index b970778..24d8829 100644 --- a/redbiom/commands/fetch.py +++ b/redbiom/commands/fetch.py @@ -140,18 +140,28 @@ def fetch_sample_metadata(from_, samples, all_columns, context, output, help=("Resolve ambiguities that may be present in the samples " "which can arise from, for 
example, technical " "replicates.")) -@click.option('--skip-taxonomy', is_flag=True, default=False, required=False, - help=("Do not resolve taxonomy on fetch. Setting this flag can " - "reduce the time required to complete a fetch")) +@click.option('--fetch-taxonomy', is_flag=True, default=False, required=False, + help=("Resolve taxonomy on fetch. Setting this flag increases " + "the time required to complete a fetch. Note that Deblur " + "contexts do not cache taxonomy.")) +@click.option('--retain-artifact-id', is_flag=True, default=False, + required=False, + help=("If using --resolve-ambiguities=most-reads, set this flag " + "to retain the artifact ID of the sample kept")) @click.argument('features', nargs=-1) def fetch_samples_from_obserations(features, exact, from_, output, context, md5, resolve_ambiguities, - skip_taxonomy): + fetch_taxonomy, retain_artifact_id): """Fetch sample data containing features.""" + if retain_artifact_id and resolve_ambiguities != 'most-reads': + raise ValueError('--retain-artifact-id only impacts a most-reads ' + 'ambiguity resolution') + import redbiom.util iterable = redbiom.util.from_or_nargs(from_, features) import redbiom.fetch + skip_taxonomy = not fetch_taxonomy tab, map_ = redbiom.fetch.data_from_features(context, iterable, exact, skip_taxonomy=skip_taxonomy) @@ -163,7 +173,8 @@ def fetch_samples_from_obserations(features, exact, from_, output, if resolve_ambiguities == 'merge': tab = redbiom.fetch._ambiguity_merge(tab, map_) elif resolve_ambiguities == 'most-reads': - tab = redbiom.fetch._ambiguity_keep_most_reads(tab, map_) + tab = redbiom.fetch._ambiguity_keep_most_reads(tab, map_, + retain_artifact_id) import h5py with h5py.File(output, 'w') as fp: @@ -189,17 +200,28 @@ def fetch_samples_from_obserations(features, exact, from_, output, help=("Resolve ambiguities that may be present in the samples " "which can arise from, for example, technical " "replicates.")) -@click.option('--skip-taxonomy', is_flag=True, 
default=False, required=False, - help=("Do not resolve taxonomy on fetch. Setting this flag can " - "reduce the time required to complete a fetch")) +@click.option('--fetch-taxonomy', is_flag=True, default=False, required=False, + help=("Resolve taxonomy on fetch. Setting this flag increases " + "the time required to complete a fetch. Note that Deblur " + "contexts do not cache taxonomy.")) +@click.option('--retain-artifact-id', is_flag=True, default=False, + required=False, + help=("If using --resolve-ambiguities=most-reads, set this flag " + "to retain the artifact ID of the sample kept")) @click.argument('samples', nargs=-1) def fetch_samples_from_samples(samples, from_, output, context, md5, - resolve_ambiguities, skip_taxonomy): + resolve_ambiguities, fetch_taxonomy, + retain_artifact_id): """Fetch sample data.""" + if retain_artifact_id and resolve_ambiguities != 'most-reads': + raise ValueError('--retain-artifact-id only impacts a most-reads ' + 'ambiguity resolution') + import redbiom.util iterable = redbiom.util.from_or_nargs(from_, samples) import redbiom.fetch + skip_taxonomy = not fetch_taxonomy table, ambig = redbiom.fetch.data_from_samples(context, iterable, skip_taxonomy=skip_taxonomy) @@ -211,7 +233,8 @@ def fetch_samples_from_samples(samples, from_, output, context, md5, if resolve_ambiguities == 'merge': table = redbiom.fetch._ambiguity_merge(table, ambig) elif resolve_ambiguities == 'most-reads': - table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig) + table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig, + retain_artifact_id) import h5py with h5py.File(output, 'w') as fp: @@ -219,6 +242,104 @@ def fetch_samples_from_samples(samples, from_, output, context, md5, _write_ambig(ambig, output) +@fetch.command(name='qiita-study') +@click.option('--study-id', type=int, required=True, help='The study to fetch') +@click.option('--context', required=True, type=str, default=None, + help="The context to fetch from.") 
+@click.option('--resolve-ambiguities', required=False, + type=click.Choice(['merge', 'most-reads']), default=None, + help=("Resolve ambiguities that may be present in the samples " + "which can arise from, for example, technical " + "replicates.")) +@click.option('--fetch-taxonomy', is_flag=True, default=False, required=False, + help=("Resolve taxonomy on fetch. Setting this flag increases " + "the time required to complete a fetch. Note that Deblur " + "contexts do not cache taxonomy.")) +@click.option('--retain-artifact-id', is_flag=True, default=False, + required=False, + help=("If using --resolve-ambiguities=most-reads, set this flag " + "to retain the artifact ID of the sample kept")) +@click.option('--remove-blanks', is_flag=True, default=False, required=False, + help=("If True, remove samples with 'blank' in their name based " + "on case insensitive substring match")) +@click.option('--output-basename', type=str, required=True, + help='The output file basename to use.') +@click.option('--md5', required=False, type=bool, + help="Calculate and use MD5 for the features. 
This will also " + "save a tsv file with the original feature name and the md5", + default=False) +def qiita_study(study_id, context, resolve_ambiguities, fetch_taxonomy, + retain_artifact_id, remove_blanks, output_basename, md5): + """Fetch sample data from a Qiita study.""" + if retain_artifact_id and resolve_ambiguities != 'most-reads': + raise ValueError('--retain-artifact-id only impacts a most-reads ' + 'ambiguity resolution') + import redbiom.search + query = "where qiita_study_id==%d" % study_id + samples = list(redbiom.search.metadata_full(query, categories=False)) + + import redbiom.fetch + skip_taxonomy = not fetch_taxonomy + table, ambig = redbiom.fetch.data_from_samples(context, samples, + skip_taxonomy=skip_taxonomy) + + if remove_blanks: + keep = {i for i in table.ids() if 'blank' not in i.lower()} + table = table.filter(keep).remove_empty() + ambig = {k: v for k, v in ambig.items() if k in keep} + + if md5: + table, new_ids = redbiom.util.convert_biom_ids_to_md5(table) + with open(output_basename + 'md5mapping.tsv', 'w') as f: + f.write('\n'.join(['\t'.join(x) for x in new_ids.items()])) + + if resolve_ambiguities == 'merge': + table = redbiom.fetch._ambiguity_merge(table, ambig) + elif resolve_ambiguities == 'most-reads': + table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig, + retain_artifact_id) + + import pandas as pd + md, map_ = redbiom.fetch.sample_metadata(samples, context=context, + common=False, tagged=False) + + if resolve_ambiguities in ('merge', 'most-reads'): + if resolve_ambiguities == 'most-reads' and retain_artifact_id: + pass + else: + md.set_index('#SampleID', inplace=True) + + # a temporary key to use when resolving ambiguities + # that will be removed before writing the metadata + key = "__@@AMBIGUITY@@__" + + # add ambiguity information into the frame + ambigs = pd.Series(map_) + ambigs = ambigs.loc[md.index] + md[key] = ambigs + + # remove duplicated unambiguous identifiers + md = md[~md[key].duplicated()] + + # 
remove our index, and replace the entries with the ambiguous + # names + md.reset_index(inplace=True) + md['#SampleID'] = md[key] + + # cleanup + md.drop(columns=key, inplace=True) + + md.set_index('#SampleID', inplace=True) + md = md.loc[list(table.ids())] + md.to_csv(output_basename + '.tsv', sep='\t', header=True, index=True, + encoding='utf-8') + + import h5py + with h5py.File(output_basename + '.biom', 'w') as fp: + table.to_hdf5(fp, 'redbiom') + _write_ambig(ambig, output_basename) + + def _write_ambig(map_, output): from collections import defaultdict ambig = defaultdict(list) diff --git a/redbiom/commands/search.py b/redbiom/commands/search.py index d3771eb..e8086d9 100644 --- a/redbiom/commands/search.py +++ b/redbiom/commands/search.py @@ -117,8 +117,14 @@ def search_metadata(query, categories): $ redbiom search metadata --categories "ph - water" """ import redbiom.search - for i in redbiom.search.metadata_full(query, categories): - click.echo(i) + import sys + + try: + for i in redbiom.search.metadata_full(query, categories): + click.echo(i) + except (TypeError, SyntaxError): + click.echo("The search query appears to be malformed.") + sys.exit(1) @search.command(name='taxon') diff --git a/redbiom/commands/summarize.py b/redbiom/commands/summarize.py index f473249..b6a99cd 100644 --- a/redbiom/commands/summarize.py +++ b/redbiom/commands/summarize.py @@ -205,6 +205,11 @@ def taxonomy(from_, context, normalize_ranks, features): lineages = redbiom.fetch.taxon_ancestors(context, ids, normalize=normalize_ranks) + if not lineages: + import sys + click.echo("No taxonomy information found.") + sys.exit(0) + import skbio tree = skbio.TreeNode.from_taxonomy([(i, l) for i, l in zip(ids, lineages)]) diff --git a/redbiom/fetch.py b/redbiom/fetch.py index 3c954f2..023a75c 100644 --- a/redbiom/fetch.py +++ b/redbiom/fetch.py @@ -198,7 +198,7 @@ def sample_metadata(samples, common=True, context=None, restrict_to=None, getter = 
redbiom._requests.buffered(list(ambig_assoc), 'categories', 'MGET', 'metadata', get=get, - buffer_size=100) + buffer_size=200) for samples, columns_by_sample in getter: all_samples.extend(samples) for column_set in columns_by_sample: @@ -381,7 +381,6 @@ def _biom_from_samples(context, samples, get=None, normalize_taxonomy=None, # fill in the matrix mat = ss.lil_matrix((len(unique_indices), len(table_data))) for col, (sample, col_data) in enumerate(table_data): - # since this isn't dense, hopefully roworder doesn't hose us for obs_id, value in col_data.items(): mat[unique_indices_map[obs_id], col] = value @@ -440,7 +439,7 @@ def taxon_ancestors(context, ids, get=None, normalize=None): hmgetter = redbiom._requests.buffered remapped_bulk = hmgetter(iter(ids), None, 'HMGET', context, - get=get, buffer_size=100, + get=get, buffer_size=200, multikey='feature-index') # map the feature identifier to an internal ID @@ -459,7 +458,7 @@ def taxon_ancestors(context, ids, get=None, normalize=None): key = 'taxonomy-parents' getter = hmgetter(iter(to_get), None, 'HMGET', context, get=get, - buffer_size=100, multikey=key) + buffer_size=200, multikey=key) new_to_get = set() for block in getter: @@ -539,7 +538,7 @@ def taxon_descendents(context, taxon, get=None): to_get = new_to_get remapped_bulk = hmgetter(to_keep, None, 'HMGET', context, - get=get, buffer_size=100, + get=get, buffer_size=200, multikey='feature-index-inverted') remapped = {name @@ -688,7 +687,7 @@ def metadata(where=None, tag=None, restrict_to=None): getter = redbiom._requests.buffered(samples, 'categories', 'MGET', 'metadata', get=get, - buffer_size=100) + buffer_size=200) samples_to_get = [] for chunk in getter: @@ -752,7 +751,7 @@ def get_sample_values(samples, category, get=None): getter = redbiom._requests.buffered(iter(samples), None, 'HMGET', 'metadata', get=get, - buffer_size=100, + buffer_size=200, multikey=key) return [item for chunk in getter for item in zip(*chunk)] @@ -798,7 +797,7 @@ def collapser(i, 
m): return collapsed_table -def _ambiguity_keep_most_reads(table, ambig_map): +def _ambiguity_keep_most_reads(table, ambig_map, retain_artifact_id=False): """Keep the ambiguous sample with the most reads Parameters @@ -807,6 +806,8 @@ def _ambiguity_keep_most_reads(table, ambig_map): The table obtained from redbiom ambig_map : dict A mapping of a sample ID in the table to its ambiguous form. + retain_artifact_id : boolean, default False + If True, do not strip the artifact ID Returns ------- @@ -841,6 +842,8 @@ def _ambiguity_keep_most_reads(table, ambig_map): to_keep.append(sample_ids[0]) subset_table = table.filter(set(to_keep), inplace=False).remove_empty() - subset_table.update_ids(ambig_map, inplace=True) + + if not retain_artifact_id: + subset_table.update_ids(ambig_map, inplace=True) return subset_table diff --git a/redbiom/search.py b/redbiom/search.py index 5adfb1e..5d41808 100644 --- a/redbiom/search.py +++ b/redbiom/search.py @@ -93,10 +93,12 @@ def query_plan(query): return [('where', part)] parts = query.split('where', 1) - for part in parts: + for i, part in enumerate(parts): if not part: raise ValueError('No query') + parts[i] = parts[i].strip() + if len(parts) == 1: return [('set', parts[0].strip())] else: diff --git a/redbiom/tests/test_fetch.py b/redbiom/tests/test_fetch.py index 9be8961..e24f8d5 100644 --- a/redbiom/tests/test_fetch.py +++ b/redbiom/tests/test_fetch.py @@ -5,7 +5,7 @@ import numpy as np import biom import pandas as pd -import pandas.util.testing as pdt +import pandas.testing as pdt import redbiom.admin import redbiom.fetch @@ -418,6 +418,28 @@ def test_ambiguity_keep_most_reads(self): obs_table = _ambiguity_keep_most_reads(table, ambig_map) self.assertEqual(obs_table, exp_table) + def test_ambiguity_keep_most_reads_retain_artifact_id(self): + ambig_map = {'10317.1234.foo': '10317.1234', + '10317.1234.bar': '10317.1234', + '10317.4321.foo': '10317.4321', + '10317.1234.baz': '10317.1234'} + + table = biom.Table(np.array([[0, 3, 2, 
1], + [4, 7, 6, 5], + [8, 11, 10, 9]]), + ['O1', 'O2', 'O3'], + ['10317.1234.foo', + '10317.1234.bar', + '10317.4321.foo', + '10317.1234.baz']) + + exp_table = biom.Table(np.array([[3, 2], [7, 6], [11, 10]]), + ['O1', 'O2', 'O3'], + ['10317.1234.bar', '10317.4321.foo']) + + obs_table = _ambiguity_keep_most_reads(table, ambig_map, True) + self.assertEqual(obs_table, exp_table) + def test_ambiguity_keep_most_reads_mismatch(self): ambig_map = {'10317.1234.foo': '10317.1234', '10317.4321.foo': '10317.4321', diff --git a/redbiom/tests/test_where_expr.py b/redbiom/tests/test_where_expr.py index 62d6f99..af90339 100644 --- a/redbiom/tests/test_where_expr.py +++ b/redbiom/tests/test_where_expr.py @@ -52,6 +52,8 @@ def test_whereeval(self): ("(age >= 5) <= 15", {'D', 'C'}), ("sex == 'male'", {'D', }), ("sex in ('male', 'female')", {'A', 'B', 'D'}), + ("'male' in sex", {'D', }), + ("'male' not in sex", {'A', 'B', 'C'}), ("sex is 'male' or age < 11", {'D', 'A', 'C'}), ("(age <= 10) != 8 and sex is 'male'", {'D', }), ("(age <= 10) != 8 or sex is 'male'", {'D', 'A', 'C'}), diff --git a/redbiom/where_expr.py b/redbiom/where_expr.py index 9dabac3..a9ee2a8 100644 --- a/redbiom/where_expr.py +++ b/redbiom/where_expr.py @@ -99,11 +99,24 @@ def NotEq(): def _in(left, right): - return left[left.isin(right)] + if isinstance(right, tuple): + # some_category in foo + return left[left.isin(right)] + else: + # foo in some_category + if len(right) == 0: + return pd.Series([]) + + return right[right.isin([left])] def _notin(left, right): - return left[~left.isin(right)] + if isinstance(right, tuple): + # some_category not in foo + return left[~left.isin(right)] + else: + # foo not in some_category + return right[~right.isin([left])] def Or(): diff --git a/test.sh b/test.sh index eb6332a..372fdaa 100644 --- a/test.sh +++ b/test.sh @@ -226,3 +226,9 @@ redbiom search samples --context test 10317.000003302 | sort - > obs_sample_sear redbiom search samples --context test UNTAGGED_10317.000003302 
| sort - > obs_sample_search_rbid.txt md5test obs_sample_search.txt exp_sample_search.txt md5test obs_sample_search_rbid.txt exp_sample_search.txt + +if [[ $(redbiom search metadata "where 'UBERON:feces' in BODY_SITE" | wc -l | awk '{ print $1 }') != "11" ]]; +then + echo "fail" + exit 1 +fi diff --git a/test_external.sh b/test_external.sh new file mode 100644 index 0000000..02d35e3 --- /dev/null +++ b/test_external.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -x +set -e + +unset REDBIOM_HOST + +# WARNING: the context is not assured to be stable. It is possible +# that in the future, the context name will change and this test +# will fail. However, that is unlikely to happen anytime soon so +# let's do the simple thing of hard coding it. +redbiom fetch qiita-study \ + --study-id 2136 \ + --context Deblur_2021.09-Illumina-16S-V4-90nt-dd6875 \ + --output-basename 2136test \ + --resolve-ambiguities most-reads \ + --remove-blanks + +in_table=$(biom table-ids -i 2136test.biom | wc -l) +in_md=$(grep "^2136" 2136test.tsv | wc -l) +if [[ ${in_table} != ${in_md} ]]; +then + exit 1 +fi + +if [[ ! ${in_table} -eq 450 ]]; +then + exit 1 +fi + +# see https://github.com/biocore/redbiom/issues/117 +redbiom fetch samples --context Deblur_2021.09-Illumina-16S-V4-150nt-ac8c0b --output samples.biom 11896.A2.raw