From 9ac740e0728a811c89c6f446131ec09830eecf26 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 16 Dec 2022 15:12:09 -0800 Subject: [PATCH 01/20] ENH: allow retaininig the artifact ID, fixes #119 --- redbiom/commands/fetch.py | 27 +++++++++++++++++++++++---- redbiom/fetch.py | 8 ++++++-- redbiom/tests/test_fetch.py | 22 ++++++++++++++++++++++ 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/redbiom/commands/fetch.py b/redbiom/commands/fetch.py index b970778..be78942 100644 --- a/redbiom/commands/fetch.py +++ b/redbiom/commands/fetch.py @@ -143,11 +143,19 @@ def fetch_sample_metadata(from_, samples, all_columns, context, output, @click.option('--skip-taxonomy', is_flag=True, default=False, required=False, help=("Do not resolve taxonomy on fetch. Setting this flag can " "reduce the time required to complete a fetch")) +@click.option('--retain-artifact-id', is_flag=True, default=False, + required=False, + help=("If using --resolve-ambiguities=most-reads, set this flag " + "to retain the artifact ID of the sample kept")) @click.argument('features', nargs=-1) def fetch_samples_from_obserations(features, exact, from_, output, context, md5, resolve_ambiguities, - skip_taxonomy): + skip_taxonomy, retain_artifact_id): """Fetch sample data containing features.""" + if retain_artifact_id and resolve_ambiguities != 'merge-reads': + raise ValueError('--retain-artifact-id only impacts a merge-reads ' + 'ambiguity resolution') + import redbiom.util iterable = redbiom.util.from_or_nargs(from_, features) @@ -163,7 +171,8 @@ def fetch_samples_from_obserations(features, exact, from_, output, if resolve_ambiguities == 'merge': tab = redbiom.fetch._ambiguity_merge(tab, map_) elif resolve_ambiguities == 'most-reads': - tab = redbiom.fetch._ambiguity_keep_most_reads(tab, map_) + tab = redbiom.fetch._ambiguity_keep_most_reads(tab, map_, + retain_artifact_id) import h5py with h5py.File(output, 'w') as fp: @@ -192,10 +201,19 @@ def fetch_samples_from_obserations(features, exact, from_, output, @click.option('--skip-taxonomy', is_flag=True, default=False, required=False, help=("Do not resolve taxonomy on fetch. Setting this flag can " "reduce the time required to complete a fetch")) +@click.option('--retain-artifact-id', is_flag=True, default=False, + required=False, + help=("If using --resolve-ambiguities=most-reads, set this flag " + "to retain the artifact ID of the sample kept")) @click.argument('samples', nargs=-1) def fetch_samples_from_samples(samples, from_, output, context, md5, - resolve_ambiguities, skip_taxonomy): + resolve_ambiguities, skip_taxonomy, + retain_artifact_id): """Fetch sample data.""" + if retain_artifact_id and resolve_ambiguities != 'merge-reads': + raise ValueError('--retain-artifact-id only impacts a merge-reads ' + 'ambiguity resolution') + import redbiom.util iterable = redbiom.util.from_or_nargs(from_, samples) @@ -211,7 +229,8 @@ def fetch_samples_from_samples(samples, from_, output, context, md5, if resolve_ambiguities == 'merge': table = redbiom.fetch._ambiguity_merge(table, ambig) elif resolve_ambiguities == 'most-reads': - table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig) + table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig, + retain_artifact_id) import h5py with h5py.File(output, 'w') as fp: diff --git a/redbiom/fetch.py b/redbiom/fetch.py index 3c954f2..d96bce0 100644 --- a/redbiom/fetch.py +++ b/redbiom/fetch.py @@ -798,7 +798,7 @@ def collapser(i, m): return collapsed_table -def _ambiguity_keep_most_reads(table, ambig_map): +def _ambiguity_keep_most_reads(table, ambig_map, retain_artifact_id=False): """Keep the ambiguous sample with the most reads Parameters @@ -807,6 +807,8 @@ def _ambiguity_keep_most_reads(table, ambig_map): The table obtained from redbiom ambig_map : dict A mapping of a sample ID in the table to its ambiguous form. + retain_artifact_id : boolean, default False + If True, do not strip the artifact ID Returns ------- @@ -841,6 +843,8 @@ def _ambiguity_keep_most_reads(table, ambig_map): to_keep.append(sample_ids[0]) subset_table = table.filter(set(to_keep), inplace=False).remove_empty() - subset_table.update_ids(ambig_map, inplace=True) + + if not retain_artifact_id: + subset_table.update_ids(ambig_map, inplace=True) return subset_table diff --git a/redbiom/tests/test_fetch.py b/redbiom/tests/test_fetch.py index 9be8961..d2942fe 100644 --- a/redbiom/tests/test_fetch.py +++ b/redbiom/tests/test_fetch.py @@ -418,6 +418,28 @@ def test_ambiguity_keep_most_reads(self): obs_table = _ambiguity_keep_most_reads(table, ambig_map) self.assertEqual(obs_table, exp_table) + def test_ambiguity_keep_most_reads_retain_artifact_id(self): + ambig_map = {'10317.1234.foo': '10317.1234', + '10317.1234.bar': '10317.1234', + '10317.4321.foo': '10317.4321', + '10317.1234.baz': '10317.1234'} + + table = biom.Table(np.array([[0, 3, 2, 1], + [4, 7, 6, 5], + [8, 11, 10, 9]]), + ['O1', 'O2', 'O3'], + ['10317.1234.foo', + '10317.1234.bar', + '10317.4321.foo', + '10317.1234.baz']) + + exp_table = biom.Table(np.array([[3, 2], [7, 6], [11, 10]]), + ['O1', 'O2', 'O3'], + ['10317.1234.bar', '10317.4321.foo']) + + obs_table = _ambiguity_keep_most_reads(table, ambig_map, True) + self.assertEqual(obs_table, exp_table) + def test_ambiguity_keep_most_reads_mismatch(self): ambig_map = {'10317.1234.foo': '10317.1234', '10317.4321.foo': '10317.4321', From edd8b2bac5a039d5ea43439ce04ec16c0ad8b4f5 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 16 Dec 2022 15:32:05 -0800 Subject: [PATCH 02/20] future --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 01e7543..1423bfc 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -28,7 +28,7 @@ jobs: - name: Install shell: bash -l {0} run: | - conda create --yes -n test-env python=${{ matrix.python-version }} requests pandas click redis h5py nose cython + conda create --yes -n test-env python=${{ matrix.python-version }} requests pandas click redis h5py nose cython future conda activate test-env conda install -c conda-forge --yes scikit-bio biom-format pip install flake8 nltk msgpack From 11a3ebe4cc33d260d99a06173d46696c64486289 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 16 Dec 2022 15:46:09 -0800 Subject: [PATCH 03/20] MAINT: Use post for script get, fixes #117 --- redbiom/_requests.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/redbiom/_requests.py b/redbiom/_requests.py index 4bedee2..916287b 100644 --- a/redbiom/_requests.py +++ b/redbiom/_requests.py @@ -104,10 +104,12 @@ def make_script_exec(config): config = redbiom.get_config() def f(sha, *args): - payload = [config['hostname'], 'EVALSHA', sha] + payload = [sha] payload.extend([str(a) for a in args]) - url = '/'.join(payload) - return json.loads(_parse_validate_request(s.get(url), 'EVALSHA')) + payload = '/'.join(payload) + data = _format_request(None, 'EVALSHA', payload) + req = s.post(config['hostname'], data=data) + return json.loads(req.json()['EVALSHA']) return f From 186bab5a297f34c36e71831c0c1a9f0003234fe0 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 16 Dec 2022 15:47:59 -0800 Subject: [PATCH 04/20] address some warnings --- redbiom/admin.py | 2 +- redbiom/tests/test_fetch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/redbiom/admin.py b/redbiom/admin.py index 59db9c6..9b756f0 100644 --- a/redbiom/admin.py +++ b/redbiom/admin.py @@ -566,7 +566,7 @@ def load_sample_metadata(md, tag=None): # subset to only the novel IDs represented = get('metadata', 'SMEMBERS', 'samples-represented') - md = md.loc[set(md.index) - set(represented)] + md = md.loc[list(set(md.index) - set(represented))] if len(md) == 0: return 0 diff --git a/redbiom/tests/test_fetch.py b/redbiom/tests/test_fetch.py index d2942fe..e24f8d5 100644 --- a/redbiom/tests/test_fetch.py +++ b/redbiom/tests/test_fetch.py @@ -5,7 +5,7 @@ import numpy as np import biom import pandas as pd -import pandas.util.testing as pdt +import pandas.testing as pdt import redbiom.admin import redbiom.fetch From 3a81e65ac59788443af2bc3f61302123f90222ac Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 16 Dec 2022 16:23:08 -0800 Subject: [PATCH 05/20] API: qiita-study fetch, fixes #116 --- Makefile | 1 + redbiom/commands/fetch.py | 96 +++++++++++++++++++++++++++++++++++++++ test_external.sh | 29 ++++++++++++ 3 files changed, 126 insertions(+) create mode 100644 test_external.sh diff --git a/Makefile b/Makefile index d54b2eb..b52e4e2 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,7 @@ test: test_db /bin/bash test.sh nosetests /bin/bash test_failures.sh # this blows away the db + /bin/bash test_external.sh # relies on public redbiom instance test_bulk: test_db_bulk /bin/bash test.sh diff --git a/redbiom/commands/fetch.py b/redbiom/commands/fetch.py index be78942..1b1f6a5 100644 --- a/redbiom/commands/fetch.py +++ b/redbiom/commands/fetch.py @@ -238,6 +238,102 @@ def fetch_samples_from_samples(samples, from_, output, context, md5, _write_ambig(ambig, output) +@fetch.command(name='qiita-study') +@click.option('--study-id', type=int, required=True, help='The study to fetch') +@click.option('--context', required=True, type=str, default=None, + help="The context to fetch from.") +@click.option('--resolve-ambiguities', required=False, + type=click.Choice(['merge', 'most-reads']), default=None, + help=("Resolve ambiguities that may be present in the samples " + "which can arise from, for example, technical " + "replicates.")) +@click.option('--skip-taxonomy', is_flag=True, default=False, required=False, + help=("Do not resolve taxonomy on fetch. Setting this flag can " + "reduce the time required to complete a fetch")) +@click.option('--retain-artifact-id', is_flag=True, default=False, + required=False, + help=("If using --resolve-ambiguities=most-reads, set this flag " + "to retain the artifact ID of the sample kept")) +@click.option('--remove-blanks', is_flag=True, default=False, required=False, + help=("If True, remove samples with 'blank' in their name based " + "on case insensitive substring match")) +@click.option('--output-basename', type=str, required=True, + help='The output file basename to use.') +@click.option('--md5', required=False, type=bool, + help="Calculate and use MD5 for the features. This will also " + "save a tsv file with the original feature name and the md5", + default=False) +def qiita_study(study_id, context, resolve_ambiguities, skip_taxonomy, + retain_artifact_id, remove_blanks, output_basename, md5): + """Fetch sample data from a Qiita study.""" + if retain_artifact_id and resolve_ambiguities != 'merge-reads': + raise ValueError('--retain-artifact-id only impacts a merge-reads ' + 'ambiguity resolution') + import redbiom.search + query = "where qiita_study_id==%d" % study_id + samples = list(redbiom.search.metadata_full(query, categories=False)) + + import redbiom.fetch + table, ambig = redbiom.fetch.data_from_samples(context, samples, + skip_taxonomy=skip_taxonomy) + + if remove_blanks: + keep = {i for i in table.ids() if 'blank' not in i.lower()} + table = table.filter(keep).remove_empty() + ambig = {k: v for k, v in ambig.items() if k in keep} + + if md5: + table, new_ids = redbiom.util.convert_biom_ids_to_md5(table) + with open(output + '.tsv', 'w') as f: + f.write('\n'.join(['\t'.join(x) for x in new_ids.items()])) + + if resolve_ambiguities == 'merge': + table = redbiom.fetch._ambiguity_merge(table, ambig) + elif resolve_ambiguities == 'most-reads': + table = redbiom.fetch._ambiguity_keep_most_reads(table, ambig, + retain_artifact_id) + + import pandas as pd + md, map_ = redbiom.fetch.sample_metadata(samples, context=context, + common=False, tagged=False) + + md_resolve_ambiguities = resolve_ambiguities in ('merge', 'most-reads') + if md_resolve_ambiguities: + if resolve_ambiguities == 'most-reads' and retain_artifact_id: + pass + else: + md.set_index('#SampleID', inplace=True) + + # a temporary key to use when resolving ambiguities + # that will be removed before writing the metadata + key = "__@@AMBIGUITY@@__" + + # add ambiguity information into the frame + ambigs = pd.Series(map_) + ambigs = ambigs.loc[md.index] + md[key] = ambigs + + # remove duplicated unambiguous identifiers + md = md[~md[key].duplicated()] + + # remove our index, and replace the entries with the ambiguous names + md.reset_index(inplace=True) + md['#SampleID'] = md[key] + + # cleanup + md.drop(columns=key, inplace=True) + + md.set_index('#SampleID', inplace=True) + md = md.loc[list(table.ids())] + md.to_csv(output_basename + '.tsv', sep='\t', header=True, index=True, + encoding='utf-8') + + import h5py + with h5py.File(output_basename + '.biom', 'w') as fp: + table.to_hdf5(fp, 'redbiom') + _write_ambig(ambig, output_basename) + + def _write_ambig(map_, output): from collections import defaultdict ambig = defaultdict(list) diff --git a/test_external.sh b/test_external.sh new file mode 100644 index 0000000..ff12e00 --- /dev/null +++ b/test_external.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +set -x +set -e + +unset REDBIOM_HOST + +# WARNING: the context is not assured to be stable. It is possible +# that in the future, the context name will change and this test +# will fail. However, that is unlikely to happen anytime soon so +# let's do the simple thing of hard coding it. +redbiom fetch qiita-study \ + --study-id 2136 \ + --context Deblur_2021.09-Illumina-16S-V4-90nt-dd6875 \ + --output-basename 2136test \ + --resolve-ambiguities most-reads \ + --remove-blanks + +in_table=$(biom table-ids -i 2136test.biom | wc -l) +in_md=$(grep "^2136" 2136test.tsv | wc -l) +if [[ ${in_table} != ${in_md} ]]; +then + exit 1 +fi + +if [[ ! ${in_table} -eq 450 ]]; +then + exit 1 +fi From 2913dfe21bb9b7a801e2fa3b01a5ba0a8fde3ab7 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 16 Dec 2022 16:26:23 -0800 Subject: [PATCH 06/20] lint --- redbiom/commands/fetch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/redbiom/commands/fetch.py b/redbiom/commands/fetch.py index 1b1f6a5..b5b019d 100644 --- a/redbiom/commands/fetch.py +++ b/redbiom/commands/fetch.py @@ -284,7 +284,7 @@ def qiita_study(study_id, context, resolve_ambiguities, skip_taxonomy, if md5: table, new_ids = redbiom.util.convert_biom_ids_to_md5(table) - with open(output + '.tsv', 'w') as f: + with open(output_basename + 'md5mapping.tsv', 'w') as f: f.write('\n'.join(['\t'.join(x) for x in new_ids.items()])) if resolve_ambiguities == 'merge': @@ -297,8 +297,7 @@ def qiita_study(study_id, context, resolve_ambiguities, skip_taxonomy, md, map_ = redbiom.fetch.sample_metadata(samples, context=context, common=False, tagged=False) - md_resolve_ambiguities = resolve_ambiguities in ('merge', 'most-reads') - if md_resolve_ambiguities: + if resolve_ambiguities in ('merge', 'most-reads'): if resolve_ambiguities == 'most-reads' and retain_artifact_id: pass else: @@ -316,7 +315,8 @@ def qiita_study(study_id, context, resolve_ambiguities, skip_taxonomy, # remove duplicated unambiguous identifiers md = md[~md[key].duplicated()] - # remove our index, and replace the entries with the ambiguous names + # remove our index, and replace the entries with the ambiguous + # names md.reset_index(inplace=True) md['#SampleID'] = md[key] From 69c1240004d231bd5459414471d85d50539629b5 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 16 Dec 2022 16:40:00 -0800 Subject: [PATCH 07/20] PERF: use dok rather than lil, should fix #98 --- redbiom/fetch.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/redbiom/fetch.py b/redbiom/fetch.py index d96bce0..4859a56 100644 --- a/redbiom/fetch.py +++ b/redbiom/fetch.py @@ -379,9 +379,8 @@ def _biom_from_samples(context, samples, get=None, normalize_taxonomy=None, sample_ids = [id_ for id_, _ in table_data] # fill in the matrix - mat = ss.lil_matrix((len(unique_indices), len(table_data))) + mat = ss.dok_matrix((len(unique_indices), len(table_data))) for col, (sample, col_data) in enumerate(table_data): - # since this isn't dense, hopefully roworder doesn't hose us for obs_id, value in col_data.items(): mat[unique_indices_map[obs_id], col] = value From 634e8d61d50fb85ad3818f0efa3c12de334fd388 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 08:25:00 -0800 Subject: [PATCH 08/20] TST: for #117 --- test_external.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_external.sh b/test_external.sh index ff12e00..02d35e3 100644 --- a/test_external.sh +++ b/test_external.sh @@ -27,3 +27,6 @@ if [[ ! ${in_table} -eq 450 ]]; then exit 1 fi + +# see https://github.com/biocore/redbiom/issues/117 +redbiom fetch samples --context Deblur_2021.09-Illumina-16S-V4-150nt-ac8c0b --output samples.biom 11896.A2.raw From 9106b7f1c40b54f350740ee442045c229c57b73f Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 08:40:05 -0800 Subject: [PATCH 09/20] Fixes #43 --- redbiom/commands/summarize.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/redbiom/commands/summarize.py b/redbiom/commands/summarize.py index f473249..54ae1bc 100644 --- a/redbiom/commands/summarize.py +++ b/redbiom/commands/summarize.py @@ -205,6 +205,11 @@ def taxonomy(from_, context, normalize_ranks, features): lineages = redbiom.fetch.taxon_ancestors(context, ids, normalize=normalize_ranks) + if lineages is None: + import sys + click.echo("No taxonomy information found.") + sys.exit(0) + import skbio tree = skbio.TreeNode.from_taxonomy([(i, l) for i, l in zip(ids, lineages)]) From fefaaba361b4709e4ea64e62582b3de558fdafef Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 08:49:20 -0800 Subject: [PATCH 10/20] Fixes #44 --- redbiom/commands/search.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/redbiom/commands/search.py b/redbiom/commands/search.py index d3771eb..8f2c85a 100644 --- a/redbiom/commands/search.py +++ b/redbiom/commands/search.py @@ -117,8 +117,13 @@ def search_metadata(query, categories): $ redbiom search metadata --categories "ph - water" """ import redbiom.search - for i in redbiom.search.metadata_full(query, categories): - click.echo(i) + try: + for i in redbiom.search.metadata_full(query, categories): + click.echo(i) + except TypeError: + import sys + click.echo("The search query appears to be malformed. ") + sys.exit(1) @search.command(name='taxon') From 7d124dff6cdaf061eb11610d2787ab80aa72be11 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 09:21:03 -0800 Subject: [PATCH 11/20] PERF: short circuit on taxonomy, double buffer sizes, reduces qiita study fetch by > 50% --- redbiom/commands/summarize.py | 2 +- redbiom/fetch.py | 24 +++++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/redbiom/commands/summarize.py b/redbiom/commands/summarize.py index 54ae1bc..b6a99cd 100644 --- a/redbiom/commands/summarize.py +++ b/redbiom/commands/summarize.py @@ -205,7 +205,7 @@ def taxonomy(from_, context, normalize_ranks, features): lineages = redbiom.fetch.taxon_ancestors(context, ids, normalize=normalize_ranks) - if lineages is None: + if not lineages: import sys click.echo("No taxonomy information found.") sys.exit(0) diff --git a/redbiom/fetch.py b/redbiom/fetch.py index 4859a56..c4d93c8 100644 --- a/redbiom/fetch.py +++ b/redbiom/fetch.py @@ -198,7 +198,7 @@ def sample_metadata(samples, common=True, context=None, restrict_to=None, getter = redbiom._requests.buffered(list(ambig_assoc), 'categories', 'MGET', 'metadata', get=get, - buffer_size=100) + buffer_size=200) for samples, columns_by_sample in getter: all_samples.extend(samples) for column_set in columns_by_sample: @@ -379,7 +379,7 @@ def _biom_from_samples(context, samples, get=None, normalize_taxonomy=None, sample_ids = [id_ for id_, _ in table_data] # fill in the matrix - mat = ss.dok_matrix((len(unique_indices), len(table_data))) + mat = ss.lil_matrix((len(unique_indices), len(table_data))) for col, (sample, col_data) in enumerate(table_data): for obs_id, value in col_data.items(): mat[unique_indices_map[obs_id], col] = value @@ -431,15 +431,25 @@ def taxon_ancestors(context, ids, get=None, normalize=None): """ from future.moves.itertools import zip_longest import redbiom._requests + import re if get is None: import redbiom config = redbiom.get_config() get = redbiom._requests.make_get(config) + is_asv = re.compile(r"^[ATGC]{90}") + non_asvs = [] + for i in ids: + if not is_asv.match(i): + non_asvs.append(i) + ids = non_asvs + if not ids: + return None + hmgetter = redbiom._requests.buffered remapped_bulk = hmgetter(iter(ids), None, 'HMGET', context, - get=get, buffer_size=100, + get=get, buffer_size=200, multikey='feature-index') # map the feature identifier to an internal ID @@ -458,7 +468,7 @@ def taxon_ancestors(context, ids, get=None, normalize=None): key = 'taxonomy-parents' getter = hmgetter(iter(to_get), None, 'HMGET', context, get=get, - buffer_size=100, multikey=key) + buffer_size=200, multikey=key) new_to_get = set() for block in getter: @@ -538,7 +548,7 @@ def taxon_descendents(context, taxon, get=None): to_get = new_to_get remapped_bulk = hmgetter(to_keep, None, 'HMGET', context, - get=get, buffer_size=100, + get=get, buffer_size=200, multikey='feature-index-inverted') remapped = {name @@ -687,7 +697,7 @@ def metadata(where=None, tag=None, restrict_to=None): getter = redbiom._requests.buffered(samples, 'categories', 'MGET', 'metadata', get=get, - buffer_size=100) + buffer_size=200) samples_to_get = [] for chunk in getter: @@ -751,7 +761,7 @@ def get_sample_values(samples, category, get=None): getter = redbiom._requests.buffered(iter(samples), None, 'HMGET', 'metadata', get=get, - buffer_size=100, + buffer_size=200, multikey=key) return [item for chunk in getter for item in zip(*chunk)] From 9aefd47ec57503680c703b9c43850e2fd7ff73e4 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 09:23:13 -0800 Subject: [PATCH 12/20] Fixes #46 --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 7c6ab7c..ce2b836 100644 --- a/README.md +++ b/README.md @@ -358,7 +358,11 @@ Ambiguities can arise if the same sample was processed multiple times as might h ### Load some data (i.e., if you are running your own server) -To make use of this cache, we need to load things. Loading can be done in parallel. First, we'll load up metadata. This will create keys in Redis which describe all of the columns associated with a sample (e.g., `metadata:categories:`, hash buckets for each category and sample combination (e.g., `metadata:category:` as the hash and `` as the field), a set of all known categories (e.g., `metadata:categories-represented`), and a set of all known sample IDs (e.g., `metadata:samples-represented`): +To make use of this cache, we need to load things. Loading can be done in parallel. First, we need to set the server to be writable. + + $ redbiom admin scripts-writable + +Next, we'll load up metadata. This will create keys in Redis which describe all of the columns associated with a sample (e.g., `metadata:categories:`, hash buckets for each category and sample combination (e.g., `metadata:category:` as the hash and `` as the field), a set of all known categories (e.g., `metadata:categories-represented`), and a set of all known sample IDs (e.g., `metadata:samples-represented`): $ redbiom admin load-sample-metadata --metadata path/to/qiime/compat/mapping.txt @@ -366,10 +370,6 @@ redbiom supports one to many mappings between sample metadata and actual sample $ redbiom admin create-context --name deblur-100nt --description "16S V4 Caporaso et al data deblurred at 100nt" -Next, we'll load up associations between every single feature in a BIOM table to all the samples its found in. This will create Redis sets and can be accessed using keys of the form `:samples:`. Note that we specify the context we're loading into. - - $ redbiom admin load-features --context deblur-100nt --table /path/to/biom/table.biom - Last, let's load up all of the BIOM table data. We'll only store the non-zero values, and we'll encode the sample data into something simple so that it goes in as just a string to Redis. Important: we only support storing count data right now, not floating point. The keys created are of the form `:sample:`. To reduce space, we reindex the feature IDs as things like sOTUs tend to be very long in name. The mapping is stable over all tables loaded (ie the same feature has the same index), and is stored under `:feature-index`. Because we need to update the index, this operation cannot be done in parallel however the code is setup with a redis-based mutex so it's okay to queue up multiple loads. $ redbiom load-sample-data --context deblur-100nt --table /path/to/biom/table.biom From ff398b5f340e61cac0b6eedd5a0b11379ea1c884 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 09:25:22 -0800 Subject: [PATCH 13/20] Fixes #52 --- redbiom/admin.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/redbiom/admin.py b/redbiom/admin.py index 9b756f0..080a520 100644 --- a/redbiom/admin.py +++ b/redbiom/admin.py @@ -251,8 +251,13 @@ def create_context(name, description): config = redbiom.get_config() post = redbiom._requests.make_post(config) - post('state', 'HSET', "contexts/%s/%s" % (name, description)) - post(name, 'HSET', "state/db-version/%s" % redbiom.__db_version__) + try: + post('state', 'HSET', "contexts/%s/%s" % (name, description)) + post(name, 'HSET', "state/db-version/%s" % redbiom.__db_version__) + except: # noqa + import sys + print("Unable to create context: %s" % name, file=sys.stderr) + raise ScriptManager.load_scripts() From b8106029042dc51e11aca2679d5fed9399d12a05 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 09:44:24 -0800 Subject: [PATCH 14/20] Move skip of deblur taxonomy --- redbiom/commands/fetch.py | 30 ++++++++++++++++++------------ redbiom/fetch.py | 10 ---------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/redbiom/commands/fetch.py b/redbiom/commands/fetch.py index b5b019d..24d8829 100644 --- a/redbiom/commands/fetch.py +++ b/redbiom/commands/fetch.py @@ -140,9 +140,10 @@ def fetch_sample_metadata(from_, samples, all_columns, context, output, help=("Resolve ambiguities that may be present in the samples " "which can arise from, for example, technical " "replicates.")) -@click.option('--skip-taxonomy', is_flag=True, default=False, required=False, - help=("Do not resolve taxonomy on fetch. Setting this flag can " - "reduce the time required to complete a fetch")) +@click.option('--fetch-taxonomy', is_flag=True, default=False, required=False, + help=("Resolve taxonomy on fetch. Setting this flag increases " + "the time required to complete a fetch. Note that Deblur " + "contexts do not cache taxonomy.")) @click.option('--retain-artifact-id', is_flag=True, default=False, required=False, help=("If using --resolve-ambiguities=most-reads, set this flag " @@ -150,7 +151,7 @@ def fetch_sample_metadata(from_, samples, all_columns, context, output, @click.argument('features', nargs=-1) def fetch_samples_from_obserations(features, exact, from_, output, context, md5, resolve_ambiguities, - skip_taxonomy, retain_artifact_id): + fetch_taxonomy, retain_artifact_id): """Fetch sample data containing features.""" if retain_artifact_id and resolve_ambiguities != 'merge-reads': raise ValueError('--retain-artifact-id only impacts a merge-reads ' @@ -160,6 +161,7 @@ def fetch_samples_from_obserations(features, exact, from_, output, iterable = redbiom.util.from_or_nargs(from_, features) import redbiom.fetch + skip_taxonomy = not fetch_taxonomy tab, map_ = redbiom.fetch.data_from_features(context, iterable, exact, skip_taxonomy=skip_taxonomy) @@ -198,16 +200,17 @@ def fetch_samples_from_obserations(features, exact, from_, output, help=("Resolve ambiguities that may be present in the samples " "which can arise from, for example, technical " "replicates.")) -@click.option('--skip-taxonomy', is_flag=True, default=False, required=False, - help=("Do not resolve taxonomy on fetch. Setting this flag can " - "reduce the time required to complete a fetch")) +@click.option('--fetch-taxonomy', is_flag=True, default=False, required=False, + help=("Resolve taxonomy on fetch. Setting this flag increases " + "the time required to complete a fetch. Note that Deblur " + "contexts do not cache taxonomy.")) @click.option('--retain-artifact-id', is_flag=True, default=False, required=False, help=("If using --resolve-ambiguities=most-reads, set this flag " "to retain the artifact ID of the sample kept")) @click.argument('samples', nargs=-1) def fetch_samples_from_samples(samples, from_, output, context, md5, - resolve_ambiguities, skip_taxonomy, + resolve_ambiguities, fetch_taxonomy, retain_artifact_id): """Fetch sample data.""" if retain_artifact_id and resolve_ambiguities != 'merge-reads': @@ -218,6 +221,7 @@ def fetch_samples_from_samples(samples, from_, output, context, md5, iterable = redbiom.util.from_or_nargs(from_, samples) import redbiom.fetch + skip_taxonomy = not fetch_taxonomy table, ambig = redbiom.fetch.data_from_samples(context, iterable, skip_taxonomy=skip_taxonomy) @@ -247,9 +251,10 @@ def fetch_samples_from_samples(samples, from_, output, context, md5, help=("Resolve ambiguities that may be present in the samples " "which can arise from, for example, technical " "replicates.")) -@click.option('--skip-taxonomy', is_flag=True, default=False, required=False, - help=("Do not resolve taxonomy on fetch. Setting this flag can " - "reduce the time required to complete a fetch")) +@click.option('--fetch-taxonomy', is_flag=True, default=False, required=False, + help=("Resolve taxonomy on fetch. Setting this flag increases " + "the time required to complete a fetch. Note that Deblur " + "contexts do not cache taxonomy.")) @click.option('--retain-artifact-id', is_flag=True, default=False, required=False, help=("If using --resolve-ambiguities=most-reads, set this flag " @@ -263,7 +268,7 @@ def fetch_samples_from_samples(samples, from_, output, context, md5, help="Calculate and use MD5 for the features. This will also " "save a tsv file with the original feature name and the md5", default=False) -def qiita_study(study_id, context, resolve_ambiguities, skip_taxonomy, +def qiita_study(study_id, context, resolve_ambiguities, fetch_taxonomy, retain_artifact_id, remove_blanks, output_basename, md5): """Fetch sample data from a Qiita study.""" if retain_artifact_id and resolve_ambiguities != 'merge-reads': @@ -274,6 +279,7 @@ def qiita_study(study_id, context, resolve_ambiguities, skip_taxonomy, samples = list(redbiom.search.metadata_full(query, categories=False)) import redbiom.fetch + skip_taxonomy = not fetch_taxonomy table, ambig = redbiom.fetch.data_from_samples(context, samples, skip_taxonomy=skip_taxonomy) diff --git a/redbiom/fetch.py b/redbiom/fetch.py index c4d93c8..023a75c 100644 --- a/redbiom/fetch.py +++ b/redbiom/fetch.py @@ -431,22 +431,12 @@ def taxon_ancestors(context, ids, get=None, normalize=None): """ from future.moves.itertools import zip_longest import redbiom._requests - import re if get is None: import redbiom config = redbiom.get_config() get = redbiom._requests.make_get(config) - is_asv = re.compile(r"^[ATGC]{90}") - non_asvs = [] - for i in ids: - if not is_asv.match(i): - non_asvs.append(i) - ids = non_asvs - if not ids: - return None - hmgetter = redbiom._requests.buffered remapped_bulk = hmgetter(iter(ids), None, 'HMGET', context, get=get, buffer_size=200, From dac58d95c24e02135f63ec46b6aecf34dfb6e90d Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 09:57:43 -0800 Subject: [PATCH 15/20] Fixes #50 --- redbiom/commands/search.py | 7 ++++--- redbiom/search.py | 9 ++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/redbiom/commands/search.py b/redbiom/commands/search.py index 8f2c85a..e8086d9 100644 --- a/redbiom/commands/search.py +++ b/redbiom/commands/search.py @@ -117,12 +117,13 @@ def search_metadata(query, categories): $ redbiom search metadata --categories "ph - water" """ import redbiom.search + import sys + try: for i in redbiom.search.metadata_full(query, categories): click.echo(i) - except TypeError: - import sys - click.echo("The search query appears to be malformed. ") + except (TypeError, SyntaxError): + click.echo("The search query appears to be malformed.") sys.exit(1) diff --git a/redbiom/search.py b/redbiom/search.py index 5adfb1e..18ab9fe 100644 --- a/redbiom/search.py +++ b/redbiom/search.py @@ -93,10 +93,17 @@ def query_plan(query): return [('where', part)] parts = query.split('where', 1) - for part in parts: + for i, part in enumerate(parts): if not part: raise ValueError('No query') + parts[i] = parts[i].strip() + + if ' ' in parts[0]: + import sys + print("Set queries cannot contain spaces.", file=sys.stderr) + raise SyntaxError + if len(parts) == 1: return [('set', parts[0].strip())] else: From ef152fe8ea086771db15f366b7a82621ca6f6cfa Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 10:18:19 -0800 Subject: [PATCH 16/20] Fixes #41 --- redbiom/tests/test_where_expr.py | 2 ++ redbiom/where_expr.py | 14 ++++++++++++-- test.sh | 6 ++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/redbiom/tests/test_where_expr.py b/redbiom/tests/test_where_expr.py index 62d6f99..af90339 100644 --- a/redbiom/tests/test_where_expr.py +++ b/redbiom/tests/test_where_expr.py @@ -52,6 +52,8 @@ def test_whereeval(self): ("(age >= 5) <= 15", {'D', 'C'}), ("sex == 'male'", {'D', }), ("sex in ('male', 'female')", {'A', 'B', 'D'}), + ("'male' in sex", {'D', }), + ("'male' not in sex", {'A', 'B', 'C'}), ("sex is 'male' or age < 11", {'D', 'A', 'C'}), ("(age <= 10) != 8 and sex is 'male'", {'D', }), ("(age <= 10) != 8 or sex is 'male'", {'D', 'A', 'C'}), diff --git a/redbiom/where_expr.py b/redbiom/where_expr.py index 9dabac3..db90d1c 100644 --- a/redbiom/where_expr.py +++ b/redbiom/where_expr.py @@ -99,11 +99,21 @@ def NotEq(): def _in(left, right): - return left[left.isin(right)] + if isinstance(right, tuple): + # some_category in foo + return left[left.isin(right)] + else: + # foo in some_category + return right[right.isin([left])] def _notin(left, right): - return left[~left.isin(right)] + if isinstance(right, tuple): + # some_category not in foo + return left[~left.isin(right)] + else: + # foo not in some_category + return right[~right.isin([left])] def Or(): diff --git a/test.sh b/test.sh index eb6332a..8e9752a 100644 --- a/test.sh +++ b/test.sh @@ -226,3 +226,9 @@ redbiom search samples --context test 10317.000003302 | sort - > obs_sample_sear redbiom search samples --context test UNTAGGED_10317.000003302 | sort - > obs_sample_search_rbid.txt md5test obs_sample_search.txt exp_sample_search.txt md5test obs_sample_search_rbid.txt exp_sample_search.txt + +if [[ $(redbiom search metadata "where 'feces' in BODY_SITE" | wc -l | awk '{ print $1 }') != "9" ]]; +then + echo "fail" + exit 1 +fi From e57b15e2e7fa1c6bcc762561784ed5d85c09e813 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 10:25:27 -0800 Subject: [PATCH 17/20] Be exact --- redbiom/where_expr.py | 3 +++ test.sh | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/redbiom/where_expr.py b/redbiom/where_expr.py index db90d1c..a9ee2a8 100644 --- a/redbiom/where_expr.py +++ b/redbiom/where_expr.py @@ -104,6 +104,9 @@ def _in(left, right): return left[left.isin(right)] else: # foo in some_category + if len(right) == 0: + return pd.Series([]) + return right[right.isin([left])] diff --git a/test.sh b/test.sh index 8e9752a..372fdaa 100644 --- a/test.sh +++ b/test.sh @@ -227,7 +227,7 @@ redbiom search samples --context test UNTAGGED_10317.000003302 | sort - > obs_sa md5test obs_sample_search.txt exp_sample_search.txt md5test obs_sample_search_rbid.txt exp_sample_search.txt -if [[ $(redbiom search metadata "where 'feces' in BODY_SITE" | wc -l | awk '{ print $1 }') != "9" ]]; +if [[ $(redbiom search metadata "where 'UBERON:feces' in BODY_SITE" | wc -l | awk '{ print $1 }') != "11" ]]; then echo "fail" exit 1 From b60137a9ae30e2f93d1c5d976166a67fb4549b2f Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 10:52:40 -0800 Subject: [PATCH 18/20] query plan can have spaces --- redbiom/search.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/redbiom/search.py b/redbiom/search.py index 18ab9fe..5d41808 100644 --- a/redbiom/search.py +++ b/redbiom/search.py @@ -99,11 +99,6 @@ def query_plan(query): parts[i] = parts[i].strip() - if ' ' in parts[0]: - import sys - print("Set queries cannot contain spaces.", file=sys.stderr) - raise SyntaxError - if len(parts) == 1: return [('set', parts[0].strip())] else: From b974f6566f8883aa64b741bdd79b89ce05e14ed0 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 11:44:45 -0800 Subject: [PATCH 19/20] BLD: remove support for py36, test on py310 and 311 --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1423bfc..98573a7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,7 @@ jobs: matrix: # based roughly on https://github.com/conda-incubator/setup-miniconda#example-1-basic-usage # via biocore/empress - python-version: ["3.6", "3.7", "3.8", "3.9"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] steps: - name: Check out repository code From 1889c6d7db8e250a74b4f01fac53f8b9a2379ee6 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 19 Dec 2022 11:46:57 -0800 Subject: [PATCH 20/20] BLD: py311 seems a nogo --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 98573a7..6f5e750 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,7 @@ jobs: matrix: # based roughly on https://github.com/conda-incubator/setup-miniconda#example-1-basic-usage # via biocore/empress - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.7", "3.8", "3.9", "3.10"] steps: - name: Check out repository code