From e4c552c1d817316b47fe9e537c5fdbbc8fa52e99 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Sat, 20 Nov 2021 16:23:13 -0700 Subject: [PATCH 1/8] updated installation guide --- CHANGELOG.md | 8 +++++++- README.md | 6 ++++-- doc/database.md | 1 + doc/install.md | 6 +++--- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2efe1e1..ff0bb93 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,22 @@ Change Log ========== -## Version 2.0b3 (12/27/2020) +## Version 2.0b3 (ongoing) ### Added - Predicted HGT list now includes potential donors. Users can optionally specify a taxonomic rank at which they will be reported. - A quick-start guide added to the homepage. +- Let the database construction task resume from an interrupted task and use already downloaded files, while detecting and skipping corrupt files. +- Added NCBI-defined type materials as a new optional criterion for genome sampling during database construction. ### Changed - Repository transferred from [DittmarLab](https://github.com/DittmarLab) to [qiyunlab](https://github.com/qiyunlab). +- Updated recommended dependency versions, however the program should continue to be compatible with previous versions. - Minor tweaks with no visible impact on program behavior. +### Fixed +- Fixed an issue with the NCBI FTP server connection during database construction. + ## Version 2.0b2 (5/3/2020) diff --git a/README.md b/README.md index b3959b8..813ac37 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ References Set up a Conda environment and install dependencies: ```bash -conda create -n hgtector python=3 pyyaml pandas matplotlib scikit-learn bioconda::diamond +conda create -n hgtector -c conda-forge python=3 pyyaml pandas matplotlib scikit-learn bioconda::diamond conda activate hgtector ``` @@ -40,6 +40,8 @@ Install HGTector2: pip install git+https://github.com/qiyunlab/HGTector.git ``` +Then you will be able to type `hgtector` to run the program. Here are more details of [installation](doc/install.md). + Build a reference database using the default protocol: ```bash @@ -69,7 +71,7 @@ It is recommended that you read the [first run](doc/1strun.md), [second run](doc ## License -Copyright (c) 2013-2020, [Qiyun Zhu](mailto:qiyunzhu@gmail.com) and [Katharina Dittmar](mailto:katharinad@gmail.com). Licensed under [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause). See full license [statement](LICENSE). +Copyright (c) 2013-2021, [Qiyun Zhu](mailto:qiyunzhu@gmail.com) and [Katharina Dittmar](mailto:katharinad@gmail.com). Licensed under [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause). See full license [statement](LICENSE). ## Citation diff --git a/doc/database.md b/doc/database.md index 628bcc3..00e5926 100644 --- a/doc/database.md +++ b/doc/database.md @@ -185,6 +185,7 @@ Option | Default | Description `--complete` | - | Only include complete genomes, i.e., `assembly_level` is `Complete Genome` or `Chromosome`. `--reference` | - | Add NCBI-defined reference genomes to selection (after taxon sampling). `--representative` | - | Add NCBI-defined representative genomes to selection (after taxon sampling). +`--typematerial` | - | Add NCBI-defined type material genomes to selection (after taxon sampling). ### Taxonomic filter diff --git a/doc/install.md b/doc/install.md index a841a9a..1265ec2 100644 --- a/doc/install.md +++ b/doc/install.md @@ -18,7 +18,7 @@ HGTector is written in Python 3. One needs at least Python 3.6 to run the progra ### Option 1: Through Conda (recommended) ```bash -conda create -n hgtector python=3 pyyaml pandas matplotlib scikit-learn +conda create -n hgtector -c conda-forge python=3 pyyaml pandas matplotlib scikit-learn conda activate hgtector pip install git+https://github.com/qiyunlab/HGTector.git ``` @@ -80,8 +80,8 @@ conda env remove -n hgtector --all ## Compatibility -If in the future some dependencies have changes that are not compatible with the current release of HGTector, the following "safe" command can be used to install the current versions of dependencies (note: DIAMOND version is too tricky to specify). +If in the future some dependencies have changes that are not compatible with the current release of HGTector, the following "safe" command can be used to install the current versions of dependencies. ```bash -conda create -n hgtector -c bioconda python=3.7.7 pyyaml=5.3.1 pandas=1.0.3 matplotlib=3.1.3 scikit-learn=0.22.1 diamond blast=2.9.0 +conda create -n hgtector-dev -c conda-forge python=3.10.0 pyyaml=6.0 pandas=1.3.4 matplotlib=3.5.0 scikit-learn=1.0.1 bioconda::diamond=2.0.13 bioconda::blast=2.12.0 ``` From 32097746ae72202084b70860a41b3403c6627ff7 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Sat, 20 Nov 2021 16:41:18 -0700 Subject: [PATCH 2/8] shortened database flags --- doc/database.md | 8 ++++---- hgtector/database.py | 30 ++++++++++++++++++------------ 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/doc/database.md b/doc/database.md index 00e5926..b2a89c1 100644 --- a/doc/database.md +++ b/doc/database.md @@ -20,7 +20,7 @@ hgtector database -o --default This will download all protein sequences of NCBI RefSeq genomes of bacteria, archaea, fungi and protozoa, keep one genome per species, plus all NCBI-defined reference and representative genomes. Finally it will attempt to compile the database using DIAMOND, if available. The command is equivalent to: ```bash -hgtector database --output --cats microbe --sample 1 --rank species --reference --representative --compile diamond +hgtector database --output --cats microbe --sample 1 --rank species --reference --represent --compile diamond ``` A pre-built default database as of 2019-10-21 is available for [download](https://www.dropbox.com/s/qdnfgzdcjadlm4i/hgtdb_20191021.tar.xz?dl=0). It needs to be [compiled](#Manual-compiling) using choice of aligner. @@ -183,9 +183,9 @@ Option | Default | Description --- | --- | --- `--genbank` | - | By default the program only downloads RefSeq genomes (`GCF`). This flag will let the program also download GenBank genomes (`GCA`). But RefSeq has higher priority than GenBank if the same genome is hosted by both catalogs. `--complete` | - | Only include complete genomes, i.e., `assembly_level` is `Complete Genome` or `Chromosome`. -`--reference` | - | Add NCBI-defined reference genomes to selection (after taxon sampling). -`--representative` | - | Add NCBI-defined representative genomes to selection (after taxon sampling). -`--typematerial` | - | Add NCBI-defined type material genomes to selection (after taxon sampling). +`--reference` | - | Include NCBI-defined reference genomes. +`--represent` | - | Include NCBI-defined representative genomes. +`--typemater` | - | Include NCBI-defined type material genomes. ### Taxonomic filter diff --git a/hgtector/database.py b/hgtector/database.py index bd274a3..76e3a0d 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -63,10 +63,10 @@ {'action': 'store_true'}], ['--reference', 'include NCBI-defined reference genomes', {'action': 'store_true'}], - ['--representative', 'include NCBI-defined representative genomes', - {'action': 'store_true'}], - ['--typematerial', 'include NCBI-defined type material genomes', - {'action': 'store_true'}], + ['--represent', 'include NCBI-defined representative genomes', + {'action': 'store_true'}], + ['--typemater', 'include NCBI-defined type material genomes', + {'action': 'store_true'}], 'taxonomic filter', ['--capital', 'organism name must be capitalized', @@ -215,7 +215,7 @@ def set_parameters(self, args): self.sample = 1 self.rank = 'species' self.reference = True - self.representative = True + self.represent = True if self.diamond is None: self.diamond = 'diamond' @@ -499,15 +499,21 @@ def sample_by_taxonomy(self): if not selected: raise ValueError(f'No genome is classified at rank "{self.rank}".') - # add reference / representative genomes - for key in 'reference', 'representative': - if getattr(self, key): - print(f'Add {key} genomes to selection.') - selected.update(self.df.query( - f'refseq_category == "{key} genome"')['genome'].tolist()) + # add reference genomes + if self.reference: + print('Add reference genomes to selection.') + selected.update(self.df.query( + 'refseq_category == "reference genome"')['genome'].tolist()) + + # add representative genomes + if self.represent: + print('Add representative genomes to selection.') + selected.update(self.df.query( + 'refseq_category == "representative genome"')[ + 'genome'].tolist()) # add type material genomes - if self.typematerial: + if self.typemater: print('Add type material genomes to selection.') selected.update(self.df[self.df[ 'relation_to_type_material'].notna()]['genome'].tolist()) From 5a0b25dd3566bdbdb847539326a1c8c964e1dcbb Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Sat, 20 Nov 2021 18:10:28 -0700 Subject: [PATCH 3/8] taxonomic sampling at ranks above --- hgtector/database.py | 100 +++++++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 36 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 76e3a0d..bac09f9 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -52,8 +52,11 @@ 'taxon sampling', ['-s|--sample', 'sample up to this number of genomes per taxonomic ' 'group at given rank (0 for all)', {'type': int}], - ['-r|--rank', 'taxonomic rank at which subsampling will be performed', + ['-r|--rank', 'taxonomic rank at which sampling will be performed', {'default': 'species'}], + ['--above', 'sampling will also be performed on ranks from the ' + 'designated one to phylum', + {'action': 'store_true'}], 'genome sampling', ['--genbank', 'also search GenBank if a genome is not found in RefSeq; ' @@ -129,6 +132,9 @@ def __call__(self, args): # filter genomes self.filter_genomes() + # sort genomes by quality + self.sort_genomes() + # identify taxonomy of genomes self.identify_taxonomy() @@ -404,6 +410,27 @@ def report_diff(msg): report_diff('Dropped {} genomes.') print('Done.') + def sort_genomes(self): + """Sort genomes by quality as informed by metadata. + """ + # sort by reference > representative > type material > other + self.df['rc_seq'] = self.df.apply( + lambda x: 0 if x['refseq_category'] == 'reference genome' + else (1 if x['refseq_category'] == 'representative genome' + else (2 if pd.notnull(x['relation_to_type_material']) + else 3)), axis=1) + + # sort by complete > scaffold > contig + self.df['al_seq'] = self.df['assembly_level'].map( + {'Chromosome': 0, 'Complete Genome': 0, 'Scaffold': 1, + 'Contig': 2}) + + # sort genomes by three criteria + self.df.sort_values(by=['rc_seq', 'al_seq', 'genome'], inplace=True) + + # clean up temporary columns + self.df.drop(columns=['al_seq', 'rc_seq'], inplace=True) + def identify_taxonomy(self): """Identify taxonomy of genomes. """ @@ -468,7 +495,7 @@ def report_diff(msg): def sample_by_taxonomy(self): """Taxonomy-based sampling. """ - if not self.sample: + if self.sample is None: return print('Sampling genomes based on taxonomy...') print(f'Up to {self.sample} genome(s) will be sampled per {self.rank}' @@ -479,44 +506,48 @@ def sample_by_taxonomy(self): self.df[self.rank] = self.df['taxid'].apply( taxid_at_rank, rank=self.rank, taxdump=self.taxdump) - # sort by reference > representative > type material > other - self.df['rc_seq'] = self.df.apply( - lambda x: 0 if x['refseq_category'] == 'reference genome' - else (1 if x['refseq_category'] == 'representative genome' - else (2 if pd.notnull(x['relation_to_type_material']) - else 3)), axis=1) - - # sort by complete > scaffold > contig - self.df['al_seq'] = self.df['assembly_level'].map( - {'Chromosome': 0, 'Complete Genome': 0, 'Scaffold': 1, - 'Contig': 2}) - - # sort genomes by three criteria - self.df.sort_values(by=['rc_seq', 'al_seq', 'genome'], inplace=True) - # select up to given number of genomes of each taxonomic group - selected = set(self.df.groupby(self.rank).head(self.sample)['genome']) - if not selected: - raise ValueError(f'No genome is classified at rank "{self.rank}".') + selected = set(self.df.dropna(subset=[self.rank]).groupby( + self.rank).head(self.sample)['genome']) + print(f'Sampled {len(selected)} genomes at the {self.rank} rank.') + + # sample at ranks above the current one + ranks = ['phylum', 'class', 'order', 'family', 'genus', 'species'] + if self.above: + if self.rank not in ranks: + raise ValueError( + f'Cannot determine taxonomic ranks above "{self.rank}", ' + 'because it is not among the six standard ranks: ' + f'{", ".join(ranks)}.') + ranks = ranks[:ranks.index(self.rank)][::-1] + print(f'Sampling will also be performed on: {", ".join(ranks)}.') + for rank in ranks: + self.df['temp_rank_'] = self.df['taxid'].apply( + taxid_at_rank, rank=rank, taxdump=self.taxdump) + df_ = self.df.dropna(subset=['temp_rank_']).groupby( + 'temp_rank_').head(self.sample) + selected.update(df_['genome']) + print(f'Sampled a total of {len(selected)} genomes at {self.rank} ' + 'and above.') + self.df.drop(columns=['temp_rank_'], inplace=True) # add reference genomes if self.reference: - print('Add reference genomes to selection.') - selected.update(self.df.query( - 'refseq_category == "reference genome"')['genome'].tolist()) + df_ = self.df.query('refseq_category == "reference genome"') + print(f'Include {df_.shape[0]} reference genomes.') + selected.update(df_['genome']) # add representative genomes if self.represent: - print('Add representative genomes to selection.') - selected.update(self.df.query( - 'refseq_category == "representative genome"')[ - 'genome'].tolist()) + df_ = self.df.query('refseq_category == "representative genome"') + print(f'Include {df_.shape[0]} representative genomes.') + selected.update(df_['genome']) # add type material genomes if self.typemater: - print('Add type material genomes to selection.') - selected.update(self.df[self.df[ - 'relation_to_type_material'].notna()]['genome'].tolist()) + df_ = self.df[self.df['relation_to_type_material'].notna()] + print(f'Include {df_.shape[0]} type material genomes.') + selected.update(df_['genome']) # filter genomes to selected self.df.query('genome in @selected', inplace=True) @@ -525,15 +556,12 @@ def sample_by_taxonomy(self): raise ValueError('No genome is retained after sampling.') print(f'Total number of sampled genomes: {n}.') - # sort by genome ID - self.df.sort_values('genome', inplace=True) - - # clean up temporary columns - self.df.drop(columns=['al_seq', 'rc_seq'], inplace=True) - def download_genomes(self): """Download genomes from NCBI. """ + # sort by genome ID + self.df.sort_values('genome', inplace=True) + # reconnect to avoid server timeout problem self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout) self.ftp.login() From 8a7d81b943c1c5fece44d1f3e5cb287cfe58c9df Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Sat, 20 Nov 2021 18:27:37 -0700 Subject: [PATCH 4/8] fixed taxdump download dunno why --- hgtector/database.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hgtector/database.py b/hgtector/database.py index bac09f9..66ed9c9 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -260,7 +260,8 @@ def retrieve_taxdump(self): if not self.check_local_file(local_file, self.overwrite): print('Downloading NCBI taxonomy database...', end='', flush=True) with open(local_file, 'wb') as f: - self.ftp.retrbinary('RETR ' + remote_file, f.write) + self.ftp.retrbinary( + 'RETR ' + remote_file, f.write, blocksize=65536) print(' done.') # read taxdump From 98e960f0d3740929160a2ddcb90a7e7c8d51a86c Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Tue, 23 Nov 2021 12:24:43 -0700 Subject: [PATCH 5/8] updated database --- CHANGELOG.md | 10 +- README.md | 4 +- doc/2ndrun.md | 2 +- doc/database.md | 65 +++++-- doc/install.md | 2 +- hgtector/database.py | 326 +++++++++++++++++--------------- hgtector/tests/test_database.py | 6 +- hgtector/tests/test_util.py | 10 +- hgtector/util.py | 27 +++ 9 files changed, 274 insertions(+), 178 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff0bb93..9289fbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,21 +1,25 @@ Change Log ========== -## Version 2.0b3 (ongoing) +## Version 2.0b3 (2021, ongoing) ### Added - Predicted HGT list now includes potential donors. Users can optionally specify a taxonomic rank at which they will be reported. - A quick-start guide added to the homepage. - Let the database construction task resume from an interrupted task and use already downloaded files, while detecting and skipping corrupt files. -- Added NCBI-defined type materials as a new optional criterion for genome sampling during database construction. +- Added an option `--above` to sample genomes at all ranks from the designated one to phylum during database construction. +- Added an option `--typemater` to include NCBI-defined type materials in genome sampling during database construction. +- Added an option `--manual` to export URLs of sampled genomes during database construction, and let the user download them manually. ### Changed +- Updated pre-built default database to 2021-11-21 (after NCBI RefSeq 209) - Repository transferred from [DittmarLab](https://github.com/DittmarLab) to [qiyunlab](https://github.com/qiyunlab). - Updated recommended dependency versions, however the program should continue to be compatible with previous versions. - Minor tweaks with no visible impact on program behavior. ### Fixed -- Fixed an issue with the NCBI FTP server connection during database construction. +- Fixed an issue with the NCBI FTP server connection during database construction. NCBI now recommends rsync over ftp. Therefore the protocol has been updated accordingly. +- Fixed compatibility with latest DIAMOND (2.0.13). ## Version 2.0b2 (5/3/2020) diff --git a/README.md b/README.md index 813ac37..b539d1b 100644 --- a/README.md +++ b/README.md @@ -42,13 +42,13 @@ pip install git+https://github.com/qiyunlab/HGTector.git Then you will be able to type `hgtector` to run the program. Here are more details of [installation](doc/install.md). -Build a reference database using the default protocol: +Build a reference [database](doc/database.md) using the default protocol: ```bash hgtector database -o db_dir --default ``` -This will retrieve the latest genomic data from NCBI. If this does not work (e.g., due to network issues), or you need some customization, please read the [database](doc/database.md) page. +Or [download](https://www.dropbox.com/s/tszxy9etp52id3u/hgtdb_20211121.tar.xz?dl=0) a pre-built database as of 2021-11-21, and [compile](doc/database.md#Manual-compiling) it. Prepare input file(s). They should be multi-Fasta files of amino acid sequences (faa). Each file represents the whole protein set of a complete or partial genome. diff --git a/doc/2ndrun.md b/doc/2ndrun.md index d87148a..13ce66f 100644 --- a/doc/2ndrun.md +++ b/doc/2ndrun.md @@ -37,7 +37,7 @@ Here we will analyze a single genome of [_Escherichia coli_ O55:H7](https://www. Download the whole protein set of _E. coli_ O55:H7 (5,278 protein sequences in total) from the NCBI server: ```bash -wget -O o55h7.faa.gz ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/025/165/GCF_000025165.1_ASM2516v1/GCF_000025165.1_ASM2516v1_protein.faa.gz +wget -O o55h7.faa.gz https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/025/165/GCF_000025165.1_ASM2516v1/GCF_000025165.1_ASM2516v1_protein.faa.gz ``` You don't need to unzip the file, as HGTector can automatically parse compressed files in the format of gzip, bzip2 and lzma. diff --git a/doc/database.md b/doc/database.md index b2a89c1..e631e62 100644 --- a/doc/database.md +++ b/doc/database.md @@ -9,7 +9,8 @@ The `database` command is an automated workflow for sampling reference genomes, hgtector database -o ``` -### Default protocol + +## Default protocol HGTector provides a default protocol for database building. @@ -17,25 +18,25 @@ HGTector provides a default protocol for database building. hgtector database -o --default ``` -This will download all protein sequences of NCBI RefSeq genomes of bacteria, archaea, fungi and protozoa, keep one genome per species, plus all NCBI-defined reference and representative genomes. Finally it will attempt to compile the database using DIAMOND, if available. The command is equivalent to: +This will download all protein sequences of NCBI RefSeq genomes of **bacteria**, **archaea**, **fungi** and **protozoa**, keep _one genome per species_ that has a Latinate name, plus one genome per taxonomic group at higher ranks, regardless whether that genome has a Latinate species name, plus all NCBI-defined **reference**, **representative** and **type material** genomes (prioritized during taxonomy-based sampling, and added afterwards if not sampled). Finally it will attempt to compile the database using DIAMOND, if available. The command is equivalent to: ```bash -hgtector database --output --cats microbe --sample 1 --rank species --reference --represent --compile diamond +hgtector database --output --cats microbe --sample 1 --rank species_latin --above --reference --represent --typemater --compile diamond ``` -A pre-built default database as of 2019-10-21 is available for [download](https://www.dropbox.com/s/qdnfgzdcjadlm4i/hgtdb_20191021.tar.xz?dl=0). It needs to be [compiled](#Manual-compiling) using choice of aligner. -### Procedures +## Pre-built database -The workflow consists of the following steps: +A database built using the default protocol on 2021-11-21 is available for [download](https://www.dropbox.com/s/tszxy9etp52id3u/hgtdb_20211121.tar.xz?dl=0) \([MD5](https://www.dropbox.com/s/kdopz946pk088wr/hgtdb_20211121.tar.xz.md5?dl=0)\). It needs to be [compiled](#Manual-compiling) using choice of aligner. -1. Download NCBI taxonomy database (taxdump). -2. Download NCBI RefSeq assembly summary. -3. Sample genomes based on various properties and taxonomic information. -4. Download protein sequences associated with sampled genomes. -5. Compile local databases using DIAMOND and/or BLAST. +This database, sampled from NCBI RefSeq after release, 209 contains 68,977,351 unique protein sequences from 21,754 microbial genomes, representing 3 domains, 74 phyla, 145 classes, 337 orders, 783 families, 3,753 genera and 15,932 species. + +Building this database used a maximum of 63 GB memory. Searching this database using DIAMOND v2.0.13 requires ~7 GB memory. -### Database files +A previous version of the database built on 2019-10-21 is available [here](https://www.dropbox.com/s/qdnfgzdcjadlm4i/hgtdb_20191021.tar.xz?dl=0). + + +## Database files File or directory | Description --- | --- @@ -60,6 +61,20 @@ The protein-to-TaxID map is already integrated into the compiled databases, so o Feel free to delete (e.g., `download/`) or compress the intermediate files (e.g., `db.faa`) to save disk space. + +## Procedures + +The workflow consists of the following steps: + +1. Download NCBI taxonomy database (taxdump). +2. Download NCBI RefSeq assembly summary. +3. Sample genomes based on various properties and taxonomic information. +4. Download protein sequences associated with sampled genomes. +5. Compile local databases using DIAMOND and/or BLAST. + + +## Considerations + ### More examples ```bash @@ -80,12 +95,31 @@ hgtector database -g gids.txt -o . This will only download genomes specified in the file `gids.txt`. Useful for controlled tests. +### Clean up + +After the database is successfully built, you may consider compressing `db.faa` and deleting `download/` (or just `download/faa/`) to save disk space. HGTector won't do this automatically. + ### Break and resume Should any of the download steps be interrupted by e.g., a network failure, one can resume the downloading process by re-executing the same command. The program will skip the already downloaded files in this new run. In some instances, one may need to manually remove the last file from the failed run (because that file may be corrupt), before re-running the program. If one wants to overwrite downloaded files (e.g., upgrading), add `--overwrite` to the command. +### Manual downloading + +One may want to download genomes manually in a more controled manner, instead of letting HGTector running for hours to days to retrieve them one after another before moving to the next step. In this case, add `--manual` to the command, and the program will generate `urls.txt`, a list of URLs of the sampled genomes, and quit. + +Then one can choose the most appropriate method to download them. For example, one may use the [rsync protocol](https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/#protocols), as recommended by NCBI: + +```bash +while read url +do + rsync -Ltq rsync${url#https}/${url##*/}_protein.faa.gz download/faa/ +done < urls.txt +``` + +After all genomes (protein sequences) are downloaded to `download/faa/`, one may restart the program without `--manual`, and the program will take the downloaded files and move to the next step. + ### Manual compiling The sampling & downloading steps (1-4) require extensive network traffics (usually several hours, if the bottleneck is not on the recipient side) but little local computing load; whereas the compling step (5) requires extensive local computing power, but no networking. @@ -95,12 +129,11 @@ Therefore, it is a reasonable plan to only download database files without compi For DIAMOND: ```bash -echo $'accession\taccession.version\ttaxid' > prot.accession2taxid -zcat taxon.map.gz | awk -v OFS='\t' '{split($1, a, "."); print a[1], $1, $2}' >> prot.accession2taxid +echo $'accession.version\ttaxid' | cat - <(zcat taxon.map.gz) > prot.accession2taxid.FULL -diamond makedb --threads 16 --in db.faa --taxonmap prot.accession2taxid --taxonnodes taxdump/nodes.dmp --taxonnames taxdump/names.dmp --db diamond/db +diamond makedb --threads 16 --in db.faa --taxonmap prot.accession2taxid.FULL --taxonnodes taxdump/nodes.dmp --taxonnames taxdump/names.dmp --db diamond/db -rm prot.accession2taxid +rm prot.accession2taxid.FULL ``` For BLAST: diff --git a/doc/install.md b/doc/install.md index 1265ec2..3921c4e 100644 --- a/doc/install.md +++ b/doc/install.md @@ -49,7 +49,7 @@ conda install -c bioconda diamond blast HGTector has a command `database` for automated database construction. It defaults to the **NCBI** RefSeq microbial genomes and taxonomy. Meanwhile, we also provide instructions for using **GTDB** and custom databases. See [details](database.md). -A standard database built using the default protocol on 2019-10-21 is available for [download](https://www.dropbox.com/s/qdnfgzdcjadlm4i/hgtdb_20191021.tar.xz?dl=0), together with [instruction](database.md#Manual-compiling) for compiling. +A standard database built using the default protocol on 2021-11-21 is available for [download](https://www.dropbox.com/s/tszxy9etp52id3u/hgtdb_20211121.tar.xz?dl=0) \([MD5](https://www.dropbox.com/s/kdopz946pk088wr/hgtdb_20211121.tar.xz.md5?dl=0)\), together with [instruction](database.md#Manual-compiling) for compiling. A small, pre-compiled test database is also available for [download](https://www.dropbox.com/s/46v3uc708rvc5rc/ref107.tar.xz?dl=0). diff --git a/hgtector/database.py b/hgtector/database.py index 66ed9c9..911398b 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -8,11 +8,11 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +import sys from os import remove, makedirs, cpu_count from os.path import join, isfile, isdir, getsize, basename from shutil import which, rmtree from time import sleep -import ftplib import gzip import tarfile from tempfile import mkdtemp @@ -22,7 +22,7 @@ from hgtector.util import ( timestamp, load_configs, get_config, arg2bool, run_command, list_from_param, read_taxdump, contain_words, is_latin, is_capital, - taxid_at_rank, taxids_at_ranks, is_ancestral, find_lca) + taxid_at_rank, taxids_at_ranks, is_ancestral, find_lca, rank_plural) description = """build reference protein sequence and taxonomy database""" @@ -80,6 +80,10 @@ {'choices': ['yes', 'no']}], 'download', + ['--manual', 'export URLs of sampled genomes without downloading them' + ', so the user may download them manually, then resume ' + 'the database pipeline.', {'action': 'store_true'}], + ['--overwrite', 'overwrite existing files with newly downloaded ones; ' 'othewise use existing files whenever available, i.e., ' 'resume an interrupted run', {'action': 'store_true'}], @@ -98,6 +102,8 @@ ['--tmpdir', 'temporary directory for diamond makedb'] ] +server = 'rsync://ftp.ncbi.nlm.nih.gov' + class Database(object): @@ -114,12 +120,6 @@ def __call__(self, args): # read and validate arguments self.set_parameters(args) - # connect to NCBI FTP server - self.connect_server() - - # create target directory - makedirs(self.output, exist_ok=True) - # retrieve taxonomy database self.retrieve_taxdump() @@ -136,11 +136,17 @@ def __call__(self, args): self.sort_genomes() # identify taxonomy of genomes - self.identify_taxonomy() + self.filter_by_taxonomy() # sample genomes by taxonomy self.sample_by_taxonomy() + # sample genomes by quality + self.sample_by_quality() + + # filter genomes to sampled one + self.filter_to_sampled() + # download genomes self.download_genomes() @@ -239,39 +245,32 @@ def set_parameters(self, args): if self.threads is None: self.threads = 1 + # create target directories makedirs(self.output, exist_ok=True) - - def connect_server(self): - """Connect to the NCBI FTP server. - """ - print('Connecting to the NCBI FTP server...', end='', flush=True) - makedirs(join(self.output, 'download'), exist_ok=True) - self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout) - self.ftp.login() - print(' done.') + self.down = join(self.output, 'download') + makedirs(self.down, exist_ok=True) def retrieve_taxdump(self): - """Retrieve NCBI taxdump.""" + """Retrieve NCBI taxdump. + """ fname = 'taxdump.tar.gz' - remote_file = f'/pub/taxonomy/{fname}' - local_file = join(self.output, 'download', fname) + rfile = f'pub/taxonomy/{fname}' + lfile = join(self.down, fname) # download taxdump - if not self.check_local_file(local_file, self.overwrite): + if not self.check_local_file(lfile, self.overwrite): print('Downloading NCBI taxonomy database...', end='', flush=True) - with open(local_file, 'wb') as f: - self.ftp.retrbinary( - 'RETR ' + remote_file, f.write, blocksize=65536) + run_command(f'rsync -Ltq {server}/{rfile} {self.down}') print(' done.') # read taxdump print('Reading NCBI taxonomy database...', end='', flush=True) - with tarfile.open(local_file, 'r:gz') as f: + with tarfile.open(lfile, 'r:gz') as f: f.extract('names.dmp', self.tmpdir) f.extract('nodes.dmp', self.tmpdir) self.taxdump = read_taxdump(self.tmpdir) print(' done.') - print(f'Total number of TaxIDs: {len(self.taxdump)}.') + print(f' Total number of TaxIDs: {len(self.taxdump)}.') def retrieve_summary(self, genbank=False): """Retrieve genome assembly summary. @@ -280,27 +279,26 @@ def retrieve_summary(self, genbank=False): def get_summary(target): key = target.lower() fname = f'assembly_summary_{key}.txt' - remote_file = f'/genomes/{key}/{fname}' - local_file = join(self.output, 'download', fname) + rfile = f'genomes/{key}/{fname}' + lfile = join(self.down, fname) # download summary - if not self.check_local_file(local_file, self.overwrite): + if not self.check_local_file(lfile, self.overwrite): print(f'Downloading {target} assembly summary...', end='', flush=True) - with open(local_file, 'wb') as f: - self.ftp.retrbinary('RETR ' + remote_file, f.write) + run_command(f'rsync -Ltq {server}/{rfile} {self.down}') print(' done.') # read summary print(f'Reading {target} assembly summary...', end='', flush=True) - df = pd.read_table(local_file, skiprows=1, low_memory=False) + df = pd.read_table(lfile, skiprows=1, low_memory=False) print(' done.') return df self.df = get_summary('RefSeq') if self.genbank: self.df = pd.concat([self.df, get_summary('GenBank')]) - print(f'Total number of genomes: {self.df.shape[0]}.') + print(f' Total number of genomes: {self.df.shape[0]}.') def retrieve_categories(self): """Retrieve genome categories. @@ -313,44 +311,45 @@ def retrieve_categories(self): self.cats = ['archaea', 'bacteria', 'fungi', 'protozoa'] else: self.cats = cats.split(',') - print(f'Genome categories: {", ".join(self.cats)}') + print(f'Genome categories: {", ".join(self.cats)}') def get_categories(target): key = target.lower() # validate categories - self.ftp.cwd(f'/genomes/{key}') - dirs = [x[0] for x in self.ftp.mlsd() if x[1]['type'] == 'dir'] + ec, out = run_command( + f'rsync --list-only --no-motd {server}/genomes/{key}/') + cats = [line.rsplit(None, 1)[-1] for line in out if + line.startswith('d') and not line.endswith('.')] for cat in self.cats: - if cat not in dirs: + if cat not in cats: raise ValueError( f'"{cat}" is not a valid {target} genome category.') # get genome list per category print(f'Downloading genome list per {target} category...') - dir_ = join(self.output, 'download', 'cats') - makedirs(dir_, exist_ok=True) + makedirs(join(self.down, 'cats'), exist_ok=True) + ldir = join(self.down, 'cats') fname = 'assembly_summary.txt' - asms = [] - file_ = join(self.tmpdir, 'tmp.txt') + file_ = join(self.tmpdir, fname) + asms = [] for cat in self.cats: - file = join(dir_, f'{key}_{cat}.txt') - islocal = self.check_local_file(file) + lfile = join(ldir, f'{key}_{cat}.txt') # use local file - if islocal: - with open(file, 'r') as f: + if self.check_local_file(lfile): + with open(lfile, 'r') as f: asms_ = f.read().splitlines() # download remote file else: - with open(file_, 'wb') as f: - self.ftp.retrbinary(f'RETR {cat}/{fname}', f.write) + rfile = f'genomes/{key}/{cat}/{fname}' + run_command(f'rsync -Ltq {server}/{rfile} {self.tmpdir}') with open(file_, 'r') as f: asms_ = [x.split('\t', 1)[0] for x in f.read(). splitlines() if not x.startswith('#')] - with open(file, 'w') as f: + with open(lfile, 'w') as f: f.write(''.join([x + '\n' for x in asms_])) print(f' {cat}: {len(asms_)}') @@ -361,16 +360,12 @@ def get_categories(target): print('Done.') return asms + # filter genomes by category asmset = set(get_categories('RefSeq')) if self.genbank: asmset.update(get_categories('GenBank')) - - # filter genomes by category self.df = self.df[self.df['# assembly_accession'].isin(asmset)] - print(f'Total number of genomes in categories: {self.df.shape[0]}.') - - # close and reconnect later to avoid some problems - self.ftp.close() + print(f' Total number of genomes in categories: {self.df.shape[0]}.') def filter_genomes(self): """Filter genomes based on genome metadata. @@ -409,6 +404,10 @@ def report_diff(msg): self.df['genome'].isin(self.genoids)) != self.exclude] report_diff('Dropped {} genomes.') + + # genomes without download link + self.df.query('ftp_path != "na"', inplace=True) + report_diff('Dropped {} genomes without download link.') print('Done.') def sort_genomes(self): @@ -432,10 +431,10 @@ def sort_genomes(self): # clean up temporary columns self.df.drop(columns=['al_seq', 'rc_seq'], inplace=True) - def identify_taxonomy(self): + def filter_by_taxonomy(self): """Identify taxonomy of genomes. """ - print('Identifying taxonomy of genomes...') + print('Filtering genomes by taxonomy...') n = self.df.shape[0] def report_diff(msg): @@ -476,9 +475,10 @@ def report_diff(msg): report_diff('Dropped {} genomes without valid species taxId.') # drop genomes without Latinate species name + self.df['latin'] = self.df['species'].apply( + lambda x: is_latin(self.taxdump[x]['name'])) if self.latin: - self.df = self.df[self.df['species'].apply( - lambda x: is_latin(self.taxdump[x]['name']))] + self.df.query('latin', inplace=True) report_diff('Dropped {} genomes without Latinate species name.') print('Done.') @@ -494,130 +494,148 @@ def report_diff(msg): report_diff('Dropped {} genomes.') def sample_by_taxonomy(self): - """Taxonomy-based sampling. + """Sample genomes at designated taxonomic rank. """ - if self.sample is None: + self.selected, n = set(), 0 + rank, sample, latin = self.rank, self.sample, False + if sample is None: return print('Sampling genomes based on taxonomy...') - print(f'Up to {self.sample} genome(s) will be sampled per {self.rank}' - '.') + + # Latinate species names + if rank == 'species_latin': + rank, latin = 'species', True + print(f'Up to {sample} genome(s) will be sampled per {rank}' + + (' (Latinate names only) ' if latin else '') + '.') # assign genomes to given rank - if self.rank not in self.df.columns: - self.df[self.rank] = self.df['taxid'].apply( - taxid_at_rank, rank=self.rank, taxdump=self.taxdump) + if rank not in self.df.columns: + self.df[rank] = self.df['taxid'].apply( + taxid_at_rank, rank=rank, taxdump=self.taxdump) + df_ = self.df.dropna(subset=[rank]) + + # keep Latinate species names only + if latin: + df_ = df_.query('latin') # select up to given number of genomes of each taxonomic group - selected = set(self.df.dropna(subset=[self.rank]).groupby( - self.rank).head(self.sample)['genome']) - print(f'Sampled {len(selected)} genomes at the {self.rank} rank.') + self.selected = set(df_.groupby(rank).head(sample)['genome']) + n = df_[rank].unique().shape[0] + print(f' Sampled {len(self.selected)} genomes from {n} ' + f'{rank_plural(rank)}.') # sample at ranks above the current one - ranks = ['phylum', 'class', 'order', 'family', 'genus', 'species'] - if self.above: - if self.rank not in ranks: - raise ValueError( - f'Cannot determine taxonomic ranks above "{self.rank}", ' - 'because it is not among the six standard ranks: ' - f'{", ".join(ranks)}.') - ranks = ranks[:ranks.index(self.rank)][::-1] - print(f'Sampling will also be performed on: {", ".join(ranks)}.') - for rank in ranks: - self.df['temp_rank_'] = self.df['taxid'].apply( - taxid_at_rank, rank=rank, taxdump=self.taxdump) - df_ = self.df.dropna(subset=['temp_rank_']).groupby( - 'temp_rank_').head(self.sample) - selected.update(df_['genome']) - print(f'Sampled a total of {len(selected)} genomes at {self.rank} ' - 'and above.') - self.df.drop(columns=['temp_rank_'], inplace=True) + if not self.above: + return + # determine ranks to sample at + ranks = ['phylum', 'class', 'order', 'family', 'genus', 'species'] + if rank not in ranks: + raise ValueError( + f'Cannot determine taxonomic ranks above "{rank}", because it ' + 'is not among the six standard ranks: {", ".join(ranks)}.') + ranks = ranks[:ranks.index(rank)][::-1] + print(f'Sampling will also be performed on: {", ".join(ranks)}.') + + for r in ranks: + self.df['tmp_rank'] = self.df['taxid'].apply( + taxid_at_rank, rank=r, taxdump=self.taxdump) + df_ = self.df.dropna(subset=['tmp_rank']) + + # calculate number of genomes yet to be sampled per group + goal = pd.Series(sample, df_['tmp_rank'].unique()) + done = df_.query('genome in @self.selected')[ + 'tmp_rank'].value_counts() + left = goal.subtract(done, fill_value=0) + left = left[left > 0].astype(int) + + # sample specific number of genomes per group + s = [] + for tid, num in left.items(): + s.extend(df_.query(f'tmp_rank == "{tid}"').head(num)['genome']) + print(f' Sampled {len(s)} more genomes from {left.shape[0]} ' + f'{rank_plural(r)}.') + self.selected.update(s) + + print(f' Sampled a total of {len(self.selected)} genomes at ' + f'{rank} and above.') + self.df.drop(columns=['tmp_rank'], inplace=True) + + def sample_by_quality(self): + """Sample genomes according to quality categories. + """ # add reference genomes if self.reference: df_ = self.df.query('refseq_category == "reference genome"') print(f'Include {df_.shape[0]} reference genomes.') - selected.update(df_['genome']) + self.selected.update(df_['genome']) # add representative genomes if self.represent: df_ = self.df.query('refseq_category == "representative genome"') print(f'Include {df_.shape[0]} representative genomes.') - selected.update(df_['genome']) + self.selected.update(df_['genome']) # add type material genomes if self.typemater: df_ = self.df[self.df['relation_to_type_material'].notna()] print(f'Include {df_.shape[0]} type material genomes.') - selected.update(df_['genome']) + self.selected.update(df_['genome']) - # filter genomes to selected - self.df.query('genome in @selected', inplace=True) - n = self.df.shape[0] + def filter_to_sampled(self): + """Filter genomes to sampled ones. + """ + n = len(self.selected) if n == 0: raise ValueError('No genome is retained after sampling.') + self.df.query('genome in @self.selected', inplace=True) + self.df.sort_values('genome', inplace=True) print(f'Total number of sampled genomes: {n}.') def download_genomes(self): """Download genomes from NCBI. """ - # sort by genome ID - self.df.sort_values('genome', inplace=True) - - # reconnect to avoid server timeout problem - self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout) - self.ftp.login() - self.ftp.cwd('/genomes/all') + if self.manual: + fname = 'urls.txt' + self.df['ftp_path'].to_csv( + join(self.output, 'urls.txt'), header=False, index=False) + print(f'URLs of sampled genomes written to {fname}. You may ' + 'manually download their protein sequence data to faa/, ' + 'then restart the database building pipeline.') + sys.exit(0) print('Downloading non-redundant genomic data from NCBI...', flush=True) - dir_ = join(self.output, 'download', 'faa') - makedirs(dir_, exist_ok=True) - stems = {} - failures = [] + ldir = join(self.down, 'faa') + makedirs(ldir, exist_ok=True) + failed = [] for row in self.df.itertuples(): g = row.genome - remote_dir = row.ftp_path.split('/', 5)[-1] - stem = remote_dir.rsplit('/', 1)[-1] - stems[g] = stem + rdir = row.ftp_path.split('/', 3)[-1] + stem = rdir.rsplit('/', 1)[-1] fname = f'{stem}_protein.faa.gz' - file = join(dir_, fname) - if self.check_local_file(file): - success = True - else: - success = False - for i in range(self.retries): - try: - with open(file, 'wb') as f: - self.ftp.retrbinary( - f'RETR {remote_dir}/{fname}', f.write) - print(' ' + g, flush=True) - success = True - except ftplib.error_perm as resp: - if str(resp).split()[0] == '550': - break - except ftplib.error_temp: - sleep(self.delay) - continue - except EOFError: - sleep(self.delay) - self.ftp = ftplib.FTP( - 'ftp.ncbi.nlm.nih.gov', timeout=self.timeout) - self.ftp.login() - self.ftp.cwd('/genomes/all') - continue - else: - break - if not success: + lfile = join(ldir, fname) + if self.check_local_file(lfile): + continue + for i in range(self.retries): + ec, _ = run_command( + f'rsync -Ltq {server}/{rdir}/{fname} {ldir}') + if ec == 0: + print(' ' + g, flush=True) + break + else: + sleep(self.delay) + if ec > 0: print(f'WARNING: Cannot retrieve {fname}.') - failures.append(g) + failed.append(g) print('Done.') # drop genomes that cannot be retrieved - if len(failures): + if len(failed): print('Failed to retrieve the following genomes:') - print(' ' + ', '.join(failures)) - failures = set(failures) - self.df = self.df[~self.df['genome'].isin(failures)] + print(' ' + ', '.join(failed)) + failed = set(failed) + self.df.query('genome in @failures', inplace=True) def extract_genomes(self): """Extract proteins from genomes. @@ -628,22 +646,29 @@ def extract_genomes(self): Write protein to genomes(s) map to genome.map.gz. """ print('Extracting downloaded genomic data...', end='', flush=True) - dir_ = join(self.output, 'download', 'faa') + ldir = join(self.down, 'faa') prots = {} - cp = None + cp = None # current protein accession fout = open(join(self.output, 'db.faa'), 'w') def write_prot(): - if cp: + # some protein accessions may be duplicated + try: fout.write(f'>{cp} {prots[cp]["name"]}\n{prots[cp]["seq"]}\n') + except KeyError: + return + else: + prots[cp]['aa'] = len(prots[cp]['seq']) + del prots[cp]['name'] + del prots[cp]['seq'] g2n, g2aa = {}, {} for row in self.df.itertuples(): g, tid = row.genome, row.taxid g2n[g], g2aa[g] = 0, 0 stem = row.ftp_path.rsplit('/', 1)[-1] - fp = join(dir_, f'{stem}_protein.faa.gz') - with gzip.open(fp, 'rb') as f: + lfile = join(ldir, f'{stem}_protein.faa.gz') + with gzip.open(lfile, 'rb') as f: try: content = f.read().decode().splitlines() except (OSError, EOFError, TypeError): @@ -652,7 +677,6 @@ def write_prot(): continue cp = None for line in content: - line = line.rstrip('\r\n') if line.startswith('>'): write_prot() p, name = line[1:].split(None, 1) @@ -661,7 +685,7 @@ def write_prot(): cp = None prots[p]['gs'].add(g) prots[p]['tids'].add(tid) - g2aa[g] += len(prots[p]['seq']) + g2aa[g] += prots[p]['aa'] else: cp = p prots[p] = { @@ -680,11 +704,11 @@ def write_prot(): self.p2tids = {k: v['tids'] for k, v in prots.items()} # report summary - print(f'Total number of genomes extracted: {len(g2n)}.') - print(f'Total number of unique proteins extracted: {len(prots)}.') - print('Number of proteins shared by multiple genomes: {}.'.format( + print(f' Total number of genomes extracted: {len(g2n)}.') + print(f' Total number of unique proteins extracted: {len(prots)}.') + print(' Number of proteins shared by multiple genomes: {}.'.format( sum([1 for k, v in prots.items() if len(v['gs']) > 1]))) - print('Number of proteins shared by multiple TaxIDs: {}.'.format( + print(' Number of proteins shared by multiple TaxIDs: {}.'.format( sum([1 for k, v in prots.items() if len(v['tids']) > 1]))) # write protein to genomes(s) map diff --git a/hgtector/tests/test_database.py b/hgtector/tests/test_database.py index 9642b75..5e5ffac 100644 --- a/hgtector/tests/test_database.py +++ b/hgtector/tests/test_database.py @@ -232,8 +232,8 @@ def test_genome_metadata(self): 'infraspecific_name': '', 'isolate': '', 'taxid': '12345', - 'ftp_path': ('ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/123/' - '456/GCF_000123456.1_ASM123v1'), + 'ftp_path': ('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/' + '123/456/GCF_000123456.1_ASM123v1'), 'proteins': 100, 'residues': 12500, 'whatever': 'nonsense'}).to_frame().T @@ -247,7 +247,7 @@ def test_genome_metadata(self): exp = ('G1', '100', '12500', 'Chromosome', 'GCF_000123456.1', 'PRJNA123456', 'SAMN00123456', 'ASM123v1', 'hypothetical organism', '', '', '12345', - ('ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/123/456/' + ('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/123/456/' 'GCF_000123456.1_ASM123v1')) self.assertEqual(obs[1], '\t'.join(exp)) remove(join(self.tmpdir, 'genomes.tsv')) diff --git a/hgtector/tests/test_util.py b/hgtector/tests/test_util.py index fd43e32..872ef81 100644 --- a/hgtector/tests/test_util.py +++ b/hgtector/tests/test_util.py @@ -22,7 +22,7 @@ read_fasta, read_input_prots, contain_words, get_product, seqid2accver, taxids_at_ranks, find_lca, get_lineage, sort_by_hierarchy, refine_taxdump, _get_taxon, add_children, get_descendants, is_latin, is_capital, - taxdump_from_text) + taxdump_from_text, rank_plural) class UtilTests(TestCase): @@ -572,6 +572,14 @@ def test_taxdump_from_text(self): obs['2157'], {'name': 'Archaea', 'parent': '131567', 'rank': 'superkingdom'}) + def test_rank_plural(self): + self.assertEqual(rank_plural('subspecies'), 'subspecies') + self.assertEqual(rank_plural('superphylum'), 'superphyla') + self.assertEqual(rank_plural('infraorder'), 'infraorders') + self.assertEqual(rank_plural('class'), 'classes') + self.assertEqual(rank_plural('family'), 'families') + self.assertEqual(rank_plural('genus'), 'genera') + """Constants""" diff --git a/hgtector/util.py b/hgtector/util.py index dedcd51..abbc4be 100644 --- a/hgtector/util.py +++ b/hgtector/util.py @@ -945,3 +945,30 @@ def taxdump_from_text(text): x = line.split(',') res[x[0]] = {'name': x[1], 'parent': x[2], 'rank': x[3]} return res + + +def rank_plural(rank): + """Convert taxonomic rank to plural form. + + Parameters + ---------- + rank : str + taxonomic rank + + Returns + ------- + str + taxonomic rank in plural form + """ + if rank.endswith('phylum'): + return rank[:-2] + 'a' + elif rank.endswith('class'): + return rank + 'es' + elif rank.endswith('family'): + return rank[:-1] + 'ies' + elif rank.endswith('genus'): + return rank[:-2] + 'era' + elif rank.endswith('species'): + return rank + else: + return rank + 's' From 7af4e426c50cc18c38576d1acc01d619f444d506 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Tue, 23 Nov 2021 13:27:13 -0700 Subject: [PATCH 6/8] fixed scikit-learn compatibility --- CHANGELOG.md | 1 + hgtector/analyze.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9289fbc..a4c4440 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Change Log ### Fixed - Fixed an issue with the NCBI FTP server connection during database construction. NCBI now recommends rsync over ftp. Therefore the protocol has been updated accordingly. +- Fixed compatibility with latest scikit-learn (1.0.1). - Fixed compatibility with latest DIAMOND (2.0.13). diff --git a/hgtector/analyze.py b/hgtector/analyze.py index 4660955..b6922b6 100644 --- a/hgtector/analyze.py +++ b/hgtector/analyze.py @@ -887,9 +887,8 @@ def perform_kde(self, data): https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/ """ bw = self.bandwidth - data_ = data[:, np.newaxis] scaler = StandardScaler() - data_ = scaler.fit_transform(data[:, np.newaxis]) + data_ = scaler.fit_transform(data.reshape(-1, 1)) estimator = KernelDensity(kernel='gaussian') # grid search optimization @@ -914,8 +913,8 @@ def perform_kde(self, data): # get density function x, y = self.density_func(data_, kde) - x = scaler.inverse_transform(x) - y = scaler.inverse_transform(y) + x = scaler.inverse_transform(x.reshape(-1, 1)).reshape(-1) + y = scaler.inverse_transform(y.reshape(-1, 1)).reshape(-1) return x, y, bw @staticmethod @@ -1127,7 +1126,7 @@ def smart_kde(self, group): """ data = self.df[group].values scaler = StandardScaler() - data_ = scaler.fit_transform(data[:, np.newaxis]) + data_ = scaler.fit_transform(data.reshape(-1, 1)) estimator = KernelDensity(kernel='gaussian') bwspace = np.logspace(0, -1, self.bw_steps) @@ -1137,8 +1136,8 @@ def smart_kde(self, group): setattr(estimator, 'bandwidth', bw) kde = estimator.fit(data_) x, y = self.density_func(data_, kde) - x = scaler.inverse_transform(x) - y = scaler.inverse_transform(y) + x = scaler.inverse_transform(x.reshape(-1, 1)).reshape(-1) + y = scaler.inverse_transform(y.reshape(-1, 1)).reshape(-1) try: peak, valley = self.first_hill(x, y) except ValueError: @@ -1177,7 +1176,8 @@ def calc_cluster_props(self): # calculate centroid clf = NearestCentroid() clf.fit(data_, self.df['hgt']) - cent = scaler.inverse_transform(clf.centroids_[1]) + cent = scaler.inverse_transform(clf.centroids_[1].reshape( + -1, data_.shape[1])).reshape(-1) return cent def refine_cluster(self, cent): From e6cc0a634346acce48bf8cec855dade84797129f Mon Sep 17 00:00:00 2001 From: drz Date: Thu, 25 Nov 2021 09:51:00 -0700 Subject: [PATCH 7/8] updated database tests and docs --- doc/database.md | 33 +++++------- hgtector/database.py | 2 +- hgtector/tests/test_database.py | 96 ++++++++++++++++++++++----------- 3 files changed, 80 insertions(+), 51 deletions(-) diff --git a/doc/database.md b/doc/database.md index e631e62..738b1fb 100644 --- a/doc/database.md +++ b/doc/database.md @@ -1,6 +1,9 @@ Database ======== +## Index +- [Overview](#overview) | [Default protocol](#default-protocol) | [Pre-built database](#pre-built-database) | [Database files](#database-files) | [Considerations](#considerations) | [Command-line reference](#command-line-reference) + ## Overview The `database` command is an automated workflow for sampling reference genomes, downloading non-redundant protein sequences, and building local databases for sequence homology search. It provides various options for flexible customization of the database, to address specific research goals including HGT prediction or other general purposes. @@ -9,15 +12,17 @@ The `database` command is an automated workflow for sampling reference genomes, hgtector database -o ``` +The workflow consists of the following steps: -## Default protocol - -HGTector provides a default protocol for database building. +1. Download NCBI taxonomy database (taxdump). +2. Download NCBI RefSeq assembly summary. +3. Sample genomes based on various properties and taxonomic information. +4. Download protein sequences associated with sampled genomes. +5. Compile local databases using DIAMOND and/or BLAST. -```bash -hgtector database -o --default -``` +## Default protocol +Database files This will download all protein sequences of NCBI RefSeq genomes of **bacteria**, **archaea**, **fungi** and **protozoa**, keep _one genome per species_ that has a Latinate name, plus one genome per taxonomic group at higher ranks, regardless whether that genome has a Latinate species name, plus all NCBI-defined **reference**, **representative** and **type material** genomes (prioritized during taxonomy-based sampling, and added afterwards if not sampled). Finally it will attempt to compile the database using DIAMOND, if available. The command is equivalent to: ```bash @@ -62,17 +67,6 @@ The protein-to-TaxID map is already integrated into the compiled databases, so o Feel free to delete (e.g., `download/`) or compress the intermediate files (e.g., `db.faa`) to save disk space. -## Procedures - -The workflow consists of the following steps: - -1. Download NCBI taxonomy database (taxdump). -2. Download NCBI RefSeq assembly summary. -3. Sample genomes based on various properties and taxonomic information. -4. Download protein sequences associated with sampled genomes. -5. Compile local databases using DIAMOND and/or BLAST. - - ## Considerations ### More examples @@ -207,8 +201,9 @@ Option | Default | Description Option | Default | Description --- | --- | --- -`-s`, `--sample` | 0 | Sample up to this number of genomes per taxonomic group at the given rank. "0" is for all (disable sampling). -`-r`, `--rank` | species | Taxonomic rank at which subsampling will be performed. +`-s`, `--sample` | 0 | Sample up to this number of genomes per taxonomic group at the given rank. "0" is for all (disable sampling). Prior to sampling, genomes will be sorted by NCBI genome category: reference > representative > type material, then by assembly level: complete genome or chromosome > scaffolds > contigs. Sampling will start from the top of the list. +`-r`, `--rank` | species | Taxonomic rank at which subsampling will be performed. Can be any taxonomic rank defined in the NCBI taxonomy database. A special case is "species_latin", which will sample from species that have Latinate names. +`--above` | - | Sampling will also be performed on ranks from the one given by `-r` to phylum (low to high). They will not overlap the already sampled ones. For example, if two _E. coli_ genomes are already sampled, no more genome will be added when sampling in genus _Escherichia_. This flag is useful in the case of `-r species_latin`, because some ranks above species may be undersampled. ### Genome sampling diff --git a/hgtector/database.py b/hgtector/database.py index 911398b..2520fb6 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -432,7 +432,7 @@ def sort_genomes(self): self.df.drop(columns=['al_seq', 'rc_seq'], inplace=True) def filter_by_taxonomy(self): - """Identify taxonomy of genomes. + """Filter genomes by taxonomy. """ print('Filtering genomes by taxonomy...') n = self.df.shape[0] diff --git a/hgtector/tests/test_database.py b/hgtector/tests/test_database.py index 5e5ffac..9843fff 100644 --- a/hgtector/tests/test_database.py +++ b/hgtector/tests/test_database.py @@ -59,12 +59,12 @@ def test_retrieve_categories(self): def test_filter_genomes(self): me = Database() - header = ('# assembly_accession', 'assembly_level') - data = (('GCF_000000001.1', 'Chromosome'), - ('GCF_000000002.1', 'Complete Genome'), - ('GCF_000000003.2', 'Scaffold'), - ('GCF_000000004.1', 'Contig'), - ('GCA_000000004.1', 'Contig')) + header = ('# assembly_accession', 'assembly_level', 'ftp_path') + data = (('GCF_000000001.1', 'Chromosome', ''), + ('GCF_000000002.1', 'Complete Genome', ''), + ('GCF_000000003.2', 'Scaffold', ''), + ('GCF_000000004.1', 'Contig', ''), + ('GCA_000000004.1', 'Contig', '')) df = pd.DataFrame(data, columns=header) me.complete = False me.genoids = None @@ -102,7 +102,27 @@ def test_filter_genomes(self): self.assertListEqual(me.df['accession'].tolist(), [ 'GCF_000000001.1', 'GCF_000000003.2']) - def test_identify_taxonomy(self): + def test_sort_genomes(self): + me = Database() + header = ( + 'genome', 'taxid', 'refseq_category', 'assembly_level', + 'relation_to_type_material') + data = ( + ('G1', '585056', '', 'Chromosome', None), # E. coli UMN026 + ('G2', '1038927', 'representative genome', 'Chromosome', None), + # E. coli O104:H4 + ('G3', '2580236', '', 'Contig', None), # sync E. coli + ('G4', '622', '', 'Scaffold', 'yes'), # Shigella + ('G5', '548', '', 'Scaffold', None), # Klebsiella + ('G6', '126792', 'reference genome', 'Contig', None)) # plasmid + df = pd.DataFrame(data, columns=header) + me.taxdump = taxdump_from_text(taxdump_proteo) + me.df = df.copy() + me.sort_genomes() + self.assertListEqual(me.df['genome'].tolist(), [ + 'G6', 'G2', 'G4', 'G1', 'G5', 'G3']) + + def test_filter_by_taxonomy(self): me = Database() header = ('organism_name', 'taxid', 'species', 'species_taxid') data = (('Escherichia coli UMN026', '585056', 'E. coli', '562'), @@ -120,7 +140,7 @@ def test_identify_taxonomy(self): me.exclude = False me.taxdump = taxdump_from_text(taxdump_proteo) me.df = df.copy() - me.identify_taxonomy() + me.filter_by_taxonomy() self.assertNotIn('species_taxid', me.df.columns) self.assertListEqual(me.df.index.tolist(), [0, 1, 2]) self.assertListEqual(me.df['species'].tolist(), ['562', '562', '548']) @@ -129,35 +149,34 @@ def test_identify_taxonomy(self): me.block = 'plasmid' me.latin = False me.df = df.copy() - me.identify_taxonomy() + me.filter_by_taxonomy() self.assertListEqual(me.df.index.tolist(), [0, 1, 2]) # no Escherichia me.taxids = '561' me.exclude = True me.df = df.copy() - me.identify_taxonomy() + me.filter_by_taxonomy() self.assertListEqual(me.df.index.tolist(), [2]) def test_sample_by_taxonomy(self): me = Database() # do nothing + me.rank = None me.sample = None + me.above = None self.assertIsNone(me.sample_by_taxonomy()) - # xxx - header = ('genome', 'taxid', 'refseq_category', 'assembly_level') - data = (('G1', '585056', '', 'Chromosome'), # E. coli UMN026 - ('G2', '1038927', 'representative genome', 'Chromosome'), - # E. coli O104:H4 (rep. genome to be prioritized over G1) - ('G3', '2580236', '', 'Contig'), # sync E. coli - ('G4', '622', '', 'Scaffold'), # Shigella - ('G5', '548', '', 'Scaffold'), # Klebsiella - ('G6', '126792', 'reference genome', 'Contig')) # plasmid + # regular + header = ('genome', 'taxid') + data = (('G1', '585056'), # E. coli UMN026 + ('G2', '1038927'), # E. coli O104:H4 + ('G3', '2580236'), # sync E. coli + ('G4', '622'), # Shigella + ('G5', '548'), # Klebsiella + ('G6', '126792')) # plasmid df = pd.DataFrame(data, columns=header) - me.reference = False - me.representative = False me.taxdump = taxdump_from_text(taxdump_proteo) # up to one genome per genus @@ -166,21 +185,36 @@ def test_sample_by_taxonomy(self): me.df = df.copy() me.sample_by_taxonomy() self.assertListEqual(me.df.columns.tolist(), list(header) + ['genus']) - self.assertListEqual(me.df['genome'].tolist(), ['G2', 'G4', 'G5']) - - # include reference genome (plasmid) - me.reference = True - me.df = df.copy() - me.sample_by_taxonomy() - self.assertEqual(me.df['genome'].tolist()[-1], 'G6') + self.assertListEqual(sorted(me.selected), ['G1', 'G4', 'G5']) # up to two genomes for entire cellular life me.rank = 'superkingdom' me.sample = 2 - me.reference = False - me.df = df.copy() me.sample_by_taxonomy() - self.assertListEqual(me.df['genome'].tolist(), ['G1', 'G2']) + self.assertListEqual(sorted(me.selected), ['G1', 'G2']) + + # up to one genome per species + me.rank = 'species' + me.sample = 1 + me.sample_by_taxonomy() + self.assertListEqual(sorted(me.selected), [ + 'G1', 'G3', 'G4', 'G5', 'G6']) + + # up to one genome per species, Latinate only + me.rank = 'species_latin' + me.sample = 1 + me.df['latin'] = [True, True, True, True, True, False] + me.sample_by_taxonomy() + self.assertListEqual(sorted(me.selected), [ + 'G1', 'G3', 'G4', 'G5']) + + # species and above (no difference) + me.rank = 'species_latin' + me.above = True + me.sample = 10 + me.sample_by_taxonomy() + self.assertListEqual(sorted(me.selected), [ + 'G1', 'G2', 'G3', 'G4', 'G5']) def test_download_genomes(self): # TODO From bdb606c88910408605fd88ffb1ebf9eee0965c5a Mon Sep 17 00:00:00 2001 From: drz Date: Thu, 25 Nov 2021 13:07:58 -0700 Subject: [PATCH 8/8] updated remote blast policy --- doc/1strun.md | 2 +- hgtector/config.yml | 8 +++--- hgtector/search.py | 67 +++++++++++++++++++++++++++++---------------- 3 files changed, 48 insertions(+), 29 deletions(-) diff --git a/doc/1strun.md b/doc/1strun.md index 3314752..e743cd2 100644 --- a/doc/1strun.md +++ b/doc/1strun.md @@ -11,7 +11,7 @@ A small example is provided in the subdirectory [example](../example). The input Let's analyze this small example using HGTector. -**Note**: It has been increasingly infeasible as of 2020 to run remote search through the NCBI BLAST server. If you experience an very slow run when going through this tutorial, please skip and move on to [second run](2ndrun.md). +**Note**: Automatic remote BLAST search using URL API is inefficient as of 2021 and has been [deprecated](https://ncbi.github.io/blast-cloud/dev/api.html) by NCBI. Therefore this tutorial is for reference only. Unless you want to wait for long hours, please skip this tutorial and move on to the [second run](2ndrun.md). ## Search diff --git a/hgtector/config.yml b/hgtector/config.yml index f3c29da..5e464cc 100644 --- a/hgtector/config.yml +++ b/hgtector/config.yml @@ -101,8 +101,8 @@ blast: remote: # remote search database - # options: nr, refseq_protein, swissprot, pdb, etc. - db: nr + # options: nr, refseq_select_prot, refseq_protein, swissprot, pdb, etc. + db: refseq_select_prot # remote search algorithm # options: blastp psiBlast deltaBlast kmerBlastp phiBlast, etc. @@ -114,13 +114,13 @@ remote: # typically cannot exceed 8,000 characters) retries: 5 # maximum number of retries per search delay: 60 # seconds between two search requests - timeout: 900 # seconds before program gives up waiting + timeout: 1800 # seconds before program gives up waiting # entrez query text entrez: all [filter] NOT(environmental samples[filter] OR metagenomes[orgn]) txid131567[orgn] # extra URL arguments for remote search - extrargs: "&FILTER=m%20S" + extrargs: "&WORD_SIZE=6&FILTER=m%20S" ## Fetch information from remote server diff --git a/hgtector/search.py b/hgtector/search.py index f07de07..056e078 100644 --- a/hgtector/search.py +++ b/hgtector/search.py @@ -15,8 +15,8 @@ from tempfile import mkdtemp from time import time, sleep from math import log -from urllib.parse import quote -from urllib.request import urlopen, HTTPError, URLError +from urllib.parse import quote, urlencode +from urllib.request import urlopen, Request, HTTPError, URLError from hgtector.util import ( timestamp, load_configs, get_config, arg2bool, list_from_param, file2id, @@ -1550,6 +1550,11 @@ def remote_search(self, seqs): dict of list of dict hit table per query sequence + Notes + ----- + As of 2021, use of RESTful APIs to conduct BLAST searches is + deprecated by NCBI. + .. _NCBI's official reference of RESTful APIs: https://ncbi.github.io/blast-cloud/dev/using-url-api.html .. _NCBI's official sample Perl script: @@ -1559,24 +1564,29 @@ def remote_search(self, seqs): https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE= BlastDocs&DOC_TYPE=DeveloperInfo .. _Instead, NCBI recommends setting up custom BLAST servers. See: - https://ncbi.github.io/blast-cloud/ + https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs& + DOC_TYPE=CloudBlast """ - # generate query URL - query = ''.join([f'>{id_}\n{seq}\n' for id_, seq in seqs]) - url = f'{self.server}?CMD=Put&PROGRAM=blastp&DATABASE={self.db}' + print(f'Submitting {len(seqs)} queries for search.', end='', + flush=True) + data = [('CMD', 'Put'), + ('PROGRAM', 'blastp'), + ('DATABASE', self.db), + ('WORD_SIZE', 6)] if self.algorithm: - url += '&BLAST_PROGRAMS=' + self.algorithm + data.append(('BLAST_PROGRAMS', self.algorithm)) if self.evalue: - url += '&EXPECT=' + str(self.evalue) + data.append(('EXPECT', self.evalue)) if self.maxseqs: - url += '&MAX_NUM_SEQ=' + str(self.maxseqs) + data.append(('MAX_NUM_SEQ', self.maxseqs)) if self.entrez: - url += '&EQ_TEXT=' + quote(self.entrez) + data.append(('EQ_TEXT', self.entrez)) + query = ''.join([f'>{id_}\n{seq}\n' for id_, seq in seqs]) + data.append(('QUERY', query)) + data = urlencode(data) if self.extrargs: - url += '&' + self.extrargs.lstrip('&') - url += '&QUERY=' + quote(query) - print(f'Submitting {len(seqs)} queries for search.', end='', - flush=True) + data += '&' + self.extrargs.lstrip('&') + data = data.encode() trial = 0 while True: @@ -1589,7 +1599,8 @@ def remote_search(self, seqs): trial += 1 # get request Id - with urlopen(url) as response: + req = Request(self.server, data=data) + with urlopen(req) as response: res = response.read().decode('utf-8') m = re.search(r'^ RID = (.*$)', res, re.MULTILINE) if not m: @@ -1597,14 +1608,17 @@ def remote_search(self, seqs): continue rid = m.group(1) print(f' RID: {rid}.', end='', flush=True) - sleep(1) + sleep(10) # check status - url_ = f'{self.server}?CMD=Get&FORMAT_OBJECT=SearchInfo&RID={rid}' + data_ = urlencode([('CMD', 'Get'), + ('FORMAT_OBJECT', 'SearchInfo'), + ('RID', rid)]).encode() + req_ = Request(self.server, data=data_) starttime = time() success = False while True: - with urlopen(url_) as response: + with urlopen(req_) as response: res = response.read().decode('utf-8') m = re.search(r'\s+Status=(.+)', res, re.MULTILINE) if not m: @@ -1625,6 +1639,7 @@ def remote_search(self, seqs): if 'ThereAreHits=yes' not in res: print('WARNING: Remote search returned no result.') break + print(' Success.', end='', flush=True) success = True break else: @@ -1632,15 +1647,19 @@ def remote_search(self, seqs): break if not success: continue - sleep(1) + sleep(10) # retrieve result - url_ = (f'{self.server}?CMD=Get&ALIGNMENT_VIEW=Tabular' - f'&FORMAT_TYPE=Text&RID={rid}') + data_ = [('CMD', 'Get'), + ('ALIGNMENT_VIEW', 'Tabular'), + ('FORMAT_TYPE', 'Text'), + ('RID', rid)] if self.maxseqs: - url_ += (f'&MAX_NUM_SEQ={self.maxseqs}' - f'&DESCRIPTIONS={self.maxseqs}') - with urlopen(url_) as response: + data_.extend([('MAX_NUM_SEQ', self.maxseqs), + ('DESCRIPTIONS', self.maxseqs)]) + data_ = urlencode(data_).encode() + req_ = Request(self.server, data=data_) + with urlopen(req_) as response: res = response.read().decode('utf-8') if '# blastp' not in res or '# Query: ' not in res: print('WARNING: Invalid format of remote search results.')