From 2c98d6e88d3648581a8f40615af3fd9ec40337f6 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Sun, 27 Dec 2020 18:42:28 -0700 Subject: [PATCH 01/11] database default diamond warning --- hgtector/database.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 2d539ce..4e2992f 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -202,12 +202,6 @@ def set_parameters(self, args): raise ValueError( f'Invalid {exe} executable: {getattr(self, exe)}.') - # determine number of CPUs to use - if self.compile in ('diamond', 'both') and not self.threads: - self.threads = cpu_count() - if self.threads is None: - self.threads = 1 - # default protocol if self.default: print('The default protocol is selected for database building.') @@ -220,7 +214,22 @@ def set_parameters(self, args): self.rank = 'species' self.reference = True self.representative = True - self.compile = 'diamond' + + if self.diamond is None: + self.diamond = 'diamond' + if which(self.diamond) is None: + print('WARNING: Cannot find DIAMOND in this computer. ' + 'You will need to manually compile the database ' + 'after download is complete.') + self.compile = 'none' + else: + self.compile = 'diamond' + + # determine number of CPUs to use + if self.compile in ('diamond', 'both') and not self.threads: + self.threads = cpu_count() + if self.threads is None: + self.threads = 1 makedirs(self.output, exist_ok=True) From 1616e165b3a7ed639625aa41353133115184e0ab Mon Sep 17 00:00:00 2001 From: drz Date: Mon, 28 Dec 2020 17:17:20 -0700 Subject: [PATCH 02/11] added quick start guide --- CHANGELOG.md | 1 + README.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 461e3e5..2efe1e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ Change Log ### Added - Predicted HGT list now includes potential donors. Users can optionally specify a taxonomic rank at which they will be reported. +- A quick-start guide added to the homepage. ### Changed - Repository transferred from [DittmarLab](https://github.com/DittmarLab) to [qiyunlab](https://github.com/qiyunlab). diff --git a/README.md b/README.md index fc5918f..b3959b8 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,53 @@ References - [Configure](doc/config.md) +## Quick start + +Set up a Conda environment and install dependencies: + +```bash +conda create -n hgtector python=3 pyyaml pandas matplotlib scikit-learn bioconda::diamond +conda activate hgtector +``` + +Install HGTector2: + +```bash +pip install git+https://github.com/qiyunlab/HGTector.git +``` + +Build a reference database using the default protocol: + +```bash +hgtector database -o db_dir --default +``` + +This will retrieve the latest genomic data from NCBI. If this does not work (e.g., due to network issues), or you need some customization, please read the [database](doc/database.md) page. + +Prepare input file(s). They should be multi-Fasta files of amino acid sequences (faa). Each file represents the whole protein set of a complete or partial genome. + +Perform homology [search](doc/search.md): + +```bash +hgtector search -i input.faa -o search_dir -m diamond -p 16 -d db_dir/diamond/db -t db_dir/taxdump +``` + +Perform HGT [prediction](doc/analyze.md): + +```bash +hgtector analyze -i search_dir -o analyze_dir -t hgtdb/taxdump +``` + +Examine the prediction results under the `analyze_dir` directory. 
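+
+For example, the per-genome prediction tables are written to `analyze_dir/hgts/`. Assuming the input file was named `input.faa`, its table would be `hgts/input.txt`, a tab-separated list of predicted HGT-derived genes with their silhouette scores and putative donors, which can be loaded with pandas (see the [second run](doc/2ndrun.md) tutorial for a worked example):
+
+```python
+import pandas as pd
+
+# as in doc/2ndrun.md, the protein ID column becomes the index and the two
+# named columns hold the silhouette score and the putative donor taxon
+hgts = pd.read_csv('analyze_dir/hgts/input.txt', sep='\t',
+                   names=['silh', 'donor'])
+print(hgts.sort_values('silh', ascending=False).head())
+```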
+ +It is recommended that you read the [first run](doc/1strun.md), [second run](doc/2ndrun.md) and [real runs](doc/realrun.md) pages to get familiar with the pipeline, the underlying methodology, and the customization options. + + ## License Copyright (c) 2013-2020, [Qiyun Zhu](mailto:qiyunzhu@gmail.com) and [Katharina Dittmar](mailto:katharinad@gmail.com). Licensed under [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause). See full license [statement](LICENSE). + ## Citation > Zhu Q, Kosoy M, Dittmar K. HGTector: [an automated method facilitating genome-wide discovery of putative horizontal gene transfers](https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-15-717). *BMC Genomics*. 2014. 15:717. From 5d86dda6898982b27dc02fa69097a05d26eace3f Mon Sep 17 00:00:00 2001 From: drz Date: Fri, 1 Jan 2021 22:57:05 -0700 Subject: [PATCH 03/11] fixed custom plotting code in documentation --- doc/2ndrun.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/2ndrun.md b/doc/2ndrun.md index 8a9bd25..d87148a 100644 --- a/doc/2ndrun.md +++ b/doc/2ndrun.md @@ -374,11 +374,12 @@ import pandas as pd import matplotlib.pyplot as plt # read prediction result -hgts = pd.read_csv('hgts/o55h7.txt', sep='\t', names=['silh'], squeeze=True) +hgts = pd.read_csv('hgts/o55h7.txt', sep='\t', names=['silh', 'donor']) # bar plot fig = plt.figure(figsize=(5, 5)) -plt.barh(range(len(hgts)), hgts.sort_values()) +silhs = hgts['silh'].sort_values() +plt.barh(range(len(silhs)), silhs) plt.xlim(left=0.5) plt.xlabel('Silhouette score') plt.ylabel('Gene') @@ -402,7 +403,7 @@ df = df.query('close + distal > 0') df = df[(zscore(df[['close', 'distal']]) < 3).all(axis=1)] # append silhouette scores -df['silh'] = df['protein'].map(hgts) +df['silh'] = df['protein'].map(silhs) # scatter plot fig = plt.figure(figsize=(5, 5)) From e0d1e07dbc6a054686ef477dbee35506a9e1e70f Mon Sep 17 00:00:00 2001 From: drz Date: Sun, 14 Feb 2021 11:22:52 -0700 Subject: [PATCH 04/11] handling corrupted gzip file --- hgtector/database.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 4e2992f..9e0402d 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -594,12 +594,15 @@ def write_prot(): g2n[g], g2aa[g] = 0, 0 stem = row.ftp_path.rsplit('/', 1)[-1] fp = join(dir_, f'{stem}_protein.faa.gz') - try: - fin = gzip.open(fp, 'rt') - except TypeError: - fin = gzip.open(fp, 'r') + with gzip.open(fp, 'rb') as f: + try: + content = f.read().decode().splitlines() + except (OSError, EOFError, TypeError): + print(f' skipping corrupted file {stem}.', end='', + flush=True) + continue cp = None - for line in fin: + for line in content: line = line.rstrip('\r\n') if line.startswith('>'): write_prot() @@ -618,7 +621,6 @@ def write_prot(): line = line.rstrip('*') prots[cp]['seq'] += line g2aa[g] += len(line) - fin.close() write_prot() fout.close() print(' done.') From 1f1af76ca5cfa928fcc4dfbe3e153ee336b6106a Mon Sep 17 00:00:00 2001 From: drz Date: Wed, 24 Feb 2021 09:34:30 -0700 Subject: [PATCH 05/11] attempted to handle ftp retrbinary eoferror --- hgtector/database.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 9e0402d..7473b92 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -546,8 +546,12 @@ def download_genomes(self): for i in range(self.retries): try: with open(file, 'wb') as f: - self.ftp.retrbinary( - f'RETR {remote_dir}/{fname}', 
f.write) + cmd = f'RETR {remote_dir}/{fname}' + try: + self.ftp.retrbinary(cmd, f.write) + except EOFError: + sleep(self.delay) + continue print(' ' + g, flush=True) success = True except ftplib.error_perm as resp: From a4b46948cbf1f74c6f26cc7794df870df38b98ec Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 20:01:26 -0700 Subject: [PATCH 06/11] fixed database download issue --- hgtector/database.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 7473b92..db59962 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -240,7 +240,6 @@ def connect_server(self): makedirs(join(self.output, 'download'), exist_ok=True) self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout) self.ftp.login() - # self.ftp.set_pasv(False) print(' done.') def retrieve_taxdump(self): @@ -285,7 +284,7 @@ def get_summary(target): # read summary print(f'Reading {target} assembly summary...', end='', flush=True) - df = pd.read_csv(local_file, sep='\t', skiprows=1) + df = pd.read_table(local_file, skiprows=1, low_memory=False) print(' done.') return df @@ -493,15 +492,16 @@ def sample_by_taxonomy(self): 'Contig': 2}) # sample genomes per taxonomic group + # TODO: make this code faster selected = [] + # sort genomes by three criteria + self.df.sort_values(by=['rc_seq', 'al_seq', 'genome'], inplace=True) for taxon in taxa: # select genomes under this taxon df_ = self.df.query(f'{self.rank} == "{taxon}"') - # sort genomes by three criteria - df_ = df_.sort_values(by=['rc_seq', 'al_seq', 'genome']) # take up to given number of genomes from top - df_ = df_.head(min(self.sample, df_.shape[0])) - selected.extend(df_['genome'].tolist()) + gs = df_.head(min(self.sample, df_.shape[0]))['genome'].tolist() + selected.extend(gs) selected = set(selected) # add reference / representative @@ -511,8 +511,11 @@ def sample_by_taxonomy(self): selected.update(self.df.query( f'refseq_category == "{key} genome"')['genome'].tolist()) - self.df = self.df[self.df['genome'].isin(selected)] - print(f'Total number of sampled genomes: {self.df.shape[0]}.') + self.df.query('genome in @selected', inplace=True) + n = self.df.shape[0] + if n == 0: + raise ValueError('No genome is retained after sampling.') + print(f'Total number of sampled genomes: {n}.') # clean up temporary columns self.df.drop(columns=['al_seq', 'rc_seq'], inplace=True) @@ -521,7 +524,6 @@ def download_genomes(self): """Download genomes from NCBI. 
""" # reconnect to avoid server timeout problem - # TODO: replace this ugly hack with a more stable solution self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout) self.ftp.login() self.ftp.cwd('/genomes/all') @@ -546,12 +548,8 @@ def download_genomes(self): for i in range(self.retries): try: with open(file, 'wb') as f: - cmd = f'RETR {remote_dir}/{fname}' - try: - self.ftp.retrbinary(cmd, f.write) - except EOFError: - sleep(self.delay) - continue + self.ftp.retrbinary( + f'RETR {remote_dir}/{fname}', f.write) print(' ' + g, flush=True) success = True except ftplib.error_perm as resp: @@ -560,6 +558,13 @@ def download_genomes(self): except ftplib.error_temp: sleep(self.delay) continue + except EOFError: + sleep(self.delay) + self.ftp = ftplib.FTP( + 'ftp.ncbi.nlm.nih.gov', timeout=self.timeout) + self.ftp.login() + self.ftp.cwd('/genomes/all') + continue else: break if not success: From 83030a25c3d4102c98353122d0051c3840d7b915 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 20:16:35 -0700 Subject: [PATCH 07/11] speeded up taxonomic sampling of database --- hgtector/config.yml | 1 + hgtector/database.py | 26 +++++++++----------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/hgtector/config.yml b/hgtector/config.yml index 5301f5e..f3c29da 100644 --- a/hgtector/config.yml +++ b/hgtector/config.yml @@ -144,6 +144,7 @@ download: delay: 10 # seconds between retries timeout: 60 # seconds before program gives up waiting + ## Taxonomic filtering taxonomy: diff --git a/hgtector/database.py b/hgtector/database.py index db59962..7e64dcf 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -477,13 +477,6 @@ def sample_by_taxonomy(self): self.df[self.rank] = self.df['taxid'].apply( taxid_at_rank, rank=self.rank, taxdump=self.taxdump) - # list taxonomic groups at rank - taxa = self.df[self.rank].dropna().unique().tolist() - n = len(taxa) - if n == 0: - raise ValueError(f'No genome is classified at rank "{self.rank}".') - print(f'Total number of taxonomic groups at {self.rank}: {n}.') - # custom sorting orders self.df['rc_seq'] = self.df['refseq_category'].map( {'reference genome': 0, 'representative genome': 1}) @@ -491,18 +484,13 @@ def sample_by_taxonomy(self): {'Chromosome': 0, 'Complete Genome': 0, 'Scaffold': 1, 'Contig': 2}) - # sample genomes per taxonomic group - # TODO: make this code faster - selected = [] # sort genomes by three criteria self.df.sort_values(by=['rc_seq', 'al_seq', 'genome'], inplace=True) - for taxon in taxa: - # select genomes under this taxon - df_ = self.df.query(f'{self.rank} == "{taxon}"') - # take up to given number of genomes from top - gs = df_.head(min(self.sample, df_.shape[0]))['genome'].tolist() - selected.extend(gs) - selected = set(selected) + + # select up to given number of genomes of each taxonomic group + selected = set(self.df.groupby(self.rank).head(self.sample)['genome']) + if not selected: + raise ValueError(f'No genome is classified at rank "{self.rank}".') # add reference / representative for key in ('reference', 'representative'): @@ -511,12 +499,16 @@ def sample_by_taxonomy(self): selected.update(self.df.query( f'refseq_category == "{key} genome"')['genome'].tolist()) + # filter genomes to selected self.df.query('genome in @selected', inplace=True) n = self.df.shape[0] if n == 0: raise ValueError('No genome is retained after sampling.') print(f'Total number of sampled genomes: {n}.') + # sort by genome ID + self.df.sort_values('genome', inplace=True) + # clean up temporary columns 
self.df.drop(columns=['al_seq', 'rc_seq'], inplace=True) From c1e8a5155daf3f6525050a7e4413a4e98be02b46 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 20:25:48 -0700 Subject: [PATCH 08/11] added type material selection --- hgtector/database.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 7e64dcf..956beba 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -65,6 +65,8 @@ {'action': 'store_true'}], ['--representative', 'include NCBI-defined representative genomes', {'action': 'store_true'}], + ['--typematerial', 'include NCBI-defined type material genomes', + {'action': 'store_true'}], 'taxonomic filter', ['--capital', 'organism name must be capitalized', @@ -493,12 +495,18 @@ def sample_by_taxonomy(self): raise ValueError(f'No genome is classified at rank "{self.rank}".') # add reference / representative - for key in ('reference', 'representative'): + for key in 'reference', 'representative': if getattr(self, key): - print(f'Add {key} genomes back to selection.') + print(f'Add {key} genomes to selection.') selected.update(self.df.query( f'refseq_category == "{key} genome"')['genome'].tolist()) + # add type material + if self.typematerial: + print('Add type material genomes to selection.') + selected.update(self.df[self.df[ + 'relation_to_type_material'].notna()]['genome'].tolist()) + # filter genomes to selected self.df.query('genome in @selected', inplace=True) n = self.df.shape[0] From ac5fc33da3513855deb3222046793124899cf025 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 20:47:56 -0700 Subject: [PATCH 09/11] added sorting by type material --- hgtector/database.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 956beba..49e732c 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -479,9 +479,14 @@ def sample_by_taxonomy(self): self.df[self.rank] = self.df['taxid'].apply( taxid_at_rank, rank=self.rank, taxdump=self.taxdump) - # custom sorting orders - self.df['rc_seq'] = self.df['refseq_category'].map( - {'reference genome': 0, 'representative genome': 1}) + # sort by reference > representative > type material > other + self.df['rc_seq'] = self.df.apply( + lambda x: 0 if x['refseq_category'] == 'reference genome' + else (1 if x['refseq_category'] == 'representative genome' + else (2 if pd.notnull(x['relation_to_type_material']) + else 3)), axis=1) + + # sort by complete > scaffold > contig self.df['al_seq'] = self.df['assembly_level'].map( {'Chromosome': 0, 'Complete Genome': 0, 'Scaffold': 1, 'Contig': 2}) @@ -494,14 +499,14 @@ def sample_by_taxonomy(self): if not selected: raise ValueError(f'No genome is classified at rank "{self.rank}".') - # add reference / representative + # add reference / representative genomes for key in 'reference', 'representative': if getattr(self, key): print(f'Add {key} genomes to selection.') selected.update(self.df.query( f'refseq_category == "{key} genome"')['genome'].tolist()) - # add type material + # add type material genomes if self.typematerial: print('Add type material genomes to selection.') selected.update(self.df[self.df[ From 20365943c273454d7b64bf9196cc7a6499c988b7 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 21:10:47 -0700 Subject: [PATCH 10/11] revised rule for Latin species name --- hgtector/tests/test_util.py | 1 + hgtector/util.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) 
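A brief aside before the diff below, on the sampling logic revised in the preceding patches: genomes are now ranked once (reference > representative > type material > other, then complete > scaffold > contig), after which a single `groupby(rank).head(sample)` picks the top genomes of each taxonomic group. A minimal standalone sketch of that pattern, using a hypothetical toy table (column names follow the NCBI assembly summary; the values are made up for illustration):

```python
import pandas as pd

# toy assembly table for illustration only
df = pd.DataFrame({
    'genome': ['G1', 'G2', 'G3', 'G4', 'G5'],
    'species': ['A', 'A', 'A', 'B', 'B'],
    'refseq_category': ['na', 'representative genome', 'na',
                        'reference genome', 'na'],
    'relation_to_type_material': [None, None, 'assembly from type material',
                                  None, None],
    'assembly_level': ['Contig', 'Complete Genome', 'Scaffold',
                       'Complete Genome', 'Contig'],
})

# rank by curation status: reference > representative > type material > other
df['rc_seq'] = df.apply(
    lambda x: 0 if x['refseq_category'] == 'reference genome'
    else (1 if x['refseq_category'] == 'representative genome'
          else (2 if pd.notnull(x['relation_to_type_material']) else 3)),
    axis=1)

# rank by assembly level: complete > scaffold > contig
df['al_seq'] = df['assembly_level'].map(
    {'Chromosome': 0, 'Complete Genome': 0, 'Scaffold': 1, 'Contig': 2})

# sort once, then take up to `sample` best genomes per taxonomic group
sample = 1
df = df.sort_values(['rc_seq', 'al_seq', 'genome'])
selected = set(df.groupby('species').head(sample)['genome'])
print(sorted(selected))  # ['G2', 'G4']
```

Reference, representative and (with `--typematerial`) type-material genomes are then added back to the selection regardless of the per-group cap, as the patches above do.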
diff --git a/hgtector/tests/test_util.py b/hgtector/tests/test_util.py index cea39d6..fd43e32 100644 --- a/hgtector/tests/test_util.py +++ b/hgtector/tests/test_util.py @@ -367,6 +367,7 @@ def test_is_capital(self): def test_is_latin(self): self.assertTrue(is_latin('Escherichia coli')) self.assertTrue(is_latin('Rickettsia felis')) + self.assertTrue(is_latin('[Clostridium] difficile')) self.assertFalse(is_latin('Enterobacteriaceae')) self.assertFalse(is_latin('Escherichia coli O157:H7')) self.assertFalse(is_latin('Citrobacter sp. A293')) diff --git a/hgtector/util.py b/hgtector/util.py index 8a0fdfe..dedcd51 100644 --- a/hgtector/util.py +++ b/hgtector/util.py @@ -474,7 +474,7 @@ def is_capital(name): def is_latin(name): - """Check if a species name is Latin. + """Check if a species name is Latinate. Parameters ---------- @@ -484,16 +484,21 @@ def is_latin(name): Returns ------- bool - whether species name is Latin + whether species name is Latinate """ if name == '': return False elif name.count(' ') != 1: return False - str_ = name.replace(' ', '') - if not str_.istitle(): + if name[0] == '[': + i = name.find(']') + if i == -1: + return False + name = name[1:i] + name[i + 1:] + name = name.replace(' ', '') + if not name.istitle(): return False - elif not str_.isalpha(): + elif not name.isalpha(): return False return True From bd999ea8d6e1cd01f63a699a120746a3ae4e4a52 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 23:01:24 -0700 Subject: [PATCH 11/11] minor tuning --- hgtector/analyze.py | 12 ++++++------ hgtector/database.py | 10 +++++----- hgtector/search.py | 24 ++++++++++++------------ hgtector/tests/test_database.py | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/hgtector/analyze.py b/hgtector/analyze.py index 4b861a1..4660955 100644 --- a/hgtector/analyze.py +++ b/hgtector/analyze.py @@ -205,23 +205,23 @@ def set_parameters(self, args): # load configurations get_config(self, 'evalue', 'analyze.evalue', float) - for key in ('maxhits', 'identity', 'coverage'): + for key in 'maxhits', 'identity', 'coverage': get_config(self, key, f'analyze.{key}') - for key in ('input_cov', 'self_rank', 'close_size'): + for key in 'input_cov', 'self_rank', 'close_size': get_config(self, key, f'grouping.{key.replace("_", "")}') for key in ('weighted', 'outliers', 'orphans', 'bandwidth', 'bw_steps', 'low_part', 'noise', 'fixed', 'silhouette', 'self_low'): get_config(self, key, f'predict.{key.replace("_", "")}') get_config(self, 'distal_top', 'donor.distaltop') - for key in ('name', 'rank'): + for key in 'name', 'rank': get_config(self, f'donor_{key}', f'donor.{key}') # convert boolean values - for key in ('weighted', 'orphans', 'self_low', 'donor_name'): + for key in 'weighted', 'orphans', 'self_low', 'donor_name': setattr(self, key, arg2bool(getattr(self, key, None))) # convert fractions to percentages - for metric in ('input_cov', 'noise', 'fixed', 'distal_top'): + for metric in 'input_cov', 'noise', 'fixed', 'distal_top': val = getattr(self, metric) if val and val < 1: setattr(self, metric, val * 100) @@ -470,7 +470,7 @@ def define_groups(self): 3. `groups` (keys: self, close, distal): all taxIds under each group. 
""" self.groups = {} - for key in ('self', 'close'): + for key in 'self', 'close': tids = getattr(self, f'{key}_tax') # user-defined group diff --git a/hgtector/database.py b/hgtector/database.py index 49e732c..bd274a3 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -175,17 +175,17 @@ def set_parameters(self, args): setattr(self, key, val) # load configurations - for key in ('capital', 'block', 'latin'): + for key in 'capital', 'block', 'latin': get_config(self, key, f'taxonomy.{key}') - for key in ('retries', 'delay', 'timeout'): + for key in 'retries', 'delay', 'timeout': get_config(self, key, f'download.{key}') - for key in ('diamond', 'makeblastdb'): + for key in 'diamond', 'makeblastdb': get_config(self, key, f'program.{key}') - for key in ('threads', 'tmpdir'): + for key in 'threads', 'tmpdir': get_config(self, key, f'local.{key}') # convert boolean values - for key in ('capital', 'latin'): + for key in 'capital', 'latin': setattr(self, key, arg2bool(getattr(self, key, None))) # make temporary directory diff --git a/hgtector/search.py b/hgtector/search.py index 7dee98e..f07de07 100644 --- a/hgtector/search.py +++ b/hgtector/search.py @@ -248,9 +248,9 @@ def args_wf(self, args): # load search parameters get_config(self, 'evalue', 'search.evalue', float) - for key in ('method', 'minsize', 'maxseqs', 'identity', 'coverage'): + for key in 'method', 'minsize', 'maxseqs', 'identity', 'coverage': get_config(self, key, f'search.{key}') - for key in ('diamond', 'blastp', 'blastdbcmd'): + for key in 'diamond', 'blastp', 'blastdbcmd': get_config(self, key, f'program.{key}') if self.method not in {'auto', 'diamond', 'blast', 'remote', @@ -295,12 +295,12 @@ def args_wf(self, args): self.method = 'remote' # load method-specific arguments - for key in ('queries', 'maxchars', 'extrargs'): + for key in 'queries', 'maxchars', 'extrargs': get_config(self, key, f'{self.method}.{key}') # load remote search settings if self.method == 'remote': - for key in ('db', 'algorithm', 'delay', 'timeout', 'entrez'): + for key in 'db', 'algorithm', 'delay', 'timeout', 'entrez': get_config(self, key, f'remote.{key}') get_config(self, 'server', 'server.search') @@ -336,7 +336,7 @@ def args_wf(self, args): self.prot2tid = {} # assign taxonomy database - for key in ('taxdump', 'taxmap'): + for key in 'taxdump', 'taxmap': get_config(self, key, f'database.{key}') if self.method != 'remote': @@ -350,7 +350,7 @@ def args_wf(self, args): raise ValueError( f'Invalid taxonomy database directory: {self.taxdump}.') else: - for fname in ('names.dmp', 'nodes.dmp'): + for fname in 'names.dmp', 'nodes.dmp': if not isfile(join(self.taxdump, fname)): raise ValueError( f'Taxonomy database file {fname} is not found.') @@ -361,13 +361,13 @@ def args_wf(self, args): f'Invalid protein-to-taxId map: {self.taxmap}.') # load taxonomic filters and convert to lists - for key in ('include', 'exclude', 'block'): + for key in 'include', 'exclude', 'block': attr = f'tax_{key}' get_config(self, attr, f'taxonomy.{key}') setattr(self, attr, list_from_param(getattr(self, attr))) # load taxonomy switches - for key in ('unique', 'unirank', 'capital', 'latin'): + for key in 'unique', 'unirank', 'capital', 'latin': get_config(self, f'tax_{key}', f'taxonomy.{key}') """determine self-alignment strategy""" @@ -410,7 +410,7 @@ def args_wf(self, args): # load configurations get_config(self, 'fetch_server', 'server.fetch') - for key in ('enable', 'queries', 'retries', 'delay', 'timeout'): + for key in 'enable', 'queries', 'retries', 'delay', 
'timeout':
             get_config(self, f'fetch_{key}', f'fetch.{key}')
 
         # determine remote or local fetching
@@ -421,11 +421,11 @@ def args_wf(self, args):
         """final steps"""
 
         # convert boolean values
-        for key in ('tax_unique', 'tax_capital', 'tax_latin'):
+        for key in 'tax_unique', 'tax_capital', 'tax_latin':
             setattr(self, key, arg2bool(getattr(self, key, None)))
 
         # convert fractions to percentages
-        for metric in ('identity', 'coverage'):
+        for metric in 'identity', 'coverage':
             val = getattr(self, metric)
             if val and val < 1:
                 setattr(self, metric, val * 100)
@@ -1417,7 +1417,7 @@ def parse_fasta_xml(xml):
     for m in re.finditer(r'<TSeq>(.+?)<\/TSeq>', xml, re.DOTALL):
         s_ = m.group(1)
         seq = []
-        for key in (('accver', 'taxid', 'defline', 'sequence')):
+        for key in 'accver', 'taxid', 'defline', 'sequence':
             m_ = re.search(r'<TSeq_%s>(.+)<\/TSeq_%s>' % (key, key), s_)
             seq.append(m_.group(1) if m_ else '')
         seq[2] = get_product(seq[2])
diff --git a/hgtector/tests/test_database.py b/hgtector/tests/test_database.py
index 1fc860a..9642b75 100644
--- a/hgtector/tests/test_database.py
+++ b/hgtector/tests/test_database.py
@@ -112,7 +112,7 @@ def test_identify_taxonomy(self):
             ('Plasmid pPY113', '126792', '', ''))
         df = pd.DataFrame(data, columns=header)
 
-        # organism names must be capital and latinate
+        # organism names must be capital and Latinate
         me.capital = True
         me.block = None
         me.latin = True
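As a closing illustration of the revised Latinate-name rule from patch 10: `is_latin` now accepts binomials whose genus is written in brackets, as found in NCBI taxonomy (e.g. `[Clostridium] difficile`), while still rejecting non-binomial or strain-decorated names. A small usage sketch, assuming HGTector is installed so that `hgtector.util` is importable:

```python
from hgtector.util import is_latin

# Latinate binomials pass, including a bracketed genus
print(is_latin('Escherichia coli'))          # True
print(is_latin('[Clostridium] difficile'))   # True (newly accepted by patch 10)

# non-binomial or decorated names are still rejected
print(is_latin('Enterobacteriaceae'))        # False: single word
print(is_latin('Escherichia coli O157:H7'))  # False: extra strain designation
print(is_latin('Citrobacter sp. A293'))      # False: unnamed species placeholder
```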