From 2c98d6e88d3648581a8f40615af3fd9ec40337f6 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Sun, 27 Dec 2020 18:42:28 -0700 Subject: [PATCH 01/11] database default diamond warning --- hgtector/database.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 2d539ce..4e2992f 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -202,12 +202,6 @@ def set_parameters(self, args): raise ValueError( f'Invalid {exe} executable: {getattr(self, exe)}.') - # determine number of CPUs to use - if self.compile in ('diamond', 'both') and not self.threads: - self.threads = cpu_count() - if self.threads is None: - self.threads = 1 - # default protocol if self.default: print('The default protocol is selected for database building.') @@ -220,7 +214,22 @@ def set_parameters(self, args): self.rank = 'species' self.reference = True self.representative = True - self.compile = 'diamond' + + if self.diamond is None: + self.diamond = 'diamond' + if which(self.diamond) is None: + print('WARNING: Cannot find DIAMOND in this computer. ' + 'You will need to manually compile the database ' + 'after download is complete.') + self.compile = 'none' + else: + self.compile = 'diamond' + + # determine number of CPUs to use + if self.compile in ('diamond', 'both') and not self.threads: + self.threads = cpu_count() + if self.threads is None: + self.threads = 1 makedirs(self.output, exist_ok=True) From 1616e165b3a7ed639625aa41353133115184e0ab Mon Sep 17 00:00:00 2001 From: drz Date: Mon, 28 Dec 2020 17:17:20 -0700 Subject: [PATCH 02/11] added quick start guide --- CHANGELOG.md | 1 + README.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 461e3e5..2efe1e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ Change Log ### Added - Predicted HGT list now includes potential donors. Users can optionally specify a taxonomic rank at which they will be reported. +- A quick-start guide added to the homepage. ### Changed - Repository transferred from [DittmarLab](https://github.com/DittmarLab) to [qiyunlab](https://github.com/qiyunlab). diff --git a/README.md b/README.md index fc5918f..b3959b8 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,53 @@ References - [Configure](doc/config.md) +## Quick start + +Set up a Conda environment and install dependencies: + +```bash +conda create -n hgtector python=3 pyyaml pandas matplotlib scikit-learn bioconda::diamond +conda activate hgtector +``` + +Install HGTector2: + +```bash +pip install git+https://github.com/qiyunlab/HGTector.git +``` + +Build a reference database using the default protocol: + +```bash +hgtector database -o db_dir --default +``` + +This will retrieve the latest genomic data from NCBI. If this does not work (e.g., due to network issues), or you need some customization, please read the [database](doc/database.md) page. + +Prepare input file(s). They should be multi-Fasta files of amino acid sequences (faa). Each file represents the whole protein set of a complete or partial genome. + +Perform homology [search](doc/search.md): + +```bash +hgtector search -i input.faa -o search_dir -m diamond -p 16 -d db_dir/diamond/db -t db_dir/taxdump +``` + +Perform HGT [prediction](doc/analyze.md): + +```bash +hgtector analyze -i search_dir -o analyze_dir -t hgtdb/taxdump +``` + +Examine the prediction results under the `analyze_dir` directory. 
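+
+For example, the per-genome prediction tables are written to `analyze_dir/hgts/`. Assuming the input file was named `input.faa`, its table would be `hgts/input.txt`, a tab-separated list of predicted HGT-derived genes with their silhouette scores and putative donors, which can be loaded with pandas (see the [second run](doc/2ndrun.md) tutorial for a worked example):
+
+```python
+import pandas as pd
+
+# as in doc/2ndrun.md, the protein ID column becomes the index and the two
+# named columns hold the silhouette score and the putative donor taxon
+hgts = pd.read_csv('analyze_dir/hgts/input.txt', sep='\t',
+                   names=['silh', 'donor'])
+print(hgts.sort_values('silh', ascending=False).head())
+```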
+ +It is recommended that you read the [first run](doc/1strun.md), [second run](doc/2ndrun.md) and [real runs](doc/realrun.md) pages to get familiar with the pipeline, the underlying methodology, and the customization options. + + ## License Copyright (c) 2013-2020, [Qiyun Zhu](mailto:qiyunzhu@gmail.com) and [Katharina Dittmar](mailto:katharinad@gmail.com). Licensed under [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause). See full license [statement](LICENSE). + ## Citation > Zhu Q, Kosoy M, Dittmar K. HGTector: [an automated method facilitating genome-wide discovery of putative horizontal gene transfers](https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-15-717). *BMC Genomics*. 2014. 15:717. From 5d86dda6898982b27dc02fa69097a05d26eace3f Mon Sep 17 00:00:00 2001 From: drz Date: Fri, 1 Jan 2021 22:57:05 -0700 Subject: [PATCH 03/11] fixed custom plotting code in documentation --- doc/2ndrun.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/2ndrun.md b/doc/2ndrun.md index 8a9bd25..d87148a 100644 --- a/doc/2ndrun.md +++ b/doc/2ndrun.md @@ -374,11 +374,12 @@ import pandas as pd import matplotlib.pyplot as plt # read prediction result -hgts = pd.read_csv('hgts/o55h7.txt', sep='\t', names=['silh'], squeeze=True) +hgts = pd.read_csv('hgts/o55h7.txt', sep='\t', names=['silh', 'donor']) # bar plot fig = plt.figure(figsize=(5, 5)) -plt.barh(range(len(hgts)), hgts.sort_values()) +silhs = hgts['silh'].sort_values() +plt.barh(range(len(silhs)), silhs) plt.xlim(left=0.5) plt.xlabel('Silhouette score') plt.ylabel('Gene') @@ -402,7 +403,7 @@ df = df.query('close + distal > 0') df = df[(zscore(df[['close', 'distal']]) < 3).all(axis=1)] # append silhouette scores -df['silh'] = df['protein'].map(hgts) +df['silh'] = df['protein'].map(silhs) # scatter plot fig = plt.figure(figsize=(5, 5)) From e0d1e07dbc6a054686ef477dbee35506a9e1e70f Mon Sep 17 00:00:00 2001 From: drz Date: Sun, 14 Feb 2021 11:22:52 -0700 Subject: [PATCH 04/11] handling corrupted gzip file --- hgtector/database.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 4e2992f..9e0402d 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -594,12 +594,15 @@ def write_prot(): g2n[g], g2aa[g] = 0, 0 stem = row.ftp_path.rsplit('/', 1)[-1] fp = join(dir_, f'{stem}_protein.faa.gz') - try: - fin = gzip.open(fp, 'rt') - except TypeError: - fin = gzip.open(fp, 'r') + with gzip.open(fp, 'rb') as f: + try: + content = f.read().decode().splitlines() + except (OSError, EOFError, TypeError): + print(f' skipping corrupted file {stem}.', end='', + flush=True) + continue cp = None - for line in fin: + for line in content: line = line.rstrip('\r\n') if line.startswith('>'): write_prot() @@ -618,7 +621,6 @@ def write_prot(): line = line.rstrip('*') prots[cp]['seq'] += line g2aa[g] += len(line) - fin.close() write_prot() fout.close() print(' done.') From 1f1af76ca5cfa928fcc4dfbe3e153ee336b6106a Mon Sep 17 00:00:00 2001 From: drz Date: Wed, 24 Feb 2021 09:34:30 -0700 Subject: [PATCH 05/11] attempted to handle ftp retrbinary eoferror --- hgtector/database.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 9e0402d..7473b92 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -546,8 +546,12 @@ def download_genomes(self): for i in range(self.retries): try: with open(file, 'wb') as f: - self.ftp.retrbinary( - f'RETR {remote_dir}/{fname}', 
f.write) + cmd = f'RETR {remote_dir}/{fname}' + try: + self.ftp.retrbinary(cmd, f.write) + except EOFError: + sleep(self.delay) + continue print(' ' + g, flush=True) success = True except ftplib.error_perm as resp: From a4b46948cbf1f74c6f26cc7794df870df38b98ec Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 20:01:26 -0700 Subject: [PATCH 06/11] fixed database download issue --- hgtector/database.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 7473b92..db59962 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -240,7 +240,6 @@ def connect_server(self): makedirs(join(self.output, 'download'), exist_ok=True) self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout) self.ftp.login() - # self.ftp.set_pasv(False) print(' done.') def retrieve_taxdump(self): @@ -285,7 +284,7 @@ def get_summary(target): # read summary print(f'Reading {target} assembly summary...', end='', flush=True) - df = pd.read_csv(local_file, sep='\t', skiprows=1) + df = pd.read_table(local_file, skiprows=1, low_memory=False) print(' done.') return df @@ -493,15 +492,16 @@ def sample_by_taxonomy(self): 'Contig': 2}) # sample genomes per taxonomic group + # TODO: make this code faster selected = [] + # sort genomes by three criteria + self.df.sort_values(by=['rc_seq', 'al_seq', 'genome'], inplace=True) for taxon in taxa: # select genomes under this taxon df_ = self.df.query(f'{self.rank} == "{taxon}"') - # sort genomes by three criteria - df_ = df_.sort_values(by=['rc_seq', 'al_seq', 'genome']) # take up to given number of genomes from top - df_ = df_.head(min(self.sample, df_.shape[0])) - selected.extend(df_['genome'].tolist()) + gs = df_.head(min(self.sample, df_.shape[0]))['genome'].tolist() + selected.extend(gs) selected = set(selected) # add reference / representative @@ -511,8 +511,11 @@ def sample_by_taxonomy(self): selected.update(self.df.query( f'refseq_category == "{key} genome"')['genome'].tolist()) - self.df = self.df[self.df['genome'].isin(selected)] - print(f'Total number of sampled genomes: {self.df.shape[0]}.') + self.df.query('genome in @selected', inplace=True) + n = self.df.shape[0] + if n == 0: + raise ValueError('No genome is retained after sampling.') + print(f'Total number of sampled genomes: {n}.') # clean up temporary columns self.df.drop(columns=['al_seq', 'rc_seq'], inplace=True) @@ -521,7 +524,6 @@ def download_genomes(self): """Download genomes from NCBI. 
""" # reconnect to avoid server timeout problem - # TODO: replace this ugly hack with a more stable solution self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout) self.ftp.login() self.ftp.cwd('/genomes/all') @@ -546,12 +548,8 @@ def download_genomes(self): for i in range(self.retries): try: with open(file, 'wb') as f: - cmd = f'RETR {remote_dir}/{fname}' - try: - self.ftp.retrbinary(cmd, f.write) - except EOFError: - sleep(self.delay) - continue + self.ftp.retrbinary( + f'RETR {remote_dir}/{fname}', f.write) print(' ' + g, flush=True) success = True except ftplib.error_perm as resp: @@ -560,6 +558,13 @@ def download_genomes(self): except ftplib.error_temp: sleep(self.delay) continue + except EOFError: + sleep(self.delay) + self.ftp = ftplib.FTP( + 'ftp.ncbi.nlm.nih.gov', timeout=self.timeout) + self.ftp.login() + self.ftp.cwd('/genomes/all') + continue else: break if not success: From 83030a25c3d4102c98353122d0051c3840d7b915 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 20:16:35 -0700 Subject: [PATCH 07/11] speeded up taxonomic sampling of database --- hgtector/config.yml | 1 + hgtector/database.py | 26 +++++++++----------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/hgtector/config.yml b/hgtector/config.yml index 5301f5e..f3c29da 100644 --- a/hgtector/config.yml +++ b/hgtector/config.yml @@ -144,6 +144,7 @@ download: delay: 10 # seconds between retries timeout: 60 # seconds before program gives up waiting + ## Taxonomic filtering taxonomy: diff --git a/hgtector/database.py b/hgtector/database.py index db59962..7e64dcf 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -477,13 +477,6 @@ def sample_by_taxonomy(self): self.df[self.rank] = self.df['taxid'].apply( taxid_at_rank, rank=self.rank, taxdump=self.taxdump) - # list taxonomic groups at rank - taxa = self.df[self.rank].dropna().unique().tolist() - n = len(taxa) - if n == 0: - raise ValueError(f'No genome is classified at rank "{self.rank}".') - print(f'Total number of taxonomic groups at {self.rank}: {n}.') - # custom sorting orders self.df['rc_seq'] = self.df['refseq_category'].map( {'reference genome': 0, 'representative genome': 1}) @@ -491,18 +484,13 @@ def sample_by_taxonomy(self): {'Chromosome': 0, 'Complete Genome': 0, 'Scaffold': 1, 'Contig': 2}) - # sample genomes per taxonomic group - # TODO: make this code faster - selected = [] # sort genomes by three criteria self.df.sort_values(by=['rc_seq', 'al_seq', 'genome'], inplace=True) - for taxon in taxa: - # select genomes under this taxon - df_ = self.df.query(f'{self.rank} == "{taxon}"') - # take up to given number of genomes from top - gs = df_.head(min(self.sample, df_.shape[0]))['genome'].tolist() - selected.extend(gs) - selected = set(selected) + + # select up to given number of genomes of each taxonomic group + selected = set(self.df.groupby(self.rank).head(self.sample)['genome']) + if not selected: + raise ValueError(f'No genome is classified at rank "{self.rank}".') # add reference / representative for key in ('reference', 'representative'): @@ -511,12 +499,16 @@ def sample_by_taxonomy(self): selected.update(self.df.query( f'refseq_category == "{key} genome"')['genome'].tolist()) + # filter genomes to selected self.df.query('genome in @selected', inplace=True) n = self.df.shape[0] if n == 0: raise ValueError('No genome is retained after sampling.') print(f'Total number of sampled genomes: {n}.') + # sort by genome ID + self.df.sort_values('genome', inplace=True) + # clean up temporary columns 
self.df.drop(columns=['al_seq', 'rc_seq'], inplace=True) From c1e8a5155daf3f6525050a7e4413a4e98be02b46 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 20:25:48 -0700 Subject: [PATCH 08/11] added type material selection --- hgtector/database.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 7e64dcf..956beba 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -65,6 +65,8 @@ {'action': 'store_true'}], ['--representative', 'include NCBI-defined representative genomes', {'action': 'store_true'}], + ['--typematerial', 'include NCBI-defined type material genomes', + {'action': 'store_true'}], 'taxonomic filter', ['--capital', 'organism name must be capitalized', @@ -493,12 +495,18 @@ def sample_by_taxonomy(self): raise ValueError(f'No genome is classified at rank "{self.rank}".') # add reference / representative - for key in ('reference', 'representative'): + for key in 'reference', 'representative': if getattr(self, key): - print(f'Add {key} genomes back to selection.') + print(f'Add {key} genomes to selection.') selected.update(self.df.query( f'refseq_category == "{key} genome"')['genome'].tolist()) + # add type material + if self.typematerial: + print('Add type material genomes to selection.') + selected.update(self.df[self.df[ + 'relation_to_type_material'].notna()]['genome'].tolist()) + # filter genomes to selected self.df.query('genome in @selected', inplace=True) n = self.df.shape[0] From ac5fc33da3513855deb3222046793124899cf025 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 20:47:56 -0700 Subject: [PATCH 09/11] added sorting by type material --- hgtector/database.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/hgtector/database.py b/hgtector/database.py index 956beba..49e732c 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -479,9 +479,14 @@ def sample_by_taxonomy(self): self.df[self.rank] = self.df['taxid'].apply( taxid_at_rank, rank=self.rank, taxdump=self.taxdump) - # custom sorting orders - self.df['rc_seq'] = self.df['refseq_category'].map( - {'reference genome': 0, 'representative genome': 1}) + # sort by reference > representative > type material > other + self.df['rc_seq'] = self.df.apply( + lambda x: 0 if x['refseq_category'] == 'reference genome' + else (1 if x['refseq_category'] == 'representative genome' + else (2 if pd.notnull(x['relation_to_type_material']) + else 3)), axis=1) + + # sort by complete > scaffold > contig self.df['al_seq'] = self.df['assembly_level'].map( {'Chromosome': 0, 'Complete Genome': 0, 'Scaffold': 1, 'Contig': 2}) @@ -494,14 +499,14 @@ def sample_by_taxonomy(self): if not selected: raise ValueError(f'No genome is classified at rank "{self.rank}".') - # add reference / representative + # add reference / representative genomes for key in 'reference', 'representative': if getattr(self, key): print(f'Add {key} genomes to selection.') selected.update(self.df.query( f'refseq_category == "{key} genome"')['genome'].tolist()) - # add type material + # add type material genomes if self.typematerial: print('Add type material genomes to selection.') selected.update(self.df[self.df[ From 20365943c273454d7b64bf9196cc7a6499c988b7 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 21:10:47 -0700 Subject: [PATCH 10/11] revised rule for Latin species name --- hgtector/tests/test_util.py | 1 + hgtector/util.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) 
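A brief aside before the diff below, on the sampling logic revised in the preceding patches: genomes are now ranked once (reference > representative > type material > other, then complete > scaffold > contig), after which a single `groupby(rank).head(sample)` picks the top genomes of each taxonomic group. A minimal standalone sketch of that pattern, using a hypothetical toy table (column names follow the NCBI assembly summary; the values are made up for illustration):

```python
import pandas as pd

# toy assembly table for illustration only
df = pd.DataFrame({
    'genome': ['G1', 'G2', 'G3', 'G4', 'G5'],
    'species': ['A', 'A', 'A', 'B', 'B'],
    'refseq_category': ['na', 'representative genome', 'na',
                        'reference genome', 'na'],
    'relation_to_type_material': [None, None, 'assembly from type material',
                                  None, None],
    'assembly_level': ['Contig', 'Complete Genome', 'Scaffold',
                       'Complete Genome', 'Contig'],
})

# rank by curation status: reference > representative > type material > other
df['rc_seq'] = df.apply(
    lambda x: 0 if x['refseq_category'] == 'reference genome'
    else (1 if x['refseq_category'] == 'representative genome'
          else (2 if pd.notnull(x['relation_to_type_material']) else 3)),
    axis=1)

# rank by assembly level: complete > scaffold > contig
df['al_seq'] = df['assembly_level'].map(
    {'Chromosome': 0, 'Complete Genome': 0, 'Scaffold': 1, 'Contig': 2})

# sort once, then take up to `sample` best genomes per taxonomic group
sample = 1
df = df.sort_values(['rc_seq', 'al_seq', 'genome'])
selected = set(df.groupby('species').head(sample)['genome'])
print(sorted(selected))  # ['G2', 'G4']
```

Reference, representative and (with `--typematerial`) type-material genomes are then added back to the selection regardless of the per-group cap, as the patches above do.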
diff --git a/hgtector/tests/test_util.py b/hgtector/tests/test_util.py index cea39d6..fd43e32 100644 --- a/hgtector/tests/test_util.py +++ b/hgtector/tests/test_util.py @@ -367,6 +367,7 @@ def test_is_capital(self): def test_is_latin(self): self.assertTrue(is_latin('Escherichia coli')) self.assertTrue(is_latin('Rickettsia felis')) + self.assertTrue(is_latin('[Clostridium] difficile')) self.assertFalse(is_latin('Enterobacteriaceae')) self.assertFalse(is_latin('Escherichia coli O157:H7')) self.assertFalse(is_latin('Citrobacter sp. A293')) diff --git a/hgtector/util.py b/hgtector/util.py index 8a0fdfe..dedcd51 100644 --- a/hgtector/util.py +++ b/hgtector/util.py @@ -474,7 +474,7 @@ def is_capital(name): def is_latin(name): - """Check if a species name is Latin. + """Check if a species name is Latinate. Parameters ---------- @@ -484,16 +484,21 @@ def is_latin(name): Returns ------- bool - whether species name is Latin + whether species name is Latinate """ if name == '': return False elif name.count(' ') != 1: return False - str_ = name.replace(' ', '') - if not str_.istitle(): + if name[0] == '[': + i = name.find(']') + if i == -1: + return False + name = name[1:i] + name[i + 1:] + name = name.replace(' ', '') + if not name.istitle(): return False - elif not str_.isalpha(): + elif not name.isalpha(): return False return True From bd999ea8d6e1cd01f63a699a120746a3ae4e4a52 Mon Sep 17 00:00:00 2001 From: qiyunzhu Date: Fri, 19 Nov 2021 23:01:24 -0700 Subject: [PATCH 11/11] minor tuning --- hgtector/analyze.py | 12 ++++++------ hgtector/database.py | 10 +++++----- hgtector/search.py | 24 ++++++++++++------------ hgtector/tests/test_database.py | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/hgtector/analyze.py b/hgtector/analyze.py index 4b861a1..4660955 100644 --- a/hgtector/analyze.py +++ b/hgtector/analyze.py @@ -205,23 +205,23 @@ def set_parameters(self, args): # load configurations get_config(self, 'evalue', 'analyze.evalue', float) - for key in ('maxhits', 'identity', 'coverage'): + for key in 'maxhits', 'identity', 'coverage': get_config(self, key, f'analyze.{key}') - for key in ('input_cov', 'self_rank', 'close_size'): + for key in 'input_cov', 'self_rank', 'close_size': get_config(self, key, f'grouping.{key.replace("_", "")}') for key in ('weighted', 'outliers', 'orphans', 'bandwidth', 'bw_steps', 'low_part', 'noise', 'fixed', 'silhouette', 'self_low'): get_config(self, key, f'predict.{key.replace("_", "")}') get_config(self, 'distal_top', 'donor.distaltop') - for key in ('name', 'rank'): + for key in 'name', 'rank': get_config(self, f'donor_{key}', f'donor.{key}') # convert boolean values - for key in ('weighted', 'orphans', 'self_low', 'donor_name'): + for key in 'weighted', 'orphans', 'self_low', 'donor_name': setattr(self, key, arg2bool(getattr(self, key, None))) # convert fractions to percentages - for metric in ('input_cov', 'noise', 'fixed', 'distal_top'): + for metric in 'input_cov', 'noise', 'fixed', 'distal_top': val = getattr(self, metric) if val and val < 1: setattr(self, metric, val * 100) @@ -470,7 +470,7 @@ def define_groups(self): 3. `groups` (keys: self, close, distal): all taxIds under each group. 
""" self.groups = {} - for key in ('self', 'close'): + for key in 'self', 'close': tids = getattr(self, f'{key}_tax') # user-defined group diff --git a/hgtector/database.py b/hgtector/database.py index 49e732c..bd274a3 100644 --- a/hgtector/database.py +++ b/hgtector/database.py @@ -175,17 +175,17 @@ def set_parameters(self, args): setattr(self, key, val) # load configurations - for key in ('capital', 'block', 'latin'): + for key in 'capital', 'block', 'latin': get_config(self, key, f'taxonomy.{key}') - for key in ('retries', 'delay', 'timeout'): + for key in 'retries', 'delay', 'timeout': get_config(self, key, f'download.{key}') - for key in ('diamond', 'makeblastdb'): + for key in 'diamond', 'makeblastdb': get_config(self, key, f'program.{key}') - for key in ('threads', 'tmpdir'): + for key in 'threads', 'tmpdir': get_config(self, key, f'local.{key}') # convert boolean values - for key in ('capital', 'latin'): + for key in 'capital', 'latin': setattr(self, key, arg2bool(getattr(self, key, None))) # make temporary directory diff --git a/hgtector/search.py b/hgtector/search.py index 7dee98e..f07de07 100644 --- a/hgtector/search.py +++ b/hgtector/search.py @@ -248,9 +248,9 @@ def args_wf(self, args): # load search parameters get_config(self, 'evalue', 'search.evalue', float) - for key in ('method', 'minsize', 'maxseqs', 'identity', 'coverage'): + for key in 'method', 'minsize', 'maxseqs', 'identity', 'coverage': get_config(self, key, f'search.{key}') - for key in ('diamond', 'blastp', 'blastdbcmd'): + for key in 'diamond', 'blastp', 'blastdbcmd': get_config(self, key, f'program.{key}') if self.method not in {'auto', 'diamond', 'blast', 'remote', @@ -295,12 +295,12 @@ def args_wf(self, args): self.method = 'remote' # load method-specific arguments - for key in ('queries', 'maxchars', 'extrargs'): + for key in 'queries', 'maxchars', 'extrargs': get_config(self, key, f'{self.method}.{key}') # load remote search settings if self.method == 'remote': - for key in ('db', 'algorithm', 'delay', 'timeout', 'entrez'): + for key in 'db', 'algorithm', 'delay', 'timeout', 'entrez': get_config(self, key, f'remote.{key}') get_config(self, 'server', 'server.search') @@ -336,7 +336,7 @@ def args_wf(self, args): self.prot2tid = {} # assign taxonomy database - for key in ('taxdump', 'taxmap'): + for key in 'taxdump', 'taxmap': get_config(self, key, f'database.{key}') if self.method != 'remote': @@ -350,7 +350,7 @@ def args_wf(self, args): raise ValueError( f'Invalid taxonomy database directory: {self.taxdump}.') else: - for fname in ('names.dmp', 'nodes.dmp'): + for fname in 'names.dmp', 'nodes.dmp': if not isfile(join(self.taxdump, fname)): raise ValueError( f'Taxonomy database file {fname} is not found.') @@ -361,13 +361,13 @@ def args_wf(self, args): f'Invalid protein-to-taxId map: {self.taxmap}.') # load taxonomic filters and convert to lists - for key in ('include', 'exclude', 'block'): + for key in 'include', 'exclude', 'block': attr = f'tax_{key}' get_config(self, attr, f'taxonomy.{key}') setattr(self, attr, list_from_param(getattr(self, attr))) # load taxonomy switches - for key in ('unique', 'unirank', 'capital', 'latin'): + for key in 'unique', 'unirank', 'capital', 'latin': get_config(self, f'tax_{key}', f'taxonomy.{key}') """determine self-alignment strategy""" @@ -410,7 +410,7 @@ def args_wf(self, args): # load configurations get_config(self, 'fetch_server', 'server.fetch') - for key in ('enable', 'queries', 'retries', 'delay', 'timeout'): + for key in 'enable', 'queries', 'retries', 'delay', 
'timeout':
             get_config(self, f'fetch_{key}', f'fetch.{key}')
 
         # determine remote or local fetching
@@ -421,11 +421,11 @@ def args_wf(self, args):
         """final steps"""
 
         # convert boolean values
-        for key in ('tax_unique', 'tax_capital', 'tax_latin'):
+        for key in 'tax_unique', 'tax_capital', 'tax_latin':
             setattr(self, key, arg2bool(getattr(self, key, None)))
 
         # convert fractions to percentages
-        for metric in ('identity', 'coverage'):
+        for metric in 'identity', 'coverage':
             val = getattr(self, metric)
             if val and val < 1:
                 setattr(self, metric, val * 100)
@@ -1417,7 +1417,7 @@ def parse_fasta_xml(xml):
     for m in re.finditer(r'<TSeq>(.+?)<\/TSeq>', xml, re.DOTALL):
         s_ = m.group(1)
         seq = []
-        for key in (('accver', 'taxid', 'defline', 'sequence')):
+        for key in 'accver', 'taxid', 'defline', 'sequence':
             m_ = re.search(r'<TSeq_%s>(.+)<\/TSeq_%s>' % (key, key), s_)
             seq.append(m_.group(1) if m_ else '')
         seq[2] = get_product(seq[2])
diff --git a/hgtector/tests/test_database.py b/hgtector/tests/test_database.py
index 1fc860a..9642b75 100644
--- a/hgtector/tests/test_database.py
+++ b/hgtector/tests/test_database.py
@@ -112,7 +112,7 @@ def test_identify_taxonomy(self):
             ('Plasmid pPY113', '126792', '', ''))
         df = pd.DataFrame(data, columns=header)
 
-        # organism names must be capital and latinate
+        # organism names must be capital and Latinate
         me.capital = True
         me.block = None
         me.latin = True
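As a closing illustration of the revised Latinate-name rule from patch 10: `is_latin` now accepts binomials whose genus is written in brackets, as found in NCBI taxonomy (e.g. `[Clostridium] difficile`), while still rejecting non-binomial or strain-decorated names. A small usage sketch, assuming HGTector is installed so that `hgtector.util` is importable:

```python
from hgtector.util import is_latin

# Latinate binomials pass, including a bracketed genus
print(is_latin('Escherichia coli'))          # True
print(is_latin('[Clostridium] difficile'))   # True (newly accepted by patch 10)

# non-binomial or decorated names are still rejected
print(is_latin('Enterobacteriaceae'))        # False: single word
print(is_latin('Escherichia coli O157:H7'))  # False: extra strain designation
print(is_latin('Citrobacter sp. A293'))      # False: unnamed species placeholder
```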