Skip to content

Commit

Permalink
Merge pull request #83 from qiyunlab/master
Browse files Browse the repository at this point in the history
Update the 2.0b3 branch
  • Loading branch information
qiyunzhu authored Nov 20, 2021
2 parents 32a4646 + 49a4894 commit 1101fd1
Show file tree
Hide file tree
Showing 10 changed files with 151 additions and 74 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Change Log

### Added
- Predicted HGT list now includes potential donors. Users can optionally specify a taxonomic rank at which donors will be reported.
- A quick-start guide added to the homepage.

### Changed
- Repository transferred from [DittmarLab](https://github.com/DittmarLab) to [qiyunlab](https://github.com/qiyunlab).
Expand Down
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,53 @@ References
- [Configure](doc/config.md)


## Quick start

Set up a Conda environment and install dependencies:

```bash
conda create -n hgtector python=3 pyyaml pandas matplotlib scikit-learn bioconda::diamond
conda activate hgtector
```

Install HGTector2:

```bash
pip install git+https://github.com/qiyunlab/HGTector.git
```

Build a reference database using the default protocol:

```bash
hgtector database -o db_dir --default
```

This will retrieve the latest genomic data from NCBI. If this does not work (e.g., due to network issues), or you need some customization, please read the [database](doc/database.md) page.

Prepare input file(s). They should be multi-FASTA files of amino acid sequences (`.faa`). Each file represents the whole protein set of a complete or partial genome.

Perform homology [search](doc/search.md):

```bash
hgtector search -i input.faa -o search_dir -m diamond -p 16 -d db_dir/diamond/db -t db_dir/taxdump
```

Perform HGT [prediction](doc/analyze.md):

```bash
hgtector analyze -i search_dir -o analyze_dir -t db_dir/taxdump
```

Examine the prediction results under the `analyze_dir` directory.

It is recommended that you read the [first run](doc/1strun.md), [second run](doc/2ndrun.md) and [real runs](doc/realrun.md) pages to get familiar with the pipeline, the underlying methodology, and the customization options.


## License

Copyright (c) 2013-2020, [Qiyun Zhu](mailto:[email protected]) and [Katharina Dittmar](mailto:[email protected]). Licensed under [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause). See full license [statement](LICENSE).


## Citation

> Zhu Q, Kosoy M, Dittmar K. HGTector: [an automated method facilitating genome-wide discovery of putative horizontal gene transfers](https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-15-717). *BMC Genomics*. 2014. 15:717.
7 changes: 4 additions & 3 deletions doc/2ndrun.md
Original file line number Diff line number Diff line change
Expand Up @@ -374,11 +374,12 @@ import pandas as pd
import matplotlib.pyplot as plt

# read prediction result
hgts = pd.read_csv('hgts/o55h7.txt', sep='\t', names=['silh'], squeeze=True)
hgts = pd.read_csv('hgts/o55h7.txt', sep='\t', names=['silh', 'donor'])

# bar plot
fig = plt.figure(figsize=(5, 5))
plt.barh(range(len(hgts)), hgts.sort_values())
silhs = hgts['silh'].sort_values()
plt.barh(range(len(silhs)), silhs)
plt.xlim(left=0.5)
plt.xlabel('Silhouette score')
plt.ylabel('Gene')
Expand All @@ -402,7 +403,7 @@ df = df.query('close + distal > 0')
df = df[(zscore(df[['close', 'distal']]) < 3).all(axis=1)]

# append silhouette scores
df['silh'] = df['protein'].map(hgts)
df['silh'] = df['protein'].map(silhs)

# scatter plot
fig = plt.figure(figsize=(5, 5))
Expand Down
12 changes: 6 additions & 6 deletions hgtector/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,23 +205,23 @@ def set_parameters(self, args):

# load configurations
get_config(self, 'evalue', 'analyze.evalue', float)
for key in ('maxhits', 'identity', 'coverage'):
for key in 'maxhits', 'identity', 'coverage':
get_config(self, key, f'analyze.{key}')
for key in ('input_cov', 'self_rank', 'close_size'):
for key in 'input_cov', 'self_rank', 'close_size':
get_config(self, key, f'grouping.{key.replace("_", "")}')
for key in ('weighted', 'outliers', 'orphans', 'bandwidth', 'bw_steps',
'low_part', 'noise', 'fixed', 'silhouette', 'self_low'):
get_config(self, key, f'predict.{key.replace("_", "")}')
get_config(self, 'distal_top', 'donor.distaltop')
for key in ('name', 'rank'):
for key in 'name', 'rank':
get_config(self, f'donor_{key}', f'donor.{key}')

# convert boolean values
for key in ('weighted', 'orphans', 'self_low', 'donor_name'):
for key in 'weighted', 'orphans', 'self_low', 'donor_name':
setattr(self, key, arg2bool(getattr(self, key, None)))

# convert fractions to percentages
for metric in ('input_cov', 'noise', 'fixed', 'distal_top'):
for metric in 'input_cov', 'noise', 'fixed', 'distal_top':
val = getattr(self, metric)
if val and val < 1:
setattr(self, metric, val * 100)
Expand Down Expand Up @@ -470,7 +470,7 @@ def define_groups(self):
3. `groups` (keys: self, close, distal): all taxIds under each group.
"""
self.groups = {}
for key in ('self', 'close'):
for key in 'self', 'close':
tids = getattr(self, f'{key}_tax')

# user-defined group
Expand Down
1 change: 1 addition & 0 deletions hgtector/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ download:
delay: 10 # seconds between retries
timeout: 60 # seconds before program gives up waiting


## Taxonomic filtering
taxonomy:

Expand Down
119 changes: 72 additions & 47 deletions hgtector/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
{'action': 'store_true'}],
['--representative', 'include NCBI-defined representative genomes',
{'action': 'store_true'}],
['--typematerial', 'include NCBI-defined type material genomes',
{'action': 'store_true'}],

'taxonomic filter',
['--capital', 'organism name must be capitalized',
Expand Down Expand Up @@ -173,17 +175,17 @@ def set_parameters(self, args):
setattr(self, key, val)

# load configurations
for key in ('capital', 'block', 'latin'):
for key in 'capital', 'block', 'latin':
get_config(self, key, f'taxonomy.{key}')
for key in ('retries', 'delay', 'timeout'):
for key in 'retries', 'delay', 'timeout':
get_config(self, key, f'download.{key}')
for key in ('diamond', 'makeblastdb'):
for key in 'diamond', 'makeblastdb':
get_config(self, key, f'program.{key}')
for key in ('threads', 'tmpdir'):
for key in 'threads', 'tmpdir':
get_config(self, key, f'local.{key}')

# convert boolean values
for key in ('capital', 'latin'):
for key in 'capital', 'latin':
setattr(self, key, arg2bool(getattr(self, key, None)))

# make temporary directory
Expand All @@ -202,12 +204,6 @@ def set_parameters(self, args):
raise ValueError(
f'Invalid {exe} executable: {getattr(self, exe)}.')

# determine number of CPUs to use
if self.compile in ('diamond', 'both') and not self.threads:
self.threads = cpu_count()
if self.threads is None:
self.threads = 1

# default protocol
if self.default:
print('The default protocol is selected for database building.')
Expand All @@ -220,7 +216,22 @@ def set_parameters(self, args):
self.rank = 'species'
self.reference = True
self.representative = True
self.compile = 'diamond'

if self.diamond is None:
self.diamond = 'diamond'
if which(self.diamond) is None:
print('WARNING: Cannot find DIAMOND in this computer. '
'You will need to manually compile the database '
'after download is complete.')
self.compile = 'none'
else:
self.compile = 'diamond'

# determine number of CPUs to use
if self.compile in ('diamond', 'both') and not self.threads:
self.threads = cpu_count()
if self.threads is None:
self.threads = 1

makedirs(self.output, exist_ok=True)

Expand All @@ -231,7 +242,6 @@ def connect_server(self):
makedirs(join(self.output, 'download'), exist_ok=True)
self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout)
self.ftp.login()
# self.ftp.set_pasv(False)
print(' done.')

def retrieve_taxdump(self):
Expand Down Expand Up @@ -276,7 +286,7 @@ def get_summary(target):

# read summary
print(f'Reading {target} assembly summary...', end='', flush=True)
df = pd.read_csv(local_file, sep='\t', skiprows=1)
df = pd.read_table(local_file, skiprows=1, low_memory=False)
print(' done.')
return df

Expand Down Expand Up @@ -469,41 +479,48 @@ def sample_by_taxonomy(self):
self.df[self.rank] = self.df['taxid'].apply(
taxid_at_rank, rank=self.rank, taxdump=self.taxdump)

# list taxonomic groups at rank
taxa = self.df[self.rank].dropna().unique().tolist()
n = len(taxa)
if n == 0:
raise ValueError(f'No genome is classified at rank "{self.rank}".')
print(f'Total number of taxonomic groups at {self.rank}: {n}.')
# sort by reference > representative > type material > other
self.df['rc_seq'] = self.df.apply(
lambda x: 0 if x['refseq_category'] == 'reference genome'
else (1 if x['refseq_category'] == 'representative genome'
else (2 if pd.notnull(x['relation_to_type_material'])
else 3)), axis=1)

# custom sorting orders
self.df['rc_seq'] = self.df['refseq_category'].map(
{'reference genome': 0, 'representative genome': 1})
# sort by complete > scaffold > contig
self.df['al_seq'] = self.df['assembly_level'].map(
{'Chromosome': 0, 'Complete Genome': 0, 'Scaffold': 1,
'Contig': 2})

# sample genomes per taxonomic group
selected = []
for taxon in taxa:
# select genomes under this taxon
df_ = self.df.query(f'{self.rank} == "{taxon}"')
# sort genomes by three criteria
df_ = df_.sort_values(by=['rc_seq', 'al_seq', 'genome'])
# take up to given number of genomes from top
df_ = df_.head(min(self.sample, df_.shape[0]))
selected.extend(df_['genome'].tolist())
selected = set(selected)

# add reference / representative
for key in ('reference', 'representative'):
# sort genomes by three criteria
self.df.sort_values(by=['rc_seq', 'al_seq', 'genome'], inplace=True)

# select up to given number of genomes of each taxonomic group
selected = set(self.df.groupby(self.rank).head(self.sample)['genome'])
if not selected:
raise ValueError(f'No genome is classified at rank "{self.rank}".')

# add reference / representative genomes
for key in 'reference', 'representative':
if getattr(self, key):
print(f'Add {key} genomes back to selection.')
print(f'Add {key} genomes to selection.')
selected.update(self.df.query(
f'refseq_category == "{key} genome"')['genome'].tolist())

self.df = self.df[self.df['genome'].isin(selected)]
print(f'Total number of sampled genomes: {self.df.shape[0]}.')
# add type material genomes
if self.typematerial:
print('Add type material genomes to selection.')
selected.update(self.df[self.df[
'relation_to_type_material'].notna()]['genome'].tolist())

# filter genomes to selected
self.df.query('genome in @selected', inplace=True)
n = self.df.shape[0]
if n == 0:
raise ValueError('No genome is retained after sampling.')
print(f'Total number of sampled genomes: {n}.')

# sort by genome ID
self.df.sort_values('genome', inplace=True)

# clean up temporary columns
self.df.drop(columns=['al_seq', 'rc_seq'], inplace=True)
Expand All @@ -512,7 +529,6 @@ def download_genomes(self):
"""Download genomes from NCBI.
"""
# reconnect to avoid server timeout problem
# TODO: replace this ugly hack with a more stable solution
self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout)
self.ftp.login()
self.ftp.cwd('/genomes/all')
Expand Down Expand Up @@ -547,6 +563,13 @@ def download_genomes(self):
except ftplib.error_temp:
sleep(self.delay)
continue
except EOFError:
sleep(self.delay)
self.ftp = ftplib.FTP(
'ftp.ncbi.nlm.nih.gov', timeout=self.timeout)
self.ftp.login()
self.ftp.cwd('/genomes/all')
continue
else:
break
if not success:
Expand Down Expand Up @@ -585,12 +608,15 @@ def write_prot():
g2n[g], g2aa[g] = 0, 0
stem = row.ftp_path.rsplit('/', 1)[-1]
fp = join(dir_, f'{stem}_protein.faa.gz')
try:
fin = gzip.open(fp, 'rt')
except TypeError:
fin = gzip.open(fp, 'r')
with gzip.open(fp, 'rb') as f:
try:
content = f.read().decode().splitlines()
except (OSError, EOFError, TypeError):
print(f' skipping corrupted file {stem}.', end='',
flush=True)
continue
cp = None
for line in fin:
for line in content:
line = line.rstrip('\r\n')
if line.startswith('>'):
write_prot()
Expand All @@ -609,7 +635,6 @@ def write_prot():
line = line.rstrip('*')
prots[cp]['seq'] += line
g2aa[g] += len(line)
fin.close()
write_prot()
fout.close()
print(' done.')
Expand Down
Loading

0 comments on commit 1101fd1

Please sign in to comment.