Skip to content

Commit

Permalink
Merge pull request #83 from qiyunlab/master
Browse files Browse the repository at this point in the history
Update the 2.0b3 branch
  • Loading branch information
qiyunzhu authored Nov 20, 2021
2 parents 32a4646 + 49a4894 commit 1101fd1
Show file tree
Hide file tree
Showing 10 changed files with 151 additions and 74 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Change Log

### Added
- Predicted HGT list now includes potential donors. Users can optionally specify a taxonomic rank at which donors will be reported.
- A quick-start guide added to the homepage.

### Changed
- Repository transferred from [DittmarLab](https://github.com/DittmarLab) to [qiyunlab](https://github.com/qiyunlab).
Expand Down
43 changes: 43 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,53 @@ References
- [Configure](doc/config.md)


## Quick start

Set up a Conda environment and install dependencies:

```bash
conda create -n hgtector python=3 pyyaml pandas matplotlib scikit-learn bioconda::diamond
conda activate hgtector
```

Install HGTector2:

```bash
pip install git+https://github.com/qiyunlab/HGTector.git
```

Build a reference database using the default protocol:

```bash
hgtector database -o db_dir --default
```

This will retrieve the latest genomic data from NCBI. If this does not work (e.g., due to network issues), or you need some customization, please read the [database](doc/database.md) page.

Prepare input file(s). They should be multi-FASTA files of amino acid sequences (`.faa`). Each file represents the whole protein set of a complete or partial genome.

Perform homology [search](doc/search.md):

```bash
hgtector search -i input.faa -o search_dir -m diamond -p 16 -d db_dir/diamond/db -t db_dir/taxdump
```

Perform HGT [prediction](doc/analyze.md):

```bash
hgtector analyze -i search_dir -o analyze_dir -t db_dir/taxdump
```

Examine the prediction results under the `analyze_dir` directory.

It is recommended that you read the [first run](doc/1strun.md), [second run](doc/2ndrun.md) and [real runs](doc/realrun.md) pages to get familiar with the pipeline, the underlying methodology, and the customization options.


## License

Copyright (c) 2013-2020, [Qiyun Zhu](mailto:[email protected]) and [Katharina Dittmar](mailto:[email protected]). Licensed under [BSD 3-clause](http://opensource.org/licenses/BSD-3-Clause). See full license [statement](LICENSE).


## Citation

> Zhu Q, Kosoy M, Dittmar K. HGTector: [an automated method facilitating genome-wide discovery of putative horizontal gene transfers](https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-15-717). *BMC Genomics*. 2014. 15:717.
7 changes: 4 additions & 3 deletions doc/2ndrun.md
Original file line number Diff line number Diff line change
Expand Up @@ -374,11 +374,12 @@ import pandas as pd
import matplotlib.pyplot as plt

# read prediction result
hgts = pd.read_csv('hgts/o55h7.txt', sep='\t', names=['silh'], squeeze=True)
hgts = pd.read_csv('hgts/o55h7.txt', sep='\t', names=['silh', 'donor'])

# bar plot
fig = plt.figure(figsize=(5, 5))
plt.barh(range(len(hgts)), hgts.sort_values())
silhs = hgts['silh'].sort_values()
plt.barh(range(len(silhs)), silhs)
plt.xlim(left=0.5)
plt.xlabel('Silhouette score')
plt.ylabel('Gene')
Expand All @@ -402,7 +403,7 @@ df = df.query('close + distal > 0')
df = df[(zscore(df[['close', 'distal']]) < 3).all(axis=1)]

# append silhouette scores
df['silh'] = df['protein'].map(hgts)
df['silh'] = df['protein'].map(silhs)

# scatter plot
fig = plt.figure(figsize=(5, 5))
Expand Down
12 changes: 6 additions & 6 deletions hgtector/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,23 +205,23 @@ def set_parameters(self, args):

# load configurations
get_config(self, 'evalue', 'analyze.evalue', float)
for key in ('maxhits', 'identity', 'coverage'):
for key in 'maxhits', 'identity', 'coverage':
get_config(self, key, f'analyze.{key}')
for key in ('input_cov', 'self_rank', 'close_size'):
for key in 'input_cov', 'self_rank', 'close_size':
get_config(self, key, f'grouping.{key.replace("_", "")}')
for key in ('weighted', 'outliers', 'orphans', 'bandwidth', 'bw_steps',
'low_part', 'noise', 'fixed', 'silhouette', 'self_low'):
get_config(self, key, f'predict.{key.replace("_", "")}')
get_config(self, 'distal_top', 'donor.distaltop')
for key in ('name', 'rank'):
for key in 'name', 'rank':
get_config(self, f'donor_{key}', f'donor.{key}')

# convert boolean values
for key in ('weighted', 'orphans', 'self_low', 'donor_name'):
for key in 'weighted', 'orphans', 'self_low', 'donor_name':
setattr(self, key, arg2bool(getattr(self, key, None)))

# convert fractions to percentages
for metric in ('input_cov', 'noise', 'fixed', 'distal_top'):
for metric in 'input_cov', 'noise', 'fixed', 'distal_top':
val = getattr(self, metric)
if val and val < 1:
setattr(self, metric, val * 100)
Expand Down Expand Up @@ -470,7 +470,7 @@ def define_groups(self):
3. `groups` (keys: self, close, distal): all taxIds under each group.
"""
self.groups = {}
for key in ('self', 'close'):
for key in 'self', 'close':
tids = getattr(self, f'{key}_tax')

# user-defined group
Expand Down
1 change: 1 addition & 0 deletions hgtector/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ download:
delay: 10 # seconds between retries
timeout: 60 # seconds before program gives up waiting


## Taxonomic filtering
taxonomy:

Expand Down
119 changes: 72 additions & 47 deletions hgtector/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@
{'action': 'store_true'}],
['--representative', 'include NCBI-defined representative genomes',
{'action': 'store_true'}],
['--typematerial', 'include NCBI-defined type material genomes',
{'action': 'store_true'}],

'taxonomic filter',
['--capital', 'organism name must be capitalized',
Expand Down Expand Up @@ -173,17 +175,17 @@ def set_parameters(self, args):
setattr(self, key, val)

# load configurations
for key in ('capital', 'block', 'latin'):
for key in 'capital', 'block', 'latin':
get_config(self, key, f'taxonomy.{key}')
for key in ('retries', 'delay', 'timeout'):
for key in 'retries', 'delay', 'timeout':
get_config(self, key, f'download.{key}')
for key in ('diamond', 'makeblastdb'):
for key in 'diamond', 'makeblastdb':
get_config(self, key, f'program.{key}')
for key in ('threads', 'tmpdir'):
for key in 'threads', 'tmpdir':
get_config(self, key, f'local.{key}')

# convert boolean values
for key in ('capital', 'latin'):
for key in 'capital', 'latin':
setattr(self, key, arg2bool(getattr(self, key, None)))

# make temporary directory
Expand All @@ -202,12 +204,6 @@ def set_parameters(self, args):
raise ValueError(
f'Invalid {exe} executable: {getattr(self, exe)}.')

# determine number of CPUs to use
if self.compile in ('diamond', 'both') and not self.threads:
self.threads = cpu_count()
if self.threads is None:
self.threads = 1

# default protocol
if self.default:
print('The default protocol is selected for database building.')
Expand All @@ -220,7 +216,22 @@ def set_parameters(self, args):
self.rank = 'species'
self.reference = True
self.representative = True
self.compile = 'diamond'

if self.diamond is None:
self.diamond = 'diamond'
if which(self.diamond) is None:
print('WARNING: Cannot find DIAMOND in this computer. '
'You will need to manually compile the database '
'after download is complete.')
self.compile = 'none'
else:
self.compile = 'diamond'

# determine number of CPUs to use
if self.compile in ('diamond', 'both') and not self.threads:
self.threads = cpu_count()
if self.threads is None:
self.threads = 1

makedirs(self.output, exist_ok=True)

Expand All @@ -231,7 +242,6 @@ def connect_server(self):
makedirs(join(self.output, 'download'), exist_ok=True)
self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout)
self.ftp.login()
# self.ftp.set_pasv(False)
print(' done.')

def retrieve_taxdump(self):
Expand Down Expand Up @@ -276,7 +286,7 @@ def get_summary(target):

# read summary
print(f'Reading {target} assembly summary...', end='', flush=True)
df = pd.read_csv(local_file, sep='\t', skiprows=1)
df = pd.read_table(local_file, skiprows=1, low_memory=False)
print(' done.')
return df

Expand Down Expand Up @@ -469,41 +479,48 @@ def sample_by_taxonomy(self):
self.df[self.rank] = self.df['taxid'].apply(
taxid_at_rank, rank=self.rank, taxdump=self.taxdump)

# list taxonomic groups at rank
taxa = self.df[self.rank].dropna().unique().tolist()
n = len(taxa)
if n == 0:
raise ValueError(f'No genome is classified at rank "{self.rank}".')
print(f'Total number of taxonomic groups at {self.rank}: {n}.')
# sort by reference > representative > type material > other
self.df['rc_seq'] = self.df.apply(
lambda x: 0 if x['refseq_category'] == 'reference genome'
else (1 if x['refseq_category'] == 'representative genome'
else (2 if pd.notnull(x['relation_to_type_material'])
else 3)), axis=1)

# custom sorting orders
self.df['rc_seq'] = self.df['refseq_category'].map(
{'reference genome': 0, 'representative genome': 1})
# sort by complete > scaffold > contig
self.df['al_seq'] = self.df['assembly_level'].map(
{'Chromosome': 0, 'Complete Genome': 0, 'Scaffold': 1,
'Contig': 2})

# sample genomes per taxonomic group
selected = []
for taxon in taxa:
# select genomes under this taxon
df_ = self.df.query(f'{self.rank} == "{taxon}"')
# sort genomes by three criteria
df_ = df_.sort_values(by=['rc_seq', 'al_seq', 'genome'])
# take up to given number of genomes from top
df_ = df_.head(min(self.sample, df_.shape[0]))
selected.extend(df_['genome'].tolist())
selected = set(selected)

# add reference / representative
for key in ('reference', 'representative'):
# sort genomes by three criteria
self.df.sort_values(by=['rc_seq', 'al_seq', 'genome'], inplace=True)

# select up to given number of genomes of each taxonomic group
selected = set(self.df.groupby(self.rank).head(self.sample)['genome'])
if not selected:
raise ValueError(f'No genome is classified at rank "{self.rank}".')

# add reference / representative genomes
for key in 'reference', 'representative':
if getattr(self, key):
print(f'Add {key} genomes back to selection.')
print(f'Add {key} genomes to selection.')
selected.update(self.df.query(
f'refseq_category == "{key} genome"')['genome'].tolist())

self.df = self.df[self.df['genome'].isin(selected)]
print(f'Total number of sampled genomes: {self.df.shape[0]}.')
# add type material genomes
if self.typematerial:
print('Add type material genomes to selection.')
selected.update(self.df[self.df[
'relation_to_type_material'].notna()]['genome'].tolist())

# filter genomes to selected
self.df.query('genome in @selected', inplace=True)
n = self.df.shape[0]
if n == 0:
raise ValueError('No genome is retained after sampling.')
print(f'Total number of sampled genomes: {n}.')

# sort by genome ID
self.df.sort_values('genome', inplace=True)

# clean up temporary columns
self.df.drop(columns=['al_seq', 'rc_seq'], inplace=True)
Expand All @@ -512,7 +529,6 @@ def download_genomes(self):
"""Download genomes from NCBI.
"""
# reconnect to avoid server timeout problem
# TODO: replace this ugly hack with a more stable solution
self.ftp = ftplib.FTP('ftp.ncbi.nlm.nih.gov', timeout=self.timeout)
self.ftp.login()
self.ftp.cwd('/genomes/all')
Expand Down Expand Up @@ -547,6 +563,13 @@ def download_genomes(self):
except ftplib.error_temp:
sleep(self.delay)
continue
except EOFError:
sleep(self.delay)
self.ftp = ftplib.FTP(
'ftp.ncbi.nlm.nih.gov', timeout=self.timeout)
self.ftp.login()
self.ftp.cwd('/genomes/all')
continue
else:
break
if not success:
Expand Down Expand Up @@ -585,12 +608,15 @@ def write_prot():
g2n[g], g2aa[g] = 0, 0
stem = row.ftp_path.rsplit('/', 1)[-1]
fp = join(dir_, f'{stem}_protein.faa.gz')
try:
fin = gzip.open(fp, 'rt')
except TypeError:
fin = gzip.open(fp, 'r')
with gzip.open(fp, 'rb') as f:
try:
content = f.read().decode().splitlines()
except (OSError, EOFError, TypeError):
print(f' skipping corrupted file {stem}.', end='',
flush=True)
continue
cp = None
for line in fin:
for line in content:
line = line.rstrip('\r\n')
if line.startswith('>'):
write_prot()
Expand All @@ -609,7 +635,6 @@ def write_prot():
line = line.rstrip('*')
prots[cp]['seq'] += line
g2aa[g] += len(line)
fin.close()
write_prot()
fout.close()
print(' done.')
Expand Down
Loading

0 comments on commit 1101fd1

Please sign in to comment.