Skip to content

Commit

Permalink
Updates made during final curation and validation of R202.
Browse files Browse the repository at this point in the history
  • Loading branch information
donovan-h-parks committed Apr 8, 2021
1 parent d7ac715 commit c738f19
Show file tree
Hide file tree
Showing 10 changed files with 331 additions and 169 deletions.
24 changes: 13 additions & 11 deletions bin/gtdb_species_clusters
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,9 @@ def print_help():
Post-manual curation
pmc_manual_species -> Identify species names manually set by curators
pmc_replace_generic -> Replace generic names with genus assignment
pmc_species_names -> Establish final species names based on manual curation
pmc_check_type_species -> Check for agreement between GTDB genera and genomes assembled from type species of genus
pmc_check_type_strains -> Check for agreement between GTDB species and genomes assembled from type strain of species
pmc_species_names -> Establish final species names based on manual curation
pmc_validate -> Validate final species names
pmc_name_stats -> [TBD] Summary statistics indicating changes to GTDB species cluster names
pmc_cluster_stats -> Calculate final statistics for species clusters (???)
Expand Down Expand Up @@ -628,7 +628,9 @@ if __name__ == '__main__':
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Check for agreement between GTDB genera and genomes assembled from type species of genus.')
pmc_check_type_species_parser.add_argument(
'manual_taxonomy', help="file with manually curated taxonomy (u_replace_generic: taxonomy_updated_sp.tsv)")
'manual_taxonomy', help="file with manually curated taxonomy (pmc_species_names: final_taxonomy.tsv)")
pmc_check_type_species_parser.add_argument(
'cur_gtdb_metadata_file', help="file with GTDB metadata for current GTDB release (TSV file)")
pmc_check_type_species_parser.add_argument(
'qc_passed_file', help="file indicating genomes that have passed QC (u_qc_genomes: qc_passed.tsv)")
pmc_check_type_species_parser.add_argument(
Expand All @@ -653,7 +655,7 @@ if __name__ == '__main__':
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Check for agreement between GTDB species and genomes assembled from type strain of species.')
pmc_check_type_strains_parser.add_argument(
'manual_taxonomy', help="file with manually curated taxonomy (u_replace_generic: taxonomy_updated_sp.tsv)")
'manual_taxonomy', help="file with manually curated taxonomy (pmc_species_names: final_taxonomy.tsv)")
pmc_check_type_strains_parser.add_argument(
'cur_gtdb_metadata_file', help="file with GTDB metadata for current GTDB release (TSV file)")
pmc_check_type_strains_parser.add_argument(
Expand All @@ -676,9 +678,9 @@ if __name__ == '__main__':
pmc_species_names_parser.add_argument(
'curation_tree', help="manually curated tree to relabel with final species names")
pmc_species_names_parser.add_argument(
'manual_taxonomy', help="file with manually curated taxonomy (u_replace_generic: taxonomy_updated_sp.tsv)")
'manual_sp_names', help="file indicating species names explicitly set by manually curated (pmc_manual_species: manual_species_names.tsv)")
pmc_species_names_parser.add_argument(
'manual_sp_names', help="file indicating species names explicitly set by manually curated (u_replace_generic: manual_species_names.tsv)")
'manual_taxonomy', help="file with manually curated taxonomy (pmc_replace_generic: taxonomy_updated_sp.tsv)")
pmc_species_names_parser.add_argument(
'pmc_custom_species', help="file indicating post-curation, manual species assignments that are hard to automatically generate")
pmc_species_names_parser.add_argument(
Expand All @@ -690,7 +692,7 @@ if __name__ == '__main__':
pmc_species_names_parser.add_argument(
'qc_passed_file', help="file indicating genomes that have passed QC (u_qc_genomes: qc_passed.tsv)")
pmc_species_names_parser.add_argument(
'ncbi_misclassified_file', help="file indicating genomes with erroneous NCBI species assignments (u_ncbi_erroneous: ncbi_missclassified_sp.tsv)")
'ncbi_misclassified_file', help="file indicating genomes with erroneous NCBI species assignments (u_ncbi_erroneous: ncbi_misclassified_sp.gtdb_clustering.tsv)")
pmc_species_names_parser.add_argument('ncbi_genbank_assembly_file',
help="NCBI GenBank assembly file indicating potentially erroneous genomes")
pmc_species_names_parser.add_argument(
Expand Down Expand Up @@ -721,10 +723,10 @@ if __name__ == '__main__':
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Validate final species names.')
pmc_validate_parser.add_argument(
'final_taxonomy', help="file with final taxonomy (u_replace_generic: final_taxonomy.tsv)")
pmc_validate_parser.add_argument('final_scaled_tree', help="Newick file with scaled curation tree")
'manual_sp_names', help="file indicating species names explicitly set by manually curated (pmc_manual_species: manual_species_names.tsv)")
pmc_validate_parser.add_argument(
'manual_sp_names', help="file indicating species names explicitly set by manually curated (u_replace_generic: manual_species_names.tsv)")
'final_taxonomy', help="file with final taxonomy (pmc_species_names: final_taxonomy.tsv)")
pmc_validate_parser.add_argument('final_scaled_tree', help="Newick file with scaled curation tree")
pmc_validate_parser.add_argument(
'pmc_custom_species', help="file indicating post-curation, manual species assignments that are hard to automatically generate")
pmc_validate_parser.add_argument(
Expand Down Expand Up @@ -759,8 +761,8 @@ if __name__ == '__main__':
'ncbi_env_bioproject_ledger', help="file indicating NCBI BioProjects composed on MAGs or SAGs incorrectly annotated at NCBI")
pmc_validate_parser.add_argument(
'lpsn_gss_metadata_file', help="file indicating GSS (genus-species-subspecies) metadata at LPSN (obtained from https://lpsn.dsmz.de/downloads)")
pmc_validate_parser.add_argument('lpsn_type_material_file',
help="file indicating type material of taxa (obtained from LPSN)")
pmc_validate_parser.add_argument('lpsn_data',
help="file with metadata from LPSN website (obtained by gtdb-migration-tk)")
pmc_validate_parser.add_argument(
'ground_truth_test_cases', help="file indicating specific test cases with ground truth assignments to validate")
pmc_validate_parser.add_argument('output_dir', help="output directory")
Expand Down
154 changes: 102 additions & 52 deletions gtdb_species_clusters/lpsn.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
###############################################################################

import os
import sys
import csv
import re
import datetime
Expand All @@ -29,22 +30,22 @@
@dataclass
class LPSN_Taxon(object):
"""Metadata for single LPSN taxon."""

# Ideally this would be a full record with the
# majority of data at LPSN. However, this information
# is not currently provided to the public except
# on the web pages. We have requested that a data file
# similar to the GSS file be provided.

name: str
priority_year: int
type_material: str
type_material_priority_year : int
type_material_priority_year: int


class LPSN(object):
"""LPSN metadata for taxa."""

def parse_lpsn_priority_year(reference_str):
"""Parse year of priority from LPSN reference string."""

Expand All @@ -53,68 +54,116 @@ def parse_lpsn_priority_year(reference_str):
years = re.sub(r'ex [^\d]*\d{4}', ' ', years)
years = re.findall('[1-3][0-9]{3}', years, re.DOTALL)
years = [int(y) for y in years if int(y) <= datetime.datetime.now().year]

if len(years) == 0:
# assume this name is validated through ICN and just take the first
# assume this name is validated through ICN and just take the first
# date given as the year of priority
years = re.findall('[1-3][0-9]{3}', references, re.DOTALL)
years = [int(y) for y in years if int(y) <= datetime.datetime.now().year]

return years[0]

def __init__(self, lpsn_type_material_file, lpsn_gss_metadata_file):
def __init__(self, lpsn_data, lpsn_gss_metadata_file):
"""Initialization."""

self.logger = logging.getLogger('timestamp')
if lpsn_type_material_file:
self.taxa = self.parse_lpsn_type_material_file(lpsn_type_material_file)

if lpsn_data:
self.taxa = self.parse_lpsn_data_file(lpsn_data)

if lpsn_gss_metadata_file:
self.sp_correct_names, self.sp_synonyms = self.parse_lpsn_gss_file(lpsn_gss_metadata_file)

def type_material(self, taxon):
"""Get type material of taxon."""

if taxon in self.taxa:
return self.taxa[taxon].type_material

return None
def parse_lpsn_type_material_file(self, lpsn_type_material_file):

def parse_lpsn_data_file(self, lpsn_data):
"""Parse type material for taxa as defined at LPSN."""


rank_prefixes = {'domain': 'd__',
'phylum': 'p__',
'class': 'c__',
'order': 'o__',
'family': 'f__',
'genus': 'g__',
'species': 's__'}

# family is never type material
type_prefixes = {0: 'c__', 1: 'o__', 2: 'g__', 3: 's__', 4: ''}

taxa = {}
with open(lpsn_type_material_file) as f:
f.readline()
with open(lpsn_data) as f:
header = f.readline().strip().split('\t')

rank_idx = header.index('Rank')
name_idx = header.index('Name')
priority_idx = header.index('Priority')
type_class_idx = header.index('Type class')
type_strain_idx = header.index('Type strain')

for line in f:
tokens = [t.strip() for t in line.split('\t')]

rank = tokens[0]
taxon = tokens[1].replace('Candidatus ', '')
type_taxon = tokens[2].replace('Candidatus ', '')

if type_taxon:
taxon_priority_year = LPSN.parse_lpsn_priority_year(tokens[3])

if rank in ['Species', 'Subspecies']:
# the type material for species and subspecies is a specific strain,
# and this strain has the same priority as the species or subspecies
type_taxon_priority_year = taxon_priority_year
else:
if tokens[4]:
type_taxon_priority_year = LPSN.parse_lpsn_priority_year(tokens[4])
else:
self.logger.warning('Type material has no priority information: {}'.format(type_taxon))
type_taxon_priority_year = Genome.NO_PRIORITY_YEAR

taxa[taxon] = LPSN_Taxon(taxon, taxon_priority_year, type_taxon, type_taxon_priority_year)


rank = tokens[rank_idx]
if rank == 'subspecies':
# not currently considering subspecies
continue

prefix = rank_prefixes[rank]
taxon = f"{prefix}{tokens[name_idx].replace('Candidatus ', '')}"

priority_citations = tokens[priority_idx]
if priority_citations.lower() == 'n/a':
continue
taxon_priority_year = LPSN.parse_lpsn_priority_year(priority_citations)

type_taxa = []
for idx, type_idx in enumerate(range(type_class_idx, type_strain_idx+1)):
type_taxon = tokens[type_idx]
if type_taxon.lower() != 'n/a':
type_taxa.append(f"{type_prefixes[idx]}{type_taxon.replace('Candidatus ', '')}")

if len(type_taxa) == 0:
# No type material for this LPSN taxon which occurs
# for placeholder taxa and candidatus taxa
type_taxa = ['n/a']
elif len(type_taxa) > 1:
self.logger.error(f'Identified multiple type taxa for {taxon}: {type_taxa}')
sys.exit(-1)

type_taxon = type_taxa[0]
taxa[taxon] = LPSN_Taxon(taxon, taxon_priority_year, type_taxon, None)

# fill in priority year information for type taxa
missing_type_count = 0
for taxon in taxa:
if taxon.startswith('s__'):
# the type material for species is a specific strain,
# and this strain has the same priority as the species
type_priority = taxa[taxon].priority_year
else:
type_taxon = taxa[taxon].type_material
if type_taxon not in taxa:
missing_type_count += 1
continue
type_priority = taxa[type_taxon].priority_year

taxa[taxon].type_material_priority_year = type_priority

if missing_type_count:
self.logger.error(
f'Identified {missing_type_count:,} taxa without entries for type material. This is currently a known issue that needs to be addressed.')

return taxa

def parse_lpsn_gss_file(self, lpsn_gss_metadata_file):
"""Parse LPSN GSS (genus-species-subspecies) metadata."""

# get mapping between record numbers and LPSN taxa
lpsn_record_taxon_map = {}
with open(lpsn_gss_metadata_file, encoding='utf-8') as f:
Expand All @@ -128,15 +177,16 @@ def parse_lpsn_gss_file(self, lpsn_gss_metadata_file):
record_no_idx = tokens.index('record_no')
else:
if tokens[genus_idx] != '' and tokens[sp_idx] != '' and tokens[subsp_idx] != '':
taxon = 's__{} {} subsp. {}'.format(tokens[genus_idx].strip(), tokens[sp_idx].strip(), tokens[subsp_idx].strip())
elif tokens[genus_idx] != '' and tokens[sp_idx] != '':
taxon = 's__{} {} subsp. {}'.format(
tokens[genus_idx].strip(), tokens[sp_idx].strip(), tokens[subsp_idx].strip())
elif tokens[genus_idx] != '' and tokens[sp_idx] != '':
taxon = 's__{} {}'.format(tokens[genus_idx].strip(), tokens[sp_idx].strip())
else:
taxon = 'g__{}'.format(tokens[genus_idx].strip())

lpsn_record_taxon_map[tokens[record_no_idx]] = taxon
# get synonyms and names considered correct at LPSN

# get synonyms and names considered correct at LPSN
lpsn_sp_correct_names = {}
lpsn_sp_synonyms = defaultdict(set)
with open(lpsn_gss_metadata_file, encoding='utf-8') as f:
Expand All @@ -152,26 +202,26 @@ def parse_lpsn_gss_file(self, lpsn_gss_metadata_file):
record_no_idx = tokens.index('record_no')
record_lnk_idx = tokens.index('record_lnk')
else:
if tokens[genus_idx] != '' and tokens[sp_idx] != '' and tokens[subsp_idx] != '':
if tokens[genus_idx] != '' and tokens[sp_idx] != '' and tokens[subsp_idx] != '':
# process subspecies
pass
elif tokens[genus_idx] != '' and tokens[sp_idx] != '':
elif tokens[genus_idx] != '' and tokens[sp_idx] != '':
# process species
taxon = 's__{} {}'.format(tokens[genus_idx].strip(), tokens[sp_idx].strip())

if 'correct name' in tokens[status_idx]:
assert tokens[record_lnk_idx] == ''
lpsn_sp_correct_names[taxon] = taxon
elif tokens[record_lnk_idx] != '':
# for species, this should link to the correct name for
# a species according to LPSN
lpsn_sp_correct_names[taxon] = lpsn_record_taxon_map[tokens[record_lnk_idx]]

if 'synonym' in tokens[status_idx] and tokens[record_lnk_idx] != '':
synonym_sp = lpsn_record_taxon_map[tokens[record_lnk_idx]]
lpsn_sp_synonyms[taxon].add(synonym_sp)
else:
# process genus
pass
return lpsn_sp_correct_names, lpsn_sp_synonyms

return lpsn_sp_correct_names, lpsn_sp_synonyms
4 changes: 2 additions & 2 deletions gtdb_species_clusters/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ def pmc_validate(self, args):
check_file_exists(args.specific_epithet_ledger)
check_file_exists(args.ncbi_env_bioproject_ledger)
check_file_exists(args.lpsn_gss_metadata_file)
check_file_exists(args.lpsn_type_material_file)
check_file_exists(args.lpsn_data)

if args.ground_truth_test_cases.lower() != 'none':
check_file_exists(args.ground_truth_test_cases)
Expand Down Expand Up @@ -717,7 +717,7 @@ def pmc_validate(self, args):
args.specific_epithet_ledger,
args.ncbi_env_bioproject_ledger,
args.lpsn_gss_metadata_file,
args.lpsn_type_material_file,
args.lpsn_data,
args.ground_truth_test_cases,
args.skip_full_taxonomy_checks,
args.skip_genus_checks)
Expand Down
2 changes: 1 addition & 1 deletion gtdb_species_clusters/ncbi_species_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ def classify_ncbi_species(self, ncbi_synonyms, misclassified_gids):
gtdb_rid,
gtdb_rep_ncbi_sp_assignments[gtdb_rid],
ncbi_species))
sys.exit(-1)
# ***sys.exit(-1)
gtdb_rep_ncbi_sp_assignments[gtdb_rid] = ncbi_species

return ncbi_all_species, unambiguous_ncbi_sp, ambiguous_ncbi_sp
Expand Down
Loading

0 comments on commit c738f19

Please sign in to comment.