Updates made during final curation and validation of R202.

Ecogenomics · Apr 8, 2021 · c738f19 · c738f19
1 parent d7ac715
commit c738f19
Show file tree

Hide file tree

Showing 10 changed files with 331 additions and 169 deletions.
diff --git a/bin/gtdb_species_clusters b/bin/gtdb_species_clusters
@@ -70,9 +70,9 @@ def print_help():
     Post-manual curation
       pmc_manual_species     -> Identify species names manually set by curators
       pmc_replace_generic    -> Replace generic names with genus assignment
+      pmc_species_names      -> Establish final species names based on manual curation
       pmc_check_type_species -> Check for agreement between GTDB genera and genomes assembled from type species of genus
       pmc_check_type_strains -> Check for agreement between GTDB species and genomes assembled from type strain of species
-      pmc_species_names      -> Establish final species names based on manual curation
       pmc_validate           -> Validate final species names
       pmc_name_stats         -> [TBD] Summary statistics indicating changes to GTDB species cluster names
       pmc_cluster_stats      -> Calculate final statistics for species clusters (???)
@@ -628,7 +628,9 @@ if __name__ == '__main__':
                                                           formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                                           description='Check for agreement between GTDB genera and genomes assembled from type species of genus.')
     pmc_check_type_species_parser.add_argument(
-        'manual_taxonomy', help="file with manually curated taxonomy (u_replace_generic: taxonomy_updated_sp.tsv)")
+        'manual_taxonomy', help="file with manually curated taxonomy (pmc_species_names: final_taxonomy.tsv)")
+    pmc_check_type_species_parser.add_argument(
+        'cur_gtdb_metadata_file', help="file with GTDB metadata for current GTDB release (TSV file)")
     pmc_check_type_species_parser.add_argument(
         'qc_passed_file', help="file indicating genomes that have passed QC (u_qc_genomes: qc_passed.tsv)")
     pmc_check_type_species_parser.add_argument(
@@ -653,7 +655,7 @@ if __name__ == '__main__':
                                                           formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                                           description='Check for agreement between GTDB species and genomes assembled from type strain of species.')
     pmc_check_type_strains_parser.add_argument(
-        'manual_taxonomy', help="file with manually curated taxonomy (u_replace_generic: taxonomy_updated_sp.tsv)")
+        'manual_taxonomy', help="file with manually curated taxonomy (pmc_species_names: final_taxonomy.tsv)")
     pmc_check_type_strains_parser.add_argument(
         'cur_gtdb_metadata_file', help="file with GTDB metadata for current GTDB release (TSV file)")
     pmc_check_type_strains_parser.add_argument(
@@ -676,9 +678,9 @@ if __name__ == '__main__':
     pmc_species_names_parser.add_argument(
         'curation_tree', help="manually curated tree to relabel with final species names")
     pmc_species_names_parser.add_argument(
-        'manual_taxonomy', help="file with manually curated taxonomy (u_replace_generic: taxonomy_updated_sp.tsv)")
+        'manual_sp_names', help="file indicating species names explicitly set by manually curated (pmc_manual_species: manual_species_names.tsv)")
     pmc_species_names_parser.add_argument(
-        'manual_sp_names', help="file indicating species names explicitly set by manually curated (u_replace_generic: manual_species_names.tsv)")
+        'manual_taxonomy', help="file with manually curated taxonomy (pmc_replace_generic: taxonomy_updated_sp.tsv)")
     pmc_species_names_parser.add_argument(
         'pmc_custom_species', help="file indicating post-curation, manual species assignments that are hard to automatically generate")
     pmc_species_names_parser.add_argument(
@@ -690,7 +692,7 @@ if __name__ == '__main__':
     pmc_species_names_parser.add_argument(
         'qc_passed_file', help="file indicating genomes that have passed QC (u_qc_genomes: qc_passed.tsv)")
     pmc_species_names_parser.add_argument(
-        'ncbi_misclassified_file', help="file indicating genomes with erroneous NCBI species assignments (u_ncbi_erroneous: ncbi_missclassified_sp.tsv)")
+        'ncbi_misclassified_file', help="file indicating genomes with erroneous NCBI species assignments (u_ncbi_erroneous: ncbi_misclassified_sp.gtdb_clustering.tsv)")
     pmc_species_names_parser.add_argument('ncbi_genbank_assembly_file',
                                           help="NCBI GenBank assembly file indicating potentially erroneous genomes")
     pmc_species_names_parser.add_argument(
@@ -721,10 +723,10 @@ if __name__ == '__main__':
                                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                                 description='Validate final species names.')
     pmc_validate_parser.add_argument(
-        'final_taxonomy', help="file with final taxonomy (u_replace_generic: final_taxonomy.tsv)")
-    pmc_validate_parser.add_argument('final_scaled_tree', help="Newick file with scaled curation tree")
+        'manual_sp_names', help="file indicating species names explicitly set by manually curated (pmc_manual_species: manual_species_names.tsv)")
     pmc_validate_parser.add_argument(
-        'manual_sp_names', help="file indicating species names explicitly set by manually curated (u_replace_generic: manual_species_names.tsv)")
+        'final_taxonomy', help="file with final taxonomy (pmc_species_names: final_taxonomy.tsv)")
+    pmc_validate_parser.add_argument('final_scaled_tree', help="Newick file with scaled curation tree")
     pmc_validate_parser.add_argument(
         'pmc_custom_species', help="file indicating post-curation, manual species assignments that are hard to automatically generate")
     pmc_validate_parser.add_argument(
@@ -759,8 +761,8 @@ if __name__ == '__main__':
         'ncbi_env_bioproject_ledger', help="file indicating NCBI BioProjects composed on MAGs or SAGs incorrectly annotated at NCBI")
     pmc_validate_parser.add_argument(
         'lpsn_gss_metadata_file', help="file indicating GSS (genus-species-subspecies) metadata at LPSN (obtained from https://lpsn.dsmz.de/downloads)")
-    pmc_validate_parser.add_argument('lpsn_type_material_file',
-                                     help="file indicating type material of taxa (obtained from LPSN)")
+    pmc_validate_parser.add_argument('lpsn_data',
+                                     help="file with metadata from LPSN website (obtained by gtdb-migration-tk)")
     pmc_validate_parser.add_argument(
         'ground_truth_test_cases', help="file indicating specific test cases with ground truth assignments to validate")
     pmc_validate_parser.add_argument('output_dir', help="output directory")

diff --git a/gtdb_species_clusters/lpsn.py b/gtdb_species_clusters/lpsn.py
@@ -16,6 +16,7 @@
 ###############################################################################
 
 import os
+import sys
 import csv
 import re
 import datetime
@@ -29,22 +30,22 @@
 @dataclass
 class LPSN_Taxon(object):
     """Metadata for single LPSN taxon."""
-    
+
     # Ideally this would be a full record with the
     # majority of data at LPSN. However, this information
     # is not currently provided to the public except
     # on the web pages. We have requested that a data file
     # similar to the GSS file be provided.
-    
+
     name: str
     priority_year: int
     type_material: str
-    type_material_priority_year : int
+    type_material_priority_year: int
 
 
 class LPSN(object):
     """LPSN metadata for taxa."""
-    
+
     def parse_lpsn_priority_year(reference_str):
         """Parse year of priority from LPSN reference string."""
 
@@ -53,68 +54,116 @@ def parse_lpsn_priority_year(reference_str):
         years = re.sub(r'ex [^\d]*\d{4}', ' ', years)
         years = re.findall('[1-3][0-9]{3}', years, re.DOTALL)
         years = [int(y) for y in years if int(y) <= datetime.datetime.now().year]
-        
+
         if len(years) == 0:
-            # assume this name is validated through ICN and just take the first 
+            # assume this name is validated through ICN and just take the first
             # date given as the year of priority
             years = re.findall('[1-3][0-9]{3}', references, re.DOTALL)
             years = [int(y) for y in years if int(y) <= datetime.datetime.now().year]
-        
+
         return years[0]
 
-    def __init__(self, lpsn_type_material_file, lpsn_gss_metadata_file):
+    def __init__(self, lpsn_data, lpsn_gss_metadata_file):
         """Initialization."""
-        
+
         self.logger = logging.getLogger('timestamp')
-        
-        if lpsn_type_material_file:
-            self.taxa = self.parse_lpsn_type_material_file(lpsn_type_material_file)
-            
+
+        if lpsn_data:
+            self.taxa = self.parse_lpsn_data_file(lpsn_data)
+
         if lpsn_gss_metadata_file:
             self.sp_correct_names, self.sp_synonyms = self.parse_lpsn_gss_file(lpsn_gss_metadata_file)
 
     def type_material(self, taxon):
         """Get type material of taxon."""
-        
+
         if taxon in self.taxa:
             return self.taxa[taxon].type_material
-            
+
         return None
-        
-    def parse_lpsn_type_material_file(self, lpsn_type_material_file):
+
+    def parse_lpsn_data_file(self, lpsn_data):
         """Parse type material for taxa as defined at LPSN."""
-
+
+        rank_prefixes = {'domain': 'd__',
+                         'phylum': 'p__',
+                         'class': 'c__',
+                         'order': 'o__',
+                         'family': 'f__',
+                         'genus': 'g__',
+                         'species': 's__'}
+
+        # family is never type material
+        type_prefixes = {0: 'c__', 1: 'o__', 2: 'g__', 3: 's__', 4: ''}
+
         taxa = {}
-        with open(lpsn_type_material_file) as f:
-            f.readline()
+        with open(lpsn_data) as f:
+            header = f.readline().strip().split('\t')
+
+            rank_idx = header.index('Rank')
+            name_idx = header.index('Name')
+            priority_idx = header.index('Priority')
+            type_class_idx = header.index('Type class')
+            type_strain_idx = header.index('Type strain')
+
             for line in f:
                 tokens = [t.strip() for t in line.split('\t')]
-
-                rank = tokens[0]
-                taxon = tokens[1].replace('Candidatus ', '')
-                type_taxon = tokens[2].replace('Candidatus ', '')
-
-                if type_taxon:
-                    taxon_priority_year = LPSN.parse_lpsn_priority_year(tokens[3])
-
-                    if rank in ['Species', 'Subspecies']:
-                        # the type material for species and subspecies is a specific strain,
-                        # and this strain has the same priority as the species or subspecies
-                        type_taxon_priority_year = taxon_priority_year
-                    else:
-                        if tokens[4]:
-                            type_taxon_priority_year = LPSN.parse_lpsn_priority_year(tokens[4])
-                        else:
-                            self.logger.warning('Type material has no priority information: {}'.format(type_taxon))
-                            type_taxon_priority_year = Genome.NO_PRIORITY_YEAR
-
-                    taxa[taxon] = LPSN_Taxon(taxon, taxon_priority_year, type_taxon, type_taxon_priority_year)
-
+
+                rank = tokens[rank_idx]
+                if rank == 'subspecies':
+                    # not currently considering subspecies
+                    continue
+
+                prefix = rank_prefixes[rank]
+                taxon = f"{prefix}{tokens[name_idx].replace('Candidatus ', '')}"
+
+                priority_citations = tokens[priority_idx]
+                if priority_citations.lower() == 'n/a':
+                    continue
+                taxon_priority_year = LPSN.parse_lpsn_priority_year(priority_citations)
+
+                type_taxa = []
+                for idx, type_idx in enumerate(range(type_class_idx, type_strain_idx+1)):
+                    type_taxon = tokens[type_idx]
+                    if type_taxon.lower() != 'n/a':
+                        type_taxa.append(f"{type_prefixes[idx]}{type_taxon.replace('Candidatus ', '')}")
+
+                if len(type_taxa) == 0:
+                    # No type material for this LPSN taxon which occurs
+                    # for placeholder taxa and candidatus taxa
+                    type_taxa = ['n/a']
+                elif len(type_taxa) > 1:
+                    self.logger.error(f'Identified multiple type taxa for {taxon}: {type_taxa}')
+                    sys.exit(-1)
+
+                type_taxon = type_taxa[0]
+                taxa[taxon] = LPSN_Taxon(taxon, taxon_priority_year, type_taxon, None)
+
+        # fill in priority year information for type taxa
+        missing_type_count = 0
+        for taxon in taxa:
+            if taxon.startswith('s__'):
+                # the type material for species is a specific strain,
+                # and this strain has the same priority as the species
+                type_priority = taxa[taxon].priority_year
+            else:
+                type_taxon = taxa[taxon].type_material
+                if type_taxon not in taxa:
+                    missing_type_count += 1
+                    continue
+                type_priority = taxa[type_taxon].priority_year
+
+            taxa[taxon].type_material_priority_year = type_priority
+
+        if missing_type_count:
+            self.logger.error(
+                f'Identified {missing_type_count:,} taxa without entries for type material. This is currently a known issue that needs to be addressed.')
+
         return taxa
 
     def parse_lpsn_gss_file(self, lpsn_gss_metadata_file):
         """Parse LPSN GSS (genus-species-subspecies) metadata."""
-        
+
         # get mapping between record numbers and LPSN taxa
         lpsn_record_taxon_map = {}
         with open(lpsn_gss_metadata_file, encoding='utf-8') as f:
@@ -128,15 +177,16 @@ def parse_lpsn_gss_file(self, lpsn_gss_metadata_file):
                     record_no_idx = tokens.index('record_no')
                 else:
                     if tokens[genus_idx] != '' and tokens[sp_idx] != '' and tokens[subsp_idx] != '':
-                        taxon = 's__{} {} subsp. {}'.format(tokens[genus_idx].strip(), tokens[sp_idx].strip(), tokens[subsp_idx].strip())
-                    elif tokens[genus_idx] != '' and  tokens[sp_idx] != '':
+                        taxon = 's__{} {} subsp. {}'.format(
+                            tokens[genus_idx].strip(), tokens[sp_idx].strip(), tokens[subsp_idx].strip())
+                    elif tokens[genus_idx] != '' and tokens[sp_idx] != '':
                         taxon = 's__{} {}'.format(tokens[genus_idx].strip(), tokens[sp_idx].strip())
                     else:
                         taxon = 'g__{}'.format(tokens[genus_idx].strip())
 
                     lpsn_record_taxon_map[tokens[record_no_idx]] = taxon
-            
-        # get synonyms and names considered correct at LPSN 
+
+        # get synonyms and names considered correct at LPSN
         lpsn_sp_correct_names = {}
         lpsn_sp_synonyms = defaultdict(set)
         with open(lpsn_gss_metadata_file, encoding='utf-8') as f:
@@ -152,26 +202,26 @@ def parse_lpsn_gss_file(self, lpsn_gss_metadata_file):
                     record_no_idx = tokens.index('record_no')
                     record_lnk_idx = tokens.index('record_lnk')
                 else:
-                    if tokens[genus_idx] != '' and  tokens[sp_idx] != '' and tokens[subsp_idx] != '':
+                    if tokens[genus_idx] != '' and tokens[sp_idx] != '' and tokens[subsp_idx] != '':
                         # process subspecies
                         pass
-                    elif tokens[genus_idx] != '' and  tokens[sp_idx] != '':
+                    elif tokens[genus_idx] != '' and tokens[sp_idx] != '':
                         # process species
                         taxon = 's__{} {}'.format(tokens[genus_idx].strip(), tokens[sp_idx].strip())
-                        
+
                         if 'correct name' in tokens[status_idx]:
                             assert tokens[record_lnk_idx] == ''
                             lpsn_sp_correct_names[taxon] = taxon
                         elif tokens[record_lnk_idx] != '':
                             # for species, this should link to the correct name for
                             # a species according to LPSN
                             lpsn_sp_correct_names[taxon] = lpsn_record_taxon_map[tokens[record_lnk_idx]]
-                            
+
                         if 'synonym' in tokens[status_idx] and tokens[record_lnk_idx] != '':
                             synonym_sp = lpsn_record_taxon_map[tokens[record_lnk_idx]]
                             lpsn_sp_synonyms[taxon].add(synonym_sp)
                     else:
                         # process genus
                         pass
-                        
-        return lpsn_sp_correct_names, lpsn_sp_synonyms
+
+        return lpsn_sp_correct_names, lpsn_sp_synonyms
diff --git a/gtdb_species_clusters/main.py b/gtdb_species_clusters/main.py
@@ -689,7 +689,7 @@ def pmc_validate(self, args):
         check_file_exists(args.specific_epithet_ledger)
         check_file_exists(args.ncbi_env_bioproject_ledger)
         check_file_exists(args.lpsn_gss_metadata_file)
-        check_file_exists(args.lpsn_type_material_file)
+        check_file_exists(args.lpsn_data)
 
         if args.ground_truth_test_cases.lower() != 'none':
             check_file_exists(args.ground_truth_test_cases)
@@ -717,7 +717,7 @@ def pmc_validate(self, args):
               args.specific_epithet_ledger,
               args.ncbi_env_bioproject_ledger,
               args.lpsn_gss_metadata_file,
-              args.lpsn_type_material_file,
+              args.lpsn_data,
               args.ground_truth_test_cases,
               args.skip_full_taxonomy_checks,
               args.skip_genus_checks)

diff --git a/gtdb_species_clusters/ncbi_species_manager.py b/gtdb_species_clusters/ncbi_species_manager.py
@@ -538,7 +538,7 @@ def classify_ncbi_species(self, ncbi_synonyms, misclassified_gids):
                     gtdb_rid,
                     gtdb_rep_ncbi_sp_assignments[gtdb_rid],
                     ncbi_species))
-                sys.exit(-1)
+                # ***sys.exit(-1)
             gtdb_rep_ncbi_sp_assignments[gtdb_rid] = ncbi_species
 
         return ncbi_all_species, unambiguous_ncbi_sp, ambiguous_ncbi_sp