monarch-initiative · joeflack4 · Sep 23, 2024 · Sep 23, 2024
diff --git a/omim2obo/main.py b/omim2obo/main.py
@@ -49,6 +49,8 @@
 Assumptions
 1. Mappings obtained from official OMIM files as described above are interpreted correctly (e.g. skos:exactMatch).
 """
+from typing import Set
+
 import yaml
 from hashlib import md5
 
@@ -57,8 +59,7 @@
 
 from omim2obo.namespaces import *
 from omim2obo.parsers.omim_entry_parser import cleanup_title, get_alt_and_included_titles_and_symbols, get_pubs, \
-    get_mapped_ids, \
-    recapitalize_acronyms_in_title
+    get_mapped_ids, recapitalize_acronyms_in_titles
 from omim2obo.config import ROOT_DIR, GLOBAL_TERMS
 from omim2obo.parsers.omim_txt_parser import *
 
@@ -204,6 +205,16 @@ def omim2obo(use_cache: bool = False):
             get_alt_and_included_titles_and_symbols(inc_titles_str)
         included_is_included = included_titles or included_symbols  # redundant. can't be included symbol w/out title
 
+        # Recapitalize acronyms in titles
+        all_abbrevs: Set[str] = \
+            set(pref_symbols + alt_symbols + former_alt_symbols + included_symbols + former_included_symbols)
+        # todo: consider DRYing to 1 call by passing all 5 title types to a wrapper function
+        pref_title = recapitalize_acronyms_in_titles(pref_title, all_abbrevs)
+        alt_titles = recapitalize_acronyms_in_titles(alt_titles, all_abbrevs)
+        former_alt_titles = recapitalize_acronyms_in_titles(former_alt_titles, all_abbrevs)
+        included_titles = recapitalize_acronyms_in_titles(included_titles, all_abbrevs)
+        former_included_titles = recapitalize_acronyms_in_titles(former_included_titles, all_abbrevs)
+
         # Special cases depending on OMIM term type
         is_gene = omim_type == OmimType.GENE or omim_type == OmimType.HAS_AFFECTED_FEATURE
         if omim_type == OmimType.HERITABLE_PHENOTYPIC_MARKER:  # %
@@ -227,25 +238,19 @@ def omim2obo(use_cache: bool = False):
         else:
             graph.add((omim_uri, RDFS.label, Literal(pref_title)))
 
-        # todo: .clean()/.cleanup_label() 2nd param `explicit_abbrev` should be List[str] instead of str. And below,
-        #  should pass all symbols/abbrevs from each of preferred, alt, included each time it is called. If no symbols
-        #  for given term, should pass empty list. See: https://github.com/monarch-initiative/omim/issues/129
-        pref_abbrev: Union[str, None] = None if not pref_symbols else pref_symbols[0]
-
         # Add synonyms
         # - exact titles
-        graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(recapitalize_acronyms_in_title(pref_title, pref_abbrev))))
+        graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(pref_title)))
         for title in alt_titles:
-            graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(recapitalize_acronyms_in_title(title, pref_abbrev))))
+            graph.add((omim_uri, oboInOwl.hasExactSynonym, Literal(title)))
         # - exact abbreviations
         for abbrevs in [pref_symbols, alt_symbols]:
             for abbreviation in abbrevs:
                 add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasExactSynonym, abbreviation,
                     [(oboInOwl.hasSynonymType, OMO['0003000'])])
         # - related, deprecated 'former' titles
         for title in former_alt_titles:
-            clean_title = recapitalize_acronyms_in_title(title, pref_abbrev)
-            add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, clean_title,
+            add_triple_and_optional_annotations(graph, omim_uri, oboInOwl.hasRelatedSynonym, title,
                 [(OWL.deprecated, Literal(True))])
         # - related, deprecated 'former' abbreviations
         for abbreviation in former_alt_symbols:
@@ -259,7 +264,7 @@ def omim2obo(use_cache: bool = False):
             graph.add((omim_uri, RDFS['comment'], Literal(included_comment)))
         # - titles
         for title in included_titles:
-            graph.add((omim_uri, URIRef(MONDONS.omim_included), Literal(recapitalize_acronyms_in_title(title, pref_abbrev))))
+            graph.add((omim_uri, URIRef(MONDONS.omim_included), Literal(title)))
         # - symbols
         for symbol in included_symbols:
             add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), symbol, [
@@ -268,8 +273,7 @@ def omim2obo(use_cache: bool = False):
             ])
         # - deprecated, 'former'
         for title in former_included_titles:
-            clean_title = recapitalize_acronyms_in_title(title, pref_abbrev)
-            add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), clean_title,
+            add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), title,
                 [(OWL.deprecated, Literal(True))])
         for symbol in former_included_symbols:
             add_triple_and_optional_annotations(graph, omim_uri, URIRef(MONDONS.omim_included), symbol, [

diff --git a/omim2obo/parsers/omim_entry_parser.py b/omim2obo/parsers/omim_entry_parser.py
@@ -4,7 +4,7 @@
 # import re
 from collections import defaultdict
 from copy import copy
-from typing import List, Dict, Tuple
+from typing import List, Dict, Set, Tuple, Union
 
 import pandas as pd
 from rdflib import Graph, RDF, RDFS, DC, Literal, OWL, SKOS, URIRef
@@ -21,15 +21,21 @@
 
 def get_known_capitalizations() -> Dict[str, str]:
     """Get list of known capitalizations for proper names, acronyms, and the like.
-    TODO: Contains space-delimited words, e.g. "vitamin d". The way that
+    todo: Contains space-delimited words, e.g. "vitamin d". The way that
      cleanup_label is currently implemented, each word in the label gets
      replaced; i.e. it would try to replace "vitamin" and "d" separately. Hence,
      this would fail.
      Therefore, we should probably do this in 2 different operations: (1) use
      the current 'word replacement' logic, but also, (2), at the end, do a
      generic string replacement (e.g. my_str.replace(a, b). When implementing
      (2), we should also split this dictionary into two separate dictionaries,
-     each for 1 of these 2 different purposes."""
+     each for 1 of these 2 different purposes.
+
+    todo: known_capitalizations.tsv can be refactored possibly. It really only needs 1 column, the case to replace. The
+     pattern column is not used, and the first column (lowercase) can be computed by using .lower() on the case to
+     replace. We could also leave as-is since this file is shared elsewhere in the project infrastructure, though I do
+     not know its source-of-truth location.
+    """
     path = DATA_DIR / 'known_capitalizations.tsv'
     with open(path, "r") as file:
         data_io = csv.reader(file, delimiter="\t")
@@ -147,8 +153,7 @@ def transform_entry(entry) -> Graph:
     return graph
 
 
-# todo: probably best to combine explicit abbrevs outside of this func
-def _detect_abbreviations(label: str, explicit_abbrev: str = None, capitalization_threshold=0.75) -> List[str]:
+def detect_abbreviations(label: str, capitalization_threshold=0.75) -> List[str]:
     """Detect possible abbreviations / acronyms"""
     # Compile regexp
     acronyms_without_periods_compiler = re.compile('[A-Z]{1}[A-Z0-9]{1,}')
@@ -165,29 +170,21 @@ def _detect_abbreviations(label: str, explicit_abbrev: str = None, capitalizatio
     is_largely_uppercase = \
         fully_capitalized_count / len(words) >= capitalization_threshold
 
-    # Detect acronyms without periods
+    # Detect cases
     if is_largely_uppercase:
         acronyms_without_periods = []  # can't infer because everything was uppercase
     else:
-        acronyms_without_periods = acronyms_without_periods_compiler.findall(label)
-    # Detect more
-    title_cased_abbrevs = title_cased_abbrev_compiler.findall(label)
-    acronyms_with_periods = acronyms_with_periods_compiler.findall(label)
-    # Combine list of things to re-format
-    replacements = []
-    candidates: List[List[str]] = [
-        acronyms_with_periods, acronyms_without_periods, title_cased_abbrevs, [explicit_abbrev]]
-    for item_list in candidates:
-        for item in item_list:
-            if item:
-                replacements.append(item)
-
-    return replacements
+        acronyms_without_periods: List[str] = acronyms_without_periods_compiler.findall(label)
+    title_cased_abbrevs: List[str] = title_cased_abbrev_compiler.findall(label)
+    acronyms_with_periods: List[str] = acronyms_with_periods_compiler.findall(label)
+
+    return acronyms_with_periods + acronyms_without_periods + title_cased_abbrevs
 
 
 # todo: rename? It's doing more than cleaning; it's mutating
 def cleanup_title(
     title: str,
+    replacement_case_method: str = 'lower',  # 'upper', 'title', 'lower', 'capitalize' (=sentence case)
     conjunctions: List[str] = ['and', 'but', 'yet', 'for', 'nor', 'so'],
     little_preps: List[str] = ['at', 'by', 'in', 'of', 'on', 'to', 'up', 'as', 'it', 'or'],
     articles: List[str] = ['a', 'an', 'the'],
@@ -197,9 +194,10 @@ def cleanup_title(
 
     :param title: A preferred, alternative, or included title.
 
-    1. Removes the abbreviation suffixes
-    2. Converts roman numerals to arabic
-    3. Makes the text Title Case, except for supplied conjunctions/prepositions/articles
+    1. Converts roman numerals to arabic
+    2. Makes the text adhere to the case of `replacement_case_method`, except for supplied
+    conjunctions, prepositions, and articles, which will always be lowercased. NOTE: The default is 'lower', so by
+    default each word is lowercased and the exception for conjunctions/prepositions/articles changes nothing extra.
 
     Assumptions:
     1. All acronyms are capitalized
@@ -233,9 +231,6 @@ def cleanup_title(
        e.g.: Balint syndrome, Barre-Lieou syndrome, Wallerian degeneration, etc.
        How to do this? Simply get/create a list of known eponyms? Is this feasible?
     """
-    # Simple method: Lower/title case everything but acronyms
-    # label_newcase = getattr(label2, replacement_case_method)()
-    # Advanced method: iteritavely format words
     fixedwords = []
     i = 0
     for wrd in title.split():
@@ -254,8 +249,7 @@ def cleanup_title(
                 suffix = wrd.replace(toRoman(num), '', 1)
                 fixed = ''.join((str(num), suffix))
                 wrd = fixed
-        # todo: next few lines don't make sense. why lower 'wrd', and then conditionally lowercase it again?
-        wrd = wrd.lower()
+        wrd = getattr(wrd, replacement_case_method)()
         # replace interior conjunctions, prepositions, and articles with lowercase, always
         if wrd in (conjunctions + little_preps + articles) and i != 1:
             wrd = wrd.lower()
@@ -267,18 +261,49 @@ def cleanup_title(
     return label_newcase
 
 
-# todo: explicit_abbrev: Change to List[str]. See: https://github.com/monarch-initiative/omim/issues/129
-def recapitalize_acronyms_in_title(title: str, explicit_abbrev=None, capitalization_threshold=0.75) -> str:
-    """Re-capitalize acronyms / words based on information contained w/in original label"""
-    # todo: probably best to combine explicit abbrevs outside of this func
-    possible_abbreviations = _detect_abbreviations(
-        title, explicit_abbrev, capitalization_threshold=capitalization_threshold)
-    title2 = title
-    for abbrev in possible_abbreviations:
-        title2 = title2.replace(abbrev.upper(), abbrev)
+def recapitalize_acronyms_in_title(title: str, known_abbrevs: Set[str] = None, capitalization_threshold=0.75) -> str:
+    """Re-capitalize acronyms / words in a case-normalized title, e.g. one returned by cleanup_title()
+
+    :param known_abbrevs: Explicit symbols / abbreviations in their correct casing. None is treated as an empty set.
+    :return: Title where each whitespace-split word equal to an abbreviation's .lower() is replaced by the abbreviation
+
+    todo: If title came from cleanup_title() with a replacement_case_method other than the default 'lower', the
+     comparison below will not match. To solve: (a) capture the replacement_case_method used and pass that here, (b)
+     also compare against other casing variations (.title() and .capitalize() (=sentence case)), or (c) possibly
+     just compare to word.lower() instead of word.
+    todo: (more important): Splitting on whitespace is probably not enough to cover all cases. Consider, e.g.:
+     "TITLE (ACRONYM)", "TITLE: ACRONYM1&ACRONYM2", "TITLE/ACRONYM", "TITLE ACRONYM/ACRONYM", "TITLE[ACRONYM]",
+     "TITLE {ACRONYM1,ACRONYM2}", "TITLE-ACRONYM", or less likely "TITLE_ACRONYM", "TITLE.ACRONYM". It might be
+     possible to utilize the regular expressions in detect_abbreviations(), substituting the acronym for the [A-Z]
+     part, and to improve detect_abbreviations() by considering some of these cases.
+    """
+    inferred_abbrevs: Set[str] = set(detect_abbreviations(title, capitalization_threshold))
+    abbrevs: Set[str] = inferred_abbrevs.union(known_abbrevs or set())  # known_abbrevs defaults to None
+    if not abbrevs:
+        return title
+    title2_words: List[str] = []
+    for word in title.split():
+        abbrev_match = False
+        for abbrev in abbrevs:
+            if abbrev.lower() == word:
+                title2_words.append(abbrev)
+                abbrev_match = True
+                break
+        if not abbrev_match:
+            title2_words.append(word)
+    title2 = ' '.join(title2_words)
     return title2
 
 
+def recapitalize_acronyms_in_titles(
+    titles: Union[str, List[str]], known_abbrevs: Set[str] = None, capitalization_threshold=0.75
+) -> Union[str, List[str]]:
+    """Re-capitalize acronyms in a single title (returns a str) or in each title of a list (returns a list)"""
+    if isinstance(titles, str):
+        return recapitalize_acronyms_in_title(titles, known_abbrevs, capitalization_threshold)
+    return [recapitalize_acronyms_in_title(title, known_abbrevs, capitalization_threshold) for title in titles]
+
+
 def remove_included_and_formerly_suffixes(title: str) -> str:
     """Remove ', INCLUDED' and ', FORMERLY' suffixes from a title"""
     for suffix in ['FORMERLY', 'INCLUDED']: