From 75ea4046d08b1541fe20f107ff3aa1306d83654b Mon Sep 17 00:00:00 2001 From: Ian Date: Wed, 26 Jun 2024 16:05:35 -0400 Subject: [PATCH] Improvements to the Logical Model generator With these changes almost all the syntactic errors in logical models should be resolved. --- poetry.lock | 50 ++++++- pyproject.toml | 1 + tests/data/example_fsh/HIVARegistration.fsh | 18 +-- .../logical_models/logical_model_generator.py | 129 +++++++++++++----- who_l3_smart_tools/utils/__init__.py | 10 ++ who_l3_smart_tools/utils/counter.py | 12 ++ 6 files changed, 179 insertions(+), 41 deletions(-) create mode 100644 who_l3_smart_tools/utils/__init__.py create mode 100644 who_l3_smart_tools/utils/counter.py diff --git a/poetry.lock b/poetry.lock index 0bce512..fca449c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -521,6 +521,25 @@ files = [ {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, ] +[[package]] +name = "inflect" +version = "7.3.0" +description = "Correctly generate plurals, singular nouns, ordinals, indefinite articles" +optional = false +python-versions = ">=3.8" +files = [ + {file = "inflect-7.3.0-py3-none-any.whl", hash = "sha256:91f689dae31f9a0c28e6275f6161a0d39c7316524b71c5b6978920c7d362ce97"}, + {file = "inflect-7.3.0.tar.gz", hash = "sha256:27264bb8992cab2be3c50772ecf7a79492c6f95e16e15002e6023001d2d639c2"}, +] + +[package.dependencies] +more-itertools = "*" +typeguard = ">=4.0.1" + +[package.extras] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["pygments", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)"] + [[package]] name = "iniconfig" version = "2.0.0" @@ -629,6 +648,17 @@ files = [ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] +[[package]] +name = "more-itertools" +version = "10.3.0" +description = "More routines for operating on iterables, beyond itertools" +optional = false +python-versions = ">=3.8" +files = [ + {file = "more-itertools-10.3.0.tar.gz", hash = "sha256:e5d93ef411224fbcef366a6e8ddc4c5781bc6359d43412a65dd5964e46111463"}, + {file = "more_itertools-10.3.0-py3-none-any.whl", hash = "sha256:ea6a02e24a9161e51faad17a8782b92a0df82c12c1c8886fec7f0c3fa1a1b320"}, +] + [[package]] name = "multidict" version = "6.0.5" @@ -1293,6 +1323,24 @@ files = [ {file = "stringcase-1.2.0.tar.gz", hash = "sha256:48a06980661908efe8d9d34eab2b6c13aefa2163b3ced26972902e3bdfd87008"}, ] +[[package]] +name = "typeguard" +version = "4.3.0" +description = "Run-time type checker for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typeguard-4.3.0-py3-none-any.whl", hash = "sha256:4d24c5b39a117f8a895b9da7a9b3114f04eb63bade45a4492de49b175b6f7dfa"}, + {file = "typeguard-4.3.0.tar.gz", hash = "sha256:92ee6a0aec9135181eae6067ebd617fd9de8d75d714fb548728a4933b1dea651"}, +] + +[package.dependencies] +typing-extensions = ">=4.10.0" + +[package.extras] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme (>=1.3.0)"] +test = ["coverage[toml] (>=7)", "mypy (>=1.2.0)", "pytest (>=7)"] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -1454,4 +1502,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "90a3e5ca994cb8b4816330bf7a3e77a842d3661e71c96868c2d19fe982ab8ef3" +content-hash = "8297d0737dd260417b21cc0eff218c0c87dfc977cb0b78e11e8e4855b1142063" diff --git a/pyproject.toml b/pyproject.toml index 1e73e6e..da651c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ openpyxl = ">=3.0" fhirpy = ">=1.4" "fhir.resources" = ">=7.0" Faker = ">=25.0" +inflect = "^7.3.0" [tool.poetry.group.dev.dependencies] pytest = ">=6" diff --git a/tests/data/example_fsh/HIVARegistration.fsh b/tests/data/example_fsh/HIVARegistration.fsh index f5c7e97..03c683d 100644 --- a/tests/data/example_fsh/HIVARegistration.fsh +++ b/tests/data/example_fsh/HIVARegistration.fsh @@ -73,7 +73,7 @@ Description: "This tab describes the data that are collected during the registra * ^code[+] = HIVConcepts#HIV.A.DE4 * referredBy 0..1 Coding "Referred by" "How the client was referred" * ^code[+] = HIVConcepts#HIV.A.DE5 - * referredBy from HIV.A.DE5 +* referredBy from HIV.A.DE5 * uniqueIdentifier 1..1 Identifier "Unique identifier" "Unique identifier generated for new clients or a universal ID, if used in the country" * ^code[+] = HIVConcepts#HIV.A.DE8 * nationalId 0..1 Identifier "National ID" "National unique identifier assigned to the client, if used in the country" @@ -86,7 +86,7 @@ Description: "This tab describes the data that are collected during the registra * ^code[+] = HIVConcepts#HIV.A.DE12 * countryOfBirth 1..1 Coding "Country of birth" "Country where the client was born" * ^code[+] = HIVConcepts#HIV.A.DE13 - * countryOfBirth from HIV.A.DE13 +* countryOfBirth from HIV.A.DE13 * dateOfBirth 0..1 date "Date of birth" "The client's date of birth (DOB) if known" * ^code[+] = HIVConcepts#HIV.A.DE14 * dateOfBirthUnknown 0..1 boolean "Date of birth unknown" "Is the client's DOB is unknown?" @@ -97,29 +97,29 @@ Description: "This tab describes the data that are collected during the registra * ^code[+] = HIVConcepts#HIV.A.DE17 * gender 1..1 Coding "Gender" "Gender of the client" * ^code[+] = HIVConcepts#HIV.A.DE18 - * gender from HIV.A.DE18 -* other 0..1 string "Other (specify)" "Additional category (please specify)" +* gender from HIV.A.DE18 +* otherGender 0..1 string "Other (specify)" "Additional category (please specify)" * ^code[+] = HIVConcepts#HIV.A.DE24 * sex 1..1 Coding "Sex" "Sex of the client assigned at birth" * ^code[+] = HIVConcepts#HIV.A.DE25 - * sex from HIV.A.DE25 +* sex from HIV.A.DE25 * address 1..1 string "Address" "Client's home address or address which the client is consenting to disclose" * ^code[+] = HIVConcepts#HIV.A.DE29 * maritalStatus 0..1 Coding "Marital Status" "Client's current marital status " * ^code[+] = HIVConcepts#HIV.A.DE30 - * maritalStatus from HIV.A.DE30 +* maritalStatus from HIV.A.DE30 * telephoneNumber 1..1 integer "Telephone number" "Client's telephone number (a landline or a mobile phone number)" * ^code[+] = HIVConcepts#HIV.A.DE42 * administrativeArea 1..1 Coding "Administrative Area" "This should be a context-specific list of administrative areas, such as villages, districts, etc. The purpose of this data element is to allow for grouping and flagging of client data to a particular facility's catchment area. This can be input into the system by the end user OR it can be automated in the database based on the end user's attributes." * ^code[+] = HIVConcepts#HIV.A.DE43 - * administrativeArea from HIV.A.DE43 +* administrativeArea from HIV.A.DE43 * communicationConsent 0..1 boolean "Communication consent" "Indication that client gave consent to be contacted" * ^code[+] = HIVConcepts#HIV.A.DE44 * reminderMessages 0..1 boolean "Reminder messages" "Whether client wants to receive text or other messages as follow-up for HIV services" * ^code[+] = HIVConcepts#HIV.A.DE45 -* communicationPreference 0..* Coding "Communication preference(s)" "How the client would like to receive family planning communications" +* communicationPreferences 0..* Coding "Communication preference(s)" "How the client would like to receive family planning communications" * ^code[+] = HIVConcepts#HIV.A.DE46 - * communicationPreference from HIV.A.DE46 +* communicationPreferences from HIV.A.DE46 * clientEmail 0..1 string "Client's email" "Client's primary email account where the client can be contacted" * ^code[+] = HIVConcepts#HIV.A.DE49 * alternateContactName 0..1 string "Alternate contact's name" "Name of an alternate contact, which could be next of kin (e.g. partner, husband, mother, sibling, etc.). The alternate contact would be used in the case of an emergency situation." diff --git a/who_l3_smart_tools/core/logical_models/logical_model_generator.py b/who_l3_smart_tools/core/logical_models/logical_model_generator.py index bb38fd3..653441b 100644 --- a/who_l3_smart_tools/core/logical_models/logical_model_generator.py +++ b/who_l3_smart_tools/core/logical_models/logical_model_generator.py @@ -1,9 +1,13 @@ +from collections import defaultdict +import inflect import pandas as pd import stringcase import os import re import sys import datetime +from who_l3_smart_tools.utils import camel_case +from who_l3_smart_tools.utils.counter import Counter # TODO: differentiate between Coding, code, and CodableConcept # Boolean @@ -49,7 +53,7 @@ # * artStartDate 1..1 date "ART start date" "The date on which the client started or restarted antiretroviral therapy (ART)" # * ^code[+] = IMMZConcepts#D1.DE49 fsh_lm_element_template = """ -* {label_camel} {cardinality} {data_type} "{label}" "{description}" """.rstrip() +* {element_name} {cardinality} {data_type} "{label}" "{description}" """.rstrip() fsh_lm_validation_element_template = """ * obeys {validation_id}""" @@ -62,8 +66,7 @@ # * ^code[+] = IMMZConcepts#D1.DE10 # * hivStatus from IMMZ.D1.DE10 -fsh_lm_valueset_element_template = """ - * {label} from {valueset}""" +fsh_lm_valueset_element_template = """\n* {label} from {valueset}""" fsh_lm_coding_element_template = """ * ^code[+] = {code}""" @@ -88,14 +91,14 @@ * ^meta.profile[+] = "http://hl7.org/fhir/uv/crmi/StructureDefinition/crmi-computablevalueset" * ^status = #active * ^experimental = true +* ^name = "{name}" """ fsh_vs_code_template = """ * {code} "{label}" """.rstrip() -# regular expression to strip characters that cannot be properly handled from the string -# basically, we try to remove "'s" and anything between parentheses -label_strip_re = re.compile(r"(?:|'s|\s*\([^)]*\))(?:\b|$)", re.IGNORECASE) +inflect_engine = inflect.engine() + class LogicalModelAndTerminologyGenerator: def __init__(self, input_file, output_dir): @@ -104,6 +107,7 @@ def __init__(self, input_file, output_dir): self.models_dir = os.path.join(output_dir, "models") self.codesystem_dir = os.path.join(output_dir, "codesystems") self.valuesets_dir = os.path.join(output_dir, "valuesets") + self.invariants_dict = defaultdict(lambda: Counter()) def generate_fsh_from_excel(self): # create output structure @@ -129,8 +133,15 @@ def generate_fsh_from_excel(self): for sheet_name in dd_xls.keys(): if re.match(r"HIV\.\w+", sheet_name): df = dd_xls[sheet_name] - clean_name = stringcase.alphanumcase(sheet_name) - short_name = (sheet_name.split(" ")[0]).split(".") + + # hard-coded, but the page labelled E-F has no F codes + if sheet_name == "HIV.E-F PMTCT": + cleaned_sheet_name = "HIV.E PMTCT" + else: + cleaned_sheet_name = sheet_name + + clean_name = stringcase.alphanumcase(cleaned_sheet_name) + short_name = (cleaned_sheet_name.split(" ")[0]).split(".") # Initialize the FSH artifact fsh_artifact = "" @@ -139,13 +150,17 @@ def generate_fsh_from_excel(self): valuesets = [] active_valueset = None + # For handling "Other (specify)" + previous_element_label = None + # Process Invariants First # Get all unique validation conditions, and store their assigned rows validations = self.parse_validations(df) # Template for invariants based on validation conditions - for i, (validation, data_ids) in enumerate(validations.items()): - invariant_id = f"{short_name[0]}-{short_name[1]}-{i + 1}" + for (validation, data_ids) in validations.items(): + id = self.invariants_dict[short_name[1]].next + invariant_id = f"{short_name[0]}-{short_name[1]}-{id}" if type(validation) == str: description = validation.replace('"', "'") else: @@ -171,8 +186,8 @@ def generate_fsh_from_excel(self): fsh_artifact += fsh_header for i, row in df.iterrows(): - de_id = row["Data Element ID"] - if type(de_id) != str or not de_id: + data_element_id = row["Data Element ID"] + if type(data_element_id) != str or not data_element_id: continue # Process general fields @@ -181,19 +196,37 @@ def generate_fsh_from_excel(self): label = row["Data Element Label"] if type(label) == str and label: - # equalize spaces - label = label.strip().replace('*', '').replace('[', '').replace(']', '').replace('"', "'") - - # remove special characters that aren't handled by stringcase properly - # also lower-case everything - label_clean = label_strip_re.sub("", label).replace("-", "_").replace("/", "_").replace(" ", "_").lower() + # Other (specify) elements come after a list as a data element to + # contain a non-coded selection + if label.lower() == "other (specify)": + if previous_element_label: + label_clean = f"Other_{previous_element_label.lower()}" + else: + label_clean = "Other (specify)" + else: - label_camel = stringcase.camelcase(label_clean) + # equalize spaces + label = label.strip().replace('*', '').replace('[', '').replace(']', '').replace('"', "'") + + # remove many special characters + label_clean = (label + .replace("(", "") + .replace(")", "") + .replace("'s", "") + .replace("-", "_") + .replace("/", "_") + .replace(",", "") + .replace(" ", "_") + .replace(">=", "more than") + .replace("<=", "less than") + .replace(">", "more than") + .replace("<", "less than") + .lower()) else: label = "" - label_camel = "" + label_clean = "" - code_sys_ref = f"{code_system}#{de_id}" + code_sys_ref = f"{code_system}#{data_element_id}" description = row["Description and Definition"] if type(description) == str: @@ -207,16 +240,19 @@ def generate_fsh_from_excel(self): required_condition = row["Explain Conditionality"] codes.append({ - "code": de_id, + "code": data_element_id, "label": label, "description": description }) # handle ValueSets # First we identify a ValueSet - if data_type == "Coding" and multiple_choice_type in ["Select one", "Select all that apply"]: + # Originally, this looked at the Multiple Choice Type, but that doesn't seem to be + # guaranteed to be meaningful + if data_type == "Coding": active_valueset = { - "value_set": de_id, + "value_set": data_element_id, + "name": data_element_id.replace(".", ""), "title": f"{label} ValueSet", "description": f"Value set of {description[0].lower() + description[1:] if description[0].isupper() and not description.startswith("HIV") else description}", "codes": [] @@ -225,10 +261,10 @@ def generate_fsh_from_excel(self): # Then we identify the codes for the ValueSet elif data_type == "Codes" and multiple_choice_type == "Input Option": if active_valueset is None: - print(f"Attempted to create a member of a ValueSet without a ValueSet context for code {de_id}", sys.stderr) + print(f"Attempted to create a member of a ValueSet without a ValueSet context for code {data_element_id}", sys.stderr) else: active_valueset['codes'].append({ - "code": f"{code_system}#{de_id}", + "code": f"{code_system}#{data_element_id}", "label": f"{label}" }) @@ -237,9 +273,37 @@ def generate_fsh_from_excel(self): if data_type == "Codes": continue + label_camel = camel_case(label_clean) + + if len(label_camel) > 0 and not label_camel[0].isalpha(): + try: + prefix, rest = re.split(r'(?=[a-zA-Z])', label_camel, 1) + except: + prefix, rest = label_camel, "" + + if prefix.isnumeric(): + prefix = camel_case(inflect_engine.number_to_words(int(prefix)).replace("-", "_")) + else: + print("Did not know how to handle element prefix:", sheet_name, data_element_id, prefix, file=sys.stderr) + + label_camel = f"{prefix}{rest}" + + # data elements can only be 64 characters + if len(label_camel) > 64: + new_label_camel = '' + for label_part in re.split("(?=[A-Z1-9])", label_camel): + if len(new_label_camel) + len(label_part) > 64: + break + + new_label_camel += label_part + label_camel = new_label_camel + + + previous_element_label = label_clean + # Process as a normal entry fsh_artifact += fsh_lm_element_template.format( - label_camel=label_camel, + element_name=label_camel, cardinality=self.map_cardinality(required, multiple_choice_type), data_type=self.map_data_type(data_type), label=label, @@ -247,9 +311,9 @@ def generate_fsh_from_excel(self): ) # Add validation if needed - if de_id in validation_lookup.keys(): + if data_element_id in validation_lookup.keys(): fsh_artifact += fsh_lm_validation_element_template.format( - validation_id=validation_lookup[de_id] + validation_id=validation_lookup[data_element_id] ) # Add Terminology reference @@ -260,7 +324,7 @@ def generate_fsh_from_excel(self): # Process Coding/Codes/Input Options with ValueSets if data_type == "Coding": fsh_artifact += fsh_lm_valueset_element_template.format( - label=label_camel, valueset=de_id + label=label_camel, valueset=f"{data_element_id}" ) output_file = os.path.join( @@ -276,12 +340,15 @@ def generate_fsh_from_excel(self): for code in valueset["codes"]: vs_artifact += fsh_vs_code_template.format(**code) + if len(valueset["codes"]) > 0: + vs_artifact += "\n" + output_file = os.path.join( self.valuesets_dir, f"{valueset["value_set"]}.fsh" ) with open(output_file, "w") as f: - f.write(vs_artifact + "\n") + f.write(vs_artifact) if len(codes) > 0: code_system_artifact = fsh_cs_header_template.format( diff --git a/who_l3_smart_tools/utils/__init__.py b/who_l3_smart_tools/utils/__init__.py new file mode 100644 index 0000000..149e229 --- /dev/null +++ b/who_l3_smart_tools/utils/__init__.py @@ -0,0 +1,10 @@ +import re + +split_re = re.compile(r'[\W_]') + + +def camel_case(str: str) -> str: + if str == None: + return "" + + return ''.join([s.lower() if i == 0 else s.capitalize() for i, s in enumerate(split_re.split(str))]) diff --git a/who_l3_smart_tools/utils/counter.py b/who_l3_smart_tools/utils/counter.py new file mode 100644 index 0000000..e8416c9 --- /dev/null +++ b/who_l3_smart_tools/utils/counter.py @@ -0,0 +1,12 @@ +class Counter: + def __init__(self, start: int = 0) -> None: + self.__value = start + + @property + def current(self): + return self.__value + + @property + def next(self): + self.__value += 1 + return self.__value