Skip to content

Commit

Permalink
Pipeline: Namespace management update. (#29)
Browse files Browse the repository at this point in the history
* Version update for v4.0

* Pipeline update for namespace management, version files will need to be corrected with core and computation commit.

update_commits.py is updated but not tested.

* Pipeline update for namespace management, version files will need to be corrected with core, computation, controlledTerms last PRs.

update_commits.py is updated but not tested.

* Code simplification, variables renaming, version.json/dev file modifications.

* Code simplification, lowercase of "acronym"-alike modules, versions.json update.

* Typo in the pipeline name.

* Signatures cleanup.

* Addition of the attribute "_module".

* Missing line break.

---------

Co-authored-by: raphaelgazzotti <[email protected]>
  • Loading branch information
Raphael-Gazzotti and raphaelgazzotti authored Nov 7, 2024
1 parent 8c61e7e commit 65c8706
Show file tree
Hide file tree
Showing 10 changed files with 399 additions and 161 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# MIT licensed
name: openMINDS_build_pipline
name: openMINDS_build_pipeline

on:
push:
Expand Down Expand Up @@ -55,4 +55,3 @@ jobs:
else
echo "Nothing to commit"
fi
7 changes: 2 additions & 5 deletions build.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import argparse

import sys

from openMINDS_pipeline.models import DirectoryStructure, Trigger
Expand All @@ -22,18 +21,17 @@
clone_central(True)

# Step 1 - find the versions to be (re-)built
relevant_versions = evaluate_versions_to_be_built(args["config"], trigger)
relevant_versions, namespaces = evaluate_versions_to_be_built(args["config"], trigger)

for version, modules in relevant_versions.items():

DirectoryStructure.clear_directory(directory_structure.expanded_directory)
DirectoryStructure.clear_directory(directory_structure.source_directory)

# Step 2 - Clone all required resources for the aggregation
clone_sources(modules, version)

# Step 3 - Find all involved schemas
all_schemas = find_schemas(directory_structure, modules)
all_schemas = find_schemas(directory_structure, modules, namespaces[version])

# Step 4 - Resolve all "_extends" directives and save to target directory
resolve_extends(all_schemas, directory_structure)
Expand All @@ -56,7 +54,6 @@
# Step 10 - Copy results to the target directory
copy_to_target_directory(directory_structure, version)


if not trigger:
# We've built everything - this is the only chance to do a proper cleanup at the end because we know all versions have been processed.
Types(directory_structure).clean_types()
Expand Down
3 changes: 1 addition & 2 deletions openMINDS_pipeline/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
OPENMINDS_VOCAB="https://openminds.ebrains.eu/vocab/"
SCHEMA_FILE_ENDING = ".schema.tpl.json"
INSTANCE_FILE_ENDING = ".jsonld"
INSTANCE_FILE_ENDING = ".jsonld"
5 changes: 3 additions & 2 deletions openMINDS_pipeline/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,12 @@ class Trigger:


class SchemaStructure:
def __init__(self, type, schema_group, version, file):
def __init__(self, type, schema_group, version, file, namespaces):
    """Value object describing one schema template file handled by the pipeline.

    :param type: the schema's openMINDS type name (``_type`` without namespace)
    :param schema_group: the module/repository the schema belongs to
    :param version: the openMINDS version this schema is built for
    :param file: relative path of the ``.schema.tpl.json`` template file
    :param namespaces: namespace templates (e.g. ``types``/``props``) used to
        qualify the schema's types and properties — see usage in resolver/utils
    """
    # Direct captures of the constructor arguments.
    self.type = type
    self.version = version
    self.schema_group = schema_group
    self.namespaces = namespaces
    self.file = file
    # Populated later by the pipeline (category resolution / file discovery).
    self.categories = None
    self.absolute_path = None

Expand Down Expand Up @@ -75,7 +76,7 @@ def find_resource_directories(self, file_ending) -> List[str]:
resource_directories = set()
for source in glob.glob(os.path.join(self.source_directory, f'**/*{file_ending}'), recursive=True):
resource_dir = os.path.dirname(source)[len(self.source_directory) + 1:]
if ("target" not in resource_dir and "expanded" not in resource_dir):
if "target" not in resource_dir and "expanded" not in resource_dir:
path_split = resource_dir.split("/")
if len(path_split) == 1:
resource_directories.add(path_split[0])
Expand Down
40 changes: 36 additions & 4 deletions openMINDS_pipeline/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
TEMPLATE_PROPERTY_CATEGORIES = "_categories"
TEMPLATE_PROPERTY_LINKED_CATEGORIES = "_linkedCategories"
TEMPLATE_PROPERTY_EMBEDDED_CATEGORIES = "_embeddedCategories"
TEMPLATE_PROPERTY_MODULE = "_module"


def resolve_extends(schemas: List[SchemaStructure], directory_structure: DirectoryStructure):
Expand Down Expand Up @@ -41,7 +42,7 @@ def resolve_categories(version:str, directory_structure: DirectoryStructure, sch
schemas_by_category = _schemas_by_category(schemas)
for schema in schemas:
print(f"resolving categories for {schema.type}")
_do_resolve_categories(schema, schemas_by_category)
_do_resolve_categories(version, schema, schemas_by_category)
categories[version] = schemas_by_category
_save_categories(directory_structure, categories)

Expand Down Expand Up @@ -71,12 +72,23 @@ def _schemas_by_category(schemas: List[SchemaStructure]) -> Dict[str, List[str]]
for c in s.categories:
if c not in result:
result[c] = []
result[c].append(s.type)
# lowercase "acronym"-alike modules
schema_group_normalized = s.schema_group.lower() if s.schema_group.isupper() else s.schema_group
result[c].append(schema_group_normalized + ':' + s.type)
result[c].sort()
return result


def _do_resolve_extends(source_schema, schema, schema_group, directory_structure: DirectoryStructure):
# Autocomplete with the correct namespace, just rebuild it for older versions (replace part)
if TEMPLATE_PROPERTY_TYPE in schema:
schema_group_normalized = source_schema.schema_group.lower() if source_schema.schema_group.isupper() else source_schema.schema_group
schema[TEMPLATE_PROPERTY_TYPE] = source_schema.namespaces['types'].replace('{MODULE}',
schema_group_normalized) + \
schema[TEMPLATE_PROPERTY_TYPE].split(":")[-1].split("/")[-1]
# Add schema module
schema[TEMPLATE_PROPERTY_MODULE] = source_schema.schema_group

if TEMPLATE_PROPERTY_EXTENDS in schema:
if schema[TEMPLATE_PROPERTY_EXTENDS].startswith("/"):
extends_split = schema[TEMPLATE_PROPERTY_EXTENDS].split("/")
Expand Down Expand Up @@ -129,7 +141,20 @@ def _apply_extension(source, extension):
source["properties"][k] = extension["properties"][k]


def _do_resolve_categories(schema: SchemaStructure, schemas_by_category):
def _do_resolve_categories(version:str, schema: SchemaStructure, schemas_by_category):

def _namespace_completion_categories(schema_payload, schema, p, template_property):
def _build_namespace_type(_type):
# if _type is an URI rebuild it
# else _type consists of prefix:name_type
module = _type.split("/")[-2] if '/' in _type else _type.split(":")[0]
name_type = _type.split("/")[-1] if '/' in _type else _type.split(":")[-1]
return schema.namespaces['types'].replace('{MODULE}', module) + name_type

schema_payload["properties"][p][template_property] = [
_build_namespace_type(_type) for _type in schema_payload["properties"][p][template_property]]
return schema_payload

with open(schema.absolute_path, "r") as schema_file:
schema_payload = json.load(schema_file)
if "properties" in schema_payload:
Expand All @@ -150,6 +175,13 @@ def _do_resolve_categories(schema: SchemaStructure, schemas_by_category):
embedded_types.extend(schemas_by_category[embedded_category])
schema_payload["properties"][p][TEMPLATE_PROPERTY_EMBEDDED_TYPES] = sorted(embedded_types)
del schema_payload["properties"][p][TEMPLATE_PROPERTY_EMBEDDED_CATEGORIES]

# Write namespace for '_linkedTypes' and '_embeddedTypes'
if TEMPLATE_PROPERTY_LINKED_TYPES in schema_payload["properties"][p]:
_namespace_completion_categories(schema_payload, schema, p, TEMPLATE_PROPERTY_LINKED_TYPES)

if TEMPLATE_PROPERTY_EMBEDDED_TYPES in schema_payload["properties"][p]:
_namespace_completion_categories(schema_payload, schema, p, TEMPLATE_PROPERTY_EMBEDDED_TYPES)

with open(schema.absolute_path, "w") as target_file:
target_file.write(json.dumps(schema_payload, indent=2))

42 changes: 25 additions & 17 deletions openMINDS_pipeline/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from packaging.utils import canonicalize_version
from packaging.version import Version

from openMINDS_pipeline.constants import SCHEMA_FILE_ENDING, OPENMINDS_VOCAB
from openMINDS_pipeline.constants import SCHEMA_FILE_ENDING
from openMINDS_pipeline.models import Trigger, OpenMINDSModule, DirectoryStructure, SchemaStructure
from openMINDS_pipeline.resolver import TEMPLATE_PROPERTY_TYPE

Expand Down Expand Up @@ -47,7 +47,7 @@ def get_basic_type(property_definition:dict) -> Optional[str]:
return basic_type


def evaluate_versions_to_be_built(version_config: str, trigger:Optional[Trigger]) -> Dict[str, Dict[str, OpenMINDSModule]]:
def evaluate_versions_to_be_built(version_config: str, trigger:Optional[Trigger]) -> (Dict[str, Dict[str, OpenMINDSModule]], Dict[str, str]):
"""
:return: the dictionary describing all versions supposed to be built either because of a change or because of a build of everything.
"""
Expand All @@ -65,22 +65,29 @@ def evaluate_versions_to_be_built(version_config: str, trigger:Optional[Trigger]
if os.path.exists("pipeline"):
shutil.rmtree("pipeline")
relevant_versions = {}
for version, modules in versions.items():
namespaces = {}

for version, bundle in versions.items():
triggering_module = None
is_dynamic = False
new_modules = {}
for module, module_spec in modules.items():
m = OpenMINDSModule(**module_spec)
if not m.commit:
is_dynamic = True
_evaluate_branch_and_commit_for_dynamic_instances(m)
if trigger and m.repository and m.repository.endswith(f"{trigger.repository}.git"):
triggering_module = m
new_modules[module] = m

for entry, entry_spec in bundle.items():
if entry == "namespaces":
namespaces[version] = bundle.get("namespaces", {})
if entry == "modules":
for module_name, module_spec in bundle[entry].items():
m = OpenMINDSModule(**module_spec)
if not m.commit:
is_dynamic = True
_evaluate_branch_and_commit_for_dynamic_instances(m)
if trigger and m.repository and m.repository.endswith(f"{trigger.repository}.git"):
triggering_module = m
new_modules[module_name] = m
# The version is only relevant if the process was not launched by a submodule change (so everything is built) or if the triggering module is specified with the given branch
if not trigger or (is_dynamic and triggering_module and triggering_module.branch and triggering_module.branch == trigger.branch):
relevant_versions[version] = new_modules
return relevant_versions
return relevant_versions, namespaces


def _evaluate_branch_and_commit_for_dynamic_instances(module_spec:OpenMINDSModule):
Expand All @@ -102,7 +109,7 @@ def _evaluate_branch_and_commit_for_dynamic_instances(module_spec:OpenMINDSModul
module_spec.commit = branch_to_commit[module_spec.branch]


def find_schemas(directory_structure: DirectoryStructure, modules: Dict[str, OpenMINDSModule]) -> List[SchemaStructure]:
def find_schemas(directory_structure: DirectoryStructure, modules: Dict[str, OpenMINDSModule], namespaces: Dict[str, str]) -> List[SchemaStructure]:
schema_information = []
for schema_group in directory_structure.find_resource_directories(file_ending=SCHEMA_FILE_ENDING):
schema_group = schema_group.split("/")[0]
Expand All @@ -116,7 +123,9 @@ def find_schemas(directory_structure: DirectoryStructure, modules: Dict[str, Ope
with open(schema_path, "r") as schema_file:
schema = json.load(schema_file)
if TEMPLATE_PROPERTY_TYPE in schema:
schema_information.append(SchemaStructure(schema[TEMPLATE_PROPERTY_TYPE], schema_group, version, relative_schema_path))
# remove namespace, will be rebuilt in resolve_extends and resolve_categories
schema[TEMPLATE_PROPERTY_TYPE] = schema[TEMPLATE_PROPERTY_TYPE].split(":")[-1].split("/")[-1]
schema_information.append(SchemaStructure(schema[TEMPLATE_PROPERTY_TYPE], schema_group, version, relative_schema_path, namespaces))
else:
print(f"Skipping schema {relative_schema_path} because it doesn't contain a valid type")
except JSONDecodeError:
Expand All @@ -133,12 +142,11 @@ def qualify_property_names(schemas:List[SchemaStructure]):
if "properties" in schema_payload:
new_properties = {}
for p, v in schema_payload["properties"].items():
new_properties[f"{OPENMINDS_VOCAB}{p}"] = v
new_properties[f"{schema.namespaces['props']}{p}"] = v
schema_payload["properties"] = new_properties
if "required" in schema_payload:
schema_payload["required"] = [f"{OPENMINDS_VOCAB}{p}" for p in schema_payload["required"]]
schema_payload["required"] = [f"{schema.namespaces['props']}{p}" for p in schema_payload["required"]]
schema_payload["required"].sort()

with open(schema.absolute_path, "w") as target_file:
target_file.write(json.dumps(schema_payload, indent=2, sort_keys=True))

Expand Down
Loading

0 comments on commit 65c8706

Please sign in to comment.