From 4c1ab8b09b45242a233451a60aab933dd3fb6f5d Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Thu, 14 Nov 2024 10:13:52 -0500 Subject: [PATCH 1/2] fix: correct syntax error in `gene_id_constraint` (#397) * Had extra `}` --- src/metakb/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metakb/database.py b/src/metakb/database.py index 350ba172..aa42dafa 100644 --- a/src/metakb/database.py +++ b/src/metakb/database.py @@ -74,7 +74,7 @@ def _get_credentials( _CONSTRAINTS = { "coding_constraint": "CREATE CONSTRAINT coding_constraint IF NOT EXISTS FOR (c:Coding) REQUIRE (c.code, c.label, c.system) IS UNIQUE;", - "gene_id_constraint": "CREATE CONSTRAINT gene_id_constraint IF NOT EXISTS FOR (n:Gene}) REQUIRE n.id IS UNIQUE;", + "gene_id_constraint": "CREATE CONSTRAINT gene_id_constraint IF NOT EXISTS FOR (n:Gene) REQUIRE n.id IS UNIQUE;", "disease_id_constraint": "CREATE CONSTRAINT disease_id_constraint IF NOT EXISTS FOR (n:Disease) REQUIRE n.id IS UNIQUE;", "therapeuticprocedure_id_constraint": "CREATE CONSTRAINT therapeuticprocedure_id_constraint IF NOT EXISTS FOR (n:TherapeuticProcedure) REQUIRE n.id IS UNIQUE;", "variation_id_constraint": "CREATE CONSTRAINT variation_id_constraint IF NOT EXISTS FOR (n:Variation) REQUIRE n.id IS UNIQUE;", From 261c06508c6c475542b14dc2161759e568424442 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 20 Nov 2024 06:59:21 -0500 Subject: [PATCH 2/2] feat!: use cat-vrs-python + va-spec-python (#404) close #388 , #395, #309 * Breaking changes * Update/add ga4gh packages * Add `ga4gh.cat_vrs` + `ga4gh.va_spec` packages * The Pydantic models in these packages replace the manually created models in `metakb/schemas/annotation.py`, `metakb/schemas/categorical_variation.py`, and `metakb/schemas/variation_statement.py`. (#388) * `ga4gh.vrs` version bumped * The models in all ga4gh packages caused breaking changes (mainly renaming) to the evidence models (such as #395) * Represent categorical variation members and constraints properly (#309) * Standardize representing normalizer data in extensions * The `extension.name` will always be `vicc_normalizer_data` and value will contain `id`, `label`, and optional `mondo_id` (for disease) * Simplify assertion checks in tests --- .../api/metakb.schemas.annotation.rst | 8 - .../metakb.schemas.categorical_variation.rst | 8 - .../metakb.schemas.variation_statement.rst | 8 - docs/source/reference/index.rst | 3 - pyproject.toml | 6 +- src/metakb/database.py | 4 +- src/metakb/load_data.py | 95 ++--- src/metakb/normalizers.py | 24 ++ src/metakb/query.py | 162 ++++---- src/metakb/schemas/annotation.py | 161 -------- src/metakb/schemas/api.py | 8 +- src/metakb/schemas/categorical_variation.py | 145 ------- src/metakb/schemas/variation_statement.py | 187 --------- src/metakb/transformers/base.py | 127 +++---- src/metakb/transformers/civic.py | 176 +++++---- src/metakb/transformers/moa.py | 115 +++--- tests/conftest.py | 355 +++++++++--------- .../therapeutic/civic_harvester.json | 14 +- tests/unit/database/test_database.py | 151 +++++--- tests/unit/search/test_search_studies.py | 18 +- .../unit/transformers/test_moa_transformer.py | 83 ++-- 21 files changed, 715 insertions(+), 1143 deletions(-) delete mode 100644 docs/source/reference/api/metakb.schemas.annotation.rst delete mode 100644 docs/source/reference/api/metakb.schemas.categorical_variation.rst delete mode 100644 docs/source/reference/api/metakb.schemas.variation_statement.rst delete mode 100644 src/metakb/schemas/annotation.py delete mode 100644 src/metakb/schemas/categorical_variation.py delete mode 100644 src/metakb/schemas/variation_statement.py diff --git a/docs/source/reference/api/metakb.schemas.annotation.rst b/docs/source/reference/api/metakb.schemas.annotation.rst deleted file mode 100644 index a3e2d7cf..00000000 --- a/docs/source/reference/api/metakb.schemas.annotation.rst +++ /dev/null @@ -1,8 +0,0 @@ -metakb.schemas.annotation -========================= - -.. automodule:: metakb.schemas.annotation - :members: - :undoc-members: - :special-members: __init__ - :exclude-members: model_fields, model_config, model_computed_fields \ No newline at end of file diff --git a/docs/source/reference/api/metakb.schemas.categorical_variation.rst b/docs/source/reference/api/metakb.schemas.categorical_variation.rst deleted file mode 100644 index e19603c8..00000000 --- a/docs/source/reference/api/metakb.schemas.categorical_variation.rst +++ /dev/null @@ -1,8 +0,0 @@ -metakb.schemas.categorical_variation -==================================== - -.. automodule:: metakb.schemas.categorical_variation - :members: - :undoc-members: - :special-members: __init__ - :exclude-members: model_fields, model_config, model_computed_fields \ No newline at end of file diff --git a/docs/source/reference/api/metakb.schemas.variation_statement.rst b/docs/source/reference/api/metakb.schemas.variation_statement.rst deleted file mode 100644 index 3f85e34e..00000000 --- a/docs/source/reference/api/metakb.schemas.variation_statement.rst +++ /dev/null @@ -1,8 +0,0 @@ -metakb.schemas.variation_statement -================================== - -.. automodule:: metakb.schemas.variation_statement - :members: - :undoc-members: - :special-members: __init__ - :exclude-members: model_fields, model_config, model_computed_fields \ No newline at end of file diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 0d5e9402..670182b0 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -34,11 +34,8 @@ Data Schemas :toctree: api/ :template: module_summary.rst - metakb.schemas.annotation metakb.schemas.api metakb.schemas.app - metakb.schemas.categorical_variation - metakb.schemas.variation_statement Harvesters ---------- diff --git a/pyproject.toml b/pyproject.toml index 99744c33..982b0abc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,9 @@ requires-python = ">=3.10" description = "A search interface for cancer variant interpretations assembled by aggregating and harmonizing across multiple cancer variant interpretation knowledgebases." license = {file = "LICENSE"} dependencies = [ - "ga4gh.vrs~=2.0.0a10", + "ga4gh.vrs~=2.0.0a12", + "ga4gh.cat_vrs~=0.1.0", + "ga4gh.va_spec~=0.1.0", "gene-normalizer[etl]~=0.4.1", "variation-normalizer~=0.10.0", "disease-normalizer[etl]~=0.5.0", @@ -42,7 +44,7 @@ dependencies = [ dynamic = ["version"] [project.optional-dependencies] -tests = ["pytest", "pytest-cov", "mock", "pytest-asyncio"] +tests = ["pytest", "pytest-cov", "mock", "pytest-asyncio", "deepdiff"] dev = ["pre-commit>=3.7.1", "ruff==0.5.0"] notebooks = ["ipykernel", "jupyterlab"] docs = [ diff --git a/src/metakb/database.py b/src/metakb/database.py index aa42dafa..bbbe4aee 100644 --- a/src/metakb/database.py +++ b/src/metakb/database.py @@ -78,11 +78,11 @@ def _get_credentials( "disease_id_constraint": "CREATE CONSTRAINT disease_id_constraint IF NOT EXISTS FOR (n:Disease) REQUIRE n.id IS UNIQUE;", "therapeuticprocedure_id_constraint": "CREATE CONSTRAINT therapeuticprocedure_id_constraint IF NOT EXISTS FOR (n:TherapeuticProcedure) REQUIRE n.id IS UNIQUE;", "variation_id_constraint": "CREATE CONSTRAINT variation_id_constraint IF NOT EXISTS FOR (n:Variation) REQUIRE n.id IS UNIQUE;", - "categoricalvariation_id_constraint": "CREATE CONSTRAINT categoricalvariation_id_constraint IF NOT EXISTS FOR (n:CategoricalVariation) REQUIRE n.id IS UNIQUE;", + "categoricalvariant_id_constraint": "CREATE CONSTRAINT categoricalvariant_id_constraint IF NOT EXISTS FOR (n:CategoricalVariant) REQUIRE n.id IS UNIQUE;", "variantgroup_id_constraint": "CREATE CONSTRAINT variantgroup_id_constraint IF NOT EXISTS FOR (n:VariantGroup) REQUIRE n.id IS UNIQUE;", "location_id_constraint": "CREATE CONSTRAINT location_id_constraint IF NOT EXISTS FOR (n:Location) REQUIRE n.id IS UNIQUE;", "document_id_constraint": "CREATE CONSTRAINT document_id_constraint IF NOT EXISTS FOR (n:Document) REQUIRE n.id IS UNIQUE;", - "study_id_constraint": "CREATE CONSTRAINT study_id_constraint IF NOT EXISTS FOR (n:Study) REQUIRE n.id IS UNIQUE;", + "statement_id_constraint": "CREATE CONSTRAINT statement_id_constraint IF NOT EXISTS FOR (n:Statement) REQUIRE n.id IS UNIQUE;", "method_id_constraint": "CREATE CONSTRAINT method_id_constraint IF NOT EXISTS FOR (n:Method) REQUIRE n.id IS UNIQUE;", } diff --git a/src/metakb/load_data.py b/src/metakb/load_data.py index 0c6028a3..60327b79 100644 --- a/src/metakb/load_data.py +++ b/src/metakb/load_data.py @@ -7,6 +7,7 @@ from neo4j import Driver, ManagedTransaction from metakb.database import get_driver +from metakb.normalizers import VICC_NORMALIZER_DATA, ViccDiseaseNormalizerData _logger = logging.getLogger(__name__) @@ -16,7 +17,7 @@ def _create_parameterized_query( ) -> str: """Create parameterized query string for requested params if non-null in entity. - :param entity: entity to check against, eg a Variation or Study + :param entity: entity to check against, eg a Variation or Statement :param params: Parameter names to check :param entity_param_prefix: Prefix for parameter names in entity object :return: Parameterized query, such as (`name:$name`) @@ -41,10 +42,15 @@ def _add_mappings_and_exts_to_obj(obj: dict, obj_keys: list[str]) -> None: extensions = obj.get("extensions", []) for ext in extensions: - if ext["name"].endswith("_normalizer_data"): - obj_type = ext["name"].split("_normalizer_data")[0] - name = f"{obj_type}_normalizer_id" - obj[name] = ext["value"]["normalized_id"] + if ext["name"] == VICC_NORMALIZER_DATA: + for normalized_field in ViccDiseaseNormalizerData.model_fields: + normalized_val = ext["value"].get(normalized_field) + if normalized_val is None: + continue + + name = f"normalizer_{normalized_field}" + obj[name] = normalized_val + obj_keys.append(f"{name}:${name}") else: name = "_".join(ext["name"].split()).lower() val = ext["value"] @@ -52,7 +58,7 @@ def _add_mappings_and_exts_to_obj(obj: dict, obj_keys: list[str]) -> None: obj[name] = json.dumps(val) else: obj[name] = val - obj_keys.append(f"{name}:${name}") + obj_keys.append(f"{name}:${name}") def _add_method(tx: ManagedTransaction, method: dict, ids_in_studies: set[str]) -> None: @@ -69,11 +75,13 @@ def _add_method(tx: ManagedTransaction, method: dict, ids_in_studies: set[str]) MERGE (m:Method {id:$id, label:$label}) """ - is_reported_in = method.get("isReportedIn") + is_reported_in = method.get("reportedIn") if is_reported_in: # Method's documents are unique and do not currently have IDs - _add_document(tx, is_reported_in, ids_in_studies) - doc_doi = is_reported_in["doi"] + # They also only have one document + document = is_reported_in[0] + _add_document(tx, document, ids_in_studies) + doc_doi = document["doi"] query += f""" MERGE (d:Document {{ doi:'{doc_doi}' }}) MERGE (m) -[:IS_REPORTED_IN] -> (d) @@ -253,21 +261,21 @@ def _add_variation(tx: ManagedTransaction, variation_in: dict) -> None: tx.run(query, **v) -def _add_categorical_variation( +def _add_categorical_variant( tx: ManagedTransaction, - categorical_variation_in: dict, + categorical_variant_in: dict, ids_in_studies: set[str], ) -> None: - """Add categorical variation objects to DB. + """Add categorical variant objects to DB. :param tx: Transaction object provided to transaction functions - :param categorical_variation_in: Categorical variation CDM object + :param categorical_variant_in: Categorical variant CDM object :param ids_in_studies: IDs found in studies """ - if categorical_variation_in["id"] not in ids_in_studies: + if categorical_variant_in["id"] not in ids_in_studies: return - cv = categorical_variation_in.copy() + cv = categorical_variant_in.copy() mp_nonnull_keys = [ _create_parameterized_query( @@ -278,7 +286,7 @@ def _add_categorical_variation( _add_mappings_and_exts_to_obj(cv, mp_nonnull_keys) mp_keys = ", ".join(mp_nonnull_keys) - defining_context = cv["definingContext"] + defining_context = cv["constraints"][0]["definingContext"] _add_variation(tx, defining_context) dc_type = defining_context["type"] @@ -293,9 +301,9 @@ def _add_categorical_variation( query = f""" {members_match} - MERGE (dc:{dc_type}:Variation {{ id: '{defining_context['id']}' }}) + MERGE (dc:Variation:{dc_type} {{ id: '{defining_context['id']}' }}) MERGE (dc) -[:HAS_LOCATION] -> (loc) - MERGE (v:{cv['type']}:CategoricalVariation {{ {mp_keys} }}) + MERGE (v:Variation:{cv['type']} {{ {mp_keys} }}) MERGE (v) -[:HAS_DEFINING_CONTEXT] -> (dc) {members_relation} """ @@ -330,7 +338,7 @@ def _add_document( document = document_in.copy() formatted_keys = [ _create_parameterized_query( - document, ("id", "label", "title", "pmid", "url", "doi") + document, ("id", "label", "title", "pmid", "urls", "doi") ) ] @@ -365,11 +373,11 @@ def _add_obj_id_to_set(obj: dict, ids_set: set[str]) -> None: for study in studies: for obj in [ study.get("specifiedBy"), # method - study.get("isReportedIn"), - study.get("variant"), - study.get("therapeutic"), - study.get("tumorType"), - study.get("qualifiers", {}).get("geneContext"), + study.get("reportedIn"), + study.get("subjectVariant"), + study.get("objectTherapeutic"), + study.get("conditionQualifier"), + study.get("geneContextQualifier"), ]: if obj: if isinstance(obj, list): @@ -385,7 +393,7 @@ def _add_study(tx: ManagedTransaction, study_in: dict) -> None: """Add study node and its relationships :param tx: Transaction object provided to transaction functions - :param study_in: Study CDM object + :param study_in: Statement CDM object """ study = study_in.copy() study_type = study["type"] @@ -396,23 +404,22 @@ def _add_study(tx: ManagedTransaction, study_in: dict) -> None: match_line = "" rel_line = "" - is_reported_in_docs = study.get("isReportedIn", []) + is_reported_in_docs = study.get("reportedIn", []) for ri_doc in is_reported_in_docs: ri_doc_id = ri_doc["id"] name = f"doc_{ri_doc_id.split(':')[-1]}" match_line += f"MERGE ({name} {{ id: '{ri_doc_id}'}})\n" rel_line += f"MERGE (s) -[:IS_REPORTED_IN] -> ({name})\n" - qualifiers = study.get("qualifiers") - if qualifiers: - allele_origin = qualifiers.get("alleleOrigin") - study["alleleOrigin"] = allele_origin - match_line += "SET s.alleleOrigin=$alleleOrigin\n" + allele_origin = study.get("alleleOriginQualifier") + if allele_origin: + study["alleleOriginQualifier"] = allele_origin + match_line += "SET s.alleleOriginQualifier=$alleleOriginQualifier\n" - gene_context_id = qualifiers.get("geneContext", {}).get("id") - if gene_context_id: - match_line += f"MERGE (g:Gene {{id: '{gene_context_id}'}})\n" - rel_line += "MERGE (s) -[:HAS_GENE_CONTEXT] -> (g)\n" + gene_context_id = study.get("geneContextQualifier", {}).get("id") + if gene_context_id: + match_line += f"MERGE (g:Gene {{id: '{gene_context_id}'}})\n" + rel_line += "MERGE (s) -[:HAS_GENE_CONTEXT] -> (g)\n" method_id = study["specifiedBy"]["id"] match_line += f"MERGE (m {{ id: '{method_id}' }})\n" @@ -433,24 +440,20 @@ def _add_study(tx: ManagedTransaction, study_in: dict) -> None: match_line += f"MERGE (c:Coding {{ {coding_keys} }})\n" rel_line += "MERGE (s) -[:HAS_STRENGTH] -> (c)\n" - variant_id = study["variant"]["id"] - if study["variant"]["type"] == "ProteinSequenceConsequence": - v_parent_type = "CategoricalVariation" - else: - v_parent_type = "Variation" - match_line += f"MERGE (v:{v_parent_type} {{ id: '{variant_id}' }})\n" + variant_id = study["subjectVariant"]["id"] + match_line += f"MERGE (v:Variation {{ id: '{variant_id}' }})\n" rel_line += "MERGE (s) -[:HAS_VARIANT] -> (v)\n" - therapeutic_id = study["therapeutic"]["id"] + therapeutic_id = study["objectTherapeutic"]["id"] match_line += f"MERGE (t:TherapeuticProcedure {{ id: '{therapeutic_id}' }})\n" rel_line += "MERGE (s) -[:HAS_THERAPEUTIC] -> (t)\n" - tumor_type_id = study["tumorType"]["id"] + tumor_type_id = study["conditionQualifier"]["id"] match_line += f"MERGE (tt:Condition {{ id: '{tumor_type_id}' }})\n" rel_line += "MERGE (s) -[:HAS_TUMOR_TYPE] -> (tt)\n" query = f""" - MERGE (s:{study_type}:Study {{ {study_keys} }}) + MERGE (s:{study_type}:StudyStatement:Statement {{ {study_keys} }}) {match_line} {rel_line} """ @@ -472,8 +475,8 @@ def add_transformed_data(driver: Driver, data: dict) -> None: with driver.session() as session: loaded_study_count = 0 - for cv in data.get("categorical_variations", []): - session.execute_write(_add_categorical_variation, cv, ids_in_studies) + for cv in data.get("categorical_variants", []): + session.execute_write(_add_categorical_variant, cv, ids_in_studies) for doc in data.get("documents", []): session.execute_write(_add_document, doc, ids_in_studies) diff --git a/src/metakb/normalizers.py b/src/metakb/normalizers.py index adb8c006..572c7568 100644 --- a/src/metakb/normalizers.py +++ b/src/metakb/normalizers.py @@ -4,6 +4,7 @@ import os from collections.abc import Iterable from enum import Enum +from typing import Literal from botocore.exceptions import TokenRetrievalError from disease.cli import update_db as update_disease_db @@ -22,6 +23,7 @@ from gene.database.database import AWS_ENV_VAR_NAME as GENE_AWS_ENV_VAR_NAME from gene.query import QueryHandler as GeneQueryHandler from gene.schemas import NormalizeService as NormalizedGene +from pydantic import BaseModel from therapy.cli import update_normalizer_db as update_therapy_db from therapy.database import create_db as create_therapy_db from therapy.database.database import AWS_ENV_VAR_NAME as THERAPY_AWS_ENV_VAR_NAME @@ -42,6 +44,28 @@ _logger = logging.getLogger(__name__) +class ViccNormalizerData(BaseModel, extra="forbid"): + """Define model for representing VICC normalizer data""" + + id: str + label: str + + +class ViccDiseaseNormalizerData(ViccNormalizerData, extra="forbid"): + """Define model for representing VICC disease normalizer data""" + + mondo_id: str | None = None + + +VICC_NORMALIZER_DATA = "vicc_normalizer_data" + + +class ViccNormalizerDataExtension(Extension): + """Define model for representing VICC normalizer data as an Extension""" + + name: Literal["vicc_normalizer_data"] = VICC_NORMALIZER_DATA + + class ViccNormalizers: """Manage VICC concept normalization services. diff --git a/src/metakb/query.py b/src/metakb/query.py index ffe9ef0b..cb66a2a6 100644 --- a/src/metakb/query.py +++ b/src/metakb/query.py @@ -5,21 +5,30 @@ from copy import copy from enum import Enum +from ga4gh.cat_vrs.core_models import CategoricalVariant, DefiningContextConstraint from ga4gh.core.domain_models import ( + CommonDomainType, Disease, Gene, TherapeuticAgent, TherapeuticProcedure, ) -from ga4gh.core.entity_models import Coding, Expression, Extension -from ga4gh.vrs.models import Variation +from ga4gh.core.entity_models import Coding, Document, Extension, Method +from ga4gh.va_spec.profiles.var_study_stmt import ( + VariantTherapeuticResponseStudyStatement, +) +from ga4gh.vrs.models import Expression, Variation from neo4j import Driver from neo4j.graph import Node from pydantic import ValidationError from metakb.database import get_driver -from metakb.normalizers import ViccNormalizers -from metakb.schemas.annotation import Document, Method +from metakb.normalizers import ( + ViccDiseaseNormalizerData, + ViccNormalizerData, + ViccNormalizerDataExtension, + ViccNormalizers, +) from metakb.schemas.api import ( BatchSearchStudiesQuery, BatchSearchStudiesService, @@ -28,11 +37,6 @@ ServiceMeta, ) from metakb.schemas.app import SourceName -from metakb.schemas.categorical_variation import CategoricalVariation -from metakb.schemas.variation_statement import ( - VariantTherapeuticResponseStudy, - _VariantOncogenicityStudyQualifier, -) logger = logging.getLogger(__name__) @@ -43,7 +47,7 @@ class PaginationParamError(Exception): class VariationRelation(str, Enum): """Constrain possible values for the relationship between variations and - categorical variations. + categorical variants. """ HAS_MEMBERS = "HAS_MEMBERS" @@ -154,7 +158,7 @@ async def search_studies( >>> result = qh.search_studies("BRAF V600E") >>> result.study_ids[:3] ['moa.assertion:944', 'moa.assertion:911', 'moa.assertion:865'] - >>> result.studies[0].isReportedIn[0].url + >>> result.studies[0].reportedIn[0].urls[0] 'https://www.accessdata.fda.gov/drugsatfda_docs/label/2020/202429s019lbl.pdf' Variation, disease, therapy, and gene terms are resolved via their respective @@ -384,7 +388,7 @@ def _get_study_by_id(self, study_id: str) -> Node | None: :return: Study node if successful """ query = """ - MATCH (s:Study) + MATCH (s:Statement) WHERE toLower(s.id) = toLower($study_id) RETURN s """ @@ -414,32 +418,32 @@ def _get_studies( :param normalized_gene: normalized gene concept ID :return: List of Study nodes that match the intersection of the given parameters """ - query = "MATCH (s:Study)" + query = "MATCH (s:Statement)" params: dict[str, str | int] = {} if normalized_variation: query += """ - MATCH (s) -[:HAS_VARIANT] -> (cv:CategoricalVariation) + MATCH (s) -[:HAS_VARIANT] -> (cv:CategoricalVariant) MATCH (cv) -[:HAS_DEFINING_CONTEXT|HAS_MEMBERS] -> (v:Variation {id:$v_id}) """ params["v_id"] = normalized_variation if normalized_disease: query += """ - MATCH (s) -[:HAS_TUMOR_TYPE] -> (c:Condition {disease_normalizer_id:$c_id}) + MATCH (s) -[:HAS_TUMOR_TYPE] -> (c:Condition {normalizer_id:$c_id}) """ params["c_id"] = normalized_disease if normalized_gene: query += """ - MATCH (s) -[:HAS_GENE_CONTEXT] -> (g:Gene {gene_normalizer_id:$g_id}) + MATCH (s) -[:HAS_GENE_CONTEXT] -> (g:Gene {normalizer_id:$g_id}) """ params["g_id"] = normalized_gene if normalized_therapy: query += """ - OPTIONAL MATCH (s) -[:HAS_THERAPEUTIC] -> (tp:TherapeuticAgent {therapy_normalizer_id:$t_id}) - OPTIONAL MATCH (s) -[:HAS_THERAPEUTIC] -> () -[:HAS_SUBSTITUTES|HAS_COMPONENTS] -> (ta:TherapeuticAgent {therapy_normalizer_id:$t_id}) + OPTIONAL MATCH (s) -[:HAS_THERAPEUTIC] -> (tp:TherapeuticAgent {normalizer_id:$t_id}) + OPTIONAL MATCH (s) -[:HAS_THERAPEUTIC] -> () -[:HAS_SUBSTITUTES|HAS_COMPONENTS] -> (ta:TherapeuticAgent {normalizer_id:$t_id}) WITH s, tp, ta WHERE tp IS NOT NULL OR ta IS NOT NULL """ @@ -484,19 +488,19 @@ def _get_nested_studies(self, study_nodes: list[Node]) -> list[dict]: def _get_nested_study(self, study_node: Node) -> dict: """Get information related to a study - Only VariantTherapeuticResponseStudy are supported at the moment + Only VariantTherapeuticResponseStudyStatement are supported at the moment :param study_node: Neo4j graph node for study :return: Nested study """ - if study_node["type"] != "VariantTherapeuticResponseStudy": + if study_node["type"] != "VariantTherapeuticResponseStudyStatement": return {} params = { - "tumorType": None, - "variant": None, + "conditionQualifier": None, + "subjectVariant": None, "strength": None, - "isReportedIn": [], + "reportedIn": [], "specifiedBy": None, } params.update(study_node) @@ -504,7 +508,7 @@ def _get_nested_study(self, study_node: Node) -> dict: # Get relationship and nodes for a study query = """ - MATCH (s:Study { id: $study_id }) + MATCH (s:Statement { id: $study_id }) OPTIONAL MATCH (s)-[r]-(n) RETURN type(r) as r_type, n; """ @@ -516,51 +520,71 @@ def _get_nested_study(self, study_node: Node) -> dict: node = data["n"] if rel_type == "HAS_TUMOR_TYPE": - params["tumorType"] = self._get_disease(node) + params["conditionQualifier"] = self._get_disease(node) elif rel_type == "HAS_VARIANT": - params["variant"] = self._get_cat_var(node) + params["subjectVariant"] = self._get_cat_var(node) elif rel_type == "HAS_GENE_CONTEXT": - params["qualifiers"] = self._get_variant_onco_study_qualifier( - study_id, study_node.get("alleleOrigin") + params["geneContextQualifier"] = self._get_gene_context_qualifier( + study_id + ) + params["alleleOriginQualifier"] = study_node.get( + "alleleOriginQualifier" ) elif rel_type == "IS_SPECIFIED_BY": - node["isReportedIn"] = self._get_method_document(node["id"]) + node["reportedIn"] = [self._get_method_document(node["id"])] params["specifiedBy"] = Method(**node) elif rel_type == "IS_REPORTED_IN": - params["isReportedIn"].append(self._get_document(node)) + params["reportedIn"].append(self._get_document(node)) elif rel_type == "HAS_STRENGTH": params["strength"] = Coding(**node) elif rel_type == "HAS_THERAPEUTIC": - params["therapeutic"] = self._get_therapeutic_procedure(node) + params["objectTherapeutic"] = self._get_therapeutic_procedure(node) else: logger.warning("relation type not supported: %s", rel_type) - return VariantTherapeuticResponseStudy(**params).model_dump() + return VariantTherapeuticResponseStudyStatement(**params).model_dump() @staticmethod - def _get_disease(node: dict) -> Disease: + def _get_vicc_normalizer_extension(node: dict) -> ViccNormalizerDataExtension: + """Get VICC Normalizer extension data + + :param node: Therapy, disease, or gene node data + :return: VICC Normalizer extension data + """ + params = { + "id": node["normalizer_id"], + "label": node["normalizer_label"], + } + + if node["type"] == CommonDomainType.DISEASE: + params["mondo_id"] = node.get("normalizer_mondo_id") + ext_val = ViccDiseaseNormalizerData(**params) + else: + ext_val = ViccNormalizerData(**params) + + return ViccNormalizerDataExtension(value=ext_val.model_dump()) + + def _get_disease(self, node: dict) -> Disease: """Get disease data from a node with relationship ``HAS_TUMOR_TYPE`` :param node: Disease node data :return: Disease object """ node["mappings"] = _deserialize_field(node, "mappings") - node["extensions"] = [ - Extension(name="disease_normalizer_id", value=node["disease_normalizer_id"]) - ] + node["extensions"] = [self._get_vicc_normalizer_extension(node)] return Disease(**node) def _get_variations(self, cv_id: str, relation: VariationRelation) -> list[dict]: - """Get list of variations associated to categorical variation + """Get list of variations associated to categorical variant - :param cv_id: ID for categorical variation - :param relation: Relation type for categorical variation and variation - :return: List of variations with `relation` to categorical variation. If + :param cv_id: ID for categorical variant + :param relation: Relation type for categorical variant and variation + :return: List of variations with `relation` to categorical variant. If VariationRelation.HAS_MEMBERS, returns at least one variation. Otherwise, returns exactly one variation """ query = f""" - MATCH (v:Variation) <- [:{relation.value}] - (cv:CategoricalVariation + MATCH (v:Variation) <- [:{relation.value}] - (cv:CategoricalVariant {{ id: $cv_id }}) MATCH (loc:Location) <- [:HAS_LOCATION] - (v) RETURN v, loc @@ -590,11 +614,11 @@ def _get_variations(self, cv_id: str, relation: VariationRelation) -> list[dict] variations.append(Variation(**v_params).model_dump()) return variations - def _get_cat_var(self, node: dict) -> CategoricalVariation: - """Get categorical variation data from a node with relationship ``HAS_VARIANT`` + def _get_cat_var(self, node: dict) -> CategoricalVariant: + """Get categorical variant data from a node with relationship ``HAS_VARIANT`` :param node: Variant node data. This will be mutated. - :return: Categorical Variation data + :return: Categorical Variant data """ node["mappings"] = _deserialize_field(node, "mappings") @@ -624,25 +648,26 @@ def _get_cat_var(self, node: dict) -> CategoricalVariation: ) node["extensions"] = extensions or None - node["definingContext"] = self._get_variations( - node["id"], VariationRelation.HAS_DEFINING_CONTEXT - )[0] + node["constraints"] = [ + DefiningContextConstraint( + definingContext=self._get_variations( + node["id"], VariationRelation.HAS_DEFINING_CONTEXT + )[0] + ) + ] node["members"] = self._get_variations( node["id"], VariationRelation.HAS_MEMBERS ) - return CategoricalVariation(**node) + return CategoricalVariant(**node) - def _get_variant_onco_study_qualifier( - self, study_id: str, allele_origin: str | None - ) -> _VariantOncogenicityStudyQualifier: - """Get variant oncogenicity study qualifier data for a study + def _get_gene_context_qualifier(self, study_id: str) -> Gene | None: + """Get gene context qualifier data for a study :param study_id: ID of study node - :param allele_origin: Study's allele origin - :return Variant oncogenicity study qualifier data + :return Gene context qualifier data """ query = """ - MATCH (s:Study { id: $study_id }) -[:HAS_GENE_CONTEXT] -> (g:Gene) + MATCH (s:Statement { id: $study_id }) -[:HAS_GENE_CONTEXT] -> (g:Gene) RETURN g """ results = self.driver.execute_query(query, study_id=study_id) @@ -662,14 +687,8 @@ def _get_variant_onco_study_qualifier( gene_node = results.records[0].data()["g"] gene_node["mappings"] = _deserialize_field(gene_node, "mappings") - - gene_node["extensions"] = [ - Extension(name="gene_normalizer_id", value=gene_node["gene_normalizer_id"]) - ] - - return _VariantOncogenicityStudyQualifier( - alleleOrigin=allele_origin, geneContext=Gene(**gene_node) - ) + gene_node["extensions"] = [self._get_vicc_normalizer_extension(gene_node)] + return Gene(**gene_node) def _get_method_document(self, method_id: str) -> Document | None: """Get document for a given method @@ -775,8 +794,7 @@ def _get_therapeutic_agents( therapeutic_agents.append(ta) return therapeutic_agents - @staticmethod - def _get_therapeutic_agent(in_ta_params: dict) -> TherapeuticAgent: + def _get_therapeutic_agent(self, in_ta_params: dict) -> TherapeuticAgent: """Transform input parameters into TherapeuticAgent object :param in_ta_params: Therapeutic Agent node properties @@ -784,11 +802,7 @@ def _get_therapeutic_agent(in_ta_params: dict) -> TherapeuticAgent: """ ta_params = copy(in_ta_params) ta_params["mappings"] = _deserialize_field(ta_params, "mappings") - extensions = [ - Extension( - name="therapy_normalizer_id", value=ta_params["therapy_normalizer_id"] - ) - ] + extensions = [self._get_vicc_normalizer_extension(ta_params)] regulatory_approval = ta_params.get("regulatory_approval") if regulatory_approval: regulatory_approval = json.loads(regulatory_approval) @@ -863,7 +877,7 @@ async def batch_search_studies( if limit is not None or self._default_page_limit is not None: query = """ - MATCH (s) -[:HAS_VARIANT] -> (cv:CategoricalVariation) + MATCH (s) -[:HAS_VARIANT] -> (cv:CategoricalVariant) MATCH (cv) -[:HAS_DEFINING_CONTEXT|HAS_MEMBERS] -> (v:Variation) WHERE v.id IN $v_ids RETURN DISTINCT s @@ -874,7 +888,7 @@ async def batch_search_studies( limit = limit if limit is not None else self._default_page_limit else: query = """ - MATCH (s) -[:HAS_VARIANT] -> (cv:CategoricalVariation) + MATCH (s) -[:HAS_VARIANT] -> (cv:CategoricalVariant) MATCH (cv) -[:HAS_DEFINING_CONTEXT|HAS_MEMBERS] -> (v:Variation) WHERE v.id IN $v_ids RETURN DISTINCT s @@ -886,5 +900,7 @@ async def batch_search_studies( study_nodes = [r[0] for r in result] response.study_ids = [n["id"] for n in study_nodes] studies = self._get_nested_studies(study_nodes) - response.studies = [VariantTherapeuticResponseStudy(**s) for s in studies] + response.studies = [ + VariantTherapeuticResponseStudyStatement(**s) for s in studies + ] return response diff --git a/src/metakb/schemas/annotation.py b/src/metakb/schemas/annotation.py deleted file mode 100644 index 1dae3d26..00000000 --- a/src/metakb/schemas/annotation.py +++ /dev/null @@ -1,161 +0,0 @@ -"""Module containing GK pilot annotation definitions""" - -import datetime -from enum import Enum -from typing import Literal - -from ga4gh.core.entity_models import IRI, Coding, DomainEntity, Entity -from pydantic import Field, StrictInt, StrictStr, constr, field_validator - - -class AgentSubtype(str, Enum): - """Define constraints for agent subtype""" - - PERSON = "person" - ORGANIZATION = "organization" - COMPUTER = "computer" - - -class Direction(str, Enum): - """Define constraints for direction""" - - SUPPORTS = "supports" - REFUTES = "refutes" - NONE = "none" - - -class Document(DomainEntity): - """a representation of a physical or digital document""" - - type: Literal["Document"] = "Document" - title: StrictStr | None = Field(None, description="The title of the Document") - url: constr(pattern="^(https?|s?ftp)://") | None = Field( - None, description="A URL at which the document may be retrieved." - ) - doi: constr(pattern="^10.(\\d+)(\\.\\d+)*\\/[\\w\\-\\.]+") | None = Field( - None, - description="A `Digital Object Identifier _` for the document.", - ) - pmid: StrictInt | None = Field( - None, - description="A `PubMed unique identifier `_.", - ) - - -class Method(Entity): - """A set of instructions that specify how to achieve some objective (e.g. - experimental protocols, curation guidelines, rule sets, etc.) - """ - - type: Literal["Method"] = Field("Method", description="MUST be 'Method'.") - isReportedIn: Document | IRI | None = Field( - None, description="A document in which the information content is expressed." - ) - subtype: Coding | None = Field( - None, - description="A more specific type of entity the method represents (e.g. Variant Interpretation Guideline, Experimental Protocol)", - ) - - -class Agent(Entity): - """An autonomous actor (person, organization, or computational agent) that bears - some form of responsibility for an activity taking place, for the existence of an - entity, or for another agent's activity. - """ - - type: Literal["Agent"] = Field("Agent", description="MUST be 'Agent'.") - name: StrictStr | None = None - subtype: AgentSubtype | None = None - - -class Contribution(Entity): - """The sum of all actions taken by a single agent in contributing to the creation, - modification, assessment, or deprecation of a particular entity (e.g. a Statement, - EvidenceLine, DataItem, Publication, etc.) - """ - - type: Literal["Contribution"] = "Contribution" - contributor: Agent | None = None - date: StrictStr | None = None - activity: Coding | None = Field( - None, - description="SHOULD describe a concept descending from the Contributor Role Ontology.", - ) - - @field_validator("date") - @classmethod - def date_format(cls, v: str | None) -> str | None: - """Check that date is YYYY-MM-DD format""" - if v: - valid_format = "%Y-%m-%d" - - try: - datetime.datetime.strptime(v, valid_format).replace( - tzinfo=datetime.timezone.utc - ).strftime(valid_format) - except ValueError as e: - msg = "`date` must use YYYY-MM-DD format" - raise ValueError(msg) from e - return v - - -class _InformationEntity(Entity): - """InformationEntities are abstract (non-physical) entities that are about something - (i.e. they carry information about things in the real world). - """ - - id: StrictStr - type: StrictStr - specifiedBy: Method | IRI | None = Field( - None, - description="A `Method` that describes all or part of the process through which the information was generated.", - ) - contributions: list[Contribution] | None = None - isReportedIn: list[Document | IRI] | None = Field( - None, description="A document in which the information content is expressed." - ) - # recordMetadata (might be added in the future) - - -class DataItem(_InformationEntity): - """An InformationEntity representing an individual piece of data, generated/acquired - through methods which reliably produce truthful information about something. - """ - - type: Literal["DataItem"] = Field("DataItem", description="MUST be 'DataItem'.") - subtype: Coding | None = Field( - None, - description="A specific type of data the DataItem object represents (e.g. a specimen count, a patient weight, an allele frequency, a p-value, a confidence score)", - ) - value: StrictStr - unit: Coding | None = None - - -class _StatementBase(_InformationEntity): - """Base class for Statement model. Excludes fields that get extended with a - different name in child classes (subject, object, qualifiers) - """ - - predicate: StrictStr | None = Field( - None, description="The predicate of the Statement" - ) - direction: Direction = Field( - ..., description="direction of this Statement with respect to the predicate." - ) - strength: Coding | IRI | None = Field( - None, - description="The overall strength of support for the Statement based on all evidence assessed.", - ) - - -class _Statement(_StatementBase): - """A Statement (aka `Assertion`) represents a claim of purported truth as made by a - particular agent, on a particular occasion. - """ - - subject: StrictStr = Field(..., description="The subject of the Statement.") - object: StrictStr | None = Field(None, description="The object of the Statement") - qualifiers: dict | None = Field( - None, - description="Additional, optional properties that may qualify the Statement.", - ) diff --git a/src/metakb/schemas/api.py b/src/metakb/schemas/api.py index e5ce6e3b..383f21cc 100644 --- a/src/metakb/schemas/api.py +++ b/src/metakb/schemas/api.py @@ -2,10 +2,12 @@ from typing import Literal +from ga4gh.va_spec.profiles.var_study_stmt import ( + VariantTherapeuticResponseStudyStatement, +) from pydantic import BaseModel, ConfigDict, StrictStr from metakb import __version__ -from metakb.schemas.variation_statement import VariantTherapeuticResponseStudy class ServiceMeta(BaseModel): @@ -44,7 +46,7 @@ class SearchStudiesService(BaseModel): query: SearchStudiesQuery warnings: list[StrictStr] = [] study_ids: list[StrictStr] = [] - studies: list[VariantTherapeuticResponseStudy] = [] + studies: list[VariantTherapeuticResponseStudyStatement] = [] service_meta_: ServiceMeta @@ -67,5 +69,5 @@ class BatchSearchStudiesService(BaseModel): query: BatchSearchStudiesQuery warnings: list[StrictStr] = [] study_ids: list[StrictStr] = [] - studies: list[VariantTherapeuticResponseStudy] = [] + studies: list[VariantTherapeuticResponseStudyStatement] = [] service_meta_: ServiceMeta diff --git a/src/metakb/schemas/categorical_variation.py b/src/metakb/schemas/categorical_variation.py deleted file mode 100644 index ecb0dbcc..00000000 --- a/src/metakb/schemas/categorical_variation.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Define Pydantic models for GA4GH categorical variation objects. - -See the `CatVar page `_ on -the GA4GH website for more information. -""" - -from enum import Enum -from typing import Literal - -from ga4gh.core.entity_models import IRI, DomainEntity -from ga4gh.vrs import models -from pydantic import Field, RootModel, StrictStr - - -class LocationMatchCharacteristic(str, Enum): - """The characteristics of a valid match between a contextual CNV location (the - query) and the Categorical CNV location (the domain), when both query and domain are - represented on the same reference sequence. An ``exact`` match requires the location - of the query and domain to be identical. A ``subinterval`` match requires the query to - be a subinterval of the domain. A ``superinterval`` match requires the query to be a - superinterval of the domain. A ``partial`` match requires at least 1 residue of - between the query and domain. - """ - - EXACT = "exact" - PARTIAL = "partial" - SUBINTERVAL = "subinterval" - SUPERINTERVAL = "superinterval" - - -class _CategoricalVariationBase(DomainEntity): - """Base class for Categorical Variation""" - - members: list[models.Variation | IRI] | None = Field( - None, - description="A non-exhaustive list of VRS variation contexts that satisfy the constraints of this categorical variant.", - ) - - -class ProteinSequenceConsequence(_CategoricalVariationBase): - """A change that occurs in a protein sequence as a result of genomic changes. Due to - the degenerate nature of the genetic code, there are often several genomic changes - that can cause a protein sequence consequence. The protein sequence consequence, - like a ``CanonicalAllele``, is defined by an - `Allele `_ that is - representative of a collection of congruent Protein Alleles that share the same - altered codon(s). - """ - - type: Literal["ProteinSequenceConsequence"] = Field( - "ProteinSequenceConsequence", - description="MUST be 'ProteinSequenceConsequence'.", - ) - definingContext: models.Allele | IRI = Field( - ..., - description="The `VRS Allele `_ object that is congruent with (projects to the same codons) as alleles on other protein reference sequences.", - ) - - -class CanonicalAllele(_CategoricalVariationBase): - """A canonical allele is defined by an - `Allele `_ that is - representative of a collection of congruent Alleles, each of which depict the same - nucleic acid change on different underlying reference sequences. Congruent - representations of an Allele often exist across different genome assemblies and - associated cDNA transcript representations. - """ - - type: Literal["CanonicalAllele"] = Field( - "CanonicalAllele", description="MUST be 'CanonicalAllele'." - ) - definingContext: models.Allele | IRI = Field( - ..., - description="The `VRS Allele `_ object that is congruent with variants on alternate reference sequences.", - ) - - -class CategoricalCnv(_CategoricalVariationBase): - """A categorical variation domain is defined first by a sequence derived from a - canonical `Location `_ , - which is representative of a collection of congruent Locations. The change or count - of this sequence is also described, either by a numeric value (e.g. "3 or more - copies") or categorical representation (e.g. "high-level gain"). Categorical CNVs - may optionally be defined by rules specifying the location match characteristics for - member CNVs. - """ - - type: Literal["CategoricalCnv"] = Field( - "CategoricalCnv", description="MUST be 'CategoricalCnv'." - ) - location: models.Location = Field( - ..., - description="A `VRS Location `_ object that represents a sequence derived from that location, and is congruent with locations on alternate reference sequences.", - ) - locationMatchCharacteristic: LocationMatchCharacteristic | None = Field( - None, - description="The characteristics of a valid match between a contextual CNV location (the query) and the Categorical CNV location (the domain), when both query and domain are represented on the same reference sequence. An `exact` match requires the location of the query and domain to be identical. A `subinterval` match requires the query to be a subinterval of the domain. A `superinterval` match requires the query to be a superinterval of the domain. A `partial` match requires at least 1 residue of overlap between the query and domain.", - ) - copyChange: models.CopyChange | None = Field( - None, - description="A representation of the change in copies of a sequence in a system. MUST be one of 'efo:0030069' (complete genomic loss), 'efo:0020073' (high-level loss), 'efo:0030068' (low-level loss), 'efo:0030067' (loss), 'efo:0030064' (regional base ploidy), 'efo:0030070' (gain), 'efo:0030071' (low-level gain), 'efo:0030072' (high-level gain).", - ) - copies: int | models.Range | None = Field( - None, description="The integral number of copies of the subject in a system." - ) - - -class DescribedVariation(_CategoricalVariationBase): - """Some categorical variation concepts are supported by custom nomenclatures or - text-descriptive representations for which a categorical variation model does not - exist. DescribedVariation is a class that adds requirements and contextual semantics - to the ``label`` and ``description`` fields to indicate how a categorical variation - concept should be evaluated for matching variants. - """ - - type: Literal["DescribedVariation"] = Field( - "DescribedVariation", description="MUST be 'DescribedVariation'." - ) - label: StrictStr = Field( - ..., - description="A primary label for the categorical variation. This required property should provide a short and descriptive textual representation of the concept.", - ) - description: StrictStr | None = Field( - None, - description="A textual description of the domain of variation that should match the categorical variation entity.", - ) - - -class CategoricalVariation(RootModel): - """A representation of a categorically-defined domain for variation, in which - individual contextual variation instances may be members of the domain. - """ - - root: ( - CanonicalAllele - | CategoricalCnv - | DescribedVariation - | ProteinSequenceConsequence - ) = Field( - ..., - json_schema_extra={ - "description": "A representation of a categorically-defined domain for variation, in which individual contextual variation instances may be members of the domain.", - }, - discriminator="type", - ) diff --git a/src/metakb/schemas/variation_statement.py b/src/metakb/schemas/variation_statement.py deleted file mode 100644 index 088b020e..00000000 --- a/src/metakb/schemas/variation_statement.py +++ /dev/null @@ -1,187 +0,0 @@ -"""Module containing variant statement definitions""" - -from enum import Enum -from typing import Literal - -from ga4gh.core.domain_models import Condition, Gene, TherapeuticProcedure -from ga4gh.core.entity_models import IRI, Coding -from ga4gh.vrs import models -from pydantic import BaseModel, Field - -from metakb.schemas.annotation import Document, _StatementBase -from metakb.schemas.categorical_variation import CategoricalVariation - - -class Penetrance(str, Enum): - """The extent to which the variant impact is expressed by individuals carrying it as - a measure of the proportion of carriers exhibiting the condition. - """ - - HIGH = "high" - LOW = "low" - RISK_ALLELE = "risk allele" - - -class ModeOfInheritance(str, Enum): - """The pattern of inheritance expected for the pathogenic effect of this variant.""" - - AUTOSOMAL_DOMINANT = "autosomal dominant" - AUTOSOMAL_RECESSIVE = "autosomal recessive" - X_LINKED_DOMINANT = "X-linked dominant" - X_LINKED_RECESSIVE = "X-linked recessive" - MITOCHONDRIAL = "mitochondrial" - - -class VariantOncogenicityStudyPredicate(str, Enum): - """Define constraints for Variant Oncogenicity Study predicate""" - - IS_ONCOGENIC_FOR = "isOncogenicFor" - IS_PROTECTIVE_FOR = "isProtectiveFor" - IS_PREDISPOSING_FOR = "isPredisposingFor" - - -class AlleleOrigin(str, Enum): - """Whether the statement should be interpreted in the context of an inherited - (germline) variant, an acquired (somatic) mutation, or both (combined). - """ - - GERMLINE = "germline" - SOMATIC = "somatic" - COMBINED = "combined" - - -class AllelePrevalence(str, Enum): - """Whether the statement should be interpreted in the context of the variant being - rare or common. - """ - - RARE = "rare" - COMMON = "common" - - -class VariantTherapeuticResponseStudyPredicate(str, Enum): - """Predicate for Variant Therapeutic Response Study""" - - PREDICTS_SENSITIVITY_TO = "predictsSensitivityTo" - PREDICTS_RESISTANCE_TO = "predictsResistanceTo" - - -class _VariantStatement(_StatementBase): - """A `Statement` describing the impact of a variant.""" - - # extends subject - variant: models.Variation | CategoricalVariation | IRI = Field( - ..., description="A variation object that is the subject of the Statement." - ) - - -class _VariantClassification(_VariantStatement): - """A `VariantStatement` classifying the impact of a variant.""" - - classification: Coding | IRI = Field( - ..., - description="A methodological, summary classification about the impact of a variant.", - ) - - -class VariantPathogenicityQualifier(BaseModel): - """VariantPathogenicity Qualifier""" - - penetrance: Penetrance | None = Field( - None, - description="The extent to which the variant impact is expressed by individuals carrying it as a measure of the proportion of carriers exhibiting the condition.", - ) - modeOfInheritance: ModeOfInheritance | None = Field( - None, - description="The pattern of inheritance expected for the pathogenic effect of this variant.", - ) - geneContext: Gene | None = Field( - None, description="A gene context that qualifies the Statement." - ) - - -class VariantPathogenicity(_VariantClassification): - """A `VariantClassification` describing the role of a variant in causing an - inherited disorder. - """ - - type: Literal["VariantPathogenicity"] = Field( - "VariantPathogenicity", description="MUST be 'VariantPathogenicity'." - ) - # extends predicate - predicate: Literal["isCausalFor"] | None = None - # extends object - condition: Condition | IRI = Field( - ..., description="The `Condition` for which the variant impact is stated." - ) - # extends qualifiers - qualifiers: VariantPathogenicityQualifier | None = None - - -class _VariantStudySummary(_VariantStatement): - """A `Statement` summarizing evidence about the impact of a variant from one or more - studies. - """ - - # extends isReportedIn - isReportedIn: list[Document | IRI] = Field( - ..., - description="A document in which the information content is expressed.", - min_length=1, - ) - - -class _VariantOncogenicityStudyQualifier(BaseModel): - """Qualifier for Variant Oncogenicity Study""" - - alleleOrigin: AlleleOrigin | None = Field( - None, - description="Whether the statement should be interpreted in the context of an inherited (germline) variant, an acquired (somatic) mutation, or both (combined).", - ) - allelePrevalence: AllelePrevalence | None = Field( - None, - description="Whether the statement should be interpreted in the context of the variant being rare or common.", - ) - geneContext: Gene | None = Field( - None, description="A gene context that qualifies the Statement." - ) - - -class VariantOncogenicityStudy(_VariantStudySummary): - """A study summarization supporting or refuting the effect of variation on - oncogenesis of a tumor type. - """ - - type: Literal["VariantOncogenicity"] = "VariantOncogenicity" - # extends predicate - predicate: VariantOncogenicityStudyPredicate - # extends object - tumorType: Condition | IRI = Field( - ..., description="The tumor type for which the variant impact is evaluated." - ) - # extends qualifiers - qualifiers: _VariantOncogenicityStudyQualifier | None = None - - -class VariantTherapeuticResponseStudy(_VariantStudySummary): - """A study summarization describing the role of a variation in modulating the - response of a neoplasm to drug administration or other therapeutic procedure. - """ - - type: Literal["VariantTherapeuticResponseStudy"] = Field( - "VariantTherapeuticResponseStudy", - description="MUST be 'VariantTherapeuticResponseStudy'.", - ) - # extends predicate - predicate: VariantTherapeuticResponseStudyPredicate - # extends object - therapeutic: TherapeuticProcedure | IRI = Field( - ..., - description="A drug administration or other therapeutic procedure that the neoplasm is intended to respond to.", - ) - tumorType: Condition | IRI = Field( - ..., - description="The tumor type context in which the variant impact is evaluated.", - ) - # extends qualifiers - qualifiers: _VariantOncogenicityStudyQualifier | None = None diff --git a/src/metakb/transformers/base.py b/src/metakb/transformers/base.py index 90c2ceb1..a583dc37 100644 --- a/src/metakb/transformers/base.py +++ b/src/metakb/transformers/base.py @@ -14,6 +14,7 @@ from disease.schemas import ( NormalizationService as NormalizedDisease, ) +from ga4gh.cat_vrs.core_models import CategoricalVariant from ga4gh.core import sha512t24u from ga4gh.core.domain_models import ( CombinationTherapy, @@ -22,25 +23,34 @@ TherapeuticAgent, TherapeuticSubstituteGroup, ) -from ga4gh.core.entity_models import Coding, Extension +from ga4gh.core.entity_models import Coding, Document, Extension, Method +from ga4gh.va_spec.profiles.var_study_stmt import ( + VariantTherapeuticResponseStudyStatement, +) from ga4gh.vrs.models import Allele +from gene.schemas import NormalizeService as NormalizedGene from pydantic import BaseModel, StrictStr, ValidationError from therapy.schemas import NormalizationService as NormalizedTherapy from metakb import APP_ROOT, DATE_FMT from metakb.harvesters.base import _HarvestedData -from metakb.normalizers import ViccNormalizers -from metakb.schemas.annotation import Document, Method -from metakb.schemas.app import SourceName -from metakb.schemas.categorical_variation import ( - ProteinSequenceConsequence, -) -from metakb.schemas.variation_statement import ( - VariantTherapeuticResponseStudy, +from metakb.normalizers import ( + ViccDiseaseNormalizerData, + ViccNormalizerData, + ViccNormalizerDataExtension, + ViccNormalizers, ) +from metakb.schemas.app import SourceName logger = logging.getLogger(__name__) +# Normalizer response type to attribute name +NORMALIZER_INSTANCE_TO_ATTR = { + NormalizedDisease: "disease", + NormalizedTherapy: "therapeutic_agent", + NormalizedGene: "gene", +} + class EcoLevel(str, Enum): """Define constraints for Evidence Ontology levels""" @@ -99,8 +109,8 @@ class ViccConceptVocab(BaseModel): class TransformedData(BaseModel): """Define model for transformed data""" - studies: list[VariantTherapeuticResponseStudy] = [] - categorical_variations: list[ProteinSequenceConsequence] = [] + studies: list[VariantTherapeuticResponseStudyStatement] = [] + categorical_variants: list[CategoricalVariant] = [] variations: list[Allele] = [] genes: list[Gene] = [] therapeutic_procedures: list[ @@ -118,22 +128,26 @@ class Transformer(ABC): Method( id=MethodId.CIVIC_EID_SOP, label="CIViC Curation SOP (2019)", - isReportedIn=Document( - label="Danos et al., 2019, Genome Med.", - title="Standard operating procedure for curation and clinical interpretation of variants in cancer", - doi="10.1186/s13073-019-0687-x", - pmid=31779674, - ), + reportedIn=[ + Document( + label="Danos et al., 2019, Genome Med.", + title="Standard operating procedure for curation and clinical interpretation of variants in cancer", + doi="10.1186/s13073-019-0687-x", + pmid=31779674, + ) + ], ), Method( id=MethodId.MOA_ASSERTION_BIORXIV, label="MOAlmanac (2021)", - isReportedIn=Document( - label="Reardon, B., Moore, N.D., Moore, N.S. et al.", - title="Integrating molecular profiles into clinical frameworks through the Molecular Oncology Almanac to prospectively guide precision oncology", - doi="10.1038/s43018-021-00243-3", - pmid=35121878, - ), + reportedIn=[ + Document( + label="Reardon, B., Moore, N.D., Moore, N.S. et al.", + title="Integrating molecular profiles into clinical frameworks through the Molecular Oncology Almanac to prospectively guide precision oncology", + doi="10.1038/s43018-021-00243-3", + pmid=35121878, + ) + ], ), ] methods_mapping: ClassVar[dict[MethodId, Method]] = {m.id: m for m in _methods} @@ -466,50 +480,31 @@ def _add_therapeutic_procedure( return tp @staticmethod - def _get_therapy_normalizer_ext_data( - normalized_therapeutic_id: str, therapy_norm_resp: NormalizedTherapy - ) -> Extension: - """Create extension containing relevant therapy-normalizer data - - :param normalized_therapeutic_id: Concept ID from therapy-normalizer - :param therapy_norm_resp: Matched response from therapy-normalizer - :return: Extension containing therapy-normalizer data. Additional information, - such as the label, is provided for VarCat. + def _get_vicc_normalizer_extension( + normalized_id: str, + normalizer_resp: NormalizedDisease | NormalizedTherapy | NormalizedGene, + ) -> ViccNormalizerDataExtension: + """Get VICC Normalizer extension data + + :param normalized_id: Normalized ID from VICC normalizer + :param normalizer_resp: Response from VICC normalizer + :return: VICC Normalizer extension data """ - return Extension( - name="therapy_normalizer_data", - value={ - "normalized_id": normalized_therapeutic_id, - "label": therapy_norm_resp.therapeutic_agent.label, - }, - ) - - @staticmethod - def _get_disease_normalizer_ext_data( - normalized_disease_id: str, disease_norm_resp: NormalizedDisease - ) -> Extension: - """Create extension containing relevant disease-normalizer data - - :param normalized_disease_id: Concept ID from disease-normalizer - :param disease_norm_resp: Matched response from disease-normalizer - :return: Extension containing disease-normalizer data. Additional information, - such as the label and mondo_id, is provided for VarCat. - """ - mappings = disease_norm_resp.disease.mappings or [] - mondo_id = None - for mapping in mappings: - if mapping.coding.system == DiseaseNamespacePrefix.MONDO.value: - mondo_id = mapping.coding.code - break - - return Extension( - name="disease_normalizer_data", - value={ - "normalized_id": normalized_disease_id, - "label": disease_norm_resp.disease.label, - "mondo_id": mondo_id, - }, - ) + attr_name = NORMALIZER_INSTANCE_TO_ATTR[type(normalizer_resp)] + normalizer_resp_obj = getattr(normalizer_resp, attr_name) + + params = {"id": normalized_id, "label": normalizer_resp_obj.label} + + if isinstance(normalizer_resp, NormalizedDisease): + mappings = normalizer_resp_obj.mappings or [] + for mapping in mappings: + if mapping.coding.system == DiseaseNamespacePrefix.MONDO.value: + params["mondo_id"] = mapping.coding.code.root + break + ext_val = ViccDiseaseNormalizerData(**params) + else: + ext_val = ViccNormalizerData(**params) + return ViccNormalizerDataExtension(value=ext_val.model_dump()) def create_json(self, cdm_filepath: Path | None = None) -> None: """Create a composite JSON for transformed data. diff --git a/src/metakb/transformers/civic.py b/src/metakb/transformers/civic.py index 547f923d..d7d25360 100644 --- a/src/metakb/transformers/civic.py +++ b/src/metakb/transformers/civic.py @@ -5,26 +5,33 @@ from enum import Enum from pathlib import Path +from ga4gh.cat_vrs.core_models import CategoricalVariant, DefiningContextConstraint from ga4gh.core.domain_models import ( Disease, Gene, TherapeuticAgent, TherapeuticSubstituteGroup, ) -from ga4gh.core.entity_models import Coding, ConceptMapping, Extension, Relation, Syntax -from ga4gh.vrs.models import Expression, Variation +from ga4gh.core.entity_models import ( + Coding, + ConceptMapping, + Direction, + Document, + Extension, + Relation, +) +from ga4gh.va_spec.profiles.var_study_stmt import ( + AlleleOriginQualifier, + TherapeuticResponsePredicate, + VariantTherapeuticResponseStudyStatement, +) +from ga4gh.vrs.models import Expression, Syntax, Variation from pydantic import BaseModel, ValidationError from metakb import APP_ROOT from metakb.harvesters.civic import CivicHarvestedData -from metakb.normalizers import ViccNormalizers -from metakb.schemas.annotation import Direction, Document -from metakb.schemas.categorical_variation import ProteinSequenceConsequence -from metakb.schemas.variation_statement import ( - AlleleOrigin, - VariantTherapeuticResponseStudy, - VariantTherapeuticResponseStudyPredicate, - _VariantOncogenicityStudyQualifier, +from metakb.normalizers import ( + ViccNormalizers, ) from metakb.transformers.base import ( CivicEvidenceLevel, @@ -124,7 +131,7 @@ def __init__( ] self.able_to_normalize = { "variations": {}, # will store _VariationCache data - "categorical_variations": {}, + "categorical_variants": {}, "conditions": {}, "therapeutic_procedures": {}, "genes": {}, @@ -195,14 +202,14 @@ async def transform(self, harvested_data: CivicHarvestedData) -> None: self._add_genes(harvested_data.genes) # Only want to add MPs where variation-normalizer succeeded for the related - # variant. Will update `categorical_variations` + # variant. Will update `categorical_variants` able_to_normalize_vids = self.able_to_normalize["variations"].keys() mps = [ mp for mp in molecular_profiles if f"civic.vid:{mp['variant_ids'][0]}" in able_to_normalize_vids ] - self._add_protein_consequences(mps, mp_id_to_v_id_mapping) + self._add_categorical_variants(mps, mp_id_to_v_id_mapping) # Add variant therapeutic response study data. Will update `studies` self._add_variant_therapeutic_response_studies( @@ -225,7 +232,7 @@ def _add_variant_therapeutic_response_studies( for r in records: # Check cache for molecular profile, variation and gene data mp_id = f"civic.mpid:{r['molecular_profile_id']}" - mp = self.able_to_normalize["categorical_variations"].get(mp_id) + mp = self.able_to_normalize["categorical_variants"].get(mp_id) if not mp: _logger.debug("mp_id not supported: %s", mp_id) continue @@ -303,51 +310,31 @@ def _add_variant_therapeutic_response_studies( civic_gene = self.able_to_normalize["genes"].get( variation_gene_map.civic_gene_id ) - qualifiers = self._get_variant_onco_study_qualifier( - r["variant_origin"], civic_gene - ) - statement = VariantTherapeuticResponseStudy( + variant_origin = r["variant_origin"].upper() + if variant_origin == "SOMATIC": + allele_origin_qualifier = AlleleOriginQualifier.SOMATIC + elif variant_origin in {"RARE_GERMLINE", "COMMON_GERMLINE"}: + allele_origin_qualifier = AlleleOriginQualifier.GERMLINE + else: + allele_origin_qualifier = None + + statement = VariantTherapeuticResponseStudyStatement( id=r["name"].lower().replace("eid", "civic.eid:"), description=r["description"] if r["description"] else None, direction=direction, strength=strength, predicate=predicate, - variant=mp, - therapeutic=civic_therapeutic, - tumorType=civic_disease, - qualifiers=qualifiers, + subjectVariant=mp, + objectTherapeutic=civic_therapeutic, + conditionQualifier=civic_disease, + alleleOriginQualifier=allele_origin_qualifier, + geneContextQualifier=civic_gene, specifiedBy=self.processed_data.methods[0], - isReportedIn=[document], + reportedIn=[document], ) self.processed_data.studies.append(statement) - def _get_variant_onco_study_qualifier( - self, variant_origin: str, gene: Gene | None = None - ) -> _VariantOncogenicityStudyQualifier | None: - """Get Variant Oncogenicity Study Qualifier - - :param variant_origin: CIViC evidence item's variant origin - :param gene: CIViC gene data - :return: Variant Oncogenicity Study Qualifier for a Variant Therapeutic Response - Study, if allele origin or gene exists - """ - variant_origin = variant_origin.upper() - if variant_origin == "SOMATIC": - allele_origin = AlleleOrigin.SOMATIC - elif variant_origin in {"RARE_GERMLINE", "COMMON_GERMLINE"}: - allele_origin = AlleleOrigin.GERMLINE - else: - allele_origin = None - - if allele_origin or gene: - qualifier = _VariantOncogenicityStudyQualifier( - alleleOrigin=allele_origin, geneContext=gene - ) - else: - qualifier = None - return qualifier - def _get_evidence_direction(self, direction: str) -> Direction | None: """Get the normalized evidence direction @@ -358,12 +345,12 @@ def _get_evidence_direction(self, direction: str) -> Direction | None: if direction_upper == "SUPPORTS": return Direction.SUPPORTS if direction_upper == "DOES_NOT_SUPPORT": - return Direction.REFUTES - return Direction.NONE + return Direction.DISPUTES + return None def _get_predicate( self, record_type: str, clin_sig: str - ) -> VariantTherapeuticResponseStudyPredicate | None: + ) -> TherapeuticResponsePredicate | None: """Return predicate for an evidence item. :param record_type: The evidence type @@ -374,21 +361,17 @@ def _get_predicate( if record_type == "PREDICTIVE": if clin_sig == "SENSITIVITYRESPONSE": - predicate = ( - VariantTherapeuticResponseStudyPredicate.PREDICTS_SENSITIVITY_TO - ) + predicate = TherapeuticResponsePredicate.SENSITIVITY elif clin_sig == "RESISTANCE": - predicate = ( - VariantTherapeuticResponseStudyPredicate.PREDICTS_RESISTANCE_TO - ) + predicate = TherapeuticResponsePredicate.RESISTANCE return predicate - def _add_protein_consequences( + def _add_categorical_variants( self, molecular_profiles: list[dict], mp_id_to_v_id_mapping: dict ) -> None: - """Create Protein Sequence Consequence objects for all supported MP records. - Mutates instance variables ``able_to_normalize['categorical_variations']`` and - ``processed_data.categorical_variations``. + """Create Categorical Variant objects for all supported MP records. + Mutates instance variables ``able_to_normalize['categorical_variants']`` and + ``processed_data.categorical_variants``. :param molecular_profiles: List of supported Molecular Profiles in CIViC. The associated, single variant record for each MP was successfully @@ -431,18 +414,22 @@ def _add_protein_consequences( Extension(name=ext_key, value=civic_variation_data_value) ) - psc = ProteinSequenceConsequence( + cv = CategoricalVariant( id=mp_id, description=mp["description"], label=mp["name"], - definingContext=civic_variation_data.vrs_variation.root, + constraints=[ + DefiningContextConstraint( + definingContext=civic_variation_data.vrs_variation, + ) + ], alternativeLabels=list(set(aliases)) or None, mappings=civic_variation_data.mappings, extensions=extensions or None, members=civic_variation_data.members, ) - self.processed_data.categorical_variations.append(psc) - self.able_to_normalize["categorical_variations"][mp_id] = psc + self.processed_data.categorical_variants.append(cv) + self.able_to_normalize["categorical_variants"][mp_id] = cv @staticmethod def _get_variant_name(variant: dict) -> str: @@ -498,19 +485,28 @@ async def _get_variation_members(self, variant: dict) -> list[Variation] | None: :return: List containing one VRS variation record for associated genomic HGVS expression, if variation-normalizer was able to normalize """ - members = None - genomic_hgvs = ( - [expr for expr in variant["hgvs_expressions"] if "g." in expr] or [None] - )[0] - if genomic_hgvs: - vrs_genomic_variation = await self.vicc_normalizers.normalize_variation( - [genomic_hgvs] - ) + members = [] + for hgvs_expr in variant["hgvs_expressions"]: + if hgvs_expr == "N/A" or "p." in hgvs_expr: + continue - if vrs_genomic_variation: - genomic_params = vrs_genomic_variation.model_dump(exclude_none=True) - genomic_params["label"] = genomic_hgvs - members = [Variation(**genomic_params)] + if "c." in hgvs_expr: + syntax = Syntax.HGVS_C + elif "g." in hgvs_expr: + syntax = Syntax.HGVS_G + else: + _logger.debug("Syntax not recognized: %s", hgvs_expr) + continue + + vrs_variation = await self.vicc_normalizers.normalize_variation([hgvs_expr]) + + if vrs_variation: + variation_params = vrs_variation.model_dump(exclude_none=True) + variation_params["label"] = hgvs_expr + variation_params["expressions"] = [ + Expression(syntax=syntax, value=hgvs_expr) + ] + members.append(Variation(**variation_params)) return members async def _add_variations(self, variants: list[dict]) -> None: @@ -640,12 +636,10 @@ def _get_expressions(self, variant: dict) -> list[Expression]: """ expressions = [] for hgvs_expr in variant["hgvs_expressions"]: - if ":g." in hgvs_expr: - syntax = Syntax.HGVS_G - elif ":c." in hgvs_expr: - syntax = Syntax.HGVS_C - else: + if ":p." in hgvs_expr: syntax = Syntax.HGVS_P + else: + continue if hgvs_expr != "N/A": expressions.append(Expression(syntax=syntax, value=hgvs_expr)) @@ -664,7 +658,9 @@ def _add_genes(self, genes: list[dict]) -> None: ncbigene = f"ncbigene:{gene['entrez_id']}" queries = [ncbigene, gene["name"]] + gene["aliases"] - _, normalized_gene_id = self.vicc_normalizers.normalize_gene(queries) + gene_norm_resp, normalized_gene_id = self.vicc_normalizers.normalize_gene( + queries + ) if normalized_gene_id: civic_gene = Gene( @@ -682,7 +678,9 @@ def _add_genes(self, genes: list[dict]) -> None: ], alternativeLabels=gene["aliases"] if gene["aliases"] else None, extensions=[ - Extension(name="gene_normalizer_id", value=normalized_gene_id) + self._get_vicc_normalizer_extension( + normalized_gene_id, gene_norm_resp + ) ], ) self.able_to_normalize["genes"][gene_id] = civic_gene @@ -763,9 +761,9 @@ def _get_disease(self, disease: dict) -> Disease | None: label=display_name, mappings=mappings if mappings else None, extensions=[ - self._get_disease_normalizer_ext_data( + self._get_vicc_normalizer_extension( normalized_disease_id, disease_norm_resp - ), + ) ], ) @@ -861,9 +859,9 @@ def _get_therapeutic_agent(self, therapy: dict) -> TherapeuticAgent | None: ) extensions = [ - self._get_therapy_normalizer_ext_data( + self._get_vicc_normalizer_extension( normalized_therapeutic_id, therapy_norm_resp - ), + ) ] if regulatory_approval_extension: diff --git a/src/metakb/transformers/moa.py b/src/metakb/transformers/moa.py index 6a195b0a..ee64b538 100644 --- a/src/metakb/transformers/moa.py +++ b/src/metakb/transformers/moa.py @@ -5,25 +5,31 @@ from pathlib import Path from urllib.parse import quote +from ga4gh.cat_vrs.core_models import CategoricalVariant, DefiningContextConstraint from ga4gh.core import sha512t24u from ga4gh.core.domain_models import ( Disease, Gene, TherapeuticAgent, ) -from ga4gh.core.entity_models import Coding, ConceptMapping, Extension, Relation +from ga4gh.core.entity_models import ( + Coding, + ConceptMapping, + Document, + Extension, + Relation, +) +from ga4gh.va_spec.profiles.var_study_stmt import ( + AlleleOriginQualifier, + TherapeuticResponsePredicate, + VariantTherapeuticResponseStudyStatement, +) from ga4gh.vrs.models import Variation from metakb import APP_ROOT from metakb.harvesters.moa import MoaHarvestedData -from metakb.normalizers import ViccNormalizers -from metakb.schemas.annotation import Direction, Document -from metakb.schemas.categorical_variation import ProteinSequenceConsequence -from metakb.schemas.variation_statement import ( - AlleleOrigin, - VariantTherapeuticResponseStudy, - VariantTherapeuticResponseStudyPredicate, - _VariantOncogenicityStudyQualifier, +from metakb.normalizers import ( + ViccNormalizers, ) from metakb.transformers.base import ( MethodId, @@ -75,7 +81,7 @@ async def transform(self, harvested_data: MoaHarvestedData) -> None: # Add gene, variant, and source data to ``processed_data`` instance variable # (``genes``, ``variations``, and ``documents``) self._add_genes(harvested_data.genes) - await self._add_protein_consequences(harvested_data.variants) + await self._add_categorical_variants(harvested_data.variants) self._add_documents(harvested_data.sources) # Add variant therapeutic response study data. Will update `studies` @@ -106,13 +112,9 @@ async def _add_variant_therapeutic_response_studies( # Get predicate. We only support therapeutic resistance/sensitivity if record["clinical_significance"] == "resistance": - predicate = ( - VariantTherapeuticResponseStudyPredicate.PREDICTS_RESISTANCE_TO - ) + predicate = TherapeuticResponsePredicate.RESISTANCE elif record["clinical_significance"] == "sensitivity": - predicate = ( - VariantTherapeuticResponseStudyPredicate.PREDICTS_SENSITIVITY_TO - ) + predicate = TherapeuticResponsePredicate.SENSITIVITY else: logger.debug( "clinical_significance not supported: %s", @@ -190,54 +192,31 @@ async def _add_variant_therapeutic_response_studies( # Add document document = self.able_to_normalize["documents"].get(record["source_ids"]) - # Get qualifier - gene = variation_gene_map["moa_gene"] - qualifiers = self._get_variant_onco_study_qualifier( - record["variant"]["feature_type"], gene - ) + feature_type = record["variant"]["feature_type"] + if feature_type == "somatic_variant": + allele_origin_qualifier = AlleleOriginQualifier.SOMATIC + elif feature_type == "germline_variant": + allele_origin_qualifier = AlleleOriginQualifier.GERMLINE + else: + allele_origin_qualifier = None - statement = VariantTherapeuticResponseStudy( - direction=Direction.NONE, + statement = VariantTherapeuticResponseStudyStatement( id=assertion_id, description=record["description"], strength=strength, predicate=predicate, - variant=variation_gene_map["psc"], - therapeutic=moa_therapeutic, - tumorType=moa_disease, - qualifiers=qualifiers, + subjectVariant=variation_gene_map["cv"], + objectTherapeutic=moa_therapeutic, + conditionQualifier=moa_disease, + alleleOriginQualifier=allele_origin_qualifier, + geneContextQualifier=variation_gene_map["moa_gene"], specifiedBy=self.processed_data.methods[0], - isReportedIn=[document], + reportedIn=[document], ) self.processed_data.studies.append(statement) - def _get_variant_onco_study_qualifier( - self, feature_type: str, gene: Gene | None = None - ) -> _VariantOncogenicityStudyQualifier | None: - """Get Variant Oncogenicity Study Qualifier - - :param feature_type: MOA feature type - :param gene: MOA gene data - :return: Variant Oncogenicity Study Qualifier for a Variant Therapeutic Response - Study, if allele origin or gene exists - """ - if feature_type == "somatic_variant": - allele_origin = AlleleOrigin.SOMATIC - elif feature_type == "germline_variant": - allele_origin = AlleleOrigin.GERMLINE - else: - allele_origin = None - - if allele_origin or gene: - qualifier = _VariantOncogenicityStudyQualifier( - alleleOrigin=allele_origin, geneContext=gene - ) - else: - qualifier = None - return qualifier - - async def _add_protein_consequences(self, variants: list[dict]) -> None: - """Create Protein Sequence Consequence objects for all MOA variant records. + async def _add_categorical_variants(self, variants: list[dict]) -> None: + """Create Categorical Variant objects for all MOA variant records. Mutates instance variables ``able_to_normalize['variations']`` and ``processed_data.variations``, if the variation-normalizer can successfully normalize the variant @@ -343,20 +322,20 @@ async def _add_protein_consequences(self, variants: list[dict]) -> None: ) ) - psc = ProteinSequenceConsequence( + cv = CategoricalVariant( id=moa_variant_id, label=feature, - definingContext=moa_variation.root, + constraints=[DefiningContextConstraint(definingContext=moa_variation)], mappings=mappings or None, extensions=extensions, members=members, ) self.able_to_normalize["variations"][variant_id] = { - "psc": psc, + "cv": cv, "moa_gene": moa_gene, } - self.processed_data.categorical_variations.append(psc) + self.processed_data.categorical_variants.append(cv) async def _get_variation_members( self, moa_rep_coord: dict @@ -407,13 +386,17 @@ def _add_genes(self, genes: list[str]) -> None: :param genes: All genes in MOAlmanac """ for gene in genes: - _, normalized_gene_id = self.vicc_normalizers.normalize_gene([gene]) + gene_norm_resp, normalized_gene_id = self.vicc_normalizers.normalize_gene( + [gene] + ) if normalized_gene_id: moa_gene = Gene( id=f"moa.normalize.gene:{quote(gene)}", label=gene, extensions=[ - Extension(name="gene_normalizer_id", value=normalized_gene_id) + self._get_vicc_normalizer_extension( + normalized_gene_id, gene_norm_resp + ) ], ) self.able_to_normalize["genes"][quote(gene)] = moa_gene @@ -447,7 +430,7 @@ def _add_documents(self, sources: list) -> None: document = Document( id=f"moa.source:{source_id}", title=source["citation"], - url=source["url"] if source["url"] else None, + urls=[source["url"]] if source["url"] else None, pmid=source["pmid"] if source["pmid"] else None, doi=source["doi"] if source["doi"] else None, mappings=mappings, @@ -488,9 +471,9 @@ def _get_therapeutic_agent(self, therapy: dict) -> TherapeuticAgent | None: return None extensions = [ - self._get_therapy_normalizer_ext_data( + self._get_vicc_normalizer_extension( normalized_therapeutic_id, therapy_norm_resp - ), + ) ] regulatory_approval_extension = ( @@ -587,8 +570,8 @@ def _get_disease(self, disease: dict) -> dict | None: label=disease_name, mappings=mappings if mappings else None, extensions=[ - self._get_disease_normalizer_ext_data( + self._get_vicc_normalizer_extension( normalized_disease_id, disease_norm_resp - ), + ) ], ) diff --git a/tests/conftest.py b/tests/conftest.py index f8991935..7a45c111 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,14 +1,14 @@ """Module for pytest fixtures.""" -import json import logging from copy import deepcopy from pathlib import Path import pytest +from deepdiff import DeepDiff from metakb.harvesters.base import Harvester -from metakb.normalizers import ViccNormalizers +from metakb.normalizers import VICC_NORMALIZER_DATA, ViccNormalizers from metakb.query import QueryHandler TEST_DATA_DIR = Path(__file__).resolve().parents[0] / "data" @@ -64,8 +64,8 @@ def cetuximab_extensions(): """Create test fixture for cetuximab extensions""" return [ { - "name": "therapy_normalizer_data", - "value": {"normalized_id": "rxcui:318341", "label": "cetuximab"}, + "name": VICC_NORMALIZER_DATA, + "value": {"id": "rxcui:318341", "label": "cetuximab"}, }, { "name": "regulatory_approval", @@ -127,8 +127,11 @@ def encorafenib_extensions(): """Create test fixture for encorafenib extensions""" return [ { - "name": "therapy_normalizer_data", - "value": {"normalized_id": "rxcui:2049106", "label": "encorafenib"}, + "name": VICC_NORMALIZER_DATA, + "value": { + "id": "rxcui:2049106", + "label": "encorafenib", + }, }, { "name": "regulatory_approval", @@ -168,11 +171,33 @@ def civic_mpid33(civic_vid33): """Create CIViC MPID 33""" return { "id": "civic.mpid:33", - "type": "ProteinSequenceConsequence", + "type": "CategoricalVariant", "description": "EGFR L858R has long been recognized as a functionally significant mutation in cancer, and is one of the most prevalent single mutations in lung cancer. Best described in non-small cell lung cancer (NSCLC), the mutation seems to confer sensitivity to first and second generation TKI's like gefitinib and neratinib. NSCLC patients with this mutation treated with TKI's show increased overall and progression-free survival, as compared to chemotherapy alone. Third generation TKI's are currently in clinical trials that specifically focus on mutant forms of EGFR, a few of which have shown efficacy in treating patients that failed to respond to earlier generation TKI therapies.", "label": "EGFR L858R", - "definingContext": civic_vid33, + "constraints": [ + {"definingContext": civic_vid33, "type": "DefiningContextConstraint"} + ], "members": [ + { + "id": "ga4gh:VA.gV7_dnvF8SQSeUdvgDFhU65zK_csc6VE", + "type": "Allele", + "label": "NM_005228.4:c.2573T>G", + "digest": "gV7_dnvF8SQSeUdvgDFhU65zK_csc6VE", + "location": { + "id": "ga4gh:SL.LREsUiEYvOrRhwXW1rG72kXFPegvkNzI", + "type": "SequenceLocation", + "digest": "LREsUiEYvOrRhwXW1rG72kXFPegvkNzI", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.d_QsP29RWJi6bac7GOC9cJ9AO7s_HUMN", + }, + "start": 2833, + "end": 2834, + "sequence": "T", + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "G"}, + "expressions": [{"syntax": "hgvs.c", "value": "NM_005228.4:c.2573T>G"}], + }, { "id": "ga4gh:VA.pM_eD8ha-bnAu6wJOoQTtHYIvEShSN51", "label": "NC_000007.13:g.55259515T>G", @@ -191,9 +216,12 @@ def civic_mpid33(civic_vid33): "sequence": "T", }, "state": {"type": "LiteralSequenceExpression", "sequence": "G"}, - } + "expressions": [ + {"syntax": "hgvs.g", "value": "NC_000007.13:g.55259515T>G"} + ], + }, ], - "alternativeLabels": ["LEU858ARG"], + "alternativeLabels": ["LEU813ARG", "LEU858ARG", "L813R"], "mappings": [ { "coding": { @@ -268,12 +296,6 @@ def civic_mpid33(civic_vid33): } -@pytest.fixture(scope="session") -def civic_eid2997_qualifier(civic_gid19): - """Create qualifier for civic eid 2997""" - return {"alleleOrigin": "somatic", "geneContext": civic_gid19} - - @pytest.fixture(scope="session") def civic_source592(): """Create fixture for civic source 592""" @@ -291,14 +313,14 @@ def civic_eid2997_study( civic_mpid33, civic_tid146, civic_did8, - civic_eid2997_qualifier, + civic_gid19, civic_method, civic_source592, ): """Create CIVIC EID2997 Statement test fixture. Uses TherapeuticAgent.""" return { "id": "civic.eid:2997", - "type": "VariantTherapeuticResponseStudy", + "type": "VariantTherapeuticResponseStudyStatement", "description": "Afatinib, an irreversible inhibitor of the ErbB family of tyrosine kinases has been approved in the US for the first-line treatment of patients with metastatic non-small-cell lung cancer (NSCLC) who have tumours with EGFR exon 19 deletions or exon 21 (L858R) substitution mutations as detected by a US FDA-approved test", "direction": "supports", "strength": { @@ -307,12 +329,13 @@ def civic_eid2997_study( "system": "https://go.osu.edu/evidence-codes", }, "predicate": "predictsSensitivityTo", - "variant": civic_mpid33, - "therapeutic": civic_tid146, - "tumorType": civic_did8, - "qualifiers": civic_eid2997_qualifier, + "subjectVariant": civic_mpid33, + "objectTherapeutic": civic_tid146, + "conditionQualifier": civic_did8, + "alleleOriginQualifier": "somatic", + "geneContextQualifier": civic_gid19, "specifiedBy": civic_method, - "isReportedIn": [civic_source592], + "reportedIn": [civic_source592], } @@ -342,7 +365,12 @@ def civic_gid5(): "NS7", "RAFB1", ], - "extensions": [{"name": "gene_normalizer_id", "value": "hgnc:1097"}], + "extensions": [ + { + "name": VICC_NORMALIZER_DATA, + "value": {"id": "hgnc:1097", "label": "BRAF"}, + } + ], } @@ -356,6 +384,7 @@ def civic_vid12(): "digest": "j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L", "location": { "id": "ga4gh:SL.t-3DrWALhgLdXHsupI-e-M00aL3HgK3y", + "digest": "t-3DrWALhgLdXHsupI-e-M00aL3HgK3y", "type": "SequenceLocation", "sequenceReference": { "refgetAccession": "SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", @@ -368,15 +397,6 @@ def civic_vid12(): "state": {"sequence": "E", "type": "LiteralSequenceExpression"}, "expressions": [ {"syntax": "hgvs.p", "value": "NP_004324.2:p.Val600Glu"}, - {"syntax": "hgvs.c", "value": "NM_004333.4:c.1799T>A"}, - { - "syntax": "hgvs.c", - "value": "ENST00000288602.6:c.1799T>A", - }, - { - "syntax": "hgvs.g", - "value": "NC_000007.13:g.140453136A>T", - }, ], } @@ -409,14 +429,41 @@ def civic_mpid12(civic_vid12, braf_v600e_genomic): """Create test fixture for CIViC Molecular Profile ID 12""" genomic_rep = braf_v600e_genomic.copy() genomic_rep["label"] = "NC_000007.13:g.140453136A>T" + genomic_rep["expressions"] = [ + {"syntax": "hgvs.g", "value": "NC_000007.13:g.140453136A>T"} + ] return { "id": "civic.mpid:12", - "type": "ProteinSequenceConsequence", + "type": "CategoricalVariant", "description": "BRAF V600E has been shown to be recurrent in many cancer types. It is one of the most widely studied variants in cancer. This variant is correlated with poor prognosis in certain cancer types, including colorectal cancer and papillary thyroid cancer. The targeted therapeutic dabrafenib has been shown to be effective in clinical trials with an array of BRAF mutations and cancer types. Dabrafenib has also shown to be effective when combined with the MEK inhibitor trametinib in colorectal cancer and melanoma. However, in patients with TP53, CDKN2A and KRAS mutations, dabrafenib resistance has been reported. Ipilimumab, regorafenib, vemurafenib, and a number of combination therapies have been successful in treating V600E mutations. However, cetuximab and panitumumab have been largely shown to be ineffective without supplementary treatment.", "label": "BRAF V600E", - "definingContext": civic_vid12, - "members": [genomic_rep], + "constraints": [ + {"definingContext": civic_vid12, "type": "DefiningContextConstraint"} + ], + "members": [ + genomic_rep, + { + "id": "ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R", + "type": "Allele", + "label": "NM_004333.4:c.1799T>A", + "digest": "W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R", + "expressions": [{"syntax": "hgvs.c", "value": "NM_004333.4:c.1799T>A"}], + "location": { + "id": "ga4gh:SL.8HBKs9fzlT3tKWlM03REjkg_0Om6Y33U", + "type": "SequenceLocation", + "digest": "8HBKs9fzlT3tKWlM03REjkg_0Om6Y33U", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.aKMPEJgmlZXt_F6gRY5cUG3THH2n-GUa", + }, + "start": 2024, + "end": 2025, + "sequence": "T", + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "A"}, + }, + ], "alternativeLabels": ["VAL600GLU", "V640E", "VAL640GLU"], "mappings": [ { @@ -495,6 +542,7 @@ def civic_vid33(): "digest": "S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ", "location": { "id": "ga4gh:SL.v0_edynH98OIu-0QPVT5anCSOriAFSDQ", + "digest": "v0_edynH98OIu-0QPVT5anCSOriAFSDQ", "type": "SequenceLocation", "sequenceReference": { "refgetAccession": "SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", @@ -502,19 +550,11 @@ def civic_vid33(): }, "start": 857, "end": 858, + "sequence": "L", }, "state": {"sequence": "R", "type": "LiteralSequenceExpression"}, "expressions": [ {"syntax": "hgvs.p", "value": "NP_005219.2:p.Leu858Arg"}, - {"syntax": "hgvs.c", "value": "ENST00000275493.2:c.2573T>G"}, - { - "syntax": "hgvs.c", - "value": "NM_005228.4:c.2573T>G", - }, - { - "syntax": "hgvs.g", - "value": "NC_000007.13:g.55259515T>G", - }, ], } @@ -546,6 +586,12 @@ def civic_gid19(): "PIG61", "mENA", ], + "extensions": [ + { + "name": VICC_NORMALIZER_DATA, + "value": {"id": "hgnc:3236", "label": "EGFR"}, + } + ], } @@ -568,6 +614,7 @@ def civic_tid146(): "alternativeLabels": [ "BIBW2992", "BIBW 2992", + "BIBW-2992", "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide", ], "extensions": [ @@ -596,8 +643,11 @@ def civic_tid146(): }, }, { - "name": "therapy_normalizer_data", - "value": {"normalized_id": "rxcui:1430438", "label": "afatinib"}, + "name": VICC_NORMALIZER_DATA, + "value": { + "id": "rxcui:1430438", + "label": "afatinib", + }, }, ], } @@ -621,9 +671,9 @@ def civic_did8(): ], "extensions": [ { - "name": "disease_normalizer_data", + "name": VICC_NORMALIZER_DATA, "value": { - "normalized_id": "ncit:C2926", + "id": "ncit:C2926", "label": "Lung Non-Small Cell Carcinoma", "mondo_id": "0005233", }, @@ -673,8 +723,11 @@ def civic_tid28(): ], "extensions": [ { - "name": "therapy_normalizer_data", - "value": {"normalized_id": "rxcui:263034", "label": "panitumumab"}, + "name": VICC_NORMALIZER_DATA, + "value": { + "id": "rxcui:263034", + "label": "panitumumab", + }, }, { "name": "regulatory_approval", @@ -811,9 +864,9 @@ def civic_did11(): ], "extensions": [ { - "name": "disease_normalizer_data", + "name": VICC_NORMALIZER_DATA, "value": { - "normalized_id": "ncit:C4978", + "id": "ncit:C4978", "label": "Malignant Colorectal Neoplasm", "mondo_id": "0005575", }, @@ -827,21 +880,22 @@ def civic_eid816_study(civic_mpid12, civic_tsg, civic_did11, civic_gid5, civic_m """Create CIVIC EID816 study test fixture. Uses TherapeuticSubstituteGroup.""" return { "id": "civic.eid:816", - "type": "VariantTherapeuticResponseStudy", + "type": "VariantTherapeuticResponseStudyStatement", "description": "This meta-analysis of 7 randomized control trials evaluating overall survival (OS) (8 for progression free survival) could not definitely state that survival benefit of anti-EGFR monoclonal antibodies is limited to patients with wild type BRAF. In other words, the authors believe that there is insufficient data to justify the exclusion of anti-EGFR monoclonal antibody therapy for patients with mutant BRAF. In these studies, mutant BRAF specifically meant the V600E mutation.", - "direction": "refutes", + "direction": "disputes", "strength": { "code": "e000005", "label": "clinical cohort evidence", "system": "https://go.osu.edu/evidence-codes", }, "predicate": "predictsResistanceTo", - "variant": civic_mpid12, - "therapeutic": civic_tsg, - "tumorType": civic_did11, - "qualifiers": {"alleleOrigin": "somatic", "geneContext": civic_gid5}, + "subjectVariant": civic_mpid12, + "objectTherapeutic": civic_tsg, + "conditionQualifier": civic_did11, + "alleleOriginQualifier": "somatic", + "geneContextQualifier": civic_gid5, "specifiedBy": civic_method, - "isReportedIn": [ + "reportedIn": [ { "id": "civic.source:548", "label": "Rowland et al., 2015", @@ -864,7 +918,7 @@ def civic_eid9851_study( """Create CIVIC EID9851 study test fixture. Uses CombinationTherapy.""" return { "id": "civic.eid:9851", - "type": "VariantTherapeuticResponseStudy", + "type": "VariantTherapeuticResponseStudyStatement", "description": "The open-label phase 3 BEACON CRC trial included 665 patients with BRAF V600E-mutated metastatic CRC. Patients were randomly assigned in a 1:1:1 ratio to receive encorafenib, binimetinib, and cetuximab (triplet-therapy group); encorafenib and cetuximab (doublet-therapy group); or the investigators\u2019 choice of either cetuximab and irinotecan or cetuximab and FOLFIRI. The median overall survival was 8.4 months (95% CI, 7.5 to 11.0) in the doublet-therapy group and 5.4 months (95% CI, 4.8 to 6.6) in the control group, with a significantly lower risk of death compared to the control group (hazard ratio for death doublet-group vs. control, 0.60; 95% CI, 0.45 to 0.79; P<0.001). The confirmed response rate was 26% (95% CI, 18 to 35) in the triplet-therapy group, 20% in the doublet-therapy group (95% CI 13 to 29) and 2% (95% CI, 0 to 7) in the control group (doublet group vs. control P<0.001). Median PFS was 4.2 months (95% CI, 3.7 to 5.4) in the doublet-therapy group, and 1.5 months (95% CI, 1.5 to 1.7) in the control group (hazard ratio for disease progression doublet-group vs control, 0.40; 95% CI, 0.31 to 0.52, P<0.001).", "direction": "supports", "strength": { @@ -873,12 +927,13 @@ def civic_eid9851_study( "system": "https://go.osu.edu/evidence-codes", }, "predicate": "predictsSensitivityTo", - "variant": civic_mpid12, - "therapeutic": civic_ct, - "tumorType": civic_did11, - "qualifiers": {"alleleOrigin": "somatic", "geneContext": civic_gid5}, + "subjectVariant": civic_mpid12, + "objectTherapeutic": civic_ct, + "conditionQualifier": civic_did11, + "alleleOriginQualifier": "somatic", + "geneContextQualifier": civic_gid5, "specifiedBy": civic_method, - "isReportedIn": [ + "reportedIn": [ { "id": "civic.source:3025", "label": "Kopetz et al., 2019", @@ -1683,20 +1738,20 @@ def moa_aid66_study( return { "id": "moa.assertion:66", "description": "T315I mutant ABL1 in p210 BCR-ABL cells resulted in retained high levels of phosphotyrosine at increasing concentrations of inhibitor STI-571, whereas wildtype appropriately received inhibition.", - "direction": "none", "strength": { "code": "e000009", "label": "preclinical evidence", "system": "https://go.osu.edu/evidence-codes", }, "predicate": "predictsResistanceTo", - "variant": moa_vid66, - "therapeutic": moa_imatinib, - "tumorType": moa_chronic_myelogenous_leukemia, - "qualifiers": {"alleleOrigin": "somatic", "geneContext": moa_abl1}, + "subjectVariant": moa_vid66, + "objectTherapeutic": moa_imatinib, + "conditionQualifier": moa_chronic_myelogenous_leukemia, + "alleleOriginQualifier": "somatic", + "geneContextQualifier": moa_abl1, "specifiedBy": moa_method, - "isReportedIn": [moa_source45], - "type": "VariantTherapeuticResponseStudy", + "reportedIn": [moa_source45], + "type": "VariantTherapeuticResponseStudyStatement", } @@ -1705,25 +1760,31 @@ def moa_vid66(): """Create a test fixture for MOA VID66.""" return { "id": "moa.variant:66", - "type": "ProteinSequenceConsequence", + "type": "CategoricalVariant", "label": "ABL1 p.T315I (Missense)", - "definingContext": { - "id": "ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ", - "digest": "D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ", - "type": "Allele", - "location": { - "id": "ga4gh:SL.jGElwyBPYNWI-BkFFHKfgLJynt9zuNPs", - "digest": "jGElwyBPYNWI-BkFFHKfgLJynt9zuNPs", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.dmFigTG-0fY6I54swb7PoDuxCeT6O3Wg", + "constraints": [ + { + "definingContext": { + "id": "ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ", + "digest": "D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ", + "type": "Allele", + "location": { + "id": "ga4gh:SL.jGElwyBPYNWI-BkFFHKfgLJynt9zuNPs", + "digest": "jGElwyBPYNWI-BkFFHKfgLJynt9zuNPs", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.dmFigTG-0fY6I54swb7PoDuxCeT6O3Wg", + }, + "start": 314, + "end": 315, + "sequence": "T", + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "I"}, }, - "start": 314, - "end": 315, - }, - "state": {"type": "LiteralSequenceExpression", "sequence": "I"}, - }, + "type": "DefiningContextConstraint", + } + ], "members": [ { "id": "ga4gh:VA.HUJOQCml0LngKmUf5IJIYQk9CfKmagbf", @@ -1786,7 +1847,12 @@ def moa_abl1(): "id": "moa.normalize.gene:ABL1", "type": "Gene", "label": "ABL1", - "extensions": [{"name": "gene_normalizer_id", "value": "hgnc:76"}], + "extensions": [ + { + "name": VICC_NORMALIZER_DATA, + "value": {"id": "hgnc:76", "label": "ABL1"}, + } + ], } @@ -1889,8 +1955,11 @@ def moa_imatinib(): }, }, { - "name": "therapy_normalizer_data", - "value": {"normalized_id": "rxcui:282388", "label": "imatinib"}, + "name": VICC_NORMALIZER_DATA, + "value": { + "id": "rxcui:282388", + "label": "imatinib", + }, }, ], } @@ -1905,9 +1974,9 @@ def moa_chronic_myelogenous_leukemia(): "label": "Chronic Myelogenous Leukemia", "extensions": [ { - "name": "disease_normalizer_data", + "name": VICC_NORMALIZER_DATA, "value": { - "normalized_id": "ncit:C3174", + "id": "ncit:C3174", "label": "Chronic Myelogenous Leukemia, BCR-ABL1 Positive", "mondo_id": "0011996", }, @@ -1932,12 +2001,15 @@ def civic_method(): return { "id": "civic.method:2019", "label": "CIViC Curation SOP (2019)", - "isReportedIn": { - "label": "Danos et al., 2019, Genome Med.", - "title": "Standard operating procedure for curation and clinical interpretation of variants in cancer", - "doi": "10.1186/s13073-019-0687-x", - "pmid": 31779674, - }, + "reportedIn": [ + { + "label": "Danos et al., 2019, Genome Med.", + "title": "Standard operating procedure for curation and clinical interpretation of variants in cancer", + "doi": "10.1186/s13073-019-0687-x", + "pmid": 31779674, + "type": "Document", + } + ], "type": "Method", } @@ -1948,12 +2020,15 @@ def moa_method(): return { "id": "moa.method:2021", "label": "MOAlmanac (2021)", - "isReportedIn": { - "label": "Reardon, B., Moore, N.D., Moore, N.S. et al.", - "title": "Integrating molecular profiles into clinical frameworks through the Molecular Oncology Almanac to prospectively guide precision oncology", - "doi": "10.1038/s43018-021-00243-3", - "pmid": 35121878, - }, + "reportedIn": [ + { + "label": "Reardon, B., Moore, N.D., Moore, N.S. et al.", + "title": "Integrating molecular profiles into clinical frameworks through the Molecular Oncology Almanac to prospectively guide precision oncology", + "doi": "10.1038/s43018-021-00243-3", + "pmid": 35121878, + "type": "Document", + } + ], "type": "Method", } @@ -1998,71 +2073,12 @@ def moa_source45(): "extensions": [{"name": "source_type", "value": "Journal"}], "type": "Document", "title": "Gorre, Mercedes E., et al. Clinical resistance to STI-571 cancer therapy caused by BCR-ABL gene mutation or amplification. Science 293.5531 (2001): 876-880.", - "url": "https://doi.org/10.1126/science.1062538", + "urls": ["https://doi.org/10.1126/science.1062538"], "doi": "10.1126/science.1062538", "pmid": 11423618, } -def _dict_check(expected_d: dict, actual_d: dict, is_cdm: bool = False) -> None: - """Make dictionary assertion checks. Check that actual matches expected data. - - :param expected_d: Expected dictionary - :param actual_d: Actual dictionary - :param is_cdm: Whether checks are for transformers (CDM) or query handler. - CDM have extra fields that are not exposed to the query handler - """ - for k, v in expected_d.items(): - if isinstance(v, dict): - _dict_check(v, actual_d[k], is_cdm=is_cdm) - elif isinstance(v, list): - actual_l = [json.dumps(v, sort_keys=True) for v in actual_d[k]] - if is_cdm: - expected_l = [json.dumps(v, sort_keys=True) for v in expected_d[k]] - else: - expected_l = [] - for v in expected_d[k]: - if isinstance(v, dict): - if v.get("name") in { - "therapy_normalizer_data", - "disease_normalizer_data", - }: - updated_ext = v.copy() - normalizer_data_type = v["name"].split("_normalizer_data")[ - 0 - ] - updated_ext["name"] = ( - f"{normalizer_data_type}_normalizer_id" - ) - updated_ext["value"] = v["value"]["normalized_id"] - expected_l.append(json.dumps(updated_ext, sort_keys=True)) - continue - new_extensions = [] - extensions = v.get("extensions") or [] - for ext in extensions: - if ext.get("name") in { - "therapy_normalizer_data", - "disease_normalizer_data", - }: - normalizer_data_type = ext["name"].split( - "_normalizer_data" - )[0] - new_extensions.append( - { - "name": f"{normalizer_data_type}_normalizer_id", - "value": ext["value"]["normalized_id"], - } - ) - else: - new_extensions.append(ext) - if extensions: - v["extensions"] = new_extensions - expected_l.append(json.dumps(v, sort_keys=True)) - assert set(actual_l) == set(expected_l), k - else: - assert actual_d[k] == expected_d[k], k - - @pytest.fixture(scope="session") def assertion_checks(): """Check that actual data matches expected data @@ -2080,9 +2096,10 @@ def _check(actual_data: list, test_data: list, is_cdm: bool = False) -> None: for actual in actual_data: if actual["id"] == expected["id"]: found_match = True - assert actual.keys() == expected.keys() + assert actual.keys() == expected.keys(), expected["id"] expected_copy = deepcopy(expected) - _dict_check(expected_copy, actual, is_cdm=is_cdm) + diff = DeepDiff(actual, expected_copy, ignore_order=True) + assert diff == {}, expected["id"] continue assert found_match, f"Did not find {expected['id']} in response" diff --git a/tests/data/transformers/therapeutic/civic_harvester.json b/tests/data/transformers/therapeutic/civic_harvester.json index a5a694cc..48b4a7f3 100644 --- a/tests/data/transformers/therapeutic/civic_harvester.json +++ b/tests/data/transformers/therapeutic/civic_harvester.json @@ -26,7 +26,8 @@ "aliases": [ "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide", "BIBW 2992", - "BIBW2992" + "BIBW2992", + "BIBW-2992" ], "type": "therapie" } @@ -270,7 +271,8 @@ "aliases": [ "(2e)-N-(4-(3-Chloro-4-Fluoroanilino)-7-(((3s)-Oxolan-3-yl)Oxy)Quinoxazolin-6-yl)-4-(Dimethylamino)But-2-Enamide", "BIBW 2992", - "BIBW2992" + "BIBW2992", + "BIBW-2992" ], "type": "therapie" } @@ -488,7 +490,9 @@ ], "variant_aliases": [ "LEU858ARG", - "RS121434568" + "RS121434568", + "LEU813ARG", + "L813R" ] }, { @@ -556,7 +560,9 @@ "sources": [], "aliases": [ "LEU858ARG", - "RS121434568" + "RS121434568", + "LEU813ARG", + "L813R" ] }, { diff --git a/tests/unit/database/test_database.py b/tests/unit/database/test_database.py index dc1206c9..85a9c0db 100644 --- a/tests/unit/database/test_database.py +++ b/tests/unit/database/test_database.py @@ -7,6 +7,7 @@ from neo4j.graph import Node from metakb.database import get_driver +from metakb.normalizers import VICC_NORMALIZER_DATA, ViccDiseaseNormalizerData from metakb.schemas.app import SourceName @@ -94,7 +95,7 @@ def check_study_relation(driver: Driver): def _check_function(value_label: str): query = f""" MATCH (d:{value_label}) - OPTIONAL MATCH (d)<-[:HAS_{value_label.upper()}]-(s:Study) + OPTIONAL MATCH (d)<-[:HAS_{value_label.upper()}]-(s:Statement) WITH d, COUNT(s) as s_count WHERE s_count < 1 RETURN COUNT(s_count) @@ -153,11 +154,15 @@ def _check_function( ): checked = set() for ext in fixture_extensions: - if ext["name"].endswith("_normalizer_data"): - obj_type = ext["name"].split("_normalizer_data")[0] - ext_name = f"{obj_type}_normalizer_id" - assert node[ext_name] == ext["value"]["normalized_id"] - checked.add(ext_name) + if ext["name"] == VICC_NORMALIZER_DATA: + for normalized_field in ViccDiseaseNormalizerData.model_fields: + normalized_val = ext["value"].get(normalized_field) + if normalized_val is None: + continue + + ext_name = f"normalizer_{normalized_field}" + assert node[ext_name] == ext["value"][normalized_field] + checked.add(ext_name) elif ext["name"] in ext_names: try: assert json.loads(node[ext["name"]]) == ext["value"] @@ -207,17 +212,23 @@ def test_gene_rules( """Verify property and relationship rules for Gene nodes.""" check_unique_property("Gene", "id") check_relation_count( - "Gene", "Study", "HAS_GENE_CONTEXT", direction="in", min_rels=1, max_rels=None + "Gene", + "Statement", + "HAS_GENE_CONTEXT", + direction="in", + min_rels=1, + max_rels=None, ) expected_labels = [{"Gene"}] check_node_labels("Gene", expected_labels, 1) gene = get_node_by_id(civic_gid5["id"]) - extension_names = {"gene_normalizer_id"} + extension_names = {"normalizer_label", "normalizer_id"} check_extension_props(gene, civic_gid5["extensions"], extension_names) expected_keys = { - "gene_normalizer_id", + "normalizer_id", + "normalizer_label", "label", "id", "description", @@ -241,7 +252,7 @@ def test_variation_rules( # members dont have defining context check_relation_count( "Variation", - "CategoricalVariation", + "CategoricalVariant", "HAS_DEFINING_CONTEXT", direction="in", min_rels=0, @@ -249,25 +260,23 @@ def test_variation_rules( ) check_relation_count( "Variation", - "CategoricalVariation", + "CategoricalVariant", "HAS_MEMBERS", min_rels=0, max_rels=None, direction="in", ) - expected_labels = [{"Variation", "Allele"}] - check_node_labels("Variation", expected_labels, 1) + expected_labels = [{"Variation", "Allele"}, {"Variation", "CategoricalVariant"}] + check_node_labels("Variation", expected_labels, 2) - # all Alleles are Variations and all Variations are Alleles + # all Variations are either Alleles or CategoricalVariants, and all Alleles and CategoricalVariants are Variation label_query = """ - MATCH (v:Variation) - WHERE NOT (v:Allele) - RETURN COUNT(v) - UNION - MATCH (v:Allele) - WHERE NOT (v:Variation) - RETURN COUNT(v) + MATCH (v) + RETURN + SUM(CASE WHEN (v:Variation AND NOT (v:Allele OR v:CategoricalVariant)) THEN 1 ELSE 0 END) + + SUM(CASE WHEN (v:Allele AND NOT v:Variation) THEN 1 ELSE 0 END) + + SUM(CASE WHEN (v:CategoricalVariant AND NOT v:Variation) THEN 1 ELSE 0 END) """ with driver.session() as s: record = s.run(label_query).single() @@ -280,8 +289,6 @@ def test_variation_rules( "digest", "state", "expression_hgvs_p", - "expression_hgvs_c", - "expression_hgvs_g", "type", } @@ -301,28 +308,26 @@ def test_variation_rules( expected_g.append(val) assert v["expression_hgvs_p"] == expected_p - assert set(v["expression_hgvs_c"]) == set(expected_c) - assert v["expression_hgvs_g"] == expected_g -def test_categorical_variation_rules( +def test_categorical_variant_rules( check_unique_property, check_relation_count, check_node_labels, get_node_by_id, civic_mpid12, ): - """Verify property and relationship rules for Categorical Variation nodes.""" - check_unique_property("CategoricalVariation", "id") + """Verify property and relationship rules for Categorical Variant nodes.""" + check_unique_property("CategoricalVariant", "id") check_relation_count( - "CategoricalVariation", "Variation", "HAS_DEFINING_CONTEXT", max_rels=1 + "CategoricalVariant", "Variation", "HAS_DEFINING_CONTEXT", max_rels=1 ) check_relation_count( - "CategoricalVariation", "Variation", "HAS_MEMBERS", min_rels=0, max_rels=None + "CategoricalVariant", "Variation", "HAS_MEMBERS", min_rels=0, max_rels=None ) - expected_node_labels = [{"CategoricalVariation", "ProteinSequenceConsequence"}] - check_node_labels("CategoricalVariation", expected_node_labels, 1) + expected_node_labels = [{"CategoricalVariant", "Variation"}] + check_node_labels("CategoricalVariant", expected_node_labels, 1) cv = get_node_by_id(civic_mpid12["id"]) assert set(cv.keys()) == { @@ -414,7 +419,7 @@ def test_therapeutic_procedure_rules( # through CombinationTherapy and TherapeuticSubstituteGroup check_relation_count( "TherapeuticProcedure", - "Study", + "Statement", "HAS_THERAPEUTIC", min_rels=0, max_rels=None, @@ -424,7 +429,11 @@ def test_therapeutic_procedure_rules( "CombinationTherapy", "TherapeuticAgent", "HAS_COMPONENTS", max_rels=None ) check_relation_count( - "CombinationTherapy", "Study", "HAS_THERAPEUTIC", max_rels=None, direction="in" + "CombinationTherapy", + "Statement", + "HAS_THERAPEUTIC", + max_rels=None, + direction="in", ) check_relation_count( "TherapeuticSubstituteGroup", @@ -434,7 +443,7 @@ def test_therapeutic_procedure_rules( ) check_relation_count( "TherapeuticSubstituteGroup", - "Study", + "Statement", "HAS_THERAPEUTIC", max_rels=None, direction="in", @@ -449,13 +458,18 @@ def test_therapeutic_procedure_rules( # Test TherapeuticAgent ta = get_node_by_id(civic_tid146["id"]) - extension_names = {"therapy_normalizer_id", "regulatory_approval"} + extension_names = { + "normalizer_id", + "normalizer_label", + "regulatory_approval", + } check_extension_props(ta, civic_tid146["extensions"], extension_names) expected_keys = { "id", "label", "alternativeLabels", - "therapy_normalizer_id", + "normalizer_id", + "normalizer_label", "regulatory_approval", "mappings", "type", @@ -489,16 +503,28 @@ def test_condition_rules( """Verify property and relationship rules for condition nodes.""" check_unique_property("Condition", "id") check_relation_count( - "Condition", "Study", "HAS_TUMOR_TYPE", max_rels=None, direction="in" + "Condition", "Statement", "HAS_TUMOR_TYPE", max_rels=None, direction="in" ) expected_node_labels = [{"Disease", "Condition"}] check_node_labels("Condition", expected_node_labels, 1) disease = get_node_by_id(civic_did8["id"]) - extension_names = {"disease_normalizer_id"} + extension_names = { + "normalizer_id", + "normalizer_label", + "normalizer_mondo_id", + } check_extension_props(disease, civic_did8["extensions"], extension_names) - expected_keys = {"id", "label", "mappings", "disease_normalizer_id", "type"} + expected_keys = { + "id", + "label", + "mappings", + "normalizer_id", + "normalizer_label", + "normalizer_mondo_id", + "type", + } check_node_props(disease, civic_did8, expected_keys, extension_names) @@ -511,21 +537,23 @@ def test_study_rules( civic_eid2997_study, check_node_props, ): - """Verify property and relationship rules for Study nodes.""" - check_unique_property("Study", "id") + """Verify property and relationship rules for Statement nodes.""" + check_unique_property("Statement", "id") - check_relation_count("Study", "CategoricalVariation", "HAS_VARIANT") - check_relation_count("Study", "Condition", "HAS_TUMOR_TYPE") - check_relation_count("Study", "TherapeuticProcedure", "HAS_THERAPEUTIC") - check_relation_count("Study", "Coding", "HAS_STRENGTH") - check_relation_count("Study", "Method", "IS_SPECIFIED_BY", max_rels=None) - check_relation_count("Study", "Gene", "HAS_GENE_CONTEXT", max_rels=None) + check_relation_count("Statement", "CategoricalVariant", "HAS_VARIANT") + check_relation_count("Statement", "Condition", "HAS_TUMOR_TYPE") + check_relation_count("Statement", "TherapeuticProcedure", "HAS_THERAPEUTIC") + check_relation_count("Statement", "Coding", "HAS_STRENGTH") + check_relation_count("Statement", "Method", "IS_SPECIFIED_BY", max_rels=None) + check_relation_count("Statement", "Gene", "HAS_GENE_CONTEXT", max_rels=None) - expected_node_labels = [{"Study", "VariantTherapeuticResponseStudy"}] - check_node_labels("Study", expected_node_labels, 1) + expected_node_labels = [ + {"Statement", "StudyStatement", "VariantTherapeuticResponseStudyStatement"} + ] + check_node_labels("Statement", expected_node_labels, 1) cite_query = """ - MATCH (s:Study) + MATCH (s:Statement) OPTIONAL MATCH (s)-[:IS_REPORTED_IN]->(d:Document) WITH s, COUNT(d) as d_count WHERE d_count < 1 @@ -541,12 +569,12 @@ def test_study_rules( "description", "direction", "predicate", - "alleleOrigin", + "alleleOriginQualifier", "type", } civic_eid2997_study_cp = civic_eid2997_study.copy() - civic_eid2997_study_cp["alleleOrigin"] = civic_eid2997_study_cp["qualifiers"][ - "alleleOrigin" + civic_eid2997_study_cp["alleleOriginQualifier"] = civic_eid2997_study_cp[ + "alleleOriginQualifier" ] check_node_props(study, civic_eid2997_study_cp, expected_keys) @@ -564,7 +592,12 @@ def test_document_rules( """Verify property and relationship rules for Document nodes.""" check_unique_property("Document", "id") check_relation_count( - "Document", "Study", "IS_REPORTED_IN", min_rels=0, max_rels=None, direction="in" + "Document", + "Statement", + "IS_REPORTED_IN", + min_rels=0, + max_rels=None, + direction="in", ) expected_labels = [{"Document"}] @@ -573,7 +606,7 @@ def test_document_rules( # PMIDs: 31779674 and 35121878 do not have this relationship is_reported_in_query = """ MATCH (s:Document) - OPTIONAL MATCH (s)<-[:IS_REPORTED_IN]-(d:Study) + OPTIONAL MATCH (s)<-[:IS_REPORTED_IN]-(d:Statement) WITH s, COUNT(d) as d_count WHERE (d_count < 1) AND (s.pmid <> 31779674) AND (s.pmid <> 35121878) RETURN COUNT(s) @@ -594,7 +627,7 @@ def test_document_rules( doc = get_node_by_id(moa_source45["id"]) extension_names = {"source_type"} check_extension_props(doc, moa_source45["extensions"], extension_names) - expected_keys = {"id", "title", "doi", "source_type", "url", "pmid"} + expected_keys = {"id", "title", "doi", "source_type", "urls", "pmid"} check_node_props(doc, moa_source45, expected_keys, extension_names) @@ -609,7 +642,7 @@ def test_method_rules( """Verify property and relationship rules for Method nodes.""" check_unique_property("Method", "id") check_relation_count( - "Method", "Study", "IS_SPECIFIED_BY", max_rels=None, direction="in" + "Method", "Statement", "IS_SPECIFIED_BY", max_rels=None, direction="in" ) expected_node_labels = [{"Method"}] @@ -626,7 +659,7 @@ def test_no_lost_nodes(driver: Driver): labels_query = """ MATCH (n) WHERE size(labels(n)) = 0 - AND NOT (n)<-[:IS_REPORTED_IN]-(:Study) + AND NOT (n)<-[:IS_REPORTED_IN]-(:Statement) RETURN COUNT(n) """ with driver.session() as s: diff --git a/tests/unit/search/test_search_studies.py b/tests/unit/search/test_search_studies.py index 78160e42..7e1694eb 100644 --- a/tests/unit/search/test_search_studies.py +++ b/tests/unit/search/test_search_studies.py @@ -3,6 +3,7 @@ import pytest from ga4gh.core.entity_models import Extension +from metakb.normalizers import VICC_NORMALIZER_DATA from metakb.query import QueryHandler from .utils import assert_no_match, find_and_check_study @@ -16,8 +17,8 @@ def _get_normalizer_id(extensions: list[Extension]) -> str | None: """ normalizer_id = None for ext in extensions: - if ext.name.endswith("_normalizer_id"): - normalizer_id = ext.value + if ext.name == VICC_NORMALIZER_DATA: + normalizer_id = ext.value["id"] break return normalizer_id @@ -172,7 +173,7 @@ async def test_general_search_studies(query_handler): assert_general_search_studies(resp) expected_therapy_id = "rxcui:318341" for study in resp.studies: - tp = study.therapeutic.root + tp = study.objectTherapeutic.root if tp.type == "TherapeuticAgent": assert _get_normalizer_id(tp.extensions) == expected_therapy_id else: @@ -202,12 +203,17 @@ async def test_general_search_studies(query_handler): assert_general_search_studies(resp) for study in resp.studies: - assert study.variant.root.definingContext.id == expected_variation_id assert ( - _get_normalizer_id(study.therapeutic.root.extensions) == expected_therapy_id + study.subjectVariant.constraints[0].root.definingContext.root.id + == expected_variation_id ) assert ( - _get_normalizer_id(study.tumorType.root.extensions) == expected_disease_id + _get_normalizer_id(study.objectTherapeutic.root.extensions) + == expected_therapy_id + ) + assert ( + _get_normalizer_id(study.conditionQualifier.root.extensions) + == expected_disease_id ) diff --git a/tests/unit/transformers/test_moa_transformer.py b/tests/unit/transformers/test_moa_transformer.py index 5aaa617e..e86ba3c6 100644 --- a/tests/unit/transformers/test_moa_transformer.py +++ b/tests/unit/transformers/test_moa_transformer.py @@ -6,6 +6,7 @@ import pytest_asyncio from tests.conftest import TEST_TRANSFORMERS_DIR +from metakb.normalizers import VICC_NORMALIZER_DATA from metakb.transformers.moa import MoaTransformer FILENAME = "moa_cdm.json" @@ -35,24 +36,31 @@ def moa_vid145(braf_v600e_genomic): return { "id": "moa.variant:145", - "type": "ProteinSequenceConsequence", + "type": "CategoricalVariant", "label": "BRAF p.V600E (Missense)", - "definingContext": { - "id": "ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L", - "digest": "j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L", - "type": "Allele", - "location": { - "id": "ga4gh:SL.t-3DrWALhgLdXHsupI-e-M00aL3HgK3y", - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", + "constraints": [ + { + "definingContext": { + "id": "ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L", + "digest": "j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L", + "type": "Allele", + "location": { + "id": "ga4gh:SL.t-3DrWALhgLdXHsupI-e-M00aL3HgK3y", + "digest": "t-3DrWALhgLdXHsupI-e-M00aL3HgK3y", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", + }, + "start": 599, + "end": 600, + "sequence": "V", + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "E"}, }, - "start": 599, - "end": 600, - }, - "state": {"type": "LiteralSequenceExpression", "sequence": "E"}, - }, + "type": "DefiningContextConstraint", + } + ], "members": [genomic_rep], "extensions": [ { @@ -115,17 +123,16 @@ def moa_aid155_study(moa_vid145, moa_cetuximab, moa_encorafenib, moa_method): """Create MOA AID 155 study test fixture. Uses CombinationTherapy.""" return { "id": "moa.assertion:155", - "type": "VariantTherapeuticResponseStudy", + "type": "VariantTherapeuticResponseStudyStatement", "description": "The U.S. Food and Drug Administration (FDA) granted regular approval to encorafenib in combination with cetuximab for the treatment of adult patients with metastatic colorectal cancer (CRC) with BRAF V600E mutation, as detected by an FDA-approved test, after prior therapy.", - "direction": "none", "strength": { "code": "e000002", "label": "FDA recognized evidence", "system": "https://go.osu.edu/evidence-codes", }, "predicate": "predictsSensitivityTo", - "variant": moa_vid145, - "therapeutic": { + "subjectVariant": moa_vid145, + "objectTherapeutic": { "type": "CombinationTherapy", "id": "moa.ctid:ZGlEkRBR4st6Y_nijjuR1KUV7EFHIF_S", "components": [moa_cetuximab, moa_encorafenib], @@ -136,15 +143,15 @@ def moa_aid155_study(moa_vid145, moa_cetuximab, moa_encorafenib, moa_method): } ], }, - "tumorType": { + "conditionQualifier": { "id": "moa.normalize.disease.ncit:C5105", "type": "Disease", "label": "Colorectal Adenocarcinoma", "extensions": [ { - "name": "disease_normalizer_data", + "name": VICC_NORMALIZER_DATA, "value": { - "normalized_id": "ncit:C5105", + "id": "ncit:C5105", "label": "Colorectal Adenocarcinoma", "mondo_id": "0005008", }, @@ -161,28 +168,28 @@ def moa_aid155_study(moa_vid145, moa_cetuximab, moa_encorafenib, moa_method): } ], }, - "qualifiers": { - "alleleOrigin": "somatic", - "geneContext": { - "id": "moa.normalize.gene:BRAF", - "type": "Gene", - "label": "BRAF", - "extensions": [ - { - "name": "gene_normalizer_id", - "value": "hgnc:1097", - } - ], - }, + "alleleOriginQualifier": "somatic", + "geneContextQualifier": { + "id": "moa.normalize.gene:BRAF", + "type": "Gene", + "label": "BRAF", + "extensions": [ + { + "name": VICC_NORMALIZER_DATA, + "value": {"id": "hgnc:1097", "label": "BRAF"}, + } + ], }, "specifiedBy": moa_method, - "isReportedIn": [ + "reportedIn": [ { "id": "moa.source:63", "extensions": [{"name": "source_type", "value": "FDA"}], "type": "Document", "title": "Array BioPharma Inc. Braftovi (encorafenib) [package insert]. U.S. Food and Drug Administration website. www.accessdata.fda.gov/drugsatfda_docs/label/2020/210496s006lbl.pdf. Revised April 2020. Accessed October 15, 2020.", - "url": "https://www.accessdata.fda.gov/drugsatfda_docs/label/2020/210496s006lbl.pdf", + "urls": [ + "https://www.accessdata.fda.gov/drugsatfda_docs/label/2020/210496s006lbl.pdf" + ], } ], }