From 588927c85baf756c22c28346e37d72fbf3d0e62a Mon Sep 17 00:00:00 2001 From: Christopher Byrd Date: Tue, 20 Aug 2024 15:45:55 -0700 Subject: [PATCH 01/28] prevents stale user after logout #70 --- arches_lingo/src/arches_lingo/App.vue | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arches_lingo/src/arches_lingo/App.vue b/arches_lingo/src/arches_lingo/App.vue index a595a68f..262bc710 100644 --- a/arches_lingo/src/arches_lingo/App.vue +++ b/arches_lingo/src/arches_lingo/App.vue @@ -34,20 +34,13 @@ const { $gettext } = useGettext(); router.beforeEach(async (to, _from, next) => { try { - let userData = user.value; - - if (!userData || userData.username === ANONYMOUS) { - userData = await fetchUser(); - setUser(userData); - } + let userData = await fetchUser(); + setUser(userData); const requiresAuthentication = to.matched.some( (record) => record.meta.requiresAuthentication, ); - if ( - requiresAuthentication && - (!userData || userData.username === ANONYMOUS) - ) { + if (requiresAuthentication && userData.username === ANONYMOUS) { throw new Error(); } else { next(); From 7668b38d7b9b589882565f50b03938a33803ce59 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Tue, 27 Aug 2024 11:54:19 -0400 Subject: [PATCH 02/28] Set TEST_RUNNER Follow-up to faafbb4. --- tests/test_settings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_settings.py b/tests/test_settings.py index c9271fa7..a5c9a290 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -45,6 +45,7 @@ ELASTICSEARCH_PREFIX = "test" +TEST_RUNNER = "arches.test.runner.ArchesTestRunner" SILENCED_SYSTEM_CHECKS.append( "arches.W001", # Cache backend does not support rate-limiting ) From 539147e89b1d744212a6c1ba2214b66dbd36ee6b Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Tue, 20 Aug 2024 17:03:25 -0400 Subject: [PATCH 03/28] Make views a module --- arches_lingo/views/__init__.py | 2 ++ arches_lingo/views/root.py | 14 ++++++++++++++ arches_lingo/{views.py => views/trees.py} | 11 ----------- 3 files changed, 16 insertions(+), 11 deletions(-) create mode 100644 arches_lingo/views/__init__.py create mode 100644 arches_lingo/views/root.py rename arches_lingo/{views.py => views/trees.py} (94%) diff --git a/arches_lingo/views/__init__.py b/arches_lingo/views/__init__.py new file mode 100644 index 00000000..ea7dc745 --- /dev/null +++ b/arches_lingo/views/__init__.py @@ -0,0 +1,2 @@ +from .root import * +from .trees import * diff --git a/arches_lingo/views/root.py b/arches_lingo/views/root.py new file mode 100644 index 00000000..a85b60f2 --- /dev/null +++ b/arches_lingo/views/root.py @@ -0,0 +1,14 @@ +from django.shortcuts import render +from django.utils.decorators import method_decorator +from django.utils.translation import gettext_lazy as _ +from django.views.decorators.csrf import ensure_csrf_cookie + +from arches.app.views.base import BaseManagerView + + +class LingoRootView(BaseManagerView): + @method_decorator(ensure_csrf_cookie) + def get(self, request, graphid=None, resourceid=None): + context = self.get_context_data(main_script="views/root") + context["page_title"] = _("Lingo") + return render(request, "arches_lingo/root.htm", context) diff --git a/arches_lingo/views.py b/arches_lingo/views/trees.py similarity index 94% rename from arches_lingo/views.py rename to arches_lingo/views/trees.py index a5d5f534..332a9d9d 100644 --- a/arches_lingo/views.py +++ b/arches_lingo/views/trees.py @@ -3,10 +3,8 @@ from django.contrib.postgres.expressions import ArraySubquery from django.db.models import CharField, F, OuterRef, Subquery, Value from django.db.models.expressions import CombinedExpression, Func -from django.shortcuts import render from django.utils.translation import gettext_lazy as _ from django.utils.decorators import method_decorator -from django.views.decorators.csrf import ensure_csrf_cookie from django.views.generic import View from arches.app.models.models import ( @@ -17,7 +15,6 @@ ) from arches.app.utils.decorators import group_required from arches.app.utils.response import JSONResponse -from arches.app.views.base import BaseManagerView from arches_lingo.const import ( SCHEMES_GRAPH_ID, @@ -204,11 +201,3 @@ def get(self, request): } # Todo: filter by nodegroup permissions return JSONResponse(data) - - -class LingoRootView(BaseManagerView): - @method_decorator(ensure_csrf_cookie) - def get(self, request, graphid=None, resourceid=None): - context = self.get_context_data(main_script="views/root") - context["page_title"] = _("Lingo") - return render(request, "arches_lingo/root.htm", context) From e54f706e65cfef4fe6a84fac88449d09382f434f Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Tue, 20 Aug 2024 17:05:25 -0400 Subject: [PATCH 04/28] Small view cleanups --- arches_lingo/views/root.py | 2 +- arches_lingo/views/trees.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arches_lingo/views/root.py b/arches_lingo/views/root.py index a85b60f2..4ec16321 100644 --- a/arches_lingo/views/root.py +++ b/arches_lingo/views/root.py @@ -8,7 +8,7 @@ class LingoRootView(BaseManagerView): @method_decorator(ensure_csrf_cookie) - def get(self, request, graphid=None, resourceid=None): + def get(self, request, *args, **kwargs): context = self.get_context_data(main_script="views/root") context["page_title"] = _("Lingo") return render(request, "arches_lingo/root.htm", context) diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index 332a9d9d..68be76bc 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -50,6 +50,7 @@ class JsonbArrayElements(Func): ) class ConceptTreeView(View): def __init__(self): + super().__init__() self.schemes = ResourceInstance.objects.none() # Maps built during a GET call From ed16aa0eff89c41e39b180072029b0e562c0de8d Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 21 Aug 2024 10:49:34 -0400 Subject: [PATCH 05/28] Remove .all() cruft --- arches_lingo/views/trees.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index 68be76bc..959f4719 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -101,8 +101,7 @@ def labels_subquery(label_nodegroup): def language_concepts_map(self): languages = ( - Language.objects.all() - .annotate( + Language.objects.annotate( concept_value=Subquery( ConceptValue.objects.filter( valuetype="prefLabel", value=OuterRef("code") From 8d75b68e2803686020bce6d9bde0fe0f63a016f4 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 21 Aug 2024 11:48:59 -0400 Subject: [PATCH 06/28] Add cursor_tuple_fraction optimization for queryset iterators --- arches_lingo/settings.py | 4 +++- tests/test_settings.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arches_lingo/settings.py b/arches_lingo/settings.py index d21b5a2d..f06645eb 100644 --- a/arches_lingo/settings.py +++ b/arches_lingo/settings.py @@ -110,7 +110,9 @@ "ENGINE": "django.contrib.gis.db.backends.postgis", "HOST": "localhost", "NAME": "arches_lingo", - "OPTIONS": {}, + "OPTIONS": { + "options": "-c cursor_tuple_fraction=1", + }, "PASSWORD": "postgis", "PORT": "5432", "POSTGIS_TEMPLATE": "template_postgis", diff --git a/tests/test_settings.py b/tests/test_settings.py index a5c9a290..69d96579 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -21,7 +21,9 @@ "ENGINE": "django.contrib.gis.db.backends.postgis", "HOST": "localhost", "NAME": "arches_lingo", - "OPTIONS": {}, + "OPTIONS": { + "options": "-c cursor_tuple_fraction=1", + }, "PASSWORD": "postgis", "PORT": "5432", "POSTGIS_TEMPLATE": "template_postgis", From 55f6eda1f520d6c0f0e0210fc4c7f6026d8063c0 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 21 Aug 2024 18:57:13 -0400 Subject: [PATCH 07/28] Initial commit of search backend #67 --- CHANGELOG.md | 1 + arches_lingo/urls.py | 3 +- arches_lingo/views/trees.py | 101 +++++++++++++++++++++++++++++++----- 3 files changed, 92 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index badbed91..a8306ef0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add login interface [#13](https://github.com/archesproject/arches-lingo/issues/13) - Add front-end router [#11](https://github.com/archesproject/arches-lingo/issues/11) +- Add backend for search [#67](https://github.com/archesproject/arches-lingo/issues/67) ### Fixed diff --git a/arches_lingo/urls.py b/arches_lingo/urls.py index 7a211211..1e5301af 100644 --- a/arches_lingo/urls.py +++ b/arches_lingo/urls.py @@ -3,7 +3,7 @@ from django.conf.urls.i18n import i18n_patterns from django.urls import include, path -from arches_lingo.views import LingoRootView, ConceptTreeView +from arches_lingo.views import LingoRootView, ConceptTreeView, ValueSearchView urlpatterns = [ path("", LingoRootView.as_view(), name="root"), @@ -12,6 +12,7 @@ path("advanced-search", LingoRootView.as_view(), name="advanced-search"), path("schemes", LingoRootView.as_view(), name="schemes"), path("api/concept_trees", ConceptTreeView.as_view(), name="concept_trees"), + path("api/search", ValueSearchView.as_view(), name="api_search"), path("", include("arches_references.urls")), ] diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index 959f4719..f7bb1dd0 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -1,4 +1,5 @@ from collections import defaultdict +from http import HTTPStatus from django.contrib.postgres.expressions import ArraySubquery from django.db.models import CharField, F, OuterRef, Subquery, Value @@ -63,6 +64,12 @@ def __init__(self): # key=resourceid (str) val=list of label dicts self.labels: dict[str : list[dict]] = defaultdict(set) + # Maps representing a reverse (leaf-first) tree + # key=resourceid (str) val=set of concept resourceids (str) + self.broader_concepts: dict[str : set[str]] = defaultdict(set) + # key=resourceid (str) val=set of scheme resourceids (str) + self.schemes_by_top_concept: dict[str : set[str]] = defaultdict(set) + @staticmethod def human_label_type(value_id): if value_id == PREF_LABEL_VALUE_ID: @@ -122,9 +129,11 @@ def top_concepts_map(self): .values("resourceinstance_id", "top_concept_of", "labels") ) for tile in top_concept_of_tiles: - resource_id: str = str(tile["resourceinstance_id"]) - self.top_concepts[tile["top_concept_of"]].add(resource_id) - self.labels[resource_id] = tile["labels"] + scheme_id = tile["top_concept_of"] + top_concept_id = str(tile["resourceinstance_id"]) + self.top_concepts[scheme_id].add(top_concept_id) + self.schemes_by_top_concept[top_concept_id].add(scheme_id) + self.labels[top_concept_id] = tile["labels"] def narrower_concepts_map(self): broader_concept_tiles = ( @@ -134,9 +143,16 @@ def narrower_concepts_map(self): .values("resourceinstance_id", "broader_concept", "labels") ) for tile in broader_concept_tiles.iterator(): - resource_id: str = str(tile["resourceinstance_id"]) - self.narrower_concepts[tile["broader_concept"]].add(resource_id) - self.labels[resource_id] = tile["labels"] + broader_concept_id = tile["broader_concept"] + narrower_concept_id: str = str(tile["resourceinstance_id"]) + self.narrower_concepts[broader_concept_id].add(narrower_concept_id) + self.broader_concepts[narrower_concept_id].add(broader_concept_id) + self.labels[narrower_concept_id] = tile["labels"] + + def populate_schemes(self): + self.schemes = ResourceInstance.objects.filter( + graph_id=SCHEMES_GRAPH_ID + ).annotate(labels=self.labels_subquery(SCHEME_NAME_NODEGROUP)) def serialize_scheme(self, scheme: ResourceInstance): scheme_id: str = str(scheme.pk) @@ -162,8 +178,8 @@ def serialize_scheme_label(self, label_tile: dict): "value": value, } - def serialize_concept(self, conceptid: str): - return { + def serialize_concept(self, conceptid: str, *, parentage=False): + data = { "id": conceptid, "labels": [ self.serialize_concept_label(label) for label in self.labels[conceptid] @@ -173,6 +189,31 @@ def serialize_concept(self, conceptid: str): for conceptid in self.narrower_concepts[conceptid] ], } + if parentage: + # Choose any reverse path back to the scheme (currently indeterminate). + path = self.add_broader_concept_recursive([], conceptid) + scheme_id, concept_ids = path[0], path[1:] + schemes = [scheme for scheme in self.schemes if str(scheme.pk) == scheme_id] + data["parentage"] = [self.serialize_scheme(schemes[0])] + [ + self.serialize_concept(concept_id) for concept_id in concept_ids + ] + + return data + + def add_broader_concept_recursive(self, working_parent_list, conceptid): + broader_concepts = self.broader_concepts[conceptid] + try: + arbitrary_broader_conceptid = next(iter(broader_concepts)) + except StopIteration: + schemes = self.schemes_by_top_concept[conceptid] + arbitrary_scheme = next(iter(schemes)) + working_parent_list.insert(0, arbitrary_scheme) + return working_parent_list + else: + working_parent_list.insert(0, arbitrary_broader_conceptid) + return self.add_broader_concept_recursive( + working_parent_list, arbitrary_broader_conceptid + ) def serialize_concept_label(self, label_tile: dict): lang_code = self.language_concepts[label_tile[CONCEPT_NAME_LANGUAGE_NODE][0]] @@ -191,13 +232,49 @@ def get(self, request): self.language_concepts_map() self.top_concepts_map() self.narrower_concepts_map() - - self.schemes = ResourceInstance.objects.filter( - graph_id=SCHEMES_GRAPH_ID - ).annotate(labels=self.labels_subquery(SCHEME_NAME_NODEGROUP)) + self.populate_schemes() data = { "schemes": [self.serialize_scheme(scheme) for scheme in self.schemes], } # Todo: filter by nodegroup permissions return JSONResponse(data) + + +@method_decorator( + group_required("RDM Administrator", raise_exception=True), name="dispatch" +) +class ValueSearchView(ConceptTreeView): + def get(self, request): + search_term = request.GET.get("search") + if not search_term: + # Treat this as a request to clear & warm the cache. + return JSONResponse(status=HTTPStatus.IM_A_TEAPOT) + + # TODO: cache this + self.language_concepts_map() + self.top_concepts_map() + self.narrower_concepts_map() + self.populate_schemes() + + # TODO: fuzzy match, SEARCH_TERM_SENSITIVITY + concept_ids = ( + TileModel.objects.filter(nodegroup_id=CONCEPT_NAME_NODEGROUP) + .annotate(labels=self.labels_subquery(CONCEPT_NAME_NODEGROUP)) + # TODO: all languages + .filter( + **{ + f"data__{CONCEPT_NAME_CONTENT_NODE}__en__value__icontains": search_term + } + ) + .values_list("resourceinstance_id", flat=True) + ) + deduped = set(concept_ids) + + data = [ + self.serialize_concept(str(concept_uuid), parentage=True) + for concept_uuid in deduped + ] + + # Todo: filter by nodegroup permissions + return JSONResponse(data) From af32bc5491c666d76540b92a6c696f5864c1948e Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 21 Aug 2024 19:17:30 -0400 Subject: [PATCH 08/28] Add caching --- arches_lingo/settings.py | 3 ++ arches_lingo/views/trees.py | 58 ++++++++++++++++++++++++++++--------- tests/test_settings.py | 3 ++ 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/arches_lingo/settings.py b/arches_lingo/settings.py index f06645eb..f9e7711e 100644 --- a/arches_lingo/settings.py +++ b/arches_lingo/settings.py @@ -261,6 +261,9 @@ "default": { "BACKEND": "django.core.cache.backends.dummy.DummyCache", }, + "lingo": { + "BACKEND": "django.core.cache.backends.locmem.LocMemCache", + }, "user_permission": { "BACKEND": "django.core.cache.backends.db.DatabaseCache", "LOCATION": "user_permission_cache", diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index f7bb1dd0..8a29b319 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -1,6 +1,7 @@ from collections import defaultdict from http import HTTPStatus +from django.core.cache import caches from django.contrib.postgres.expressions import ArraySubquery from django.db.models import CharField, F, OuterRef, Subquery, Value from django.db.models.expressions import CombinedExpression, Func @@ -38,6 +39,8 @@ TOP_CONCEPT_OF_LOOKUP = f"data__{TOP_CONCEPT_OF_NODE_AND_NODEGROUP}" BROADER_LOOKUP = f"data__{CLASSIFICATION_STATUS_ASCRIBED_CLASSIFICATION_NODEID}" +cache = caches["lingo"] + class JsonbArrayElements(Func): """https://forum.djangoproject.com/t/django-4-2-behavior-change-when-using-arrayagg-on-unnested-arrayfield-postgresql-specific/21547/5""" @@ -54,7 +57,6 @@ def __init__(self): super().__init__() self.schemes = ResourceInstance.objects.none() - # Maps built during a GET call # key=concept valueid (str) val=language code self.language_concepts: dict[str:str] = {} # key=scheme resourceid (str) val=set of concept resourceids (str) @@ -70,6 +72,46 @@ def __init__(self): # key=resourceid (str) val=set of scheme resourceids (str) self.schemes_by_top_concept: dict[str : set[str]] = defaultdict(set) + self.read_from_cache() + + def read_from_cache(self): + from_cache = cache.get_many( + [ + "language_concepts", + "top_concepts", + "narrower_concepts", + "schemes", + "labels", + "broader_concepts", + "schemes_by_top_concept", + ] + ) + try: + self.language_concepts = from_cache["language_concepts"] + self.top_concepts = from_cache["top_concepts"] + self.narrower_concepts = from_cache["narrower_concepts"] + self.schemes = from_cache["schemes"] + self.labels = from_cache["labels"] + self.broader_concepts = from_cache["broader_concepts"] + self.schemes_by_top_concept = from_cache["schemes_by_top_concept"] + except KeyError: + self.rebuild_cache() + + def rebuild_cache(self): + self.language_concepts_map() + self.top_concepts_map() + self.narrower_concepts_map() + self.populate_schemes() + + cache.set("language_concepts", self.language_concepts) + cache.set("top_concepts", self.top_concepts) + cache.set("narrower_concepts", self.narrower_concepts) + cache.set("schemes", self.schemes) + cache.set("labels", self.labels) + # Reverse trees. + cache.set("broader_concepts", self.broader_concepts) + cache.set("schemes_by_top_concept", self.schemes_by_top_concept) + @staticmethod def human_label_type(value_id): if value_id == PREF_LABEL_VALUE_ID: @@ -229,11 +271,6 @@ def serialize_concept_label(self, label_tile: dict): } def get(self, request): - self.language_concepts_map() - self.top_concepts_map() - self.narrower_concepts_map() - self.populate_schemes() - data = { "schemes": [self.serialize_scheme(scheme) for scheme in self.schemes], } @@ -248,15 +285,10 @@ class ValueSearchView(ConceptTreeView): def get(self, request): search_term = request.GET.get("search") if not search_term: - # Treat this as a request to clear & warm the cache. + # Useful for warming the cache before a search. + self.rebuild_cache() return JSONResponse(status=HTTPStatus.IM_A_TEAPOT) - # TODO: cache this - self.language_concepts_map() - self.top_concepts_map() - self.narrower_concepts_map() - self.populate_schemes() - # TODO: fuzzy match, SEARCH_TERM_SENSITIVITY concept_ids = ( TileModel.objects.filter(nodegroup_id=CONCEPT_NAME_NODEGROUP) diff --git a/tests/test_settings.py b/tests/test_settings.py index 69d96579..47b91e97 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -37,6 +37,9 @@ "default": { "BACKEND": "django.core.cache.backends.dummy.DummyCache", }, + "lingo": { + "BACKEND": "django.core.cache.backends.locmem.LocMemCache", + }, "user_permission": { "BACKEND": "django.core.cache.backends.dummy.DummyCache", "LOCATION": "user_permission_cache", From 823b9d49d3a67202afbb38150575efca7ee84d5b Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Thu, 22 Aug 2024 10:45:30 -0400 Subject: [PATCH 09/28] Handle all language values --- arches_lingo/migrations/0001_initial.py | 52 +++++++++++++++++++++++++ arches_lingo/models.py | 18 +++++++++ arches_lingo/views/trees.py | 17 +++----- 3 files changed, 76 insertions(+), 11 deletions(-) create mode 100644 arches_lingo/migrations/0001_initial.py create mode 100644 arches_lingo/models.py diff --git a/arches_lingo/migrations/0001_initial.py b/arches_lingo/migrations/0001_initial.py new file mode 100644 index 00000000..293a2e70 --- /dev/null +++ b/arches_lingo/migrations/0001_initial.py @@ -0,0 +1,52 @@ +# Generated by Django 4.2.15 on 2024-08-22 09:01 +import textwrap + +from django.db import migrations, models + +from arches_lingo.const import CONCEPT_NAME_CONTENT_NODE + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [("models", "11042_update__arches_staging_to_tile")] + + forward = textwrap.dedent( + f""" + CREATE VIEW arches_lingo__vw_label_values AS ( + SELECT + t.resourceinstanceid AS conceptid, + ROW_TO_JSON(JSONB_EACH(t.tiledata -> '{CONCEPT_NAME_CONTENT_NODE}')) + -> 'value' ->> 'value' AS value + FROM + tiles t + ORDER BY + conceptid + );""" + ) + + reverse = "DROP VIEW arches_lingo__vw_label_values;" + + operations = [ + migrations.RunSQL(forward, reverse), + migrations.CreateModel( + name="VwLabelValue", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("value", models.CharField(db_column="value")), + ], + options={ + "db_table": "arches_lingo__vw_label_values", + "managed": False, + }, + ), + ] diff --git a/arches_lingo/models.py b/arches_lingo/models.py new file mode 100644 index 00000000..e239c1cd --- /dev/null +++ b/arches_lingo/models.py @@ -0,0 +1,18 @@ +from django.conf import settings +from django.db import models + +from arches.app.models.models import ResourceInstance + + +class VwLabelValue(models.Model): + concept = models.ForeignKey( + ResourceInstance, + related_name="label_values", + on_delete=models.DO_NOTHING, + db_column="conceptid", + ) + value = models.CharField(db_column="value") + + class Meta: + managed = False + db_table = f"{settings.APP_NAME}__vw_label_values" diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index 8a29b319..e57f2851 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -35,6 +35,7 @@ PREF_LABEL_VALUE_ID, ALT_LABEL_VALUE_ID, ) +from arches_lingo.models import VwLabelValue TOP_CONCEPT_OF_LOOKUP = f"data__{TOP_CONCEPT_OF_NODE_AND_NODEGROUP}" BROADER_LOOKUP = f"data__{CLASSIFICATION_STATUS_ASCRIBED_CLASSIFICATION_NODEID}" @@ -45,6 +46,7 @@ class JsonbArrayElements(Func): """https://forum.djangoproject.com/t/django-4-2-behavior-change-when-using-arrayagg-on-unnested-arrayfield-postgresql-specific/21547/5""" + arity = 1 contains_subquery = True function = "JSONB_ARRAY_ELEMENTS" @@ -291,21 +293,14 @@ def get(self, request): # TODO: fuzzy match, SEARCH_TERM_SENSITIVITY concept_ids = ( - TileModel.objects.filter(nodegroup_id=CONCEPT_NAME_NODEGROUP) - .annotate(labels=self.labels_subquery(CONCEPT_NAME_NODEGROUP)) - # TODO: all languages - .filter( - **{ - f"data__{CONCEPT_NAME_CONTENT_NODE}__en__value__icontains": search_term - } - ) - .values_list("resourceinstance_id", flat=True) + VwLabelValue.objects.filter(value__icontains=search_term) + .values_list("concept_id", flat=True) + .distinct() ) - deduped = set(concept_ids) data = [ self.serialize_concept(str(concept_uuid), parentage=True) - for concept_uuid in deduped + for concept_uuid in concept_ids ] # Todo: filter by nodegroup permissions From 8cb74a5ce9f07b16f609a36234d80a799d555a73 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Thu, 22 Aug 2024 11:40:19 -0400 Subject: [PATCH 10/28] Add fuzzy searching --- arches_lingo/migrations/0001_initial.py | 7 ++++ arches_lingo/models.py | 3 +- arches_lingo/views/trees.py | 43 +++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 5 deletions(-) diff --git a/arches_lingo/migrations/0001_initial.py b/arches_lingo/migrations/0001_initial.py index 293a2e70..99ebdceb 100644 --- a/arches_lingo/migrations/0001_initial.py +++ b/arches_lingo/migrations/0001_initial.py @@ -1,11 +1,17 @@ # Generated by Django 4.2.15 on 2024-08-22 09:01 import textwrap +from django.contrib.postgres.operations import CreateExtension from django.db import migrations, models from arches_lingo.const import CONCEPT_NAME_CONTENT_NODE +class FuzzyStrMatchExtension(CreateExtension): + def __init__(self): + self.name = "fuzzystrmatch" + + class Migration(migrations.Migration): initial = True @@ -49,4 +55,5 @@ class Migration(migrations.Migration): "managed": False, }, ), + FuzzyStrMatchExtension(), ] diff --git a/arches_lingo/models.py b/arches_lingo/models.py index e239c1cd..fa87287a 100644 --- a/arches_lingo/models.py +++ b/arches_lingo/models.py @@ -1,4 +1,3 @@ -from django.conf import settings from django.db import models from arches.app.models.models import ResourceInstance @@ -15,4 +14,4 @@ class VwLabelValue(models.Model): class Meta: managed = False - db_table = f"{settings.APP_NAME}__vw_label_values" + db_table = f"arches_lingo__vw_label_values" diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index e57f2851..9575af13 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -3,7 +3,14 @@ from django.core.cache import caches from django.contrib.postgres.expressions import ArraySubquery -from django.db.models import CharField, F, OuterRef, Subquery, Value +from django.db.models import ( + CharField, + FloatField, + F, + OuterRef, + Subquery, + Value, +) from django.db.models.expressions import CombinedExpression, Func from django.utils.translation import gettext_lazy as _ from django.utils.decorators import method_decorator @@ -15,6 +22,7 @@ TileModel, Value as ConceptValue, ) +from arches.app.models.system_settings import settings from arches.app.utils.decorators import group_required from arches.app.utils.response import JSONResponse @@ -51,6 +59,11 @@ class JsonbArrayElements(Func): function = "JSONB_ARRAY_ELEMENTS" +class LevenshteinLessEqual(Func): + arity = 3 + function = "LEVENSHTEIN_LESS_EQUAL" + + @method_decorator( group_required("RDM Administrator", raise_exception=True), name="dispatch" ) @@ -286,14 +299,24 @@ def get(self, request): class ValueSearchView(ConceptTreeView): def get(self, request): search_term = request.GET.get("search") + max_edit_distance = request.GET.get( + "maxEditDistance", self.default_sensitivity() + ) if not search_term: # Useful for warming the cache before a search. self.rebuild_cache() return JSONResponse(status=HTTPStatus.IM_A_TEAPOT) - # TODO: fuzzy match, SEARCH_TERM_SENSITIVITY concept_ids = ( - VwLabelValue.objects.filter(value__icontains=search_term) + VwLabelValue.objects.annotate( + edit_distance=LevenshteinLessEqual( + F("value"), + Value(search_term), + Value(max_edit_distance), + output_field=FloatField(), + ) + ) + .filter(edit_distance__lte=max_edit_distance) .values_list("concept_id", flat=True) .distinct() ) @@ -305,3 +328,17 @@ def get(self, request): # Todo: filter by nodegroup permissions return JSONResponse(data) + + @staticmethod + def default_sensitivity(): + """Remains to be seen whether the existing elastic sensitivity setting + should be the fallback, but stub something out for now. + This sensitivity setting is actually inversely related to edit distance, + because it's prefix_length in elastic, not fuzziness, so invert it. + """ + elastic_prefix_length = settings.SEARCH_TERM_SENSITIVITY + if elastic_prefix_length <= 0: + return 5 + if elastic_prefix_length >= 5: + return 0 + return int(5 - elastic_prefix_length) From 8a3991c1b7b9dc44500439799dceefd71b614e4d Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Thu, 22 Aug 2024 16:00:04 -0400 Subject: [PATCH 11/28] Make chosen parentage deterministic --- arches_lingo/views/trees.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index 9575af13..94303a12 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -247,7 +247,6 @@ def serialize_concept(self, conceptid: str, *, parentage=False): ], } if parentage: - # Choose any reverse path back to the scheme (currently indeterminate). path = self.add_broader_concept_recursive([], conceptid) scheme_id, concept_ids = path[0], path[1:] schemes = [scheme for scheme in self.schemes if str(scheme.pk) == scheme_id] @@ -258,18 +257,19 @@ def serialize_concept(self, conceptid: str, *, parentage=False): return data def add_broader_concept_recursive(self, working_parent_list, conceptid): - broader_concepts = self.broader_concepts[conceptid] + # TODO: sort on sortorder at higher stacklevel once captured in original data. + broader_concepts = sorted(self.broader_concepts[conceptid]) try: - arbitrary_broader_conceptid = next(iter(broader_concepts)) - except StopIteration: - schemes = self.schemes_by_top_concept[conceptid] - arbitrary_scheme = next(iter(schemes)) - working_parent_list.insert(0, arbitrary_scheme) + first_broader_conceptid = broader_concepts[0] + except IndexError: + # TODO: sort here too. + schemes = sorted(self.schemes_by_top_concept[conceptid]) + working_parent_list.insert(0, schemes[0]) return working_parent_list else: - working_parent_list.insert(0, arbitrary_broader_conceptid) + working_parent_list.insert(0, first_broader_conceptid) return self.add_broader_concept_recursive( - working_parent_list, arbitrary_broader_conceptid + working_parent_list, first_broader_conceptid ) def serialize_concept_label(self, label_tile: dict): From 4dcd7f39a7b2c6fbc6090b45a01bd8a3b19d2898 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Thu, 22 Aug 2024 16:07:03 -0400 Subject: [PATCH 12/28] Add polyhierarchical flag --- arches_lingo/views/trees.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index 94303a12..5b1187eb 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -89,6 +89,9 @@ def __init__(self): self.read_from_cache() + # Not currently cached because written to during serialization. + self.polyhierarchical_concepts = set() + def read_from_cache(self): from_cache = cache.get_many( [ @@ -248,12 +251,19 @@ def serialize_concept(self, conceptid: str, *, parentage=False): } if parentage: path = self.add_broader_concept_recursive([], conceptid) - scheme_id, concept_ids = path[0], path[1:] + scheme_id, parent_concept_ids = path[0], path[1:] + if len(parent_concept_ids) > 1: + self.polyhierarchical_concepts.add(conceptid) schemes = [scheme for scheme in self.schemes if str(scheme.pk) == scheme_id] data["parentage"] = [self.serialize_scheme(schemes[0])] + [ - self.serialize_concept(concept_id) for concept_id in concept_ids + self.serialize_concept(parent_id) for parent_id in parent_concept_ids ] + self_and_parent_ids = set([conceptid] + parent_concept_ids) + data["polyhierarchical"] = bool( + self_and_parent_ids.intersection(self.polyhierarchical_concepts) + ) + return data def add_broader_concept_recursive(self, working_parent_list, conceptid): From bf868dbabb23f7c95f60d1d75a9050ee4a880fdb Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Thu, 22 Aug 2024 16:23:34 -0400 Subject: [PATCH 13/28] Shorten the serialization of parent path (just show labels) --- arches_lingo/views/trees.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index 5b1187eb..f583747c 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -214,16 +214,18 @@ def populate_schemes(self): graph_id=SCHEMES_GRAPH_ID ).annotate(labels=self.labels_subquery(SCHEME_NAME_NODEGROUP)) - def serialize_scheme(self, scheme: ResourceInstance): + def serialize_scheme(self, scheme: ResourceInstance, *, children=True): scheme_id: str = str(scheme.pk) - return { + data = { "id": scheme_id, "labels": [self.serialize_scheme_label(label) for label in scheme.labels], - "top_concepts": [ + } + if children: + data["top_concepts"] = [ self.serialize_concept(concept_id) for concept_id in self.top_concepts[scheme_id] - ], - } + ] + return data def serialize_scheme_label(self, label_tile: dict): lang_code = self.language_concepts[label_tile[SCHEME_NAME_LANGUAGE_NODE][0]] @@ -238,25 +240,27 @@ def serialize_scheme_label(self, label_tile: dict): "value": value, } - def serialize_concept(self, conceptid: str, *, parentage=False): + def serialize_concept(self, conceptid: str, *, parents=False, children=True): data = { "id": conceptid, "labels": [ self.serialize_concept_label(label) for label in self.labels[conceptid] ], - "narrower": [ + } + if children: + data["narrower"] = [ self.serialize_concept(conceptid) for conceptid in self.narrower_concepts[conceptid] - ], - } - if parentage: + ] + if parents: path = self.add_broader_concept_recursive([], conceptid) scheme_id, parent_concept_ids = path[0], path[1:] if len(parent_concept_ids) > 1: self.polyhierarchical_concepts.add(conceptid) schemes = [scheme for scheme in self.schemes if str(scheme.pk) == scheme_id] - data["parentage"] = [self.serialize_scheme(schemes[0])] + [ - self.serialize_concept(parent_id) for parent_id in parent_concept_ids + data["parents"] = [self.serialize_scheme(schemes[0], children=False)] + [ + self.serialize_concept(parent_id, children=False) + for parent_id in parent_concept_ids ] self_and_parent_ids = set([conceptid] + parent_concept_ids) @@ -332,7 +336,7 @@ def get(self, request): ) data = [ - self.serialize_concept(str(concept_uuid), parentage=True) + self.serialize_concept(str(concept_uuid), parents=True) for concept_uuid in concept_ids ] From 744880a614511c772e83c076a41ce2864affe735 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Thu, 22 Aug 2024 16:41:05 -0400 Subject: [PATCH 14/28] Add pagination --- arches_lingo/views/trees.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index f583747c..aaab65fd 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -1,8 +1,9 @@ from collections import defaultdict from http import HTTPStatus -from django.core.cache import caches from django.contrib.postgres.expressions import ArraySubquery +from django.core.cache import caches +from django.core.paginator import Paginator from django.db.models import ( CharField, FloatField, @@ -312,16 +313,18 @@ def get(self, request): ) class ValueSearchView(ConceptTreeView): def get(self, request): - search_term = request.GET.get("search") - max_edit_distance = request.GET.get( - "maxEditDistance", self.default_sensitivity() - ) - if not search_term: + if not (search_term := request.GET.get("search")): # Useful for warming the cache before a search. self.rebuild_cache() return JSONResponse(status=HTTPStatus.IM_A_TEAPOT) - concept_ids = ( + max_edit_distance = request.GET.get( + "maxEditDistance", self.default_sensitivity() + ) + page_number = request.GET.get("page", 1) + items_per_page = request.GET.get("items", 25) + + concept_query = ( VwLabelValue.objects.annotate( edit_distance=LevenshteinLessEqual( F("value"), @@ -331,13 +334,17 @@ def get(self, request): ) ) .filter(edit_distance__lte=max_edit_distance) + .order_by("edit_distance") .values_list("concept_id", flat=True) .distinct() ) + paginator = Paginator(concept_query, items_per_page) + page = paginator.get_page(page_number) + data = [ self.serialize_concept(str(concept_uuid), parents=True) - for concept_uuid in concept_ids + for concept_uuid in page ] # Todo: filter by nodegroup permissions From ab3d427e0a7a3829ce8436df9321412957344e42 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Thu, 22 Aug 2024 16:43:50 -0400 Subject: [PATCH 15/28] Rename search term param --- arches_lingo/views/trees.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index aaab65fd..ea50ce86 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -313,7 +313,7 @@ def get(self, request): ) class ValueSearchView(ConceptTreeView): def get(self, request): - if not (search_term := request.GET.get("search")): + if not (term := request.GET.get("term")): # Useful for warming the cache before a search. self.rebuild_cache() return JSONResponse(status=HTTPStatus.IM_A_TEAPOT) @@ -328,7 +328,7 @@ def get(self, request): VwLabelValue.objects.annotate( edit_distance=LevenshteinLessEqual( F("value"), - Value(search_term), + Value(term), Value(max_edit_distance), output_field=FloatField(), ) From c166e832dd6c3f7a88074d0d816e1dc5db0c90ed Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Thu, 22 Aug 2024 17:00:16 -0400 Subject: [PATCH 16/28] Avoid empty narrower keys --- arches_lingo/views/trees.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index ea50ce86..f65f9cc7 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -343,7 +343,7 @@ def get(self, request): page = paginator.get_page(page_number) data = [ - self.serialize_concept(str(concept_uuid), parents=True) + self.serialize_concept(str(concept_uuid), parents=True, children=False) for concept_uuid in page ] From d160f0d19b1699755f085e0d58135e732fb8585e Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Mon, 26 Aug 2024 08:50:57 -0400 Subject: [PATCH 17/28] Return all results when term is empty --- arches_lingo/views/trees.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index f65f9cc7..a0eb7154 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -1,5 +1,4 @@ from collections import defaultdict -from http import HTTPStatus from django.contrib.postgres.expressions import ArraySubquery from django.core.cache import caches @@ -313,31 +312,30 @@ def get(self, request): ) class ValueSearchView(ConceptTreeView): def get(self, request): - if not (term := request.GET.get("term")): - # Useful for warming the cache before a search. - self.rebuild_cache() - return JSONResponse(status=HTTPStatus.IM_A_TEAPOT) - + term = request.GET.get("term") max_edit_distance = request.GET.get( "maxEditDistance", self.default_sensitivity() ) page_number = request.GET.get("page", 1) items_per_page = request.GET.get("items", 25) - concept_query = ( - VwLabelValue.objects.annotate( - edit_distance=LevenshteinLessEqual( - F("value"), - Value(term), - Value(max_edit_distance), - output_field=FloatField(), + concept_query = VwLabelValue.objects.all() + if term: + concept_query = ( + concept_query.annotate( + edit_distance=LevenshteinLessEqual( + F("value"), + Value(term), + Value(max_edit_distance), + output_field=FloatField(), + ) ) + .filter(edit_distance__lte=max_edit_distance) + .order_by("edit_distance") ) - .filter(edit_distance__lte=max_edit_distance) - .order_by("edit_distance") - .values_list("concept_id", flat=True) - .distinct() - ) + else: + concept_query = concept_query.order_by("concept_id") + concept_query = concept_query.values_list("concept_id", flat=True).distinct() paginator = Paginator(concept_query, items_per_page) page = paginator.get_page(page_number) From 1ddb5a41e18b69ed2b3cfdd543eebc8c4555d062 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Tue, 27 Aug 2024 14:53:34 -0400 Subject: [PATCH 18/28] Avoid looking up languages against the Languages model Trust the concept value instead. --- arches_lingo/const.py | 2 +- arches_lingo/views/trees.py | 26 +++++++++++++------------- tests/test_settings.py | 2 +- tests/tests.py | 6 ++++++ 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/arches_lingo/const.py b/arches_lingo/const.py index 776a52d3..caa01568 100644 --- a/arches_lingo/const.py +++ b/arches_lingo/const.py @@ -53,5 +53,5 @@ HIDDEN_LABEL_VALUE_ID = "18c46580-8c3c-48b7-9a6c-a0643708cb8b" # Old RDM concepts, values -LANGUAGE_CONCEPT_ID = "845cc417-ef77-4582-9271-ffba5e4cabc9" +LANGUAGE_CONCEPT_ID = "a6b88323-7226-4428-8f41-3d5252e3a2a9" ENGLISH_VALUE_ID = "de978fd0-2819-4855-8858-8c089780f32c" diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/trees.py index a0eb7154..05a42366 100644 --- a/arches_lingo/views/trees.py +++ b/arches_lingo/views/trees.py @@ -5,10 +5,10 @@ from django.core.paginator import Paginator from django.db.models import ( CharField, + Exists, FloatField, F, OuterRef, - Subquery, Value, ) from django.db.models.expressions import CombinedExpression, Func @@ -17,7 +17,7 @@ from django.views.generic import View from arches.app.models.models import ( - Language, + Relation, ResourceInstance, TileModel, Value as ConceptValue, @@ -36,6 +36,7 @@ CONCEPT_NAME_LANGUAGE_NODE, CONCEPT_NAME_TYPE_NODE, HIDDEN_LABEL_VALUE_ID, + LANGUAGE_CONCEPT_ID, SCHEME_NAME_NODEGROUP, SCHEME_NAME_CONTENT_NODE, SCHEME_NAME_LANGUAGE_NODE, @@ -167,19 +168,18 @@ def labels_subquery(label_nodegroup): ) def language_concepts_map(self): - languages = ( - Language.objects.annotate( - concept_value=Subquery( - ConceptValue.objects.filter( - valuetype="prefLabel", value=OuterRef("code") - ).values("valueid") + language_preflabels = ConceptValue.objects.filter( + Exists( + Relation.objects.filter( + conceptfrom=LANGUAGE_CONCEPT_ID, + conceptto=OuterRef("concept_id"), + relationtype="narrower", ) - ) - .exclude(concept_value=None) - .distinct() + ), + valuetype="prefLabel", ) - for lang in languages: - self.language_concepts[str(lang.concept_value)] = lang.code + for language_label in language_preflabels: + self.language_concepts[str(language_label.pk)] = language_label.value def top_concepts_map(self): top_concept_of_tiles = ( diff --git a/tests/test_settings.py b/tests/test_settings.py index 47b91e97..d7cc1814 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -5,7 +5,7 @@ PACKAGE_NAME = "arches_lingo" PROJECT_TEST_ROOT = os.path.dirname(__file__) -MEDIA_ROOT = os.path.join(PROJECT_TEST_ROOT, "fixtures", "data") +MEDIA_ROOT = os.path.join(PROJECT_TEST_ROOT, "data") BUSINESS_DATA_FILES = ( # Put strings here, like "/home/html/django_templates" or "C:/www/django/templates". diff --git a/tests/tests.py b/tests/tests.py index 84c778fc..1df9b922 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -13,6 +13,7 @@ GraphModel, Node, NodeGroup, + Relation, ResourceInstance, TileModel, Value, @@ -103,6 +104,11 @@ def setUpModule(): valuetype_id="prefLabel", value="en", ) + Relation.objects.get_or_create( + conceptfrom_id=LANGUAGE_CONCEPT_ID, + conceptto_id=LANGUAGE_CONCEPT_ID, + relationtype_id="narrower", + ) def localized_string(text, language="en", direction="ltr"): From 7df77dc6f8bb8fb22b922aaa97ecd0d34c3f2270 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Tue, 27 Aug 2024 15:26:13 -0400 Subject: [PATCH 19/28] Test basic search API params --- tests/tests.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index 1df9b922..e067ed73 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -2,7 +2,6 @@ from django.contrib.auth.models import User from django.test import TestCase -from django.test.client import Client from django.urls import reverse # these tests can be run from the command line via @@ -115,15 +114,11 @@ def localized_string(text, language="en", direction="ltr"): return {language: {"value": text, "direction": direction}} -class ConceptTreeViewTests(TestCase): +class ViewTests(TestCase): @classmethod - def setUpClass(cls): - super().setUpClass() - cls.client = Client() + def setUpTestData(cls): cls.admin = User.objects.get(username="admin") - @classmethod - def setUpTestData(cls): # Create a scheme with five concepts, each one narrower than the last, # and each concept after the top concept also narrower than the top. cls.scheme = ResourceInstance.objects.create(graph_id=SCHEMES_GRAPH_ID) @@ -132,7 +127,7 @@ def setUpTestData(cls): nodegroup_id=SCHEME_NAME_NODEGROUP, data={ SCHEME_NAME_CONTENT_NODE: localized_string("Test Scheme"), - SCHEME_NAME_TYPE_NODE: [PREF_LABEL_VALUE_ID], + SCHEME_NAME_TYPE_NODE: PREF_LABEL_VALUE_ID, SCHEME_NAME_LANGUAGE_NODE: [ENGLISH_VALUE_ID], }, ) @@ -152,7 +147,7 @@ def setUpTestData(cls): nodegroup_id=CONCEPT_NAME_NODEGROUP, data={ CONCEPT_NAME_CONTENT_NODE: localized_string(f"Concept {i + 1}"), - CONCEPT_NAME_TYPE_NODE: [PREF_LABEL_VALUE_ID], + CONCEPT_NAME_TYPE_NODE: PREF_LABEL_VALUE_ID, CONCEPT_NAME_LANGUAGE_NODE: [ENGLISH_VALUE_ID], }, ) @@ -197,7 +192,7 @@ def test_get_concept_trees(self): with self.assertNumQueries(6): # 1: session # 2: auth - # 3: select languages, subquery for concept values + # 3: select relations (to find languages) # 4: select broader tiles, subquery for labels # 5: select top concept tiles, subquery for labels # 6: select schemes, subquery for labels @@ -223,3 +218,18 @@ def test_get_concept_trees(self): {n["labels"][0]["value"] for n in concept_2["narrower"]}, {"Concept 3"}, ) + + def test_search(self): + self.client.force_login(self.admin) + + cases = ( + ["term=Concept 1", 5], + ["term=Concept 1&maxEditDistance=0", 1], + ["term=Concept 1&items=1", 1], + ["term=Concept 1&items=2&page=3", 1], + ) + for query, expected_result_count in cases: + with self.subTest(query=query): + response = self.client.get(reverse("api_search"), QUERY_STRING=query) + result = json.loads(response.content) + self.assertEqual(len(result), expected_result_count, result) From 890e0295bbc5e9ac79d957a58b82acee97d52b38 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 28 Aug 2024 10:48:34 -0400 Subject: [PATCH 20/28] Import views from submodules --- arches_lingo/urls.py | 3 ++- arches_lingo/views/__init__.py | 2 -- arches_lingo/views/{trees.py => concepts.py} | 0 3 files changed, 2 insertions(+), 3 deletions(-) rename arches_lingo/views/{trees.py => concepts.py} (100%) diff --git a/arches_lingo/urls.py b/arches_lingo/urls.py index 1e5301af..396fd3f2 100644 --- a/arches_lingo/urls.py +++ b/arches_lingo/urls.py @@ -3,7 +3,8 @@ from django.conf.urls.i18n import i18n_patterns from django.urls import include, path -from arches_lingo.views import LingoRootView, ConceptTreeView, ValueSearchView +from arches_lingo.views.root import LingoRootView +from arches_lingo.views.concepts import ConceptTreeView, ValueSearchView urlpatterns = [ path("", LingoRootView.as_view(), name="root"), diff --git a/arches_lingo/views/__init__.py b/arches_lingo/views/__init__.py index ea7dc745..e69de29b 100644 --- a/arches_lingo/views/__init__.py +++ b/arches_lingo/views/__init__.py @@ -1,2 +0,0 @@ -from .root import * -from .trees import * diff --git a/arches_lingo/views/trees.py b/arches_lingo/views/concepts.py similarity index 100% rename from arches_lingo/views/trees.py rename to arches_lingo/views/concepts.py From 9abcb330ddd502ced86827d7a6a4d6cba0e5f0eb Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 28 Aug 2024 10:49:51 -0400 Subject: [PATCH 21/28] Factor out api views --- arches_lingo/urls.py | 4 ++-- arches_lingo/views/{ => api}/concepts.py | 0 tests/tests.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename arches_lingo/views/{ => api}/concepts.py (100%) diff --git a/arches_lingo/urls.py b/arches_lingo/urls.py index 396fd3f2..f16faa89 100644 --- a/arches_lingo/urls.py +++ b/arches_lingo/urls.py @@ -4,7 +4,7 @@ from django.urls import include, path from arches_lingo.views.root import LingoRootView -from arches_lingo.views.concepts import ConceptTreeView, ValueSearchView +from arches_lingo.views.api.concepts import ConceptTreeView, ValueSearchView urlpatterns = [ path("", LingoRootView.as_view(), name="root"), @@ -12,7 +12,7 @@ path("search", LingoRootView.as_view(), name="search"), path("advanced-search", LingoRootView.as_view(), name="advanced-search"), path("schemes", LingoRootView.as_view(), name="schemes"), - path("api/concept_trees", ConceptTreeView.as_view(), name="concept_trees"), + path("api/concepts", ConceptTreeView.as_view(), name="api_concepts"), path("api/search", ValueSearchView.as_view(), name="api_search"), path("", include("arches_references.urls")), ] diff --git a/arches_lingo/views/concepts.py b/arches_lingo/views/api/concepts.py similarity index 100% rename from arches_lingo/views/concepts.py rename to arches_lingo/views/api/concepts.py diff --git a/tests/tests.py b/tests/tests.py index e067ed73..4ef30244 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -196,7 +196,7 @@ def test_get_concept_trees(self): # 4: select broader tiles, subquery for labels # 5: select top concept tiles, subquery for labels # 6: select schemes, subquery for labels - response = self.client.get(reverse("concept_trees")) + response = self.client.get(reverse("api_concepts")) self.assertEqual(response.status_code, 200) result = json.loads(response.content) From 1aae8a254289d6cb8fb79ee47aa6d38c9ac43832 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 28 Aug 2024 11:20:31 -0400 Subject: [PATCH 22/28] Factor out ConceptBuilder --- arches_lingo/concepts.py | 271 ++++++++++++++++++++++++ arches_lingo/models.py | 4 + arches_lingo/query_utils.py | 14 ++ arches_lingo/querysets.py | 23 +++ arches_lingo/views/api/concepts.py | 321 ++--------------------------- 5 files changed, 326 insertions(+), 307 deletions(-) create mode 100644 arches_lingo/concepts.py create mode 100644 arches_lingo/query_utils.py create mode 100644 arches_lingo/querysets.py diff --git a/arches_lingo/concepts.py b/arches_lingo/concepts.py new file mode 100644 index 00000000..564a62d1 --- /dev/null +++ b/arches_lingo/concepts.py @@ -0,0 +1,271 @@ +from collections import defaultdict + +from django.contrib.postgres.expressions import ArraySubquery +from django.core.cache import caches +from django.db.models import CharField, Exists, F, OuterRef, Value +from django.db.models.expressions import CombinedExpression +from django.utils.translation import gettext_lazy as _ + +from arches.app.models.models import ( + Relation, + ResourceInstance, + TileModel, + Value as ConceptValue, +) + +from arches_lingo.const import ( + SCHEMES_GRAPH_ID, + TOP_CONCEPT_OF_NODE_AND_NODEGROUP, + CLASSIFICATION_STATUS_NODEGROUP, + CLASSIFICATION_STATUS_ASCRIBED_CLASSIFICATION_NODEID, + CONCEPT_NAME_NODEGROUP, + CONCEPT_NAME_CONTENT_NODE, + CONCEPT_NAME_LANGUAGE_NODE, + CONCEPT_NAME_TYPE_NODE, + HIDDEN_LABEL_VALUE_ID, + LANGUAGE_CONCEPT_ID, + SCHEME_NAME_NODEGROUP, + SCHEME_NAME_CONTENT_NODE, + SCHEME_NAME_LANGUAGE_NODE, + SCHEME_NAME_TYPE_NODE, + PREF_LABEL_VALUE_ID, + ALT_LABEL_VALUE_ID, +) +from arches_lingo.query_utils import JsonbArrayElements + +TOP_CONCEPT_OF_LOOKUP = f"data__{TOP_CONCEPT_OF_NODE_AND_NODEGROUP}" +BROADER_LOOKUP = f"data__{CLASSIFICATION_STATUS_ASCRIBED_CLASSIFICATION_NODEID}" + +cache = caches["lingo"] + + +class ConceptBuilder: + def __init__(self): + super().__init__() + self.schemes = ResourceInstance.objects.none() + + # key=concept valueid (str) val=language code + self.language_concepts: dict[str:str] = {} + # key=scheme resourceid (str) val=set of concept resourceids (str) + self.top_concepts: dict[str : set[str]] = defaultdict(set) + # key=concept resourceid (str) val=set of concept resourceids (str) + self.narrower_concepts: dict[str : set[str]] = defaultdict(set) + # key=resourceid (str) val=list of label dicts + self.labels: dict[str : list[dict]] = defaultdict(set) + + # Maps representing a reverse (leaf-first) tree + # key=resourceid (str) val=set of concept resourceids (str) + self.broader_concepts: dict[str : set[str]] = defaultdict(set) + # key=resourceid (str) val=set of scheme resourceids (str) + self.schemes_by_top_concept: dict[str : set[str]] = defaultdict(set) + + self.read_from_cache() + + # Not currently cached because written to during serialization. + self.polyhierarchical_concepts = set() + + def read_from_cache(self): + from_cache = cache.get_many( + [ + "language_concepts", + "top_concepts", + "narrower_concepts", + "schemes", + "labels", + "broader_concepts", + "schemes_by_top_concept", + ] + ) + try: + self.language_concepts = from_cache["language_concepts"] + self.top_concepts = from_cache["top_concepts"] + self.narrower_concepts = from_cache["narrower_concepts"] + self.schemes = from_cache["schemes"] + self.labels = from_cache["labels"] + self.broader_concepts = from_cache["broader_concepts"] + self.schemes_by_top_concept = from_cache["schemes_by_top_concept"] + except KeyError: + self.rebuild_cache() + + def rebuild_cache(self): + self.language_concepts_map() + self.top_concepts_map() + self.narrower_concepts_map() + self.populate_schemes() + + cache.set("language_concepts", self.language_concepts) + cache.set("top_concepts", self.top_concepts) + cache.set("narrower_concepts", self.narrower_concepts) + cache.set("schemes", self.schemes) + cache.set("labels", self.labels) + # Reverse trees. + cache.set("broader_concepts", self.broader_concepts) + cache.set("schemes_by_top_concept", self.schemes_by_top_concept) + + @staticmethod + def human_label_type(value_id): + if value_id == PREF_LABEL_VALUE_ID: + return "prefLabel" + if value_id == ALT_LABEL_VALUE_ID: + return "altLabel" + if value_id == HIDDEN_LABEL_VALUE_ID: + return "hidden" + return "unknown" + + @staticmethod + def resources_from_tiles(lookup_expression: str): + return CombinedExpression( + JsonbArrayElements(F(lookup_expression)), + "->>", + Value("resourceId"), + output_field=CharField(), + ) + + @staticmethod + def labels_subquery(label_nodegroup): + if label_nodegroup == SCHEME_NAME_NODEGROUP: + # Annotating a ResourceInstance + outer = OuterRef("resourceinstanceid") + nodegroup_id = SCHEME_NAME_NODEGROUP + else: + # Annotating a Tile + outer = OuterRef("resourceinstance_id") + nodegroup_id = CONCEPT_NAME_NODEGROUP + + return ArraySubquery( + TileModel.objects.filter( + resourceinstance_id=outer, nodegroup_id=nodegroup_id + ).values("data") + ) + + def language_concepts_map(self): + language_preflabels = ConceptValue.objects.filter( + Exists( + Relation.objects.filter( + conceptfrom=LANGUAGE_CONCEPT_ID, + conceptto=OuterRef("concept_id"), + relationtype="narrower", + ) + ), + valuetype="prefLabel", + ) + for language_label in language_preflabels: + self.language_concepts[str(language_label.pk)] = language_label.value + + def top_concepts_map(self): + top_concept_of_tiles = ( + TileModel.objects.filter(nodegroup_id=TOP_CONCEPT_OF_NODE_AND_NODEGROUP) + .annotate(top_concept_of=self.resources_from_tiles(TOP_CONCEPT_OF_LOOKUP)) + .annotate(labels=self.labels_subquery(CONCEPT_NAME_NODEGROUP)) + .values("resourceinstance_id", "top_concept_of", "labels") + ) + for tile in top_concept_of_tiles: + scheme_id = tile["top_concept_of"] + top_concept_id = str(tile["resourceinstance_id"]) + self.top_concepts[scheme_id].add(top_concept_id) + self.schemes_by_top_concept[top_concept_id].add(scheme_id) + self.labels[top_concept_id] = tile["labels"] + + def narrower_concepts_map(self): + broader_concept_tiles = ( + TileModel.objects.filter(nodegroup_id=CLASSIFICATION_STATUS_NODEGROUP) + .annotate(broader_concept=self.resources_from_tiles(BROADER_LOOKUP)) + .annotate(labels=self.labels_subquery(CONCEPT_NAME_NODEGROUP)) + .values("resourceinstance_id", "broader_concept", "labels") + ) + for tile in broader_concept_tiles.iterator(): + broader_concept_id = tile["broader_concept"] + narrower_concept_id: str = str(tile["resourceinstance_id"]) + self.narrower_concepts[broader_concept_id].add(narrower_concept_id) + self.broader_concepts[narrower_concept_id].add(broader_concept_id) + self.labels[narrower_concept_id] = tile["labels"] + + def populate_schemes(self): + self.schemes = ResourceInstance.objects.filter( + graph_id=SCHEMES_GRAPH_ID + ).annotate(labels=self.labels_subquery(SCHEME_NAME_NODEGROUP)) + + def serialize_scheme(self, scheme: ResourceInstance, *, children=True): + scheme_id: str = str(scheme.pk) + data = { + "id": scheme_id, + "labels": [self.serialize_scheme_label(label) for label in scheme.labels], + } + if children: + data["top_concepts"] = [ + self.serialize_concept(concept_id) + for concept_id in self.top_concepts[scheme_id] + ] + return data + + def serialize_scheme_label(self, label_tile: dict): + lang_code = self.language_concepts[label_tile[SCHEME_NAME_LANGUAGE_NODE][0]] + localized_string_objs = label_tile[SCHEME_NAME_CONTENT_NODE].values() + try: + value = next(iter(localized_string_objs))["value"] + except (StopIteration, KeyError): + value = "Unknown" + return { + "valuetype": self.human_label_type(label_tile[SCHEME_NAME_TYPE_NODE]), + "language": lang_code, + "value": value, + } + + def serialize_concept(self, conceptid: str, *, parents=False, children=True): + data = { + "id": conceptid, + "labels": [ + self.serialize_concept_label(label) for label in self.labels[conceptid] + ], + } + if children: + data["narrower"] = [ + self.serialize_concept(conceptid) + for conceptid in self.narrower_concepts[conceptid] + ] + if parents: + path = self.add_broader_concept_recursive([], conceptid) + scheme_id, parent_concept_ids = path[0], path[1:] + if len(parent_concept_ids) > 1: + self.polyhierarchical_concepts.add(conceptid) + schemes = [scheme for scheme in self.schemes if str(scheme.pk) == scheme_id] + data["parents"] = [self.serialize_scheme(schemes[0], children=False)] + [ + self.serialize_concept(parent_id, children=False) + for parent_id in parent_concept_ids + ] + + self_and_parent_ids = set([conceptid] + parent_concept_ids) + data["polyhierarchical"] = bool( + self_and_parent_ids.intersection(self.polyhierarchical_concepts) + ) + + return data + + def add_broader_concept_recursive(self, working_parent_list, conceptid): + # TODO: sort on sortorder at higher stacklevel once captured in original data. + broader_concepts = sorted(self.broader_concepts[conceptid]) + try: + first_broader_conceptid = broader_concepts[0] + except IndexError: + # TODO: sort here too. + schemes = sorted(self.schemes_by_top_concept[conceptid]) + working_parent_list.insert(0, schemes[0]) + return working_parent_list + else: + working_parent_list.insert(0, first_broader_conceptid) + return self.add_broader_concept_recursive( + working_parent_list, first_broader_conceptid + ) + + def serialize_concept_label(self, label_tile: dict): + lang_code = self.language_concepts[label_tile[CONCEPT_NAME_LANGUAGE_NODE][0]] + localized_string_objs = label_tile[CONCEPT_NAME_CONTENT_NODE].values() + try: + value = next(iter(localized_string_objs))["value"] + except (StopIteration, KeyError): + value = "Unknown" + return { + "valuetype": self.human_label_type(label_tile[CONCEPT_NAME_TYPE_NODE]), + "language": lang_code, + "value": value, + } diff --git a/arches_lingo/models.py b/arches_lingo/models.py index fa87287a..6ce18cc9 100644 --- a/arches_lingo/models.py +++ b/arches_lingo/models.py @@ -2,6 +2,8 @@ from arches.app.models.models import ResourceInstance +from arches_lingo.querysets import LabelValueQuerySet + class VwLabelValue(models.Model): concept = models.ForeignKey( @@ -12,6 +14,8 @@ class VwLabelValue(models.Model): ) value = models.CharField(db_column="value") + objects = LabelValueQuerySet.as_manager() + class Meta: managed = False db_table = f"arches_lingo__vw_label_values" diff --git a/arches_lingo/query_utils.py b/arches_lingo/query_utils.py new file mode 100644 index 00000000..f8749a2e --- /dev/null +++ b/arches_lingo/query_utils.py @@ -0,0 +1,14 @@ +from django.db.models.expressions import Func + + +class JsonbArrayElements(Func): + """https://forum.djangoproject.com/t/django-4-2-behavior-change-when-using-arrayagg-on-unnested-arrayfield-postgresql-specific/21547/5""" + + arity = 1 + contains_subquery = True + function = "JSONB_ARRAY_ELEMENTS" + + +class LevenshteinLessEqual(Func): + arity = 3 + function = "LEVENSHTEIN_LESS_EQUAL" diff --git a/arches_lingo/querysets.py b/arches_lingo/querysets.py new file mode 100644 index 00000000..433085ae --- /dev/null +++ b/arches_lingo/querysets.py @@ -0,0 +1,23 @@ +from django.db import models + +from arches_lingo.query_utils import LevenshteinLessEqual + + +class LabelValueQuerySet(models.QuerySet): + + def fuzzy_search(self, term, max_edit_distance): + from arches_lingo.models import VwLabelValue + + return ( + VwLabelValue.objects.all() + .annotate( + edit_distance=LevenshteinLessEqual( + models.F("value"), + models.Value(term), + models.Value(max_edit_distance), + output_field=models.FloatField(), + ) + ) + .filter(edit_distance__lte=max_edit_distance) + .order_by("edit_distance") + ) diff --git a/arches_lingo/views/api/concepts.py b/arches_lingo/views/api/concepts.py index 05a42366..1f98723d 100644 --- a/arches_lingo/views/api/concepts.py +++ b/arches_lingo/views/api/concepts.py @@ -1,309 +1,24 @@ -from collections import defaultdict - -from django.contrib.postgres.expressions import ArraySubquery -from django.core.cache import caches from django.core.paginator import Paginator -from django.db.models import ( - CharField, - Exists, - FloatField, - F, - OuterRef, - Value, -) -from django.db.models.expressions import CombinedExpression, Func -from django.utils.translation import gettext_lazy as _ from django.utils.decorators import method_decorator from django.views.generic import View -from arches.app.models.models import ( - Relation, - ResourceInstance, - TileModel, - Value as ConceptValue, -) from arches.app.models.system_settings import settings from arches.app.utils.decorators import group_required from arches.app.utils.response import JSONResponse -from arches_lingo.const import ( - SCHEMES_GRAPH_ID, - TOP_CONCEPT_OF_NODE_AND_NODEGROUP, - CLASSIFICATION_STATUS_NODEGROUP, - CLASSIFICATION_STATUS_ASCRIBED_CLASSIFICATION_NODEID, - CONCEPT_NAME_NODEGROUP, - CONCEPT_NAME_CONTENT_NODE, - CONCEPT_NAME_LANGUAGE_NODE, - CONCEPT_NAME_TYPE_NODE, - HIDDEN_LABEL_VALUE_ID, - LANGUAGE_CONCEPT_ID, - SCHEME_NAME_NODEGROUP, - SCHEME_NAME_CONTENT_NODE, - SCHEME_NAME_LANGUAGE_NODE, - SCHEME_NAME_TYPE_NODE, - PREF_LABEL_VALUE_ID, - ALT_LABEL_VALUE_ID, -) from arches_lingo.models import VwLabelValue - -TOP_CONCEPT_OF_LOOKUP = f"data__{TOP_CONCEPT_OF_NODE_AND_NODEGROUP}" -BROADER_LOOKUP = f"data__{CLASSIFICATION_STATUS_ASCRIBED_CLASSIFICATION_NODEID}" - -cache = caches["lingo"] - - -class JsonbArrayElements(Func): - """https://forum.djangoproject.com/t/django-4-2-behavior-change-when-using-arrayagg-on-unnested-arrayfield-postgresql-specific/21547/5""" - - arity = 1 - contains_subquery = True - function = "JSONB_ARRAY_ELEMENTS" - - -class LevenshteinLessEqual(Func): - arity = 3 - function = "LEVENSHTEIN_LESS_EQUAL" +from arches_lingo.concepts import ConceptBuilder @method_decorator( group_required("RDM Administrator", raise_exception=True), name="dispatch" ) class ConceptTreeView(View): - def __init__(self): - super().__init__() - self.schemes = ResourceInstance.objects.none() - - # key=concept valueid (str) val=language code - self.language_concepts: dict[str:str] = {} - # key=scheme resourceid (str) val=set of concept resourceids (str) - self.top_concepts: dict[str : set[str]] = defaultdict(set) - # key=concept resourceid (str) val=set of concept resourceids (str) - self.narrower_concepts: dict[str : set[str]] = defaultdict(set) - # key=resourceid (str) val=list of label dicts - self.labels: dict[str : list[dict]] = defaultdict(set) - - # Maps representing a reverse (leaf-first) tree - # key=resourceid (str) val=set of concept resourceids (str) - self.broader_concepts: dict[str : set[str]] = defaultdict(set) - # key=resourceid (str) val=set of scheme resourceids (str) - self.schemes_by_top_concept: dict[str : set[str]] = defaultdict(set) - - self.read_from_cache() - - # Not currently cached because written to during serialization. - self.polyhierarchical_concepts = set() - - def read_from_cache(self): - from_cache = cache.get_many( - [ - "language_concepts", - "top_concepts", - "narrower_concepts", - "schemes", - "labels", - "broader_concepts", - "schemes_by_top_concept", - ] - ) - try: - self.language_concepts = from_cache["language_concepts"] - self.top_concepts = from_cache["top_concepts"] - self.narrower_concepts = from_cache["narrower_concepts"] - self.schemes = from_cache["schemes"] - self.labels = from_cache["labels"] - self.broader_concepts = from_cache["broader_concepts"] - self.schemes_by_top_concept = from_cache["schemes_by_top_concept"] - except KeyError: - self.rebuild_cache() - - def rebuild_cache(self): - self.language_concepts_map() - self.top_concepts_map() - self.narrower_concepts_map() - self.populate_schemes() - - cache.set("language_concepts", self.language_concepts) - cache.set("top_concepts", self.top_concepts) - cache.set("narrower_concepts", self.narrower_concepts) - cache.set("schemes", self.schemes) - cache.set("labels", self.labels) - # Reverse trees. - cache.set("broader_concepts", self.broader_concepts) - cache.set("schemes_by_top_concept", self.schemes_by_top_concept) - - @staticmethod - def human_label_type(value_id): - if value_id == PREF_LABEL_VALUE_ID: - return "prefLabel" - if value_id == ALT_LABEL_VALUE_ID: - return "altLabel" - if value_id == HIDDEN_LABEL_VALUE_ID: - return "hidden" - return "unknown" - - @staticmethod - def resources_from_tiles(lookup_expression: str): - return CombinedExpression( - JsonbArrayElements(F(lookup_expression)), - "->>", - Value("resourceId"), - output_field=CharField(), - ) - - @staticmethod - def labels_subquery(label_nodegroup): - if label_nodegroup == SCHEME_NAME_NODEGROUP: - # Annotating a ResourceInstance - outer = OuterRef("resourceinstanceid") - nodegroup_id = SCHEME_NAME_NODEGROUP - else: - # Annotating a Tile - outer = OuterRef("resourceinstance_id") - nodegroup_id = CONCEPT_NAME_NODEGROUP - - return ArraySubquery( - TileModel.objects.filter( - resourceinstance_id=outer, nodegroup_id=nodegroup_id - ).values("data") - ) - - def language_concepts_map(self): - language_preflabels = ConceptValue.objects.filter( - Exists( - Relation.objects.filter( - conceptfrom=LANGUAGE_CONCEPT_ID, - conceptto=OuterRef("concept_id"), - relationtype="narrower", - ) - ), - valuetype="prefLabel", - ) - for language_label in language_preflabels: - self.language_concepts[str(language_label.pk)] = language_label.value - - def top_concepts_map(self): - top_concept_of_tiles = ( - TileModel.objects.filter(nodegroup_id=TOP_CONCEPT_OF_NODE_AND_NODEGROUP) - .annotate(top_concept_of=self.resources_from_tiles(TOP_CONCEPT_OF_LOOKUP)) - .annotate(labels=self.labels_subquery(CONCEPT_NAME_NODEGROUP)) - .values("resourceinstance_id", "top_concept_of", "labels") - ) - for tile in top_concept_of_tiles: - scheme_id = tile["top_concept_of"] - top_concept_id = str(tile["resourceinstance_id"]) - self.top_concepts[scheme_id].add(top_concept_id) - self.schemes_by_top_concept[top_concept_id].add(scheme_id) - self.labels[top_concept_id] = tile["labels"] - - def narrower_concepts_map(self): - broader_concept_tiles = ( - TileModel.objects.filter(nodegroup_id=CLASSIFICATION_STATUS_NODEGROUP) - .annotate(broader_concept=self.resources_from_tiles(BROADER_LOOKUP)) - .annotate(labels=self.labels_subquery(CONCEPT_NAME_NODEGROUP)) - .values("resourceinstance_id", "broader_concept", "labels") - ) - for tile in broader_concept_tiles.iterator(): - broader_concept_id = tile["broader_concept"] - narrower_concept_id: str = str(tile["resourceinstance_id"]) - self.narrower_concepts[broader_concept_id].add(narrower_concept_id) - self.broader_concepts[narrower_concept_id].add(broader_concept_id) - self.labels[narrower_concept_id] = tile["labels"] - - def populate_schemes(self): - self.schemes = ResourceInstance.objects.filter( - graph_id=SCHEMES_GRAPH_ID - ).annotate(labels=self.labels_subquery(SCHEME_NAME_NODEGROUP)) - - def serialize_scheme(self, scheme: ResourceInstance, *, children=True): - scheme_id: str = str(scheme.pk) - data = { - "id": scheme_id, - "labels": [self.serialize_scheme_label(label) for label in scheme.labels], - } - if children: - data["top_concepts"] = [ - self.serialize_concept(concept_id) - for concept_id in self.top_concepts[scheme_id] - ] - return data - - def serialize_scheme_label(self, label_tile: dict): - lang_code = self.language_concepts[label_tile[SCHEME_NAME_LANGUAGE_NODE][0]] - localized_string_objs = label_tile[SCHEME_NAME_CONTENT_NODE].values() - try: - value = next(iter(localized_string_objs))["value"] - except (StopIteration, KeyError): - value = "Unknown" - return { - "valuetype": self.human_label_type(label_tile[SCHEME_NAME_TYPE_NODE]), - "language": lang_code, - "value": value, - } - - def serialize_concept(self, conceptid: str, *, parents=False, children=True): - data = { - "id": conceptid, - "labels": [ - self.serialize_concept_label(label) for label in self.labels[conceptid] - ], - } - if children: - data["narrower"] = [ - self.serialize_concept(conceptid) - for conceptid in self.narrower_concepts[conceptid] - ] - if parents: - path = self.add_broader_concept_recursive([], conceptid) - scheme_id, parent_concept_ids = path[0], path[1:] - if len(parent_concept_ids) > 1: - self.polyhierarchical_concepts.add(conceptid) - schemes = [scheme for scheme in self.schemes if str(scheme.pk) == scheme_id] - data["parents"] = [self.serialize_scheme(schemes[0], children=False)] + [ - self.serialize_concept(parent_id, children=False) - for parent_id in parent_concept_ids - ] - - self_and_parent_ids = set([conceptid] + parent_concept_ids) - data["polyhierarchical"] = bool( - self_and_parent_ids.intersection(self.polyhierarchical_concepts) - ) - - return data - - def add_broader_concept_recursive(self, working_parent_list, conceptid): - # TODO: sort on sortorder at higher stacklevel once captured in original data. - broader_concepts = sorted(self.broader_concepts[conceptid]) - try: - first_broader_conceptid = broader_concepts[0] - except IndexError: - # TODO: sort here too. - schemes = sorted(self.schemes_by_top_concept[conceptid]) - working_parent_list.insert(0, schemes[0]) - return working_parent_list - else: - working_parent_list.insert(0, first_broader_conceptid) - return self.add_broader_concept_recursive( - working_parent_list, first_broader_conceptid - ) - - def serialize_concept_label(self, label_tile: dict): - lang_code = self.language_concepts[label_tile[CONCEPT_NAME_LANGUAGE_NODE][0]] - localized_string_objs = label_tile[CONCEPT_NAME_CONTENT_NODE].values() - try: - value = next(iter(localized_string_objs))["value"] - except (StopIteration, KeyError): - value = "Unknown" - return { - "valuetype": self.human_label_type(label_tile[CONCEPT_NAME_TYPE_NODE]), - "language": lang_code, - "value": value, - } - def get(self, request): + builder = ConceptBuilder() data = { - "schemes": [self.serialize_scheme(scheme) for scheme in self.schemes], + "schemes": [builder.serialize_scheme(scheme) for scheme in builder.schemes] } - # Todo: filter by nodegroup permissions return JSONResponse(data) @@ -319,33 +34,25 @@ def get(self, request): page_number = request.GET.get("page", 1) items_per_page = request.GET.get("items", 25) - concept_query = VwLabelValue.objects.all() if term: - concept_query = ( - concept_query.annotate( - edit_distance=LevenshteinLessEqual( - F("value"), - Value(term), - Value(max_edit_distance), - output_field=FloatField(), - ) - ) - .filter(edit_distance__lte=max_edit_distance) - .order_by("edit_distance") - ) + concept_query = VwLabelValue.objects.fuzzy_search(term, max_edit_distance) else: - concept_query = concept_query.order_by("concept_id") + concept_query = VwLabelValue.objects.all().order_by("concept_id") concept_query = concept_query.values_list("concept_id", flat=True).distinct() paginator = Paginator(concept_query, items_per_page) page = paginator.get_page(page_number) - data = [ - self.serialize_concept(str(concept_uuid), parents=True, children=False) - for concept_uuid in page - ] + data = [] + if page: + builder = ConceptBuilder() + data = [ + builder.serialize_concept( + str(concept_uuid), parents=True, children=False + ) + for concept_uuid in page + ] - # Todo: filter by nodegroup permissions return JSONResponse(data) @staticmethod From 3b2a7f877fd474fca6c1cfa1eed571d953b1d4c9 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 28 Aug 2024 11:21:47 -0400 Subject: [PATCH 23/28] fixup! Add fuzzy --- arches_lingo/migrations/0001_initial.py | 1 + 1 file changed, 1 insertion(+) diff --git a/arches_lingo/migrations/0001_initial.py b/arches_lingo/migrations/0001_initial.py index 99ebdceb..f1cb3443 100644 --- a/arches_lingo/migrations/0001_initial.py +++ b/arches_lingo/migrations/0001_initial.py @@ -9,6 +9,7 @@ class FuzzyStrMatchExtension(CreateExtension): def __init__(self): + super().__init__() self.name = "fuzzystrmatch" From a3b60408ab382b7639e17aac9fda82d9ea09b8bf Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 28 Aug 2024 11:22:35 -0400 Subject: [PATCH 24/28] fixup! Make chosen parentage deterministic --- arches_lingo/concepts.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arches_lingo/concepts.py b/arches_lingo/concepts.py index 564a62d1..c3fe7102 100644 --- a/arches_lingo/concepts.py +++ b/arches_lingo/concepts.py @@ -251,11 +251,11 @@ def add_broader_concept_recursive(self, working_parent_list, conceptid): schemes = sorted(self.schemes_by_top_concept[conceptid]) working_parent_list.insert(0, schemes[0]) return working_parent_list - else: - working_parent_list.insert(0, first_broader_conceptid) - return self.add_broader_concept_recursive( - working_parent_list, first_broader_conceptid - ) + + working_parent_list.insert(0, first_broader_conceptid) + return self.add_broader_concept_recursive( + working_parent_list, first_broader_conceptid + ) def serialize_concept_label(self, label_tile: dict): lang_code = self.language_concepts[label_tile[CONCEPT_NAME_LANGUAGE_NODE][0]] From 2db979eb39e5519135ba6d6201b61516711d7479 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 28 Aug 2024 11:24:25 -0400 Subject: [PATCH 25/28] fixup! Factor out ConceptBuilder --- arches_lingo/concepts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/arches_lingo/concepts.py b/arches_lingo/concepts.py index c3fe7102..1c2c514a 100644 --- a/arches_lingo/concepts.py +++ b/arches_lingo/concepts.py @@ -41,7 +41,6 @@ class ConceptBuilder: def __init__(self): - super().__init__() self.schemes = ResourceInstance.objects.none() # key=concept valueid (str) val=language code From 2c7bee7a0df7f69ef8856a9dad316fd61ce77c2f Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 28 Aug 2024 11:36:20 -0400 Subject: [PATCH 26/28] fixup! Add fuzzy --- arches_lingo/migrations/0001_initial.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arches_lingo/migrations/0001_initial.py b/arches_lingo/migrations/0001_initial.py index f1cb3443..52dc9f24 100644 --- a/arches_lingo/migrations/0001_initial.py +++ b/arches_lingo/migrations/0001_initial.py @@ -9,8 +9,7 @@ class FuzzyStrMatchExtension(CreateExtension): def __init__(self): - super().__init__() - self.name = "fuzzystrmatch" + super().__init__("fuzzystrmatch") class Migration(migrations.Migration): From f8f6ae7e8041871e6adb15f0582253567f9c82a6 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 28 Aug 2024 11:50:36 -0400 Subject: [PATCH 27/28] fixup! Add fuzzy --- arches_lingo/views/api/concepts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arches_lingo/views/api/concepts.py b/arches_lingo/views/api/concepts.py index 1f98723d..7c356ae8 100644 --- a/arches_lingo/views/api/concepts.py +++ b/arches_lingo/views/api/concepts.py @@ -28,8 +28,8 @@ def get(self, request): class ValueSearchView(ConceptTreeView): def get(self, request): term = request.GET.get("term") - max_edit_distance = request.GET.get( - "maxEditDistance", self.default_sensitivity() + max_edit_distance = int( + request.GET.get("maxEditDistance", self.default_sensitivity()) ) page_number = request.GET.get("page", 1) items_per_page = request.GET.get("items", 25) @@ -67,4 +67,4 @@ def default_sensitivity(): return 5 if elastic_prefix_length >= 5: return 0 - return int(5 - elastic_prefix_length) + return 5 - elastic_prefix_length From cd1b7c9fed4f44dcba2cc980014c7cc41de86f39 Mon Sep 17 00:00:00 2001 From: Jacob Walls Date: Wed, 28 Aug 2024 11:53:15 -0400 Subject: [PATCH 28/28] Enable clickjacking protection middleware --- arches_lingo/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arches_lingo/settings.py b/arches_lingo/settings.py index f9e7711e..fcef5756 100644 --- a/arches_lingo/settings.py +++ b/arches_lingo/settings.py @@ -165,7 +165,7 @@ "oauth2_provider.middleware.OAuth2TokenMiddleware", "django.contrib.auth.middleware.AuthenticationMiddleware", "django.contrib.messages.middleware.MessageMiddleware", - # "django.middleware.clickjacking.XFrameOptionsMiddleware", + "django.middleware.clickjacking.XFrameOptionsMiddleware", "arches.app.utils.middleware.SetAnonymousUser", # "silk.middleware.SilkyMiddleware", ]