Merge branch 'release/5.10.0'

CentreForDigitalHumanities · Aug 8, 2024 · 671ba3f · 671ba3f
2 parents 2cbe71e + a3b7e69
commit 671ba3f
Show file tree

Hide file tree

Showing 63 changed files with 1,295 additions and 1,295 deletions.
diff --git a/.github/workflows/backend-test.yml b/.github/workflows/backend-test.yml
@@ -1,4 +1,4 @@
-# This workflow will run backend tests on the Python version defined in the Dockerfiles
+# This workflow will run backend tests on the Python version defined in the backend/Dockerfile
 
 name: Backend unit tests
 
@@ -13,15 +13,45 @@ on:
       - 'hotfix/**'
       - 'release/**'
       - 'dependabot/**'
-    paths-ignore:
-      - 'frontend/**'
-      - '**.md'
+    paths:
+      - 'backend/**'
+      - '.github/workflows/backend*'
+      - 'docker-compose.yaml'
 
 jobs:
   backend-test:
     name: Test Backend
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v3
+    - name: Login to GitHub Container Registry
+      uses: docker/login-action@v3
+      with:
+        registry: ghcr.io
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    - name: Build and push Elasticsearch image
+      uses: docker/build-push-action@v6
+      with:
+        context: .
+        file: DockerfileElastic
+        push: true
+        tags: ghcr.io/uudigitalhumanitieslab/ianalyzer-elastic:latest
+        cache-from: type=registry,ref=ghcr.io/uudigitalhumanitieslab/ianalyzer-elastic:latest
+        cache-to: type=inline
+    - name: Build and push Backend
+      uses: docker/build-push-action@v6
+      with:
+        context: backend/.
+        push: true
+        tags: ghcr.io/uudigitalhumanitieslab/ianalyzer-backend:latest
+        cache-from: type=registry,ref=ghcr.io/uudigitalhumanitieslab/ianalyzer-backend:latest
+        cache-to: type=inline
     - name: Run backend tests
-      run: sudo mkdir -p /ci-data && sudo docker-compose --env-file .env-ci run backend pytest
+      run: |
+        sudo mkdir -p /ci-data
+        docker compose pull elasticsearch
+        docker compose pull backend
+        docker compose --env-file .env-ci run --rm backend pytest
diff --git a/.github/workflows/frontend-test.yml b/.github/workflows/frontend-test.yml
@@ -13,15 +13,34 @@ on:
       - 'hotfix/**'
       - 'release/**'
       - 'dependabot/**'
-    paths-ignore:
-      - 'backend/**'
-      - '**.md'
+    paths:
+      - 'frontend/**'
+      - '.github/workflows/frontend*'
+      - 'docker-compose.yaml'
 
 jobs:
   frontend-test:
     name: Test Frontend
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
-    - name: Run frontend tests
-      run: sudo docker-compose --env-file .env-ci run frontend yarn test
+    - uses: actions/checkout@v4
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v3
+    - name: Login to GitHub Container Registry
+      uses: docker/login-action@v3
+      with:
+        registry: ghcr.io
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    - name: Build frontend image, using cache from GitHub registry
+      uses: docker/build-push-action@v6
+      with:
+        context: frontend/.
+        push: true
+        tags: ghcr.io/uudigitalhumanitieslab/ianalyzer-frontend:latest
+        cache-from: type=registry,ref=ghcr.io/uudigitalhumanitieslab/ianalyzer-frontend:latest
+        cache-to: type=inline
+    - name: Run frontend unit tests
+      run: |
+        docker compose pull frontend
+        docker compose --env-file .env-ci run --rm frontend yarn test
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -61,6 +61,16 @@
             }
         },
         {
+            "name": "Python: Debug Tests",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "purpose": [
+                "debug-test"
+            ],
+            "console": "internalConsole",
+            "justMyCode": false
+        }, {
             "name": "celery",
             "type": "debugpy",
             "request": "launch",

diff --git a/CITATION.cff b/CITATION.cff
@@ -35,5 +35,5 @@ keywords:
   - elasticsearch
   - natural language processing
 license: MIT
-version: 5.9.0
-date-released: '2024-07-05'
+version: 5.10.0
+date-released: '2024-08-08'
diff --git a/backend/Dockerfile b/backend/Dockerfile
@@ -7,7 +7,6 @@ RUN apt-get -y update && apt-get -y upgrade
 RUN apt-get install -y pkg-config libxml2-dev libxmlsec1-dev libxmlsec1-openssl default-libmysqlclient-dev
 
 RUN pip install --upgrade pip
-RUN pip install pip-tools
 # make a directory in the container
 WORKDIR /backend
 # copy requirements from the host system to the directory in the container

diff --git a/backend/addcorpus/constants.py b/backend/addcorpus/constants.py
@@ -49,9 +49,18 @@ class VisualizationType(Enum):
     'scan',
-    'tab-scan'
+    'tab-scan',
     'p',
+    'tags',
+    'context',
+    'tab',
 ]
 '''
-Field names that cannot be used because they are also query parameters in frontend routes.
+Field names that cannot be used because they interfere with other functionality.
 
-Using them would make routing ambiguous.
+This is usually because they are also query parameters in frontend routes, and using them
+would make routing ambiguous.
+
+`query` is also forbidden because it is a reserved column in CSV downloads. Likewise,
+`context` is forbidden because it's used in download requests.
+
+`scan` and `tab-scan` are added because they interfere with element IDs in the DOM.
 '''
diff --git a/backend/addcorpus/reader.py b/backend/addcorpus/reader.py
@@ -36,8 +36,6 @@ class NewReader(CSVReader):
                   for f in corpus.configuration.fields.all()]
 
         def sources(self, *args, **kwargs):
-            return (
-                (fn, {}) for fn in glob.glob(f'{self.data_directory}/**/*.csv', recursive=True)
-            )
+            return glob.glob(f'{self.data_directory}/**/*.csv', recursive=True)
 
     return NewReader()
diff --git a/backend/corpora/dbnl/dbnl.py b/backend/corpora/dbnl/dbnl.py
@@ -2,6 +2,7 @@
 import os
 import re
 from tqdm import tqdm
+from ianalyzer_readers.xml_tag import Tag, CurrentTag, TransformTag
 
 from django.conf import settings
 from addcorpus.python_corpora.corpus import XMLCorpusDefinition, FieldDefinition
@@ -25,8 +26,8 @@ class DBNL(XMLCorpusDefinition):
     languages = ['nl', 'dum', 'fr', 'la', 'fy', 'lat', 'en', 'nds', 'de', 'af']
     category = 'book'
 
-    tag_toplevel = 'TEI.2'
-    tag_entry = { 'name': 'div', 'attrs': {'type': 'chapter'} }
+    tag_toplevel = Tag('TEI.2')
+    tag_entry = Tag('div', type='chapter')
 
     document_context = {
         'context_fields': ['title_id'],
@@ -261,18 +262,18 @@ def _xml_files(self):
             Pass(
                 Backup(
                     XML( # get the language on chapter-level if available
+                        CurrentTag(),
                         attribute='lang',
                         transform=lambda value: [value] if value else None,
                     ),
                     XML( # look for section-level codes
-                        {'name': 'div', 'attrs': {'type': 'section'}},
+                        Tag('div', type='section'),
                         attribute='lang',
                         multiple=True,
                     ),
                     XML( # look in the top-level metadata
-                        'language',
+                        Tag('language'),
                         toplevel=True,
-                        recursive=True,
                         multiple=True,
                         attribute='id'
                     ),
@@ -298,17 +299,17 @@ def _xml_files(self):
         extractor=Pass(
             Backup(
                 XML( # get the language on chapter-level if available
+                    CurrentTag(),
                     attribute='lang',
                 ),
                 XML( # look for section-level code
-                    {'name': 'div', 'attrs': {'type': 'section'}},
+                    Tag('div', type='section'),
                     attribute='lang'
                 ),
                 XML( #otherwise, get the (first) language for the book
-                    'language',
+                    Tag('language'),
                     attribute='id',
                     toplevel=True,
-                    recursive=True,
                 ),
                 transform=utils.single_language_code,
             ),
@@ -322,13 +323,11 @@ def _xml_files(self):
         display_name='Chapter',
         extractor=Backup(
             XML(
-                tag='head',
-                recursive=True,
+                Tag('head'),
                 flatten=True,
             ),
             XML(
-                tag=utils.LINE_TAG,
-                recursive=True,
+                Tag(utils.LINE_TAG),
                 flatten=True,
             )
         ),
@@ -359,11 +358,11 @@ def _xml_files(self):
         search_field_core=True,
         csv_core=True,
         extractor=XML(
-            tag=utils.LINE_TAG,
-            recursive=True,
+            Tag(utils.LINE_TAG),
+            TransformTag(utils.pad_content),
             multiple=True,
             flatten=True,
-            transform_soup_func=utils.pad_content,
+            transform=lambda lines: '\n'.join(lines).strip() if lines else None,
         ),
         es_mapping=main_content_mapping(token_counts=True),
         visualizations=['wordcloud'],

diff --git a/backend/corpora/dbnl/tests/test_dbnl_extraction.py b/backend/corpora/dbnl/tests/test_dbnl_extraction.py
@@ -145,12 +145,12 @@ def test_append_to_tag(xml, tag, padding, original_output, new_output):
         'content': '\n'.join([
             'Register der Liedekens.',
             'A.',
-            'ACh gesalfde van den Heer. Pag. 30 ',
-            'Als Saul, en david den vyant in\'t velt. 41 ',
-            'Als ick de Son verhoogen sie. 184 ',
-            'Als hem de Son begeeft. 189 ',
-            'Als ick den Herfst aenschou. 194 ',
-            'Als in koelt, de nacht komt overkleeden 208 ',
+            'ACh gesalfde van den Heer. Pag. 30',
+            'Als Saul, en david den vyant in\'t velt. 41',
+            'Als ick de Son verhoogen sie. 184',
+            'Als hem de Son begeeft. 189',
+            'Als ick den Herfst aenschou. 194',
+            'Als in koelt, de nacht komt overkleeden 208',
             'Als van der meer op Eng\'le-vleug\'len vloog. 232',
         ])
     }, { # metadata-only book
@@ -194,6 +194,8 @@ def test_dbnl_extraction(dbnl_corpus):
     for actual, expected in zip(docs, expected_docs):
         # assert that actual is a superset of expected
         for key in expected:
+            if expected[key] != actual[key]:
+                print(key)
             assert expected[key] == actual[key]
         assert expected.items() <= actual.items()
 

diff --git a/backend/corpora/dbnl/utils.py b/backend/corpora/dbnl/utils.py
@@ -183,7 +183,8 @@ def append_to_tag(soup, tag, padding):
 def pad_content(node):
     pad_cells = lambda n: append_to_tag(n, 'cell', ' ')
     pad_linebreaks = lambda n: append_to_tag(n, 'lb', '\n')
-    return pad_cells(pad_linebreaks(node))
+    pad_cells(pad_linebreaks(node))
+    return [node]
 
 def standardize_language_code(code):
     if code:

diff --git a/backend/corpora/dutchannualreports/dutchannualreports.py b/backend/corpora/dutchannualreports/dutchannualreports.py
@@ -4,6 +4,7 @@
 import os.path as op
 import logging
 from datetime import datetime
+from ianalyzer_readers.xml_tag import Tag
 
 from django.conf import settings
 
@@ -20,7 +21,6 @@
 class DutchAnnualReports(XMLCorpusDefinition):
     """ Alto XML corpus of Dutch annual reports. """
 
-    # Data overrides from .common.Corpus (fields at bottom of class)
     title = "Dutch Annual Reports"
     description = "Annual reports of Dutch financial and non-financial institutes"
     min_date = datetime(year=1957, month=1, day=1)
@@ -38,9 +38,8 @@ class DutchAnnualReports(XMLCorpusDefinition):
 
     mimetype = 'application/pdf'
 
-    # Data overrides from .common.XMLCorpus
-    tag_toplevel = 'alto'
-    tag_entry = 'Page'
+    tag_toplevel = Tag('alto')
+    tag_entry = Tag('Page')
 
     # New data members
     non_xml_msg = 'Skipping non-XML file {}'
@@ -187,9 +186,8 @@ def sources(self, start=min_date, end=max_date):
             description='Text content of the page.',
             results_overview=True,
             extractor=XML(
-                tag='String',
+                Tag('String'),
                 attribute='CONTENT',
-                recursive=True,
                 multiple=True,
                 transform=lambda x: ' '.join(x),
             ),