Merge pull request #1348 from UUDigitalHumanitieslab/feature/uu-hum-course-descriptions

Feature/uu hum course descriptions
CentreForDigitalHumanities · Dec 5, 2023 · f896398 · f896398
2 parents 22862ff + a74eb7d
commit f896398
Show file tree

Hide file tree

Showing 9 changed files with 432 additions and 2 deletions.
diff --git a/backend/addcorpus/constants.py b/backend/addcorpus/constants.py
@@ -9,6 +9,7 @@
     ('inscription', 'Funerary inscriptions'),
     ('oration', 'Orations'),
     ('book', 'Books'),
+    ('informative', 'Informative'),
 ]
 '''
 Types of data

diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py
@@ -85,7 +85,7 @@ def es_index(self):
         '''
         ElasticSearch index name.
         '''
-        raise NotImplementedError('CorpusDefinition missing category')
+        raise NotImplementedError('CorpusDefinition es_index')
 
     '''
     Elasticsearch alias. Defaults to None.
@@ -271,7 +271,7 @@ def _reject_extractors(self, *inapplicable_extractors):
             if isinstance(field.extractor, inapplicable_extractors):
                 raise RuntimeError(
                     "Specified extractor method cannot be used with this type of data")
-    
+
 class ParentCorpusDefinition(CorpusDefinition):
     ''' A class from which other corpus definitions can inherit.
     This class is in charge of setting fields, usually without defining an extractor.

diff --git a/backend/addcorpus/migrations/0006_alter_corpusconfiguration_category.py b/backend/addcorpus/migrations/0006_alter_corpusconfiguration_category.py
@@ -0,0 +1,40 @@
+# Generated by Django 4.1.13 on 2023-12-04 15:41
+
+from django.db import migrations, models
+
+
+# Allowed values for the category field, as (stored value, label) pairs.
+CATEGORY_CHOICES = [
+    ('parliament', 'Parliamentary debates'),
+    ('periodical', 'Newspapers and other periodicals'),
+    ('finance', 'Financial reports'),
+    ('ruling', 'Court rulings'),
+    ('review', 'Online reviews'),
+    ('inscription', 'Funerary inscriptions'),
+    ('oration', 'Orations'),
+    ('book', 'Books'),
+    ('informative', 'Informative'),
+]
+
+
+class Migration(migrations.Migration):
+    '''
+    Alters the `category` field of `corpusconfiguration` to a CharField
+    with CATEGORY_CHOICES as its choices.
+    '''
+
+    dependencies = [
+        ('addcorpus', '0005_add_validators'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='corpusconfiguration',
+            name='category',
+            field=models.CharField(
+                choices=CATEGORY_CHOICES,
+                help_text='category/medium of documents in this dataset',
+                max_length=64,
+            ),
+        ),
+    ]
diff --git a/backend/addcorpus/xlsx.py b/backend/addcorpus/xlsx.py
@@ -0,0 +1,127 @@
+import logging
+import openpyxl
+from openpyxl.worksheet.worksheet import Worksheet
+
+from addcorpus.corpus import CorpusDefinition
+from addcorpus import extract
+
+logger = logging.getLogger('indexing')
+
+class XLSXCorpusDefinition(CorpusDefinition):
+    '''
+    Parent class for corpora that extract data from excel spreadsheets.
+
+    Only the first sheet of a workbook is read. The first row after
+    `skip_lines` is taken as the header, and each later row becomes a
+    dict that maps header cells to the values in that row.
+    '''
+
+    '''
+    If applicable, the field that identifies entries. Subsequent rows with the same
+    value for this field are treated as a single document. If left blank, each row
+    is treated as a document.
+    '''
+    field_entry = None
+
+    '''
+    Specifies a required field, for example the main content. Rows with
+    an empty value for `required_field` will be skipped.
+    '''
+    required_field = None
+
+    '''
+    Number of lines to skip before reading the header
+    '''
+    skip_lines = 0
+
+    def source2dicts(self, source):
+        '''
+        Return a generator of document dicts for one XLSX source.
+
+        `source` is either a filename (str) or a `(filename, metadata)`
+        pair. Reading from bytes is not supported and raises
+        NotImplementedError.
+        '''
+        # these extractor types do not apply to spreadsheet data
+        self._reject_extractors(extract.XML, extract.FilterAttribute)
+
+        if isinstance(source, str):
+            filename = source
+            metadata = {}
+        elif isinstance(source, bytes):
+            raise NotImplementedError('XLSX sources cannot be read from bytes')
+        else:
+            filename, metadata = source
+
+        logger.info('Reading XLSX file %s...', filename)
+        wb = openpyxl.load_workbook(filename)
+
+        # only the first sheet in the workbook is used
+        sheet = wb[wb.sheetnames[0]]
+        return self._sheet2dicts(sheet, metadata)
+
+    def _sheet2dicts(self, sheet: Worksheet, metadata):
+        '''
+        Generate one document per group of rows in `sheet`.
+
+        Consecutive rows with the same non-empty value for `field_entry`
+        are grouped into one document; without `field_entry`, each row is
+        its own document. Rows with an empty `required_field` are skipped.
+        '''
+        data = iter(sheet.values)
+
+        # Use a default with next(): a bare StopIteration raised inside a
+        # generator is turned into a RuntimeError (PEP 479), so a sheet with
+        # fewer rows than expected would crash instead of yielding nothing.
+        for _ in range(self.skip_lines):
+            next(data, None)
+
+        header = next(data, None)
+        if header is None:
+            return
+
+        index = 0
+        document_id = None
+        rows = []
+
+        for row in data:
+            values = dict(zip(header, row))
+
+            # skip row if required_field is empty
+            if self.required_field and not values.get(self.required_field):
+                continue
+
+            # Only look up an identifier when field_entry is configured;
+            # otherwise an empty header cell (None) would be used as the key.
+            if self.field_entry is None:
+                identifier = None
+            else:
+                identifier = values.get(self.field_entry)
+            is_new_document = identifier is None or identifier != document_id
+            document_id = identifier
+
+            if is_new_document and rows:
+                # the previous group is complete: emit it and start a new one
+                yield self.document_from_rows(rows, metadata, index)
+                rows = []
+                index += 1
+
+            rows.append(values)
+
+        if rows:
+            yield self.document_from_rows(rows, metadata, index)
+
+    def document_from_rows(self, rows, metadata, row_index):
+        '''
+        Build a document from the rows that belong to it.
+
+        Applies the extractor of every indexed field to `rows`, passing
+        `metadata` and `row_index` along, and returns a dict of the results
+        keyed by field name.
+        '''
+        return {
+            field.name: field.extractor.apply(
+                rows=rows, metadata=metadata, index=row_index
+            )
+            for field in self.fields if field.indexed
+        }