Skip to content

Commit

Permalink
Merge pull request #1348 from UUDigitalHumanitieslab/feature/uu-hum-c…
Browse files Browse the repository at this point in the history
…ourse-descriptions

Feature/uu hum course descriptions
  • Loading branch information
lukavdplas authored Dec 5, 2023
2 parents 22862ff + a74eb7d commit f896398
Show file tree
Hide file tree
Showing 9 changed files with 432 additions and 2 deletions.
1 change: 1 addition & 0 deletions backend/addcorpus/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
('inscription', 'Funerary inscriptions'),
('oration', 'Orations'),
('book', 'Books'),
('informative', 'Informative'),
]
'''
Types of data
Expand Down
4 changes: 2 additions & 2 deletions backend/addcorpus/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def es_index(self):
'''
ElasticSearch index name.
'''
raise NotImplementedError('CorpusDefinition missing category')
raise NotImplementedError('CorpusDefinition es_index')

'''
Elasticsearch alias. Defaults to None.
Expand Down Expand Up @@ -271,7 +271,7 @@ def _reject_extractors(self, *inapplicable_extractors):
if isinstance(field.extractor, inapplicable_extractors):
raise RuntimeError(
"Specified extractor method cannot be used with this type of data")

class ParentCorpusDefinition(CorpusDefinition):
''' A class from which other corpus definitions can inherit.
This class is in charge of setting fields, usually without defining an extractor.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.1.13 on 2023-12-04 15:41

from django.db import migrations, models


class Migration(migrations.Migration):
    '''
    Alter the `category` field of the `corpusconfiguration` model so that
    its `choices` match the list of categories given below.
    '''

    dependencies = [('addcorpus', '0005_add_validators')]

    operations = [
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='category',
            field=models.CharField(
                # (stored value, human-readable label) pairs
                choices=[
                    ('parliament', 'Parliamentary debates'),
                    ('periodical', 'Newspapers and other periodicals'),
                    ('finance', 'Financial reports'),
                    ('ruling', 'Court rulings'),
                    ('review', 'Online reviews'),
                    ('inscription', 'Funerary inscriptions'),
                    ('oration', 'Orations'),
                    ('book', 'Books'),
                    ('informative', 'Informative'),
                ],
                help_text='category/medium of documents in this dataset',
                max_length=64,
            ),
        ),
    ]
95 changes: 95 additions & 0 deletions backend/addcorpus/xlsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import logging
import openpyxl
from openpyxl.worksheet.worksheet import Worksheet

from addcorpus.corpus import CorpusDefinition
from addcorpus import extract

logger = logging.getLogger('indexing')

class XLSXCorpusDefinition(CorpusDefinition):
    '''
    Parent class for corpora that extract data from Excel spreadsheets.

    Only the first worksheet of each workbook is read. The first row after
    `skip_lines` is used as the header; every following row is turned into a
    dict keyed by the header values and grouped into documents according to
    `field_entry`.
    '''

    '''
    If applicable, the field that identifies entries. Subsequent rows with the same
    value for this field are treated as a single document. If left blank, each row
    is treated as a document.
    '''
    field_entry = None

    '''
    Specifies a required field, for example the main content. Rows with
    an empty value for `required_field` will be skipped.
    '''
    required_field = None

    '''
    Number of lines to skip before reading the header
    '''
    skip_lines = 0

    def source2dicts(self, source):
        '''
        Open the workbook for `source` and return a generator of documents
        extracted from its first worksheet.

        Parameters:
            source: either a filename (`str`), or a `(filename, metadata)`
                pair. When only a filename is given, the metadata is an
                empty dict.

        Raises:
            RuntimeError: (via `_reject_extractors`) if a field uses an
                `extract.XML` or `extract.FilterAttribute` extractor.
            NotImplementedError: if `source` is a `bytes` object.
        '''
        # these extractors cannot be applied to spreadsheet rows
        self._reject_extractors(extract.XML, extract.FilterAttribute)

        if isinstance(source, str):
            filename = source
            metadata = {}
        elif isinstance(source, bytes):
            raise NotImplementedError(
                'XLSX sources must be provided as a filename, not as bytes')
        else:
            filename, metadata = source

        # log before loading, so that a failing load can be traced to its file
        logger.info('Reading XLSX file %s...', filename)
        wb = openpyxl.load_workbook(filename)

        sheet = wb[wb.sheetnames[0]]
        return self._sheet2dicts(sheet, metadata)

    def _sheet2dicts(self, sheet: Worksheet, metadata):
        '''
        Generate documents from the rows of a worksheet.

        Rows without a value for `required_field` (if set) are skipped.
        If `field_entry` is set, consecutive rows that share the same non-empty
        value for that field are combined into one document; otherwise, each
        row is its own document.

        Parameters:
            sheet: the worksheet to read.
            metadata: passed on to the field extractors for every document.
        '''
        data = iter(sheet.values)

        # A bare `next()` that runs out of rows inside a generator surfaces as
        # `RuntimeError: generator raised StopIteration`; treat a sheet that
        # ends before its header as containing no documents instead.
        try:
            for _ in range(self.skip_lines):
                next(data)
            header = list(next(data))
        except StopIteration:
            logger.warning(
                'XLSX sheet has no header row after skipping %d line(s)',
                self.skip_lines)
            return

        index = 0  # number of documents yielded so far
        document_id = None  # identifier of the previous (non-skipped) row
        rows = []  # rows collected for the document currently being built

        for row in data:
            values = dict(zip(header, row))

            # skip row if required_field is empty
            if self.required_field and not values.get(self.required_field):
                continue

            # Only look up an identifier when `field_entry` is configured:
            # empty header cells produce a `None` key, which must not be
            # mistaken for an entry field.
            if self.field_entry is None:
                identifier = None
            else:
                identifier = values.get(self.field_entry)

            # a row without identifier always starts a new document
            is_new_document = identifier is None or identifier != document_id
            document_id = identifier

            if is_new_document and rows:
                yield self.document_from_rows(rows, metadata, index)
                rows = [values]
                index += 1
            else:
                rows.append(values)

        # emit the final document, which has no following row to trigger it
        if rows:
            yield self.document_from_rows(rows, metadata, index)

    def document_from_rows(self, rows, metadata, row_index):
        '''
        Build a single document by applying the extractor of each indexed
        field to the collected rows.

        Parameters:
            rows: list of dicts (header value -> cell value) that make up
                the document.
            metadata: metadata of the source file.
            row_index: passed to the extractors as `index`.

        Returns:
            dict mapping field names to extracted values.
        '''
        return {
            field.name: field.extractor.apply(
                rows=rows, metadata=metadata, index=row_index
            )
            for field in self.fields if field.indexed
        }
Loading

0 comments on commit f896398

Please sign in to comment.