From 9a3c8885695afc23cf54ff1fc065c02718dc33e5 Mon Sep 17 00:00:00 2001 From: csae8092 Date: Fri, 27 Jan 2023 11:53:12 +0100 Subject: [PATCH] feature: hardened `extract_begin_end` #17 --- README.md | 10 ++++++- acdh_cidoc_pyutils/__init__.py | 50 +++++++++++++++++++++++--------- acdh_cidoc_pyutils/namespaces.py | 13 +++++++++ tests/test_cidoc_pyutils.py | 44 +++++++++++++++++++++------- 4 files changed, 92 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 5968780..9ec882d 100644 --- a/README.md +++ b/README.md @@ -299,7 +299,7 @@ print(normalize_string(string)) ### extract date attributes (begin, end) -expects typical TEI date attributes like `@when, @when-iso, @notBefore, @notAfter` and returns a tuple containg start- and enddate values. If only `@when or @when-iso` or only `@notBefore or @notAfter` are provided, the returned values are the same +expects typical TEI date attributes like `@when, @when-iso, @notBefore, @notAfter, @from, @to, ...` and returns a tuple containing start and end date values. If only `@when or @when-iso` is provided, both returned values are the same. If only a start attribute (e.g. `@notBefore`, `@from`) or only an end attribute (e.g. `@notAfter`, `@to`) is provided, the missing value is filled with the provided one, unless the optional parameter `fill_missing` is set to `False`, in which case the missing value is returned as `None`. 
```python from lxml.etree import Element @@ -313,6 +313,14 @@ print(extract_begin_end(date_object)) # returns # ('1900-12-12', '1900-12-12') +date_string = "1900-12-12" +date_object = Element("{http://www.tei-c.org/ns/1.0}tei") +date_object.attrib["from-iso"] = date_string +print(extract_begin_end(date_object, fill_missing=False)) + +# returns +# ('1900-12-12', None) + date_object = Element("{http://www.tei-c.org/ns/1.0}tei") date_object.attrib["notAfter"] = "1900-12-12" date_object.attrib["notBefore"] = "1800" diff --git a/acdh_cidoc_pyutils/__init__.py b/acdh_cidoc_pyutils/__init__.py index 8173bdf..e882cde 100644 --- a/acdh_cidoc_pyutils/__init__.py +++ b/acdh_cidoc_pyutils/__init__.py @@ -5,7 +5,7 @@ from rdflib import Graph, Literal, URIRef, XSD, RDF, RDFS, OWL from slugify import slugify from acdh_tei_pyutils.utils import make_entity_label -from acdh_cidoc_pyutils.namespaces import CIDOC, NSMAP +from acdh_cidoc_pyutils.namespaces import CIDOC, NSMAP, DATE_ATTRIBUTE_DICT def normalize_string(string: str) -> str: @@ -45,19 +45,41 @@ def coordinates_to_p168( return g -def extract_begin_end(date_object: Union[Element, dict]) -> tuple[str, str]: - begin, end = "", "" - if date_object.get("when-iso", "") != "": - return (date_object.get("when-iso"), date_object.get("when-iso")) - elif date_object.get("when", "") != "": - return (date_object.get("when"), date_object.get("when")) - begin = date_object.get("notBefore", "") - end = date_object.get("notAfter", "") - if begin != "" and end == "": - end = begin - if end != "" and begin == "": - begin = end - return (begin, end) +def extract_begin_end( + date_object: Union[Element, dict], + fill_missing=True, + attribute_map=DATE_ATTRIBUTE_DICT, +) -> tuple[Union[str, bool], Union[str, bool]]: + final_start, final_end = None, None + start, end, when = None, None, None + for key, value in attribute_map.items(): + date_value = date_object.get(key) + if date_value and value == "start": + start = date_value + if date_value and 
value == "end": + end = date_value + if date_value and value == "when": + when = date_value + if fill_missing: + if start or end or when: + if start and end: + final_start, final_end = start, end + elif start and not end and not when: + final_start, final_end = start, start + elif end and not start and not when: + final_start, final_end = end, end + elif when and not start and not end: + final_start, final_end = when, when + else: + if start and end: + final_start, final_end = start, end + elif start and not end and not when: + final_start, final_end = start, None + elif end and not start and not when: + final_start, final_end = None, end + elif when and not start and not end: + final_start, final_end = when, when + return final_start, final_end def date_to_literal(date_str: str) -> Literal: diff --git a/acdh_cidoc_pyutils/namespaces.py b/acdh_cidoc_pyutils/namespaces.py index ad78216..b6de5a9 100644 --- a/acdh_cidoc_pyutils/namespaces.py +++ b/acdh_cidoc_pyutils/namespaces.py @@ -7,3 +7,16 @@ "tei": "http://www.tei-c.org/ns/1.0", "xml": "http://www.w3.org/XML/1998/namespace", } + +DATE_ATTRIBUTE_DICT = { + "notBefore": "start", + "notBefore-iso": "start", + "from": "start", + "from-iso": "start", + "notAfter": "end", + "notAfter-iso": "end", + "to": "end", + "to-iso": "end", + "when": "when", + "when-iso": "when" +} diff --git a/tests/test_cidoc_pyutils.py b/tests/test_cidoc_pyutils.py index ea4afb7..e8c030b 100644 --- a/tests/test_cidoc_pyutils.py +++ b/tests/test_cidoc_pyutils.py @@ -144,37 +144,61 @@ def test_006_begin_end(self): date_object = Element("hansi") date_object.attrib["when-iso"] = date_string begin, end = extract_begin_end(date_object) - self.assertTrue(begin, date_string) - self.assertTrue(end, date_string) + self.assertEqual(begin, date_string) + self.assertEqual(end, date_string) + begin, end = extract_begin_end(date_object, fill_missing=False) + self.assertEqual(begin, date_string) + self.assertEqual(end, date_string) + + date_string = 
"1900-12-12" + date_object = Element("hansi") + date_object.attrib["from-iso"] = date_string + begin, end = extract_begin_end(date_object, fill_missing=False) + self.assertEqual(begin, date_string) + self.assertEqual(end, None) date_string = "1900-12-12" date_object = Element("hansi") date_object.attrib["when"] = date_string begin, end = extract_begin_end(date_object) - self.assertTrue(begin, date_string) - self.assertTrue(end, date_string) + self.assertEqual(begin, date_string) + self.assertEqual(end, date_string) date_string = "1900-12-12" date_object = Element("hansi") date_object.attrib["notAfter"] = date_string begin, end = extract_begin_end(date_object) - self.assertTrue(begin, date_string) - self.assertTrue(end, date_string) + self.assertEqual(begin, date_string) + self.assertEqual(end, date_string) date_string = "1900-12-12" date_object = Element("hansi") date_object.attrib["notBefore"] = date_string begin, end = extract_begin_end(date_object) - self.assertTrue(begin, date_string) - self.assertTrue(end, date_string) + self.assertEqual(begin, date_string) + self.assertEqual(end, date_string) date_string = "1900-12-12" date_object = Element("hansi") date_object.attrib["notAfter"] = date_string date_object.attrib["notBefore"] = "1800" begin, end = extract_begin_end(date_object) - self.assertTrue(begin, "1800") - self.assertTrue(end, date_string) + self.assertEqual(begin, "1800") + self.assertEqual(end, date_string) + + date_string = "1900-12-12" + date_object = Element("hansi") + date_object.attrib["notAfter"] = date_string + date_object.attrib["notBefore"] = "1800" + begin, end = extract_begin_end(date_object, fill_missing=False) + self.assertEqual(begin, "1800") + self.assertEqual(end, date_string) + date_string = "1900-12-12" + date_object = Element("hansi") + date_object.attrib["to"] = date_string + begin, end = extract_begin_end(date_object, fill_missing=False) + self.assertEqual(begin, None) + self.assertEqual(end, date_string) def 
test_007_make_appelations(self): g = Graph()