Add content of comments field to IIS as well
Alex Hebing committed Jul 1, 2020
1 parent a0209f7 commit 35b5e0d
Showing 3 changed files with 169 additions and 46 deletions.
13 changes: 1 addition & 12 deletions backend/corpora/peaceportal/epidat.py
@@ -4,7 +4,7 @@

from addcorpus.extract import XML, Constant, HTML, Combined
from addcorpus.corpus import Field
from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters
from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries


class Epidat(PeacePortal):
@@ -192,14 +192,6 @@ def __init__(self):
multiple=True
)

def join_commentaries(commentaries):
results = []
for comm in commentaries:
if comm:
results.append(comm)
return "\n".join(results)



def convert_sex(values):
if not values:
Expand Down Expand Up @@ -309,9 +301,6 @@ def extract_support_comments(soup):
cloned_soup.string = commentaries
return cloned_soup

def clean_commentary(commentary):
return ' '.join(commentary.split())


def add_support_comment(soup, existing_commentaries, elem_name, commentary_name):
elem = soup.find(elem_name)
184 changes: 150 additions & 34 deletions backend/corpora/peaceportal/iis.py
@@ -4,34 +4,36 @@

from addcorpus.extract import XML, Constant, HTML, ExternalFile, Combined
from addcorpus.corpus import Field
from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters
from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries


class IIS(PeacePortal):
data_directory = current_app.config['PEACEPORTAL_IIS_DATA']
external_file_folder = current_app.config['PEACEPORTAL_IIS_TXT_DATA']
es_index = current_app.config['PEACEPORTAL_IIS_ES_INDEX']


def __init__(self):
self.source_database.extractor = Constant(
value='Inscriptions of Israel/Palestine (Brown University)'
)

self._id.extractor = XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'idno'],
tag=['teiHeader', 'fileDesc', 'sourceDesc',
'msDesc', 'msIdentifier', 'idno'],
multiple=False,
toplevel=False,
flatten=True,
transform=lambda x: ''.join(x.lower().split())
)

self.url.extractor = HTML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'idno'],
tag=['teiHeader', 'fileDesc', 'sourceDesc',
'msDesc', 'msIdentifier', 'idno'],
multiple=False,
toplevel=False,
flatten=True,
transform=lambda x: 'https://library.brown.edu/iip/viewinscr/{}'.format(''.join(x.lower().split()))
transform=lambda x: 'https://library.brown.edu/iip/viewinscr/{}'.format(
''.join(x.lower().split()))
)

# quick and dirty for now: extract value for 'notBefore'
@@ -73,6 +75,14 @@ def __init__(self):
# toplevel=False,
# )

self.iconography.extractor = XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc',
'msDesc', 'physDesc', 'decoDesc', 'decoNote'],
toplevel=False,
multiple=True,
flatten=True
)

# is not present in IIS data
self.sex.extractor = Constant(
value='Unknown'
@@ -99,25 +109,24 @@ def __init__(self):
self.location_details.extractor = Combined(
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
'history', 'origin', 'placeName'],
'history', 'origin', 'placeName'],
toplevel=False,
flatten=True
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
'history', 'origin', 'p'],
'history', 'origin', 'p'],
toplevel=False,
flatten=True
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
'history', 'provenance'],
'history', 'provenance'],
toplevel=False,
flatten=True
)
)


self.material.extractor = XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc', 'supportDesc'],
@@ -138,31 +147,75 @@ def __init__(self):
self.language.extractor = Combined(
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents',
'textLang'],
'textLang'],
attribute='mainLang',
toplevel=False,
transform=lambda x: normalize_language(x)
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents',
'textLang'],
'textLang'],
attribute='otherLangs',
toplevel=False,
transform=lambda x: normalize_language(x)
)
)


self.comments.extractor = XML(
tag=['text'],
toplevel=False,
multiple=False,
flatten=True,
transform_soup_func=extract_comments
self.comments.extractor = Combined(
XML(
tag=['text'],
toplevel=False,
multiple=False,
flatten=True,
transform_soup_func=extract_comments,
transform=lambda x: clean_commentary(x) if x else None
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc', 'supportDesc', 'condition'],
toplevel=False,
transform_soup_func=extract_condition
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc', 'layoutDesc', 'layout', 'p'],
toplevel=False,
transform=lambda x: 'LAYOUT:\n{}\n\n'.format(clean_commentary(x)) if x else None
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc'],
toplevel=False,
attribute='ana',
transform=lambda x: 'OBJECTTYPE:\n{}\n\n'.format(x[1:]) if x else None
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc', 'supportDesc', 'support', 'dimensions'],
toplevel=False,
transform_soup_func=extract_dimensions,
transform=lambda x: 'DIMENSIONS:\n{}\n\n'.format(
x) if x else None
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc', 'supportDesc', 'support', 'p'],
toplevel=False,
flatten=True,
transform=lambda x: 'SUPPORT:\n{}\n\n'.format(
clean_commentary(x)) if x else None
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', 'handDesc', 'handNote'],
toplevel=False,
transform_soup_func=extract_handnotes
),
transform=lambda x: join_commentaries(x)
)
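        # Editorial sketch, not part of the commit: Combined yields a tuple
        # of the seven sub-results above, and the outer transform feeds it to
        # join_commentaries, which drops empty items and joins the rest with
        # newlines. A record can thus end up with several labeled blocks,
        # e.g. CONDITION / LAYOUT / DIMENSIONS sections stacked in one string.
        # Note that the OBJECTTYPE transform uses x[1:] to strip the leading
        # character (presumably '#') from the @ana attribute value.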

self.bibliography.extractor = XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'publications', 'publication'],
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
'msIdentifier', 'publications', 'publication'],
toplevel=False,
multiple=True
)
@@ -177,32 +230,101 @@ def extract_transcript(filestream):
text = text.replace('\t', '')
return text


def extract_paragraph(soup):
'''
    Extract the first <p> element from `soup`, ignoring the rest.
    Useful for skipping <h2> headers in the HTML versions of the body.
'''
if not soup: return
if not soup:
return
return soup.find('p')


def extract_comments(soup):
'''
Helper function to extract the commentary from either <body> or <back> (siblings under <text>)
'''
if not soup: return
commentary_div = soup.find('div', { 'type': 'commentary' })
if not soup:
return
commentary_div = soup.find('div', {'type': 'commentary'})
return extract_paragraph(commentary_div)
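
A quick sketch of extract_comments in action (an editorial illustration, assuming BeautifulSoup is available and the helpers in this module are importable; the parser choice is illustrative):

from bs4 import BeautifulSoup

fragment = (
    '<text><body><div type="commentary">'
    '<h2>Commentary</h2><p>Probably a   tomb inscription.</p>'
    '</div></body></text>'
)
doc = BeautifulSoup(fragment, 'html.parser')
paragraph = extract_comments(doc.find('text'))
print(paragraph.get_text())  # 'Probably a   tomb inscription.'
# The extra whitespace is collapsed later by clean_commentary,
# which the extractor applies in its transform.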


def extract_attribute_and_child_p(soup, field_header):
'''
    Extract the value of the 'ana' attribute from `soup`,
    as well as the text of a <p> child. The result is returned
    in a new soup, i.e. a single element whose text content
    has the format `textcontent (attributevalue)`.
'''
result = ''
text = ''
ana = None
if 'ana' in soup.attrs:
ana = soup['ana']
p = extract_paragraph(soup)
if p:
text = p.get_text()
if text:
result = clean_commentary(text)
if ana:
result = '{} ({})'.format(result, ana)

if result:
cloned_soup = copy(soup)
cloned_soup.clear()
cloned_soup.string = '{}:\n{}\n\n'.format(field_header, result)
return cloned_soup


def extract_condition(soup):
return extract_attribute_and_child_p(soup, 'CONDITION')
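
Under the same assumptions, a sketch of what extract_condition produces for a typical <condition> element (here the '#' of the @ana value is kept, unlike in the OBJECTTYPE transform above):

from bs4 import BeautifulSoup

xml = '<condition ana="#fragmentary"><p>Broken at  the top.</p></condition>'
cond = BeautifulSoup(xml, 'html.parser').find('condition')
print(extract_condition(cond).string)
# CONDITION:
# Broken at the top. (#fragmentary)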


def extract_handnotes(soup):
if not soup: return
return extract_attribute_and_child_p(soup, 'HANDNOTES')


def extract_dimensions(soup):
result = ''
height_elem = soup.find('height')
if height_elem:
height = height_elem.get_text()
if height:
result = "H: {} ".format(height)

width_elem = soup.find('width')
if width_elem:
width = width_elem.get_text()
if width:
result = "{}W: {}".format(result, width)

depth_elem = soup.find('depth')
if depth_elem:
depth = depth_elem.get_text()
if depth:
result = "{} D: {}".format(result, depth)

cloned_soup = copy(soup)
cloned_soup.clear()
cloned_soup.string = result
return cloned_soup
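
A sketch of the resulting dimension string, assuming a TEI <dimensions> element with <height>, <width> and <depth> children:

from bs4 import BeautifulSoup

frag = '<dimensions><height>24</height><width>30</width><depth>2</depth></dimensions>'
dims = BeautifulSoup(frag, 'html.parser').find('dimensions')
print(extract_dimensions(dims).string)  # 'H: 24 W: 30 D: 2'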


def normalize_language(text):
if not text: return
if not text:
return
ltext = text.lower().strip()
if ltext in ['grc']: return 'Greek'
if ltext in ['he', 'heb']: return 'Hebrew'
if ltext in ['arc']: return 'Aramaic'
if ltext in ['la', 'latin']: return 'Latin'

if ltext in ['grc']:
return 'Greek'
if ltext in ['he', 'heb']:
return 'Hebrew'
if ltext in ['arc']:
return 'Aramaic'
if ltext in ['la', 'latin']:
return 'Latin'
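
A few sanity checks that follow directly from the mapping above (editorial sketch):

assert normalize_language('grc') == 'Greek'
assert normalize_language(' Heb ') == 'Hebrew'
assert normalize_language('la') == 'Latin'
assert normalize_language('') is None         # falsy input: no result
assert normalize_language('unknown') is None  # unmapped codes fall through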

# what to do with the dates from this corpus?
# <date period="http://n2t.net/ark:/99152/p0m63njbxb9" notBefore="0001" notAfter="0100">First century CE</date>
@@ -212,12 +334,6 @@ def normalize_language(text):
# TODO: add field

# TODO: move to a comments field:
# condition
# layout description - notes
# dimensions - support / dimension notes
# origin notes
# object description (e.g. amphora, handles)
# handDesc (description of the letters)

# excluded (for now):
# revision history
18 changes: 18 additions & 0 deletions backend/corpora/peaceportal/peaceportal.py
@@ -305,6 +305,24 @@ def clean_newline_characters(text):
return '\n'.join(cleaned)


def clean_commentary(commentary):
'''
    Clean a commentary by collapsing all whitespace between words
    into single spaces.
'''
return ' '.join(commentary.split())

def join_commentaries(commentaries):
'''
    Helper function to join the results of a Combined extractor
    into a single string, separating items with newlines.
'''
results = []
for comm in commentaries:
if comm:
results.append(comm)
return "\n".join(results)

def categorize_material(text):
'''
Helper function to (significantly) reduce the material field to a set of categories.
