Add content of comments field to IIS as well
Alex Hebing committed Jul 1, 2020
1 parent a0209f7 commit 35b5e0d
Showing 3 changed files with 169 additions and 46 deletions.
13 changes: 1 addition & 12 deletions backend/corpora/peaceportal/epidat.py
@@ -4,7 +4,7 @@

from addcorpus.extract import XML, Constant, HTML, Combined
from addcorpus.corpus import Field
from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters
from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries


class Epidat(PeacePortal):
@@ -192,14 +192,6 @@ def __init__(self):
multiple=True
)

def join_commentaries(commentaries):
results = []
for comm in commentaries:
if comm:
results.append(comm)
return "\n".join(results)



def convert_sex(values):
if not values:
Expand Down Expand Up @@ -309,9 +301,6 @@ def extract_support_comments(soup):
cloned_soup.string = commentaries
return cloned_soup

def clean_commentary(commentary):
return ' '.join(commentary.split())


def add_support_comment(soup, existing_commentaries, elem_name, commentary_name):
elem = soup.find(elem_name)
184 changes: 150 additions & 34 deletions backend/corpora/peaceportal/iis.py
@@ -4,34 +4,36 @@

from addcorpus.extract import XML, Constant, HTML, ExternalFile, Combined
from addcorpus.corpus import Field
from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters
from corpora.peaceportal.peaceportal import PeacePortal, categorize_material, clean_newline_characters, clean_commentary, join_commentaries


class IIS(PeacePortal):
data_directory = current_app.config['PEACEPORTAL_IIS_DATA']
external_file_folder = current_app.config['PEACEPORTAL_IIS_TXT_DATA']
es_index = current_app.config['PEACEPORTAL_IIS_ES_INDEX']


def __init__(self):
self.source_database.extractor = Constant(
value='Inscriptions of Israel/Palestine (Brown University)'
)

self._id.extractor = XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'idno'],
tag=['teiHeader', 'fileDesc', 'sourceDesc',
'msDesc', 'msIdentifier', 'idno'],
multiple=False,
toplevel=False,
flatten=True,
transform=lambda x: ''.join(x.lower().split())
)

self.url.extractor = HTML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'idno'],
tag=['teiHeader', 'fileDesc', 'sourceDesc',
'msDesc', 'msIdentifier', 'idno'],
multiple=False,
toplevel=False,
flatten=True,
transform=lambda x: 'https://library.brown.edu/iip/viewinscr/{}'.format(''.join(x.lower().split()))
transform=lambda x: 'https://library.brown.edu/iip/viewinscr/{}'.format(
''.join(x.lower().split()))
)

# quick and dirty for now: extract value for 'notBefore'
@@ -73,6 +75,14 @@ def __init__(self):
# toplevel=False,
# )

self.iconography.extractor = XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc',
'msDesc', 'physDesc', 'decoDesc', 'decoNote'],
toplevel=False,
multiple=True,
flatten=True
)

# is not present in IIS data
self.sex.extractor = Constant(
value='Unknown'
@@ -99,25 +109,24 @@ def __init__(self):
self.location_details.extractor = Combined(
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
'history', 'origin', 'placeName'],
'history', 'origin', 'placeName'],
toplevel=False,
flatten=True
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
'history', 'origin', 'p'],
'history', 'origin', 'p'],
toplevel=False,
flatten=True
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
'history', 'provenance'],
'history', 'provenance'],
toplevel=False,
flatten=True
)
)


self.material.extractor = XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc', 'supportDesc'],
@@ -138,31 +147,75 @@ def __init__(self):
self.language.extractor = Combined(
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents',
'textLang'],
'textLang'],
attribute='mainLang',
toplevel=False,
transform=lambda x: normalize_language(x)
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msContents',
'textLang'],
'textLang'],
attribute='otherLangs',
toplevel=False,
transform=lambda x: normalize_language(x)
)
)


self.comments.extractor = XML(
tag=['text'],
toplevel=False,
multiple=False,
flatten=True,
transform_soup_func=extract_comments
self.comments.extractor = Combined(
XML(
tag=['text'],
toplevel=False,
multiple=False,
flatten=True,
transform_soup_func=extract_comments,
transform=lambda x: clean_commentary(x) if x else None
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc', 'supportDesc', 'condition'],
toplevel=False,
transform_soup_func=extract_condition
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc', 'layoutDesc', 'layout', 'p'],
toplevel=False,
transform=lambda x: 'LAYOUT:\n{}\n\n'.format(clean_commentary(x)) if x else None
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc'],
toplevel=False,
attribute='ana',
transform=lambda x: 'OBJECTTYPE:\n{}\n\n'.format(x[1:]) if x else None
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc', 'supportDesc', 'support', 'dimensions'],
toplevel=False,
transform_soup_func=extract_dimensions,
transform=lambda x: 'DIMENSIONS:\n{}\n\n'.format(
x) if x else None
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc',
'objectDesc', 'supportDesc', 'support', 'p'],
toplevel=False,
flatten=True,
transform=lambda x: 'SUPPORT:\n{}\n\n'.format(
clean_commentary(x)) if x else None
),
XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'physDesc', 'handDesc', 'handNote'],
toplevel=False,
transform_soup_func=extract_handnotes
),
transform=lambda x: join_commentaries(x)
)
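        # Editorial sketch, not part of the commit: Combined yields a tuple
        # of the seven sub-results above, and the outer transform feeds it to
        # join_commentaries, which drops empty items and joins the rest with
        # newlines. A record can thus end up with several labeled blocks,
        # e.g. CONDITION / LAYOUT / DIMENSIONS sections stacked in one string.
        # Note that the OBJECTTYPE transform uses x[1:] to strip the leading
        # character (presumably '#') from the @ana attribute value.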

self.bibliography.extractor = XML(
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc', 'msIdentifier', 'publications', 'publication'],
tag=['teiHeader', 'fileDesc', 'sourceDesc', 'msDesc',
'msIdentifier', 'publications', 'publication'],
toplevel=False,
multiple=True
)
@@ -177,32 +230,101 @@ def extract_transcript(filestream):
text = text.replace('\t', '')
return text


def extract_paragraph(soup):
'''
    Extract the first <p> element from `soup`, ignoring the rest.
    Useful for skipping <h2> headers in the HTML versions of the body.
'''
if not soup: return
if not soup:
return
return soup.find('p')


def extract_comments(soup):
'''
Helper function to extract the commentary from either <body> or <back> (siblings under <text>)
'''
if not soup: return
commentary_div = soup.find('div', { 'type': 'commentary' })
if not soup:
return
commentary_div = soup.find('div', {'type': 'commentary'})
return extract_paragraph(commentary_div)
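
A quick sketch of extract_comments in action (an editorial illustration, assuming BeautifulSoup is available and the helpers in this module are importable; the parser choice is illustrative):

from bs4 import BeautifulSoup

fragment = (
    '<text><body><div type="commentary">'
    '<h2>Commentary</h2><p>Probably a   tomb inscription.</p>'
    '</div></body></text>'
)
doc = BeautifulSoup(fragment, 'html.parser')
paragraph = extract_comments(doc.find('text'))
print(paragraph.get_text())  # 'Probably a   tomb inscription.'
# The extra whitespace is collapsed later by clean_commentary,
# which the extractor applies in its transform.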


def extract_attribute_and_child_p(soup, field_header):
'''
    Extract the value of the 'ana' attribute from `soup`,
    as well as the text of a <p> child. The result is returned
    in a new soup, i.e. a single element whose text content
    has the format `textcontent (attributevalue)`.
'''
result = ''
text = ''
ana = None
if 'ana' in soup.attrs:
ana = soup['ana']
p = extract_paragraph(soup)
if p:
text = p.get_text()
if text:
result = clean_commentary(text)
if ana:
result = '{} ({})'.format(result, ana)

if result:
cloned_soup = copy(soup)
cloned_soup.clear()
cloned_soup.string = '{}:\n{}\n\n'.format(field_header, result)
return cloned_soup


def extract_condition(soup):
return extract_attribute_and_child_p(soup, 'CONDITION')
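
Under the same assumptions, a sketch of what extract_condition produces for a typical <condition> element (here the '#' of the @ana value is kept, unlike in the OBJECTTYPE transform above):

from bs4 import BeautifulSoup

xml = '<condition ana="#fragmentary"><p>Broken at  the top.</p></condition>'
cond = BeautifulSoup(xml, 'html.parser').find('condition')
print(extract_condition(cond).string)
# CONDITION:
# Broken at the top. (#fragmentary)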


def extract_handnotes(soup):
if not soup: return
return extract_attribute_and_child_p(soup, 'HANDNOTES')


def extract_dimensions(soup):
result = ''
height_elem = soup.find('height')
if height_elem:
height = height_elem.get_text()
if height:
result = "H: {} ".format(height)

width_elem = soup.find('width')
if width_elem:
width = width_elem.get_text()
if width:
result = "{}W: {}".format(result, width)

depth_elem = soup.find('depth')
if depth_elem:
depth = depth_elem.get_text()
if depth:
result = "{} D: {}".format(result, depth)

cloned_soup = copy(soup)
cloned_soup.clear()
cloned_soup.string = result
return cloned_soup
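
A sketch of the resulting dimension string, assuming a TEI <dimensions> element with <height>, <width> and <depth> children:

from bs4 import BeautifulSoup

frag = '<dimensions><height>24</height><width>30</width><depth>2</depth></dimensions>'
dims = BeautifulSoup(frag, 'html.parser').find('dimensions')
print(extract_dimensions(dims).string)  # 'H: 24 W: 30 D: 2'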


def normalize_language(text):
if not text: return
if not text:
return
ltext = text.lower().strip()
if ltext in ['grc']: return 'Greek'
if ltext in ['he', 'heb']: return 'Hebrew'
if ltext in ['arc']: return 'Aramaic'
if ltext in ['la', 'latin']: return 'Latin'

if ltext in ['grc']:
return 'Greek'
if ltext in ['he', 'heb']:
return 'Hebrew'
if ltext in ['arc']:
return 'Aramaic'
if ltext in ['la', 'latin']:
return 'Latin'
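
A few sanity checks that follow directly from the mapping above (editorial sketch):

assert normalize_language('grc') == 'Greek'
assert normalize_language(' Heb ') == 'Hebrew'
assert normalize_language('la') == 'Latin'
assert normalize_language('') is None         # falsy input: no result
assert normalize_language('unknown') is None  # unmapped codes fall through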

# what to do with the dates from this corpus?
# <date period="http://n2t.net/ark:/99152/p0m63njbxb9" notBefore="0001" notAfter="0100">First century CE</date>
@@ -212,12 +334,6 @@ def normalize_language(text):
# TODO: add field

# TODO: move to a comments field:
# condition
# layout description - notes
# dimensions - support / dimension notes
# origin notes
# object description (e.g. amphora, handles)
# handDesc (description of the letters)

# excluded (for now):
# revision history
18 changes: 18 additions & 0 deletions backend/corpora/peaceportal/peaceportal.py
@@ -305,6 +305,24 @@ def clean_newline_characters(text):
return '\n'.join(cleaned)


def clean_commentary(commentary):
'''
    Clean a commentary by collapsing all whitespace between words
    into single spaces.
'''
return ' '.join(commentary.split())

def join_commentaries(commentaries):
'''
    Helper function to join the results of a Combined extractor
    into a single string, separating items with newlines.
'''
results = []
for comm in commentaries:
if comm:
results.append(comm)
return "\n".join(results)

def categorize_material(text):
'''
Helper function to (significantly) reduce the material field to a set of categories.
