-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathppn2ocr
executable file
·217 lines (161 loc) · 6.13 KB
/
ppn2ocr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/env python3
"""Get OCR results as a OCR-D workspace for a given PPN"""
import os
import requests
import sys
import lxml.etree as ET
import re
import subprocess
import click
from copy import deepcopy
XMLNS = {
'mets': 'http://www.loc.gov/METS/',
'xlink': 'http://www.w3.org/1999/xlink'
}
API_URL = 'https://oai.sbb.berlin'
IDENTIFIER_TEMPLATE = 'oai:digital.staatsbibliothek-berlin.de:%s'
for prefix, uri in XMLNS.items():
ET.register_namespace(prefix, uri)
def oai_mets(ppn):
"""Retrieve METS metadata for a given PPN."""
params = {
'verb': 'GetRecord',
'metadataPrefix': 'mets',
'identifier': IDENTIFIER_TEMPLATE % ppn
}
s = requests.Session()
r = s.get(API_URL, params=params)
mets = ET.XML(r.content).find(f".//{{{XMLNS['mets']}}}mets")
mets = ET.ElementTree(mets)
return mets
def iiif_url_for_sbb_url(sbb_url, ppn, size, format):
"""
Construct an IIIF URL from a dms or an IIIF URL.
This function exists as long as dms URL exist (or as long as we may need to
rewrite IIIF URLs for a different size)
"""
if "/dms/" in sbb_url:
return iiif_url_for_dms_url(sbb_url, ppn, size, format)
else:
return iiif_url_for_iiif_url(sbb_url, ppn, size, format)
def iiif_url_for_dms_url(dms_url, ppn, size, format):
"""
Construct an IIIF URL from a dms URL.
This function exists to encapsulate the hack of rewriting the URL to get IIIF.
"""
if ppn not in dms_url:
raise ValueError(f"Unexpected URL {dms_url}")
m = re.search(r'/dms/.*/([0-9]+)\.jpg$', dms_url)
if m:
page_num = m.group(1)
else:
raise ValueError(f"Unexpected URL {dms_url}")
iiif_identifier = f'{ppn}-{page_num}'
iiif_quality = 'default'
iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}'
return iiif_url
def iiif_url_for_iiif_url(iiif_url, ppn, size, format):
"""
Construct an IIIF URL from an already existing IIIF URL.
"""
if ppn not in iiif_url:
raise ValueError(f"Unexpected URL {iiif_url}")
m = re.search(rf'/dc/{ppn}-([0-9]+)/', iiif_url)
if m:
page_num = m.group(1)
else:
raise ValueError(f"Unexpected URL {iiif_url}")
iiif_identifier = f'{ppn}-{page_num}'
iiif_quality = 'default'
iiif_url = f'https://content.staatsbibliothek-berlin.de/dc/{iiif_identifier}/full/{size}/0/{iiif_quality}.{format}'
return iiif_url
def remove_file_grp(mets, use):
for bad_fileid in mets.xpath(f'//mets:fileGrp[@USE="{use}"]/mets:file/@ID', namespaces=XMLNS):
for bad in mets.xpath(f'//mets:fptr[@FILEID="{bad_fileid}"]', namespaces=XMLNS):
bad.getparent().remove(bad)
for bad in mets.xpath(f'//mets:fileGrp[@USE="{use}"]', namespaces=XMLNS):
bad.getparent().remove(bad)
def mime_type_for_format(format_):
if format_ == 'tif':
mime_type = 'image/tiff'
elif format_ == 'jpg':
mime_type = 'image/jpg'
else:
raise ValueError()
return mime_type
def prune_file_grps(mets):
"""
Prune unwanted file groups
We only want to keep the MAX file group (we created it ourselves) and
possibly ABBYY full texts in FULLTEXT.
For the PRESENTATION + LOCAL file groups we definitely want to delete
because they contain local file:/// or file:/ links, which are not handled
well by "ocrd workspace". They are not explicitly mentioned, as we
only keep a whitelist.
"""
wanted_file_grps = ["MAX", "FULLTEXT"]
for u in mets.xpath('//mets:fileGrp/@USE', namespaces=XMLNS):
if u not in wanted_file_grps:
remove_file_grp(mets, u)
def make_workspace(ppn, workspace):
# Make workspace directory
os.mkdir(workspace)
os.chdir(workspace)
mets = oai_mets(ppn)
# Delete MAX file group - we assume that, if it exists, it is not as
# we expect it, e.g. IIIF full URLs
remove_file_grp(mets, 'MAX')
# Duplicate DEFAULT file group into a new file group MAX
format_ = 'tif'
file_grp_default = mets.find('//mets:fileGrp[@USE="DEFAULT"]', namespaces=XMLNS)
if file_grp_default is None:
raise ValueError("This document has no DEFAULT file group, could be a multi-volume work")
file_grp_best = deepcopy(file_grp_default)
file_grp_best.attrib['USE'] = 'MAX'
for f in file_grp_best.findall('./mets:file', namespaces=XMLNS):
old_id = f.attrib['ID']
new_id = re.sub('DEFAULT', 'MAX', old_id)
f.attrib['ID'] = new_id
f.attrib['MIMETYPE'] = mime_type_for_format(format_)
for fptr in mets.findall(f'//mets:fptr[@FILEID="{old_id}"]', namespaces=XMLNS):
new_fptr = deepcopy(fptr)
new_fptr.attrib['FILEID'] = new_id
fptr.getparent().append(new_fptr)
# XXX Need to fumble around with the URL for now
flocat = f.find(f".//{{{XMLNS['mets']}}}FLocat")
old_url = flocat.attrib[f"{{{XMLNS['xlink']}}}href"]
url_iiif_full = iiif_url_for_sbb_url(old_url, ppn, 'full', format_)
flocat.attrib[f"{{{XMLNS['xlink']}}}href"] = url_iiif_full
mets.find('//mets:fileSec', namespaces=XMLNS).append(file_grp_best)
prune_file_grps(mets)
# Write mets.xml
mets.write('mets.xml', pretty_print=True)
# TODO
# Validate workspace
#ocrd workspace validate mets.xml | grep -v "<notice>Won't download remote image"
def validate_ppn(ctx, param, value):
"""Validate a PPN argument"""
if not value.startswith('PPN'):
raise click.BadParameter('PPN must be in format PPNxxxxxxxx')
else:
return value
@click.command()
@click.argument('ppn', callback=validate_ppn)
def ppn2ocr(ppn):
"""
Get METS with best images for a document PPN
For example, to get the document "PROPOSITIONES PHILOSOPHICAE: [...]" use this:
\b
ppn2ocr PPN699887615
ls PPN699887615
"""
self_dir = os.path.realpath(os.path.dirname(sys.argv[0]))
make_workspace(ppn, ppn)
# XXX
# subprocess.run([
# os.path.join(self_dir, 'run-docker-hub'),
# '-I', 'MAX',
# '--skip-validation'
# ])
if __name__ == '__main__':
ppn2ocr()