-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_line_detection.py
102 lines (81 loc) · 3.25 KB
/
run_line_detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import json
import logging
import os
from typing import Any
from xml.etree import ElementTree
from tqdm import tqdm
from TkbsApiClient import TranskribusClient
from TkbsDocument import Document
from utilities import add_transkribus_args, find_existing, gather_document_folders, init_tkbs_connection, load_document, save_job_indication, setup_logging, setup_parser
def get_args():
    """Parse the command-line arguments for this script.

    The parser is created by ``setup_parser`` and ``add_transkribus_args``
    is given the chance to register its options on it before parsing.

    Returns:
        The object returned by the parser's ``parse_args()``.
    """
    cli = setup_parser()
    add_transkribus_args(cli)
    return cli.parse_args()
def lines_in_doc(doc: dict):
    """Tell whether a document already has at least one detected line.

    Args:
        doc: Document dictionary; the line count is read from
            ``doc['md']['nrOfLines']``.

    Returns:
        ``True`` when the recorded number of lines is greater than zero,
        ``False`` otherwise.
    """
    metadata = doc['md']
    line_count = metadata['nrOfLines']
    return line_count > 0
def run_line_detection(tkbs: TranskribusClient, collection_id: int, tkbs_doc_id: int, tkbs_doc: dict) -> int:
    """Submit a layout-analysis job covering every page of one document.

    Args:
        tkbs: Client whose ``analyzeLayout`` method is called.
        collection_id: Collection id passed to ``analyzeLayout``.
        tkbs_doc_id: Id of the document, sent as ``docId`` in the request.
        tkbs_doc: Document dictionary; the ``pageId`` of each entry in
            ``tkbs_doc['pageList']['pages']`` is included in the request.

    Returns:
        The job id parsed from the XML response.

    Raises:
        ValueError: If the response contains no job id element, or if the
            element's text cannot be parsed as an integer.
    """
    page_ids = [page['pageId'] for page in tkbs_doc['pageList']['pages']]
    request = {
        "docList": {
            "docs": [
                {
                    "docId": tkbs_doc_id,
                    "pageList": {"pages": page_ids},
                },
            ],
        },
    }

    # NOTE(review): the meaning of the two trailing boolean flags is defined
    # by TranskribusClient.analyzeLayout, which is not visible here.
    response = tkbs.analyzeLayout(collection_id, json.dumps(request), False, True)
    logging.debug(response)

    root = ElementTree.fromstring(response)
    # NOTE(review): '*jobId' is not a conventional ElementTree path; it
    # appears to be tokenized like '*/jobId' (a jobId grandchild of the
    # root) -- confirm against a real response before changing it.
    job_element = root.find('*jobId')
    if job_element is None:
        raise ValueError("No job id")

    job_text = job_element.text
    try:
        # An empty element has text None, which int() rejects with TypeError.
        return int(job_text)
    except (TypeError, ValueError) as err:
        # Only parse failures are translated; interrupts and other
        # unrelated exceptions propagate untouched.
        raise ValueError(f"Can't parse job id '{job_text}'") from err
def main():
    """Issue a layout-analysis job for each local document found in Transkribus.

    Iterates over the document folders gathered from ``args.base``, matches
    each one against the documents listed for ``args.tkbs_collection_id``
    and starts line detection on it.  A document is skipped when it already
    has lines, or when its folder already contains a ``transkribus_output``
    directory, unless ``args.overwrite`` is set.  Folders with no matching
    remote document are counted as missing.  A summary of the three counters
    is printed at the end.
    """
    args = get_args()
    setup_logging(args)
    tkbs = init_tkbs_connection(args)
    collection_id = args.tkbs_collection_id

    # Same message goes to the console and to the log.
    banner = f'Running line detection on all documents from Transkribus collection {collection_id}'
    print(banner)
    logging.info(banner)

    logging.debug('Loading documents from Transkribus')
    existing_docs = tkbs.listDocsByCollectionId(collection_id)

    jobs_issued = skipped = missing = 0
    # Presumably materialised so tqdm knows the total up front -- confirm.
    folders = list(gather_document_folders(args.base))
    for folder in tqdm(folders):
        doc = load_document(folder)
        existing = find_existing(doc, existing_docs)
        if not existing:
            logging.warning(f"Can't locate document for {folder}, skipping")
            missing += 1
            continue

        tkbs_doc_id = int(existing['docId'])
        logging.debug(f'Loading document {tkbs_doc_id} from Transkribus')
        tkbs_doc = tkbs.getDocById(collection_id, tkbs_doc_id)

        # Already segmented remotely: leave it alone unless overwriting.
        if lines_in_doc(tkbs_doc) and not args.overwrite:
            logging.info(f'Skipping {doc.title}, it has already been segmented')
            skipped += 1
            continue

        # A local output folder already exists: same overwrite rule applies.
        output_folder = os.path.join(folder, 'transkribus_output')
        if os.path.exists(output_folder) and not args.overwrite:
            logging.info(f'Skipping {doc.title}, it already has a transkribus output')
            skipped += 1
            continue

        logging.info(f'Starting layout analysis on document {doc.title}')
        job_id = run_line_detection(tkbs, collection_id, tkbs_doc_id, tkbs_doc)
        save_job_indication(folder, job_id)
        jobs_issued += 1

    print(f'Done, {jobs_issued} jobs issued, {missing} documents missing, {skipped} documents skipped')
# Script entry point: run the batch only when executed directly, not on import.
if __name__ == '__main__':
    main()