-
Notifications
You must be signed in to change notification settings - Fork 0
/
export_documents_alto_by_collection.py
112 lines (96 loc) · 3.66 KB
/
export_documents_alto_by_collection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import json
import logging
import os
from typing import Any
from xml.etree import ElementTree
from tqdm import tqdm
from TkbsDocument import Document
from utilities import add_transkribus_args, find_existing, gather_document_folders, init_tkbs_connection, load_document, save_job_indication, setup_logging, setup_parser
from TranskribusPyClient import TranskribusClient
# Export parameters, kept as raw JSON text and passed unchanged as the last
# argument of tkbs.exportCollection() in main().
# The enabled switches request document metadata, METS, images, PAGE XML,
# ALTO XML (split into words) and a PDF with images plus text; TEI, DOCX,
# plain-text and tag/table spreadsheet outputs are switched off.
# NOTE(review): "pages" is set to "1" - presumably this restricts the export
# to the first page of each document; confirm that is intended.
# NOTE(review): "splitIntoWordsInAltoXml" appears both in "commonPars" and in
# "altoPars"; which one the server honours is not visible from this file.
params="""
{ "commonPars" : {
"pages" : "1",
"doExportDocMetadata" : true,
"doWriteMets" : true,
"doWriteImages" : true,
"doExportPageXml" : true,
"doExportAltoXml" : true,
"doExportSingleTxtFiles" : false,
"doWritePdf" : true,
"doWriteTei" : false,
"doWriteDocx" : false,
"doWriteOneTxt" : false,
"doWriteTagsXlsx" : false,
"doWriteTagsIob" : false,
"doWriteTablesXlsx" : false,
"doWriteStructureInMets" : true,
"doCreateTitle" : false,
"useVersionStatus" : "Latest version",
"writeTextOnWordLevel" : false,
"doBlackening" : false,
"selectedTags" : [ "add", "date", "Address", "human_production", "supplied", "work", "unclear", "sic", "structure", "div", "highlight", "place1", "regionType", "speech", "person", "gap", "organization", "comment", "abbrev", "place", "add1", "Initial", "lat" ],
"font" : "FreeSerif",
"splitIntoWordsInAltoXml" : true,
"pageDirName" : "page",
"fileNamePattern" : "${filename}",
"useHttps" : true,
"remoteImgQuality" : "orig",
"doOverwrite" : true,
"useOcrMasterDir" : true,
"exportTranscriptMetadata" : true,
"updatePageXmlImageDimensions" : true
},
"altoPars" : {
"splitIntoWordsInAltoXml" : true
},
"pdfPars" : {
"doPdfImagesOnly" : false,
"doPdfImagesPlusText" : true,
"doPdfWithTextPages" : false,
"doPdfWithTags" : false,
"doPdfWithArticles" : true,
"doPdfA" : false,
"pdfImgQuality" : "view"
},
"docxPars" : {
"doDocxWithTags" : false,
"doDocxPreserveLineBreaks" : false,
"doDocxForcePageBreaks" : false,
"doDocxMarkUnclear" : false,
"doDocxKeepAbbrevs" : false,
"doDocxExpandAbbrevs" : false,
"doDocxSubstituteAbbrevs" : false,
"doDocxWriteFilenames" : false,
"doDocxIgnoreSuppliedTag" : false,
"doDocxShowSuppliedTagWithBrackets" : false
}
}
"""
def get_args():
    """Build the command-line parser and return the parsed arguments.

    The parser comes from ``setup_parser()`` and is extended in place by
    ``add_transkribus_args()`` before the command line is parsed.
    """
    cli_parser = setup_parser()
    add_transkribus_args(cli_parser)
    return cli_parser.parse_args()
def lines_in_doc(doc: dict):
    """Tell whether the document's metadata reports more than zero lines.

    Reads ``doc['md']['nrOfLines']``; a missing key raises ``KeyError``.
    """
    metadata = doc['md']
    line_count = metadata['nrOfLines']
    return line_count > 0
def main():
    """Request a Transkribus export job for every document folder under ``args.base``.

    Every sub-directory name found directly under ``args.base`` is used as the
    Transkribus document id. For each one an export is requested through
    ``tkbs.exportCollection`` with the module-level ``params`` JSON, and the
    returned job id is stored in that folder by ``save_job_indication`` under
    the file name "job-status-export-alto.json". A summary line is printed at
    the end.
    """
    print('Running export to alto started')
    args = get_args()
    setup_logging(args)
    # Log only after logging has been set up: the module-level logging.debug()
    # implicitly calls basicConfig() when the root logger has no handlers, so
    # issuing it first would install a default handler and make any later
    # basicConfig()-based configuration a no-op.
    logging.debug('Running export to alto started')

    tkbs = TranskribusClient(sServerUrl=args.tkbs_server)
    tkbs.auth_login(args.tkbs_user, args.tkbs_password, True)
    print(f'Running export to alto documents from Trankribus collection {args.tkbs_collection_id}')
    logging.info(f'Running export to alto on all documents from Trankribus collection {args.tkbs_collection_id}')

    # NOTE(review): the listing result is not used below. The request is kept
    # so the script still contacts the collection before issuing jobs; confirm
    # whether it was meant to feed the "missing" counter or can be dropped.
    existing_docs = tkbs.listDocsByCollectionId(args.tkbs_collection_id)

    # NOTE(review): "skipped" and "missing" are never incremented; they are
    # kept only because the summary line below reports them.
    jobs_issued = skipped = missing = 0
    for folder in os.listdir(args.base):
        folder_path = os.path.join(args.base, folder)
        if not os.path.isdir(folder_path):
            # Plain files next to the document folders are ignored.
            continue
        # The folder name doubles as the Transkribus document id.
        tkbs_doc_id = folder
        logging.info(f'Starting export on document {tkbs_doc_id}')
        job_id = tkbs.exportCollection(args.tkbs_collection_id, tkbs_doc_id, params)
        save_job_indication(folder_path, job_id, "job-status-export-alto.json")
        jobs_issued += 1
    print(f'Done, {jobs_issued} jobs issued, {missing} documents missing, {skipped} documents skipped')
if __name__ == '__main__':
    # Start the export only when this file is executed as a script,
    # not when it is imported as a module.
    main()