-
Notifications
You must be signed in to change notification settings - Fork 0
/
tox.py
executable file
·223 lines (186 loc) · 7.73 KB
/
tox.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#!/usr/bin/env python3
import re
from pprint import pprint
"""
This script takes a file describing a toc and generates pdfmark directives that
generate a correct table of contents
Commandline Parameters:
tox.py <tocfile> <outputfile> <page_offset> <splitfiles>
page_offset
offset of page numbers that should be added to all pagenumbers
tocfile
file containing toc specification
outputfile
file where the pdfmarks should be written to
splitfiles
maximum number of lines per pdfmark file
when splitting files, the index of the first line in each file
is appended to the original outputfile-name
The tocfile should look similar to:
1. Chapter one 2
1.1 subchapter one 3
2. blub 5
A. Appendix 6
A.1 subapp 3
Note that the chapter numbers need to start at the beginning of each line and
the page number needs to be followed immediately by the newline.
In order to achieve correct nesting, chapter numbers of subchapters need to
be prefixed by the number of their parent. Whitespace between the fields is
ignored. See the regexes in the function *tokenize* for reference.
Customizing
===========
Different toc-file format
1. implement tokenizer (yields tuple(chapter, title, page) records)
2. implement find_children function that, given a chapter number,
searches through a list of records and returns all children of the
given chapter.
Further Information
===================
pdftk
In order to get rid of old bookmarks, filter the file through pdftk
$ pdftk in.pdf cat output out.pdf
ghostscript
The generated pdfmark file can be given to ghostscript as an inputfile:
$ gs -sDEVICE=pdfwrite -sOutputFile=out.pdf in*.pdf pdfmark
pdfmark reference
http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdfmark_reference.pdf
"""
def tokenize(textfile):
"""
Read from the given filename and process the toc file
yields records of the form (chapter_number, title, page)
"""
reg = re.compile(r"^((?:[A-Z]\.)?[0-9.]*)\s(.*)\s(\d+)$")
alt = re.compile(r"^(.*)\s(\d+)$")
with open(textfile) as f:
prev_num = ''
cnt = 1
for line in f:
try:
result = re.match(reg, line).groups()
num,title,page = result
prev_num = num
cnt = 1
except Exception as e:
try:
result = re.match(alt, line).groups()
title,page = result
num = "{}.{}".format(prev_num,cnt)
cnt+=1
except Exception as e:
print("Skipping toc line: %s"%line)
continue
yield (num, title, page)
def apply_page_offset(records, offset=0):
"""
Add the given offset to the page number of all records
"""
for num,title,page in records:
yield (num, title, int(page)+offset)
def get_matching_prfx(parent_rec, records):
"""
Find children, using prefix matching of chapter numbers
"""
num,title,page = parent_rec
return [(n,t,p) for n,t,p in records if n.startswith(num)]
def check_title_length(maxlen, records):
"""
Prints all titles that are longer than maxlen
pdfmark /title fields have a maximum length of 256 bytes or 126 unicode
chars. The recommended length is 32 characters. See the spec for
details.
"""
for num,title,page in records:
if len(num)+len(title)+1 > maxlen:
print(num,title,page)
def check_duplicate_targets(records):
"""
Calculate a histogram of pagenumbers and print all pages that have more
than 1 reference
"""
counts = {}
for n,t,page in records:
counts[page] = counts.get(page,0)+1
pprint({k:v for k,v in counts.items() if v!=1})
def tree_ize(records, find_children=get_matching_prfx):
"""
Builds a tree structure from a list of records.
Uses the given function *find_children* to group records into a tree
structure.
input format: tuple(chapter_num, title, page)
output format:
return [node0, node1, ...]
node = (node_data, children)
node_data = (chapter_num, title, page)
children = (child_node0, child_node1, ...)
"""
records = list(records)
while True:
try:
parent = records.pop(0)
children = find_children(parent, records)
for child in children:
records.remove(child)
yield (parent, tuple(tree_ize(children)))
except IndexError as e:
return
def deep_len(tree):
"""
Find the number of all descendants of the given node
"""
return sum(1+deep_len(children) for data,children in tree)
def pdfmark_toc(tree):
"""
Generate pdfmark instructions from the given toc tree representation
See tree_ize for the tree format
See pdfmark spec for the output language
yields a list of strings, each containing a line of toc
"""
for node in tree:
data, children = node
num,title,page = data
if len(children) == 0:
yield "[/Title ({num}: {title}) /Page {page} /OUT pdfmark\n".format(
num=num,
title=title,
page=page)
else:
yield "[/Count -{count} /Title ({num}: {title}) /Page {page} /OUT pdfmark\n".format(
count=len(children),
num=num,
title=title,
page=page)
yield from pdfmark_toc(children)
def splitfiles(l, n):
"""
Split the list into several lists, each containing no more than n items
"""
l = list(l)
for i in range(0, len(l), n):
yield (i, l[i:i+n])
def wite_pdfmark_file(tree, outfile=None, chunksize=None):
"""
Writes the given tree to the given outfile, optionally partitioning it
into files of maximum chunksize lines
If no outfile is given, nothing is done
"""
if outfile is None:
return
lines = list(pdfmark_toc(tree))
if chunksize is None:
with open(outfile, "w") as f:
f.writelines(lines)
else:
for index,linechunk in splitfiles(lines, int(chunksize)):
with open(outfile+str(index), "w") as f:
f.writelines(linechunk)
def main(self, textfile, outfile=None, page_offset=0, chunksize=None):
flat = list(apply_page_offset(tokenize(textfile), int(page_offset)))
tree = tuple(tree_ize(flat))
wite_pdfmark_file(tree, outfile, chunksize)
pprint(tree)
#check_title_length(64, flat)
#check_duplicate_targets(flat)
if __name__=='__main__':
import sys
main(*sys.argv)