-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpdfmunge.py
executable file
·211 lines (170 loc) · 7.53 KB
/
pdfmunge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#!/usr/bin/python3
"""
pdfmunge - Process PDFs to make them more legible on eBook readers.
Copyright Felix Crux (www.felixcrux.com)
Available under the MIT License (see Readme).
"""
USAGE_STRING = """
Usage: pdfmunge [options]... input_file output_file
Options:
-r --rotate Slice pages in half and rotate each half 90 degrees
counter-clockwise, creating a pseudo-landscape mode on
devices that don't do this automatically. Be warned that
this will double the size of your output file.
-m --margin If using rotation/slicing, have each page overlap with the
previous one by this amount (helps with lines getting cut
off in the middle).
-b --bounds Boundaries of visible area on each PDF page. Useful for
cropping off large margins. If this is given, cropping is
done automatically; otherwise it is not done. Boundaries
should be given as four comma-separated numbers, all
enclosed in quotation marks, like so: "10,20,100,120". Any
whitespace inside the quotation marks is ignored.
-o --oddbounds For PDFs that have different margins on even and odd pages,
use these boundary values for odd numbered pages, with
--bounds applying to even numbered ones. If this is given,
--bounds is required.
-e --exclude Numbers or ranges of pages to not include in the output PDF.
These should be given as a series of numbers or ranges
surrounded by quotation marks, and separated by commas. Any
whitespace is ignored. Ranges are given as two numbers
separated by a hyphen/minus sign (-), where the first number
must be smaller than the second. Example: "1,2,4-8,40".
This option takes precedence over --intact.
-i --intact Leave these pages completely unchanged, ignoring
cropping, rotating, or anything else. Requires a set of
numbers or ranges like --exclude. Excluded pages are
ignored even if listed here.
"""
import getopt
import PyPDF2
import sys
def main(argv):
    """Process PDFs to make them more legible on eBook readers.

    ``argv`` is the list of command-line arguments without the program name.

    Returns a process exit status: 0 on success, 1 if the input file cannot
    be opened or the output file cannot be created, and 2 if the command
    line is invalid (the usage text is printed in that case).
    """
    try:
        options = handle_options(argv)
    except getopt.GetoptError as err:
        print(str(err))
        print(USAGE_STRING)
        return 2

    # Get our inputs and outputs sorted out and opened.
    try:
        input_stream = open(options["infile"], "rb")
    except IOError as err:
        print("Unable to open input file: %s" % str(err))
        return 1

    # The "with" blocks guarantee both files are closed on every exit path,
    # including early returns and exceptions raised while processing pages.
    with input_stream:
        try:
            in_stream = PyPDF2.PdfFileReader(input_stream)
            # When slicing, every source page ends up in the output twice
            # (one half each), so a second reader supplies an independent
            # page object for the second half.
            in_stream2 = (PyPDF2.PdfFileReader(input_stream)
                          if options["rotate"] else None)
        except IOError as err:
            print("Unable to open input file: %s" % str(err))
            return 1

        try:
            output_stream = open(options["outfile"], "wb")
        except IOError as err:
            print("Unable to create output file: %s" % str(err))
            return 1

        with output_stream:
            output = PyPDF2.PdfFileWriter()

            # Build the lookups once; membership tests on sets are O(1).
            excluded = set(options["exclude"])
            intact = set(options["intact"])

            # The meat of the program: go over every page performing the
            # user's bidding.
            for page_num in range(in_stream.getNumPages()):
                if page_num in excluded:
                    continue

                page = in_stream.getPage(page_num)
                if page_num in intact:
                    # Intact pages are copied through exactly once, with no
                    # cropping, slicing or rotation.
                    output.addPage(page)
                    continue

                page2 = (in_stream2.getPage(page_num)
                         if options["rotate"] else None)
                if "bounds" in options:
                    crop(page, page_num, options)
                    crop(page2, page_num, options)  # no-op when page2 is None
                if options["rotate"]:
                    rotate(page, page2, options)

                output.addPage(page)
                if page2 is not None:
                    output.addPage(page2)

            # All right, we're done. Write the output, close up, go home.
            output.write(output_stream)

    return 0
def crop(page, page_num, options):
    """Set the page's media box to the user-specified bounds.

    Does nothing when ``page`` is None. ``options["oddbounds"]`` is used for
    odd-numbered pages when present; ``options["bounds"]`` is used otherwise.
    """
    if page is None:
        return

    # page_num is 0-indexed while the user counts pages from 1, so an even
    # page_num is what the user thinks of as an odd-numbered page.
    if "oddbounds" in options and (page_num % 2 == 0):
        key = "oddbounds"
    else:
        key = "bounds"

    corners = list(map(PyPDF2.generic.NumberObject, options[key]))
    page.mediaBox = PyPDF2.generic.RectangleObject(corners)
def rotate(page, page2, options):
    """Slice a page into two halves and rotate both counter-clockwise.

    ``page`` and ``page2`` are two page objects for the same source page.
    The lower edge of ``page``'s media box is raised to the vertical midpoint
    minus ``options["margin"]``; the upper edge of ``page2``'s media box is
    lowered to the vertical midpoint plus ``options["margin"]``. Both pages
    are then rotated 90 degrees counter-clockwise. The pages are modified in
    place; nothing is returned.
    """
    # Flatten each media box to [left, lower, right, upper].
    first_box = [*page.mediaBox.lowerLeft, *page.mediaBox.upperRight]
    second_box = [*page2.mediaBox.lowerLeft, *page2.mediaBox.upperRight]

    # Index 1 is the lower edge, index 3 the upper edge. The margin makes
    # the two halves overlap so a line cut at the midpoint shows up on both.
    first_box[1] = ((first_box[3] - first_box[1]) / 2 + first_box[1]
                    - options["margin"])
    second_box[3] = ((second_box[3] - second_box[1]) / 2 + second_box[1]
                     + options["margin"])

    for target, box in ((page, first_box), (page2, second_box)):
        target.mediaBox = PyPDF2.generic.RectangleObject(
            [PyPDF2.generic.NumberObject(value) for value in box])

    for target in (page, page2):
        target.rotateCounterClockwise(90)
def handle_options(argv):
    """Parse the command-line arguments and populate the options dictionary.

    All options are optional (as the name tends to suggest), but two
    arguments are required: an input filename and an output filename.

    Returns a dict that always contains "rotate", "exclude", "intact",
    "margin", "infile" and "outfile". "bounds" and "oddbounds" are present
    only when the corresponding option was given.

    Raises getopt.GetoptError for unknown options, for option values that
    are not valid numbers, for missing filenames, and when --oddbounds is
    given without --bounds.
    """
    options = {"rotate": False, "exclude": [], "intact": [], "margin": 0}
    opts, args = getopt.getopt(argv,
                               "rb:o:e:i:m:",
                               ["rotate", "bounds=", "oddbounds=",
                                "exclude=", "intact=", "margin="])

    for opt, arg in opts:
        try:
            if opt in ("-r", "--rotate"):
                options["rotate"] = True
            elif opt in ("-b", "--bounds"):
                options["bounds"] = parse_bounds(arg)
            elif opt in ("-o", "--oddbounds"):
                options["oddbounds"] = parse_bounds(arg)
            elif opt in ("-e", "--exclude"):
                options["exclude"] = parse_range(arg)
            elif opt in ("-i", "--intact"):
                options["intact"] = parse_range(arg)
            elif opt in ("-m", "--margin"):
                options["margin"] = int(arg)
            else:
                # Not expected to happen (getopt rejects unknown options),
                # but raise rather than assert so the check survives -O.
                raise getopt.GetoptError("Unhandled option: %s" % opt)
        except ValueError as err:
            # Report malformed numbers as a usage error, which callers
            # already handle, instead of letting ValueError escape.
            raise getopt.GetoptError(
                "Invalid value for %s: %s" % (opt, err)) from err

    if len(args) < 2:
        raise getopt.GetoptError("Missing input or output filename.")
    options["infile"], options["outfile"] = args[0], args[1]

    if "oddbounds" in options and "bounds" not in options:
        raise getopt.GetoptError("Boundaries for even pages required if odd "
                                 "page boundaries given.")
    return options
def parse_bounds(bounds_string):
    """Given a string representation of four boundary values, return a
    four-item list representing those numbers.

    Input values should be separated by commas, with whitespace being ignored.

    Raises ValueError if a value is not an integer or if the string does not
    contain exactly four values.
    """
    # int() already ignores surrounding whitespace, so no stripping is needed.
    bounds = [int(val) for val in bounds_string.split(",")]
    if len(bounds) != 4:
        # Fail here with a clear message rather than later, when the list is
        # turned into a page rectangle.
        raise ValueError("expected four comma-separated boundary values, "
                         "got %d in %r" % (len(bounds), bounds_string))
    return bounds
def parse_range(range_string):
    """Return a list of numbers representing the input ranges.

    Inputs can be individual numbers, or ranges, given by two numbers separated
    by a hyphen/minus sign (-), with each input separated by a comma. All
    whitespace is ignored, as are empty entries (e.g. a trailing comma or an
    empty string). In range-type inputs, the second number must not be smaller
    than the first. Ranges are inclusive of both numbers.

    Because these numbers represent page numbers, which humans index from 1, but
    PyPDF2 indexes from 0, the *inputs* are 1-indexed, but the *outputs* are
    0-indexed.

    Raises ValueError if an entry is not a number or a valid range.
    """
    expanded_list = []
    for entry in range_string.split(","):
        entry = entry.strip()
        if not entry:
            # Tolerate "" and stray commas instead of failing on int("").
            continue
        if "-" in entry:
            first, _, last = entry.partition("-")
            first, last = int(first), int(last)
            if last < first:
                # Reject reversed ranges rather than silently selecting
                # no pages at all.
                raise ValueError("invalid page range %r: the second number "
                                 "must not be smaller than the first" % entry)
            # 1-indexed inclusive [first, last] -> 0-indexed [first-1, last).
            expanded_list.extend(range(first - 1, last))
        else:
            expanded_list.append(int(entry) - 1)
    return expanded_list
if __name__ == "__main__":
    # Use sys.exit rather than the bare exit() helper: the latter is added
    # by the site module for interactive use and is not always available.
    sys.exit(main(sys.argv[1:]))