-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_watermark.py
35 lines (34 loc) · 1.58 KB
/
remove_watermark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import sys
import fitz
# Technique adapted from https://github.com/pymupdf/PyMuPDF/discussions/1855

# Byte pattern whose presence is reported as "watermark present" for a page.
WATERMARK_MARKER = b"/Subtype/Watermark"
# Start of an artifact definition. The "<</Subtype" suffix is required because
# a stream can contain other /Artifact entries that are not watermarks.
ARTIFACT_START = b"/Artifact <</Subtype"
# End of a marked-content section.
ARTIFACT_END = b"EMC"
# Number of bytes in front of the artifact tag that are blanked together with
# it, so that the overwritten span is the full "q ... EMC" definition.
ARTIFACT_LEAD = 2


def blank_watermark_artifacts(cont: bytearray) -> int:
    """Overwrite every ``/Artifact <</Subtype ... EMC`` span in *cont* with spaces.

    The buffer is modified in place and its length never changes: each span
    (including the ``ARTIFACT_LEAD`` bytes in front of the tag, clamped to the
    start of the buffer) is replaced by exactly as many space bytes.

    The span ends at the first ``EMC`` found after the tag; nested
    marked-content sections are not tracked.

    Args:
        cont: Content-stream source as a mutable bytearray.

    Returns:
        The number of spans that were blanked.
    """
    blanked = 0
    search_from = 0
    while True:
        start = cont.find(ARTIFACT_START, search_from)
        if start < 0:
            break  # none left: done
        end = cont.find(ARTIFACT_END, start)
        if end < 0:
            # Unterminated definition: leave it untouched. Without this guard
            # the same tag would be found again on every iteration.
            break
        # Clamp at 0 so a tag near the start of the buffer cannot turn into a
        # negative (wrap-around) slice index.
        lo = max(start - ARTIFACT_LEAD, 0)
        hi = end + len(ARTIFACT_END)
        # The referenced discussion deletes the span outright. The author of
        # this script observed errors/freezes when doing so, and therefore
        # pads with whitespace of the same length instead.
        cont[lo:hi] = b" " * (hi - lo)
        blanked += 1
        # Everything before ``hi`` is now whitespace; resume after it.
        search_from = hi
    return blanked


def remove_watermarks(input_filename, output_filename):
    """Blank watermark artifacts on every page of a PDF and save a copy.

    Args:
        input_filename: Path of the PDF to read.
        output_filename: Path the processed PDF is saved to.
    """
    with fitz.open(input_filename) as doc:
        # Sanity check kept from the original script.
        assert isinstance(doc, fitz.Document)
        for page in doc:
            page.clean_contents()
            xrefs = page.get_contents()
            if not xrefs:
                continue  # page has no content stream: nothing to edit
            xref = xrefs[0]  # xref of the resulting /Contents object
            # Read the contents source as a (modifiable) bytearray.
            cont = bytearray(page.read_contents())
            if WATERMARK_MARKER in cont:
                print(f"marked-content watermark present at {page.number}")
            if blank_watermark_artifacts(cont):
                doc.update_stream(xref, cont)  # replace the original source
        doc.save(output_filename, clean=True, garbage=3)  # save to new file
    print("docsaved!")


def main(argv=None) -> int:
    """Command-line entry point: ``remove_watermark.py INPUT_PDF OUTPUT_PDF``.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``. Arguments after the first two are ignored.

    Returns:
        0 on success, 2 when fewer than two arguments were given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print(f"usage: {sys.argv[0]} INPUT_PDF OUTPUT_PDF", file=sys.stderr)
        return 2
    remove_watermarks(args[0], args[1])
    return 0


if __name__ == "__main__":
    sys.exit(main())