#!/usr/bin/env python3

"""An image cache for a single feedme site,
   mapping original relative or absolute URLs to a local path.
   Also handles image-specific manipulations like translating SRCSET
   and various weirdo wordpress plugin javascript image swapping quirks.
"""

import utils

import re
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
from PIL import Image, UnidentifiedImageError
import sys, os


# A record of the images downloaded so far,
# mapping the original src URL to the sanitized local filename.
ImageCache = {}


def clear():
    """Clear the image cache, when starting a new site"""
    # Without the global declaration, this would only rebind a
    # local name and leave the module-level cache untouched.
    global ImageCache
    ImageCache = {}
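
# Illustrative example (hypothetical URL): after process_img_tag()
# downloads an image, the cache maps the original src to the sanitized
# local filename, e.g.
#     ImageCache["https://example.com/pic.jpg"] == "https__example.com_pic.jpg"
# ('/' becomes '_', and characters like ':' are dropped by the
# filename whitelist in process_img_tag).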

def rewrite_images(html, baseurl, outdir, feedname, host=None):
    """Process all images referenced in an HTML file."""
    if not host:
        host = urllib.parse.urlparse(baseurl).hostname

    soup = BeautifulSoup(html, "lxml")
    for img in soup.find_all("img"):
        process_img_tag(img, feedname, baseurl, outdir, host=host)

    return str(soup)
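
# A minimal usage sketch (hypothetical feed name and paths; assumes
# utils.g_config has settings for "examplefeed"). Not called by feedme:
def _example_rewrite_images():
    html = '<p><img src="/images/pic.jpg"></p>'
    # Returns the HTML with the img src rewritten to point at the
    # local copy saved under /tmp/examplefeed:
    return rewrite_images(html, "https://example.com/story/",
                          "/tmp/examplefeed", "examplefeed")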

def process_img_tag(tag, feedname, base_href, newdir, host=None):
    """Process an img tag (BeautifulSoup) and its attributes.
       Try to detect stand-ins for src, like srcset and various
       weirdo wordpress plugin attributes.
       Download a local copy of the image (if not already downloaded),
       and set the tag to point to it.

       Arguments:
         tag:       the img tag, which will be modified in place
         feedname:  the name of the current feed
         base_href: the href from which we'll get the host
         newdir:    the local directory in which this site is being written
         host:      if set, overrides the host in base_href
    """
    attrs = tag.attrs
    keys = list(attrs.keys())

    if not host:
        host = urllib.parse.urlparse(base_href).hostname

    # Handle both src and srcset.
    # Los Alamos Daily Post has bogus src="data:..." URLs that don't
    # display anything, and the real src is in data-src. Go figure.
    if 'data-src' in keys:
        src = attrs['data-src']
    elif 'src' in keys:
        src = attrs['src']
    else:
        src = None
    if 'srcset' in keys or 'data-lazy-srcset' in keys:
        # The intent here:
        # If there's a srcset, pick the largest one that's still
        # under max_srcset_size, and set src to that.
        # That's what we'll try to download.
        # Then remove the srcset attribute.
        try:
            maximgwidth = int(utils.g_config.get(feedname, 'max_srcset_size'))
        except:
            maximgwidth = 800

        # ladailypost (which I think is wordpress) has a crazy setup
        # where they set the src to something that isn't an image,
        # then have data-lazy-src and/or data-lazy-srcset
        # which presumably get loaded later with JavaScript.
        if 'data-lazy-srcset' in attrs:
            srcset = parse_srcset(attrs['data-lazy-srcset'])
        elif 'srcset' in attrs:
            srcset = parse_srcset(attrs['srcset'])
        else:
            srcset = None

        if srcset:
            try:
                curimg = None
                curwidth = 0
                for pair in srcset:
                    w = pair[1].strip().lower()
                    if not w.endswith('w'):
                        # It probably ends in x and is a resolution-based
                        # descriptor. We don't handle those yet,
                        # but just in case we don't see any width-based
                        # ones, let's save the image.
                        if not curimg:
                            curimg = pair[0].strip()
                        continue
                    w = int(w[:-1])
                    if w > curwidth and w <= maximgwidth:
                        curwidth = w
                        curimg = pair[0].strip()
                if curimg:
                    src = curimg
            except:
                # Wired sometimes has srcset with just a single url
                # that's the same as the src=. In that case it
                # wouldn't do us any good anyway.
                # They also have srcSet="".
                # And there are all sorts of nutty and random things
                # sites do with srcset.
                print("Error parsing srcset: '%s'" % str(srcset),
                      file=sys.stderr)

    if not src:
        # Don't do anything to this image: it has no src or srcset.
        return
    if 'srcset' in keys:
        del tag.attrs['srcset']

    # Make relative URLs absolute
    if src.startswith("data:"):
        # With a data: URL we already have all we need
        return
    src = make_absolute(src, base_href)
    if not src:
        print("make_absolute returned null", file=sys.stderr)
        return

    # urllib2 can't parse out the host part without first
    # creating a Request object.
    # Quote it to guard against URLs with nonascii characters,
    # which will make urllib.request.urlopen bomb out with
    # UnicodeEncodeError: 'ascii' codec can't encode character.
    # If this proves problematic, try the more complicated
    # solution at https://stackoverflow.com/a/40654295
    # req = urllib.request.Request(urllib.parse.quote(src, safe=':/'))
    # lareporter has a new Wordpress plugin that puts images on i0.wp.com
    # with more characters that now must not be quoted:
    req = urllib.request.Request(urllib.parse.quote(src, safe=':/?=&%'))
    user_agent = utils.g_config.get(feedname, 'user_agent')
    req.add_header('User-Agent', user_agent)
    # Should we only fetch images that come from the HTML's host?
    try:
        nonlocal_images = utils.g_config.getboolean(feedname,
                                                    'nonlocal_images')
    except:
        nonlocal_images = False

    # Should we block images that come from elsewhere,
    # to avoid unwanted data use?
    try:
        block_nonlocal = utils.g_config.getboolean(feedname,
                                                   'block_nonlocal_images')
    except:
        block_nonlocal = False

    # If we can't or won't download an image, what should
    # we replace it with?
    if block_nonlocal:
        # print("Using bogus image source for nonlocal images",
        #       file=sys.stderr)
        alt_src = 'file:///nonexistent'
        # XXX Would be nice, in this case, to put a link around
        # the image so the user could tap on it if they wanted
        # to see it, at least if it isn't already inside a link.
    else:
        alt_src = src

    alt_domains = utils.g_config.get_multiline(feedname, 'alt_domains')
    if nonlocal_images or similar_host(req.host, host, alt_domains):
        # base = os.path.basename(src)
        # For now, don't take the basename; we want to know
        # if images are unique, and the basename alone
        # can't tell us that.
        base = src.replace('/', '_')

        # Clean up the filename, since it might have illegal chars.
        # Only allow alphanumerics or others in a short whitelist.
        # Don't allow % in the whitelist -- it causes problems
        # with recursively copying the files over http later.
        base = ''.join([x for x in base if x.isalpha() or x.isdigit()
                        or x in '-_.'])
        if not base:
            base = '_unknown.img'
        imgfilename = os.path.join(newdir, base)

        # Some sites, like High Country News, use the same image
        # name for everything (e.g. they'll have
        # storyname-0418-jpg/image, storyname-0418-jpg/image etc.)
        # so we can't assume that just because the basename is unique,
        # the image must be.
        # if os.path.exists(imgfilename) and src not in ImageCache:
        #     howmany = 2
        #     while True:
        #         newimgfile = "%d-%s" % (howmany, imgfilename)
        #         if not os.path.exists(newimgfile):
        #             imgfilename = newimgfile
        #             break
        #         howmany += 1
        # But we don't need this clause if we use the whole image path,
        # not just the basename.

        # Check again for a data: URL, in case it came in
        # from one of the src substitutions.
        if src.startswith("data:"):
            return
        try:
            if not os.path.exists(imgfilename):
                print("Fetching image", src, "to", imgfilename,
                      file=sys.stderr)

                # urllib.request.urlopen is supposed to have
                # a default timeout, but if so, it must be
                # many minutes. Try this instead.
                # Timeout is in seconds.
                f = urllib.request.urlopen(req, timeout=8)

                # Lots of things can go wrong with downloading
                # the image, such as OSError from
                # [Errno 36] File name too long
                try:
                    local_file = open(imgfilename, "wb")
                except OSError as e:
                    if e.errno != 36:
                        raise
                    print("Filename too long:", imgfilename, file=sys.stderr)
                    pathmax = os.pathconf('/', 'PC_PATH_MAX')
                    while len(imgfilename) >= pathmax:
                        # chop stuff off the front
                        base = base[len(base) // 3:]
                        imgfilename = os.path.join(newdir, base)
                    print("Trying", imgfilename, "instead", file=sys.stderr)
                    local_file = open(imgfilename, "wb")

                # Write to our local file
                local_file.write(f.read())
                local_file.close()
            # else:
            #     print("Not downloading, already have", imgfilename)

        # handle download errors
        except urllib.error.HTTPError as e:
            print("HTTP Error on image:", e.code,
                  "on", src, ": setting img src to", alt_src,
                  file=sys.stderr)
            # Since we couldn't download, point instead to the
            # absolute URL, so it will at least work with a
            # live net connection.
            tag.attrs['src'] = alt_src
        except urllib.error.URLError as e:
            print("URL Error on image:", e.reason,
                  "on", src, file=sys.stderr)
            tag.attrs['src'] = alt_src
        except Exception as e:
            print("Error downloading image:", str(e),
                  "on", src, file=sys.stderr)
            utils.ptraceback()
            tag.attrs['src'] = alt_src
        # If we got this far, then we have a local image.
        # Are we resizing large images? Some sites have crazy-big
        # images, like 5328 x 3996, which make no sense whatsoever
        # to view on a phone.
        try:
            maxsize = int(utils.g_config.get(feedname, 'max_image_size'))
        except Exception as e:
            print("Exception", e, "checking maxsize", file=sys.stderr)
            utils.ptraceback()
            maxsize = 0

        if maxsize and imgfilename and os.path.exists(imgfilename):
            imchanged = False
            try:
                im = Image.open(imgfilename)
                oldwidth, oldheight = im.size
                if max(oldwidth, oldheight) > maxsize:
                    if oldwidth >= oldheight:    # landscape
                        newwidth = maxsize
                        newheight = oldheight * maxsize // oldwidth
                    else:                        # portrait
                        newheight = maxsize
                        newwidth = oldwidth * maxsize // oldheight
                    print("Resizing %dx%d image to %dx%d" % (oldwidth,
                                                             oldheight,
                                                             newwidth,
                                                             newheight),
                          file=sys.stderr)
                    im = im.resize((newwidth, newheight))
                    imchanged = True

                    # Change the tag's width and height attributes, if any
                    if 'width' in tag.attrs:
                        print("Rewriting tag width from %s (%s) to %s"
                              % (tag.attrs['width'],
                                 type(tag.attrs['width']),
                                 newwidth), file=sys.stderr)
                        tag.attrs['width'] = str(newwidth)
                    if 'height' in tag.attrs:
                        print("Rewriting tag height from %s (%s) to %s"
                              % (tag.attrs['height'],
                                 type(tag.attrs['height']),
                                 newheight), file=sys.stderr)
                        tag.attrs['height'] = str(newheight)
                # LA Daily Post has taken to using PNG for all
                # their images, making them HUGE, so translating to
                # JPG would also be good.
                # 3 ways to check for transparency:
                # https://stackoverflow.com/a/58567453
                def has_transparency(img):
                    if img.info.get("transparency", None) is not None:
                        return True
                    if img.mode == "P":
                        transparent = img.info.get("transparency", -1)
                        for _, index in img.getcolors():
                            if index == transparent:
                                return True
                    elif img.mode == "RGBA":
                        extrema = img.getextrema()
                        if extrema[3][0] < 255:
                            return True
                    return False

                if (base.lower().endswith('.png')
                        and not has_transparency(im)):
                    print("rewriting base %s to %s" % (base,
                                                       base[:-3] + "jpg"),
                          file=sys.stderr)
                    os.unlink(imgfilename)
                    base = base[:-3] + "jpg"
                    imgfilename = os.path.join(newdir, base)
                    if im.mode != "RGB":
                        # JPEG can't represent palette or alpha modes
                        im = im.convert("RGB")
                    imchanged = True

                if imchanged:
                    im.save(imgfilename)

            except UnidentifiedImageError:
                # Image might be SVG or some other non-PIL type
                print("Can't resize", imgfilename, file=sys.stderr)
                im = None

        # Rewrite the url:
        ImageCache[src] = base
        tag.attrs['src'] = base
    else:
        # Looks like it's probably a nonlocal image; don't download it.
        print(req.host, "and", host,
              "are too different -- not fetching image", src,
              file=sys.stderr)

        # Add a link to the image, if a BS tag is provided
        if tag:
            tag.attrs['src'] = "nonlocal"
            # Find tag's root BeautifulSoup object (needed for new_tag)
            soup = tag
            while type(soup) is not BeautifulSoup:
                soup = soup.parent
            awrap = soup.new_tag("a", href=src)
            awrap.string = " [nonlocal image]"
            # Originally, added alt text and wrapped the image in it,
            # but Firefox, at least, doesn't show images with
            # src = 'file:///nonexistent' and doesn't even show the
            # alt text for them, so there's nothing visible to click on.
            # tag.wrap(awrap)
            # Instead, just add a tag around a text string.
            tag.append(awrap)
        else:
            print("No soup, can't add image link to", src,
                  file=sys.stderr)
            # We're left with a nonlocal image in the source.
            # That could mean unwanted data use to fetch the image
            # when viewing the file, so point the src at alt_src,
            # which is a bogus URL if block_nonlocal_images is set.
            tag.attrs['src'] = alt_src
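
# A minimal standalone sketch of the srcset selection rule used above:
# given parsed (url, descriptor) pairs, keep the largest width-based
# candidate that is still <= max_srcset_size. The names and values are
# hypothetical; this helper isn't called by feedme itself.
def _example_pick_srcset_candidate():
    srcset = [("small.jpg", "320w"), ("medium.jpg", "640w"),
              ("large.jpg", "1280w")]
    maximgwidth = 800
    curimg = None
    curwidth = 0
    for url, desc in srcset:
        w = desc.strip().lower()
        if not w.endswith('w'):
            continue                  # skip density (x) descriptors
        w = int(w[:-1])
        if curwidth < w <= maximgwidth:
            curimg, curwidth = url.strip(), w
    assert curimg == "medium.jpg"     # 640w is the largest width <= 800
    return curimg
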
# The srcset spec is here:
# http://w3c.github.io/html/semantics-embedded-content.html#element-attrdef-img-srcset
# https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
# A simple version is easy:
# srcset="http://site/img1.jpg 1024w,
# http://site/img1.jpg 150w, ..."
# but some sites, like Wired, embed commas inside their
# image URLs. Since the spaces aren't required, that
# makes the w and the comma the only way to parse it,
# and w and comma are both legal URL characters.
#
# The key is: "If an image candidate string contains no descriptors
# and no ASCII whitespace after the URL, the following image candidate
# string, if there is one, must begin with one or more ASCII whitespace."
# so basically, a comma that's separating image descriptors
# either has to have a space after it, or a w or x before it.
def parse_srcset(srcset_attr):
    '''Parse a SRCSET attribute inside an IMG tag.
       Return a list of pairs [(img_url, descriptor), ...]
       where the descriptor is a resolution ending in w (pixel width)
       or x (pixel density), or None if there was no descriptor.
    '''
    srcset_attr = srcset_attr.strip()
    if not srcset_attr:
        return []
    commaparts = srcset_attr.split(',')

    parts = []
    for part in commaparts:
        # First, might we be continuing an image URL from the previous part?
        # That's the case if the previous part lacked a descriptor
        # and this part doesn't start with a space.
        if not part.startswith(' ') and parts and not parts[-1][1]:
            part = parts.pop(-1)[0] + ',' + part

        # Does this part have a descriptor?
        match = re.search(' *[0-9]+[wx]$', part)
        if match:
            matched = match.group()
            parts.append((part[0:-len(matched)].strip(), matched.strip()))
        else:
            # Possibly shouldn't strip this, in case there's a
            # continuation (this might only be the first part of
            # an image URL, before a comma) and the URL might have
            # a space in it. But since spaces in URLs are illegal,
            # let's hope for now that no one does that.
            parts.append((part.strip(), None))

    return parts
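
# An illustrative check (hypothetical URLs) of the comma-in-URL case
# described above: a comma inside a URL gets rejoined with the previous
# candidate because that candidate had no descriptor. Not called by feedme.
def _example_parse_srcset():
    srcset = "https://site/img,500.jpg 500w, https://site/img,800.jpg 800w"
    assert parse_srcset(srcset) == [
        ("https://site/img,500.jpg", "500w"),
        ("https://site/img,800.jpg", "800w"),
    ]
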
def same_host(host1, host2):
    """Are two hosts close enough for the purpose of downloading images?"""
    # Either host can be None:
    if not host1 and not host2:
        return True
    if not host1 or not host2:
        return False

    # For now, a simplistic comparison:
    # are the last two elements (foo.com) the same?
    # Eventually we might want smarter special cases,
    # exceptions for akamai, etc.
    return host1.split('.')[-2:] == host2.split('.')[-2:]
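
# For instance, same_host("www.example.com", "cdn.example.com") is True
# (both end in example.com), while same_host("example.com", "example.org")
# is False. Note the two-element rule also matches unrelated
# country-code domains like foo.co.uk and bar.co.uk.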

def similar_host(host1, host2, alt_domains):
    """Are two hosts close enough for the purpose of downloading images?
       Or is host1 close to anything in alt_domains?
    """
    if same_host(host1, host2):
        return True
    for d in alt_domains:
        if same_host(host1, d):
            return True
    return False

def make_absolute(url, base_href):
    """Make URLs, particularly img src, absolute according to
       the current page location and any base href we've seen.
    """
    # May want to switch to lxml.html.make_links_absolute(base_href,
    # resolve_base_href=True)
    if not url:
        return url

    if '://' in url:
        return url      # already absolute

    if url.startswith('#'):
        return url

    # If we have a base href then it doesn't matter whether
    # the URL is relative or absolute.
    if base_href:
        return urllib.parse.urljoin(base_href, url)

    # Map paths without a scheme or host to the full URL.
    # (Setting that from the site RSS URL isn't always right,
    # e.g. https://rss.example.com.)
    # XXX We should always have a base_href here, since it should
    # have been set the first time through fetch_url,
    # so this clause should never trigger. But just in case,
    # leave this clause here for a while.
    if url[0] == '/':
        # if not self.base_href:
        #     print("******** Yikes, got to make_absolute with no base_url",
        #           file=sys.stderr)
        #     url = self.utils.g_config.get(self.feedname, 'url')
        #     urlparts = urllib.parse.urlparse(url)
        #     urlparts = urlparts._replace(path='/')
        #     self.base_href = urllib.parse.urlunparse(urlparts)
        #     print("Set base_href to", self.base_href, file=sys.stderr)
        return urllib.parse.urljoin(base_href, url)

    # It's relative, but with no base_href there's nothing
    # to join it to, so return it unchanged.
    return url
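
# Behavior sketch (hypothetical URLs):
#   make_absolute("pic.jpg", "https://example.com/a/b/")
#       -> "https://example.com/a/b/pic.jpg"
#   make_absolute("/pic.jpg", "https://example.com/a/b/")
#       -> "https://example.com/pic.jpg"
#   make_absolute("https://cdn.example.com/pic.jpg", None)
#       -> unchanged, since it's already absolute.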