diff --git a/.gitignore b/.gitignore
index fd1c91c..b4c062e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,3 +58,6 @@ coverage.xml
# Sphinx documentation
docs/_build/
+
+# Pycharm
+.idea/
diff --git a/README.markdown b/README.markdown
index c794c76..5af29b8 100644
--- a/README.markdown
+++ b/README.markdown
@@ -49,6 +49,27 @@ Supports
* Sta.sh
* Completely arbitrary sites, with a bit more work (see below)
+
+Image support
+---
+
+Leech creates EPUB 2.0.1 files, which means that Leech can only save images in the following
+formats:
+- JPEG (JPG/JFIF)
+- PNG
+- GIF
+
+See the [Open Publication Structure (OPS) 2.0.1](https://idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#TOC2.3.4) for more information.
+
+Leech cannot save SVG images, because SVG is not supported by Pillow.
+
+Leech uses [Pillow](https://pillow.readthedocs.io/en/stable/index.html) for image manipulation and conversion. If you want to use a different
+image format, you will need to install the external libraries Pillow requires for it, and you will probably have to tinker with Leech itself. See the [Pillow documentation](https://pillow.readthedocs.io/en/stable/installation.html#external-libraries) for more information.
+
+By default, Leech will try to save all non-animated images as JPEG.
+The only animated images that Leech will save are GIFs.
+
+To configure image support, create a file called `leech.json`. See the Configuration section below for details.
+
Configuration
---
@@ -61,6 +82,10 @@ Example:
"logins": {
"QuestionableQuesting": ["username", "password"]
},
+ "images": true,
+ "image_format": "png",
+ "compress_images": true,
+ "max_image_size": 100000,
"cover": {
"fontname": "Comic Sans MS",
"fontsize": 30,
@@ -76,6 +101,30 @@ Example:
}
}
```
+> Note: The `images` key is a boolean and must be `true` or `false` (JSON booleans are written in lowercase).
+> If it is `false`, Leech will not download any images, and the `image_format` key is ignored.
+
+> Note: If the `image_format` key is missing, Leech defaults to `jpeg`.
+> The supported formats are `jpeg`, `png`, and `gif`; the key is case-insensitive.
+
+> Note: The `compress_images` key tells Leech to compress images. This is only supported for `jpeg` and `png` images.
+> It works together with the `max_image_size` key: if `compress_images` is `true` but there is no `max_image_size` key,
+> Leech will compress each image to under 1MB (1000000 bytes); if `max_image_size` is present, Leech will compress each image
+> to under that value. `max_image_size` is measured in bytes.
+> If `compress_images` is `false`, Leech will ignore the `max_image_size` key.
+
+> Warning: Compressing images can make Leech take considerably longer to download images.
+
+> Warning: Compressing images may reduce image quality.
+
+> Warning: `max_image_size` is not a hard limit. Leech will try to compress each image to below that size, but it may
+> not manage to hit the target exactly.
+
+> Warning: `max_image_size` should not be too small. For instance, with a value of 1000, Leech will probably not be able
+> to compress an image down to 1000 bytes, whereas a value of 1000000 is usually achievable.
+
+> Warning: Leech will not compress GIFs, since compression might damage the animation.
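+
+For example, a minimal `leech.json` (the values here are just illustrative) that downloads images, converts them to PNG, and compresses anything over roughly 500KB could look like this:
+
+```
+{
+    "images": true,
+    "image_format": "png",
+    "compress_images": true,
+    "max_image_size": 500000
+}
+```
+
+For arbitrary sites, the separate `image_selector` key in the site definition file (see `examples/pale-withextras.json`) controls which images are embedded.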
Arbitrary Sites
---
diff --git a/ebook/__init__.py b/ebook/__init__.py
index 7810c21..635dafd 100644
--- a/ebook/__init__.py
+++ b/ebook/__init__.py
@@ -1,6 +1,8 @@
from .epub import make_epub, EpubFile
-from .cover import make_cover
-from .cover import make_cover_from_url
+from .cover import make_cover, make_cover_from_url
+from .image import get_image_from_url
+from sites import Image
+from bs4 import BeautifulSoup
import html
import unicodedata
@@ -72,34 +74,91 @@ class CoverOptions:
height = attr.ib(default=None, converter=attr.converters.optional(int))
wrapat = attr.ib(default=None, converter=attr.converters.optional(int))
bgcolor = attr.ib(default=None, converter=attr.converters.optional(tuple))
- textcolor = attr.ib(default=None, converter=attr.converters.optional(tuple))
+ textcolor = attr.ib(
+ default=None, converter=attr.converters.optional(tuple))
cover_url = attr.ib(default=None, converter=attr.converters.optional(str))
-def chapter_html(story, titleprefix=None, normalize=False):
+def chapter_html(
+ story,
+ image_bool=False,
+ image_format="JPEG",
+ compress_images=False,
+ max_image_size=1_000_000,
+ titleprefix=None,
+ normalize=False
+):
chapters = []
for i, chapter in enumerate(story):
title = chapter.title or f'#{i}'
if hasattr(chapter, '__iter__'):
# This is a Section
-            chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
+            chapters.extend(chapter_html(
+                chapter,
+                image_bool=image_bool,
+                image_format=image_format,
+                compress_images=compress_images,
+                max_image_size=max_image_size,
+                titleprefix=title,
+                normalize=normalize
+            ))
else:
+ soup = BeautifulSoup(chapter.contents, 'html5lib')
+ if image_bool:
+ all_images = soup.find_all('img')
+                image_count = len(all_images)
+                print(f"Found {image_count} images in chapter {i}")
+
+ for count, img in enumerate(all_images):
+ if not img.has_attr('src'):
+ print(f"Image {count} has no src attribute, skipping...")
+ continue
+ print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
+ img_contents = get_image_from_url(img['src'], image_format, compress_images, max_image_size)
+ chapter.images.append(Image(
+ path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
+ contents=img_contents[0],
+ content_type=img_contents[2]
+ ))
+ img['src'] = f"../images/ch{i}_leechimage_{count}.{img_contents[1]}"
+ if not img.has_attr('alt'):
+ img['alt'] = f"Image {count} from chapter {i}"
+                # Add this chapter's images to the book's file list as well.
+ for image in chapter.images:
+                    # for/else: check whether this image's path is already present; if not, add it.
+                    # Duplicate paths are not allowed in the EPUB format.
+ for other_file in chapters:
+ if other_file.path == image.path:
+ break
+ else:
+ chapters.append(EpubFile(
+ path=image.path, contents=image.contents, filetype=image.content_type))
+ else:
+ # Remove all images from the chapter so you don't get that annoying grey background.
+ for img in soup.find_all('img'):
+ if img.parent.name.lower() == "figure":
+ img.parent.decompose()
+ else:
+ img.decompose()
+
title = titleprefix and f'{titleprefix}: {title}' or title
- contents = chapter.contents
+ contents = str(soup)
if normalize:
title = unicodedata.normalize('NFKC', title)
contents = unicodedata.normalize('NFKC', contents)
chapters.append(EpubFile(
title=title,
path=f'{story.id}/chapter{i + 1}.html',
- contents=html_template.format(title=html.escape(title), text=contents)
+ contents=html_template.format(
+ title=html.escape(title), text=contents)
))
if story.footnotes:
- chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
+ chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
+ title="Footnotes", text='\n\n'.join(story.footnotes))))
return chapters
-def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False):
+def generate_epub(story, cover_options={}, image_options=None, output_filename=None, output_dir=None, normalize=False):
+ if image_options is None:
+ image_options = {
+ 'image_bool': False,
+ 'image_format': 'JPEG',
+ 'compress_images': False,
+ 'max_image_size': 1_000_000
+ }
dates = list(story.dates())
metadata = {
'title': story.title,
@@ -117,14 +176,19 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
extra_metadata['Tags'] = ', '.join(story.tags)
if extra_metadata:
-        metadata['extra'] = '\n        '.join(f'{k}{v}' for k, v in extra_metadata.items())
+        metadata['extra'] = '\n        '.join(
+            f'{k}{v}' for k, v in extra_metadata.items())
- valid_cover_options = ('fontname', 'fontsize', 'width', 'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
- cover_options = CoverOptions(**{k: v for k, v in cover_options.items() if k in valid_cover_options})
- cover_options = attr.asdict(cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)
+ valid_cover_options = ('fontname', 'fontsize', 'width',
+ 'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
+ cover_options = CoverOptions(
+ **{k: v for k, v in cover_options.items() if k in valid_cover_options})
+ cover_options = attr.asdict(
+ cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)
if cover_options and "cover_url" in cover_options:
- image = make_cover_from_url(cover_options["cover_url"], story.title, story.author)
+ image = make_cover_from_url(
+ cover_options["cover_url"], story.title, story.author)
elif story.cover_url:
image = make_cover_from_url(story.cover_url, story.title, story.author)
else:
@@ -135,10 +199,24 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
[
# The cover is static, and the only change comes from the image which we generate
EpubFile(title='Cover', path='cover.html', contents=cover_template),
- EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(now=datetime.datetime.now(), **metadata)),
- *chapter_html(story, normalize=normalize),
- EpubFile(path='Styles/base.css', contents=requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, filetype='text/css'),
- EpubFile(path='images/cover.png', contents=image.read(), filetype='image/png'),
+ EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(
+ now=datetime.datetime.now(), **metadata)),
+ *chapter_html(
+ story,
+ image_bool=image_options.get('image_bool'),
+ image_format=image_options.get('image_format'),
+ compress_images=image_options.get('compress_images'),
+ max_image_size=image_options.get('max_image_size'),
+ normalize=normalize
+ ),
+ EpubFile(
+ path='Styles/base.css',
+ contents=requests.Session().get(
+ 'https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text,
+ filetype='text/css'
+ ),
+ EpubFile(path='images/cover.png',
+ contents=image.read(), filetype='image/png'),
],
metadata,
output_dir=output_dir
diff --git a/ebook/image.py b/ebook/image.py
new file mode 100644
index 0000000..b89b59b
--- /dev/null
+++ b/ebook/image.py
@@ -0,0 +1,222 @@
+# Basically the same as cover.py with some minor differences
+import PIL
+from PIL import Image, ImageDraw, ImageFont
+from io import BytesIO
+from base64 import b64decode
+import math
+import textwrap
+import requests
+import logging
+
+from typing import Tuple
+
+logger = logging.getLogger(__name__)
+
+
+def make_image(
+ message: str,
+ width=600,
+ height=300,
+ fontname="Helvetica",
+ font_size=40,
+ bg_color=(0, 0, 0),
+ textcolor=(255, 255, 255),
+ wrap_at=30
+):
+ """
+ This function should only be called if get_image_from_url() fails
+ """
+ img = Image.new("RGB", (width, height), bg_color)
+ draw = ImageDraw.Draw(img)
+
+ message = textwrap.fill(message, wrap_at)
+
+ font = _safe_font(fontname, size=font_size)
+    # draw.textsize() was removed in Pillow 10; textbbox() is available since Pillow 8
+    bbox = draw.textbbox((0, 0), message, font=font)
+    message_size = (bbox[2] - bbox[0], bbox[3] - bbox[1])
+ draw_text_outlined(
+ draw, ((width - message_size[0]) / 2, 100), message, textcolor, font=font)
+ # draw.text(((width - title_size[0]) / 2, 100), title, textcolor, font=font)
+
+ output = BytesIO()
+ img.save(output, "JPEG")
+ output.name = 'cover.jpeg'
+ # writing left the cursor at the end of the file, so reset it
+ output.seek(0)
+ return output
+
+
+def get_size_format(b, factor=1000, suffix="B"):
+ """
+    Scale a byte count to a human-readable string,
+    e.g.:
+        1253656 => '1.25MB'
+        1253656678 => '1.25GB'
+ """
+ for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
+ if b < factor:
+ return f"{b:.2f}{unit}{suffix}"
+ b /= factor
+ return f"{b:.2f}Y{suffix}"
+
+
+def compress_image(image: BytesIO, target_size: int, image_format: str) -> PIL.Image.Image:
+ image_size = get_size_format(len(image.getvalue()))
+ logger.info(f"Image size: {image_size}")
+
+ big_photo = Image.open(image).convert("RGBA")
+
+    # 2.8114 is an empirical pixels-per-byte estimate: it converts the byte
+    # budget into a rough pixel budget for the resized image.
+    target_pixel_count = 2.8114 * target_size
+ if len(image.getvalue()) > target_size:
+ logger.info(f"Image is greater than {get_size_format(target_size)}, compressing")
+ scale_factor = target_pixel_count / math.prod(big_photo.size)
+ if scale_factor < 1:
+ x, y = tuple(int(scale_factor * dim) for dim in big_photo.size)
+ logger.info(f"Resizing image dimensions from {big_photo.size} to ({x}, {y})")
+ sml_photo = big_photo.resize((x, y), resample=Image.LANCZOS)
+ else:
+ sml_photo = big_photo
+ compressed_image_size = get_size_format(len(PIL_Image_to_bytes(sml_photo, image_format)))
+ logger.info(f"Compressed image size: {compressed_image_size}")
+ return sml_photo
+ else:
+ logger.info(f"Image is less than {get_size_format(target_size)}, not compressing")
+ return big_photo
+
+
+def PIL_Image_to_bytes(
+ pil_image: PIL.Image.Image,
+ image_format: str
+) -> bytes:
+ out_io = BytesIO()
+ if image_format.lower().startswith("gif"):
+ frames = []
+ current = pil_image.convert('RGBA')
+ while True:
+ try:
+ frames.append(current)
+ pil_image.seek(pil_image.tell() + 1)
+ current = Image.alpha_composite(current, pil_image.convert('RGBA'))
+ except EOFError:
+ break
+ frames[0].save(out_io, format=image_format, save_all=True, append_images=frames[1:], optimize=True, loop=0)
+ return out_io.getvalue()
+
+ elif image_format.lower() in ["jpeg", "jpg"]:
+        # JPEG has no alpha channel, so flatten the image onto a white background
+ background_img = Image.new('RGBA', pil_image.size, "white")
+
+ # Paste the image on top of the background
+ background_img.paste(pil_image.convert("RGBA"), (0, 0), pil_image.convert("RGBA"))
+ pil_image = background_img.convert('RGB')
+
+    # PNG and any other still formats fall through to a plain optimized save
+    pil_image.save(out_io, format=image_format, optimize=True, quality=95)
+    return out_io.getvalue()
+
+
+def get_image_from_url(
+ url: str,
+ image_format: str = "JPEG",
+ compress_images: bool = False,
+ max_image_size: int = 1_000_000
+) -> Tuple[bytes, str, str]:
+ """
+    Based on make_cover_from_url(), this function takes an image url (usually from the `src` attribute of
+    an image tag) and returns the image data, the image format, and the image mime type.
+
+ @param url: The url of the image
+ @param image_format: The format to convert the image to if it's not in the supported formats
+ @param compress_images: Whether to compress the image or not
+ @param max_image_size: The maximum size of the image in bytes
+ @return: A tuple of the image data, the image format and the image mime type
+ """
+ try:
+ if url.startswith("https://www.filepicker.io/api/"):
+ logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.")
+ url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95"
+ elif url.startswith("https://cdn3.fiction.live/images/") or url.startswith("https://ddx5i92cqts4o.cloudfront.net/images/"):
+ logger.warning("Converting url to cdn6. This might fail.")
+ url = f"https://cdn6.fiction.live/file/fictionlive/images/{url.split('/images/')[-1]}"
+ elif url.startswith("data:image") and 'base64' in url:
+ logger.info("Base64 image detected")
+ head, base64data = url.split(',')
+ file_ext = str(head.split(';')[0].split('/')[1])
+ imgdata = b64decode(base64data)
+ if compress_images:
+ if file_ext.lower() == "gif":
+ logger.info("GIF images should not be compressed, skipping compression")
+ else:
+ compressed_base64_image = compress_image(BytesIO(imgdata), max_image_size, file_ext)
+ imgdata = PIL_Image_to_bytes(compressed_base64_image, file_ext)
+
+ if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]:
+ logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}")
+                # _convert_to_new_format expects a file-like object, so wrap the raw bytes
+                return _convert_to_new_format(BytesIO(imgdata), image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
+ return imgdata, file_ext, f"image/{file_ext}"
+
+        logger.info(f"Downloading image from {url}")
+ img = requests.Session().get(url)
+ image = BytesIO(img.content)
+ image.seek(0)
+
+ PIL_image = Image.open(image)
+ img_format = str(PIL_image.format)
+
+        if img_format.lower() == "gif":
+            # Force the GIF89a version marker; GIF87a has no animation support
+            if PIL_image.info.get('version') not in (b"GIF89a", "GIF89a"):
+                PIL_image.info['version'] = b"GIF89a"
+            return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif"
+
+ if compress_images:
+ PIL_image = compress_image(image, max_image_size, img_format)
+
+        return PIL_Image_to_bytes(PIL_image, image_format), image_format.lower(), f"image/{image_format.lower()}"
+
+ except Exception as e:
+ logger.info("Encountered an error downloading image: " + str(e))
+ cover = make_image("There was a problem downloading this image.").read()
+ return cover, "jpeg", "image/jpeg"
+
+
+def _convert_to_new_format(image_bytestream, image_format: str):
+ new_image = BytesIO()
+ try:
+ Image.open(image_bytestream).save(new_image, format=image_format.upper())
+ new_image.name = f'cover.{image_format.lower()}'
+ new_image.seek(0)
+ except Exception as e:
+ logger.info(f"Encountered an error converting image to {image_format}\nError: {e}")
+ new_image = make_image("There was a problem converting this image.")
+ return new_image
+
+
+def _safe_font(preferred, *args, **kwargs):
+ for font in (preferred, "Helvetica", "FreeSans", "Arial"):
+ try:
+ return ImageFont.truetype(*args, font=font, **kwargs)
+ except IOError:
+ pass
+
+ # This is pretty terrible, but it'll work regardless of what fonts the
+ # system has. Worst issue: can't set the size.
+ return ImageFont.load_default()
+
+
+def draw_text_outlined(draw, xy, text, fill=None, font=None, anchor=None):
+ x, y = xy
+
+ # Outline
+ draw.text((x - 1, y), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
+ draw.text((x + 1, y), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
+ draw.text((x, y - 1), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
+ draw.text((x, y + 1), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
+
+ # Fill
+ draw.text(xy, text=text, fill=fill, font=font, anchor=anchor)
+
+
+if __name__ == '__main__':
+ f = make_image(
+ 'Test of a Title which is quite long and will require multiple lines')
+    # make_image() produces JPEG data, so use a matching extension
+    with open('output.jpeg', 'wb') as out:
+ out.write(f.read())
diff --git a/examples/pact.json b/examples/pact.json
new file mode 100644
index 0000000..eaf0740
--- /dev/null
+++ b/examples/pact.json
@@ -0,0 +1,11 @@
+{
+ "url": "https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/",
+ "title": "Pact",
+ "author": "Wildbow",
+ "content_selector": "#main",
+ "content_title_selector": "h1.entry-title",
+ "content_text_selector": ".entry-content",
+ "filter_selector": ".sharedaddy, style, a[href*='pactwebserial.wordpress.com']",
+ "next_selector": "a[rel=\"next\"]",
+ "cover_url": "https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/a456e440-ea22-45c0-8b39-dacf9bbddade/d7dxaz4-64cfabe8-f957-44af-aaea-82346c401b27.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOiIsImlzcyI6InVybjphcHA6Iiwib2JqIjpbW3sicGF0aCI6IlwvZlwvYTQ1NmU0NDAtZWEyMi00NWMwLThiMzktZGFjZjliYmRkYWRlXC9kN2R4YXo0LTY0Y2ZhYmU4LWY5NTctNDRhZi1hYWVhLTgyMzQ2YzQwMWIyNy5qcGcifV1dLCJhdWQiOlsidXJuOnNlcnZpY2U6ZmlsZS5kb3dubG9hZCJdfQ.J-Wn8bDrKmoKKZW8mkJdi3uRoDV2FDJQZ_TuTWvQazY"
+}
diff --git a/examples/pale-withextras.json b/examples/pale-withextras.json
index db8a973..b548bf7 100644
--- a/examples/pale-withextras.json
+++ b/examples/pale-withextras.json
@@ -6,5 +6,6 @@
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
- "next_selector": "a[rel=\"next\"]"
+ "next_selector": "a[rel=\"next\"]",
+ "image_selector": ".entry-content img"
}
diff --git a/examples/pale.json b/examples/pale.json
index 6e053fe..111f786 100644
--- a/examples/pale.json
+++ b/examples/pale.json
@@ -1,8 +1,11 @@
{
- "url": "https://palewebserial.wordpress.com/table-of-contents/",
- "title": "Pale",
- "author": "Wildbow",
- "chapter_selector": "article .entry-content > p a",
- "content_selector": "article .entry-content",
- "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']"
+ "url": "https://palewebserial.wordpress.com/table-of-contents/",
+ "title": "Pale",
+ "author": "Wildbow",
+ "content_selector": "#main",
+ "content_title_selector": "h1.entry-title",
+ "content_text_selector": ".entry-content",
+ "chapter_selector": "article .entry-content > p a",
+ "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
+ "image_selector": ".entry-content img"
}
diff --git a/examples/practical_all.json b/examples/practical_all.json
new file mode 100644
index 0000000..9339bda
--- /dev/null
+++ b/examples/practical_all.json
@@ -0,0 +1,11 @@
+{
+ "url": "https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/",
+ "title": "A Practical Guide To Evil",
+ "author": "erraticerrata",
+ "content_selector": "#main .entry-wrapper",
+ "content_title_selector": "h1.entry-title",
+ "content_text_selector": ".entry-content",
+ "filter_selector": ".sharedaddy, .wpcnt, style",
+ "next_selector": "a[rel=\"next\"]",
+ "cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
+}
\ No newline at end of file
diff --git a/examples/twig.json b/examples/twig.json
new file mode 100644
index 0000000..85490c5
--- /dev/null
+++ b/examples/twig.json
@@ -0,0 +1,11 @@
+{
+ "url": "https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/",
+ "title": "Twig",
+ "author": "Wildbow",
+ "content_selector": "#main",
+ "content_title_selector": "h1.entry-title",
+ "content_text_selector": ".entry-content",
+ "filter_selector": ".sharedaddy, style, a[href*='twigserial.wordpress.com']",
+ "next_selector": "a[rel=\"next\"]",
+ "cover_url": "https://twigserial.files.wordpress.com/2015/03/cropped-twig-commission-titled1.png"
+}
diff --git a/examples/unsong.json b/examples/unsong.json
index e8192f5..8af8ddb 100644
--- a/examples/unsong.json
+++ b/examples/unsong.json
@@ -1,10 +1,10 @@
{
- "url": "https://unsongbook.com/prologue-2/",
- "title": "Unsong",
- "author": "Scott Alexander",
- "content_selector": "#pjgm-content",
- "content_title_selector": ".pjgm-posttitle",
- "content_text_selector": ".pjgm-postcontent",
- "filter_selector": ".sharedaddy",
- "next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
+ "url": "https://unsongbook.com/prologue-2/",
+ "title": "Unsong",
+ "author": "Scott Alexander",
+ "content_selector": "#pjgm-content",
+ "content_title_selector": ".pjgm-posttitle",
+ "content_text_selector": ".pjgm-postcontent",
+ "filter_selector": ".sharedaddy",
+ "next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
}
diff --git a/leech.py b/leech.py
index 638aa86..29cc7cc 100755
--- a/leech.py
+++ b/leech.py
@@ -58,18 +58,26 @@ def load_on_disk_options(site):
with open('leech.json') as store_file:
store = json.load(store_file)
login = store.get('logins', {}).get(site.site_key(), False)
+ image_bool: bool = store.get('images', False)
+ image_format: str = store.get('image_format', 'jpeg')
+ compress_images: bool = store.get('compress_images', False)
+ max_image_size: int = store.get('max_image_size', 1_000_000)
configured_site_options = store.get('site_options', {}).get(site.site_key(), {})
cover_options = store.get('cover', {})
output_dir = store.get('output_dir', False)
except FileNotFoundError:
logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
login = False
+ image_bool = False
+ image_format = 'jpeg'
+ compress_images = False
+ max_image_size = 1_000_000
configured_site_options = {}
cover_options = {}
output_dir = False
if output_dir and 'output_dir' not in configured_site_options:
configured_site_options['output_dir'] = output_dir
- return configured_site_options, login, cover_options
+ return configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size
def create_options(site, site_options, unused_flags):
@@ -80,7 +88,7 @@ def create_options(site, site_options, unused_flags):
flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)
- configured_site_options, login, cover_options = load_on_disk_options(site)
+    (configured_site_options, login, cover_options, image_bool,
+     image_format, compress_images, max_image_size) = load_on_disk_options(site)
overridden_site_options = json.loads(site_options)
@@ -91,7 +99,8 @@ def create_options(site, site_options, unused_flags):
list(configured_site_options.items()) +
list(overridden_site_options.items()) +
list(flag_specified_site_options.items()) +
- list(cover_options.items())
+ list(cover_options.items()) +
+        list({
+            'image_bool': image_bool,
+            'image_format': image_format,
+            'compress_images': compress_images,
+            'max_image_size': max_image_size
+        }.items())
)
return options, login
@@ -158,7 +167,7 @@ def flush(verbose):
@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
@site_specific_options # Includes other click.options specific to sites
def download(urls, site_options, cache, verbose, normalize, output_dir, **other_flags):
- """Downloads a story and saves it on disk as a ebpub ebook."""
+ """Downloads a story and saves it on disk as an epub ebook."""
configure_logging(verbose)
session = create_session(cache)
@@ -169,6 +178,12 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
if story:
filename = ebook.generate_epub(
story, options,
+ image_options={
+                'image_bool': options.get('image_bool', False),
+                'image_format': options.get('image_format', 'jpeg'),
+                'compress_images': options.get('compress_images', False),
+                'max_image_size': options.get('max_image_size', 1_000_000)
+ },
normalize=normalize,
output_dir=output_dir or options.get('output_dir', os.getcwd())
)
diff --git a/sites/__init__.py b/sites/__init__.py
index c45bba1..7e93a50 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -21,11 +21,19 @@ def _default_uuid_string(self):
return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))
+@attr.s
+class Image:
+ path = attr.ib()
+ contents = attr.ib()
+ content_type = attr.ib()
+
+
@attr.s
class Chapter:
title = attr.ib()
contents = attr.ib()
date = attr.ib(default=False)
+ images = attr.ib(default=attr.Factory(list))
@attr.s
diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 5bb3cd2..21fae8b 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -6,7 +6,8 @@
import json
import re
import os.path
-from . import register, Site, Section, Chapter
+import urllib.parse
+from . import register, Site, Section, Chapter, Image
logger = logging.getLogger(__name__)
@@ -42,6 +43,9 @@ class SiteDefinition:
filter_selector = attr.ib(default=False)
cover_url = attr.ib(default='')
+    # If present, use this selector to also download images and embed them into the epub.
+ image_selector = attr.ib(default=False)
+
@register
class Arbitrary(Site):
@@ -132,11 +136,42 @@ def _chapter(self, url, definition, title=False):
self._clean(content)
+ images = []
+ if definition.image_selector:
+ images = self.load_images(content, definition.image_selector)
+
chapters.append(Chapter(
title=title,
contents=content.prettify(),
# TODO: better date detection
date=datetime.datetime.now(),
+ images=images
))
return chapters
+
+ def load_images(self, content, selector):
+ images = []
+ for image in content.select(selector):
+ if not image.has_attr('src'):
+ continue
+
+ image_url = image['src']
+ url = urllib.parse.urlparse(image_url)
+ local_path = 'chapter_images/' + url.path.strip('/')
+
+            image_res = self.session.get(image_url)
+            if not image_res.ok:
+                logger.warning(f"Failed to download image {image_url} (HTTP {image_res.status_code}), skipping")
+                continue
+            content_type = image_res.headers['Content-Type']
+            image_data = image_res.content
+
+ images.append(Image(
+ path=local_path,
+ contents=image_data,
+ content_type=content_type
+ ))
+            # Point the tag's src at the copy bundled in the epub
+            image['src'] = '../' + local_path
+            if image.has_attr('srcset'):
+                # Drop srcset so readers use the embedded copy instead of the remote one
+                del image['srcset']
+
+ return images
diff --git a/sites/xenforo.py b/sites/xenforo.py
index df1283e..42a4e5f 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -284,19 +284,36 @@ def _clean_chapter(self, post, chapterid):
def _clean_spoilers(self, post, chapterid):
# spoilers don't work well, so turn them into epub footnotes
for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
- spoiler_title = spoiler.find(class_='SpoilerTitle')
- if self.options['skip_spoilers']:
- link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
- if spoiler_title:
- link.string = spoiler_title.get_text()
+            spoiler_target = spoiler.find(class_='SpoilerTarget')
+
+            # This is a bit of a hack, but it works:
+            # spoilers that contain images are kept, so their images get downloaded
+            spoiler_images = spoiler_target.find_all('img')
+            if spoiler_images:
+                for index, img in enumerate(spoiler_images):
+                    # For some weird reason, the images are duplicated, so drop every other one.
+                    # enumerate() is used here because list.index() would return the first
+                    # matching tag, which breaks when the duplicates compare equal.
+                    if index % 2 == 0:
+                        img.decompose()
+                    else:
+                        if not img.has_attr('src'):
+                            img['src'] = img.get('data-url', '')
+                        if img['src'].startswith('proxy.php'):
+                            img['src'] = f"{self.domain}/{img['src']}"
+                spoiler.replace_with(spoiler_target)
else:
- if spoiler_title:
- link = f'[SPOILER: {spoiler_title.get_text()}]'
+ spoiler_title = spoiler.find(class_='SpoilerTitle')
+ if self.options['skip_spoilers']:
+                    link = self._footnote(spoiler_target.extract(), chapterid)
+ if spoiler_title:
+ link.string = spoiler_title.get_text()
else:
- link = '[SPOILER]'
- new_spoiler = self._new_tag('div', class_="leech-spoiler")
- new_spoiler.append(link)
- spoiler.replace_with(new_spoiler)
+ if spoiler_title:
+ link = f'[SPOILER: {spoiler_title.get_text()}]'
+ else:
+ link = '[SPOILER]'
+ new_spoiler = self._new_tag('div', class_="leech-spoiler")
+ new_spoiler.append(link)
+ spoiler.replace_with(new_spoiler)
def _post_date(self, post):
maybe_date = post.find(class_='DateTime')