diff --git a/.gitignore b/.gitignore
index fd1c91c..b4c062e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,3 +58,6 @@ coverage.xml
 
 # Sphinx documentation
 docs/_build/
+
+# PyCharm
+.idea/
diff --git a/README.markdown b/README.markdown
index c794c76..5af29b8 100644
--- a/README.markdown
+++ b/README.markdown
@@ -49,6 +49,27 @@ Supports
 * Sta.sh
 * Completely arbitrary sites, with a bit more work (see below)
 
+Image support
+---
+
+Leech creates EPUB 2.0.1 files, which means that Leech can only save images in the following
+formats:
+
+- JPEG (JPG/JFIF)
+- PNG
+- GIF
+
+See the [Open Publication Structure (OPS) 2.0.1](https://idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#TOC2.3.4) for more information.
+
+Leech cannot save SVG images, because SVG is not supported by Pillow.
+
+Leech uses [Pillow](https://pillow.readthedocs.io/en/stable/index.html) for image manipulation and conversion. If you want to use a different
+image format, you can install the required dependencies for Pillow, but you will probably also have to tinker with Leech itself. See the [Pillow documentation](https://pillow.readthedocs.io/en/stable/installation.html#external-libraries) for more information.
+
+By default, Leech will try to save all non-animated images as JPEG; the only animated images Leech will save are GIFs.
+
+To configure image support, create a file called `leech.json`. See the Configuration section below for more information.
+
 Configuration
 ---
 
@@ -61,6 +82,10 @@ Example:
     "logins": {
         "QuestionableQuesting": ["username", "password"]
     },
+    "images": true,
+    "image_format": "png",
+    "compress_images": true,
+    "max_image_size": 100000,
     "cover": {
         "fontname": "Comic Sans MS",
         "fontsize": 30,
@@ -76,6 +101,30 @@ Example:
         }
     }
 }
 ```
+
+> Note: The `images` key is a boolean and can only be `true` or `false` (booleans in JSON are written in lowercase).
+> If it is `false`, Leech will not download any images, and it will also ignore the `image_format` key.
+
+> Note: If the `image_format` key is missing, Leech defaults to `jpeg`.
+> The three supported formats are `jpeg`, `png`, and `gif`; the key is case-insensitive.
+
+> Note: The `compress_images` key tells Leech to compress images; this is only supported for `jpeg` and `png`.
+> It works together with the `max_image_size` key: if `compress_images` is `true` but there is no `max_image_size` key,
+> Leech compresses each image to under 1MB (1000000 bytes). If `max_image_size` is present, Leech compresses each image
+> to under that many bytes. If `compress_images` is `false`, Leech ignores the `max_image_size` key.
+
+> Warning: Compressing images can make downloading take considerably longer.
+
+> Warning: Compressing images can reduce image quality.
+
+> Warning: `max_image_size` is not a hard limit; Leech tries to compress each image to fit under it, but it may not
+> be able to hit the exact target.
+
+> Warning: `max_image_size` should not be too small. With `max_image_size` set to 1000, Leech will probably not manage
+> to compress an image down to 1000 bytes, whereas a target like 1000000 bytes is usually achievable.
+
+> Warning: Leech will not compress GIFs, since compression might damage the animation.
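+
+For example, if you just want images embedded with the defaults described above, this
+minimal `leech.json` should be enough:
+
+```
+{
+    "images": true
+}
+```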
 
 Arbitrary Sites
 ---
diff --git a/ebook/__init__.py b/ebook/__init__.py
index 7810c21..635dafd 100644
--- a/ebook/__init__.py
+++ b/ebook/__init__.py
@@ -1,6 +1,8 @@
 from .epub import make_epub, EpubFile
-from .cover import make_cover
-from .cover import make_cover_from_url
+from .cover import make_cover, make_cover_from_url
+from .image import get_image_from_url
+from sites import Image
+from bs4 import BeautifulSoup
 
 import html
 import unicodedata
@@ -72,34 +74,91 @@ class CoverOptions:
     height = attr.ib(default=None, converter=attr.converters.optional(int))
     wrapat = attr.ib(default=None, converter=attr.converters.optional(int))
     bgcolor = attr.ib(default=None, converter=attr.converters.optional(tuple))
-    textcolor = attr.ib(default=None, converter=attr.converters.optional(tuple))
+    textcolor = attr.ib(
+        default=None, converter=attr.converters.optional(tuple))
     cover_url = attr.ib(default=None, converter=attr.converters.optional(str))
 
 
-def chapter_html(story, titleprefix=None, normalize=False):
+def chapter_html(
+    story,
+    image_bool=False,
+    image_format="JPEG",
+    compress_images=False,
+    max_image_size=1_000_000,
+    titleprefix=None,
+    normalize=False
+):
     chapters = []
     for i, chapter in enumerate(story):
         title = chapter.title or f'#{i}'
         if hasattr(chapter, '__iter__'):
             # This is a Section; pass the image options through so nested
             # chapters get their images handled too.
-            chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
+            chapters.extend(chapter_html(
+                chapter,
+                image_bool=image_bool,
+                image_format=image_format,
+                compress_images=compress_images,
+                max_image_size=max_image_size,
+                titleprefix=title,
+                normalize=normalize
+            ))
         else:
+            soup = BeautifulSoup(chapter.contents, 'html5lib')
+            if image_bool:
+                all_images = soup.find_all('img')
+                len_of_all_images = len(all_images)
+                print(f"Found {len_of_all_images} images in chapter {i}")
+
+                for count, img in enumerate(all_images):
+                    if not img.has_attr('src'):
+                        print(f"Image {count} has no src attribute, skipping...")
+                        continue
+                    print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
+                    img_contents = get_image_from_url(img['src'], image_format, compress_images, max_image_size)
+                    chapter.images.append(Image(
+                        path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
+                        contents=img_contents[0],
+                        content_type=img_contents[2]
+                    ))
+                    img['src'] = f"../images/ch{i}_leechimage_{count}.{img_contents[1]}"
+                    if not img.has_attr('alt'):
+                        img['alt'] = f"Image {count} from chapter {i}"
+                # Add all of this chapter's images to the book as well.
+                for image in chapter.images:
+                    # For/else syntax: only add the image if its path doesn't
+                    # already exist, since duplicates are not allowed in the
+                    # EPUB format.
+                    for other_file in chapters:
+                        if other_file.path == image.path:
+                            break
+                    else:
+                        chapters.append(EpubFile(
+                            path=image.path, contents=image.contents, filetype=image.content_type))
+            else:
+                # Remove all images from the chapter, so you don't get an
+                # annoying grey placeholder where each image would have been.
+                for img in soup.find_all('img'):
+                    if img.parent.name.lower() == "figure":
+                        img.parent.decompose()
+                    else:
+                        img.decompose()
+
             title = titleprefix and f'{titleprefix}: {title}' or title
-            contents = chapter.contents
+            contents = str(soup)
             if normalize:
                 title = unicodedata.normalize('NFKC', title)
                 contents = unicodedata.normalize('NFKC', contents)
             chapters.append(EpubFile(
                 title=title,
                 path=f'{story.id}/chapter{i + 1}.html',
-                contents=html_template.format(title=html.escape(title), text=contents)
+                contents=html_template.format(
+                    title=html.escape(title), text=contents)
             ))
     if story.footnotes:
-        chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
+        chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
+            title="Footnotes", text='\n\n'.join(story.footnotes))))
     return chapters
 
 
-def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False):
+def generate_epub(story, cover_options={}, image_options=None, output_filename=None, output_dir=None, normalize=False):
+    if image_options is None:
+        image_options = {
+            'image_bool': False,
+            'image_format': 'JPEG',
+            'compress_images': False,
+            'max_image_size': 1_000_000
+        }
     dates = list(story.dates())
     metadata = {
         'title': story.title,
@@ -117,14 +176,19 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
         extra_metadata['Tags'] = ', '.join(story.tags)
 
     if extra_metadata:
-        metadata['extra'] = '\n        '.join(f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())
+        metadata['extra'] = '\n        '.join(
+            f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())
 
-    valid_cover_options = ('fontname', 'fontsize', 'width', 'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
-    cover_options = CoverOptions(**{k: v for k, v in cover_options.items() if k in valid_cover_options})
-    cover_options = attr.asdict(cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)
+    valid_cover_options = ('fontname', 'fontsize', 'width',
+                           'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
+    cover_options = CoverOptions(
+        **{k: v for k, v in cover_options.items() if k in valid_cover_options})
+    cover_options = attr.asdict(
+        cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)
 
     if cover_options and "cover_url" in cover_options:
-        image = make_cover_from_url(cover_options["cover_url"], story.title, story.author)
+        image = make_cover_from_url(
+            cover_options["cover_url"], story.title, story.author)
     elif story.cover_url:
         image = make_cover_from_url(story.cover_url, story.title, story.author)
     else:
@@ -135,10 +199,24 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
         [
             # The cover is static, and the only change comes from the image which we generate
             EpubFile(title='Cover', path='cover.html', contents=cover_template),
-            EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(now=datetime.datetime.now(), **metadata)),
-            *chapter_html(story, normalize=normalize),
-            EpubFile(path='Styles/base.css', contents=requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, filetype='text/css'),
-            EpubFile(path='images/cover.png', contents=image.read(), filetype='image/png'),
+            EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(
+                now=datetime.datetime.now(), **metadata)),
+            *chapter_html(
+                story,
+                image_bool=image_options.get('image_bool'),
+                image_format=image_options.get('image_format'),
+                compress_images=image_options.get('compress_images'),
+                max_image_size=image_options.get('max_image_size'),
+                normalize=normalize
+            ),
+            EpubFile(
+                path='Styles/base.css',
+                contents=requests.Session().get(
+                    'https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text,
+                filetype='text/css'
+            ),
+            EpubFile(path='images/cover.png',
+                     contents=image.read(), filetype='image/png'),
         ],
         metadata,
         output_dir=output_dir
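For orientation (this sketch is not part of the patch), callers are expected to hand the new
`image_options` dict to `generate_epub` roughly like so, where `story` stands in for any parsed
story object from a site module:

```
import ebook

filename = ebook.generate_epub(
    story,
    image_options={
        'image_bool': True,        # actually embed images
        'image_format': 'png',     # target format for non-animated images
        'compress_images': True,
        'max_image_size': 500_000  # soft target, in bytes
    },
)
```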
diff --git a/ebook/image.py b/ebook/image.py
new file mode 100644
index 0000000..b89b59b
--- /dev/null
+++ b/ebook/image.py
@@ -0,0 +1,222 @@
+# Basically the same as cover.py, with some minor differences
+import PIL
+from PIL import Image, ImageDraw, ImageFont
+from io import BytesIO
+from base64 import b64decode
+import math
+import textwrap
+import requests
+import logging
+
+from typing import Tuple
+
+logger = logging.getLogger(__name__)
+
+
+def make_image(
+    message: str,
+    width=600,
+    height=300,
+    fontname="Helvetica",
+    font_size=40,
+    bg_color=(0, 0, 0),
+    textcolor=(255, 255, 255),
+    wrap_at=30
+):
+    """
+    Make a placeholder image; this should only be called if get_image_from_url() fails.
+    """
+    img = Image.new("RGB", (width, height), bg_color)
+    draw = ImageDraw.Draw(img)
+
+    message = textwrap.fill(message, wrap_at)
+
+    font = _safe_font(fontname, size=font_size)
+    message_size = draw.textsize(message, font=font)
+    draw_text_outlined(
+        draw, ((width - message_size[0]) / 2, 100), message, textcolor, font=font)
+
+    output = BytesIO()
+    img.save(output, "JPEG")
+    output.name = 'cover.jpeg'
+    # Writing left the cursor at the end of the file, so reset it.
+    output.seek(0)
+    return output
+
+
+def get_size_format(b, factor=1000, suffix="B"):
+    """
+    Scale a byte count to a human-readable string, e.g. (with the default factor of 1000):
+        1253656 => '1.25MB'
+        1253656678 => '1.25GB'
+    """
+    for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
+        if b < factor:
+            return f"{b:.2f}{unit}{suffix}"
+        b /= factor
+    return f"{b:.2f}Y{suffix}"
+
+
+def compress_image(image: BytesIO, target_size: int, image_format: str) -> PIL.Image.Image:
+    image_size = get_size_format(len(image.getvalue()))
+    logger.info(f"Image size: {image_size}")
+
+    big_photo = Image.open(image).convert("RGBA")
+
+    # Heuristic: assume roughly 2.8114 pixels per byte of compressed output,
+    # so this is the pixel count to aim for to land near target_size bytes.
+    target_pixel_count = 2.8114 * target_size
+    if len(image.getvalue()) > target_size:
+        logger.info(f"Image is greater than {get_size_format(target_size)}, compressing")
+        scale_factor = target_pixel_count / math.prod(big_photo.size)
+        if scale_factor < 1:
+            x, y = tuple(int(scale_factor * dim) for dim in big_photo.size)
+            logger.info(f"Resizing image dimensions from {big_photo.size} to ({x}, {y})")
+            sml_photo = big_photo.resize((x, y), resample=Image.LANCZOS)
+        else:
+            sml_photo = big_photo
+        compressed_image_size = get_size_format(len(PIL_Image_to_bytes(sml_photo, image_format)))
+        logger.info(f"Compressed image size: {compressed_image_size}")
+        return sml_photo
+    else:
+        logger.info(f"Image is less than {get_size_format(target_size)}, not compressing")
+        return big_photo
+
+
+def PIL_Image_to_bytes(
+    pil_image: PIL.Image.Image,
+    image_format: str
+) -> bytes:
+    out_io = BytesIO()
+    if image_format.lower().startswith("gif"):
+        # Re-compose each GIF frame onto the previous one, so partial-frame
+        # GIFs survive the round trip, then save all frames.
+        frames = []
+        current = pil_image.convert('RGBA')
+        while True:
+            try:
+                frames.append(current)
+                pil_image.seek(pil_image.tell() + 1)
+                current = Image.alpha_composite(current, pil_image.convert('RGBA'))
+            except EOFError:
+                break
+        frames[0].save(out_io, format=image_format, save_all=True,
+                       append_images=frames[1:], optimize=True, loop=0)
+        return out_io.getvalue()
+
+    elif image_format.lower() in ["jpeg", "jpg"]:
+        # JPEG has no alpha channel, so create a new image with a white
+        # background and paste the image on top of it.
+        background_img = Image.new('RGBA', pil_image.size, "white")
+        background_img.paste(pil_image.convert("RGBA"), (0, 0), pil_image.convert("RGBA"))
+        pil_image = background_img.convert('RGB')
+
+    pil_image.save(out_io, format=image_format, optimize=True, quality=95)
+    return out_io.getvalue()
+
+
+def get_image_from_url(
+    url: str,
+    image_format: str = "JPEG",
+    compress_images: bool = False,
+    max_image_size: int = 1_000_000
+) -> Tuple[bytes, str, str]:
+    """
+    Based on make_cover_from_url(), this function takes the image url (usually from the `src`
+    attribute of an image tag) and returns the image data, the image format and the image mime type.
+
+    @param url: The url of the image
+    @param image_format: The format to convert the image to if it's not in the supported formats
+    @param compress_images: Whether to compress the image or not
+    @param max_image_size: The maximum size of the image in bytes
+    @return: A tuple of the image data, the image format and the image mime type
+    """
+    try:
+        if url.startswith("https://www.filepicker.io/api/"):
+            logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.")
+            url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95"
+        elif url.startswith("https://cdn3.fiction.live/images/") or url.startswith("https://ddx5i92cqts4o.cloudfront.net/images/"):
+            logger.warning("Converting url to cdn6. This might fail.")
+            url = f"https://cdn6.fiction.live/file/fictionlive/images/{url.split('/images/')[-1]}"
+        elif url.startswith("data:image") and 'base64' in url:
+            logger.info("Base64 image detected")
+            head, base64data = url.split(',')
+            file_ext = str(head.split(';')[0].split('/')[1])
+            imgdata = b64decode(base64data)
+            if compress_images:
+                if file_ext.lower() == "gif":
+                    logger.info("GIF images should not be compressed, skipping compression")
+                else:
+                    compressed_base64_image = compress_image(BytesIO(imgdata), max_image_size, file_ext)
+                    imgdata = PIL_Image_to_bytes(compressed_base64_image, file_ext)
+
+            if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]:
+                logger.info(f"Image format {file_ext} not supported by EPUB 2.0.1, converting to {image_format}")
+                return _convert_to_new_format(BytesIO(imgdata), image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
+            return imgdata, file_ext, f"image/{file_ext}"
+
+        # Completes the "Source: " progress line started in chapter_html().
+        print(url)
+        img = requests.Session().get(url)
+        image = BytesIO(img.content)
+        image.seek(0)
+
+        PIL_image = Image.open(image)
+        img_format = str(PIL_image.format)
+
+        if img_format.lower() == "gif":
+            PIL_image = Image.open(image)
+            # EPUB 2.0.1 expects GIF89a, so patch older version markers.
+            if PIL_image.info.get('version') not in [b"GIF89a", "GIF89a"]:
+                PIL_image.info['version'] = b"GIF89a"
+            return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif"
+
+        if compress_images:
+            PIL_image = compress_image(image, max_image_size, img_format)
+
+        return PIL_Image_to_bytes(PIL_image, image_format), image_format.lower(), f"image/{image_format.lower()}"
+
+    except Exception as e:
+        logger.info("Encountered an error downloading image: " + str(e))
+        cover = make_image("There was a problem downloading this image.").read()
+        return cover, "jpeg", "image/jpeg"
+
+
+def _convert_to_new_format(image_bytestream, image_format: str):
+    new_image = BytesIO()
+    try:
+        Image.open(image_bytestream).save(new_image, format=image_format.upper())
+        new_image.name = f'cover.{image_format.lower()}'
+        new_image.seek(0)
+    except Exception as e:
+        logger.info(f"Encountered an error converting image to {image_format}\nError: {e}")
+        new_image = make_image("There was a problem converting this image.")
+    return new_image
+
+
+def _safe_font(preferred, *args, **kwargs):
+    for font in (preferred, "Helvetica", "FreeSans", "Arial"):
+        try:
+            return ImageFont.truetype(*args, font=font, **kwargs)
+        except IOError:
+            pass
+
+    # This is pretty terrible, but it'll work regardless of what fonts the
+    # system has. Worst issue: can't set the size.
+    return ImageFont.load_default()
+
+
+def draw_text_outlined(draw, xy, text, fill=None, font=None, anchor=None):
+    x, y = xy
+
+    # Outline
+    draw.text((x - 1, y), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
+    draw.text((x + 1, y), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
+    draw.text((x, y - 1), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
+    draw.text((x, y + 1), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
+
+    # Fill
+    draw.text(xy, text=text, fill=fill, font=font, anchor=anchor)
+
+
+if __name__ == '__main__':
+    f = make_image(
+        'Test of a Title which is quite long and will require multiple lines')
+    with open('output.jpeg', 'wb') as out:
+        out.write(f.read())
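To make the new module's contract concrete, here is an illustrative call to
`get_image_from_url` (the URL is hypothetical, and the WEBP case assumes a Pillow build with
WEBP support; the three-element return value is as documented in the docstring above):

```
from ebook.image import get_image_from_url

# A WEBP source exercises the conversion path, since WEBP is not an
# EPUB 2.0.1-safe format and will be converted to image_format.
data, ext, mime = get_image_from_url(
    "https://example.com/picture.webp",
    image_format="jpeg",
    compress_images=True,
    max_image_size=1_000_000,
)
print(ext, mime, len(data))  # e.g. "jpeg image/jpeg 412398"
```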
diff --git a/examples/pact.json b/examples/pact.json
new file mode 100644
index 0000000..eaf0740
--- /dev/null
+++ b/examples/pact.json
@@ -0,0 +1,11 @@
+{
+    "url": "https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/",
+    "title": "Pact",
+    "author": "Wildbow",
+    "content_selector": "#main",
+    "content_title_selector": "h1.entry-title",
+    "content_text_selector": ".entry-content",
+    "filter_selector": ".sharedaddy, style, a[href*='pactwebserial.wordpress.com']",
+    "next_selector": "a[rel=\"next\"]",
+    "cover_url": "https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/a456e440-ea22-45c0-8b39-dacf9bbddade/d7dxaz4-64cfabe8-f957-44af-aaea-82346c401b27.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOiIsImlzcyI6InVybjphcHA6Iiwib2JqIjpbW3sicGF0aCI6IlwvZlwvYTQ1NmU0NDAtZWEyMi00NWMwLThiMzktZGFjZjliYmRkYWRlXC9kN2R4YXo0LTY0Y2ZhYmU4LWY5NTctNDRhZi1hYWVhLTgyMzQ2YzQwMWIyNy5qcGcifV1dLCJhdWQiOlsidXJuOnNlcnZpY2U6ZmlsZS5kb3dubG9hZCJdfQ.J-Wn8bDrKmoKKZW8mkJdi3uRoDV2FDJQZ_TuTWvQazY"
+}
diff --git a/examples/pale-withextras.json b/examples/pale-withextras.json
index db8a973..b548bf7 100644
--- a/examples/pale-withextras.json
+++ b/examples/pale-withextras.json
@@ -6,5 +6,6 @@
     "content_title_selector": "h1.entry-title",
     "content_text_selector": ".entry-content",
     "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
-    "next_selector": "a[rel=\"next\"]"
+    "next_selector": "a[rel=\"next\"]",
+    "image_selector": ".entry-content img"
 }
diff --git a/examples/pale.json b/examples/pale.json
index 6e053fe..111f786 100644
--- a/examples/pale.json
+++ b/examples/pale.json
@@ -1,8 +1,11 @@
 {
-    "url": "https://palewebserial.wordpress.com/table-of-contents/",
-    "title": "Pale",
-    "author": "Wildbow",
-    "chapter_selector": "article .entry-content > p a",
-    "content_selector": "article .entry-content",
-    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']"
+    "url": "https://palewebserial.wordpress.com/table-of-contents/",
+    "title": "Pale",
+    "author": "Wildbow",
+    "content_selector": "#main",
+    "content_title_selector": "h1.entry-title",
+    "content_text_selector": ".entry-content",
+    "chapter_selector": "article .entry-content > p a",
+    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
+    "image_selector": ".entry-content img"
 }
diff --git a/examples/practical_all.json b/examples/practical_all.json
new file mode 100644
index 0000000..9339bda
--- /dev/null
+++ b/examples/practical_all.json
@@ -0,0 +1,11 @@
+{
+    "url": "https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/",
+    "title": "A Practical Guide To Evil",
+    "author": "erraticerrata",
+    "content_selector": "#main .entry-wrapper",
+    "content_title_selector": "h1.entry-title",
+    "content_text_selector": ".entry-content",
+    "filter_selector": ".sharedaddy, .wpcnt, style",
+    "next_selector": "a[rel=\"next\"]",
+    "cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
+}
\ No newline at end of file
diff --git a/examples/twig.json b/examples/twig.json
new file mode 100644
index 0000000..85490c5
--- /dev/null
+++ b/examples/twig.json
@@ -0,0 +1,11 @@
+{
+    "url": "https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/",
+    "title": "Twig",
+    "author": "Wildbow",
+    "content_selector": "#main",
+    "content_title_selector": "h1.entry-title",
+    "content_text_selector": ".entry-content",
+    "filter_selector": ".sharedaddy, style, a[href*='twigserial.wordpress.com']",
+    "next_selector": "a[rel=\"next\"]",
+    "cover_url": "https://twigserial.files.wordpress.com/2015/03/cropped-twig-commission-titled1.png"
+}
diff --git a/examples/unsong.json b/examples/unsong.json
index e8192f5..8af8ddb 100644
--- a/examples/unsong.json
+++ b/examples/unsong.json
@@ -1,10 +1,10 @@
 {
-    "url": "https://unsongbook.com/prologue-2/",
-    "title": "Unsong",
-    "author": "Scott Alexander",
-    "content_selector": "#pjgm-content",
-    "content_title_selector": ".pjgm-posttitle",
-    "content_text_selector": ".pjgm-postcontent",
-    "filter_selector": ".sharedaddy",
-    "next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
+    "url": "https://unsongbook.com/prologue-2/",
+    "title": "Unsong",
+    "author": "Scott Alexander",
+    "content_selector": "#pjgm-content",
+    "content_title_selector": ".pjgm-posttitle",
+    "content_text_selector": ".pjgm-postcontent",
+    "filter_selector": ".sharedaddy",
+    "next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
 }
diff --git a/leech.py b/leech.py
index 638aa86..29cc7cc 100755
--- a/leech.py
+++ b/leech.py
@@ -58,18 +58,26 @@ def load_on_disk_options(site):
         with open('leech.json') as store_file:
             store = json.load(store_file)
             login = store.get('logins', {}).get(site.site_key(), False)
+            image_bool: bool = store.get('images', False)
+            image_format: str = store.get('image_format', 'jpeg')
+            compress_images: bool = store.get('compress_images', False)
+            max_image_size: int = store.get('max_image_size', 1_000_000)
             configured_site_options = store.get('site_options', {}).get(site.site_key(), {})
             cover_options = store.get('cover', {})
             output_dir = store.get('output_dir', False)
     except FileNotFoundError:
         logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
         login = False
+        image_bool = False
+        image_format = 'jpeg'
+        compress_images = False
+        max_image_size = 1_000_000
         configured_site_options = {}
         cover_options = {}
         output_dir = False
     if output_dir and 'output_dir' not in configured_site_options:
         configured_site_options['output_dir'] = output_dir
-    return configured_site_options, login, cover_options
+    return configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size
@@ -80,7 +88,7 @@ def create_options(site, site_options, unused_flags):
 
     flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)
 
-    configured_site_options, login, cover_options = load_on_disk_options(site)
+    configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size = load_on_disk_options(site)
 
     overridden_site_options = json.loads(site_options)
 
@@ -91,7 +99,8 @@ def create_options(site, site_options, unused_flags):
         list(configured_site_options.items()) +
         list(overridden_site_options.items()) +
         list(flag_specified_site_options.items()) +
-        list(cover_options.items())
+        list(cover_options.items()) +
+        list({'image_bool': image_bool, 'image_format': image_format, 'compress_images': compress_images, 'max_image_size': max_image_size}.items())
     )
 
     return options, login
@@ -158,7 +167,7 @@ def flush(verbose):
 @click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
 @site_specific_options  # Includes other click.options specific to sites
 def download(urls, site_options, cache, verbose, normalize, output_dir, **other_flags):
-    """Downloads a story and saves it on disk as a ebpub ebook."""
+    """Downloads a story and saves it on disk as an epub ebook."""
     configure_logging(verbose)
     session = create_session(cache)
 
@@ -169,6 +178,12 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_flags):
         if story:
             filename = ebook.generate_epub(
                 story, options,
+                image_options={
+                    'image_bool': options.get('image_bool', False),
+                    'image_format': options.get('image_format', 'jpeg'),
+                    'compress_images': options.get('compress_images', False),
+                    'max_image_size': options.get('max_image_size', 1_000_000)
+                },
                 normalize=normalize,
                 output_dir=output_dir or options.get('output_dir', os.getcwd())
             )
diff --git a/sites/__init__.py b/sites/__init__.py
index c45bba1..7e93a50 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -21,11 +21,19 @@ def _default_uuid_string(self):
     return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))
 
 
+@attr.s
+class Image:
+    path = attr.ib()
+    contents = attr.ib()
+    content_type = attr.ib()
+
+
 @attr.s
 class Chapter:
     title = attr.ib()
     contents = attr.ib()
     date = attr.ib(default=False)
+    images = attr.ib(default=attr.Factory(list))
 
 
 @attr.s
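The new `Image` record and the `images` list on `Chapter` define the contract that site
modules fill in; a minimal sketch (all values here are placeholders):

```
from sites import Chapter, Image

chapter = Chapter(title='1.1', contents='<p>...</p>')
chapter.images.append(Image(
    path='images/ch0_leechimage_0.png',  # where the file will live inside the epub
    contents=b'...',                     # the raw image bytes
    content_type='image/png',
))
```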
diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 5bb3cd2..21fae8b 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -6,7 +6,8 @@
 import json
 import re
 import os.path
-from . import register, Site, Section, Chapter
+import urllib.parse
+from . import register, Site, Section, Chapter, Image
 
 logger = logging.getLogger(__name__)
 
@@ -42,6 +43,9 @@ class SiteDefinition:
     filter_selector = attr.ib(default=False)
     cover_url = attr.ib(default='')
 
+    # If present, use this selector to also download the chapter's images and embed them into the epub.
+    image_selector = attr.ib(default=False)
+
 
 @register
 class Arbitrary(Site):
@@ -132,11 +136,42 @@ def _chapter(self, url, definition, title=False):
 
         self._clean(content)
 
+        images = []
+        if definition.image_selector:
+            images = self.load_images(content, definition.image_selector)
+
         chapters.append(Chapter(
             title=title,
             contents=content.prettify(),
             # TODO: better date detection
             date=datetime.datetime.now(),
+            images=images
         ))
 
         return chapters
+
+    def load_images(self, content, selector):
+        images = []
+        for image in content.select(selector):
+            if not image.has_attr('src'):
+                continue
+
+            image_url = image['src']
+            url = urllib.parse.urlparse(image_url)
+            local_path = 'chapter_images/' + url.path.strip('/')
+
+            image_res = self.session.get(image_url)
+            content_type = image_res.headers['Content-Type']
+            image_data = image_res.content
+
+            images.append(Image(
+                path=local_path,
+                contents=image_data,
+                content_type=content_type
+            ))
+            # Point the tag at the local copy, and drop srcset so readers
+            # don't try to fetch the remote variants.
+            image['src'] = '../' + local_path
+            if image.has_attr('srcset'):
+                del image['srcset']
+
+        return images
diff --git a/sites/xenforo.py b/sites/xenforo.py
index df1283e..42a4e5f 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -284,19 +284,36 @@ def _clean_chapter(self, post, chapterid):
     def _clean_spoilers(self, post, chapterid):
         # spoilers don't work well, so turn them into epub footnotes
         for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
-            spoiler_title = spoiler.find(class_='SpoilerTitle')
-            if self.options['skip_spoilers']:
-                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
-                if spoiler_title:
-                    link.string = spoiler_title.get_text()
+            spoiler_target = spoiler.find(class_='SpoilerTarget')
+
+            # This is a bit of a hack, but it works:
+            # download any images hidden inside the spoiler.
+            spoiler_images = list(spoiler_target.find_all('img'))
+            if len(spoiler_images) > 0:
+                for index, img in enumerate(spoiler_images):
+                    # For some weird reason, the images are duplicated,
+                    # so drop every other one.
+                    if index % 2 == 0:
+                        img.decompose()
+                    else:
+                        if not img.has_attr('src'):
+                            img['src'] = img['data-url']
+                        if img['src'].startswith('proxy.php'):
+                            img['src'] = f"{self.domain}/{img['src']}"
+                spoiler.replace_with(spoiler_target)
             else:
-                if spoiler_title:
-                    link = f'[SPOILER: {spoiler_title.get_text()}]'
+                spoiler_title = spoiler.find(class_='SpoilerTitle')
+                if self.options['skip_spoilers']:
+                    link = self._footnote(spoiler_target.extract(), chapterid)
+                    if spoiler_title:
+                        link.string = spoiler_title.get_text()
                 else:
-                    link = '[SPOILER]'
-                new_spoiler = self._new_tag('div', class_="leech-spoiler")
-                new_spoiler.append(link)
-                spoiler.replace_with(new_spoiler)
+                    if spoiler_title:
+                        link = f'[SPOILER: {spoiler_title.get_text()}]'
+                    else:
+                        link = '[SPOILER]'
+                    new_spoiler = self._new_tag('div', class_="leech-spoiler")
+                    new_spoiler.append(link)
+                    spoiler.replace_with(new_spoiler)
 
     def _post_date(self, post):
         maybe_date = post.find(class_='DateTime')
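Finally, a rough worked example of the resize heuristic in `compress_image`, since the
`2.8114` constant is otherwise opaque: it appears to assume about 2.8 pixels per byte of
compressed output. Note that the factor is applied to each dimension separately, so the
resized area shrinks quadratically and tends to undershoot the pixel target, which makes
the byte target conservative:

```
import math

target_size = 1_000_000                      # bytes
target_pixel_count = 2.8114 * target_size    # ~2,811,400 pixels
size = (3000, 2000)                          # a 6-megapixel source image

scale_factor = target_pixel_count / math.prod(size)  # ~0.4686
x, y = (int(scale_factor * dim) for dim in size)
print((x, y), x * y)  # (1405, 937), ~1.32 megapixels, well under the target
```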