diff --git a/compose.yml b/compose.yml index 570144207..b3d94da2b 100644 --- a/compose.yml +++ b/compose.yml @@ -1,23 +1,24 @@ -version: '3' +version: "3" services: chrome: image: selenium/standalone-chrome:latest + privileged: true shm_size: 6gb restart: unless-stopped ports: - "7900:7900" - "4444:4444" environment: - SE_VNC_VIEW_ONLY: '1' - SE_EVENT_BUS_PUBLISH_PORT: '4442' - SE_EVENT_BUS_SUBSCRIBE_PORT: '4443' - NODE_MAX_INSTANCE: '8' - NODE_MAX_SESSION: '8' - SE_NO_VNC_PORT: '7900' - SE_SCREEN_WIDTH: '1920' - SE_SCREEN_HEIGHT: '1080' - SE_NODE_GRID_URL: 'false' + SE_VNC_VIEW_ONLY: "1" + SE_EVENT_BUS_PUBLISH_PORT: "4442" + SE_EVENT_BUS_SUBSCRIBE_PORT: "4443" + NODE_MAX_INSTANCE: "8" + NODE_MAX_SESSION: "8" + SE_NO_VNC_PORT: "7900" + SE_SCREEN_WIDTH: "1920" + SE_SCREEN_HEIGHT: "1080" + SE_NODE_GRID_URL: "false" # telegram-bot: # image: lncrawl @@ -29,6 +30,14 @@ services: # environment: # CLOUD_DRIVE: "GOFILE" # TELEGRAM_TOKEN: "${TELEGRAM_TOKEN}" + # redis: + # image: redis:alpine + # restart: always + # ports: + # - "6379:6379" + # command: redis-server --save 20 1 --loglevel warning + # volumes: + # - redis_data:/data discord-bot: image: lncrawl @@ -36,7 +45,7 @@ services: context: . 
dockerfile: ./scripts/Dockerfile restart: unless-stopped - command: python -m lncrawl --suppress --bot discord --shard-id 0 --shard-count 1 --selenium-grid "http://chrome:4444" + command: python -m lncrawl --suppress --bot discord --selenium-grid "http://chrome:4444" depends_on: - chrome environment: @@ -44,3 +53,7 @@ services: DISCORD_TOKEN: "${DISCORD_TOKEN}" DISCORD_SIGNAL_CHAR: "${DISCORD_SIGNAL_CHAR}" DISCORD_DISABLE_SEARCH: "${DISCORD_DISABLE_SEARCH}" + +# volumes: +# redis_data: +# driver: local diff --git a/lncrawl/binders/calibre.py b/lncrawl/binders/calibre.py index d8435bd7a..3740b2fdb 100644 --- a/lncrawl/binders/calibre.py +++ b/lncrawl/binders/calibre.py @@ -7,6 +7,23 @@ EBOOK_CONVERT = "ebook-convert" CALIBRE_LINK = "https://calibre-ebook.com/download" +# ebook-convert + [ +# '/home/mira/Projects/misc/lightnovel-crawler/.discord_bot_output/novelfull-com/Birth Of The Demonic Sword/epub/Birth Of The Demonic Sword c1-5.epub', +# '/home/mira/Projects/misc/lightnovel-crawler/.discord_bot_output/novelfull-com/Birth Of The Demonic Sword/mobi/Birth Of The Demonic Sword c1-5.mobi', +# '--unsmarten-punctuation', +# '--no-chapters-in-toc', +# '--title', 'Birth Of The Demonic Sword c1-5', +# '--authors', 'Eveofchaos', +# '--comments', '', +# '--language', 'en', +# '--tags', [], +# '--series', 'Birth Of The Demonic Sword', +# '--publisher', 'https://novelfull.com/', +# '--book-producer', 'Lightnovel Crawler', +# '--enable-heuristics', +# '--disable-renumber-headings', +# '--cover', '/home/mira/Projects/misc/lightnovel-crawler/.discord_bot_output/novelfull-com/Birth Of The Demonic Sword/cover.jpg'] + def run_ebook_convert(*args): """ @@ -14,6 +31,7 @@ def run_ebook_convert(*args): Visit https://manual.calibre-ebook.com/generated/en/ebook-convert.html for argument list. 
""" try: + # print(f"{EBOOK_CONVERT} {' '.join(list(args))}") isdebug = os.getenv("debug_mode") with open(os.devnull, "w", encoding="utf8") as dumper: subprocess.call( diff --git a/lncrawl/binders/epub.py b/lncrawl/binders/epub.py index 8abc1f72f..bc1102384 100644 --- a/lncrawl/binders/epub.py +++ b/lncrawl/binders/epub.py @@ -197,7 +197,7 @@ def bind_epub_book( os.makedirs(epub_path, exist_ok=True) epub.write_epub(file_path, book, {}) - print("Created: %s.epub" % file_name) + logger.info("Created: %s.epub" % file_name) return file_path diff --git a/lncrawl/binders/text.py b/lncrawl/binders/text.py index cfc3af904..bda8e3c7b 100644 --- a/lncrawl/binders/text.py +++ b/lncrawl/binders/text.py @@ -25,5 +25,5 @@ def make_texts(app, data): file.write(text) text_files.append(file_name) - print("Created: %d text files" % len(text_files)) + logger.info("Created: %d text files" % len(text_files)) return text_files diff --git a/lncrawl/bots/discord/cogs/__init__.py b/lncrawl/bots/discord/cogs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/lncrawl/bots/discord/cogs/novels.py b/lncrawl/bots/discord/cogs/novels.py new file mode 100644 index 000000000..35a3aa70e --- /dev/null +++ b/lncrawl/bots/discord/cogs/novels.py @@ -0,0 +1,133 @@ +import asyncio +import io +import math +import discord +import logging +from discord.ext import commands + +from lncrawl.core.app import App + +from ..components import NovelMenu +from ..utils import validate_formats +from ..config import available_formats +from ..novel_handlers import ( + archive_metadata, + configure_output_path, + destroy_app, + download_novel, + novel_by_title, + novel_by_url, + upload_file, + update_progress, +) + +logger = logging.getLogger(__name__) + + +class Novels(commands.Cog): + def __init__(self, bot): + self.bot: discord.Bot = bot + + @discord.slash_command(name="download", description="Download a novel by URL") + @discord.option("url", description="Novel URL") + @discord.option("start", 
class Novels(commands.Cog):
    """Slash commands for searching novels and downloading them by URL."""

    def __init__(self, bot):
        self.bot: discord.Bot = bot

    @discord.slash_command(name="download", description="Download a novel by URL")
    @discord.option("url", description="Novel URL")
    @discord.option("start", description="Start chapter", default=0)
    @discord.option("end", description="End chapter", default=math.inf)
    @discord.option(
        "formats", description="Comma separated target formats", default="epub"
    )
    async def download(
        self,
        ctx: discord.ApplicationContext,
        url: str,
        start: float,
        end: float,
        formats: str,
    ):
        """Download the chapters ``[start:end]`` of the novel at *url*.

        The novel is fetched, bound into every format named in *formats*
        (comma separated) and each resulting archive is either attached to the
        reply or posted as a download URL, depending on what ``upload_file``
        returns.
        """
        if not url.startswith("http"):
            await ctx.respond("You specified an invalid URL")
            return

        formats_list = [fmt.strip() for fmt in formats.split(",")]
        if not validate_formats(formats_list):
            fs = ", ".join(available_formats)
            await ctx.respond(
                f"The format you specified is invalid, the available formats are: {fs}"
            )
            # Stop here: without this the command went on to defer() and to
            # download with the rejected formats.
            return

        # start thinking
        await ctx.defer()

        try:
            app: App = await novel_by_url(url)
        except Exception:
            logger.exception("Failed to get novel info from %s", url)
            await ctx.respond("Failed to get the novel info from this URL")
            return

        progress_task = None
        try:
            embed = discord.Embed(
                title=app.crawler.novel_title,
                url=app.crawler.novel_url,
                description=app.crawler.novel_synopsis,
            )
            embed.set_thumbnail(url=app.crawler.novel_cover)
            embed.add_field(name="Author", value=app.crawler.novel_author, inline=False)
            embed.add_field(name="Volumes", value=len(app.crawler.volumes))
            embed.add_field(name="Chapters", value=len(app.crawler.chapters))
            await ctx.respond(embed=embed)

            # set chapters; an infinite `end` means "up to the last chapter"
            first = int(start)
            if math.isinf(end):
                app.chapters = app.crawler.chapters[first:]
            else:
                app.chapters = app.crawler.chapters[first : int(end)]

            follow_up = await ctx.respond(
                f"I don't have this file, downloading {len(app.chapters)} chapters, this will take a while."
            )

            # set formats
            app.output_formats = {x: (x in formats_list) for x in available_formats}
            # set up directories
            app.output_path = configure_output_path(app)
            # update the user with dl progress; the task reference is kept so it
            # can be cancelled below instead of polling forever after a failure
            progress_task = asyncio.create_task(update_progress(app, follow_up.edit))

            # start the download
            archive_list = await download_novel(app)
            if not archive_list:
                # download_novel logs its own errors and yields nothing usable
                await ctx.respond("Failed to download the novel")
                return

            for archive in archive_list:
                _, archive_name = archive_metadata(archive)
                result = await upload_file(archive)
                if isinstance(result, str):
                    await ctx.respond(f"Download URL: {result}")
                elif isinstance(result, io.BufferedReader):
                    # upload_file hands back an open handle; close it once the
                    # attachment has been sent so the descriptor is not leaked.
                    try:
                        await ctx.respond(
                            file=discord.File(filename=archive_name, fp=result)
                        )
                    finally:
                        result.close()
                else:
                    await ctx.respond(f"Failed to upload {archive_name}")
        finally:
            if progress_task is not None:
                progress_task.cancel()  # no-op when the task already finished
            await destroy_app(app)

    @discord.slash_command(name="search", description="Search a novel by name")
    @discord.option("name", description="Lightnovel name")
    @discord.option("pattern", description="Regex pattern", default="")
    async def search(
        self,
        ctx: discord.ApplicationContext,
        name: str,
        pattern: str,
    ):
        """Search the sources for *name* and reply with a select menu of matches.

        *pattern* is forwarded to ``novel_by_title``; an empty string means no
        filtering.
        """
        if len(name) < 4:
            await ctx.respond("Query string is too short")
            return
        # start thinking
        await ctx.defer()
        app: App = await novel_by_title(name, pattern)

        results = app.search_results[:24]
        if not results:
            # NOTE(review): a select menu without options is expected to be
            # rejected by Discord - confirm; either way there is nothing to pick.
            await ctx.respond("No novels found")
            return

        select_novel_view = NovelMenu()
        select_novel_view.add_items(novelList=results)
        await ctx.respond(
            "Select a novel, use the returned link in the `/download` command",
            view=select_novel_view,
        )


def setup(bot):  # this is called by Pycord to setup the cog
    bot.add_cog(Novels(bot))  # add the cog to the bot
# Longest message (in characters) sent in one piece by the select callback.
_MESSAGE_LIMIT = 2000
# NOTE(review): Discord documents 100 characters as the select-option label
# limit - confirm against the API reference.
_LABEL_LIMIT = 100


def _split_messages(lines, limit=_MESSAGE_LIMIT):
    """Group *lines* into newline-joined chunks shorter than *limit* characters."""
    chunks = []
    current = ""
    for line in lines:
        # Flush before the chunk would reach the limit, but never flush an
        # empty chunk (that used to produce an empty message).
        if current and len(current) + len(line) + 1 >= limit:
            chunks.append(current.strip())
            current = ""
        current += line + "\n"
    if current.strip():
        chunks.append(current.strip())
    return chunks


class NovelSelectMenu(discord.ui.Select):
    """Single-choice select menu listing combined search results."""

    def __init__(self):
        super().__init__(
            placeholder="Select a novel...",
            min_values=1,
            max_values=1,
            row=0,
        )
        # Search results backing the options; option values index into it.
        self.novelList = []

    def fill_options(self, novelList: t.List[CombinedSearchResult]) -> None:
        """Add one option per search result, valued by its index in *novelList*."""
        self.novelList = novelList
        for i, item in enumerate(novelList):
            nc = len(item.novels)
            self.add_option(
                label=item.title[:_LABEL_LIMIT],
                value=str(i),
                description=f"{nc} source{'' if nc == 1 else 's'}",
            )

    async def callback(self, interaction: discord.Interaction):
        """Reply with the numbered source URLs of the selected novel."""
        assert self.view is not None
        selected = self.novelList[int(self.values[0])]
        novel_list = [
            f"{i+1}. <{item.url}> {item.info or ''}".strip()
            for i, item in enumerate(selected.novels)
        ]

        # split into separate messages w/ length up to 2000 chars
        chunks = _split_messages(novel_list)
        if not chunks:
            # Always answer the interaction, even when there is nothing to list.
            chunks = ["No sources found for this novel"]

        # The first chunk answers the interaction, the rest are follow-ups.
        first, *rest = chunks
        await interaction.response.send_message(first)
        for chunk in rest:
            await interaction.followup.send(chunk)


class NovelMenu(discord.ui.View):
    """View holding a single ``NovelSelectMenu``."""

    def add_items(self, novelList: t.List[CombinedSearchResult]) -> None:
        """Create the select menu for *novelList* and attach it to this view."""
        selectMenu = NovelSelectMenu()
        selectMenu.fill_options(novelList)
        self.add_item(selectMenu)
logger = logging.getLogger(__name__)


class Bot(discord.Bot):
    """Discord bot that loads the novels cog when it is constructed."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Registers the slash commands defined in the cog module.
        self.load_extension("lncrawl.bots.discord.cogs.novels")

    async def on_ready(self):
        """Log that the bot is connected."""
        # todo: activity and stuff
        logger.debug("%s is ready and online!", self.user)


# NOTE(review): the bot is created and started as a module-level side effect,
# so importing this module starts it - confirm this is the intended entry point.
client = Bot()
client.run(C.discord_token)
logger = logging.getLogger(__name__)

# Archives of at least this many bytes (8 MiB) go through the external
# uploader instead of being returned as an open file.
_MAX_ATTACHMENT_BYTES = 8 * 1024 * 1024


# ---- helpers from lncrawl/bots/discord/utils.py ----


def validate_formats(xs: t.List[str]) -> bool:
    """Return True when every entry of *xs* is one of ``available_formats``."""
    return all(x in available_formats for x in xs)


def to_thread(func: t.Callable) -> t.Callable[..., t.Awaitable]:
    """Wrap the blocking *func* so that calling it returns an awaitable.

    The wrapped call runs through ``asyncio.to_thread`` and the awaited result
    is whatever *func* returns.
    """

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        return await asyncio.to_thread(func, *args, **kwargs)

    return wrapper


# ---- handlers from lncrawl/bots/discord/novel_handlers.py ----


@to_thread
def download_novel(app: "App") -> list:
    """Download, bind and compress the novel configured on *app*.

    Returns ``app.archived_outputs``; on any failure the error is logged and an
    empty list is returned, so callers can always iterate the result.
    """
    try:
        app.pack_by_volume = False
        app.start_download()
        app.bind_books()
        app.compress_books()
        if not isinstance(app.archived_outputs, list):
            raise TypeError("compress_books() did not produce a list of archives")
        return app.archived_outputs
    except Exception:
        logger.exception("Failed to download novel")
        return []


@to_thread
def novel_by_url(url: str) -> "App":
    """Create an ``App`` for *url* and load the novel info into it."""
    app = App()
    app.user_input = url
    app.crawler = prepare_crawler(app.user_input)
    app.get_novel_info()
    assert isinstance(app.crawler, Crawler)
    return app


@to_thread
def novel_by_title(name: str, pattern: str) -> "App":
    """Search for *name* on every source that implements its own search.

    When *pattern* is non-empty, only sources whose link matches it (via
    ``re.search``) are queried. The results end up on the returned app.
    """
    app = App()
    app.user_input = name.strip()
    # Compile once instead of once per source link.
    matcher = re.compile(pattern) if pattern else None
    app.crawler_links = [
        str(link)
        for link, crawler in crawler_list.items()
        if crawler.search_novel != Crawler.search_novel
        and (matcher is None or matcher.search(link))
    ]

    app.search_novel()
    return app


@to_thread
def upload_file(archive: str) -> t.Union[str, io.BufferedIOBase, None]:
    """Make *archive* available for sending.

    Returns whatever ``upload`` returns for files of ``_MAX_ATTACHMENT_BYTES``
    or more, ``None`` when that upload fails, and otherwise the file opened in
    binary mode. The caller owns the returned handle and must close it.
    """
    # Check file size
    file_size = os.stat(archive).st_size
    if file_size >= _MAX_ATTACHMENT_BYTES:
        try:
            description = "Generated by: lncrawl Discord bot"
            return upload(archive, description)
        except Exception:
            # logger.exception records the traceback; the old call passed the
            # exception as a second %-argument with no matching placeholder.
            logger.exception("Failed to upload file: %s", archive)
            return None

    return open(archive, "rb")


@to_thread
def destroy_app(app: "App"):
    """Call ``app.destroy()`` off the event loop."""
    app.destroy()


def archive_metadata(archive) -> t.Tuple[str, str]:
    """Return ``(parent directory name, file name)`` of the *archive* path."""
    return os.path.basename(os.path.dirname(archive)), os.path.basename(archive)


async def update_progress(app: "App", editFollowup: t.Callable[[str], t.Awaitable]):
    """Report download progress through *editFollowup* about once per second.

    Polls ``app.crawler.future_progress`` until it reaches the number of
    selected chapters, then sends a final "Done" message.
    """
    chapterCount = len(app.chapters)
    lastProgress = 0
    reported = None
    while app.crawler.future_progress < chapterCount:
        current = app.crawler.future_progress
        # this is ugly, but it ensures we won't be stuck if we miss the done window
        if current < lastProgress:
            break
        lastProgress = current
        if current != reported:
            # Only edit the message when the counter actually moved.
            await editFollowup(f"Download in progress: {lastProgress}/{chapterCount}")
            reported = current
        await asyncio.sleep(1)
    # not cool, but we're risking this property to be reset by further downloads
    await editFollowup(f"Done: {chapterCount}/{chapterCount}. Uploading your file.")


def configure_output_path(app: "App"):
    """Return a freshly emptied output directory for *app*.

    The directory is ``.discord_bot_output/<good_source_name>/<good_file_name>``
    under the current working directory; existing content is removed first.
    """
    # set output path
    root = os.path.abspath(".discord_bot_output")
    output_path = os.path.join(root, app.good_source_name, app.good_file_name)
    shutil.rmtree(output_path, ignore_errors=True)
    os.makedirs(output_path, exist_ok=True)
    return output_path
----------------------------------------------------------------------- # @@ -177,7 +178,7 @@ def start_download(self): def bind_books(self): """Requires: crawler, chapters, output_path, pack_by_volume, book_cover, output_formats""" - logger.info("Processing data for binding") + logger.debug("Processing data for binding") assert self.crawler data = {} @@ -205,23 +206,26 @@ def bind_books(self): # ----------------------------------------------------------------------- # def compress_books(self, archive_singles=False): - logger.info("Compressing output...") - + logger.debug("Compressing output...") # Get which paths to be archived with their base names path_to_process = [] - for fmt in available_formats: + + for fmt in list({k: v for k, v in self.output_formats.items() if v == True}): root_dir = os.path.join(self.output_path, fmt) if os.path.isdir(root_dir): path_to_process.append( [root_dir, self.good_file_name + " (" + fmt + ")"] ) + logger.debug("path_to_process: %s", path_to_process) + # Archive files self.archived_outputs = [] for root_dir, output_name in path_to_process: file_list = os.listdir(root_dir) + logger.debug("file_list: %s", file_list) if len(file_list) == 0: - logger.info("It has no files: %s", root_dir) + logger.debug("It has no files: %s", root_dir) continue archived_file = None @@ -230,17 +234,20 @@ def compress_books(self, archive_singles=False): and not archive_singles and not os.path.isdir(os.path.join(root_dir, file_list[0])) ): - logger.info("Not archiving single file inside %s" % root_dir) + logger.debug("Not archiving single file inside %s" % root_dir) archived_file = os.path.join(root_dir, file_list[0]) else: - base_path = os.path.join(self.output_path, output_name) - logger.info("Compressing %s to %s" % (root_dir, base_path)) + base_path = os.path.join(root_dir, output_name) + logger.debug("Compressing %s to %s" % (root_dir, base_path)) archived_file = shutil.make_archive( base_path, format="zip", root_dir=root_dir, ) - 
logger.info("Compressed:", os.path.basename(archived_file)) + logger.debug(f"Compressed: {os.path.basename(archived_file)}") if archived_file: + logger.debug( + "appending archived file to archived_outputs: %s", archived_file + ) self.archived_outputs.append(archived_file) diff --git a/lncrawl/core/crawler.py b/lncrawl/core/crawler.py index 7e953746f..af3919a3c 100644 --- a/lncrawl/core/crawler.py +++ b/lncrawl/core/crawler.py @@ -148,7 +148,7 @@ def download_chapters( unit="item", fail_fast=fail_fast, ) - for (index, future) in futures.items(): + for index, future in futures.items(): try: chapter = chapters[index] chapter.body = future.result() diff --git a/lncrawl/core/taskman.py b/lncrawl/core/taskman.py index e9b186d1b..5719cf02e 100644 --- a/lncrawl/core/taskman.py +++ b/lncrawl/core/taskman.py @@ -29,6 +29,7 @@ def __init__(self) -> None: - workers (int, optional): Number of concurrent workers to expect. Default: 10. - ratelimit (float, optional): Number of requests per second. """ + self.future_progress = 0 self.init_executor(MAX_WORKER_COUNT) def __del__(self) -> None: @@ -192,9 +193,11 @@ def resolve_futures( ) try: + self.future_progress = 0 for future in futures: if fail_fast: future.result(timeout) + self.future_progress += 1 bar.update() continue try: @@ -208,6 +211,7 @@ def resolve_futures( bar.clear() logger.warning(f"{type(e).__name__}: {e}") finally: + self.future_progress += 1 bar.update() finally: Thread(target=lambda: self.cancel_futures(futures)).start() diff --git a/requirements.txt b/requirements.txt index dd36c07bb..10e7f02f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,6 +37,6 @@ setuptools<=60.0.0 # win_unicode_console~=0.5 # bot requirements -discord.py>=1.0.0,<2.0.0 +py-cord>=2.4.0 python-telegram-bot<12 # pydrive>=1.3.1,<2.0.0