diff --git a/README.md b/README.md index 16d9e6f24..038508520 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,6 @@ An app to download novels from online sources and generate e-books. ## Table of contents -- [Lightnovel Crawler ![pip package](https://pypi.org/project/lightnovel-crawler) [![download win](https://img.shields.io/badge/%E2%A7%AA-lncrawl.exe-red)](https://rebrand.ly/lncrawl) [![download linux]()](https://rebrand.ly/lncrawl-linux)](#lightnovel-crawler-img-srchttpsimgshieldsiobadgef09f93a6-pip-blue-altpip-package-img-srchttpsimgshieldsiobadgee2a7aa-lncrawlexe-red-altdownload-win-img-srchttpsimgshieldsiobadgee2a7ad-lncrawl20linux-brown-altdownload-linux) - - [Table of contents](#table-of-contents) - [(A) Installation](#a-installation) - [A1. Standalone Bundle (Windows, Linux)](#a1-standalone-bundle-windows-linux) - [A2. PIP (Windows, Mac, and Linux)](#a2-pip-windows-mac-and-linux) @@ -54,7 +52,7 @@ Without it, you will only get output in epub, text, and web formats. ### A1. Standalone Bundle (Windows, Linux) -⏬ **Windows**: [lightnovel-crawler v2.23.2 ~ 23MB](https://rebrand.ly/lncrawl) +⏬ **Windows**: [lightnovel-crawler v2.23.3 ~ 23MB](https://rebrand.ly/lncrawl) > In Windows 8, 10 or later versions, it might say that `lncrawl.exe` is not safe to dowload or execute. You should bypass/ignore this security check to execute this program. diff --git a/lncrawl/VERSION b/lncrawl/VERSION index b7fbcebc1..174f0e333 100644 --- a/lncrawl/VERSION +++ b/lncrawl/VERSION @@ -1 +1 @@ -2.23.2 \ No newline at end of file +2.23.3 \ No newline at end of file diff --git a/lncrawl/sources/bestlightnovel.py b/lncrawl/sources/bestlightnovel.py index 85d993d20..27648a18c 100644 --- a/lncrawl/sources/bestlightnovel.py +++ b/lncrawl/sources/bestlightnovel.py @@ -7,6 +7,7 @@ logger = logging.getLogger(__name__) search_url = 'https://bestlightnovel.com/getsearchstory' novel_page_url = 'https://bestlightnovel.com/novel_%s' +change_bad_words_off = 'https://bestlightnovel.com/change_bad_words_off' class BestLightNovel(Crawler): @@ -61,6 +62,8 @@ def read_novel_info(self): 'url': self.absolute_url(a['href']), }) # end for + + self.get_response(change_bad_words_off) # end def def download_chapter_body(self, chapter): diff --git a/lncrawl/sources/novelfull.py b/lncrawl/sources/novelfull.py index b56ebaa5c..34ee584ca 100644 --- a/lncrawl/sources/novelfull.py +++ b/lncrawl/sources/novelfull.py @@ -3,6 +3,7 @@ import logging from concurrent import futures from ..utils.crawler import Crawler +from bs4 import Comment logger = logging.getLogger(__name__) search_url = 'https://novelfull.com/search?keyword=%s' @@ -122,14 +123,23 @@ def download_chapter_body(self, chapter): '''Download body of a single chapter and return as clean html format.''' logger.info('Downloading %s', chapter['url']) soup = self.get_soup(chapter['url']) + content = soup.select_one('div#chapter-content') - for ads in content.findAll('div', {"align": 'left'}): - ads.decompose() - for ads in content.findAll('div', {"align": 'center'}): - ads.decompose() - for ads in content.select('.adsbygoogle, script, ins, .ads, .ads-holder'): - ads.decompose - # self.clean_contents(content) + + # Removes junk text from chapters. + self.blacklist_patterns = [ + r'^\s*Translator:', + r'^\s*Editor:', + r'^\s*Atlas Studios', + r'Read more chapter on NovelFull', + r'full thich ung', + r'If you find any errors \( broken links.*let us know < report chapter >', + ] + + self.clean_contents(content) + for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'): + ads.extract() + return str(content) # end def # end class diff --git a/lncrawl/utils/crawler.py b/lncrawl/utils/crawler.py index d0d51bae8..1237b245e 100644 --- a/lncrawl/utils/crawler.py +++ b/lncrawl/utils/crawler.py @@ -253,7 +253,7 @@ def clean_contents(self, div): return div # end if div.attrs = {} - for tag in div.findAll(True): + for tag in div.find_all(True): if isinstance(tag, Comment): tag.extract() # Remove comments elif tag.name == 'br': diff --git a/requirements.txt b/requirements.txt index 665a64c24..91c4014e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ lxml==4.5.1 # Bot requirements discord.py>=1.3.4 python-telegram-bot>=12.8 +pydrive==1.3.1