From b5668a384c7f86b707da72293678687d7219dfec Mon Sep 17 00:00:00 2001 From: Sudipto Chandra Date: Wed, 23 Sep 2020 02:19:17 +0600 Subject: [PATCH 1/8] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 665a64c24..91c4014e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ lxml==4.5.1 # Bot requirements discord.py>=1.3.4 python-telegram-bot>=12.8 +pydrive==1.3.1 From d2f3febf4709e6e6f4aa761423d712ae868451ee Mon Sep 17 00:00:00 2001 From: Sudipto Chandra Date: Sat, 26 Sep 2020 18:16:40 +0600 Subject: [PATCH 2/8] [bestlightnovel] visit change_bad_words_off --- lncrawl/sources/bestlightnovel.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lncrawl/sources/bestlightnovel.py b/lncrawl/sources/bestlightnovel.py index 85d993d20..27648a18c 100644 --- a/lncrawl/sources/bestlightnovel.py +++ b/lncrawl/sources/bestlightnovel.py @@ -7,6 +7,7 @@ logger = logging.getLogger(__name__) search_url = 'https://bestlightnovel.com/getsearchstory' novel_page_url = 'https://bestlightnovel.com/novel_%s' +change_bad_words_off = 'https://bestlightnovel.com/change_bad_words_off' class BestLightNovel(Crawler): @@ -61,6 +62,8 @@ def read_novel_info(self): 'url': self.absolute_url(a['href']), }) # end for + + self.get_response(change_bad_words_off) # end def def download_chapter_body(self, chapter): From 0d3d8b1793abf8846f6766510f88418c3c117833 Mon Sep 17 00:00:00 2001 From: SirGryphin Date: Thu, 1 Oct 2020 22:25:35 +0100 Subject: [PATCH 3/8] minor fixes --- lncrawl/sources/novelfull.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/lncrawl/sources/novelfull.py b/lncrawl/sources/novelfull.py index b56ebaa5c..0bcf07a3f 100644 --- a/lncrawl/sources/novelfull.py +++ b/lncrawl/sources/novelfull.py @@ -3,6 +3,7 @@ import logging from concurrent import futures from ..utils.crawler import Crawler +from bs4 import Comment logger = logging.getLogger(__name__) search_url = 'https://novelfull.com/search?keyword=%s' @@ -122,14 +123,33 @@ def download_chapter_body(self, chapter): '''Download body of a single chapter and return as clean html format.''' logger.info('Downloading %s', chapter['url']) soup = self.get_soup(chapter['url']) + content = soup.select_one('div#chapter-content') + + # Removes junk text from chapters. + self.blacklist_patterns = [ + r'^Translator:', + r'^Editor:', + r'^Atlas Studios', + r'Read more chapter on NovelFull' + r'^full thich ung', + ] + + # Some comments in source code of site gets converted into text/paragraphs. This removes it. + for comment in soup.findAll(text=lambda text:isinstance(text, Comment)): + comment.extract() + + self.clean_contents(content) + for ads in content.findAll('div', {"align": 'left'}): ads.decompose() for ads in content.findAll('div', {"align": 'center'}): ads.decompose() - for ads in content.select('.adsbygoogle, script, ins, .ads, .ads-holder'): - ads.decompose - # self.clean_contents(content) - return str(content) + for ads in content.select('h3, .adsbygoogle, script, ins, .ads, .ads-holder'): + ads.decompose() + # return str(content) + # Changed so excess div tags are removed and all chapters text is in p tag, so its better formatted. Also added h3 tag to above decompose to remove double chapter headings. + body = self.extract_contents(content) + return '

' + '

'.join(body) + '

' # end def # end class From d4b5c7c2f8c19ad5d0fc666f89f18dd8a0a29e7f Mon Sep 17 00:00:00 2001 From: SirGryphin Date: Sat, 3 Oct 2020 19:43:30 +0100 Subject: [PATCH 4/8] added suggested code --- lncrawl/sources/novelfull.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lncrawl/sources/novelfull.py b/lncrawl/sources/novelfull.py index 0bcf07a3f..d9afea9a0 100644 --- a/lncrawl/sources/novelfull.py +++ b/lncrawl/sources/novelfull.py @@ -137,18 +137,19 @@ def download_chapter_body(self, chapter): # Some comments in source code of site gets converted into text/paragraphs. This removes it. for comment in soup.findAll(text=lambda text:isinstance(text, Comment)): + def is_ad(tag): + return isinstance(tag, Comment) or ( + tag.name == "div" and tag.get("class") in [["left"], ["center"]] + ) + for ads in content.find_all(is_ad): + ads.decompose() comment.extract() self.clean_contents(content) - for ads in content.findAll('div', {"align": 'left'}): + for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'): ads.decompose() - for ads in content.findAll('div', {"align": 'center'}): - ads.decompose() - for ads in content.select('h3, .adsbygoogle, script, ins, .ads, .ads-holder'): - ads.decompose() - # return str(content) - # Changed so excess div tags are removed and all chapters text is in p tag, so its better formatted. Also added h3 tag to above decompose to remove double chapter headings. + body = self.extract_contents(content) return '

' + '

'.join(body) + '

' # end def From 412244a6e6cb0d49d0c9e7129a59c31e3fa38d7f Mon Sep 17 00:00:00 2001 From: Sudipto Chandra Date: Sun, 4 Oct 2020 02:56:58 +0600 Subject: [PATCH 5/8] Update novelfull.py --- lncrawl/sources/novelfull.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/lncrawl/sources/novelfull.py b/lncrawl/sources/novelfull.py index d9afea9a0..4201facda 100644 --- a/lncrawl/sources/novelfull.py +++ b/lncrawl/sources/novelfull.py @@ -134,21 +134,23 @@ def download_chapter_body(self, chapter): r'Read more chapter on NovelFull' r'^full thich ung', ] + + bad_class = ['h3', 'h2', '.adsbygoogle', 'script', 'ins', '.ads', '.ads-holder'] + def is_ad(tag): + return isinstance(tag, Comment) or ( + tag.has_attr('class') and \ + any(x in tag.get("class") for x in bad_class) + ) + for ads in content.find_all(is_ad): + ads.decompose() # Some comments in source code of site gets converted into text/paragraphs. This removes it. - for comment in soup.findAll(text=lambda text:isinstance(text, Comment)): - def is_ad(tag): - return isinstance(tag, Comment) or ( - tag.name == "div" and tag.get("class") in [["left"], ["center"]] - ) - for ads in content.find_all(is_ad): - ads.decompose() - comment.extract() - - self.clean_contents(content) - - for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'): - ads.decompose() + #for comment in soup.find_all(text=lambda text:isinstance(text, Comment)): + # comment.extract() + + #self.clean_contents(content) + #for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'): + # ads.decompose() body = self.extract_contents(content) return '

' + '

'.join(body) + '

' From b116fc9fe2bdbfd61b8edfc26c902576c85b7084 Mon Sep 17 00:00:00 2001 From: Sudipto Chandra Date: Sun, 4 Oct 2020 03:56:40 +0600 Subject: [PATCH 6/8] [novelfull] use clean contents to remove all junks --- lncrawl/sources/novelfull.py | 33 ++++++++++----------------------- lncrawl/utils/crawler.py | 2 +- 2 files changed, 11 insertions(+), 24 deletions(-) diff --git a/lncrawl/sources/novelfull.py b/lncrawl/sources/novelfull.py index 4201facda..34ee584ca 100644 --- a/lncrawl/sources/novelfull.py +++ b/lncrawl/sources/novelfull.py @@ -128,31 +128,18 @@ def download_chapter_body(self, chapter): # Removes junk text from chapters. self.blacklist_patterns = [ - r'^Translator:', - r'^Editor:', - r'^Atlas Studios', - r'Read more chapter on NovelFull' - r'^full thich ung', + r'^\s*Translator:', + r'^\s*Editor:', + r'^\s*Atlas Studios', + r'Read more chapter on NovelFull', + r'full thich ung', + r'If you find any errors \( broken links.*let us know < report chapter >', ] - - bad_class = ['h3', 'h2', '.adsbygoogle', 'script', 'ins', '.ads', '.ads-holder'] - def is_ad(tag): - return isinstance(tag, Comment) or ( - tag.has_attr('class') and \ - any(x in tag.get("class") for x in bad_class) - ) - for ads in content.find_all(is_ad): - ads.decompose() - - # Some comments in source code of site gets converted into text/paragraphs. This removes it. - #for comment in soup.find_all(text=lambda text:isinstance(text, Comment)): - # comment.extract() - #self.clean_contents(content) - #for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'): - # ads.decompose() + self.clean_contents(content) + for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'): + ads.extract() - body = self.extract_contents(content) - return '

' + '

'.join(body) + '

' + return str(content) # end def # end class diff --git a/lncrawl/utils/crawler.py b/lncrawl/utils/crawler.py index d0d51bae8..1237b245e 100644 --- a/lncrawl/utils/crawler.py +++ b/lncrawl/utils/crawler.py @@ -253,7 +253,7 @@ def clean_contents(self, div): return div # end if div.attrs = {} - for tag in div.findAll(True): + for tag in div.find_all(True): if isinstance(tag, Comment): tag.extract() # Remove comments elif tag.name == 'br': From 819af45f4b2a0242721a34af9e07adf276e967b5 Mon Sep 17 00:00:00 2001 From: Sudipto Chandra Date: Fri, 16 Oct 2020 13:39:15 +0600 Subject: [PATCH 7/8] Update README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 16d9e6f24..10dbf156e 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,6 @@ An app to download novels from online sources and generate e-books. ## Table of contents -- [Lightnovel Crawler ![pip package](https://pypi.org/project/lightnovel-crawler) [![download win](https://img.shields.io/badge/%E2%A7%AA-lncrawl.exe-red)](https://rebrand.ly/lncrawl) [![download linux]()](https://rebrand.ly/lncrawl-linux)](#lightnovel-crawler-img-srchttpsimgshieldsiobadgef09f93a6-pip-blue-altpip-package-img-srchttpsimgshieldsiobadgee2a7aa-lncrawlexe-red-altdownload-win-img-srchttpsimgshieldsiobadgee2a7ad-lncrawl20linux-brown-altdownload-linux) - - [Table of contents](#table-of-contents) - [(A) Installation](#a-installation) - [A1. Standalone Bundle (Windows, Linux)](#a1-standalone-bundle-windows-linux) - [A2. PIP (Windows, Mac, and Linux)](#a2-pip-windows-mac-and-linux) From e9d04b255161d5e8dc75f1680838d572a9514404 Mon Sep 17 00:00:00 2001 From: Sudipto Chandra Date: Fri, 16 Oct 2020 13:42:20 +0600 Subject: [PATCH 8/8] Bump version 2.23.3 --- README.md | 2 +- lncrawl/VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 10dbf156e..038508520 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Without it, you will only get output in epub, text, and web formats. ### A1. Standalone Bundle (Windows, Linux) -⏬ **Windows**: [lightnovel-crawler v2.23.2 ~ 23MB](https://rebrand.ly/lncrawl) +⏬ **Windows**: [lightnovel-crawler v2.23.3 ~ 23MB](https://rebrand.ly/lncrawl) > In Windows 8, 10 or later versions, it might say that `lncrawl.exe` is not safe to dowload or execute. You should bypass/ignore this security check to execute this program. diff --git a/lncrawl/VERSION b/lncrawl/VERSION index b7fbcebc1..174f0e333 100644 --- a/lncrawl/VERSION +++ b/lncrawl/VERSION @@ -1 +1 @@ -2.23.2 \ No newline at end of file +2.23.3 \ No newline at end of file