From b5668a384c7f86b707da72293678687d7219dfec Mon Sep 17 00:00:00 2001
From: Sudipto Chandra <dipu.sudipta@gmail.com>
Date: Wed, 23 Sep 2020 02:19:17 +0600
Subject: [PATCH 1/8] Add pydrive==1.3.1 to requirements.txt

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)
diff --git a/requirements.txt b/requirements.txt
index 665a64c24..91c4014e6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ lxml==4.5.1
 # Bot requirements
 discord.py>=1.3.4
 python-telegram-bot>=12.8
+pydrive==1.3.1

From d2f3febf4709e6e6f4aa761423d712ae868451ee Mon Sep 17 00:00:00 2001
From: Sudipto Chandra <dipu.sudipta@gmail.com>
Date: Sat, 26 Sep 2020 18:16:40 +0600
Subject: [PATCH 2/8] [bestlightnovel] visit change_bad_words_off

---
 lncrawl/sources/bestlightnovel.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lncrawl/sources/bestlightnovel.py b/lncrawl/sources/bestlightnovel.py
index 85d993d20..27648a18c 100644
--- a/lncrawl/sources/bestlightnovel.py
+++ b/lncrawl/sources/bestlightnovel.py
@@ -7,6 +7,7 @@
 logger = logging.getLogger(__name__)
 search_url = 'https://bestlightnovel.com/getsearchstory'
 novel_page_url = 'https://bestlightnovel.com/novel_%s'
+change_bad_words_off = 'https://bestlightnovel.com/change_bad_words_off'
 
 
 class BestLightNovel(Crawler):
@@ -61,6 +62,8 @@ def read_novel_info(self):
                 'url': self.absolute_url(a['href']),
             })
         # end for
+
+        self.get_response(change_bad_words_off)
     # end def
 
     def download_chapter_body(self, chapter):

From 0d3d8b1793abf8846f6766510f88418c3c117833 Mon Sep 17 00:00:00 2001
From: SirGryphin <bballuk@gmail.com>
Date: Thu, 1 Oct 2020 22:25:35 +0100
Subject: [PATCH 3/8] [novelfull] Strip junk text, comments and duplicate headings from chapters

---
 lncrawl/sources/novelfull.py | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/lncrawl/sources/novelfull.py b/lncrawl/sources/novelfull.py
index b56ebaa5c..0bcf07a3f 100644
--- a/lncrawl/sources/novelfull.py
+++ b/lncrawl/sources/novelfull.py
@@ -3,6 +3,7 @@
 import logging
 from concurrent import futures
 from ..utils.crawler import Crawler
+from bs4 import Comment
 
 logger = logging.getLogger(__name__)
 search_url = 'https://novelfull.com/search?keyword=%s'
@@ -122,14 +123,33 @@ def download_chapter_body(self, chapter):
         '''Download body of a single chapter and return as clean html format.'''
         logger.info('Downloading %s', chapter['url'])
         soup = self.get_soup(chapter['url'])
+
         content = soup.select_one('div#chapter-content')
+
+        # Removes junk text from chapters.
+        self.blacklist_patterns = [
+            r'^Translator:',
+            r'^Editor:',
+            r'^Atlas Studios',
+            r'Read more chapter on NovelFull'
+            r'^full thich ung',
+        ]
+
+        # Some comments in source code of site gets converted into text/paragraphs. This removes it.
+        for comment in soup.findAll(text=lambda text:isinstance(text, Comment)):
+            comment.extract()
+
+        self.clean_contents(content)
+
         for ads in content.findAll('div', {"align": 'left'}):
             ads.decompose()
         for ads in content.findAll('div', {"align": 'center'}):
             ads.decompose()
-        for ads in content.select('.adsbygoogle, script, ins, .ads, .ads-holder'):
-            ads.decompose
-        # self.clean_contents(content)
-        return str(content)
+        for ads in content.select('h3, .adsbygoogle, script, ins, .ads, .ads-holder'):
+            ads.decompose()
+        # return str(content)
+        # Changed so excess div tags are removed and all chapters text is in p tag, so its better formatted. Also added h3 tag to above decompose to remove double chapter headings.
+        body = self.extract_contents(content)
+        return '<p>' + '</p><p>'.join(body) + '</p>'
     # end def
 # end class

From d4b5c7c2f8c19ad5d0fc666f89f18dd8a0a29e7f Mon Sep 17 00:00:00 2001
From: SirGryphin <bballuk@gmail.com>
Date: Sat, 3 Oct 2020 19:43:30 +0100
Subject: [PATCH 4/8] [novelfull] Replace align-based ad removal with is_ad filter

---
 lncrawl/sources/novelfull.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/lncrawl/sources/novelfull.py b/lncrawl/sources/novelfull.py
index 0bcf07a3f..d9afea9a0 100644
--- a/lncrawl/sources/novelfull.py
+++ b/lncrawl/sources/novelfull.py
@@ -137,18 +137,19 @@ def download_chapter_body(self, chapter):
 
         # Some comments in source code of site gets converted into text/paragraphs. This removes it.
         for comment in soup.findAll(text=lambda text:isinstance(text, Comment)):
+            def is_ad(tag):
+                return isinstance(tag, Comment) or (
+                    tag.name == "div" and tag.get("class") in [["left"], ["center"]]
+                )
+            for ads in content.find_all(is_ad):
+                ads.decompose()
             comment.extract()
 
         self.clean_contents(content)
 
-        for ads in content.findAll('div', {"align": 'left'}):
+        for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'):
             ads.decompose()
-        for ads in content.findAll('div', {"align": 'center'}):
-            ads.decompose()
-        for ads in content.select('h3, .adsbygoogle, script, ins, .ads, .ads-holder'):
-            ads.decompose()
-        # return str(content)
-        # Changed so excess div tags are removed and all chapters text is in p tag, so its better formatted. Also added h3 tag to above decompose to remove double chapter headings.
+
         body = self.extract_contents(content)
         return '<p>' + '</p><p>'.join(body) + '</p>'
     # end def

From 412244a6e6cb0d49d0c9e7129a59c31e3fa38d7f Mon Sep 17 00:00:00 2001
From: Sudipto Chandra <dipu.sudipta@gmail.com>
Date: Sun, 4 Oct 2020 02:56:58 +0600
Subject: [PATCH 5/8] [novelfull] Move is_ad filter out of comment loop and match by class

---
 lncrawl/sources/novelfull.py | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/lncrawl/sources/novelfull.py b/lncrawl/sources/novelfull.py
index d9afea9a0..4201facda 100644
--- a/lncrawl/sources/novelfull.py
+++ b/lncrawl/sources/novelfull.py
@@ -134,21 +134,23 @@ def download_chapter_body(self, chapter):
             r'Read more chapter on NovelFull'
             r'^full thich ung',
         ]
+        
+        bad_class = ['h3', 'h2', '.adsbygoogle', 'script', 'ins', '.ads', '.ads-holder']
+        def is_ad(tag):
+            return isinstance(tag, Comment) or (
+                tag.has_attr('class') and \
+                any(x in tag.get("class") for x in bad_class)
+            )
+        for ads in content.find_all(is_ad):
+            ads.decompose()
 
         # Some comments in source code of site gets converted into text/paragraphs. This removes it.
-        for comment in soup.findAll(text=lambda text:isinstance(text, Comment)):
-            def is_ad(tag):
-                return isinstance(tag, Comment) or (
-                    tag.name == "div" and tag.get("class") in [["left"], ["center"]]
-                )
-            for ads in content.find_all(is_ad):
-                ads.decompose()
-            comment.extract()
-
-        self.clean_contents(content)
-
-        for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'):
-            ads.decompose()
+        #for comment in soup.find_all(text=lambda text:isinstance(text, Comment)):
+        #    comment.extract()
+
+        #self.clean_contents(content)
+        #for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'):
+        #    ads.decompose()
 
         body = self.extract_contents(content)
         return '<p>' + '</p><p>'.join(body) + '</p>'

From b116fc9fe2bdbfd61b8edfc26c902576c85b7084 Mon Sep 17 00:00:00 2001
From: Sudipto Chandra <dipu.sudipta@gmail.com>
Date: Sun, 4 Oct 2020 03:56:40 +0600
Subject: [PATCH 6/8] [novelfull] use clean contents to remove all junks

---
 lncrawl/sources/novelfull.py | 33 ++++++++++-----------------------
 lncrawl/utils/crawler.py     |  2 +-
 2 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/lncrawl/sources/novelfull.py b/lncrawl/sources/novelfull.py
index 4201facda..34ee584ca 100644
--- a/lncrawl/sources/novelfull.py
+++ b/lncrawl/sources/novelfull.py
@@ -128,31 +128,18 @@ def download_chapter_body(self, chapter):
 
         # Removes junk text from chapters.
         self.blacklist_patterns = [
-            r'^Translator:',
-            r'^Editor:',
-            r'^Atlas Studios',
-            r'Read more chapter on NovelFull'
-            r'^full thich ung',
+            r'^\s*Translator:',
+            r'^\s*Editor:',
+            r'^\s*Atlas Studios',
+            r'Read more chapter on NovelFull',
+            r'full thich ung',
+            r'If you find any errors \( broken links.*let us know < report chapter >',
         ]
-        
-        bad_class = ['h3', 'h2', '.adsbygoogle', 'script', 'ins', '.ads', '.ads-holder']
-        def is_ad(tag):
-            return isinstance(tag, Comment) or (
-                tag.has_attr('class') and \
-                any(x in tag.get("class") for x in bad_class)
-            )
-        for ads in content.find_all(is_ad):
-            ads.decompose()
-
-        # Some comments in source code of site gets converted into text/paragraphs. This removes it.
-        #for comment in soup.find_all(text=lambda text:isinstance(text, Comment)):
-        #    comment.extract()
 
-        #self.clean_contents(content)
-        #for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'):
-        #    ads.decompose()
+        self.clean_contents(content)
+        for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'):
+            ads.extract()
 
-        body = self.extract_contents(content)
-        return '<p>' + '</p><p>'.join(body) + '</p>'
+        return str(content)
     # end def
 # end class
diff --git a/lncrawl/utils/crawler.py b/lncrawl/utils/crawler.py
index d0d51bae8..1237b245e 100644
--- a/lncrawl/utils/crawler.py
+++ b/lncrawl/utils/crawler.py
@@ -253,7 +253,7 @@ def clean_contents(self, div):
             return div
         # end if
         div.attrs = {}
-        for tag in div.findAll(True):
+        for tag in div.find_all(True):
             if isinstance(tag, Comment):
                 tag.extract()   # Remove comments
             elif tag.name == 'br':

From 819af45f4b2a0242721a34af9e07adf276e967b5 Mon Sep 17 00:00:00 2001
From: Sudipto Chandra <dipu.sudipta@gmail.com>
Date: Fri, 16 Oct 2020 13:39:15 +0600
Subject: [PATCH 7/8] Remove self-referential entries from README table of contents

---
 README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.md b/README.md
index 16d9e6f24..10dbf156e 100644
--- a/README.md
+++ b/README.md
@@ -19,8 +19,6 @@ An app to download novels from online sources and generate e-books.
 
 ## Table of contents
 
-- [Lightnovel Crawler ![pip package](https://pypi.org/project/lightnovel-crawler) [![download win](https://img.shields.io/badge/%E2%A7%AA-lncrawl.exe-red)](https://rebrand.ly/lncrawl) [![download linux](<https://img.shields.io/badge/%E2%A7%AD-lncrawl%20(linux)-brown>)](https://rebrand.ly/lncrawl-linux)](#lightnovel-crawler-img-srchttpsimgshieldsiobadgef09f93a6-pip-blue-altpip-package-img-srchttpsimgshieldsiobadgee2a7aa-lncrawlexe-red-altdownload-win-img-srchttpsimgshieldsiobadgee2a7ad-lncrawl20linux-brown-altdownload-linux)
-  - [Table of contents](#table-of-contents)
   - [(A) Installation](#a-installation)
     - [A1. Standalone Bundle (Windows, Linux)](#a1-standalone-bundle-windows-linux)
     - [A2. PIP (Windows, Mac, and Linux)](#a2-pip-windows-mac-and-linux)

From e9d04b255161d5e8dc75f1680838d572a9514404 Mon Sep 17 00:00:00 2001
From: Sudipto Chandra <dipu.sudipta@gmail.com>
Date: Fri, 16 Oct 2020 13:42:20 +0600
Subject: [PATCH 8/8] Bump version 2.23.3

---
 README.md       | 2 +-
 lncrawl/VERSION | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 10dbf156e..038508520 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ Without it, you will only get output in epub, text, and web formats.
 
 ### A1. Standalone Bundle (Windows, Linux)
 
-⏬ **Windows**: [lightnovel-crawler v2.23.2 ~ 23MB](https://rebrand.ly/lncrawl)
+⏬ **Windows**: [lightnovel-crawler v2.23.3 ~ 23MB](https://rebrand.ly/lncrawl)
 
 > In Windows 8, 10 or later versions, it might say that `lncrawl.exe` is not safe to dowload or execute. You should bypass/ignore this security check to execute this program.
 
diff --git a/lncrawl/VERSION b/lncrawl/VERSION
index b7fbcebc1..174f0e333 100644
--- a/lncrawl/VERSION
+++ b/lncrawl/VERSION
@@ -1 +1 @@
-2.23.2
\ No newline at end of file
+2.23.3
\ No newline at end of file