Merge pull request #622 from dipu-bd/dev

Version 2.23.3
dipu-bd · Oct 16, 2020 · ebdb811 · ebdb811
2 parents 5ce52ee + e9d04b2
commit ebdb811
Show file tree

Hide file tree

Showing 6 changed files with 24 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -19,8 +19,6 @@ An app to download novels from online sources and generate e-books.
 
 ## Table of contents
 
-- [Lightnovel Crawler ![pip package](https://pypi.org/project/lightnovel-crawler) [![download win](https://img.shields.io/badge/%E2%A7%AA-lncrawl.exe-red)](https://rebrand.ly/lncrawl) [![download linux](<https://img.shields.io/badge/%E2%A7%AD-lncrawl%20(linux)-brown>)](https://rebrand.ly/lncrawl-linux)](#lightnovel-crawler-img-srchttpsimgshieldsiobadgef09f93a6-pip-blue-altpip-package-img-srchttpsimgshieldsiobadgee2a7aa-lncrawlexe-red-altdownload-win-img-srchttpsimgshieldsiobadgee2a7ad-lncrawl20linux-brown-altdownload-linux)
-  - [Table of contents](#table-of-contents)
   - [(A) Installation](#a-installation)
     - [A1. Standalone Bundle (Windows, Linux)](#a1-standalone-bundle-windows-linux)
     - [A2. PIP (Windows, Mac, and Linux)](#a2-pip-windows-mac-and-linux)
@@ -54,7 +52,7 @@ Without it, you will only get output in epub, text, and web formats.
 
 ### A1. Standalone Bundle (Windows, Linux)
 
-⏬ **Windows**: [lightnovel-crawler v2.23.2 ~ 23MB](https://rebrand.ly/lncrawl)
+⏬ **Windows**: [lightnovel-crawler v2.23.3 ~ 23MB](https://rebrand.ly/lncrawl)
 
 > In Windows 8, 10 or later versions, it might say that `lncrawl.exe` is not safe to download or execute. You should bypass/ignore this security check to execute this program.
 

diff --git a/lncrawl/VERSION b/lncrawl/VERSION
@@ -1 +1 @@
-2.23.2
+2.23.3
diff --git a/lncrawl/sources/bestlightnovel.py b/lncrawl/sources/bestlightnovel.py
@@ -7,6 +7,7 @@
 logger = logging.getLogger(__name__)
 search_url = 'https://bestlightnovel.com/getsearchstory'
 novel_page_url = 'https://bestlightnovel.com/novel_%s'
+change_bad_words_off = 'https://bestlightnovel.com/change_bad_words_off'
 
 
 class BestLightNovel(Crawler):
@@ -61,6 +62,8 @@ def read_novel_info(self):
                 'url': self.absolute_url(a['href']),
             })
         # end for
+
+        self.get_response(change_bad_words_off)
     # end def
 
     def download_chapter_body(self, chapter):

diff --git a/lncrawl/sources/novelfull.py b/lncrawl/sources/novelfull.py
@@ -3,6 +3,7 @@
 import logging
 from concurrent import futures
 from ..utils.crawler import Crawler
+from bs4 import Comment
 
 logger = logging.getLogger(__name__)
 search_url = 'https://novelfull.com/search?keyword=%s'
@@ -122,14 +123,23 @@ def download_chapter_body(self, chapter):
         '''Download body of a single chapter and return as clean html format.'''
         logger.info('Downloading %s', chapter['url'])
         soup = self.get_soup(chapter['url'])
+
         content = soup.select_one('div#chapter-content')
-        for ads in content.findAll('div', {"align": 'left'}):
-            ads.decompose()
-        for ads in content.findAll('div', {"align": 'center'}):
-            ads.decompose()
-        for ads in content.select('.adsbygoogle, script, ins, .ads, .ads-holder'):
-            ads.decompose
-        # self.clean_contents(content)
+
+        # Removes junk text from chapters.
+        self.blacklist_patterns = [
+            r'^\s*Translator:',
+            r'^\s*Editor:',
+            r'^\s*Atlas Studios',
+            r'Read more chapter on NovelFull',
+            r'full thich ung',
+            r'If you find any errors \( broken links.*let us know < report chapter >',
+        ]
+
+        self.clean_contents(content)
+        for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'):
+            ads.extract()
+
         return str(content)
     # end def
 # end class
diff --git a/lncrawl/utils/crawler.py b/lncrawl/utils/crawler.py
@@ -253,7 +253,7 @@ def clean_contents(self, div):
             return div
         # end if
         div.attrs = {}
-        for tag in div.findAll(True):
+        for tag in div.find_all(True):
             if isinstance(tag, Comment):
                 tag.extract()   # Remove comments
             elif tag.name == 'br':

diff --git a/requirements.txt b/requirements.txt
@@ -18,3 +18,4 @@ lxml==4.5.1
 # Bot requirements
 discord.py>=1.3.4
 python-telegram-bot>=12.8
+pydrive==1.3.1