Skip to content

Commit

Permalink
Merge pull request #622 from dipu-bd/dev
Browse files Browse the repository at this point in the history
Version 2.23.3
  • Loading branch information
dipu-bd authored Oct 16, 2020
2 parents 5ce52ee + e9d04b2 commit ebdb811
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 12 deletions.
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ An app to download novels from online sources and generate e-books.
## Table of contents

- [Lightnovel Crawler ![pip package](https://pypi.org/project/lightnovel-crawler) [![download win](https://img.shields.io/badge/%E2%A7%AA-lncrawl.exe-red)](https://rebrand.ly/lncrawl) [![download linux](<https://img.shields.io/badge/%E2%A7%AD-lncrawl%20(linux)-brown>)](https://rebrand.ly/lncrawl-linux)](#lightnovel-crawler-img-srchttpsimgshieldsiobadgef09f93a6-pip-blue-altpip-package-img-srchttpsimgshieldsiobadgee2a7aa-lncrawlexe-red-altdownload-win-img-srchttpsimgshieldsiobadgee2a7ad-lncrawl20linux-brown-altdownload-linux)
- [Table of contents](#table-of-contents)
- [(A) Installation](#a-installation)
- [A1. Standalone Bundle (Windows, Linux)](#a1-standalone-bundle-windows-linux)
- [A2. PIP (Windows, Mac, and Linux)](#a2-pip-windows-mac-and-linux)
Expand Down Expand Up @@ -54,7 +52,7 @@ Without it, you will only get output in epub, text, and web formats.

### A1. Standalone Bundle (Windows, Linux)

**Windows**: [lightnovel-crawler v2.23.2 ~ 23MB](https://rebrand.ly/lncrawl)
**Windows**: [lightnovel-crawler v2.23.3 ~ 23MB](https://rebrand.ly/lncrawl)

> In Windows 8, 10 or later versions, it might say that `lncrawl.exe` is not safe to download or execute. You should bypass/ignore this security check to execute this program.
Expand Down
2 changes: 1 addition & 1 deletion lncrawl/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.23.2
2.23.3
3 changes: 3 additions & 0 deletions lncrawl/sources/bestlightnovel.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
logger = logging.getLogger(__name__)
search_url = 'https://bestlightnovel.com/getsearchstory'
novel_page_url = 'https://bestlightnovel.com/novel_%s'
change_bad_words_off = 'https://bestlightnovel.com/change_bad_words_off'


class BestLightNovel(Crawler):
Expand Down Expand Up @@ -61,6 +62,8 @@ def read_novel_info(self):
'url': self.absolute_url(a['href']),
})
# end for

self.get_response(change_bad_words_off)
# end def

def download_chapter_body(self, chapter):
Expand Down
24 changes: 17 additions & 7 deletions lncrawl/sources/novelfull.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
from concurrent import futures
from ..utils.crawler import Crawler
from bs4 import Comment

logger = logging.getLogger(__name__)
search_url = 'https://novelfull.com/search?keyword=%s'
Expand Down Expand Up @@ -122,14 +123,23 @@ def download_chapter_body(self, chapter):
'''Download body of a single chapter and return as clean html format.'''
logger.info('Downloading %s', chapter['url'])
soup = self.get_soup(chapter['url'])

content = soup.select_one('div#chapter-content')
for ads in content.findAll('div', {"align": 'left'}):
ads.decompose()
for ads in content.findAll('div', {"align": 'center'}):
ads.decompose()
for ads in content.select('.adsbygoogle, script, ins, .ads, .ads-holder'):
ads.decompose
# self.clean_contents(content)

# Removes junk text from chapters.
self.blacklist_patterns = [
r'^\s*Translator:',
r'^\s*Editor:',
r'^\s*Atlas Studios',
r'Read more chapter on NovelFull',
r'full thich ung',
r'If you find any errors \( broken links.*let us know < report chapter >',
]

self.clean_contents(content)
for ads in content.select('h3, h2, .adsbygoogle, script, ins, .ads, .ads-holder'):
ads.extract()

return str(content)
# end def
# end class
2 changes: 1 addition & 1 deletion lncrawl/utils/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def clean_contents(self, div):
return div
# end if
div.attrs = {}
for tag in div.findAll(True):
for tag in div.find_all(True):
if isinstance(tag, Comment):
tag.extract() # Remove comments
elif tag.name == 'br':
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ lxml==4.5.1
# Bot requirements
discord.py>=1.3.4
python-telegram-bot>=12.8
pydrive==1.3.1

0 comments on commit ebdb811

Please sign in to comment.