diff --git a/README.md b/README.md index f2faf82d6..14b5290e8 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ [![download linux](https://img.shields.io/badge/download-lncrawl_(linux)-brown?logo=linux)](https://rebrand.ly/lncrawl-linux) [![Discord](https://img.shields.io/discord/578550900231110656?logo=discord&label=discord)](https://discord.gg/wMECG2Q)
-[![GitHub Workflow Status](https://img.shields.io/github/workflow/status/dipu-bd/lightnovel-crawler/Build%20and%20test%20package?label=linux&logo=linux)](https://github.com/dipu-bd/lightnovel-crawler/actions/workflows/package.yml) -[![AppVeyor](https://img.shields.io/appveyor/build/dipu-bd/lightnovel-crawler?label=windows&logo=appveyor)](https://ci.appveyor.com/project/dipu-bd/lightnovel-crawler) +[![GitHub branch checks state](https://img.shields.io/github/checks-status/dipu-bd/lightnovel-crawler/master?logo=git)](https://github.com/dipu-bd/lightnovel-crawler/actions/workflows/package.yml) +[![AppVeyor](https://img.shields.io/appveyor/build/dipu-bd/lightnovel-crawler?logo=appveyor)](https://ci.appveyor.com/project/dipu-bd/lightnovel-crawler) [![Python version](https://img.shields.io/pypi/pyversions/lightnovel-crawler.svg)](https://pypi.org/project/lightnovel-crawler) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/dipu-bd/lightnovel-crawler/blob/master/LICENSE) [![GitHub stars](https://img.shields.io/github/stars/dipu-bd/lightnovel-crawler?logo=github)](https://github.com/dipu-bd/lightnovel-crawler) @@ -303,16 +303,17 @@ You are very welcome to contribute in this project. You can: > Request new one by [creating a new issue](https://github.com/dipu-bd/lightnovel-crawler/issues/new/choose). -
- Click to expand! - | Available Sources | Can Search | Can Login | Maintainer | | -------------------------------------------- | :--------: | :-------: | :-------------------: | +| http://boxnovel.cloud | ✔ | | @SirGryphin | | http://boxnovel.org | ✔ | | @dipu-bd | +| http://hs2ppe.co.uk | ✔ | | @SirGryphin | | http://liberspark.com | | | | | http://novelfull.com | ✔ | | @dipu-bd | +| http://novels.cloud | ✔ | | @SirGryphin | | http://tiknovel.com | | | | -| http://wspadancewichita.com | ✔ | | | +| http://wspadancewichita.com | ✔ | | @SirGryphin | +| http://wuxiaworld.cloud | ✔ | | @SirGryphin | | http://www.fujitranslation.com | | | | | http://www.hanyunovels.site | ✔ | | @SirGryphin | | http://www.machinenoveltranslation.com | | | | @@ -324,6 +325,7 @@ You are very welcome to contribute in this project. You can: | https://88tangeatdrinkread.wordpress.com | | | @SirGryphin | | https://9kqw.com | ✔ | | | | https://allnovel.org | ✔ | | @SirGryphin | +| https://amnesiactl.com | | | @SirGryphin | | https://anonanemone.wordpress.com | | | @SirGryphin | | https://arangscans.com | | | @SirGryphin | | https://asadatranslations.com | ✔ | | @SirGryphin | @@ -331,7 +333,9 @@ You are very welcome to contribute in this project. You can: | https://babelnovel.com | ✔ | ✔ | @dipu-bd | | https://bestlightnovel.com | ✔ | | | | https://book.qidian.com | | | | +| https://booknet.com | | | @dipu-bd, @SirGryphin | | https://boxnovel.com | ✔ | | @dipu-bd | +| https://boxnovel.online | ✔ | | @SirGryphin | | https://clicknovel.net | | | @SirGryphin | | https://creativenovels.com | | | | | https://crescentmoon.blog | | | | @@ -342,9 +346,9 @@ You are very welcome to contribute in this project. You can: | https://docln.net | ✔ | | @dipu-bd | | https://dsrealmtranslations.com | | | @SirGryphin | | https://es.mtlnovel.com | ✔ | | | -| https://exiledrebelsscanlations.com | ✔ | | @SirGryphin | +| https://exiledrebelsscanlations.com | | | @SirGryphin | | https://fanstranslations.com | | | @SirGryphin | -| https://fastnovel.net | | | @SirGryphin | +| https://fastnovel.net | ✔ | | @SirGryphin | | https://foxaholic.com | ✔ | | | | https://fr.mtlnovel.com | ✔ | | | | https://fujitranslation.com | | | @SirGryphin | @@ -361,13 +365,15 @@ You are very welcome to contribute in this project. You can: | https://kisslightnovels.info | ✔ | | | | https://lemontreetranslations.wordpress.com | | | @SirGryphin | | https://light-novel.online | ✔ | | | +| https://lightnovel.tv | ✔ | | @SirGryphin | | https://lightnovel.world | | | @SirGryphin | | https://lightnovelbastion.com | | | @dipu-bd | | https://lightnovelheaven.com | | | @SirGryphin | +| https://lightnovelkiss.com | ✔ | | @SirGryphin | | https://lightnovelsonl.com | ✔ | | @SirGryphin | | https://lightnovelstranslations.com | | | @SirGryphin | | https://listnovel.com | ✔ | | | -| https://litnet.com | ✔ | | | +| https://litnet.com | | | | | https://ln.hako.re | ✔ | | @dipu-bd | | https://lnmtl.com | | ✔ | @dipu-bd | | https://m.chinesefantasynovels.com | | | | @@ -382,19 +388,26 @@ You are very welcome to contribute in this project. 
You can: | https://morenovel.net | ✔ | | @SirGryphin | | https://myoniyonitranslations.com | | | | | https://mysticalmerries.com | ✔ | | @SirGryphin | +| https://newsite.kolnovel.com | ✔ | | @SirGryphin | | https://novel27.com | ✔ | | @SirGryphin | +| https://novelcake.com | ✔ | | @SirGryphin | | https://novelcrush.com | ✔ | | @SirGryphin | +| https://novelextra.com | ✔ | | @SirGryphin | | https://novelfull.com | ✔ | | @dipu-bd | +| https://novelfullplus.com | ✔ | | @dipu-bd, @SirGryphin | +| https://novelgate.net | ✔ | | @SirGryphin | | https://novelgo.id/ | | | | | https://novelonlinefree.com | ✔ | | @SirGryphin | | https://novelonlinefull.com | ✔ | | | | https://novelraw.blogspot.com | | | | | https://novels.pl | | | @dipu-bd | | https://novelsite.net | ✔ | | @SirGryphin | +| https://novelsonline.net | | | @SirGryphin | | https://novelsrock.com | | | | | https://noveltranslate.com | ✔ | | @SirGryphin | | https://noveltrench.com | ✔ | | @SirGryphin | | https://omgnovels.com | ✔ | | @SirGryphin | +| https://overabook.com | ✔ | | @SirGryphin | | https://ranobelib.me | | | | | https://readlightnovels.net | ✔ | | @PreownedFIN | | https://readwebnovels.net | ✔ | | @SirGryphin | @@ -407,6 +420,8 @@ You are very welcome to contribute in this project. You can: | https://supernovel.net | ✔ | | @SirGryphin | | https://toc.qidianunderground.org | ✔ | | @dipu-bd | | https://tomotranslations.com | | | | +| https://totallytranslations.com | | | @SirGryphin, @dipu-bd | +| https://tunovelaligera.com | ✔ | | @SirGryphin | | https://viewnovel.net | ✔ | | @SirGryphin | | https://vipnovel.com | ✔ | | @SirGryphin | | https://vistranslations.wordpress.com | | | @SirGryphin | @@ -415,11 +430,13 @@ You are very welcome to contribute in this project. You can: | https://webnovel.online | | | | | https://webnovelindonesia.com | | | | | https://webnovelonline.com | | | | +| https://wondernovels.com | ✔ | | @SirGryphin | | https://woopread.com | ✔ | | | | https://wordexcerpt.com | ✔ | | @dipu-bd, @SirGryphin | | https://wordexcerpt.org | | | @dipu-bd, @SirGryphin | | https://wuxiaworld.io | ✔ | | @SirGryphin | | https://wuxiaworld.live | ✔ | | @SirGryphin | +| https://wuxiaworld.name | ✔ | | @SirGryphin | | https://wuxiaworld.online | ✔ | | @dipu-bd | | https://wuxiaworld.site | | | @dipu-bd | | https://wuxiaworldsite.co | | | @dipu-bd | @@ -430,10 +447,12 @@ You are very welcome to contribute in this project. You can: | https://www.f-w-o.com | ✔ | | @SirGryphin | | https://www.flying-lines.com | | | @dipu-bd | | https://www.foxteller.com | ✔ | | @dipu-bd | +| https://www.freelightnovel.com | | | @SirGryphin | | https://www.fuyuneko.org | | | @SirGryphin | | https://www.idqidian.us | | | | | https://www.koreanmtl.online | | | @dipu-bd | | https://www.lightnovelpub.com | ✔ | | | +| https://www.lunarletters.com | | | @SirGryphin | | https://www.machine-translation.org | ✔ | | | | https://www.miraslation.net | | | @SirGryphin | | https://www.mtlnovel.com | ✔ | | | @@ -472,38 +491,33 @@ You are very welcome to contribute in this project. You can: | https://yukinovel.id | | | | | https://zinnovel.com | ✔ | | @SirGryphin | -
### Rejected sources -<details>
- Click to expand! - | Rejected Sources | Reason | | ------------------------------- | -------------------------------------------------------------------------------------------------- | | http://fullnovel.live | `403 - Forbidden: Access is denied` | | http://gravitytales.com | `Redirects to webnovel.com` | | http://moonbunnycafe.com | `Does not follow uniform format` | +| https://4scanlation.xyz | `Site moved` | | https://anythingnovel.com | `Site broken` | +| https://bestoflightnovels.com | `Site moved` | | https://chrysanthemumgarden.com | `Removed on request of the owner` [#649](https://github.com/dipu-bd/lightnovel-crawler/issues/649) | +| https://fsapk.com | `Site is not working` | | https://indomtl.com | `Does not like to be crawled` | | https://lnindo.org | `Does not like to be crawled` | | https://mtled-novels.com | `Domain is expired` | +| https://novelcrush.com | `Site is down` | | https://novelplanet.com | `Site is closed` | +| https://pery.info | `Site is down` | +| https://writerupdates.com | `Site is down` | +| https://www.centinni.com | `Site is down` | +| https://www.hotmtlnovel.xyz | `Cloudflare version 2 challenge` | | https://www.jieruihao.cn | `Unavailable` | | https://www.noveluniverse.com | `Site is down` | | https://www.novelupdates.com | `Does not host any novels` | | https://www.novelv.com | `Site is down` | | https://www.rebirth.online | `Site moved` | -| https://4scanlation.xyz | `Site moved` | -| https://pery.info | `Site is down` | -| https://writerupdates.com | `Site is down` | -| https://www.centinni.com | `Site is down` | -| https://fsapk.com | `Site is not working` | -| https://bestoflightnovels.com | `Site moved` | -| https://novelcrush.com | `Site is down` | - -
### Supported output formats diff --git a/lncrawl/VERSION b/lncrawl/VERSION index ed1d60050..3953e8ad5 100644 --- a/lncrawl/VERSION +++ b/lncrawl/VERSION @@ -1 +1 @@ -2.26.2 +2.26.3 diff --git a/lncrawl/core/novel_info.py b/lncrawl/core/novel_info.py index d14b2db3e..60829174d 100644 --- a/lncrawl/core/novel_info.py +++ b/lncrawl/core/novel_info.py @@ -4,14 +4,19 @@ """ import json import os +import re from .. import constants as C from ..utils.crawler import Crawler +def __format_title(text): + return re.sub(r'\s+', ' ', text).strip() +# end def + def format_novel(crawler: Crawler): - crawler.novel_title = crawler.novel_title.strip() - crawler.novel_author = crawler.novel_author.strip() + crawler.novel_title = __format_title(crawler.novel_title) + crawler.novel_author = __format_title(crawler.novel_author) # crawler.novel_title = crawler.cleanup_text(crawler.novel_title) # crawler.novel_author = crawler.cleanup_text(crawler.novel_author) format_volumes(crawler) @@ -29,6 +34,7 @@ def format_volumes(crawler: Crawler): if not ('title' in vol and vol['title']): vol['title'] = title # end if + vol['title'] = __format_title(vol['title']) # end for # end def @@ -39,6 +45,7 @@ def format_chapters(crawler: Crawler): if not ('title' in item and item['title']): item['title'] = title # end if + item['title'] = __format_title(item['title']) volume = [x for x in crawler.volumes if x['id'] == item['volume']] if len(volume) == 0: diff --git a/lncrawl/sources/1stkissnovel.py b/lncrawl/sources/1stkissnovel.py index b1579dac3..66ceee1e8 100644 --- a/lncrawl/sources/1stkissnovel.py +++ b/lncrawl/sources/1stkissnovel.py @@ -1,85 +1,106 @@ # -*- coding: utf-8 -*- import json import logging -from urllib.parse import quote_plus +import re +from urllib.parse import urlparse from ..utils.crawler import Crawler logger = logging.getLogger(__name__) -search_url = 'https://1stkissnovel.love/?s=%s&post_type=wp-manga&author=&artist=&release=' +search_url = ( + "https://1stkissnovel.love/?s=%s&post_type=wp-manga&author=&artist=&release=" +) +wp_admin_ajax_url = 'https://1stkissnovel.love/wp-admin/admin-ajax.php' class OneKissNovelCrawler(Crawler): base_url = 'https://1stkissnovel.love/' - # TODO: Error 503 Backend fetch failed - # def search_novel(self, query): - # query = quote_plus(query.lower()) - # soup = self.get_soup(search_url % query) - # - # results = [] - # for tab in soup.select('.c-tabs-item__content')[:20]: - # a = tab.select_one('.post-title h3 a') - # latest = tab.select_one('.latest-chap .chapter a').text - # votes = tab.select_one('.rating .total_votes').text - # results.append({ - # 'title': a.text.strip(), - # 'url': self.absolute_url(a['href']), - # 'info': '%s | Rating: %s' % (latest, votes), - # }) - # # end for - # - # return results - # # end def + def search_novel(self, query): + query = query.lower().replace(" ", "+") + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select(".c-tabs-item__content"): + a = tab.select_one(".post-title h3 a") + latest = tab.select_one(".latest-chap .chapter a").text + votes = tab.select_one(".rating .total_votes").text + results.append( + { + "title": a.text.strip(), + "url": self.absolute_url(a["href"]), + "info": "%s | Rating: %s" % (latest, votes), + } + ) + # end for + + return results + # end def def read_novel_info(self): - '''Get novel title, autor, cover etc''' - logger.debug('Visiting %s', self.novel_url) + logger.debug("Visiting %s", self.novel_url) soup = self.get_soup(self.novel_url) - possible_title = 
soup.select_one('.post-title h1') - for span in possible_title.select('span'): + possible_title = soup.select_one(".post-title h1") + for span in possible_title.select("span"): span.extract() # end for self.novel_title = possible_title.text.strip() - logger.info('Novel title: %s', self.novel_title) + logger.info("Novel title: %s", self.novel_title) - self.novel_cover = soup.select_one( - 'meta[property="og:image"]')['content'] - logger.info('Novel cover: %s', self.novel_cover) + self.novel_cover = self.absolute_url( + soup.select_one(".summary_image a img")["src"] + ) + logger.info("Novel cover: %s", self.novel_cover) - self.novel_author = ' '.join([ - a.text.strip() - for a in soup.select('.author-content a[href*="manga-author"]') - ]) - logger.info('%s', self.novel_author) + self.novel_author = " ".join( + [ + a.text.strip() + for a in soup.select('.author-content a[href*="manga-author"]') + ] + ) + logger.info("%s", self.novel_author) - volumes = set() - chapters = soup.select('ul.main li.wp-manga-chapter a') - for a in reversed(chapters): + self.novel_id = soup.select_one("#manga-chapters-holder")["data-id"] + logger.info("Novel id: %s", self.novel_id) + + # For getting cookies + # self.submit_form(wp_admin_ajax_url, data={ + # 'action': 'manga_views', + # 'manga': self.novel_id, + # }) + # print(self.cookies) + response = self.submit_form(wp_admin_ajax_url, data={ + 'action': 'manga_get_chapters', + 'manga': self.novel_id, + }) + soup = self.make_soup(response) + for a in reversed(soup.select(".wp-manga-chapter a")): chap_id = len(self.chapters) + 1 - vol_id = (chap_id - 1) // 100 + 1 - volumes.add(vol_id) - self.chapters.append({ - 'id': chap_id, - 'volume': vol_id, - 'url': self.absolute_url(a['href']), - 'title': a.text.strip() or ('Chapter %d' % chap_id), - }) + vol_id = 1 + len(self.chapters) // 100 + if chap_id % 100 == 1: + self.volumes.append({"id": vol_id}) + # end if + self.chapters.append( + { + "id": chap_id, + "volume": vol_id, + "title": a.text.strip(), + "url": self.absolute_url(a["href"]), + } + ) # end for - - self.volumes = [{'id': x} for x in volumes] # end def def download_chapter_body(self, chapter): - '''Download body of a single chapter and return as clean html format.''' - logger.info('Downloading %s', chapter['url']) - soup = self.get_soup(chapter['url']) + logger.info("Visiting %s", chapter["url"]) + soup = self.get_soup(chapter["url"]) contents = soup.select_one('div.text-left') for bad in contents.select('h3, .code-block, script, .adsbygoogle'): bad.decompose() + # end for body = self.extract_contents(contents) return '
<p>' + '</p><p>'.join(body) + '</p>
' # end def -# end class \ No newline at end of file +# end class diff --git a/lncrawl/sources/amnesiactl.py b/lncrawl/sources/amnesiactl.py new file mode 100644 index 000000000..ca5764019 --- /dev/null +++ b/lncrawl/sources/amnesiactl.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from urllib.parse import urlparse +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +#search_url = 'https://amnesiactl.com/?s=%s&post_type=wp-manga' +chapter_list_url = 'https://amnesiactl.com/wp-admin/admin-ajax.php' + + +class Amnesiactl(Crawler): + base_url = 'https://amnesiactl.com/' + + # NOTE: Site doesn't have proper search layout. + # def search_novel(self, query): + # query = query.lower().replace(' ', '+') + # soup = self.get_soup(search_url % query) + + # results = [] + # for tab in soup.select('.c-tabs-item__content'): + # a = tab.select_one('.post-title h3 a') + # latest = tab.select_one('.latest-chap .chapter a').text + # votes = tab.select_one('.rating .total_votes').text + # results.append({ + # 'title': a.text.strip(), + # 'url': self.absolute_url(a['href']), + # 'info': '%s | Rating: %s' % (latest, votes), + # }) + # # end for + + # return results + # # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + possible_title = soup.select_one('.post-title h1') + for span in possible_title.select('span'): + span.extract() + # end for + self.novel_title = possible_title.text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.summary_image a img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + self.novel_author = ' '.join([ + a.text.strip() + for a in soup.select('.author-content a[href*="novel-author"]') + ]) + logger.info('%s', self.novel_author) + + self.novel_id = soup.select_one('#manga-chapters-holder')['data-id'] + logger.info('Novel id: %s', self.novel_id) + + response = self.submit_form(chapter_list_url, data={ + 'action': 'manga_get_chapters', + 'manga': self.novel_id, + }) + soup = self.make_soup(response) + for a in reversed(soup.select(".wp-manga-chapter a")): + chap_id = len(self.chapters) + 1 + vol_id = 1 + len(self.chapters) // 100 + if chap_id % 100 == 1: + self.volumes.append({"id": vol_id}) + # end if + self.chapters.append( + { + "id": chap_id, + "volume": vol_id, + "title": a.text.strip(), + "url": self.absolute_url(a["href"]), + } + ) + # end for + + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + contents = soup.select('.reading-content p') + return ''.join([str(p) for p in contents]) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/bestlightnovel.py b/lncrawl/sources/bestlightnovel.py index 27648a18c..bbd761d55 100644 --- a/lncrawl/sources/bestlightnovel.py +++ b/lncrawl/sources/bestlightnovel.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- import logging -import re -from bs4 import BeautifulSoup from ..utils.crawler import Crawler logger = logging.getLogger(__name__) @@ -20,7 +18,7 @@ def search_novel(self, query): results = [] for novel in data: - titleSoup = BeautifulSoup(novel['name'], 'lxml') + titleSoup = self.make_soup(novel['name']) results.append({ 'title': titleSoup.body.text.title(), 'url': novel_page_url % 
novel['id_encode'], diff --git a/lncrawl/sources/booknet.py b/lncrawl/sources/booknet.py new file mode 100644 index 000000000..a184b7cbe --- /dev/null +++ b/lncrawl/sources/booknet.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +import logging +from concurrent.futures.thread import ThreadPoolExecutor +from urllib.parse import quote_plus + +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://booknet.com/en/search?q=%s' +get_chapter_url = 'https://booknet.com/reader/get-page' + + +class LitnetCrawler(Crawler): + base_url = [ + 'https://litnet.com/', + 'https://booknet.com/', + ] + + def initialize(self): + self.home_url = 'https://booknet.com/' + self.executor = ThreadPoolExecutor(1) + # end def + + def search_novel(self, query): + query = quote_plus(query.lower()) + soup = self.get_soup(search_url % query) + + results = [] + for div in soup.select('.book-item'): + a = div.select_one('.book-title a') + author = div.select_one('.author-wr a.author').text.strip() + views = div.select_one('span.count-views').text.strip() + favourites = div.select_one('span.count-favourites').text.strip() + results.append({ + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + 'info': 'Author: %s | %s views | %s favorites' % (author, views, favourites) + }) + # end for + + return results + # end def + + def read_novel_info(self): + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + self.csrf_token = soup.select_one('meta[name="csrf-token"]')['content'] + self.csrf_param = soup.select_one('meta[name="csrf-param"]')['content'] + logger.info('%s: %s', self.csrf_param, self.csrf_token) + + self.novel_title = soup.select_one('h1.roboto').text.strip() + logger.info('Novel title: %s', self.novel_title) + + img_src = soup.select_one('.book-view-cover img') + if not img_src: + img_src = soup.select_one('.book-cover img') + # end if + if img_src: + self.novel_cover = self.absolute_url(img_src['src']) + # end if + logger.info('Novel cover: %s', self.novel_cover) + + author = soup.select_one('.book-view-info a.author') + if not author: + author = soup.select_one('.book-head-content a.book-autor') + # end if + if author: + self.novel_author = author.text.strip() + # end if + logger.info('Novel author: %s', self.novel_author) + + chapters = soup.find('select', {'name': 'chapter'}) + if chapters is None: + chapters = soup.select('.collapsible-body a.collection-item') + else: + chapters = chapters.find_all('option') + chapters = [a for a in chapters if a.attrs['value']] + # end if + + volumes = set([]) + for a in chapters: + chap_id = len(self.chapters) + 1 + vol_id = len(self.chapters) // 100 + 1 + volumes.add(vol_id) + + abs_url = self.last_visited_url.replace('/en/book/', '/en/reader/') + chap_url = abs_url + ('?c=%s' % a.attrs['value']) + self.chapters.append({ + 'id': chap_id, + 'volume': 1, + 'url': chap_url, + 'chapter_id': a.attrs['value'], + }) + # end for + + self.volumes = [{'id': x} for x in volumes] + # end def + + def download_chapter_body(self, chapter): + data = self._get_chapter_page(chapter) + chapter['title'] = data['chapterTitle'] + content = data['data'] + + for page in range(2, data['totalPages'] + 1): + data = self._get_chapter_page(chapter, page) + content += data['data'] + # end for + + return content + # end def + + def _get_chapter_page(self, chapter, page=1): + return self.post_json(get_chapter_url, data={ + 'chapterId': int(chapter['chapter_id']), + 'page': page, + self.csrf_param: self.csrf_token + }, 
headers={ + 'X-CSRF-Token': self.csrf_token, + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + }) + # end def + +# end class diff --git a/lncrawl/sources/boxnovelcloud.py b/lncrawl/sources/boxnovelcloud.py new file mode 100644 index 000000000..f686d7196 --- /dev/null +++ b/lncrawl/sources/boxnovelcloud.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +import logging +import re +from concurrent import futures +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'http://boxnovel.cloud/search?keyword=%s' + + +class BoxNovelCloud(Crawler): + base_url = 'http://boxnovel.cloud/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.col-novel-main .list-novel .row'): + search_title = tab.select_one('.novel-title a') + latest = tab.select_one('.text-info a').text.strip() + results.append({ + 'title': search_title.text.strip(), + 'url': self.absolute_url( + tab.select_one('.novel-title a')['href']), + 'info': 'Latest chapter: %s' % (latest) + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + self.novel_title = ' '.join([ + str(x) + for x in soup.select_one('.title').contents + if not x.name + ]).strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.book img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + author = soup.find_all(href=re.compile('author')) + if len(author) == 2: + self.novel_author = author[0].text + ' (' + author[1].text + ')' + else: + self.novel_author = author[0].text + logger.info('Novel author: %s', self.novel_author) + + # This is copied from the Novelfull pagination 'hanlder' with minor tweaks + pagination_links = soup.select('.pagination li a') + pagination_page_numbers = [] + for pagination_link in pagination_links: + # Boxnovel.org pagination numbering boxes contain non-digit characters + if pagination_link.text.isdigit(): + pagination_page_numbers.append(int(pagination_link.text)) + + page_count = max( + pagination_page_numbers) if pagination_page_numbers else 0 + logger.info('Chapter list pages: %d' % page_count) + + logger.info('Getting chapters...') + futures_to_check = { + self.executor.submit( + self.download_chapter_list, + i + 1, + ): str(i) + for i in range(page_count + 1) + } + [x.result() for x in futures.as_completed(futures_to_check)] + + # Didn't test without this, but with pagination the chapters could be in different orders + logger.info('Sorting chapters...') + self.chapters.sort(key=lambda x: x['volume'] * 1000 + x['id']) + + # Copied straight from Novelfull + logger.info('Adding volumes...') + mini = self.chapters[0]['volume'] + maxi = self.chapters[-1]['volume'] + for i in range(mini, maxi + 1): + self.volumes.append({'id': i}) + # end for + # end def + + def download_chapter_list(self, page): + '''Download list of chapters and volumes.''' + url = self.novel_url.split('?')[0].strip('/') + url += '?page=%d&per-page=50' % page + soup = self.get_soup(url) + + for a in soup.select('ul.list-chapter li a'): + title = a['title'].strip() + + chapter_id = len(self.chapters) + 1 + # match = re.findall(r'ch(apter)? 
(\d+)', title, re.IGNORECASE) + # if len(match) == 1: + # chapter_id = int(match[0][1]) + # # end if + + volume_id = 1 + (chapter_id - 1) // 100 + match = re.findall(r'(book|vol|volume) (\d+)', + title, re.IGNORECASE) + if len(match) == 1: + volume_id = int(match[0][1]) + # end if + + data = { + 'title': title, + 'id': chapter_id, + 'volume': volume_id, + 'url': self.absolute_url(a['href']), + } + self.chapters.append(data) + # end for + # end def + + def download_chapter_body(self, chapter): # + # NOTE: Set `chapter['body_lock'] = True` to disable post-formatting. + # It can be useful in non-english sources, e.g. aixdzs, qidiancom, tiknovel + # + # Return an empty body if anything goes wrong. But you should not return `None`. + '''Download body of a single chapter and return as clean html format.''' + logger.info('Downloading %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + contents = soup.select_one('div.chr-c, #chr-content') + for br in contents.select('br'): + br.decompose() + # end for + + return str(contents) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/boxnovelcom.py b/lncrawl/sources/boxnovelcom.py index 4bb36d3a2..15daa11b9 100644 --- a/lncrawl/sources/boxnovelcom.py +++ b/lncrawl/sources/boxnovelcom.py @@ -13,24 +13,25 @@ class BoxNovelComCrawler(Crawler): base_url = 'https://www.box-novel.com/' - def search_novel(self, query): - query = query.lower().replace(' ', '+') - soup = self.get_soup(search_url % query) + # NOTE: Disabled because it is taking too long + # def search_novel(self, query): + # query = query.lower().replace(' ', '+') + # soup = self.get_soup(search_url % query) - results = [] - for tab in soup.select('.c-tabs-item__content'): - a = tab.select_one('.post-title h3 a') - latest = tab.select_one('.latest-chap .chapter a').text - votes = tab.select_one('.rating .total_votes').text - results.append({ - 'title': a.text.strip(), - 'url': self.absolute_url(a['href']), - 'info': '%s | Rating: %s' % (latest, votes), - }) - # end for + # results = [] + # for tab in soup.select('.c-tabs-item__content'): + # a = tab.select_one('.post-title h3 a') + # latest = tab.select_one('.latest-chap .chapter a').text + # votes = tab.select_one('.rating .total_votes').text + # results.append({ + # 'title': a.text.strip(), + # 'url': self.absolute_url(a['href']), + # 'info': '%s | Rating: %s' % (latest, votes), + # }) + # # end for - return results - # end def + # return results + # # end def def read_novel_info(self): '''Get novel title, autor, cover etc''' diff --git a/lncrawl/sources/boxnovelonline.py b/lncrawl/sources/boxnovelonline.py new file mode 100644 index 000000000..05d7c2d2d --- /dev/null +++ b/lncrawl/sources/boxnovelonline.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://boxnovel.online/?s=%s&post_type=wp-manga&author=&artist=&release=' + + +class BoxNovelOnline(Crawler): + base_url = 'https://boxnovel.online/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.c-tabs-item__content'): + a = tab.select_one('.post-title h3 a') + latest = tab.select_one('.latest-chap .chapter a').text + results.append({ + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + 'info': '%s' % (latest), + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get 
novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + self.novel_title = ' '.join([ + str(x) + for x in soup.select_one('.post-title h1').contents + if not x.name + ]).strip() + logger.info('Novel title: %s', self.novel_title) + + probable_img = soup.select_one('.summary_image img') + if probable_img: + self.novel_cover = self.absolute_url(probable_img['data-src']) + logger.info('Novel cover: %s', self.novel_cover) + + author = soup.select('.author-content a') + if len(author) == 2: + self.novel_author = author[0].text + ' (' + author[1].text + ')' + else: + self.novel_author = author[0].text + logger.info('Novel author: %s', self.novel_author) + + volumes = set() + chapters = soup.select('ul.main li.wp-manga-chapter a') + for a in reversed(chapters): + chap_id = len(self.chapters) + 1 + vol_id = (chap_id - 1) // 100 + 1 + volumes.add(vol_id) + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'url': self.absolute_url(a['href']), + 'title': a.text.strip() or ('Chapter %d' % chap_id), + }) + # end for + + self.volumes = [{'id': x} for x in volumes] + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Downloading %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + contents = soup.select_one('div.text-left') + for bad in contents.select('h3, .code-block, script, .adsbygoogle'): + bad.decompose() + + return str(contents) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/exiledrebels.py b/lncrawl/sources/exiledrebels.py index df9f3e95d..7f4502500 100644 --- a/lncrawl/sources/exiledrebels.py +++ b/lncrawl/sources/exiledrebels.py @@ -6,40 +6,20 @@ from ..utils.crawler import Crawler logger = logging.getLogger(__name__) -search_url = 'https://exiledrebelsscanlations.com/?s=%s' class ExiledRebelsScanlations(Crawler): base_url = 'https://exiledrebelsscanlations.com/' - def search_novel(self, query): - query = query.lower().replace(' ', '+') - soup = self.get_soup(search_url % query) - - results = [] - for tab in soup.select('article.type-page'): - a = tab.select_one('h2.entry-title a') - latest = "N/A" - votes = "0" - results.append({ - 'title': a.text.strip(), - 'url': self.absolute_url(a['href']), - 'info': '%s | Rating: %s' % (latest, votes), - }) - # end for - - return results - # end def - def read_novel_info(self): '''Get novel title, autor, cover etc''' logger.debug('Visiting %s', self.novel_url) soup = self.get_soup(self.novel_url) - self.novel_title = soup.find("h1", {"class": "page-title"}).text.strip() + self.novel_title = soup.find("h1", {"class": "entry-title"}).text.strip() logger.info('Novel title: %s', self.novel_title) self.novel_cover = self.absolute_url( - soup.select_one('div.entry-content img')['data-orig-file']) + soup.select_one('.post-thumbnail img')['src']) logger.info('Novel cover: %s', self.novel_cover) self.novel_author = "Translated by ExR" diff --git a/lncrawl/sources/fastnovel.py b/lncrawl/sources/fastnovel.py index 8e81fe47f..e9f4baad7 100644 --- a/lncrawl/sources/fastnovel.py +++ b/lncrawl/sources/fastnovel.py @@ -5,11 +5,29 @@ from ..utils.crawler import Crawler logger = logging.getLogger(__name__) - +search_url = 'http://fastnovel.net/search/%s' class FastNovel(Crawler): base_url = 'http://fastnovel.net/' + def search_novel(self, query): + query = query.lower().replace(' ', '%20') + soup = self.get_soup(search_url % query) + + results = [] + for tab 
in soup.select('.film-item'): + a = tab.select_one('a') + latest = tab.select_one('label.current-status span.process').text + results.append({ + 'title': a['title'], + 'url': self.absolute_url(a['href']), + 'info': '%s' % (latest), + }) + # end for + + return results + # end def + def read_novel_info(self): '''Get novel title, autor, cover etc''' logger.debug('Visiting %s', self.novel_url) diff --git a/lncrawl/sources/foxteller.py b/lncrawl/sources/foxteller.py index be1d25360..4c11e8764 100644 --- a/lncrawl/sources/foxteller.py +++ b/lncrawl/sources/foxteller.py @@ -15,23 +15,24 @@ class FoxtellerCrawler(Crawler): base_url = 'https://www.foxteller.com/' - def search_novel(self, query): - self.get_response(self.home_url) # for cookies - - query = query.lower().replace(' ', '+') - soup = self.post_soup(search_url, data=dict(query=query)) - - results = [] - for a in soup.select('a[href*="/novel/"]'): - results.append({ - 'title': a.select_one('span .ellipsis-1').text.strip(), - 'url': self.absolute_url(a['href']), - 'info': a.select_one('span .text-brand').text.strip(), - }) - # end for - - return results - # end def + # NOTE: Disabled because it takes too long + # def search_novel(self, query): + # self.get_response(self.home_url) # for cookies + + # query = query.lower().replace(' ', '+') + # soup = self.post_soup(search_url, data=dict(query=query)) + + # results = [] + # for a in soup.select('a[href*="/novel/"]'): + # results.append({ + # 'title': a.select_one('span .ellipsis-1').text.strip(), + # 'url': self.absolute_url(a['href']), + # 'info': a.select_one('span .text-brand').text.strip(), + # }) + # # end for + + # return results + # # end def def read_novel_info(self): '''Get novel title, autor, cover etc''' diff --git a/lncrawl/sources/freelightnovel.py b/lncrawl/sources/freelightnovel.py new file mode 100644 index 000000000..d2358aa4d --- /dev/null +++ b/lncrawl/sources/freelightnovel.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +import logging +import re + +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) + + +class FreeLightNovel(Crawler): + base_url = 'https://www.freelightnovel.com/' + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + self.novel_title = soup.select_one('h1.page-header').text + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.content img.img-responsive')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + self.volumes.append({'id': 1}) + for a in soup.select('.book-toc .dropdown-menu li.leaf a'): + title = a.text.strip() + + chap_id = len(self.chapters) + 1 + match = re.findall(r'ch(apter)? 
(\d+)', title, re.IGNORECASE) + if len(match) == 1: + chap_id = int(match[0][1]) + # end if + + self.chapters.append({ + 'volume': 1, + 'id': chap_id, + 'title': title, + 'url': self.absolute_url(a['href']), + }) + # end for + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.debug('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + content = soup.select_one('.content') + self.clean_contents(content) + + return ''.join([ + str(p) for p in content.select('p') + if len(p.text.strip()) > 1 + ]) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/fsapk.py b/lncrawl/sources/fsapk.py index f6f48a90e..5df4faa7a 100644 --- a/lncrawl/sources/fsapk.py +++ b/lncrawl/sources/fsapk.py @@ -14,27 +14,9 @@ class BestofLightNovels(Crawler): 'https://bestoflightnovels.com/', ] - # def search_novel(self, query): - # query = quote_plus(query.lower()) - # soup = self.get_soup(search_url % query) - - # results = [] - # for tab in soup.select('.c-tabs-item__content'): - # a = tab.select_one('.post-title h3 a') - # latest = tab.select_one('.latest-chap .chapter a').text - # votes = tab.select_one('.rating .total_votes').text - # results.append({ - # 'title': a.text.strip(), - # 'url': self.absolute_url(a['href']), - # 'info': '%s | Rating: %s' % (latest, votes), - # }) - # # end for - - # return results - # # end def - def initialize(self): self.home_url = 'https://fsapk.com/' + # end def def read_novel_info(self): '''Get novel title, autor, cover etc''' diff --git a/lncrawl/sources/hs2ppe.py b/lncrawl/sources/hs2ppe.py new file mode 100644 index 000000000..01f163ef3 --- /dev/null +++ b/lncrawl/sources/hs2ppe.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from ..utils.crawler import Crawler +from ..utils.cleaner import cleanup_text + +logger = logging.getLogger(__name__) +search_url = 'http://hs2ppe.co.uk/search?keyword=%s' +full_chapter_url = 'http://hs2ppe.co.uk/ajax/chapter-archive?novelId=%s' + + +class wspadancewichita(Crawler): + base_url = 'http://hs2ppe.co.uk/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for result in soup.select('div.col-novel-main div.list.list-novel div.row')[:5]: + url = self.absolute_url( + result.select_one('h3.novel-title a')['href']) + title = result.select_one('h3.novel-title a')['title'] + last_chapter = result.select_one('span.chr-text').text.strip() + results.append({ + 'url': url, + 'title': title, + 'info': 'Latest: %s' % last_chapter, + }) + # end for + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url + '?waring=1') + + self.novel_title = soup.select_one('h3.title').text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('div.book img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + author = [] + for a in soup.select('ul.info.info-meta li')[1].select('a'): + author.append(a.text.strip()) + # end for + + self.novel_author = ", ".join(author) + + logger.info('Novel author: %s', self.novel_author) + + novel_id = soup.select_one('div#rating')['data-novel-id'] + + chapter_url = full_chapter_url % novel_id + logger.debug('Visiting %s', chapter_url) + + chapter_soup = self.get_soup(chapter_url) 
+ chapters = chapter_soup.select('li a') + for a in chapters: + for span in a.findAll('span'): + span.decompose() + # end for + # end for + + for x in chapters: + chap_id = len(self.chapters) + 1 + if len(self.chapters) % 100 == 0: + vol_id = chap_id//100 + 1 + vol_title = 'Volume ' + str(vol_id) + self.volumes.append({ + 'id': vol_id, + 'title': vol_title, + }) + # end if + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'url': self.absolute_url(x['href']), + 'title': x['title'] or ('Chapter %d' % chap_id), + }) + # end for + # end def + + @cleanup_text + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Downloading %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + content = soup.select('#chr-content p') + if not content: + return '' + # end if + + return "".join(map(str, content)) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/idmtlnovel.py b/lncrawl/sources/idmtlnovel.py index af1f178b5..b1bd15db1 100644 --- a/lncrawl/sources/idmtlnovel.py +++ b/lncrawl/sources/idmtlnovel.py @@ -2,6 +2,7 @@ import json import logging import re +from urllib.parse import quote from ..utils.crawler import Crawler logger = logging.getLogger(__name__) @@ -12,9 +13,7 @@ class IdMtlnovelCrawler(Crawler): base_url = 'https://id.mtlnovel.com/' def search_novel(self, query): - query = query.lower().replace(' ', '%20') - #soup = self.get_soup(search_url % query) - + query = quote(query.lower()) list_url = search_url % query data = self.get_json(list_url)['items'][0]['results'] @@ -24,28 +23,12 @@ def search_novel(self, query): results.append({ 'url': url, 'title': re.sub(r'', '', item['title']), - 'info': self.search_novel_info(url), }) # end for return results # end def - def search_novel_info(self, url): - '''Get novel title, autor, cover etc''' - logger.debug('Visiting %s', url) - soup = self.get_soup(url) - - chapters = soup.select( - 'div.info-wrap div')[1].text.replace('Chapters', '') - info = '%s chapters' % chapters - # if len(chapters) > 0: - # info += ' | Latest: %s' % chapters[-1].text.strip() - # end if - - return info - # end def - def read_novel_info(self): '''Get novel title, autor, cover etc''' logger.debug('Visiting %s', self.novel_url) diff --git a/lncrawl/sources/instadoses.py b/lncrawl/sources/instadoses.py index ae5c6dd27..2d495fb7f 100644 --- a/lncrawl/sources/instadoses.py +++ b/lncrawl/sources/instadoses.py @@ -19,26 +19,6 @@ def initialize(self): self.executor = ThreadPoolExecutor(max_workers=7) # end def - # NOTE: Site search doesn't work. So this won't work. 
- # def search_novel(self, query): - # query = query.lower().replace(' ', '+') - # soup = self.get_soup(search_url % query) - - # results = [] - # for tab in soup.select('.c-tabs-item__content'): - # a = tab.select_one('.post-title h3 a') - # latest = tab.select_one('.latest-chap .chapter a').text - # votes = tab.select_one('.rating .total_votes').text - # results.append({ - # 'title': a.text.strip(), - # 'url': self.absolute_url(a['href']), - # 'info': '%s | Rating: %s' % (latest, votes), - # }) - # # end for - - # return results - # # end def - def read_novel_info(self): logger.debug('Visiting %s', self.novel_url) soup = self.get_soup(self.novel_url) diff --git a/lncrawl/sources/kolnovelnewsite.py b/lncrawl/sources/kolnovelnewsite.py new file mode 100644 index 000000000..cb0ca5ede --- /dev/null +++ b/lncrawl/sources/kolnovelnewsite.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from urllib.parse import urlparse +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://newsite.kolnovel.com/?s=%s&post_type=wp-manga' +chapter_list_url = 'https://newsite.kolnovel.com/wp-admin/admin-ajax.php' + + +class kolnovelnewsite(Crawler): + base_url = 'https://newsite.kolnovel.com/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.c-tabs-item__content'): + a = tab.select_one('.post-title h3 a') + latest = tab.select_one('.latest-chap .chapter a').text + votes = tab.select_one('.rating .total_votes').text + results.append({ + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + 'info': '%s | Rating: %s' % (latest, votes), + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + possible_title = soup.select_one('.post-title h1') + for span in possible_title.select('span'): + span.extract() + # end for + self.novel_title = possible_title.text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.summary_image a img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + self.novel_author = ' '.join([ + a.text.strip() + for a in soup.select('.author-content a[href*="manga-author"]') + ]) + logger.info('%s', self.novel_author) + + self.novel_id = soup.select_one('#manga-chapters-holder')['data-id'] + logger.info('Novel id: %s', self.novel_id) + + response = self.submit_form(chapter_list_url, data={ + 'action': 'manga_get_chapters', + 'manga': self.novel_id, + }) + soup = self.make_soup(response) + for a in reversed(soup.select(".wp-manga-chapter a")): + chap_id = len(self.chapters) + 1 + vol_id = 1 + len(self.chapters) // 100 + if chap_id % 100 == 1: + self.volumes.append({"id": vol_id}) + # end if + self.chapters.append( + { + "id": chap_id, + "volume": vol_id, + "title": a.text.strip(), + "url": self.absolute_url(a["href"]), + } + ) + # end for + + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + contents = soup.select('.reading-content p') + return ''.join([str(p) for p in contents]) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/lightnovelkiss.py 
b/lncrawl/sources/lightnovelkiss.py new file mode 100644 index 000000000..ad5804dcb --- /dev/null +++ b/lncrawl/sources/lightnovelkiss.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from urllib.parse import urlparse +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://lightnovelkiss.com/?s=%s&post_type=wp-manga&author=&artist=&release=' +chapter_list_url = 'https://lightnovelkiss.com/wp-admin/admin-ajax.php' + + +class LightNovelKiss(Crawler): + base_url = 'https://lightnovelkiss.com/' + + # NOTE: Disabled because it takes unusually long time + # def search_novel(self, query): + # query = query.lower().replace(' ', '+') + # soup = self.get_soup(search_url % query) + + # results = [] + # for tab in soup.select('.c-tabs-item__content'): + # a = tab.select_one('.post-title h3 a') + # latest = tab.select_one('.latest-chap .chapter a').text + # votes = tab.select_one('.rating .total_votes').text + # results.append({ + # 'title': a.text.strip(), + # 'url': self.absolute_url(a['href']), + # 'info': '%s | Rating: %s' % (latest, votes), + # }) + # # end for + + # return results + # # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + # NOTE: Title has "Novel" at the end of book title, can't seem to remove it. + possible_title = soup.select_one('.post-title h1') + for span in possible_title.select('span'): + span.extract() + # end for + self.novel_title = possible_title.text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.summary_image a img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + self.novel_author = ' '.join([ + a.text.strip() + for a in soup.select('.author-content a[href*="author"]') + ]) + logger.info('%s', self.novel_author) + + self.novel_id = soup.select_one('#manga-chapters-holder')['data-id'] + logger.info('Novel id: %s', self.novel_id) + + response = self.submit_form( + chapter_list_url, data='action=manga_get_chapters&manga=' + self.novel_id) + soup = self.make_soup(response) + for a in reversed(soup.select('.wp-manga-chapter a')): + chap_id = len(self.chapters) + 1 + vol_id = 1 + len(self.chapters) // 100 + if chap_id % 100 == 1: + self.volumes.append({'id': vol_id}) + # end if + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + }) + # end for + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + contents = soup.select('.reading-content p') + return ''.join([str(p) for p in contents]) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/lightnoveltv.py b/lncrawl/sources/lightnoveltv.py new file mode 100644 index 000000000..84e64eb32 --- /dev/null +++ b/lncrawl/sources/lightnoveltv.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from urllib.parse import urlparse +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://lightnovel.tv/?s=%s&post_type=wp-manga&author=&artist=&release=' +chapter_list_url = 'https://lightnovel.tv/wp-admin/admin-ajax.php' + + +class LightNovelTV(Crawler): + base_url = 'https://lightnovel.tv/' + + def search_novel(self, 
query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.c-tabs-item__content'): + a = tab.select_one('.post-title h3 a') + latest = tab.select_one('.latest-chap .chapter a').text + votes = tab.select_one('.rating .total_votes').text + results.append({ + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + 'info': '%s | Rating: %s' % (latest, votes), + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + possible_title = soup.select_one('.post-title h1') + for span in possible_title.select('span'): + span.extract() + # end for + self.novel_title = possible_title.text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.summary_image a img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + self.novel_author = ' '.join([ + a.text.strip() + for a in soup.select('.author-content a[href*="novel-author"]') + ]) + logger.info('%s', self.novel_author) + + self.novel_id = soup.select_one('#manga-chapters-holder')['data-id'] + logger.info('Novel id: %s', self.novel_id) + + response = self.submit_form( + chapter_list_url, data='action=manga_get_chapters&manga=' + self.novel_id) + soup = self.make_soup(response) + for a in reversed(soup.select('.wp-manga-chapter a')): + chap_id = len(self.chapters) + 1 + vol_id = 1 + len(self.chapters) // 100 + if chap_id % 100 == 1: + self.volumes.append({'id': vol_id}) + # end if + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + }) + # end for + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + contents = soup.select('.reading-content p') + return ''.join([str(p) for p in contents]) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/litnet.py b/lncrawl/sources/litnet.py deleted file mode 100644 index 1ac0aa2ea..000000000 --- a/lncrawl/sources/litnet.py +++ /dev/null @@ -1,95 +0,0 @@ -# -*- coding: utf-8 -*- -import logging -from ..utils.crawler import Crawler - -logger = logging.getLogger(__name__) -search_url = 'https://litnet.com/en/search?q=%s' - - -class LitnetCrawler(Crawler): - base_url = 'https://litnet.com/' - - def search_novel(self, query): - query = query.lower().replace(' ', '+') - soup = self.get_soup(search_url % query) - - results = [] - for a in soup.select('div.l-container ul a'): - results.append({ - 'title': a.text.strip(), - 'url': self.absolute_url(a['href']), - }) - # end for - - return results - # end def - - def read_novel_info(self): - '''Get novel title, autor, cover etc''' - logger.debug('Visiting %s', self.novel_url) - soup = self.get_soup(self.novel_url) - - self.novel_title = soup.select_one('h1').text.strip() - logger.info('Novel title: %s', self.novel_title) - - img_src = soup.select_one('div.book-view-cover img') - if not img_src: - img_src = soup.select_one('div.book-cover img') - # end if - if img_src: - self.novel_cover = self.absolute_url(img_src['src']) - # end if - logger.info('Novel cover: %s', self.novel_cover) - - author = soup.select_one('div.book-view-info a.author') - if not author: - author = 
soup.select_one('div.book-head-content a.book-autor') - # end if - if author: - self.novel_author = author.text.strip() - # end if - logger.info('Novel author: %s', self.novel_author) - - chapters = soup.find('select', {'name': 'chapter'}) - if chapters is None: - chapters = soup.select('div.collapsible-body a.collection-item') - else: - chapters = chapters.find_all('option') - chapters = [c for c in chapters if c.attrs['value']] - # end if - - for a in chapters: - chap_id = len(self.chapters) + 1 - if len(self.chapters) % 100 == 0: - vol_id = chap_id//100 + 1 - vol_title = 'Volume ' + str(vol_id) - self.volumes.append({ - 'id': vol_id, - 'title': vol_title, - }) - # end if - - abs_url = self.last_visited_url.replace('book', 'reader') - chap_url = abs_url + \ - ('?c=%s' % a.attrs['value']) if a.has_attr( - 'value') else self.home_url + a['href'] - self.chapters.append({ - 'id': chap_id, - 'volume': 1, - 'url': chap_url, - 'title': a.text.strip() or ('Chapter %d' % chap_id), - }) - # end for - # end def - - def download_chapter_body(self, chapter): - '''Download body of a single chapter and return as clean html format.''' - logger.info('Downloading %s', chapter['url']) - soup = self.get_soup(chapter['url']) - - contents = soup.select_one('div.reader-text') - if contents is None: - contents = soup.select_one('div.demo-txt') - return str(contents) - # end def -# end class diff --git a/lncrawl/sources/lnmtl.py b/lncrawl/sources/lnmtl.py index b99f83e9b..382fc4769 100644 --- a/lncrawl/sources/lnmtl.py +++ b/lncrawl/sources/lnmtl.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -import json import logging import re from concurrent import futures @@ -44,7 +43,6 @@ def login(self, email, password): logger.debug('-' * 80) logger.error('Failed to login') # end if - # end def def logout(self): @@ -56,7 +54,6 @@ def logout(self): else: print('Logged out') # end if - # end def def read_novel_info(self): @@ -82,7 +79,6 @@ def read_novel_info(self): logger.info('Getting chapters...') self.download_chapter_list() - # end def def parse_volume_list(self, soup): @@ -108,7 +104,6 @@ def parse_volume_list(self, soup): if len(self.volumes) == 0: raise Exception('Failed parsing volume list') # end if - # end def def download_chapter_list(self): @@ -131,7 +126,6 @@ def download_chapter_list(self): self.chapters.append(chap) # end for # end for - # end def def download_chapters_per_volume(self, volume, page=1): @@ -160,7 +154,6 @@ def download_chapters_per_volume(self, volume, page=1): chapters += self.download_chapters_per_volume(volume, page) # end for return volume['id'], chapters - # end def def download_chapter_body(self, chapter): @@ -170,7 +163,6 @@ def download_chapter_body(self, chapter): body = [self.format_text(x.text) for x in body if x] body = '\n'.join(['
<p>%s</p>
' % (x) for x in body if len(x)]) return body.strip() - # end def def format_text(self, text): @@ -180,8 +172,6 @@ def format_text(self, text): text = re.sub(r'\u201d[, ]*', '”', text) text = re.sub(r'[ ]*,[ ]+', ', ', text) return text.strip() - # end def - # end class diff --git a/lncrawl/sources/lunarletters.py b/lncrawl/sources/lunarletters.py new file mode 100644 index 000000000..ce11a475c --- /dev/null +++ b/lncrawl/sources/lunarletters.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +import logging +from requests import Session +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +chapter_list_url = 'https://www.lunarletters.com/wp-admin/admin-ajax.php' + + +class LunarLetters(Crawler): + base_url = 'https://www.lunarletters.com/' + + def initialize(self): + self.scraper = Session() + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + possible_title = soup.select_one('.post-title h1') + for span in possible_title.select('span'): + span.extract() + # end for + self.novel_title = possible_title.text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.summary_image a img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + self.novel_author = ' '.join([ + a.text.strip() + for a in soup.select('.author-content a[href*="series-author"]') + ]) + logger.info('%s', self.novel_author) + + self.novel_id = soup.select_one('#manga-chapters-holder')['data-id'] + logger.info('Novel id: %s', self.novel_id) + + response = self.submit_form(chapter_list_url, data={ + 'action': 'manga_get_chapters', + 'manga': self.novel_id, + }) + soup = self.make_soup(response) + for a in reversed(soup.select(".wp-manga-chapter a")): + chap_id = len(self.chapters) + 1 + vol_id = 1 + len(self.chapters) // 100 + if chap_id % 100 == 1: + self.volumes.append({"id": vol_id}) + # end if + self.chapters.append( + { + "id": chap_id, + "volume": vol_id, + "title": a.text.strip(), + "url": self.absolute_url(a["href"]), + } + ) + # end for + + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + contents = soup.select('.reading-content p') + return ''.join([str(p) for p in contents]) + # end def +# end class diff --git a/lncrawl/sources/mangatoon.py b/lncrawl/sources/mangatoon.py index da30c96f1..69a8061c1 100644 --- a/lncrawl/sources/mangatoon.py +++ b/lncrawl/sources/mangatoon.py @@ -19,63 +19,57 @@ def initialize(self): # end def def read_novel_info(self): - '''Get novel title, autor, cover etc''' - self.novel_id = self.novel_url.split('/')[5] - logger.info('Novel Id: %s', self.novel_id) + novel_id = self.novel_url.split('/')[5] + logger.info('Novel Id: %s', novel_id) novel_region = self.novel_url.split('/')[3] + logger.info('Novel Region: %s', novel_region) - self.novel_url = book_url % (novel_region, self.novel_id) + self.novel_url = book_url % (novel_region, novel_id) logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) - self.novel_title = soup.select_one('h1.comics-title').text + self.novel_title = soup.select_one('h1.comics-title, .detail-title').text logger.info('Novel title: %s', self.novel_title) - try: - self.novel_cover = self.absolute_url( - soup.select_one('.detail-top-right img')['src']) - 
logger.info('Novel cover: %s', self.novel_cover) - except Exception: - logger.debug('Failed to get cover: %s', self.novel_url) - # end try - - self.novel_author = soup.select_one('.created-by').text - logger.info('Novel author: %s', self.novel_author) + possible_image = soup.select_one('.detail-top-right img, .detail-img .big-img') + if possible_image: + self.novel_cover = self.absolute_url(possible_image['src']) + # end if + logger.info('Novel cover: %s', self.novel_cover) - for a in soup.select('a.episode-item'): + volumes = set([]) + for a in soup.select('a.episode-item, a.episode-item-new'): chap_id = len(self.chapters) + 1 - if len(self.chapters) % 100 == 0: - vol_id = chap_id//100 + 1 - vol_title = 'Volume ' + str(vol_id) - self.volumes.append({ - 'id': vol_id, - 'title': vol_title, - }) - # end if + vol_id = len(self.chapters) // 100 + 1 + volumes.add(vol_id) self.chapters.append({ 'id': chap_id, 'volume': vol_id, 'url': self.absolute_url(a['href']), - 'title': a.select_one('.episode-title').text.strip() or ('Chapter %d' % chap_id), + 'title': a.select_one('.episode-title, .episode-title-new').text, }) # end for + + self.volumes = [{'id': x} for x in volumes] # end def def download_chapter_body(self, chapter): - '''Download body of a single chapter and return as clean html format''' logger.info('Downloading %s', chapter['url']) soup = self.get_soup(chapter['url']) - script = soup.find("script", text=re.compile("initialValue\s+=")) - initialValue = re.search( - 'var initialValue = (?P.*);', script.string) + pictures = soup.select_one('.pictures') + if pictures: + return str(pictures) + # end if + + script = soup.find("script", text=re.compile(r"initialValue\s+=")) + initialValue = re.search('var initialValue = (?P.*);', script.string) content = initialValue.group('value') - chapter_content = ast.literal_eval(content) - chapter_content = [p.replace('\-', '-') for p in chapter_content] - text = '
<p>' + '</p><p>'.join(chapter_content) + '</p>
' - # end if - return text.strip() + chapter_content = ast.literal_eval(content) + chapter_content = [p.replace(r'\-', '-') for p in chapter_content] + return '
<p>' + '</p><p>'.join(chapter_content) + '</p>
' # end def # end class diff --git a/lncrawl/sources/meionovel.py b/lncrawl/sources/meionovel.py index b1a639fc1..4f94ca58d 100644 --- a/lncrawl/sources/meionovel.py +++ b/lncrawl/sources/meionovel.py @@ -11,24 +11,25 @@ class MeionovelCrawler(Crawler): base_url = 'https://meionovel.id/' - def search_novel(self, query): - query = query.lower().replace(' ', '+') - soup = self.get_soup(search_url % query) - - results = [] - for tab in soup.select('.c-tabs-item__content'): - a = tab.select_one('.post-title h4 a') - latest = tab.select_one('.latest-chap .chapter a').text - votes = tab.select_one('.rating .total_votes').text - results.append({ - 'title': a.text.strip(), - 'url': self.absolute_url(a['href']), - 'info': '%s | Rating: %s' % (latest, votes), - }) - # end for - - return results - # end def + # NOTE: Disabled because it takes too long + # def search_novel(self, query): + # query = query.lower().replace(' ', '+') + # soup = self.get_soup(search_url % query) + + # results = [] + # for tab in soup.select('.c-tabs-item__content'): + # a = tab.select_one('.post-title h4 a') + # latest = tab.select_one('.latest-chap .chapter a').text + # votes = tab.select_one('.rating .total_votes').text + # results.append({ + # 'title': a.text.strip(), + # 'url': self.absolute_url(a['href']), + # 'info': '%s | Rating: %s' % (latest, votes), + # }) + # # end for + + # return results + # # end def def read_novel_info(self): '''Get novel title, autor, cover etc''' diff --git a/lncrawl/sources/moonstonetrans.py b/lncrawl/sources/moonstonetrans.py index 270d26cec..cfcdce538 100644 --- a/lncrawl/sources/moonstonetrans.py +++ b/lncrawl/sources/moonstonetrans.py @@ -18,26 +18,6 @@ def initialize(self): self.executor = ThreadPoolExecutor(max_workers=7) # end def - # NOTE: Site search doesn't work. So this won't work. 
- # def search_novel(self, query): - # query = query.lower().replace(' ', '+') - # soup = self.get_soup(search_url % query) - - # results = [] - # for tab in soup.select('.c-tabs-item__content'): - # a = tab.select_one('.post-title h3 a') - # latest = tab.select_one('.latest-chap .chapter a').text - # votes = tab.select_one('.rating .total_votes').text - # results.append({ - # 'title': a.text.strip(), - # 'url': self.absolute_url(a['href']), - # 'info': '%s | Rating: %s' % (latest, votes), - # }) - # # end for - - # return results - # # end def - def read_novel_info(self): logger.debug('Visiting %s', self.novel_url) soup = self.get_soup(self.novel_url) diff --git a/lncrawl/sources/morenovel.py b/lncrawl/sources/morenovel.py index eafdf7e61..819ea4ea3 100644 --- a/lncrawl/sources/morenovel.py +++ b/lncrawl/sources/morenovel.py @@ -33,7 +33,6 @@ def search_novel(self, query): # end for return results - # end def def read_novel_info(self): diff --git a/lncrawl/sources/mtlednovels.py b/lncrawl/sources/mtlednovels.py index 0647f29ff..3bb378436 100644 --- a/lncrawl/sources/mtlednovels.py +++ b/lncrawl/sources/mtlednovels.py @@ -51,7 +51,7 @@ def logout(self): print('Logged out') # end def - # TODO: disabled search for cloudflare issue + # NOTE: Disabled search for cloudflare issue # def search_novel(self, query): # query = query.lower().replace(' ', '+') # soup = self.get_soup(search_url % query) @@ -62,27 +62,12 @@ def logout(self): # results.append({ # 'url': url, # 'title': a.img['alt'], - # 'info': self.search_novel_info(url), # }) # # end for # return results # # end def - # def search_novel_info(self, url): - # '''Get novel title, autor, cover etc''' - # logger.debug('Visiting %s', url) - # soup = self.get_soup(url) - - # chapters = soup.select('#tab-profile-2 a.chapters') - # info = '%d chapters' % len(chapters) - # if len(chapters) > 0: - # info += ' | Latest: %s' % chapters[-1].text.strip() - # # end if - - # return info - # # end def - def read_novel_info(self): '''Get novel title, autor, cover etc''' logger.debug('Visiting %s', self.novel_url) diff --git a/lncrawl/sources/mtlnovel.py b/lncrawl/sources/mtlnovel.py index 33d9191a5..cd1fc7771 100644 --- a/lncrawl/sources/mtlnovel.py +++ b/lncrawl/sources/mtlnovel.py @@ -24,38 +24,22 @@ def search_novel(self, query): results.append({ 'url': url, 'title': re.sub(r'', '', item['title']), - 'info': self.search_novel_info(url), }) # end for return results # end def - def search_novel_info(self, url): - '''Get novel title, autor, cover etc''' - logger.debug('Visiting %s', url) - soup = self.get_soup(url) - - chapters = soup.select( - 'div.info-wrap div')[1].text.replace('Chapters', '') - info = '%s chapters' % chapters - # if len(chapters) > 0: - # info += ' | Latest: %s' % chapters[-1].text.strip() - # end if - - return info - # end def - def read_novel_info(self): '''Get novel title, autor, cover etc''' logger.debug('Visiting %s', self.novel_url) soup = self.get_soup(self.novel_url) - self.novel_title = soup.select_one('h1.entry-title').text.strip() + self.novel_title = soup.select_one('h1').text.strip() logger.info('Novel title: %s', self.novel_title) self.novel_cover = self.absolute_url( - soup.select('div.nov-head amp-img')[1]['src']) + soup.select_one('.post-content amp-img')['src']) logger.info('Novel cover: %s', self.novel_cover) try: diff --git a/lncrawl/sources/novelall.py b/lncrawl/sources/novelall.py index f240bfe4e..292cee153 100644 --- a/lncrawl/sources/novelall.py +++ b/lncrawl/sources/novelall.py @@ -16,35 +16,16 @@ def 
search_novel(self, query): soup = self.get_soup(search_url % query) results = [] - for a in soup.select('.cover-info p.title a')[:5]: + for a in soup.select('.cover-info p.title a')[:20]: url = self.absolute_url(a['href']) results.append({ 'url': url, 'title': a.text.strip(), - 'info': self.search_novel_info(url), }) # end for return results # end def - def search_novel_info(self, url): - '''Get novel title, autor, cover etc''' - logger.debug('Visiting %s', url) - soup = self.get_soup(url) - - chapters = soup.select_one( - 'div.manga-detailchapter').findAll('a', title=True) - info = '%d chapters' % len(chapters) - - latest = soup.select_one( - 'div.manga-detailchapter').findAll('a', title=True) - if latest: - info += ' | Latest: ' + latest[0]['title'] - # end if - - return info - # end def - def read_novel_info(self): '''Get novel title, autor, cover etc''' logger.debug('Visiting %s', self.novel_url) diff --git a/lncrawl/sources/novelcake.py b/lncrawl/sources/novelcake.py new file mode 100644 index 000000000..ef1779cad --- /dev/null +++ b/lncrawl/sources/novelcake.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from urllib.parse import urlparse +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://novelcake.com/?s=%s&post_type=wp-manga' +chapter_list_url = 'https://novelcake.com/wp-admin/admin-ajax.php' + + +class NovelCake(Crawler): + base_url = 'https://novelcake.com/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.c-tabs-item__content'): + a = tab.select_one('.post-title h3 a') + latest = tab.select_one('.latest-chap .chapter a').text + votes = tab.select_one('.rating .total_votes').text + results.append({ + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + 'info': '%s | Rating: %s' % (latest, votes), + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + possible_title = soup.select_one('.post-title h1') + for span in possible_title.select('span'): + span.extract() + # end for + self.novel_title = possible_title.text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.summary_image a img')['data-src']) + logger.info('Novel cover: %s', self.novel_cover) + + self.novel_author = ' '.join([ + a.text.strip() + for a in soup.select('.author-content a[href*="series-author"]') + ]) + logger.info('%s', self.novel_author) + + self.novel_id = soup.select_one('#manga-chapters-holder')['data-id'] + logger.info('Novel id: %s', self.novel_id) + + response = self.submit_form(chapter_list_url, data={ + 'action': 'manga_get_chapters', + 'manga': self.novel_id, + }) + soup = self.make_soup(response) + for a in reversed(soup.select(".wp-manga-chapter a")): + chap_id = len(self.chapters) + 1 + vol_id = 1 + len(self.chapters) // 100 + if chap_id % 100 == 1: + self.volumes.append({"id": vol_id}) + # end if + self.chapters.append( + { + "id": chap_id, + "volume": vol_id, + "title": a.text.strip(), + "url": self.absolute_url(a["href"]), + } + ) + # end for + + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + 
contents = soup.select('.reading-content p') + return ''.join([str(p) for p in contents]) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/novelextra.py b/lncrawl/sources/novelextra.py new file mode 100644 index 000000000..497347b79 --- /dev/null +++ b/lncrawl/sources/novelextra.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from ..utils.crawler import Crawler +from ..utils.cleaner import cleanup_text + +logger = logging.getLogger(__name__) +search_url = 'https://novelextra.com/search?keyword=%s' +full_chapter_url = 'https://novelextra.com/ajax/chapter-archive?novelId=%s' + + +class NovelExtra(Crawler): + base_url = 'https://novelextra.com/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for result in soup.select('div.col-novel-main div.list.list-novel div.row')[:20]: + url = self.absolute_url( + result.select_one('h3.novel-title a')['href']) + title = result.select_one('h3.novel-title a')['title'] + last_chapter = result.select_one('span.chr-text').text.strip() + results.append({ + 'url': url, + 'title': title, + 'info': 'last chapter : %s' % last_chapter, + }) + # end for + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url + '?waring=1') + + self.novel_title = soup.select_one('h3.title').text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('div.book img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + author = [] + for a in soup.select('ul.info.info-meta li')[1].select('a'): + author.append(a.text.strip()) + # end for + + self.novel_author = ", ".join(author) + + logger.info('Novel author: %s', self.novel_author) + + novel_id = soup.select_one('div#rating')['data-novel-id'] + + chapter_url = full_chapter_url % novel_id + logger.debug('Visiting %s', chapter_url) + + chapter_soup = self.get_soup(chapter_url) + chapters = chapter_soup.select('li a') + for a in chapters: + for span in a.findAll('span'): + span.decompose() + # end for + # end for + + for x in chapters: + chap_id = len(self.chapters) + 1 + if len(self.chapters) % 100 == 0: + vol_id = chap_id//100 + 1 + vol_title = 'Volume ' + str(vol_id) + self.volumes.append({ + 'id': vol_id, + 'title': vol_title, + }) + # end if + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'url': self.absolute_url(x['href']), + 'title': x['title'] or ('Chapter %d' % chap_id), + }) + # end for + # end def + + @cleanup_text + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Downloading %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + content = soup.select('#chr-content p') + if not content: + return '' + # end if + + return "".join(map(str, content)) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/novelfullplus.py b/lncrawl/sources/novelfullplus.py new file mode 100644 index 000000000..221fee4cf --- /dev/null +++ b/lncrawl/sources/novelfullplus.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- +import logging +import re +from concurrent import futures +from urllib.parse import quote_plus + +from ..utils.cleaner import cleanup_text +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = ' 
https://novelfullplus.com/ajax/search?q=%s' + +RE_VOLUME = r'(?:book|vol|volume) (\d+)' + + +class NovelFullPlus(Crawler): + base_url = 'https://novelfullplus.com/' + + def search_novel(self, query): + '''Gets a list of {title, url} matching the given query''' + query = quote_plus(query.lower()) + soup = self.get_soup(search_url % query) + + results = [] + for a in soup.select('ul li a'): + results.append({ + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + image = soup.select_one('.detail-info .col-image img') + + self.novel_title = image['alt'] + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url(image['src']) + logger.info('Novel cover: %s', self.novel_cover) + + any_chapter_url = soup.select_one('.chapter a[href*="/novel/"]')['href'] + soup = self.get_soup(any_chapter_url) + + volumes = set([]) + for option in soup.select('select.select-chapter option'): + chap_id = len(self.chapters) + 1 + vol_id = len(self.chapters) // 100 + 1 + volumes.add(vol_id) + option['value'] + option.text.strip() + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'title': option.text.strip(), + 'url': self.absolute_url(option['value']), + }) + # end for + + self.volumes = [{'id': x} for x in volumes] + # end def + + @cleanup_text + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Downloading %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + contents = soup.select_one('.reading-detail .container') + for br in contents.select('br, h3, h1, h2, h4'): + br.decompose() + # end for + + return str(contents) + # end def +# end class diff --git a/lncrawl/sources/novelgate.py b/lncrawl/sources/novelgate.py new file mode 100644 index 000000000..633f5cfa5 --- /dev/null +++ b/lncrawl/sources/novelgate.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://novelgate.net/search/%s' + +class NovelGate(Crawler): + base_url = 'https://novelgate.net/' + + def search_novel(self, query): + query = query.lower().replace(' ', '%20') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.film-item'): + a = tab.select_one('a') + latest = tab.select_one('label.current-status span.process').text + results.append({ + 'title': a['title'], + 'url': self.absolute_url(a['href']), + 'info': '%s' % (latest), + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + self.novel_title = soup.select_one('.name').text + logger.info('Novel title: %s', self.novel_title) + + author = soup.find_all(href=re.compile('author')) + if len(author) == 2: + self.novel_author = author[0].text + ' (' + author[1].text + ')' + else: + self.novel_author = author[0].text + logger.info('Novel author: %s', self.novel_author) + + self.novel_cover = self.absolute_url( + soup.select_one('.book-cover')['data-original']) + logger.info('Novel cover: %s', self.novel_cover) + + for div in soup.select('.block-film #list-chapters .book'): + vol_title = div.select_one('.title a').text + vol_id = [int(x) 
for x in re.findall(r'\d+', vol_title)] + vol_id = vol_id[0] if len(vol_id) else len(self.volumes) + 1 + self.volumes.append({ + 'id': vol_id, + 'title': vol_title, + }) + + for a in div.select('ul.list-chapters li.col-sm-5 a'): + ch_title = a.text + ch_id = [int(x) for x in re.findall(r'\d+', ch_title)] + ch_id = ch_id[0] if len(ch_id) else len(self.chapters) + 1 + self.chapters.append({ + 'id': ch_id, + 'volume': vol_id, + 'title': ch_title, + 'url': self.absolute_url(a['href']), + }) + # end for + # end for + + logger.debug('%d chapters and %d volumes found', + len(self.chapters), len(self.volumes)) + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + contents = soup.select_one('#chapter-body') + # end for + + return str(contents) + # end def + + def format_text(self, text): + '''formats the text and remove bad characters''' + text = re.sub(r'\u00ad', '', text, flags=re.UNICODE) + text = re.sub(r'\u201e[, ]*', '“', text, flags=re.UNICODE) + text = re.sub(r'\u201d[, ]*', '”', text, flags=re.UNICODE) + text = re.sub(r'[ ]*,[ ]+', ', ', text, flags=re.UNICODE) + return text.strip() + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/novelpassion.py b/lncrawl/sources/novelpassion.py index a3930757c..bce00a8e7 100644 --- a/lncrawl/sources/novelpassion.py +++ b/lncrawl/sources/novelpassion.py @@ -7,11 +7,10 @@ search_url = 'https://www.novelpassion.com/search?keyword=%s' + class NovelPassion(Crawler): base_url = 'https://www.novelpassion.com/' - # lncrawl -q "Forced to Date a Big Shot" --sources - def search_novel(self, query): query = query.lower().replace(' ', '%20') soup = self.get_soup(search_url % query) @@ -19,9 +18,6 @@ def search_novel(self, query): results = [] for tab in soup.select('div.lh1d5'): a = tab.select_one('a') - # NOTE: Could not get latest chapter to show. 
- # latest = tab.select_one('.dab a') - latest = " " votes = tab.select_one('span[class="g_star"]')['title'] results.append({ 'title': a.text.strip(), @@ -33,28 +29,6 @@ def search_novel(self, query): return results # end def - # def search_novel(self, query): - # '''Gets a list of (title, url) matching the given query''' - # query = query.strip().lower().replace(' ', '%20') - # soup = self.get_soup(search_url % query) - - # results = [] - # for div in soup.select('.d-80 .j_bookList ul li > lh1d5'): - # a = div.select_one('a.c_000') - # info = div.select_one('.dab') - # results.append( - # { - # 'title': a.text.strip(), - # 'url': self.absolute_url(a['href']), - # 'info': info.text.strip() if info else '', - # } - # ) - # # end for - - # return results - - # # end def - def read_novel_info(self): '''Get novel title, autor, cover etc''' url = self.novel_url diff --git a/lncrawl/sources/novelplanet.py b/lncrawl/sources/novelplanet.py index fc63d8e96..063acc709 100644 --- a/lncrawl/sources/novelplanet.py +++ b/lncrawl/sources/novelplanet.py @@ -14,24 +14,6 @@ class NovelPlanetCrawler(Crawler): base_url = 'https://novelplanet.com/' - # def search_novel(self, query): - # url = search_url % slugify(query) - # logger.info('Visiting %s', url) - # soup = self.get_soup(url) - - # results = [] - # for novel in soup.select('.post-content')[:8]: - # a = novel.select_one('a.title') - # info = novel.select_one("div:nth-of-type(3) a").text.strip() - # results.append({ - # 'title': a.text.strip(), - # 'url': self.absolute_url(a['href']), - # 'info': 'Latest: %s' % info, - # }) - # # end for - # return results - # # end def - def read_novel_info(self): '''Get novel title, autor, cover etc''' logger.debug('Visiting %s', self.novel_url) diff --git a/lncrawl/sources/novelscloud.py b/lncrawl/sources/novelscloud.py new file mode 100644 index 000000000..8d0e13bfd --- /dev/null +++ b/lncrawl/sources/novelscloud.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +import logging +import re +from concurrent import futures +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'http://novels.cloud/search?keyword=%s' + + +class NovelsCloud(Crawler): + base_url = 'http://novels.cloud/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.col-novel-main .list-novel .row'): + search_title = tab.select_one('.novel-title a') + latest = tab.select_one('.text-info a').text.strip() + results.append({ + 'title': search_title.text.strip(), + 'url': self.absolute_url( + tab.select_one('.novel-title a')['href']), + 'info': 'Latest chapter: %s' % (latest) + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + self.novel_title = ' '.join([ + str(x) + for x in soup.select_one('.title').contents + if not x.name + ]).strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.book img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + author = soup.find_all(href=re.compile('author')) + if len(author) == 2: + self.novel_author = author[0].text + ' (' + author[1].text + ')' + else: + self.novel_author = author[0].text + logger.info('Novel author: %s', self.novel_author) + + # This is copied from the Novelfull pagination 'hanlder' with minor tweaks + pagination_links = 
soup.select('.pagination li a') + pagination_page_numbers = [] + for pagination_link in pagination_links: + # Boxnovel.org pagination numbering boxes contain non-digit characters + if pagination_link.text.isdigit(): + pagination_page_numbers.append(int(pagination_link.text)) + + page_count = max( + pagination_page_numbers) if pagination_page_numbers else 0 + logger.info('Chapter list pages: %d' % page_count) + + logger.info('Getting chapters...') + futures_to_check = { + self.executor.submit( + self.download_chapter_list, + i + 1, + ): str(i) + for i in range(page_count + 1) + } + [x.result() for x in futures.as_completed(futures_to_check)] + + # Didn't test without this, but with pagination the chapters could be in different orders + logger.info('Sorting chapters...') + self.chapters.sort(key=lambda x: x['volume'] * 1000 + x['id']) + + # Copied straight from Novelfull + logger.info('Adding volumes...') + mini = self.chapters[0]['volume'] + maxi = self.chapters[-1]['volume'] + for i in range(mini, maxi + 1): + self.volumes.append({'id': i}) + # end for + # end def + + def download_chapter_list(self, page): + '''Download list of chapters and volumes.''' + url = self.novel_url.split('?')[0].strip('/') + url += '?page=%d&per-page=50' % page + soup = self.get_soup(url) + + for a in soup.select('ul.list-chapter li a'): + title = a['title'].strip() + + chapter_id = len(self.chapters) + 1 + # match = re.findall(r'ch(apter)? (\d+)', title, re.IGNORECASE) + # if len(match) == 1: + # chapter_id = int(match[0][1]) + # # end if + + volume_id = 1 + (chapter_id - 1) // 100 + match = re.findall(r'(book|vol|volume) (\d+)', + title, re.IGNORECASE) + if len(match) == 1: + volume_id = int(match[0][1]) + # end if + + data = { + 'title': title, + 'id': chapter_id, + 'volume': volume_id, + 'url': self.absolute_url(a['href']), + } + self.chapters.append(data) + # end for + # end def + + def download_chapter_body(self, chapter): # + # NOTE: Set `chapter['body_lock'] = True` to disable post-formatting. + # It can be useful in non-english sources, e.g. aixdzs, qidiancom, tiknovel + # + # Return an empty body if anything goes wrong. But you should not return `None`. 
+ '''Download body of a single chapter and return as clean html format.''' + logger.info('Downloading %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + contents = soup.select_one('div.chr-c, #chr-content') + for br in contents.select('br'): + br.decompose() + # end for + + return str(contents) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/novelsonline.py b/lncrawl/sources/novelsonline.py new file mode 100644 index 000000000..abe04fa3a --- /dev/null +++ b/lncrawl/sources/novelsonline.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://novelsonline.net/search/autocomplete' + + +class NovelsOnline(Crawler): + base_url = 'https://novelsonline.net/' + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + self.novel_title = soup.select_one('.block-title h1').text + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.find('img', {'alt': self.novel_title})['src']) + logger.info('Novel cover: %s', self.novel_cover) + + author_link = soup.select_one("a[href*=author]") + if author_link: + self.novel_author = author_link.text.strip().title() + # end if + logger.info('Novel author: %s', self.novel_author) + + volume_ids = set() + for a in soup.select('.chapters .chapter-chs li a'): + chap_id = len(self.chapters) + 1 + vol_id = (chap_id - 1) // 100 + 1 + volume_ids.add(vol_id) + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'url': self.absolute_url(a['href']), + 'title': a.text.strip() or ('Chapter %d' % chap_id), + }) + # end for + + self.volumes = [{'id': i} for i in volume_ids] + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Downloading %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + div = soup.select_one('.chapter-content3') + + bad_selectors = [ + '.trinity-player-iframe-wrapper' + '.hidden', + '.ads-title', + 'script', + 'center', + 'interaction', + 'a[href*=remove-ads]', + 'a[target=_blank]', + 'hr', + 'br', + '#growfoodsmart', + '.col-md-6' + ] + for hidden in div.select(', '.join(bad_selectors)): + hidden.decompose() + # end for + + body = self.extract_contents(div) + if re.search(r'c?hapter .?\d+', body[0], re.IGNORECASE): + title = body[0].replace('', '').replace( + '', '').strip() + title = ('C' if title.startswith('hapter') else '') + title + chapter['title'] = title.strip() + body = body[1:] + # end if + + return '
<p>' + '</p><p>'.join(body) + '</p>
' + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/novelsrock.py b/lncrawl/sources/novelsrock.py index ef978c846..531bcbd0a 100644 --- a/lncrawl/sources/novelsrock.py +++ b/lncrawl/sources/novelsrock.py @@ -54,7 +54,7 @@ def read_novel_info(self): author = soup.select('.author-content a') if len(author) == 2: self.novel_author = author[0].text + ' (' + author[1].text + ')' - else: + elif len(author) == 1: self.novel_author = author[0].text logger.info('Novel author: %s', self.novel_author) diff --git a/lncrawl/sources/noveltrench.py b/lncrawl/sources/noveltrench.py index 33f1525a2..143493f52 100644 --- a/lncrawl/sources/noveltrench.py +++ b/lncrawl/sources/noveltrench.py @@ -47,12 +47,11 @@ def read_novel_info(self): self.novel_cover = self.absolute_url(probable_img['data-src']) logger.info('Novel cover: %s', self.novel_cover) - author = soup.select('.author-content a') - if len(author) == 2: - self.novel_author = author[0].text + ' (' + author[1].text + ')' - else: - self.novel_author = author[0].text - logger.info('Novel author: %s', self.novel_author) + self.novel_author = ' '.join([ + a.text.strip() + for a in soup.select('.author-content a[href*="manga-author"]') + ]) + logger.info('%s', self.novel_author) volumes = set() chapters = soup.select('ul.main li.wp-manga-chapter a') diff --git a/lncrawl/sources/novelupdatescc.py b/lncrawl/sources/novelupdatescc.py index 7ffed7861..4c1edad10 100644 --- a/lncrawl/sources/novelupdatescc.py +++ b/lncrawl/sources/novelupdatescc.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- -import json import logging -import re +from urllib.parse import quote from ..utils.crawler import Crawler @@ -14,25 +13,27 @@ class NovelUpdatesCC(Crawler): 'https://www.novelupdates.cc/', ] - # FIXME: Can't seem to get search to work. - # def search_novel(self, query): - # query = query.lower().replace(' ', '+') - # soup = self.get_soup(search_url % query) - - # results = [] - # for tab in soup.select('li.list-item'): - # a = tab.select_one('a.book-name') - # latest = "N/A" - # votes = tab.select_one('.star-suite span.score').text - # results.append({ - # 'title': a.text.strip(), - # 'url': self.absolute_url(a['href']), - # 'info': '%s | Rating: %s' % (latest, votes), - # }) - # # end for - - # return results - # # end def + def search_novel(self, query): + query = quote(query.lower()) + soup = self.get_soup(search_url % query) + + results = [] + for li in soup.select('.result-list .list-item'): + a = li.select_one('a.book-name') + for bad in a.select('font'): + bad.decompose() + # end for + catalog = li.select_one('.book-catalog').text.strip() + votes = li.select_one('.star-suite .score').text.strip() + results.append({ + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + 'info': '%s | Rating: %s' % (catalog, votes), + }) + # end for + + return results + # end def def read_novel_info(self): '''Get novel title, autor, cover etc''' @@ -92,4 +93,4 @@ def download_chapter_body(self, chapter): body = self.extract_contents(body_parts) return '
<p>' + '</p><p>'.join(body) + '</p>
' # end def -# end class \ No newline at end of file +# end class diff --git a/lncrawl/sources/overabook.py b/lncrawl/sources/overabook.py new file mode 100644 index 000000000..50a0478b6 --- /dev/null +++ b/lncrawl/sources/overabook.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from urllib.parse import urlparse +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://overabook.com/?s=%s&post_type=wp-manga&author=&artist=&release=' +chapter_list_url = 'https://overabook.com/wp-admin/admin-ajax.php' + + +class OverABook(Crawler): + base_url = 'https://overabook.com/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.c-tabs-item__content'): + a = tab.select_one('.post-title h3 a') + latest = tab.select_one('.latest-chap .chapter a').text + votes = tab.select_one('.rating .total_votes').text + results.append({ + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + 'info': '%s | Rating: %s' % (latest, votes), + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + possible_title = soup.select_one('.post-title h1') + for span in possible_title.select('span'): + span.extract() + # end for + self.novel_title = possible_title.text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.summary_image a img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + self.novel_author = ' '.join([ + a.text.strip() + for a in soup.select('.author-content a[href*="novel-author"]') + ]) + logger.info('%s', self.novel_author) + + self.novel_id = soup.select_one('#manga-chapters-holder')['data-id'] + logger.info('Novel id: %s', self.novel_id) + + response = self.submit_form( + chapter_list_url, data='action=manga_get_chapters&manga=' + self.novel_id) + soup = self.make_soup(response) + for a in reversed(soup.select('.wp-manga-chapter a')): + chap_id = len(self.chapters) + 1 + vol_id = 1 + len(self.chapters) // 100 + if chap_id % 100 == 1: + self.volumes.append({'id': vol_id}) + # end if + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + }) + # end for + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + contents = soup.select('.reading-content p') + return ''.join([str(p) for p in contents]) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/readnovelfull.py b/lncrawl/sources/readnovelfull.py index 6b170915b..bb2c5654b 100644 --- a/lncrawl/sources/readnovelfull.py +++ b/lncrawl/sources/readnovelfull.py @@ -11,10 +11,7 @@ class ReadNovelFullCrawler(Crawler): - base_url = [ - 'https://readnovelfull.com/', - 'http://wspadancewichita.com/', - ] + base_url = 'https://readnovelfull.com/' def search_novel(self, query): query = query.lower().replace(' ', '+') diff --git a/lncrawl/sources/readonlinenovels.py b/lncrawl/sources/readonlinenovels.py index ccb20cd4b..9bc1e0d60 100644 --- a/lncrawl/sources/readonlinenovels.py +++ b/lncrawl/sources/readonlinenovels.py @@ -15,24 +15,24 @@ def initialize(self): self.home_url = 
'http://readonlinenovels.com/' # end def - def search_novel(self, query): - '''Gets a list of {title, url} matching the given query''' - soup = self.get_soup(search_url % query) - - results = [] - for div in soup.select('div.book-context'): - a = div.select_one('a') - title = a.select_one('h4 b').text.strip() - info = div.select_one('div.update-info').text.strip() - results.append({ - 'title': title, - 'url': self.absolute_url(a['href']), - 'info': info, - }) - # end for - - return results - # end def + # NOTE: Disabled because it takes too long + # def search_novel(self, query): + # soup = self.get_soup(search_url % query) + + # results = [] + # for div in soup.select('div.book-context'): + # a = div.select_one('a') + # title = a.select_one('h4 b').text.strip() + # info = div.select_one('div.update-info').text.strip() + # results.append({ + # 'title': title, + # 'url': self.absolute_url(a['href']), + # 'info': info, + # }) + # # end for + + # return results + # # end def def read_novel_info(self): '''Get novel title, autor, cover etc''' diff --git a/lncrawl/sources/royalroad.py b/lncrawl/sources/royalroad.py index 01cd823fb..abbf9199d 100644 --- a/lncrawl/sources/royalroad.py +++ b/lncrawl/sources/royalroad.py @@ -24,27 +24,12 @@ def search_novel(self, query): results.append({ 'url': url, 'title': a.text.strip(), - 'info': self.search_novel_info(url), }) # end for return results # end def - def search_novel_info(self, url): - '''Get novel title, autor, cover etc''' - logger.debug('Visiting %s', url) - soup = self.get_soup(url) - - score = soup.select_one('span.star')['data-content'] - chapters = len(soup.find('tbody').findAll('a', href=True)) - latest = soup.find('tbody').findAll('a', href=True)[-1].text.strip() - info = 'Score: %s, Chapter count %s, Latest: %s' % ( - score, chapters, latest) - - return info - # end def - def read_novel_info(self): '''Get novel title, autor, cover etc''' logger.debug('Visiting %s', self.novel_url) diff --git a/lncrawl/sources/sleepytrans.py b/lncrawl/sources/sleepytrans.py index 6cd3c40f9..9a4f00803 100644 --- a/lncrawl/sources/sleepytrans.py +++ b/lncrawl/sources/sleepytrans.py @@ -1,37 +1,15 @@ # -*- coding: utf-8 -*- -import json import logging -import re -from urllib.parse import urlparse from ..utils.crawler import Crawler logger = logging.getLogger(__name__) -search_url = 'https://sleepytranslations.com/?s=%s&post_type=wp-manga' +#search_url = 'https://sleepytranslations.com/?s=%s&post_type=wp-manga' chapter_list_url = 'https://sleepytranslations.com/wp-admin/admin-ajax.php' class SleepyTranslations(Crawler): base_url = 'https://sleepytranslations.com/' - def search_novel(self, query): - query = query.lower().replace(' ', '+') - soup = self.get_soup(search_url % query) - - results = [] - for tab in soup.select('.c-tabs-item__content'): - a = tab.select_one('.post-title h4 a') - latest = tab.select_one('.latest-chap .chapter a').text - votes = tab.select_one('.rating .total_votes').text - results.append({ - 'title': a.text.strip(), - 'url': self.absolute_url(a['href']), - 'info': '%s | Rating: %s' % (latest, votes), - }) - # end for - - return results - # end def - def read_novel_info(self): '''Get novel title, autor, cover etc''' logger.debug('Visiting %s', self.novel_url) @@ -57,21 +35,25 @@ def read_novel_info(self): self.novel_id = soup.select_one('#manga-chapters-holder')['data-id'] logger.info('Novel id: %s', self.novel_id) - response = self.submit_form( - chapter_list_url, data='action=manga_get_chapters&manga=' + self.novel_id) + response = 
self.submit_form(chapter_list_url, data={ + 'action': 'manga_get_chapters', + 'manga': self.novel_id, + }) soup = self.make_soup(response) - for a in reversed(soup.select('.wp-manga-chapter a')): + for a in reversed(soup.select(".wp-manga-chapter a")): chap_id = len(self.chapters) + 1 vol_id = 1 + len(self.chapters) // 100 if chap_id % 100 == 1: - self.volumes.append({'id': vol_id}) + self.volumes.append({"id": vol_id}) # end if - self.chapters.append({ - 'id': chap_id, - 'volume': vol_id, - 'title': a.text.strip(), - 'url': self.absolute_url(a['href']), - }) + self.chapters.append( + { + "id": chap_id, + "volume": vol_id, + "title": a.text.strip(), + "url": self.absolute_url(a["href"]), + } + ) # end for # end def diff --git a/lncrawl/sources/totallytranslations.py b/lncrawl/sources/totallytranslations.py new file mode 100644 index 000000000..b858d3ef1 --- /dev/null +++ b/lncrawl/sources/totallytranslations.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re + +from requests.sessions import Session + +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) + +class TotallyTranslations(Crawler): + base_url = 'https://totallytranslations.com/' + + def initialize(self): + self.scraper = Session() + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + self.novel_title = soup.select_one('.entry-title').text + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.novel-image img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + for p in soup.select('.chapters-list .chapters-title'): + vol_title = p.text.strip() + vol_id = len(self.volumes) + 1 + self.volumes.append({ + 'id': vol_id, + 'title': vol_title, + }) + + ul = p.find_next('ul') + for a in ul.select('a'): + chap_id = len(self.chapters) + 1 + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + }) + # end for + # end for + # end def + + def download_chapter_body(self, chapter): + logger.info('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + paras = soup.select('.post-content p') + return '\n'.join([str(p) for p in paras if p.text.strip()]) + # end def +# end class diff --git a/lncrawl/sources/tunovelaligera.py b/lncrawl/sources/tunovelaligera.py new file mode 100644 index 000000000..601548260 --- /dev/null +++ b/lncrawl/sources/tunovelaligera.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://tunovelaligera.com/?s=%s&post_type=wp-manga' + + +class Tunovelaligera(Crawler): + base_url = 'https://tunovelaligera.com/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.c-tabs-item__content'): + a = tab.select_one('.post-title h3 a') + latest = tab.select_one('.latest-chap .chapter a').text + votes = tab.select_one('.rating .total_votes').text + results.append({ + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), + 'info': '%s | Rating: %s' % (latest, votes), + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = 
self.get_soup(self.novel_url) + + self.novel_title = ' '.join([ + str(x) + for x in soup.select_one('.post-title h1').contents + if not x.name + ]).strip() + logger.info('Novel title: %s', self.novel_title) + + probable_img = soup.select_one('.summary_image img') + if probable_img: + self.novel_cover = self.absolute_url(probable_img['src']) + logger.info('Novel cover: %s', self.novel_cover) + + author = soup.select('.author-content a') + if len(author) == 2: + self.novel_author = author[0].text + ' (' + author[1].text + ')' + else: + self.novel_author = author[0].text + logger.info('Novel author: %s', self.novel_author) + + volumes = set() + chapters = soup.select('ul.main li.wp-manga-chapter a') + for a in reversed(chapters): + chap_id = len(self.chapters) + 1 + vol_id = (chap_id - 1) // 100 + 1 + volumes.add(vol_id) + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'url': self.absolute_url(a['href']), + 'title': a.text.strip() or ('Chapter %d' % chap_id), + }) + # end for + + self.volumes = [{'id': x} for x in volumes] + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Downloading %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + contents = soup.select_one('div.text-left') + for bad in contents.select('h3, .code-block, script, .adsbygoogle'): + bad.decompose() + + return str(contents) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/volarenovels.py b/lncrawl/sources/volarenovels.py index 367055aee..617f92faf 100644 --- a/lncrawl/sources/volarenovels.py +++ b/lncrawl/sources/volarenovels.py @@ -46,34 +46,15 @@ def search_novel(self, query): url = search_url % query logger.info('Visiting %s ...', url) data = self.get_json(url)['items'][:5] - logger.debug(data) - - return [ - { - 'title': x['name'], - 'url': book_url % x['slug'], - 'info': self.search_novel_info(book_url % x['slug']), - } - for x in data - ] - - # end def - - def search_novel_info(self, url): - '''Get novel title, autor, cover etc''' - logger.debug('Visiting %s', url) - soup = self.get_soup(url) - - volumes, chapters = self.__parse_toc(soup) - - info = 'Volume : %s, Chapter : %s, Latest: %s' % ( - len(volumes), - len(chapters), - chapters[-1]['title'], - ) - - return info - + # logger.debug(data) + results = [] + for item in data: + results.append({ + 'title': item['name'], + 'url': book_url % item['slug'], + }) + # end for + return results # end def def read_novel_info(self): diff --git a/lncrawl/sources/wondernovels.py b/lncrawl/sources/wondernovels.py new file mode 100644 index 000000000..3b6a93632 --- /dev/null +++ b/lncrawl/sources/wondernovels.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from urllib.parse import urlparse +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'https://wondernovels.com/?s=%s&post_type=wp-manga' +chapter_list_url = 'https://wondernovels.com/wp-admin/admin-ajax.php' + + +class WonderNovels(Crawler): + base_url = 'https://wondernovels.com/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.c-tabs-item__content'): + a = tab.select_one('.post-title h3 a') + latest = tab.select_one('.latest-chap .chapter a').text + votes = tab.select_one('.rating .total_votes').text + results.append({ + 'title': a.text.strip(), + 'url': self.absolute_url(a['href']), 
+ 'info': '%s | Rating: %s' % (latest, votes), + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + possible_title = soup.select_one('.post-title h1') + for span in possible_title.select('span'): + span.extract() + # end for + self.novel_title = possible_title.text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.summary_image a img')['data-src']) + logger.info('Novel cover: %s', self.novel_cover) + + self.novel_author = ' '.join([ + a.text.strip() + for a in soup.select('.author-content a[href*="translator"]') + ]) + logger.info('%s', self.novel_author) + + self.novel_id = soup.select_one('#manga-chapters-holder')['data-id'] + logger.info('Novel id: %s', self.novel_id) + + response = self.submit_form(chapter_list_url, data={ + 'action': 'manga_get_chapters', + 'manga': self.novel_id, + }) + soup = self.make_soup(response) + for a in reversed(soup.select(".wp-manga-chapter a")): + chap_id = len(self.chapters) + 1 + vol_id = 1 + len(self.chapters) // 100 + if chap_id % 100 == 1: + self.volumes.append({"id": vol_id}) + # end if + self.chapters.append( + { + "id": chap_id, + "volume": vol_id, + "title": a.text.strip(), + "url": self.absolute_url(a["href"]), + } + ) + # end for + + # end def + + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Visiting %s', chapter['url']) + soup = self.get_soup(chapter['url']) + contents = soup.select('.reading-content p') + return ''.join([str(p) for p in contents]) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/wspadancewichita.py b/lncrawl/sources/wspadancewichita.py new file mode 100644 index 000000000..9e4d0a166 --- /dev/null +++ b/lncrawl/sources/wspadancewichita.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +import json +import logging +import re +from ..utils.crawler import Crawler +from ..utils.cleaner import cleanup_text + +logger = logging.getLogger(__name__) +search_url = 'http://wspadancewichita.com/search?keyword=%s' +full_chapter_url = 'http://wspadancewichita.com/ajax/chapter-archive?novelId=%s' + + +class wspadancewichita(Crawler): + base_url = 'http://wspadancewichita.com/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for result in soup.select('div.col-novel-main div.list.list-novel div.row')[:5]: + url = self.absolute_url( + result.select_one('h3.novel-title a')['href']) + title = result.select_one('h3.novel-title a')['title'] + last_chapter = result.select_one('span.chr-text').text.strip() + results.append({ + 'url': url, + 'title': title, + 'info': 'last chapter : %s' % last_chapter, + }) + # end for + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url + '?waring=1') + + self.novel_title = soup.select_one('h3.title').text.strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('div.book img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + author = [] + for a in soup.select('ul.info.info-meta li')[1].select('a'): + author.append(a.text.strip()) + # end for + + self.novel_author = ", ".join(author) + 
+ logger.info('Novel author: %s', self.novel_author) + + novel_id = soup.select_one('div#rating')['data-novel-id'] + + chapter_url = full_chapter_url % novel_id + logger.debug('Visiting %s', chapter_url) + + chapter_soup = self.get_soup(chapter_url) + chapters = chapter_soup.select('li a') + for a in chapters: + for span in a.findAll('span'): + span.decompose() + # end for + # end for + + for x in chapters: + chap_id = len(self.chapters) + 1 + if len(self.chapters) % 100 == 0: + vol_id = chap_id//100 + 1 + vol_title = 'Volume ' + str(vol_id) + self.volumes.append({ + 'id': vol_id, + 'title': vol_title, + }) + # end if + self.chapters.append({ + 'id': chap_id, + 'volume': vol_id, + 'url': self.absolute_url(x['href']), + 'title': x['title'] or ('Chapter %d' % chap_id), + }) + # end for + # end def + + @cleanup_text + def download_chapter_body(self, chapter): + '''Download body of a single chapter and return as clean html format.''' + logger.info('Downloading %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + content = soup.select('#chr-content p') + if not content: + return '' + # end if + + return "".join(map(str, content)) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/wuxiacom.py b/lncrawl/sources/wuxiacom.py index 750128ef3..87ae02d71 100644 --- a/lncrawl/sources/wuxiacom.py +++ b/lncrawl/sources/wuxiacom.py @@ -29,43 +29,11 @@ def search_novel(self, query): results.append({ 'title': item['name'], 'url': book_url % item['slug'], - 'info': self.search_novel_info(book_url % item['slug']), }) # end for return results # end def - def search_novel_info(self, url): - '''Get novel title, autor, cover etc''' - logger.debug('Visiting %s', url) - soup = self.get_soup(url) - - volumes = [] - chapters = [] - for panel in soup.select('#accordion .panel-default'): - vol_id = int(panel.select_one('h4.panel-title .book').text) - vol_title = panel.select_one('h4.panel-title .title a').text - volumes.append({ - 'id': vol_id, - 'title': vol_title, - }) - for a in panel.select('ul.list-chapters li.chapter-item a'): - chap_id = len(self.chapters) + 1 - chapters.append({ - 'id': chap_id, - 'volume': vol_id, - 'url': self.absolute_url(a['href']), - 'title': a.text.strip() or ('Chapter %d' % chap_id), - }) - # end for - # end for - - info = 'Volume : %s, Chapter : %s, Latest: %s' % ( - len(volumes), len(chapters), chapters[-1]['title']) - - return info - # end def - def read_novel_info(self): '''Get novel title, autor, cover etc''' self.novel_id = self.novel_url.split( diff --git a/lncrawl/sources/wuxiaonline.py b/lncrawl/sources/wuxiaonline.py index 9fbe28686..c53665358 100644 --- a/lncrawl/sources/wuxiaonline.py +++ b/lncrawl/sources/wuxiaonline.py @@ -11,7 +11,7 @@ class WuxiaOnlineCrawler(Crawler): base_url = 'https://wuxiaworld.online/' - # DISABLING DUE TO CLOUDEFLARE CAPTCHA CHALLENGE + # NOTE: DISABLING DUE TO CLOUDEFLARE CAPTCHA CHALLENGE # def search_novel(self, query): # '''Gets a list of {title, url} matching the given query''' # soup = self.get_soup(search_url % query) diff --git a/lncrawl/sources/wuxiasite.py b/lncrawl/sources/wuxiasite.py index 27cd0cef8..8bd65a851 100644 --- a/lncrawl/sources/wuxiasite.py +++ b/lncrawl/sources/wuxiasite.py @@ -11,7 +11,7 @@ class WuxiaSiteCrawler(Crawler): base_url = 'https://wuxiaworld.site/' - # TODO: Disabled due to Cloudflare issue. + # NOTE: Disabled due to Cloudflare issue. 
# def search_novel(self, query): # query = query.lower().replace(' ', '+') # soup = self.get_soup(search_url % query) diff --git a/lncrawl/sources/wuxiaworldcloud.py b/lncrawl/sources/wuxiaworldcloud.py new file mode 100644 index 000000000..5bbe5dd34 --- /dev/null +++ b/lncrawl/sources/wuxiaworldcloud.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +import logging +import re +from concurrent import futures +from ..utils.crawler import Crawler + +logger = logging.getLogger(__name__) +search_url = 'http://wuxiaworld.cloud/search?keyword=%s' + + +class WuxiaWorldCloud(Crawler): + base_url = 'http://wuxiaworld.cloud/' + + def search_novel(self, query): + query = query.lower().replace(' ', '+') + soup = self.get_soup(search_url % query) + + results = [] + for tab in soup.select('.col-novel-main .list-novel .row'): + search_title = tab.select_one('.novel-title a') + latest = tab.select_one('.text-info a').text.strip() + results.append({ + 'title': search_title.text.strip(), + 'url': self.absolute_url( + tab.select_one('.novel-title a')['href']), + 'info': 'Latest chapter: %s' % (latest) + }) + # end for + + return results + # end def + + def read_novel_info(self): + '''Get novel title, autor, cover etc''' + logger.debug('Visiting %s', self.novel_url) + soup = self.get_soup(self.novel_url) + + self.novel_title = ' '.join([ + str(x) + for x in soup.select_one('.title').contents + if not x.name + ]).strip() + logger.info('Novel title: %s', self.novel_title) + + self.novel_cover = self.absolute_url( + soup.select_one('.book img')['src']) + logger.info('Novel cover: %s', self.novel_cover) + + author = soup.find_all(href=re.compile('author')) + if len(author) == 2: + self.novel_author = author[0].text + ' (' + author[1].text + ')' + else: + self.novel_author = author[0].text + logger.info('Novel author: %s', self.novel_author) + + # This is copied from the Novelfull pagination 'hanlder' with minor tweaks + pagination_links = soup.select('.pagination li a') + pagination_page_numbers = [] + for pagination_link in pagination_links: + # Boxnovel.org pagination numbering boxes contain non-digit characters + if pagination_link.text.isdigit(): + pagination_page_numbers.append(int(pagination_link.text)) + + page_count = max( + pagination_page_numbers) if pagination_page_numbers else 0 + logger.info('Chapter list pages: %d' % page_count) + + logger.info('Getting chapters...') + futures_to_check = { + self.executor.submit( + self.download_chapter_list, + i + 1, + ): str(i) + for i in range(page_count + 1) + } + [x.result() for x in futures.as_completed(futures_to_check)] + + # Didn't test without this, but with pagination the chapters could be in different orders + logger.info('Sorting chapters...') + self.chapters.sort(key=lambda x: x['volume'] * 1000 + x['id']) + + # Copied straight from Novelfull + logger.info('Adding volumes...') + mini = self.chapters[0]['volume'] + maxi = self.chapters[-1]['volume'] + for i in range(mini, maxi + 1): + self.volumes.append({'id': i}) + # end for + # end def + + def download_chapter_list(self, page): + '''Download list of chapters and volumes.''' + url = self.novel_url.split('?')[0].strip('/') + url += '?page=%d&per-page=50' % page + soup = self.get_soup(url) + + for a in soup.select('ul.list-chapter li a'): + title = a['title'].strip() + + chapter_id = len(self.chapters) + 1 + # match = re.findall(r'ch(apter)? 
(\d+)', title, re.IGNORECASE) + # if len(match) == 1: + # chapter_id = int(match[0][1]) + # # end if + + volume_id = 1 + (chapter_id - 1) // 100 + match = re.findall(r'(book|vol|volume) (\d+)', + title, re.IGNORECASE) + if len(match) == 1: + volume_id = int(match[0][1]) + # end if + + data = { + 'title': title, + 'id': chapter_id, + 'volume': volume_id, + 'url': self.absolute_url(a['href']), + } + self.chapters.append(data) + # end for + # end def + + def download_chapter_body(self, chapter): # + # NOTE: Set `chapter['body_lock'] = True` to disable post-formatting. + # It can be useful in non-english sources, e.g. aixdzs, qidiancom, tiknovel + # + # Return an empty body if anything goes wrong. But you should not return `None`. + '''Download body of a single chapter and return as clean html format.''' + logger.info('Downloading %s', chapter['url']) + soup = self.get_soup(chapter['url']) + + contents = soup.select_one('div.chr-c, #chr-content') + for br in contents.select('br'): + br.decompose() + # end for + + return str(contents) + # end def +# end class \ No newline at end of file diff --git a/lncrawl/sources/wuxiaworldio.py b/lncrawl/sources/wuxiaworldio.py index 969c25b65..982378891 100644 --- a/lncrawl/sources/wuxiaworldio.py +++ b/lncrawl/sources/wuxiaworldio.py @@ -8,7 +8,10 @@ search_url = 'https://wuxiaworld.io/search.ajax?type=&query=%s' class WuxiaWorldIo(Crawler): - base_url = 'https://wuxiaworld.io/' + base_url = [ + 'https://wuxiaworld.io/', + 'https://wuxiaworld.name/', + ] def search_novel(self, query): '''Gets a list of {title, url} matching the given query'''