Commit

Merge pull request #878 from dipu-bd/dev
Version 2.26.3
dipu-bd authored May 19, 2021
2 parents 158cdca + 630517e commit 188ac8a
Showing 57 changed files with 2,387 additions and 632 deletions.
58 changes: 36 additions & 22 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lncrawl/VERSION
@@ -1 +1 @@
-2.26.2
+2.26.3
11 changes: 9 additions & 2 deletions lncrawl/core/novel_info.py
@@ -4,14 +4,19 @@
 """
 import json
 import os
+import re
 
 from .. import constants as C
 from ..utils.crawler import Crawler
 
 
+def __format_title(text):
+    return re.sub(r'\s+', ' ', text).strip()
+# end def
+
 def format_novel(crawler: Crawler):
-    crawler.novel_title = crawler.novel_title.strip()
-    crawler.novel_author = crawler.novel_author.strip()
+    crawler.novel_title = __format_title(crawler.novel_title)
+    crawler.novel_author = __format_title(crawler.novel_author)
     # crawler.novel_title = crawler.cleanup_text(crawler.novel_title)
     # crawler.novel_author = crawler.cleanup_text(crawler.novel_author)
     format_volumes(crawler)
@@ -29,6 +34,7 @@ def format_volumes(crawler: Crawler):
         if not ('title' in vol and vol['title']):
             vol['title'] = title
         # end if
+        vol['title'] = __format_title(vol['title'])
     # end for
 # end def

@@ -39,6 +45,7 @@ def format_chapters(crawler: Crawler):
         if not ('title' in item and item['title']):
             item['title'] = title
         # end if
+        item['title'] = __format_title(item['title'])
 
         volume = [x for x in crawler.volumes if x['id'] == item['volume']]
         if len(volume) == 0:
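
The new __format_title helper collapses every run of whitespace (spaces, tabs, newlines) into a single space and trims both ends, so titles and author names scraped from HTML come out uniform. A standalone sketch of the same behavior:

import re

def format_title(text):
    # Collapse internal whitespace runs to single spaces, then trim the ends.
    return re.sub(r'\s+', ' ', text).strip()

assert format_title('  Vol. 1 \n\t Chapter   2  ') == 'Vol. 1 Chapter 2'
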
123 changes: 72 additions & 51 deletions lncrawl/sources/1stkissnovel.py
@@ -1,85 +1,106 @@
 # -*- coding: utf-8 -*-
 import json
 import logging
-from urllib.parse import quote_plus
+import re
+from urllib.parse import urlparse
 from ..utils.crawler import Crawler
 
 logger = logging.getLogger(__name__)
-search_url = 'https://1stkissnovel.love/?s=%s&post_type=wp-manga&author=&artist=&release='
+search_url = (
+    "https://1stkissnovel.love/?s=%s&post_type=wp-manga&author=&artist=&release="
+)
 wp_admin_ajax_url = 'https://1stkissnovel.love/wp-admin/admin-ajax.php'
 
 
 class OneKissNovelCrawler(Crawler):
     base_url = 'https://1stkissnovel.love/'
 
-    # TODO: Error 503 Backend fetch failed
-    # def search_novel(self, query):
-    #     query = quote_plus(query.lower())
-    #     soup = self.get_soup(search_url % query)
-    #
-    #     results = []
-    #     for tab in soup.select('.c-tabs-item__content')[:20]:
-    #         a = tab.select_one('.post-title h3 a')
-    #         latest = tab.select_one('.latest-chap .chapter a').text
-    #         votes = tab.select_one('.rating .total_votes').text
-    #         results.append({
-    #             'title': a.text.strip(),
-    #             'url': self.absolute_url(a['href']),
-    #             'info': '%s | Rating: %s' % (latest, votes),
-    #         })
-    #     # end for
-    #
-    #     return results
-    # # end def
+    def search_novel(self, query):
+        query = query.lower().replace(" ", "+")
+        soup = self.get_soup(search_url % query)
+
+        results = []
+        for tab in soup.select(".c-tabs-item__content"):
+            a = tab.select_one(".post-title h3 a")
+            latest = tab.select_one(".latest-chap .chapter a").text
+            votes = tab.select_one(".rating .total_votes").text
+            results.append(
+                {
+                    "title": a.text.strip(),
+                    "url": self.absolute_url(a["href"]),
+                    "info": "%s | Rating: %s" % (latest, votes),
+                }
+            )
+        # end for
+
+        return results
+    # end def
 
     def read_novel_info(self):
         '''Get novel title, autor, cover etc'''
-        logger.debug('Visiting %s', self.novel_url)
+        logger.debug("Visiting %s", self.novel_url)
         soup = self.get_soup(self.novel_url)
 
-        possible_title = soup.select_one('.post-title h1')
-        for span in possible_title.select('span'):
+        possible_title = soup.select_one(".post-title h1")
+        for span in possible_title.select("span"):
             span.extract()
         # end for
         self.novel_title = possible_title.text.strip()
-        logger.info('Novel title: %s', self.novel_title)
+        logger.info("Novel title: %s", self.novel_title)
 
-        self.novel_cover = soup.select_one(
-            'meta[property="og:image"]')['content']
-        logger.info('Novel cover: %s', self.novel_cover)
+        self.novel_cover = self.absolute_url(
+            soup.select_one(".summary_image a img")["src"]
+        )
+        logger.info("Novel cover: %s", self.novel_cover)
 
-        self.novel_author = ' '.join([
-            a.text.strip()
-            for a in soup.select('.author-content a[href*="manga-author"]')
-        ])
-        logger.info('%s', self.novel_author)
+        self.novel_author = " ".join(
+            [
+                a.text.strip()
+                for a in soup.select('.author-content a[href*="manga-author"]')
+            ]
+        )
+        logger.info("%s", self.novel_author)
 
-        volumes = set()
-        chapters = soup.select('ul.main li.wp-manga-chapter a')
-        for a in reversed(chapters):
+        self.novel_id = soup.select_one("#manga-chapters-holder")["data-id"]
+        logger.info("Novel id: %s", self.novel_id)
+
+        # For getting cookies
+        # self.submit_form(wp_admin_ajax_url, data={
+        #     'action': 'manga_views',
+        #     'manga': self.novel_id,
+        # })
+        # print(self.cookies)
+        response = self.submit_form(wp_admin_ajax_url, data={
+            'action': 'manga_get_chapters',
+            'manga': self.novel_id,
+        })
+        soup = self.make_soup(response)
+        for a in reversed(soup.select(".wp-manga-chapter a")):
             chap_id = len(self.chapters) + 1
-            vol_id = (chap_id - 1) // 100 + 1
-            volumes.add(vol_id)
-            self.chapters.append({
-                'id': chap_id,
-                'volume': vol_id,
-                'url': self.absolute_url(a['href']),
-                'title': a.text.strip() or ('Chapter %d' % chap_id),
-            })
+            vol_id = 1 + len(self.chapters) // 100
+            if chap_id % 100 == 1:
+                self.volumes.append({"id": vol_id})
+            # end if
+            self.chapters.append(
+                {
+                    "id": chap_id,
+                    "volume": vol_id,
+                    "title": a.text.strip(),
+                    "url": self.absolute_url(a["href"]),
+                }
+            )
         # end for
-
-        self.volumes = [{'id': x} for x in volumes]
     # end def
 
     def download_chapter_body(self, chapter):
         '''Download body of a single chapter and return as clean html format.'''
-        logger.info('Downloading %s', chapter['url'])
-        soup = self.get_soup(chapter['url'])
+        logger.info("Visiting %s", chapter["url"])
+        soup = self.get_soup(chapter["url"])
 
         contents = soup.select_one('div.text-left')
         for bad in contents.select('h3, .code-block, script, .adsbygoogle'):
             bad.decompose()
         # end for
 
         body = self.extract_contents(contents)
         return '<p>' + '</p><p>'.join(body) + '</p>'
     # end def
-# end class
\ No newline at end of file
+# end class
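
Both crawlers touched by this commit now fetch the chapter list from the wp-manga theme's admin-ajax.php endpoint instead of scraping it off the novel page. A minimal standalone sketch of that request, using requests and bs4 directly rather than lncrawl's submit_form/make_soup wrappers; the novel id below is hypothetical and would normally be read from the data-id attribute of the page's #manga-chapters-holder element:

import requests
from bs4 import BeautifulSoup

wp_admin_ajax_url = 'https://1stkissnovel.love/wp-admin/admin-ajax.php'

# POST the theme's AJAX action together with the novel's id.
response = requests.post(wp_admin_ajax_url, data={
    'action': 'manga_get_chapters',
    'manga': '1234',  # hypothetical data-id, for illustration only
})

# The endpoint answers with an HTML fragment of .wp-manga-chapter items,
# newest first, so the crawler iterates it in reverse.
soup = BeautifulSoup(response.text, 'lxml')
for a in reversed(soup.select('.wp-manga-chapter a')):
    print(a.text.strip(), '->', a['href'])
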
91 changes: 91 additions & 0 deletions lncrawl/sources/amnesiactl.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+import json
+import logging
+import re
+from urllib.parse import urlparse
+from ..utils.crawler import Crawler
+
+logger = logging.getLogger(__name__)
+# search_url = 'https://amnesiactl.com/?s=%s&post_type=wp-manga'
+chapter_list_url = 'https://amnesiactl.com/wp-admin/admin-ajax.php'
+
+
+class Amnesiactl(Crawler):
+    base_url = 'https://amnesiactl.com/'
+
+    # NOTE: Site doesn't have proper search layout.
+    # def search_novel(self, query):
+    #     query = query.lower().replace(' ', '+')
+    #     soup = self.get_soup(search_url % query)
+
+    #     results = []
+    #     for tab in soup.select('.c-tabs-item__content'):
+    #         a = tab.select_one('.post-title h3 a')
+    #         latest = tab.select_one('.latest-chap .chapter a').text
+    #         votes = tab.select_one('.rating .total_votes').text
+    #         results.append({
+    #             'title': a.text.strip(),
+    #             'url': self.absolute_url(a['href']),
+    #             'info': '%s | Rating: %s' % (latest, votes),
+    #         })
+    #     # end for
+
+    #     return results
+    # # end def
+
+    def read_novel_info(self):
+        '''Get novel title, autor, cover etc'''
+        logger.debug('Visiting %s', self.novel_url)
+        soup = self.get_soup(self.novel_url)
+
+        possible_title = soup.select_one('.post-title h1')
+        for span in possible_title.select('span'):
+            span.extract()
+        # end for
+        self.novel_title = possible_title.text.strip()
+        logger.info('Novel title: %s', self.novel_title)
+
+        self.novel_cover = self.absolute_url(
+            soup.select_one('.summary_image a img')['src'])
+        logger.info('Novel cover: %s', self.novel_cover)
+
+        self.novel_author = ' '.join([
+            a.text.strip()
+            for a in soup.select('.author-content a[href*="novel-author"]')
+        ])
+        logger.info('%s', self.novel_author)
+
+        self.novel_id = soup.select_one('#manga-chapters-holder')['data-id']
+        logger.info('Novel id: %s', self.novel_id)
+
+        response = self.submit_form(chapter_list_url, data={
+            'action': 'manga_get_chapters',
+            'manga': self.novel_id,
+        })
+        soup = self.make_soup(response)
+        for a in reversed(soup.select(".wp-manga-chapter a")):
+            chap_id = len(self.chapters) + 1
+            vol_id = 1 + len(self.chapters) // 100
+            if chap_id % 100 == 1:
+                self.volumes.append({"id": vol_id})
+            # end if
+            self.chapters.append(
+                {
+                    "id": chap_id,
+                    "volume": vol_id,
+                    "title": a.text.strip(),
+                    "url": self.absolute_url(a["href"]),
+                }
+            )
+        # end for
+
+    # end def
+
+    def download_chapter_body(self, chapter):
+        '''Download body of a single chapter and return as clean html format.'''
+        logger.info('Visiting %s', chapter['url'])
+        soup = self.get_soup(chapter['url'])
+        contents = soup.select('.reading-content p')
+        return ''.join([str(p) for p in contents])
+    # end def
+# end class
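
Both new chapter loops group chapters into volumes of 100 with the same arithmetic: vol_id = 1 + len(self.chapters) // 100 derives the volume from the running chapter count, and chap_id % 100 == 1 holds exactly when a chapter opens a new block of 100, so each volume dict is appended exactly once. A self-contained sketch of the grouping:

chapters = []
volumes = []
for title in ['Chapter %d' % n for n in range(1, 251)]:  # 250 dummy titles
    chap_id = len(chapters) + 1
    vol_id = 1 + len(chapters) // 100
    if chap_id % 100 == 1:
        # Chapters 1, 101, 201 open volumes 1, 2, 3 respectively.
        volumes.append({'id': vol_id})
    # end if
    chapters.append({'id': chap_id, 'volume': vol_id, 'title': title})
# end for

assert [v['id'] for v in volumes] == [1, 2, 3]
assert chapters[99]['volume'] == 1 and chapters[100]['volume'] == 2
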
4 changes: 1 addition & 3 deletions lncrawl/sources/bestlightnovel.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-
 import logging
-import re
-from bs4 import BeautifulSoup
 from ..utils.crawler import Crawler
 
 logger = logging.getLogger(__name__)
@@ -20,7 +18,7 @@ def search_novel(self, query):
 
         results = []
         for novel in data:
-            titleSoup = BeautifulSoup(novel['name'], 'lxml')
+            titleSoup = self.make_soup(novel['name'])
             results.append({
                 'title': titleSoup.body.text.title(),
                 'url': novel_page_url % novel['id_encode'],
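
The bestlightnovel.py change drops the direct bs4 import in favor of the crawler's own make_soup helper, which the sources above also use on AJAX responses, so parser selection lives in one place. A rough sketch of what such a wrapper might look like; the actual helper in lncrawl's Crawler class may differ in signature and parser choice:

from bs4 import BeautifulSoup

def make_soup(markup):
    # One shared place to pick the HTML parser; sources stop importing bs4.
    return BeautifulSoup(markup, 'lxml')
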