From ae0484ae99d271deead1397040ed9ab9d9d1bc58 Mon Sep 17 00:00:00 2001
From: shaohuzhang1
Date: Tue, 2 Apr 2024 18:31:43 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E5=88=9B=E5=BB=BA=20MaxKB=20=E5=9C=A8?=
 =?UTF-8?q?=E7=BA=BF=E6=96=87=E6=A1=A3=E7=9A=84=E7=9F=A5=E8=AF=86=E5=BA=93?=
 =?UTF-8?q?=EF=BC=8C=E5=8F=AA=E8=83=BD=E8=8E=B7=E5=8F=96=E6=A0=B9=E5=9C=B0?=
 =?UTF-8?q?=E5=9D=80=E6=95=B0=E6=8D=AE=EF=BC=8C=E5=AD=90=E5=9C=B0=E5=9D=80?=
 =?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=97=A0=E6=B3=95=E8=8E=B7=E5=8F=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (text in this section is ignored by `git am`):
 * fork.py, remove_fragment: pass fragment='' instead of None so every
   ParseResult field is a str, matching the call in Fork.__init__.
 * fork.py, get_child_link_list: re.escape() the base URL before it is
   embedded in the href regex, so '.', '+', '(' etc. match literally.
 * fork.py, get_beautiful_soup: compare encodings with != instead of
   `is not`; an identity test on str values does not compare contents,
   so the apparent_encoding fallback and the charset short-circuit
   never took effect.
 * Line structure and indentation were reconstructed from a
   whitespace-collapsed copy of this patch; verify the context lines
   against the tree before applying.

 apps/common/handle/impl/text_split_handle.py |  6 ++--
 apps/common/util/fork.py                     | 30 +++++++++++++++----
 .../serializers/document_serializers.py      |  3 +-
 pyproject.toml                               |  1 -
 4 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py
index 67f56c37d3..176c1d6090 100644
--- a/apps/common/handle/impl/text_split_handle.py
+++ b/apps/common/handle/impl/text_split_handle.py
@@ -9,7 +9,7 @@
 import re
 from typing import List
 
-import chardet
+from charset_normalizer import detect
 
 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
@@ -26,7 +26,7 @@ def support(self, file, get_buffer):
         file_name: str = file.name.lower()
         if file_name.endswith(".md") or file_name.endswith('.txt'):
             return True
-        result = chardet.detect(buffer)
+        result = detect(buffer)
         if result['encoding'] != 'ascii' and result['confidence'] > 0.5:
             return True
         return False
@@ -38,7 +38,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
         else:
             split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
         try:
-            content = buffer.decode(chardet.detect(buffer)['encoding'])
+            content = buffer.decode(detect(buffer)['encoding'])
         except BaseException as e:
             return {'name': file.name, 'content': []}
 
diff --git a/apps/common/util/fork.py b/apps/common/util/fork.py
index eba10bbb18..a7933ebdfb 100644
--- a/apps/common/util/fork.py
+++ b/apps/common/util/fork.py
@@ -4,9 +4,8 @@
 import traceback
 from functools import reduce
 from typing import List, Set
-from urllib.parse import urljoin, urlparse, ParseResult, urlsplit
+from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse
 
-import chardet
 import html2text as ht
 import requests
 from bs4 import BeautifulSoup
@@ -44,6 +43,13 @@ def fork_child(child_link: ChildLink, selector_list: List[str], level: int, excl
                     ForkManage.fork_child(child_link, selector_list, level - 1, exclude_link_url, fork_handler)
 
 
+def remove_fragment(url: str) -> str:
+    parsed_url = urlparse(url)
+    modified_url = ParseResult(scheme=parsed_url.scheme, netloc=parsed_url.netloc, path=parsed_url.path,
+                               params=parsed_url.params, query=parsed_url.query, fragment='')
+    return urlunparse(modified_url)
+
+
 class Fork:
     class Response:
         def __init__(self, content: str, child_link_list: List[ChildLink], status, message: str):
@@ -61,6 +67,7 @@ def error(message: str):
             return Fork.Response('', [], 500, message)
 
     def __init__(self, base_fork_url: str, selector_list: List[str]):
+        base_fork_url = remove_fragment(base_fork_url)
         self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.')
         parsed = urlsplit(base_fork_url)
         query = parsed.query
@@ -74,9 +81,11 @@ def __init__(self, base_fork_url: str, selector_list: List[str]):
                                     fragment='').geturl()
 
     def get_child_link_list(self, bf: BeautifulSoup):
-        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + self.base_fork_url + ").*"
+        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + re.escape(self.base_fork_url) + "|/).*"
         link_list = bf.find_all(name='a', href=re.compile(pattern))
-        result = [ChildLink(link.get('href'), link) for link in link_list]
+        result = [ChildLink(link.get('href'), link) if link.get('href').startswith(self.base_url) else ChildLink(
+            self.base_url + link.get('href'), link) for link in link_list]
+        result = [row for row in result if row.url.startswith(self.base_fork_url)]
         return result
 
     def get_content_html(self, bf: BeautifulSoup):
@@ -122,9 +131,18 @@ def reset_beautiful_soup(self, bf: BeautifulSoup):
 
     @staticmethod
     def get_beautiful_soup(response):
-        encoding = response.encoding if response.encoding and response.encoding != 'ISO-8859-1' is not None else response.apparent_encoding
+        encoding = response.encoding if response.encoding and response.encoding != 'ISO-8859-1' else response.apparent_encoding
         html_content = response.content.decode(encoding)
-        return BeautifulSoup(html_content, "html.parser")
+        beautiful_soup = BeautifulSoup(html_content, "html.parser")
+        meta_list = beautiful_soup.find_all('meta')
+        charset_list = [meta.attrs.get('charset') for meta in meta_list if
+                        meta.attrs is not None and 'charset' in meta.attrs]
+        if len(charset_list) > 0:
+            charset = charset_list[0]
+            if charset != encoding:
+                html_content = response.content.decode(charset)
+                return BeautifulSoup(html_content, "html.parser")
+        return beautiful_soup
 
     def fork(self):
         try:
diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py
index 8e062d8b4d..2e469dfe9d 100644
--- a/apps/dataset/serializers/document_serializers.py
+++ b/apps/dataset/serializers/document_serializers.py
@@ -30,12 +30,11 @@
 from common.util.field_message import ErrMessage
 from common.util.file_util import get_file_content
 from common.util.fork import Fork
-from common.util.split_model import SplitModel, get_split_model
+from common.util.split_model import get_split_model
 from dataset.models.data_set import DataSet, Document, Paragraph, Problem, Type, Status, ProblemParagraphMapping
 from dataset.serializers.common_serializers import BatchSerializer, MetaSerializer
 from dataset.serializers.paragraph_serializers import ParagraphSerializers, ParagraphInstanceSerializer
 from smartdoc.conf import PROJECT_DIR
-import chardet
 
 
 class DocumentEditInstanceSerializer(ApiMixin, serializers.Serializer):
diff --git a/pyproject.toml b/pyproject.toml
index b2152f4c9c..5d2a520d4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,6 @@ html2text = "^2024.2.26"
 langchain-openai = "^0.0.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
-chardet2 = "^2.0.3"
 pymupdf = "^1.24.0"
 python-docx = "^1.1.0"
 