From ae0484ae99d271deead1397040ed9ab9d9d1bc58 Mon Sep 17 00:00:00 2001
From: shaohuzhang1 <shaohu.zhang@fit2cloud.com>
Date: Tue, 2 Apr 2024 18:31:43 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E5=88=9B=E5=BB=BA=20MaxKB=20=E5=9C=A8?=
 =?UTF-8?q?=E7=BA=BF=E6=96=87=E6=A1=A3=E7=9A=84=E7=9F=A5=E8=AF=86=E5=BA=93?=
 =?UTF-8?q?=EF=BC=8C=E5=8F=AA=E8=83=BD=E8=8E=B7=E5=8F=96=E6=A0=B9=E5=9C=B0?=
 =?UTF-8?q?=E5=9D=80=E6=95=B0=E6=8D=AE=EF=BC=8C=E5=AD=90=E5=9C=B0=E5=9D=80?=
 =?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=97=A0=E6=B3=95=E8=8E=B7=E5=8F=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 apps/common/handle/impl/text_split_handle.py  |  6 ++--
 apps/common/util/fork.py                      | 30 +++++++++++++++----
 .../serializers/document_serializers.py       |  3 +-
 pyproject.toml                                |  1 -
 4 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py
index 67f56c37d3..176c1d6090 100644
--- a/apps/common/handle/impl/text_split_handle.py
+++ b/apps/common/handle/impl/text_split_handle.py
@@ -9,7 +9,7 @@
 import re
 from typing import List
 
-import chardet
+from charset_normalizer import detect
 
 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
@@ -26,7 +26,7 @@ def support(self, file, get_buffer):
         file_name: str = file.name.lower()
         if file_name.endswith(".md") or file_name.endswith('.txt'):
             return True
-        result = chardet.detect(buffer)
+        result = detect(buffer)
         if result['encoding'] != 'ascii' and result['confidence'] > 0.5:
             return True
         return False
@@ -38,7 +38,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
         else:
             split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
         try:
-            content = buffer.decode(chardet.detect(buffer)['encoding'])
+            content = buffer.decode(detect(buffer)['encoding'])
         except BaseException as e:
             return {'name': file.name,
                     'content': []}
diff --git a/apps/common/util/fork.py b/apps/common/util/fork.py
index eba10bbb18..a7933ebdfb 100644
--- a/apps/common/util/fork.py
+++ b/apps/common/util/fork.py
@@ -4,9 +4,8 @@
 import traceback
 from functools import reduce
 from typing import List, Set
-from urllib.parse import urljoin, urlparse, ParseResult, urlsplit
+from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse
 
-import chardet
 import html2text as ht
 import requests
 from bs4 import BeautifulSoup
@@ -44,6 +43,13 @@ def fork_child(child_link: ChildLink, selector_list: List[str], level: int, excl
                 ForkManage.fork_child(child_link, selector_list, level - 1, exclude_link_url, fork_handler)
 
 
+def remove_fragment(url: str) -> str:
+    """Return ``url`` with its ``#fragment`` part dropped; every other component is kept."""
+    # urllib.parse represents "no fragment" as an empty string, so swap only that field.
+    without_fragment = urlparse(url)._replace(fragment='')
+    return urlunparse(without_fragment)
+
+
 class Fork:
     class Response:
         def __init__(self, content: str, child_link_list: List[ChildLink], status, message: str):
@@ -61,6 +67,7 @@ def error(message: str):
             return Fork.Response('', [], 500, message)
 
     def __init__(self, base_fork_url: str, selector_list: List[str]):
+        base_fork_url = remove_fragment(base_fork_url)
         self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.')
         parsed = urlsplit(base_fork_url)
         query = parsed.query
@@ -74,9 +81,11 @@ def __init__(self, base_fork_url: str, selector_list: List[str]):
                                     fragment='').geturl()
 
     def get_child_link_list(self, bf: BeautifulSoup):
-        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + self.base_fork_url + ").*"
+        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + re.escape(self.base_fork_url) + "|/).*"
         link_list = bf.find_all(name='a', href=re.compile(pattern))
-        result = [ChildLink(link.get('href'), link) for link in link_list]
+        # Resolve hrefs with urljoin: plain concatenation glues a relative "page.html" onto the host name.
+        result = [ChildLink(urljoin(self.base_fork_url, link.get('href')), link) for link in link_list]
+        result = [row for row in result if row.url.startswith(self.base_fork_url)]
         return result
 
     def get_content_html(self, bf: BeautifulSoup):
@@ -122,9 +131,18 @@ def reset_beautiful_soup(self, bf: BeautifulSoup):
 
     @staticmethod
     def get_beautiful_soup(response):
-        encoding = response.encoding if response.encoding and response.encoding != 'ISO-8859-1' is not None else response.apparent_encoding
+        encoding = response.encoding if response.encoding and response.encoding != 'ISO-8859-1' else response.apparent_encoding
         html_content = response.content.decode(encoding)
-        return BeautifulSoup(html_content, "html.parser")
+        beautiful_soup = BeautifulSoup(html_content, "html.parser")
+        # A <meta charset="..."> declared in the page takes precedence over the header/guessed encoding.
+        charset_list = [meta.attrs.get('charset') for meta in beautiful_soup.find_all('meta') if
+                        meta.attrs is not None and 'charset' in meta.attrs]
+        if len(charset_list) > 0:
+            charset = charset_list[0]
+            # Compare names by value (case-insensitively), never by identity; skip an empty charset attribute.
+            if charset and charset.lower() != encoding.lower():
+                return BeautifulSoup(response.content.decode(charset), "html.parser")
+        return beautiful_soup
 
     def fork(self):
         try:
diff --git a/apps/dataset/serializers/document_serializers.py b/apps/dataset/serializers/document_serializers.py
index 8e062d8b4d..2e469dfe9d 100644
--- a/apps/dataset/serializers/document_serializers.py
+++ b/apps/dataset/serializers/document_serializers.py
@@ -30,12 +30,11 @@
 from common.util.field_message import ErrMessage
 from common.util.file_util import get_file_content
 from common.util.fork import Fork
-from common.util.split_model import SplitModel, get_split_model
+from common.util.split_model import get_split_model
 from dataset.models.data_set import DataSet, Document, Paragraph, Problem, Type, Status, ProblemParagraphMapping
 from dataset.serializers.common_serializers import BatchSerializer, MetaSerializer
 from dataset.serializers.paragraph_serializers import ParagraphSerializers, ParagraphInstanceSerializer
 from smartdoc.conf import PROJECT_DIR
-import chardet
 
 
 class DocumentEditInstanceSerializer(ApiMixin, serializers.Serializer):
diff --git a/pyproject.toml b/pyproject.toml
index b2152f4c9c..5d2a520d4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,6 @@ html2text = "^2024.2.26"
 langchain-openai = "^0.0.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
-chardet2 = "^2.0.3"
 pymupdf = "^1.24.0"
 python-docx = "^1.1.0"