fix: A knowledge base created from MaxKB's online documentation only fetches data from the root URL; data from child URLs cannot be retrieved
shaohuzhang1 committed Apr 2, 2024
1 parent 548ebc2 commit ae0484a
Showing 4 changed files with 28 additions and 12 deletions.
6 changes: 3 additions & 3 deletions apps/common/handle/impl/text_split_handle.py

@@ -9,7 +9,7 @@
 import re
 from typing import List

-import chardet
+from charset_normalizer import detect

 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
@@ -26,7 +26,7 @@ def support(self, file, get_buffer):
         file_name: str = file.name.lower()
         if file_name.endswith(".md") or file_name.endswith('.txt'):
             return True
-        result = chardet.detect(buffer)
+        result = detect(buffer)
         if result['encoding'] != 'ascii' and result['confidence'] > 0.5:
             return True
         return False
@@ -38,7 +38,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
         else:
             split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
         try:
-            content = buffer.decode(detect(buffer)['encoding'])
+            content = buffer.decode(detect(buffer)['encoding'])
         except BaseException as e:
             return {'name': file.name,
                     'content': []}
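For context: charset_normalizer ships a chardet-compatible detect() helper that returns the same {'encoding', 'confidence', 'language'} shape, which is why the swap above needs no further code changes. A minimal sketch of the drop-in usage (the sample bytes are illustrative):

from charset_normalizer import detect

buffer = "知识库文档".encode("utf-8")          # stand-in for an uploaded file's bytes
result = detect(buffer)                        # chardet-compatible dict
if result['encoding'] is not None and result['confidence'] > 0.5:
    print(buffer.decode(result['encoding']))   # prints the original text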
30 changes: 24 additions & 6 deletions apps/common/util/fork.py

@@ -4,9 +4,8 @@
 import traceback
 from functools import reduce
 from typing import List, Set
-from urllib.parse import urljoin, urlparse, ParseResult, urlsplit
+from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse

-import chardet
 import html2text as ht
 import requests
 from bs4 import BeautifulSoup
@@ -44,6 +43,13 @@ def fork_child(child_link: ChildLink, selector_list: List[str], level: int, excl
             ForkManage.fork_child(child_link, selector_list, level - 1, exclude_link_url, fork_handler)


+def remove_fragment(url: str) -> str:
+    parsed_url = urlparse(url)
+    modified_url = ParseResult(scheme=parsed_url.scheme, netloc=parsed_url.netloc, path=parsed_url.path,
+                               params=parsed_url.params, query=parsed_url.query, fragment=None)
+    return urlunparse(modified_url)
+
+
 class Fork:
     class Response:
         def __init__(self, content: str, child_link_list: List[ChildLink], status, message: str):
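The new remove_fragment helper keeps in-page anchors (e.g. #install) from being treated as distinct child pages during the crawl. A runnable sketch of its behaviour, with an illustrative URL:

from urllib.parse import ParseResult, urlparse, urlunparse

def remove_fragment(url: str) -> str:
    # Rebuild the URL with every component kept except the #fragment.
    p = urlparse(url)
    return urlunparse(ParseResult(scheme=p.scheme, netloc=p.netloc, path=p.path,
                                  params=p.params, query=p.query, fragment=None))

print(remove_fragment("https://docs.example.com/guide/install.html#step-2"))
# https://docs.example.com/guide/install.html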
@@ -61,6 +67,7 @@ def error(message: str):
         return Fork.Response('', [], 500, message)

     def __init__(self, base_fork_url: str, selector_list: List[str]):
+        base_fork_url = remove_fragment(base_fork_url)
         self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.')
         parsed = urlsplit(base_fork_url)
         query = parsed.query
@@ -74,9 +81,11 @@ def __init__(self, base_fork_url: str, selector_list: List[str]):
                                     fragment='').geturl()

     def get_child_link_list(self, bf: BeautifulSoup):
-        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + self.base_fork_url + ").*"
+        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + self.base_fork_url + "|/).*"
         link_list = bf.find_all(name='a', href=re.compile(pattern))
-        result = [ChildLink(link.get('href'), link) for link in link_list]
+        result = [ChildLink(link.get('href'), link) if link.get('href').startswith(self.base_url) else ChildLink(
+            self.base_url + link.get('href'), link) for link in link_list]
+        result = [row for row in result if row.url.startswith(self.base_fork_url)]
         return result

     def get_content_html(self, bf: BeautifulSoup):
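This hunk appears to be the core of the fix: root-relative hrefs such as /guide/install.html previously failed the pattern and were silently dropped, so only the root page was ever fetched. They are now prefixed with base_url and the combined list is filtered back to the crawl scope. A sketch of that normalization, assuming illustrative values for Fork's base_url and base_fork_url attributes:

# Assumed stand-ins for self.base_url and self.base_fork_url.
base_url = 'https://docs.example.com'
base_fork_url = 'https://docs.example.com/guide/'

hrefs = ['/guide/install.html',                        # root-relative: previously dropped
         'https://docs.example.com/guide/usage.html',  # absolute and in scope: always kept
         '/blog/release.html']                         # made absolute, then filtered out

result = [h if h.startswith(base_url) else base_url + h for h in hrefs]
result = [u for u in result if u.startswith(base_fork_url)]
print(result)
# ['https://docs.example.com/guide/install.html', 'https://docs.example.com/guide/usage.html']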
@@ -122,9 +131,18 @@ def reset_beautiful_soup(self, bf: BeautifulSoup):

     @staticmethod
     def get_beautiful_soup(response):
-        encoding = response.encoding if response.encoding and response.encoding != 'ISO-8859-1' is not None else response.apparent_encoding
+        encoding = response.encoding if response.encoding and response.encoding is not 'ISO-8859-1' is not None else response.apparent_encoding
         html_content = response.content.decode(encoding)
-        return BeautifulSoup(html_content, "html.parser")
+        beautiful_soup = BeautifulSoup(html_content, "html.parser")
+        meta_list = beautiful_soup.find_all('meta')
+        charset_list = [meta.attrs.get('charset') for meta in meta_list if
+                        meta.attrs is not None and 'charset' in meta.attrs]
+        if len(charset_list) > 0:
+            charset = charset_list[0]
+            if charset is not encoding:
+                html_content = response.content.decode(charset)
+                return BeautifulSoup(html_content, "html.parser")
+        return beautiful_soup

     def fork(self):
         try:
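get_beautiful_soup now cross-checks the encoding taken from the HTTP response against the page's own <meta charset> declaration and re-decodes when they disagree, which matters when a server's headers mislabel the body. A runnable sketch of the same idea with illustrative HTML; note the sketch compares charsets with != rather than the commit's identity check:

from bs4 import BeautifulSoup

raw = '<html><head><meta charset="gbk"></head><body>在线文档</body></html>'.encode('gbk')
header_encoding = 'ISO-8859-1'                 # what a misconfigured server might report

soup = BeautifulSoup(raw.decode(header_encoding), 'html.parser')
charset_list = [m.attrs['charset'] for m in soup.find_all('meta') if 'charset' in m.attrs]
if charset_list and charset_list[0] != header_encoding:
    # The document's declared charset wins over the transport-level guess.
    soup = BeautifulSoup(raw.decode(charset_list[0]), 'html.parser')
print(soup.body.text)                          # -> 在线文档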
3 changes: 1 addition & 2 deletions apps/dataset/serializers/document_serializers.py

@@ -30,12 +30,11 @@
 from common.util.field_message import ErrMessage
 from common.util.file_util import get_file_content
 from common.util.fork import Fork
-from common.util.split_model import SplitModel, get_split_model
+from common.util.split_model import get_split_model
 from dataset.models.data_set import DataSet, Document, Paragraph, Problem, Type, Status, ProblemParagraphMapping
 from dataset.serializers.common_serializers import BatchSerializer, MetaSerializer
 from dataset.serializers.paragraph_serializers import ParagraphSerializers, ParagraphInstanceSerializer
 from smartdoc.conf import PROJECT_DIR
-import chardet


 class DocumentEditInstanceSerializer(ApiMixin, serializers.Serializer):
1 change: 0 additions & 1 deletion pyproject.toml

@@ -30,7 +30,6 @@ html2text = "^2024.2.26"
 langchain-openai = "^0.0.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
-chardet2 = "^2.0.3"
 pymupdf = "^1.24.0"
 python-docx = "^1.1.0"
