fix: A knowledge base created from MaxKB's online documentation only fetches data from the root URL; data from child URLs cannot be retrieved
shaohuzhang1 committed Apr 2, 2024
1 parent 548ebc2 commit ae0484a
Showing 4 changed files with 28 additions and 12 deletions.
6 changes: 3 additions & 3 deletions apps/common/handle/impl/text_split_handle.py

@@ -9,7 +9,7 @@
 import re
 from typing import List

-import chardet
+from charset_normalizer import detect

 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
@@ -26,7 +26,7 @@ def support(self, file, get_buffer):
         file_name: str = file.name.lower()
         if file_name.endswith(".md") or file_name.endswith('.txt'):
             return True
-        result = chardet.detect(buffer)
+        result = detect(buffer)
         if result['encoding'] != 'ascii' and result['confidence'] > 0.5:
             return True
         return False
@@ -38,7 +38,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
         else:
             split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
         try:
-            content = buffer.decode(detect(buffer)['encoding'])
+            content = buffer.decode(detect(buffer)['encoding'])
         except BaseException as e:
             return {'name': file.name,
                     'content': []}
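For context: charset_normalizer ships a chardet-compatible detect() helper that returns the same {'encoding', 'confidence', 'language'} shape, which is why the swap above needs no further code changes. A minimal sketch of the drop-in usage (the sample bytes are illustrative):

from charset_normalizer import detect

buffer = "知识库文档".encode("utf-8")          # stand-in for an uploaded file's bytes
result = detect(buffer)                        # chardet-compatible dict
if result['encoding'] is not None and result['confidence'] > 0.5:
    print(buffer.decode(result['encoding']))   # prints the original text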
30 changes: 24 additions & 6 deletions apps/common/util/fork.py

@@ -4,9 +4,8 @@
 import traceback
 from functools import reduce
 from typing import List, Set
-from urllib.parse import urljoin, urlparse, ParseResult, urlsplit
+from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, urlunparse

-import chardet
 import html2text as ht
 import requests
 from bs4 import BeautifulSoup
@@ -44,6 +43,13 @@ def fork_child(child_link: ChildLink, selector_list: List[str], level: int, excl
             ForkManage.fork_child(child_link, selector_list, level - 1, exclude_link_url, fork_handler)


+def remove_fragment(url: str) -> str:
+    parsed_url = urlparse(url)
+    modified_url = ParseResult(scheme=parsed_url.scheme, netloc=parsed_url.netloc, path=parsed_url.path,
+                               params=parsed_url.params, query=parsed_url.query, fragment=None)
+    return urlunparse(modified_url)
+
+
 class Fork:
     class Response:
         def __init__(self, content: str, child_link_list: List[ChildLink], status, message: str):
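The new remove_fragment helper keeps in-page anchors (e.g. #install) from being treated as distinct child pages during the crawl. A runnable sketch of its behaviour, with an illustrative URL:

from urllib.parse import ParseResult, urlparse, urlunparse

def remove_fragment(url: str) -> str:
    # Rebuild the URL with every component kept except the #fragment.
    p = urlparse(url)
    return urlunparse(ParseResult(scheme=p.scheme, netloc=p.netloc, path=p.path,
                                  params=p.params, query=p.query, fragment=None))

print(remove_fragment("https://docs.example.com/guide/install.html#step-2"))
# https://docs.example.com/guide/install.html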
@@ -61,6 +67,7 @@ def error(message: str):
         return Fork.Response('', [], 500, message)

     def __init__(self, base_fork_url: str, selector_list: List[str]):
+        base_fork_url = remove_fragment(base_fork_url)
         self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.')
         parsed = urlsplit(base_fork_url)
         query = parsed.query
@@ -74,9 +81,11 @@ def __init__(self, base_fork_url: str, selector_list: List[str]):
                                     fragment='').geturl()

     def get_child_link_list(self, bf: BeautifulSoup):
-        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + self.base_fork_url + ").*"
+        pattern = "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|" + self.base_fork_url + "|/).*"
         link_list = bf.find_all(name='a', href=re.compile(pattern))
-        result = [ChildLink(link.get('href'), link) for link in link_list]
+        result = [ChildLink(link.get('href'), link) if link.get('href').startswith(self.base_url) else ChildLink(
+            self.base_url + link.get('href'), link) for link in link_list]
+        result = [row for row in result if row.url.startswith(self.base_fork_url)]
         return result

     def get_content_html(self, bf: BeautifulSoup):
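This hunk appears to be the core of the fix: root-relative hrefs such as /guide/install.html previously failed the pattern and were silently dropped, so only the root page was ever fetched. They are now prefixed with base_url and the combined list is filtered back to the crawl scope. A sketch of that normalization, assuming illustrative values for Fork's base_url and base_fork_url attributes:

# Assumed stand-ins for self.base_url and self.base_fork_url.
base_url = 'https://docs.example.com'
base_fork_url = 'https://docs.example.com/guide/'

hrefs = ['/guide/install.html',                        # root-relative: previously dropped
         'https://docs.example.com/guide/usage.html',  # absolute and in scope: always kept
         '/blog/release.html']                         # made absolute, then filtered out

result = [h if h.startswith(base_url) else base_url + h for h in hrefs]
result = [u for u in result if u.startswith(base_fork_url)]
print(result)
# ['https://docs.example.com/guide/install.html', 'https://docs.example.com/guide/usage.html']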
@@ -122,9 +131,18 @@ def reset_beautiful_soup(self, bf: BeautifulSoup):

     @staticmethod
     def get_beautiful_soup(response):
-        encoding = response.encoding if response.encoding and response.encoding != 'ISO-8859-1' is not None else response.apparent_encoding
+        encoding = response.encoding if response.encoding and response.encoding is not 'ISO-8859-1' is not None else response.apparent_encoding
         html_content = response.content.decode(encoding)
-        return BeautifulSoup(html_content, "html.parser")
+        beautiful_soup = BeautifulSoup(html_content, "html.parser")
+        meta_list = beautiful_soup.find_all('meta')
+        charset_list = [meta.attrs.get('charset') for meta in meta_list if
+                        meta.attrs is not None and 'charset' in meta.attrs]
+        if len(charset_list) > 0:
+            charset = charset_list[0]
+            if charset is not encoding:
+                html_content = response.content.decode(charset)
+                return BeautifulSoup(html_content, "html.parser")
+        return beautiful_soup

     def fork(self):
         try:
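get_beautiful_soup now cross-checks the encoding taken from the HTTP response against the page's own <meta charset> declaration and re-decodes when they disagree, which matters when a server's headers mislabel the body. A runnable sketch of the same idea with illustrative HTML; note the sketch compares charsets with != rather than the commit's identity check:

from bs4 import BeautifulSoup

raw = '<html><head><meta charset="gbk"></head><body>在线文档</body></html>'.encode('gbk')
header_encoding = 'ISO-8859-1'                 # what a misconfigured server might report

soup = BeautifulSoup(raw.decode(header_encoding), 'html.parser')
charset_list = [m.attrs['charset'] for m in soup.find_all('meta') if 'charset' in m.attrs]
if charset_list and charset_list[0] != header_encoding:
    # The document's declared charset wins over the transport-level guess.
    soup = BeautifulSoup(raw.decode(charset_list[0]), 'html.parser')
print(soup.body.text)                          # -> 在线文档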
3 changes: 1 addition & 2 deletions apps/dataset/serializers/document_serializers.py

@@ -30,12 +30,11 @@
 from common.util.field_message import ErrMessage
 from common.util.file_util import get_file_content
 from common.util.fork import Fork
-from common.util.split_model import SplitModel, get_split_model
+from common.util.split_model import get_split_model
 from dataset.models.data_set import DataSet, Document, Paragraph, Problem, Type, Status, ProblemParagraphMapping
 from dataset.serializers.common_serializers import BatchSerializer, MetaSerializer
 from dataset.serializers.paragraph_serializers import ParagraphSerializers, ParagraphInstanceSerializer
 from smartdoc.conf import PROJECT_DIR
-import chardet


 class DocumentEditInstanceSerializer(ApiMixin, serializers.Serializer):
1 change: 0 additions & 1 deletion pyproject.toml

@@ -30,7 +30,6 @@ html2text = "^2024.2.26"
 langchain-openai = "^0.0.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
-chardet2 = "^2.0.3"
 pymupdf = "^1.24.0"
 python-docx = "^1.1.0"
