Skip to content

Commit

Permalink
fix: 【知识库】导入非utf8 编码的txt文件,分段内容是空白
Browse files: browse the repository at this point in the history
  • Loading branch information
shaohuzhang1 committed Mar 25, 2024
1 parent f540bbe commit d732a46
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 3 deletions.
4 changes: 2 additions & 2 deletions apps/common/util/split_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ def filter_title_special_characters(paragraph: Dict):
default_split_pattern = {
'md': [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'), re.compile("(?<!#)### (?!#).*"),
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
re.compile("(?<!#)###### (?!#).*")],
re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")],
'default': [re.compile("(?<!\n)\n\n+")]
}

Expand All @@ -374,7 +374,7 @@ def get_split_model(filename: str, with_filter: bool = False, limit: int = 4096)
pattern_list = default_split_pattern.get('md')
return SplitModel(pattern_list, with_filter=with_filter, limit=limit)

pattern_list = default_split_pattern.get('default')
pattern_list = default_split_pattern.get('md')
return SplitModel(pattern_list, with_filter=with_filter, limit=limit)


Expand Down
3 changes: 2 additions & 1 deletion apps/dataset/serializers/document_serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from dataset.serializers.common_serializers import BatchSerializer, MetaSerializer
from dataset.serializers.paragraph_serializers import ParagraphSerializers, ParagraphInstanceSerializer
from smartdoc.conf import PROJECT_DIR
import chardet


class DocumentEditInstanceSerializer(ApiMixin, serializers.Serializer):
Expand Down Expand Up @@ -599,7 +600,7 @@ def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int):
else:
split_model = get_split_model(file.name, with_filter=with_filter, limit=limit)
try:
content = data.decode('utf-8')
content = data.decode(chardet.detect(data)['encoding'])
except BaseException as e:
return {'name': file.name,
'content': []}
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ html2text = "^2024.2.26"
langchain-openai = "^0.0.8"
django-ipware = "^6.0.4"
django-apscheduler = "^0.6.2"
chardet2 = "^2.0.3"

[build-system]
requires = ["poetry-core"]
Expand Down

0 comments on commit d732a46

Please sign in to comment.