From 3e66b3180f567dc84efd80a4b5744b030c262e6e Mon Sep 17 00:00:00 2001 From: Lennart Schmidt <150007074+LennartSchmidtKern@users.noreply.github.com> Date: Fri, 18 Oct 2024 10:31:51 +0200 Subject: [PATCH] Cache Files for ETL Processing (#72) * remove markdown tokenizer * submodules * submodule merge --------- Co-authored-by: JWittmeyer --- app.py | 17 +--- controller/markdown_file_content.py | 128 ---------------------------- submodules/model | 2 +- submodules/s3 | 2 +- 4 files changed, 3 insertions(+), 146 deletions(-) delete mode 100644 controller/markdown_file_content.py diff --git a/app.py b/app.py index eb3c0d5..94b5d49 100644 --- a/app.py +++ b/app.py @@ -1,7 +1,7 @@ from fastapi import FastAPI, responses, status -from controller import task_manager, tokenization_manager, markdown_file_content +from controller import task_manager, tokenization_manager from misc import util from handler import config_handler, tokenizer_handler from request_classes import ( @@ -101,21 +101,6 @@ def save_tokenizer_as_pickle(request: SaveTokenizer) -> responses.PlainTextRespo return responses.PlainTextResponse(status_code=status.HTTP_200_OK) -@app.put("/cognition/rework-content/{org_id}/{file_id}/{step}") -def rework_markdown_file_content( - org_id: str, file_id: str, step: str -) -> responses.Response: - try: - r = markdown_file_content.rework_markdown_file_content( - org_id, file_id, step.upper() - ) - except Exception: - pass - if not r: - return responses.Response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) - return responses.Response(status_code=status.HTTP_200_OK) - - @app.put("/config_changed") def config_changed() -> responses.PlainTextResponse: config_handler.refresh_config() diff --git a/controller/markdown_file_content.py b/controller/markdown_file_content.py deleted file mode 100644 index 0521a80..0000000 --- a/controller/markdown_file_content.py +++ /dev/null @@ -1,128 +0,0 @@ -import traceback - -from submodules.model.cognition_objects import markdown_file, markdown_dataset -from handler.tokenizer_handler import get_tokenizer -from submodules.model.business_objects import general -from submodules.model.enums import CognitionMarkdownFileState -from spacy.language import Language - -SEGMENT_DIVIDER = "\n\n" - -def rework_markdown_file_content(org_id: str, file_id: str, step: str) -> bool: - if step == "SEGMENT_SENTENCES": - return __rework_segment_sentences(org_id, file_id) - return True - - -def __rework_segment_sentences(org_id: str, file_id: str) -> bool: - markdown_file_item = markdown_file.get(org_id, file_id) - if markdown_file_item is None: - return False - - dataset_item = markdown_dataset.get(org_id, markdown_file_item.dataset_id) - if dataset_item is None: - return False - content = markdown_file_item.content - try: - nlp = get_tokenizer(dataset_item.tokenizer) - max_length = __lookup_final_max_length(nlp) - # Split the content into smaller chunks if it's too large - if __utf8len(content) > max_length: - chunks = __chunk_text_on_bytes(content,max_length - 100) - processed_chunks = [] - - for chunk in chunks: - doc = nlp(chunk) - processed_chunk = SEGMENT_DIVIDER.join( - [sentence for sentence in __segment_sentences(doc)] - ) - processed_chunks.append(processed_chunk) - - content = SEGMENT_DIVIDER.join(processed_chunks) - else: - doc = nlp(content) - content = SEGMENT_DIVIDER.join([sentence for sentence in __segment_sentences(doc)]) - markdown_file_item.content = content - general.commit() - return True - except Exception: - full_traceback = traceback.format_exc() - print(full_traceback, flush=True) - markdown_file.update( - org_id=org_id, - markdown_file_id=file_id, - state=CognitionMarkdownFileState.FAILED.value, - error=full_traceback, # Store the full stack trace instead of just the error message - ) - return False - - -# custom segmentation rule to build very likely sentences from chunk of text -def __segment_sentences(doc: Language): - sentences = [] - current_sentence = None - for sent in doc.sents: - if len(sent.text.strip()) == 0: - continue - last_char = sent.text.strip()[-1] - - if current_sentence is None: - current_sentence = sent.text - else: - current_sentence += " " + sent.text - - if last_char in [".", ";", "?", "!"]: - sentences.append(current_sentence) - current_sentence = None - - if current_sentence is not None: - sentences.append(current_sentence) - return sentences - - -def __chunk_text(text: str, chunk_size: int = 1_000_000): - return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)] - -# splits not after x bytes but ensures that max x bytes are used without destroying the characters -def __chunk_text_on_bytes(text: str, max_chunk_size: int = 1_000_000): - factor = len(text) / __utf8len(text) - increase_by = int(max(min(max_chunk_size*.1,10),1)) - initial_size_guess = int(max(max_chunk_size * factor - 10,1)) - final_list = [] - remaining = text - while len(remaining): - part = remaining[:initial_size_guess] - if __utf8len(part) > max_chunk_size: - initial_size_guess = max(initial_size_guess - min(max_chunk_size *.001,10),1) - continue - cut_after = initial_size_guess - while __utf8len(part) < max_chunk_size and part != remaining: - cut_after = min(len(remaining), cut_after+increase_by) - part = remaining[:cut_after] - - if __utf8len(part) > max_chunk_size: - cut_after-=increase_by - final_list.append(remaining[:cut_after]) - remaining = remaining[cut_after:] - - return final_list - - - -MAX_LENGTH_OVERWRITE = { - # japanese has a max length restriction by sudachi so the spacy max_length only applies if < sudachi - "ja":49149 -} - -def __lookup_final_max_length(nlp:Language) -> int: - overwrite = MAX_LENGTH_OVERWRITE.get(nlp.meta["lang"]) - - if overwrite and overwrite < nlp.max_length: - return overwrite - return nlp.max_length - - -# note that "H" uses up 1 byte while "私" takes 3 bytes -# len(s) would still give 1 but this runs into issues for reserved/allocated spacy memory -def __utf8len(s:str): - return len(s.encode('utf-8')) diff --git a/submodules/model b/submodules/model index 3af110d..49ea175 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 3af110d28d90152c62b40d6676bec40ae50fc132 +Subproject commit 49ea175bb3cd429ddc911996e0fe00e6763beefe diff --git a/submodules/s3 b/submodules/s3 index 1ad3ff5..3299fb4 160000 --- a/submodules/s3 +++ b/submodules/s3 @@ -1 +1 @@ -Subproject commit 1ad3ff584860090f4e215986f334b5e63759a55d +Subproject commit 3299fb46876e3b4cc29c0a5cef004005a87f0f19