From 3e66b3180f567dc84efd80a4b5744b030c262e6e Mon Sep 17 00:00:00 2001
From: Lennart Schmidt <150007074+LennartSchmidtKern@users.noreply.github.com>
Date: Fri, 18 Oct 2024 10:31:51 +0200
Subject: [PATCH] Cache Files for ETL Processing (#72)

* remove markdown tokenizer

* update model and s3 submodules

* submodule merge

---------

Co-authored-by: JWittmeyer <jens.wittmeyer@kern.ai>
---
 app.py                              |  17 +---
 controller/markdown_file_content.py | 128 ----------------------------
 submodules/model                    |   2 +-
 submodules/s3                       |   2 +-
 4 files changed, 3 insertions(+), 146 deletions(-)
 delete mode 100644 controller/markdown_file_content.py

diff --git a/app.py b/app.py
index eb3c0d5..94b5d49 100644
--- a/app.py
+++ b/app.py
@@ -1,7 +1,7 @@
 from fastapi import FastAPI, responses, status
 
 
-from controller import task_manager, tokenization_manager, markdown_file_content
+from controller import task_manager, tokenization_manager
 from misc import util
 from handler import config_handler, tokenizer_handler
 from request_classes import (
@@ -101,21 +101,6 @@ def save_tokenizer_as_pickle(request: SaveTokenizer) -> responses.PlainTextRespo
     return responses.PlainTextResponse(status_code=status.HTTP_200_OK)
 
 
-@app.put("/cognition/rework-content/{org_id}/{file_id}/{step}")
-def rework_markdown_file_content(
-    org_id: str, file_id: str, step: str
-) -> responses.Response:
-    try:
-        r = markdown_file_content.rework_markdown_file_content(
-            org_id, file_id, step.upper()
-        )
-    except Exception:
-        pass
-    if not r:
-        return responses.Response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
-    return responses.Response(status_code=status.HTTP_200_OK)
-
-
 @app.put("/config_changed")
 def config_changed() -> responses.PlainTextResponse:
     config_handler.refresh_config()
diff --git a/controller/markdown_file_content.py b/controller/markdown_file_content.py
deleted file mode 100644
index 0521a80..0000000
--- a/controller/markdown_file_content.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import traceback
-
-from submodules.model.cognition_objects import markdown_file, markdown_dataset
-from handler.tokenizer_handler import get_tokenizer
-from submodules.model.business_objects import general
-from submodules.model.enums import CognitionMarkdownFileState
-from spacy.language import Language
-
-SEGMENT_DIVIDER = "\n\n"
-
-def rework_markdown_file_content(org_id: str, file_id: str, step: str) -> bool:
-    if step == "SEGMENT_SENTENCES":
-        return __rework_segment_sentences(org_id, file_id)
-    return True
-
-
-def __rework_segment_sentences(org_id: str, file_id: str) -> bool:
-    markdown_file_item = markdown_file.get(org_id, file_id)
-    if markdown_file_item is None:
-        return False
-
-    dataset_item = markdown_dataset.get(org_id, markdown_file_item.dataset_id)
-    if dataset_item is None:
-        return False
-    content = markdown_file_item.content
-    try:
-        nlp = get_tokenizer(dataset_item.tokenizer)
-        max_length = __lookup_final_max_length(nlp)
-        # Split the content into smaller chunks if it's too large
-        if __utf8len(content) > max_length:
-            chunks = __chunk_text_on_bytes(content,max_length - 100)
-            processed_chunks = []
-
-            for chunk in chunks:
-                doc = nlp(chunk)
-                processed_chunk = SEGMENT_DIVIDER.join(
-                    [sentence for sentence in __segment_sentences(doc)]
-                )
-                processed_chunks.append(processed_chunk)
-
-            content = SEGMENT_DIVIDER.join(processed_chunks)
-        else:
-            doc = nlp(content)
-            content = SEGMENT_DIVIDER.join([sentence for sentence in __segment_sentences(doc)])
-        markdown_file_item.content = content
-        general.commit()
-        return True
-    except Exception:
-        full_traceback = traceback.format_exc()
-        print(full_traceback, flush=True)
-        markdown_file.update(
-            org_id=org_id,
-            markdown_file_id=file_id,
-            state=CognitionMarkdownFileState.FAILED.value,
-            error=full_traceback,  # Store the full stack trace instead of just the error message
-        )
-        return False
-
-
-# custom segmentation rule to build very likely sentences from chunk of text
-def __segment_sentences(doc: Language):
-    sentences = []
-    current_sentence = None
-    for sent in doc.sents:
-        if len(sent.text.strip()) == 0:
-            continue
-        last_char = sent.text.strip()[-1]
-
-        if current_sentence is None:
-            current_sentence = sent.text
-        else:
-            current_sentence += " " + sent.text
-
-        if last_char in [".", ";", "?", "!"]:
-            sentences.append(current_sentence)
-            current_sentence = None
-
-    if current_sentence is not None:
-        sentences.append(current_sentence)
-    return sentences
-
-
-def __chunk_text(text: str, chunk_size: int = 1_000_000):
-    return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
-
-# splits not after x bytes but ensures that max x bytes are used without destroying the characters 
-def __chunk_text_on_bytes(text: str, max_chunk_size: int = 1_000_000):
-    factor = len(text) / __utf8len(text)
-    increase_by = int(max(min(max_chunk_size*.1,10),1))
-    initial_size_guess = int(max(max_chunk_size * factor - 10,1))
-    final_list = []
-    remaining = text
-    while len(remaining):
-        part = remaining[:initial_size_guess]
-        if __utf8len(part) > max_chunk_size:
-            initial_size_guess = max(initial_size_guess - min(max_chunk_size *.001,10),1) 
-            continue
-        cut_after = initial_size_guess
-        while __utf8len(part) < max_chunk_size and part != remaining:
-            cut_after = min(len(remaining), cut_after+increase_by)
-            part = remaining[:cut_after]
-            
-        if __utf8len(part) > max_chunk_size:
-            cut_after-=increase_by
-        final_list.append(remaining[:cut_after])
-        remaining = remaining[cut_after:]
-
-    return final_list
-
-
-
-MAX_LENGTH_OVERWRITE = {
-    # japanese has a max length restriction by sudachi so the spacy max_length only applies if < sudachi
-    "ja":49149
-}
-
-def __lookup_final_max_length(nlp:Language) -> int:
-    overwrite = MAX_LENGTH_OVERWRITE.get(nlp.meta["lang"])
-    
-    if overwrite and overwrite < nlp.max_length:
-        return overwrite
-    return nlp.max_length
-
-
-# note that "H" uses up 1 byte while "私" takes 3 bytes
-# len(s) would still give 1 but this runs into issues for reserved/allocated spacy memory
-def __utf8len(s:str):
-    return len(s.encode('utf-8'))
diff --git a/submodules/model b/submodules/model
index 3af110d..49ea175 160000
--- a/submodules/model
+++ b/submodules/model
@@ -1 +1 @@
-Subproject commit 3af110d28d90152c62b40d6676bec40ae50fc132
+Subproject commit 49ea175bb3cd429ddc911996e0fe00e6763beefe
diff --git a/submodules/s3 b/submodules/s3
index 1ad3ff5..3299fb4 160000
--- a/submodules/s3
+++ b/submodules/s3
@@ -1 +1 @@
-Subproject commit 1ad3ff584860090f4e215986f334b5e63759a55d
+Subproject commit 3299fb46876e3b4cc29c0a5cef004005a87f0f19