data cleaning required before pdf ingestion

openchatai · Dec 6, 2023 · a64974b · a64974b
1 parent c457664
commit a64974b
Showing 1 changed file with 2 additions and 0 deletions.
diff --git a/llm-server/workers/tasks/process_pdfs.py b/llm-server/workers/tasks/process_pdfs.py
@@ -18,6 +18,8 @@ def process_pdf(file_name: str, bot_id: str):
         insert_pdf_data_source(chatbot_id=bot_id, file_name=file_name, status="PENDING")
         loader = PyPDFium2Loader(get_file_path(file_name))
         raw_docs = loader.load()
+
+        # clean the data received from the pdf document before passing it to the text splitter
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000, chunk_overlap=200, length_function=len
         )