diff --git a/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb b/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb index 2cf5d38d3..6c43585d7 100644 --- a/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb +++ b/recipes/quickstart/NotebookLlama/Step-1 PDF-Pre-Processing-Logic.ipynb @@ -464,49 +464,33 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "id": "a0183c47-339d-4041-ae83-77fc34931075", "metadata": {}, "outputs": [], "source": [ - "INPUT_FILE = \"./resources/extracted_text.txt\" # Replace with your file path\n", + "# Reading the contents of the provided file\n", + "INPUT_FILE = \"extracted_text.txt\"\n", + "\n", + "\n", "CHUNK_SIZE = 1000 # Adjust chunk size if needed\n", "\n", + "# Read the file\n", + "with open(INPUT_FILE, 'r', encoding='utf-8') as file:\n", + " text = file.read()\n", + "\n", "chunks = create_word_bounded_chunks(text, CHUNK_SIZE)\n", - "num_chunks = len(chunks)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "bb36814f-9310-4734-bf54-e16a5032339e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "101" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ + "num_chunks = len(chunks)\n", "num_chunks" ] }, { "cell_type": "code", - "execution_count": 66, + "execution_count": null, "id": "447188d3-ebf0-42d5-940e-4d7e0d9dbf32", "metadata": {}, "outputs": [], "source": [ - "# Read the file\n", - "with open(INPUT_FILE, 'r', encoding='utf-8') as file:\n", - " text = file.read()\n", "\n", "# Calculate number of chunks\n", "num_chunks = (len(text) + CHUNK_SIZE - 1) // CHUNK_SIZE\n", @@ -518,7 +502,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": null, "id": "7917dfdd-b3af-44fc-a8c0-2760ace9363e", "metadata": {}, "outputs": [ @@ -2616,15 +2600,18 @@ } ], "source": [ + "# Initialize processed_text before using it\n", + "processed_text = \"\"\n", + "\n", "with open(output_file, 'w', encoding='utf-8') as out_file:\n", " for chunk_num, chunk in enumerate(tqdm(chunks, desc=\"Processing chunks\")):\n", " # Process chunk and append to complete text\n", " processed_chunk = process_chunk(chunk, chunk_num)\n", " processed_text += processed_chunk + \"\\n\"\n", - " \n", + "\n", " # Write chunk immediately to file\n", " out_file.write(processed_chunk + \"\\n\")\n", - " out_file.flush()" + " out_file.flush()\n" ] }, {