From 8e2ec3bcafb3b3fd403e897c34cd31fc5da737ca Mon Sep 17 00:00:00 2001 From: Laurent Sorber Date: Sun, 13 Oct 2024 18:49:21 +0200 Subject: [PATCH] fix: upgrade pdftext (#30) --- poetry.lock | 10 +++++----- pyproject.toml | 2 +- src/raglite/_markdown.py | 6 +----- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/poetry.lock b/poetry.lock index 61c9212..8f52e6d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3374,20 +3374,20 @@ files = [ [[package]] name = "pdftext" -version = "0.3.10" +version = "0.3.13" description = "Extract structured text from pdfs quickly" optional = false python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9" files = [ - {file = "pdftext-0.3.10-py3-none-any.whl", hash = "sha256:99bd900d0d0692df06719c07ce10a859750ade3eb7f10c543f637118417497f9"}, - {file = "pdftext-0.3.10.tar.gz", hash = "sha256:90de726e818fb5683a0616cabb1a75a32a7224e873c3058006c93da6e440c66c"}, + {file = "pdftext-0.3.13-py3-none-any.whl", hash = "sha256:ae8f6876cdbbc1fe611527bb362cd3d584b4c8ec9370215560f2a01be4343bbc"}, + {file = "pdftext-0.3.13.tar.gz", hash = "sha256:a37ceb759ac0da34c48f85ab5d43d0b128ad9526f949e98b96568495c7be4187"}, ] [package.dependencies] +onnxruntime = ">=1.19.2,<2.0.0" pydantic = ">=2.7.1,<3.0.0" pydantic-settings = ">=2.2.1,<3.0.0" pypdfium2 = ">=4.29.0,<5.0.0" -scikit-learn = ">=1.4.2,<2.0.0" [[package]] name = "pdoc" @@ -6143,4 +6143,4 @@ ragas = ["ragas"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "1ae99fe17eed206b710b17380ae77c2f6d87d66f34e59e17d865c4e705b2d0ae" +content-hash = "43e46c87ead1e376f088fb9560919ffdea07af12cc7cb93303ea5e93b30b22fd" diff --git a/pyproject.toml b/pyproject.toml index 3eb26e7..ad252bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ version_provider = "poetry" # Python: python = ">=3.10,<4.0" # Markdown conversion: -pdftext = ">=0.3.10,!=0.3.11" +pdftext = ">=0.3.13" pypandoc-binary = { version = ">=1.13", optional = true } scikit-learn = ">=1.4.2" # Markdown formatting: diff --git a/src/raglite/_markdown.py b/src/raglite/_markdown.py index 22cac71..389bcba 100644 --- a/src/raglite/_markdown.py +++ b/src/raglite/_markdown.py @@ -1,7 +1,6 @@ """Convert any document to Markdown.""" import re -import warnings from copy import deepcopy from pathlib import Path from typing import Any @@ -10,7 +9,6 @@ import numpy as np from pdftext.extraction import dictionary_output from sklearn.cluster import KMeans -from sklearn.exceptions import InconsistentVersionWarning def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]: # noqa: C901, PLR0915 @@ -202,9 +200,7 @@ def document_to_markdown(doc_path: Path) -> str: # Convert the file's content to GitHub Flavored Markdown. if doc_path.suffix == ".pdf": # Parse the PDF with pdftext and convert it to Markdown. - with warnings.catch_warnings(): # Ignore https://github.com/VikParuchuri/pdftext/issues/5. - warnings.simplefilter("ignore", InconsistentVersionWarning) - pages = dictionary_output(doc_path, sort=True, keep_chars=False) + pages = dictionary_output(doc_path, sort=True, keep_chars=False) doc = "\n\n".join(parsed_pdf_to_markdown(pages)) else: # Use pandoc for everything else.