Skip to content

Commit

Permalink
fix: upgrade pdftext
Browse files: browse the repository at this point in the history
  • Loading branch information
lsorber committed Oct 13, 2024
1 parent ed215cc commit 8ff7fca
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 11 deletions.
10 changes: 5 additions & 5 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ version_provider = "poetry"
# Python:
python = ">=3.10,<4.0"
# Markdown conversion:
-pdftext = ">=0.3.10,!=0.3.11"
+pdftext = ">=0.3.13"
pypandoc-binary = { version = ">=1.13", optional = true }
scikit-learn = ">=1.4.2"
# Markdown formatting:
Expand Down
6 changes: 1 addition & 5 deletions src/raglite/_markdown.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Convert any document to Markdown."""

import re
-import warnings
from copy import deepcopy
from pathlib import Path
from typing import Any
Expand All @@ -10,7 +9,6 @@
import numpy as np
from pdftext.extraction import dictionary_output
from sklearn.cluster import KMeans
-from sklearn.exceptions import InconsistentVersionWarning


def parsed_pdf_to_markdown(pages: list[dict[str, Any]]) -> list[str]: # noqa: C901, PLR0915
Expand Down Expand Up @@ -202,9 +200,7 @@ def document_to_markdown(doc_path: Path) -> str:
# Convert the file's content to GitHub Flavored Markdown.
if doc_path.suffix == ".pdf":
# Parse the PDF with pdftext and convert it to Markdown.
-with warnings.catch_warnings(): # Ignore https://github.com/VikParuchuri/pdftext/issues/5.
-warnings.simplefilter("ignore", InconsistentVersionWarning)
-pages = dictionary_output(doc_path, sort=True, keep_chars=False)
+pages = dictionary_output(doc_path, sort=True, keep_chars=False)
doc = "\n\n".join(parsed_pdf_to_markdown(pages))
else:
# Use pandoc for everything else.
Expand Down

0 comments on commit 8ff7fca

Please sign in to comment.