Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Replace pdfminer3k with pypdf #994

Merged
merged 2 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
recursive-include orangecontrib/text/datasets *.tab *.txt *.metadata
recursive-include orangecontrib/text/models *.ftz
recursive-include orangecontrib/text/sentiment *.txt
recursive-include orangecontrib/text/tests *.txt *.json *.pkl *.udpipe
recursive-include orangecontrib/text/tests *.txt *.json
recursive-include orangecontrib/text/tests/data *
recursive-include orangecontrib/text/tutorials *.ows
recursive-include orangecontrib/text/widgets/icons *.svg *.png *.ai
recursive-include orangecontrib/text/widgets/resources *.js *.css *.html
recursive-include orangecontrib/text/widgets/tests/data *.docx *.odt *.pdf *.txt *.conllu *.csv *.tab *.tab.metadata
recursive-include orangecontrib/text/widgets/tests/data *.txt *.conllu *.csv *.tab *.tab.metadata
include orangecontrib/text/widgets/tests/bow-test
recursive-include scripts *.sh *.py

Expand Down
30 changes: 4 additions & 26 deletions orangecontrib/text/import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@
from Orange.data.util import get_unique_names
from Orange.misc.utils.embedder_utils import get_proxies
from Orange.util import Registry, dummy_callback
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfparser import PDFDocument, PDFParser
from pypdf import PdfReader as PyPDFReader
from requests.exceptions import ConnectionError

from orangecontrib.text.corpus import Corpus
Expand Down Expand Up @@ -130,28 +127,9 @@ class PdfReader(Reader):
ext = [".pdf"]

def read_file(self):
with open(self.path, 'rb') as f:
parser = PDFParser(f)
doc = PDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize('')
rsrcmgr = PDFResourceManager()
laparams = LAParams()
laparams.char_margin = 0.1
laparams.word_margin = 1.0
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
extracted_text = []

for page in doc.get_pages():
interpreter.process_page(page)
layout = device.get_result()
for lt_obj in layout:
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj,
LTTextLine):
extracted_text.append(lt_obj.get_text())
self.content = ' '.join(extracted_text).replace('\x00', '')
reader = PyPDFReader(self.path)
texts = [page.extract_text() for page in reader.pages]
self.content = " ".join(texts)


class XmlReader(Reader):
Expand Down
Binary file not shown.
Binary file not shown.
32 changes: 32 additions & 0 deletions orangecontrib/text/tests/test_import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
TxtReader,
TextData,
XmlReader,
PdfReader,
)


Expand Down Expand Up @@ -296,5 +297,36 @@ def test_error(self):
os.remove(fp.name)


# Directory with the document fixtures used by the reader tests below; it is
# resolved relative to this test module and holds sub-folders such as "good"
# and "corrupted".
DATA_PATH = os.path.join(os.path.dirname(__file__), "data", "documents")


class TestPdfReader(unittest.TestCase):
    """Exercise ``PdfReader`` on the PDF fixtures stored under ``DATA_PATH``."""

    def test_file(self):
        """Well-formed PDFs yield the expected text and document metadata."""
        # First fixture: only the beginning of the extracted text is checked.
        minimal_path = os.path.join(DATA_PATH, "good", "minimal-document.pdf")
        document = PdfReader(minimal_path).read()[0]
        expected_start = (
            "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam "
            "nonumy eirmod"
        )
        self.assertTrue(document.content.startswith(expected_start))

        # Second fixture: the full content and every metadata field is checked.
        sample_path = os.path.join(DATA_PATH, "good", "sample_pdf.pdf")
        document = PdfReader(sample_path).read()[0]
        self.assertEqual("This is a test pdf file", document.content)
        self.assertEqual("sample_pdf", document.name)
        self.assertEqual(sample_path, document.path)
        self.assertListEqual([".pdf"], document.ext)
        self.assertEqual("good", document.category)

    def test_error(self):
        """A corrupted PDF yields ``None`` first and the file name second."""
        corrupted_path = os.path.join(
            DATA_PATH, "corrupted", "sample_pdf_corrupted.pdf"
        )
        outcome = PdfReader(corrupted_path).read()
        self.assertIsNone(outcome[0])
        self.assertEqual("sample_pdf_corrupted.pdf", outcome[1])


if __name__ == "__main__":
unittest.main()
30 changes: 20 additions & 10 deletions orangecontrib/text/widgets/tests/test_owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,49 @@
from orangecontrib.text.widgets.owimportdocuments import OWImportDocuments


# Absolute path of the shared document fixtures: two directory levels above
# this module, then down into tests/data/documents.
DATA_PATH = os.path.abspath(
    os.path.join(
        os.path.dirname(__file__), os.pardir, os.pardir, "tests", "data", "documents"
    )
)


class TestOWImportDocuments(WidgetTest):
def setUp(self) -> None:
self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
path = os.path.join(os.path.dirname(__file__), "data/documents")
path = os.path.join(os.path.dirname(__file__), DATA_PATH)
self.widget.setCurrentPath(path)
self.widget.reload()
self.wait_until_finished()

def test_current_path(self):
path = os.path.join(os.path.dirname(__file__), "data/documents")
path = os.path.join(os.path.dirname(__file__), DATA_PATH)
self.assertEqual(path, self.widget.currentPath)

def test_no_skipped(self):
path = os.path.join(os.path.dirname(__file__), "data/documents", "good")
path = os.path.join(DATA_PATH, "good")
self.widget.setCurrentPath(path)
self.widget.reload()
self.wait_until_finished()
self.assertIsNone(self.get_output(self.widget.Outputs.skipped_documents))

def test_output(self):
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(4, len(output))
self.assertEqual(5, len(output))
self.assertEqual(3, len(output.domain.metas))
names = output.get_column("name")
self.assertListEqual(
# ž in sample_txt_ž must be the single unicode char 0x17E, not the decomposed
# sequence 0x7A + 0x30C that is used in the file name
["sample_docx", "sample_odt", "sample_pdf", "sample_txt_ž"],
[
"minimal-document",
"sample_docx",
"sample_odt",
"sample_pdf",
"sample_txt_ž",
],
sorted(names.tolist()),
)
texts = output.get_column("content")
# skip first document - it contains different text
texts = output.get_column("content")[1:]
self.assertListEqual(
# ž in sample_txt_ž must be the single unicode char 0x17E, not the decomposed
# sequence 0x7A + 0x30C that is used in the file name
Expand Down Expand Up @@ -99,9 +111,7 @@ def test_conllu_cb(self):
self.assertEqual(len(corpus.domain.metas), 4)

def test_info_box(self):
self.assertEqual(
"4 documents, 1 skipped", self.widget.info_area.text()
)
self.assertEqual("5 documents, 1 skipped", self.widget.info_area.text())

# empty widget
self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
Expand All @@ -124,7 +134,7 @@ def tests_context(self):
# change default to something else to see if language is changed
self.widget.language = "Slovenian"

path = os.path.join(os.path.dirname(__file__), "data/documents", "good")
path = os.path.join(DATA_PATH, "good")
self.widget.setCurrentPath(path)
self.widget.reload()
self.wait_until_finished()
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ orange-widget-base >=4.20.0
orange-canvas-core
owlready2
pandas
pdfminer3k>=1.3.1
pypdf
pyqtgraph
pyyaml
requests
Expand Down
Loading