Skip to content

Commit

Permalink
Merge pull request #994 from PrimozGodec/replace-pypdf
Browse files Browse the repository at this point in the history
[FIX] Replace pdfminer3k with pypdf
  • Loading branch information
ajdapretnar authored Nov 10, 2023
2 parents 314c562 + 1cb5734 commit 139eedf
Show file tree
Hide file tree
Showing 11 changed files with 60 additions and 39 deletions.
5 changes: 3 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
recursive-include orangecontrib/text/datasets *.tab *.txt *.metadata
recursive-include orangecontrib/text/models *.ftz
recursive-include orangecontrib/text/sentiment *.txt
recursive-include orangecontrib/text/tests *.txt *.json *.pkl *.udpipe
recursive-include orangecontrib/text/tests *.txt *.json
recursive-include orangecontrib/text/tests/data *
recursive-include orangecontrib/text/tutorials *.ows
recursive-include orangecontrib/text/widgets/icons *.svg *.png *.ai
recursive-include orangecontrib/text/widgets/resources *.js *.css *.html
recursive-include orangecontrib/text/widgets/tests/data *.docx *.odt *.pdf *.txt *.conllu *.csv *.tab *.tab.metadata
recursive-include orangecontrib/text/widgets/tests/data *.txt *.conllu *.csv *.tab *.tab.metadata
include orangecontrib/text/widgets/tests/bow-test
recursive-include scripts *.sh *.py

Expand Down
30 changes: 4 additions & 26 deletions orangecontrib/text/import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@
from Orange.data.util import get_unique_names
from Orange.misc.utils.embedder_utils import get_proxies
from Orange.util import Registry, dummy_callback
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfparser import PDFDocument, PDFParser
from pypdf import PdfReader as PyPDFReader
from requests.exceptions import ConnectionError

from orangecontrib.text.corpus import Corpus
Expand Down Expand Up @@ -130,28 +127,9 @@ class PdfReader(Reader):
ext = [".pdf"]

def read_file(self):
# Reads the PDF at self.path and stores the extracted text in self.content.
# NOTE: this is a rendered diff without +/- markers. Going by the hunk summary
# ("4 additions & 26 deletions") and the commit title, the pdfminer-based block
# below (from `with open` through the `.replace('\x00', '')` line) appears to be
# the removed implementation; the last three lines are the pypdf replacement.
with open(self.path, 'rb') as f:
parser = PDFParser(f)
doc = PDFDocument()
parser.set_document(doc)
doc.set_parser(parser)
doc.initialize('')
rsrcmgr = PDFResourceManager()
laparams = LAParams()
laparams.char_margin = 0.1
laparams.word_margin = 1.0
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
extracted_text = []

for page in doc.get_pages():
interpreter.process_page(page)
layout = device.get_result()
for lt_obj in layout:
# Only text boxes and text lines contribute to the extracted text.
if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj,
LTTextLine):
extracted_text.append(lt_obj.get_text())
# Text fragments are joined with single spaces and NUL characters are dropped.
self.content = ' '.join(extracted_text).replace('\x00', '')
# pypdf version: one extract_text() call per page, pages joined with a single space.
# NOTE(review): unlike the pdfminer version above, this does not strip '\x00'
# from the result - confirm that is no longer needed.
reader = PyPDFReader(self.path)
texts = [page.extract_text() for page in reader.pages]
self.content = " ".join(texts)


class XmlReader(Reader):
Expand Down
Binary file not shown.
Binary file not shown.
32 changes: 32 additions & 0 deletions orangecontrib/text/tests/test_import_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
TxtReader,
TextData,
XmlReader,
PdfReader,
)


Expand Down Expand Up @@ -296,5 +297,36 @@ def test_error(self):
os.remove(fp.name)


DATA_PATH = os.path.join(os.path.dirname(__file__), "data", "documents")


class TestPdfReader(unittest.TestCase):
    """Tests for ``PdfReader`` against the PDF fixtures under ``DATA_PATH``."""

    def test_file(self):
        """Readable PDFs yield their text content and file metadata."""
        reader = PdfReader(os.path.join(DATA_PATH, "good", "minimal-document.pdf"))
        res = reader.read()[0]
        exp = (
            "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam "
            "nonumy eirmod"
        )
        # Only the beginning of the document is pinned. Comparing the prefix
        # with assertEqual (instead of assertTrue(startswith)) makes a failure
        # show the text that was actually extracted.
        self.assertEqual(exp, res.content[: len(exp)])

        path = os.path.join(DATA_PATH, "good", "sample_pdf.pdf")
        res = PdfReader(path).read()[0]
        self.assertEqual("This is a test pdf file", res.content)
        self.assertEqual("sample_pdf", res.name)
        self.assertEqual(path, res.path)
        self.assertListEqual([".pdf"], res.ext)
        # The category is expected to be the name of the containing folder.
        self.assertEqual("good", res.category)

    def test_error(self):
        """A corrupted PDF yields no document, only the name of the failed file."""
        reader = PdfReader(
            os.path.join(DATA_PATH, "corrupted", "sample_pdf_corrupted.pdf")
        )
        res = reader.read()
        # First item: the parsed document (None on failure);
        # second item: the file name reported for the unreadable file.
        self.assertIsNone(res[0])
        self.assertEqual("sample_pdf_corrupted.pdf", res[1])


if __name__ == "__main__":
unittest.main()
30 changes: 20 additions & 10 deletions orangecontrib/text/widgets/tests/test_owimportdocuments.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,49 @@
from orangecontrib.text.widgets.owimportdocuments import OWImportDocuments


DATA_PATH = os.path.abspath(
os.path.join(os.path.dirname(__file__), "..", "..", "tests", "data", "documents")
)


class TestOWImportDocuments(WidgetTest):
def setUp(self) -> None:
# Creates the widget, sets its current path to the test-documents folder,
# reloads it and waits until loading has finished.
self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
# NOTE: rendered diff without +/- markers - the first `path = ...` line appears
# to be the removed version and the second one its replacement.
path = os.path.join(os.path.dirname(__file__), "data/documents")
# NOTE(review): DATA_PATH is built with os.path.abspath, so os.path.join
# discards the dirname component and the result is simply DATA_PATH.
path = os.path.join(os.path.dirname(__file__), DATA_PATH)
self.widget.setCurrentPath(path)
self.widget.reload()
self.wait_until_finished()

def test_current_path(self):
# Checks that the widget's currentPath equals the path that was set in setUp.
# NOTE: rendered diff without +/- markers - the first `path = ...` line appears
# to be the removed version and the second one its replacement.
path = os.path.join(os.path.dirname(__file__), "data/documents")
# NOTE(review): DATA_PATH is absolute (os.path.abspath), so this join
# evaluates to DATA_PATH itself.
path = os.path.join(os.path.dirname(__file__), DATA_PATH)
self.assertEqual(path, self.widget.currentPath)

def test_no_skipped(self):
# Loads only the "good" sub-folder and checks that the skipped_documents
# output is None.
# NOTE: rendered diff without +/- markers - the first `path = ...` line appears
# to be the removed version and the second one its replacement.
path = os.path.join(os.path.dirname(__file__), "data/documents", "good")
path = os.path.join(DATA_PATH, "good")
self.widget.setCurrentPath(path)
self.widget.reload()
self.wait_until_finished()
self.assertIsNone(self.get_output(self.widget.Outputs.skipped_documents))

def test_output(self):
output = self.get_output(self.widget.Outputs.data)
self.assertEqual(4, len(output))
self.assertEqual(5, len(output))
self.assertEqual(3, len(output.domain.metas))
names = output.get_column("name")
self.assertListEqual(
# ž in sample_text_ž must be unicode char 0x17E not decomposed
# 0x7A + 0x30C as it is in file name
["sample_docx", "sample_odt", "sample_pdf", "sample_txt_ž"],
[
"minimal-document",
"sample_docx",
"sample_odt",
"sample_pdf",
"sample_txt_ž",
],
sorted(names.tolist()),
)
texts = output.get_column("content")
# skip first document - it contains different text
texts = output.get_column("content")[1:]
self.assertListEqual(
# ž in sample_text_ž must be unicode char 0x17E not decomposed
# 0x7A + 0x30C as it is in file name
Expand Down Expand Up @@ -99,9 +111,7 @@ def test_conllu_cb(self):
self.assertEqual(len(corpus.domain.metas), 4)

def test_info_box(self):
self.assertEqual(
"4 documents, 1 skipped", self.widget.info_area.text()
)
self.assertEqual("5 documents, 1 skipped", self.widget.info_area.text())

# empty widget
self.widget: OWImportDocuments = self.create_widget(OWImportDocuments)
Expand All @@ -124,7 +134,7 @@ def tests_context(self):
# change default to something else to see if language is changed
self.widget.language = "Slovenian"

path = os.path.join(os.path.dirname(__file__), "data/documents", "good")
path = os.path.join(DATA_PATH, "good")
self.widget.setCurrentPath(path)
self.widget.reload()
self.wait_until_finished()
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ orange-widget-base >=4.20.0
orange-canvas-core
owlready2
pandas
pdfminer3k>=1.3.1
pypdf
pyqtgraph
pyyaml
requests
Expand Down

0 comments on commit 139eedf

Please sign in to comment.