Skip to content

Commit

Permalink
ci: benchmark
Browse files (browse the repository at this point in the history)
  • Loading branch information
dhdaines committed Oct 27, 2024
1 parent 19f448a commit 88bf274
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 60 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,6 @@ jobs:
uses: pypa/hatch@install
- name: Run tests
run: hatch test
- name: Run benchmarks
run: |
hatch run bench:all
8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,11 @@ ban-relative-imports = "all"
[tool.pytest.ini_options]
testpaths = [ "tests" ]

[tool.hatch.envs.bench]
dependencies = [ "pdfminer.six" ]

[tool.hatch.envs.bench.scripts]
all = [
"python tests/benchmark_parser.py",
"python tests/benchmark_convert.py",
]
28 changes: 16 additions & 12 deletions tests/benchmark_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,21 @@ def benchmark_one_pdfminer(path: Path):


if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
# Silence warnings about broken PDFs
logging.basicConfig(level=logging.ERROR)
niter = 10
if len(sys.argv) == 1 or "pdfminer" in sys.argv[1:]:
start = time.time()
for _ in range(niter):
for path in ALLPDFS:
benchmark_one_pdfminer(path)
LOG.info("pdfminer.six took %f", time.time() - start)
if len(sys.argv) == 1 or "playa" in sys.argv[1:]:
start = time.time()
for _ in range(niter):
for path in ALLPDFS:
miner_time = beach_time = 0
for iter in range(niter + 1):
for path in ALLPDFS:
if len(sys.argv) == 1 or "playa" in sys.argv[1:]:
start = time.time()
benchmark_one_pdf(path)
LOG.info("PLAYA took %f", time.time() - start)
if iter != 0:
beach_time += time.time() - start
if len(sys.argv) == 1 or "pdfminer" in sys.argv[1:]:
start = time.time()
benchmark_one_pdfminer(path)
if iter != 0:
miner_time += time.time() - start
print("pdfminer.six took %.2fs / iter" % (miner_time / niter,))
print("PLAYA took %.2fs / iter" % (beach_time / niter,))
57 changes: 9 additions & 48 deletions tests/benchmark_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,21 +272,21 @@


def bench_bytes():
from playa.psparser import PSInMemoryParser
from playa.psparser import Lexer

runs = 100
start = time.time()
parser = PSInMemoryParser(DATA * runs)
parser = Lexer(DATA * runs)
_ = list(parser)
print(
"PLAYA Parser (bytes): %fms / run" % ((time.time() - start) / runs * 1000),
"PLAYA Lexer (bytes): %fms / run" % ((time.time() - start) / runs * 1000),
)


def bench_mmap():
import mmap

from playa.psparser import PSInMemoryParser
from playa.psparser import Lexer

with tempfile.NamedTemporaryFile() as tf:
runs = 100
Expand All @@ -295,55 +295,18 @@ def bench_mmap():
with open(tf.name, "rb") as infh:
start = time.time()
mapping = mmap.mmap(infh.fileno(), 0, access=mmap.ACCESS_READ)
parser = PSInMemoryParser(mapping)
parser = Lexer(mapping)
_ = list(parser)
print(
"PLAYA Parser (mmap): %fms / run"
"PLAYA Lexer (mmap): %fms / run"
% ((time.time() - start) / runs * 1000),
)


def bench_bytesio():
    """Time pdfminer.six's ``PSBaseParser`` tokenizing ``DATA`` from a ``BytesIO``.

    ``DATA`` is repeated ``runs`` times into a single in-memory buffer, every
    token is pulled with ``nexttoken()`` until ``PSEOF`` is raised, and the
    mean elapsed time per repetition is printed in milliseconds.  Parser
    construction is part of the measured interval.
    """
    # Function-scope import: pdfminer.six is only required when this
    # particular benchmark is actually invoked.
    from pdfminer.psparser import PSEOF, PSBaseParser

    runs = 100
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump if the system clock is adjusted mid-run.
    start = time.perf_counter()
    parser = PSBaseParser(BytesIO(DATA * runs))
    # PSEOF is the only exit from the loop, so a single try around the whole
    # loop replaces setting up a handler for every token.
    try:
        while True:
            parser.nexttoken()
    except PSEOF:
        pass
    elapsed = time.perf_counter() - start
    print(
        "pdfminer.six Parser (BytesIO): %fms / run" % (elapsed / runs * 1000),
    )


def bench_playa():
from playa.pdfdocument import PDFDocument
from playa.pdfpage import PDFPage
from playa.psparser import PSFileParser

runs = 100
start = time.time()
parser = PSFileParser(BytesIO(DATA * runs))
_ = list(parser)
print(
"PLAYA Parser (BytesIO): %fms / run" % ((time.time() - start) / runs * 1000),
)
with tempfile.NamedTemporaryFile() as tf:
runs = 100
with open(tf.name, "wb") as outfh:
outfh.write(DATA * runs)
with open(tf.name, "rb") as infh:
start = time.time()
parser = PSFileParser(infh)
_ = list(parser)
print(
"PLAYA Parser (BinaryIO): %fms / run"
% ((time.time() - start) / runs * 1000),
)
bench_bytes()
bench_mmap()

Expand All @@ -352,7 +315,7 @@ def bench_playa():
for _ in range(runs):
with open(TESTDIR / "contrib" / "pagelabels.pdf", "rb") as infh:
doc = PDFDocument(infh)
page = next(PDFPage.create_pages(doc))
page = doc.pages[0]
_ = page.layout
print(
"PLAYA Interpreter: %dms / run" % ((time.time() - start) / runs * 1000),
Expand All @@ -376,7 +339,7 @@ def bench_pdfminer():
except PSEOF:
break
print(
"pdfminer.six Parser (BytesIO): %fms / run"
"pdfminer.six Lexer (BytesIO): %fms / run"
% ((time.time() - start) / runs * 1000),
)
with tempfile.NamedTemporaryFile() as tf:
Expand All @@ -391,7 +354,7 @@ def bench_pdfminer():
except PSEOF:
break
print(
"pdfminer.six Parser (BinaryIO): %fms / run"
"pdfminer.six Lexer (BinaryIO): %fms / run"
% ((time.time() - start) / runs * 1000),
)
runs = 20
Expand All @@ -418,7 +381,5 @@ def bench_pdfminer():
bench_playa()
if len(sys.argv) > 1 and sys.argv[1] == "bytes":
bench_bytes()
if len(sys.argv) > 1 and sys.argv[1] == "bytesio":
bench_bytesio()
if len(sys.argv) > 1 and sys.argv[1] == "mmap":
bench_mmap()

0 comments on commit 88bf274

Please sign in to comment.