From 88bf2740f3e6f647af97e8eb7c0d854b00e114bf Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Sun, 27 Oct 2024 14:23:04 -0400 Subject: [PATCH] ci: benchmark --- .github/workflows/tests.yml | 3 ++ pyproject.toml | 8 ++++++ tests/benchmark_convert.py | 28 ++++++++++-------- tests/benchmark_parser.py | 57 ++++++------------------------------- 4 files changed, 36 insertions(+), 60 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index caa625e6..04c3aa20 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -18,3 +18,6 @@ jobs: uses: pypa/hatch@install - name: Run tests run: hatch test + - name: Run benchmarks + run: | + hatch run bench:all diff --git a/pyproject.toml b/pyproject.toml index 5a14530d..4d63de2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,3 +67,11 @@ ban-relative-imports = "all" [tool.pytest.ini_options] testpaths = [ "tests" ] +[tool.hatch.envs.bench] +dependencies = [ "pdfminer.six" ] + +[tool.hatch.envs.bench.scripts] +all = [ + "python tests/benchmark_parser.py", + "python tests/benchmark_convert.py", +] diff --git a/tests/benchmark_convert.py b/tests/benchmark_convert.py index e1ced060..b2e60724 100644 --- a/tests/benchmark_convert.py +++ b/tests/benchmark_convert.py @@ -57,17 +57,21 @@ def benchmark_one_pdfminer(path: Path): if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) + # Silence warnings about broken PDFs + logging.basicConfig(level=logging.ERROR) niter = 10 - if len(sys.argv) == 1 or "pdfminer" in sys.argv[1:]: - start = time.time() - for _ in range(niter): - for path in ALLPDFS: - benchmark_one_pdfminer(path) - LOG.info("pdfminer.six took %f", time.time() - start) - if len(sys.argv) == 1 or "playa" in sys.argv[1:]: - start = time.time() - for _ in range(niter): - for path in ALLPDFS: + miner_time = beach_time = 0 + for iter in range(niter + 1): + for path in ALLPDFS: + if len(sys.argv) == 1 or "playa" in sys.argv[1:]: + start = time.time() benchmark_one_pdf(path) - LOG.info("PLAYA took %f", time.time() - start) + if iter != 0: + beach_time += time.time() - start + if len(sys.argv) == 1 or "pdfminer" in sys.argv[1:]: + start = time.time() + benchmark_one_pdfminer(path) + if iter != 0: + miner_time += time.time() - start + print("pdfminer.six took %.2fs / iter" % (miner_time / niter,)) + print("PLAYA took %.2fs / iter" % (beach_time / niter,)) diff --git a/tests/benchmark_parser.py b/tests/benchmark_parser.py index df999449..6330d6c4 100644 --- a/tests/benchmark_parser.py +++ b/tests/benchmark_parser.py @@ -272,21 +272,21 @@ def bench_bytes(): - from playa.psparser import PSInMemoryParser + from playa.psparser import Lexer runs = 100 start = time.time() - parser = PSInMemoryParser(DATA * runs) + parser = Lexer(DATA * runs) _ = list(parser) print( - "PLAYA Parser (bytes): %fms / run" % ((time.time() - start) / runs * 1000), + "PLAYA Lexer (bytes): %fms / run" % ((time.time() - start) / runs * 1000), ) def bench_mmap(): import mmap - from playa.psparser import PSInMemoryParser + from playa.psparser import Lexer with tempfile.NamedTemporaryFile() as tf: runs = 100 @@ -295,55 +295,18 @@ def bench_mmap(): with open(tf.name, "rb") as infh: start = time.time() mapping = mmap.mmap(infh.fileno(), 0, access=mmap.ACCESS_READ) - parser = PSInMemoryParser(mapping) + parser = Lexer(mapping) _ = list(parser) print( - "PLAYA Parser (mmap): %fms / run" + "PLAYA Lexer (mmap): %fms / run" % ((time.time() - start) / runs * 1000), ) -def bench_bytesio(): - from pdfminer.psparser import PSEOF, PSBaseParser - - runs = 100 - start = time.time() - parser = PSBaseParser(BytesIO(DATA * runs)) - while True: - try: - _ = parser.nexttoken() - except PSEOF: - break - print( - "pdfminer.six Parser (BytesIO): %fms / run" - % ((time.time() - start) / runs * 1000), - ) - - def bench_playa(): from playa.pdfdocument import PDFDocument from playa.pdfpage import PDFPage - from playa.psparser import PSFileParser - runs = 100 - start = time.time() - parser = PSFileParser(BytesIO(DATA * runs)) - _ = list(parser) - print( - "PLAYA Parser (BytesIO): %fms / run" % ((time.time() - start) / runs * 1000), - ) - with tempfile.NamedTemporaryFile() as tf: - runs = 100 - with open(tf.name, "wb") as outfh: - outfh.write(DATA * runs) - with open(tf.name, "rb") as infh: - start = time.time() - parser = PSFileParser(infh) - _ = list(parser) - print( - "PLAYA Parser (BinaryIO): %fms / run" - % ((time.time() - start) / runs * 1000), - ) bench_bytes() bench_mmap() @@ -352,7 +315,7 @@ def bench_playa(): for _ in range(runs): with open(TESTDIR / "contrib" / "pagelabels.pdf", "rb") as infh: doc = PDFDocument(infh) - page = next(PDFPage.create_pages(doc)) + page = doc.pages[0] _ = page.layout print( "PLAYA Interpreter: %dms / run" % ((time.time() - start) / runs * 1000), @@ -376,7 +339,7 @@ def bench_pdfminer(): except PSEOF: break print( - "pdfminer.six Parser (BytesIO): %fms / run" + "pdfminer.six Lexer (BytesIO): %fms / run" % ((time.time() - start) / runs * 1000), ) with tempfile.NamedTemporaryFile() as tf: @@ -391,7 +354,7 @@ def bench_pdfminer(): except PSEOF: break print( - "pdfminer.six Parser (BinaryIO): %fms / run" + "pdfminer.six Lexer (BinaryIO): %fms / run" % ((time.time() - start) / runs * 1000), ) runs = 20 @@ -418,7 +381,5 @@ def bench_pdfminer(): bench_playa() if len(sys.argv) > 1 and sys.argv[1] == "bytes": bench_bytes() - if len(sys.argv) > 1 and sys.argv[1] == "bytesio": - bench_bytesio() if len(sys.argv) > 1 and sys.argv[1] == "mmap": bench_mmap()