-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support parallel operations over pages (#36)
* doc: example of parallel pdfminer.six (we will do better) * fix: do not submit a gratuitous extra job * feat: crude example of parallelizing PLAYA * feat: use layout in playa parallel bench * chore: format * feat: basic support for parallel execution across pages * chore: mypy * docs: deprecate eager api in readme * test: test parallel execution * fix(types): complete callable annotation
- Loading branch information
Showing
9 changed files
with
230 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
""" | ||
Attempt to scale. | ||
""" | ||
|
||
import time | ||
from pathlib import Path | ||
|
||
import playa | ||
from playa.page import Page | ||
|
||
|
||
def process_page(page: Page) -> str:
    """Return the text of one page as a single space-separated string.

    Collects the ``chars`` attribute of every object in ``page.texts``
    and joins the pieces with single spaces.
    """
    pieces = (text_obj.chars for text_obj in page.texts)
    separator = " "
    return separator.join(pieces)
|
||
|
||
def benchmark_single(path: Path):
    """Extract text from every page of ``path`` with a default PLAYA document.

    The document is opened without a ``max_workers`` argument, and the
    results are materialized while it is still open.

    Returns:
        A list with one ``process_page`` result per page.
    """
    with playa.open(path) as pdf:
        texts = [*pdf.pages.map(process_page)]
        return texts
|
||
|
||
def benchmark_multi(path: Path, ncpu: int):
    """Extract text from every page of ``path`` using ``ncpu`` workers.

    ``ncpu`` is handed to ``playa.open`` as ``max_workers``; the page
    results are materialized while the document is still open.

    Returns:
        A list with one ``process_page`` result per page.
    """
    with playa.open(path, max_workers=ncpu) as pdf:
        texts = [*pdf.pages.map(process_page)]
        return texts
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: time the multi-worker run, then the
    # single-process run, over the same PDF and report both durations.
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-n", "--ncpu", type=int, default=4)
    parser.add_argument("pdf", type=Path)
    args = parser.parse_args()

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock
    # is adjusted while the benchmark runs.
    start = time.perf_counter()
    benchmark_multi(args.pdf, args.ncpu)
    multi_time = time.perf_counter() - start
    print("PLAYA (%d CPUs) took %.2fs" % (args.ncpu, multi_time))

    start = time.perf_counter()
    benchmark_single(args.pdf)
    single_time = time.perf_counter() - start
    print("PLAYA (single) took %.2fs" % (single_time,))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
"""Demonstrate parallel extraction with pdfminer.six""" | ||
|
||
import time | ||
from pdfminer.high_level import extract_pages | ||
from concurrent.futures import ProcessPoolExecutor | ||
from pathlib import Path | ||
from pdfminer.pdfpage import PDFPage | ||
from pdfminer.layout import LTImage | ||
from pdfminer.pdftypes import PDFObjRef | ||
|
||
|
||
def benchmark_single(path: Path):
    """Run ``extract_pages`` over all of ``path`` in the current process.

    Every page layout is produced and immediately dropped; the function
    exists only to be timed and returns nothing.
    """
    layouts = extract_pages(path)
    for _layout in layouts:
        # Nothing to do with the result; consuming the iterator is the work.
        continue
|
||
|
||
def remove_references(item):
    """Recursively clear the ``doc`` attribute of object references in images.

    Iterable items are descended into.  When iterating raises
    ``TypeError`` and the item is an ``LTImage``, every ``PDFObjRef``
    value in its stream attributes has ``doc`` set to ``None``.

    NOTE(review): presumably this detaches the layout from its document
    so it can be sent back from a worker process; confirm against caller.
    """
    try:
        for element in item:
            remove_references(element)
    except TypeError:
        if not isinstance(item, LTImage):
            return
        for attr_value in item.stream.attrs.values():
            if isinstance(attr_value, PDFObjRef):
                attr_value.doc = None
|
||
|
||
def extract_batch(path, page_numbers):
    """Extract the layouts of the given pages and strip object references.

    Args:
        path: PDF file passed to ``extract_pages``.
        page_numbers: Page numbers forwarded as ``page_numbers``.

    Returns:
        A list of the extracted page layouts, after ``remove_references``
        has been applied to the whole list.
    """
    layouts = [*extract_pages(path, page_numbers=page_numbers)]
    remove_references(layouts)
    return layouts
|
||
|
||
def benchmark_multi(path: Path, ncpu: int):
    """Extract all pages of ``path`` with a pool of ``ncpu`` processes.

    The pages are counted first, then split into contiguous batches that
    are submitted to a ``ProcessPoolExecutor`` running ``extract_batch``.
    Results are placed back into page order.

    Args:
        path: PDF file to process.
        ncpu: Number of worker processes (``max_workers`` of the pool).

    Returns:
        A list of page layouts indexed by page number.
    """
    with open(path, "rb") as fp:
        npages = sum(1 for _ in PDFPage.get_pages(fp))
    pages = [None] * npages
    batches = []

    with ProcessPoolExecutor(max_workers=ncpu) as pool:
        # Ceiling division: round() can round down (e.g. 10 pages on 4
        # workers gives a step of 2 and therefore 5 batches), leaving an
        # extra job queued behind the others.  Rounding up guarantees at
        # most ``ncpu`` batches.
        step = max(1, -(-npages // ncpu))
        for start in range(0, npages, step):
            end = min(npages, start + step)
            batch = list(range(start, end))
            print(f"Submitting pages {start} to {end - 1}")
            batches.append((batch, pool.submit(extract_batch, path, batch)))
        # Collect in submission order so each layout lands at its page index.
        for batch, future in batches:
            for idx, page in zip(batch, future.result()):
                pages[idx] = page
    return pages
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: time the multi-process run, then the
    # single-process run, over the same PDF and report both durations.
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-n", "--ncpu", type=int, default=4)
    parser.add_argument("pdf", type=Path)
    args = parser.parse_args()

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock
    # is adjusted while the benchmark runs.
    start = time.perf_counter()
    benchmark_multi(args.pdf, args.ncpu)
    multi_time = time.perf_counter() - start
    print("pdfminer.six (%d CPUs) took %.2fs" % (args.ncpu, multi_time))

    start = time.perf_counter()
    benchmark_single(args.pdf)
    single_time = time.perf_counter() - start
    print("pdfminer.six (single) took %.2fs" % (single_time,))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.