From fbbe45d6295706e95ff974e718f6905f394ad240 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 25 Nov 2024 10:49:54 +0100 Subject: [PATCH] Add pdf.spans.manual (#9) * Add pdf.spans.manual * Update types for consistency * Only add image span if bounding box is available * fix entry point definition * pin prodigy version * fix extension str comparison * Tidy up and add docstring * Update types * Update CLI shortcuts * Remove .docx for now Can't create image-based previews easily * Increment version * Fix token offsets * Adjust color for consistency * Add meta to stream * Add pdf.spans.fetch * Apply suggestions from code review Co-authored-by: Magdalena Aniol <96200718+magdaaniol@users.noreply.github.com> * Update recipe name * fix indent * Only compute images if we need them * add answer field * set it to use python 3.10 and Prodigy 1.17.1 * add answer field on top level * fix python version * Require fewer options for fetch recipe Apply layout settings afterwards so fetched data can be reused more easily and make CSS the same for both interface types * Tidy up --------- Co-authored-by: magdaaniol Co-authored-by: Magdalena Aniol <96200718+magdaaniol@users.noreply.github.com> --- .github/workflows/unit_tests.yml | 6 +- prodigy_pdf/spans.py | 338 +++++++++++++++++++++++++++++++ setup.cfg | 9 +- 3 files changed, 347 insertions(+), 6 deletions(-) create mode 100644 prodigy_pdf/spans.py diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 1381d54..6c461cd 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -11,17 +11,17 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - name: Set up Python 3.9 + - name: Set up Python 3.10 uses: actions/setup-python@v4 with: - python-version: 3.9 + python-version: '3.10' cache: "pip" # caching pip dependencies - name: Check out Prodigy uses: actions/checkout@v3 with: repository: explosion/prodigy - ref: v1.14.0 + ref: v1.17.1 path: ./prodigy 
ssh-key: ${{ secrets.GHA_PRODIGY_READ }} diff --git a/prodigy_pdf/spans.py b/prodigy_pdf/spans.py new file mode 100644 index 0000000..dadce2f --- /dev/null +++ b/prodigy_pdf/spans.py @@ -0,0 +1,338 @@ +import base64 +from io import BytesIO +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import pypdfium2 as pdfium +import srsly +from docling_core.types.doc.labels import DocItemLabel +from prodigy.components.db import connect +from prodigy.components.preprocess import add_answer, resolve_labels +from prodigy.components.stream import Stream, _source_is_dataset, get_stream +from prodigy.core import Arg, recipe +from prodigy.errors import RecipeError +from prodigy.protocols import ControllerComponentsDict +from prodigy.recipes.ner import preprocess_stream as preprocess_ner_stream +from prodigy.types import PathInputType, StreamType, ViewId +from prodigy.util import ensure_path, log, msg, set_hashes +from spacy.language import Language +from spacy.tokens import Doc, Span +from spacy_layout import spaCyLayout + +HEADINGS = [DocItemLabel.SECTION_HEADER, DocItemLabel.PAGE_HEADER, DocItemLabel.TITLE] +SEPARATOR = "\n\n" + +FONT_SIZE_TEXT = 14 +FONT_SIZE_HEADING = 18 +CSS_CLS = ".prodigy-annotator:not(:has(.prodigy-page-content)) .prodigy-container" +CSS_CLS_PAGES = ".prodigy-annotator:has(.prodigy-page-content) .prodigy-page-content" +CSS = ".prodigy-content { text-align: left }" +CSS_PREVIEW = f""" +{CSS_CLS}, {CSS_CLS_PAGES} {{ display: grid }} +{CSS_CLS} {{ grid-template-columns: 0 1fr 50% }} +{CSS_CLS_PAGES} {{ grid-template-columns: 1fr 50%; }} +{CSS_CLS} > div:nth-child(2), {CSS_CLS_PAGES} > div:nth-child(1) {{ border-right: 1px solid #ddd }} +{CSS_CLS} > div:nth-child(3), {CSS_CLS_PAGES} > div:nth-child(3) {{ position: relative }} +{CSS_CLS} > div:nth-child(3) > div:first-child, {CSS_CLS_PAGES} > div:nth-child(2) > div:first-child {{ position: sticky; top: 0 }} +{CSS_CLS} .prodigy-meta {{ grid-column: 1 / span 3 }} +""" + + +def 
get_layout_tokens(doc: Span, token_labels: Dict[int, str]) -> List[dict]: + result = [] + offset = 0 + for i, token in enumerate(doc): + token_label = token_labels.get(token.i) + token_dict = { + "text": token.text, + "start": offset, + "end": offset + len(token.text), + "id": i, + "ws": bool(token.whitespace_), + "layout": token_label, + } + offset += len(token.text) + if token.text == SEPARATOR: + token_dict["disabled"] = True + token_dict["style"] = {"display": "none"} + if token_label in HEADINGS: + token_dict["style"] = {"fontWeight": "bold", "fontSize": FONT_SIZE_HEADING} + result.append(token_dict) + return result + + + def get_token_labels(doc: Doc) -> Dict[int, str]: + labels_by_id = {} + for span in doc.spans["layout"]: + for i in range(span.start, span.end): + labels_by_id[i] = span.label_ + return labels_by_id + + + def pdf_to_images(path: Path) -> List[str]: + images = [] + pdf = pdfium.PdfDocument(path) + for page_number in range(len(pdf)): + page = pdf.get_page(page_number) + pil_image = page.render().to_pil() + with BytesIO() as buffered: + pil_image.save(buffered, format="JPEG") + img_str = base64.b64encode(buffered.getvalue()) + images.append(f"data:image/jpeg;base64,{img_str.decode('utf-8')}") + return images + + + def disable_tokens(stream: StreamType, disabled: List[str]) -> StreamType: + for eg in stream: + for token in eg.get("tokens", []): + if token.get("layout") in disabled: + token["disabled"] = True + yield eg + + + def remove_preview(stream: StreamType, view_id: ViewId) -> StreamType: + for eg in stream: + config = eg.get("config", {}) + if "blocks" in config: + eg["config"]["blocks"] = [{"view_id": view_id}] + if "image" in eg: + del eg["image"] + yield eg + + + class LayoutStream: + def __init__( + self, + f: PathInputType, + nlp: Language, + file_ext: List[str] = ["pdf"], + view_id: ViewId = "spans_manual", + split_pages: bool = False, + hide_preview: bool = False, + focus: List[str] = [], + ) -> None: + dir_path = ensure_path(f) + if not 
dir_path.exists() or not dir_path.is_dir(): + raise RecipeError(f"Can't load from directory {f}", dir_path.resolve()) + self.paths = [ + path + for path in sorted(dir_path.iterdir()) + if path.is_file() + and not path.name.startswith(".") + and (path.suffix.lower()[1:] in file_ext) + ] + self.view_id = view_id + self.split_pages = split_pages + self.hide_preview = hide_preview + self.focus = focus + self.nlp = nlp + self.layout = spaCyLayout(nlp, separator=SEPARATOR) + log("RECIPE: Initialized spacy-layout") + + def get_stream(self) -> StreamType: + if self.focus: + yield from self.get_focus_stream() + else: + yield from self.get_full_stream() + + def get_full_stream(self) -> StreamType: + blocks = [{"view_id": self.view_id}] + if not self.hide_preview: + blocks.append({"view_id": "image", "spans": []}) + for file_path in self.paths: + doc = self.layout(file_path) + images = pdf_to_images(file_path) if not self.hide_preview else None + pages = [] + for i, (page_layout, page_spans) in enumerate( + doc._.get(self.layout.attrs.doc_pages) + ): + token_labels = get_token_labels(doc) + page = { + "text": SEPARATOR.join(span.text for span in page_spans), + "tokens": get_layout_tokens( + doc[page_spans[0].start : page_spans[-1].end], + token_labels, + ), + "width": page_layout.width, + "height": page_layout.height, + "view_id": "blocks", + "config": {"blocks": blocks}, + } + if not self.hide_preview and images: + page["image"] = images[i] + pages.append(page) + if self.split_pages: + meta = {"title": file_path.stem, "page": page_layout.page_no} + yield set_hashes({**page, "meta": meta}) + if not self.split_pages: + yield set_hashes({"pages": pages, "meta": {"title": file_path.stem}}) + + def get_focus_stream(self) -> StreamType: + for file_path in self.paths: + doc = self.layout(file_path) + images = pdf_to_images(file_path) if not self.hide_preview else None + for i, (page_layout, page_spans) in enumerate( + doc._.get(self.layout.attrs.doc_pages) + ): + token_labels = 
get_token_labels(doc) + for span in page_spans: + if span.label_ not in self.focus: + continue + blocks = [{"view_id": self.view_id}] + if not self.hide_preview: + span_layout = span._.get(self.layout.attrs.span_layout) + image_spans = [] + if span_layout: + image_spans.append( + { + "x": span_layout.x, + "y": span_layout.y, + "width": span_layout.width, + "height": span_layout.height, + "color": "magenta", + "id": span.id, + } + ) + blocks.append({"view_id": "image", "spans": image_spans}) + eg = { + "text": span.text, + "tokens": get_layout_tokens(span, token_labels), + "width": page_layout.width, + "height": page_layout.height, + "view_id": "blocks", + "config": {"blocks": blocks}, + "text_span": { + "token_start": span.start, + "token_end": span.end - 1, + "start": span.start_char, + "end": span.end_char, + "text": span.text, + "label": span.label_, + }, + "meta": {"title": file_path.stem, "page": page_layout.page_no}, + } + if not self.hide_preview and images: + eg["image"] = images[i] + yield set_hashes(eg) + + +@recipe( + "pdf.spans.manual", + # fmt: off + dataset=Arg(help="Dataset to save annotations to"), + nlp=Arg(help="Loadable spaCy pipeline"), + source=Arg(help="Path to directory of PDFs or dataset/JSONL file created with pdf.layout.fetch"), + labels=Arg("--label", "-l", help="Comma-separated label(s) to annotate or text file with one label per line"), + add_ents=Arg("--add-ents", "-E", help="Add named entities for the given labels via the spaCy model"), + focus=Arg("--focus", "-f", help="Focus mode: annotate selected sections of a given type, e.g. 'text'"), + disable=Arg("--disable", "-d", help="Labels of layout spans to disable, e.g. 
'footnote'"), + split_pages=Arg("--split-pages", "-S", help="View pages as separate tasks"), + hide_preview=Arg("--hide-preview", "-HP", help="Hide side-by-side preview of layout"), + # fmt: on +) +def pdf_spans_manual( + dataset: str, + nlp: Language, + source: str, + labels: Optional[List[str]] = None, + add_ents: bool = False, + focus: Optional[List[str]] = None, + disable: Optional[List[str]] = None, + hide_preview: bool = False, + split_pages: bool = False, +) -> ControllerComponentsDict: + """ + Apply span annotations to text-based document contents extracted with + spacy-layout and Docling. For efficiency, the recipe can run with + --focus text to walk through individual text blocks, which are highlighted + in a visual preview of the document page. + """ + log("RECIPE: Starting recipe pdf.spans.manual", locals()) + view_id = "spans_manual" + if source.endswith(".jsonl") or _source_is_dataset(source, None): + # Load from existing data created with pdf.layout.fetch + stream = get_stream(source) + else: + layout_stream = LayoutStream( + source, + nlp=nlp, + file_ext=["pdf"], + view_id=view_id, + split_pages=split_pages, + hide_preview=hide_preview, + focus=focus or [], + ) + + stream = Stream.from_iterable(layout_stream.get_stream()) + if add_ents: + labels = resolve_labels(nlp, "ner", recipe_labels=labels) + stream.apply(preprocess_ner_stream, nlp, labels=labels, unsegmented=True) + if disable: + stream.apply(disable_tokens, disabled=disable) + css = CSS + if hide_preview: + stream.apply(remove_preview, view_id=view_id) + else: + css += CSS_PREVIEW + + return { + "dataset": dataset, + "stream": stream, + "view_id": "pages" if not split_pages and not focus else "blocks", + "config": { + "labels": labels, + "global_css": css, + "shade_bounding_boxes": True, + "custom_theme": { + "cardMaxWidth": "95%", + "smallText": FONT_SIZE_TEXT, + "tokenHeight": 25, + }, + }, + } + + +@recipe( + "pdf.layout.fetch", + # fmt: off + output=Arg(help="Output file or dataset (with 
prefix dataset:)"), + nlp=Arg(help="Loadable spaCy pipeline"), + source=Arg(help="Path to directory to load from"), + focus=Arg("--focus", "-f", help="Focus mode: annotate selected sections of a given type, e.g. 'text'"), + split_pages=Arg("--split-pages", "-S", help="View pages as separate tasks"), + # fmt: on +) +def pdf_layout_fetch( + output: str, + nlp: Language, + source: str, + focus: Optional[List[str]] = None, + split_pages: bool = False, +) -> ControllerComponentsDict: + """ + Pre-process PDFs to use with pdf.spans.manual. This can significantly speed + up loading time during the annotation process. + """ + log("RECIPE: Starting recipe pdf.layout.fetch", locals()) + layout_stream = LayoutStream( + source, + nlp=nlp, + file_ext=["pdf"], + view_id="spans_manual", + split_pages=split_pages, + hide_preview=False, + focus=focus or [], + ) + msg.info("Creating preprocessed PDFs") + layout_stream = add_answer(layout_stream.get_stream()) + stream = Stream.from_iterable(layout_stream) + if _source_is_dataset(output, None): + dataset = str(output).replace("dataset:", "") + db = connect() + if dataset not in db: + db.add_dataset(dataset) + db.add_examples(stream, datasets=[dataset]) + msg.good(f"Saved fetched data to dataset {dataset}") + else: + srsly.write_jsonl(output, stream) + msg.good("Saved fetched data to local file", output) diff --git a/setup.cfg b/setup.cfg index ae12b2f..e250c46 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 0.3.0 +version = 0.4.0 description = Recipes for PDF annotation url = https://github.com/explosion/prodigy-pdf author = Explosion @@ -7,16 +7,19 @@ author_email = contact@explosion.ai [options] zip_safe = true -python_requires = >=3.8 +python_requires = >=3.10 install_requires = - pypdfium2==4.20.0 + pypdfium2>=4.20.0 Pillow<11.0.0 pytesseract==0.3.10 numpy<2.0.0 + spacy-layout>=0.0.4 + prodigy>=1.17.1 [options.entry_points] prodigy_recipes = pdf.image.manual = prodigy_pdf:pdf_image_manual + 
pdf.spans.manual = prodigy_pdf.spans:pdf_spans_manual [bdist_wheel] universal = true