From eec6ba97e16679f851146a4ec8b50f9669356283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Wed, 21 Jun 2023 11:55:59 +0200 Subject: [PATCH] Sort offsets in generate_highlighted_text --- ingredient_extraction/clean_dataset.py | 4 +++- ingredient_extraction/utils.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ingredient_extraction/clean_dataset.py b/ingredient_extraction/clean_dataset.py index 0f2884dc..4c21dedf 100644 --- a/ingredient_extraction/clean_dataset.py +++ b/ingredient_extraction/clean_dataset.py @@ -110,6 +110,8 @@ def annotate(item: dict, existing_annotation: Optional[dict] = None): console.print(f"Image URL: {meta['url'].replace('.json', '.jpg')}") identifier = meta["id"] console.print(f"ID: {identifier}") + offsets = sorted(item["offsets"], key=lambda x: x[0]) + console.print(f"offsets: {offsets}") if existing_annotation is not None: console.print( f"Annotation already exists: " @@ -117,7 +119,7 @@ def annotate(item: dict, existing_annotation: Optional[dict] = None): f"updated_offsets={existing_annotation['updated_offsets']}" ) marked_text = generate_highlighted_text( - item["text"], [list(x) for x in item["offsets"]] + item["text"], [list(x) for x in offsets] ) marked_text_highlighted = marked_text.replace("<b>", "[red]").replace( "</b>", "[/red]" diff --git a/ingredient_extraction/utils.py b/ingredient_extraction/utils.py index e8d6b5fa..03783ae6 100644 --- a/ingredient_extraction/utils.py +++ b/ingredient_extraction/utils.py @@ -60,6 +60,7 @@ def generate_highlighted_text( mark_token: str = "b", html_escape: bool = False, ) -> str: + offsets = sorted(offsets, key=lambda x: x[0]) highlighted_text = [] previous_idx = 0 escape_func = (lambda x: x) if html_escape is False else html.escape