From c0c4a716374c0a56d5031e2e213658b3e137ee9e Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Fri, 7 Jun 2024 09:43:33 -0400 Subject: [PATCH] fix(upload-notes): allow uploading to dynamic label setups again Newer versions of Label Studio seem to flag dynamic labels differently than previous ones. So detect dynamic labels a different way. Also: - Print how many notes we are attempting to upload. - Warn if we don't see any DocumentReference files. - Add "-f compose.yaml" to the upload-notes example commands in the docs as a reminder that you need to be running off a compose file. --- cumulus_etl/common.py | 11 +++++--- cumulus_etl/loaders/fhir/ndjson_loader.py | 5 +--- cumulus_etl/upload_notes/cli.py | 2 +- cumulus_etl/upload_notes/labelstudio.py | 9 +++++++ cumulus_etl/upload_notes/selector.py | 2 +- docs/chart-review.md | 5 ++-- tests/upload_notes/test_upload_labelstudio.py | 27 +++++++++++++++++-- 7 files changed, 47 insertions(+), 14 deletions(-) diff --git a/cumulus_etl/common.py b/cumulus_etl/common.py index 4270dae0..0e923b74 100644 --- a/cumulus_etl/common.py +++ b/cumulus_etl/common.py @@ -75,10 +75,13 @@ def get_temp_dir(subdir: str) -> str: ############################################################################### -def ls_resources(root: store.Root, resource: str) -> list[str]: +def ls_resources(root: store.Root, resource: str, warn_if_empty: bool = False) -> list[str]: pattern = re.compile(rf".*/([0-9]+\.)?{resource}(\.[^/]+)?\.ndjson") all_files = root.ls() - return sorted(filter(pattern.match, all_files)) + found_files = sorted(filter(pattern.match, all_files)) + if not found_files and warn_if_empty: + logging.warning("No %s files found in %s", resource, root.path) + return found_files ############################################################################### @@ -166,13 +169,13 @@ def read_ndjson(path: str) -> Iterator[dict]: yield json.loads(line) -def read_resource_ndjson(root: store.Root, resource: str) -> Iterator[dict]: +def read_resource_ndjson(root: store.Root, resource: str, warn_if_empty: bool = False) -> Iterator[dict]: """ Grabs all ndjson files from a folder, of a particular resource type. Supports filenames like Condition.ndjson, Condition.000.ndjson, or 1.Condition.ndjson. """ - for filename in ls_resources(root, resource): + for filename in ls_resources(root, resource, warn_if_empty=warn_if_empty): yield from read_ndjson(filename) diff --git a/cumulus_etl/loaders/fhir/ndjson_loader.py b/cumulus_etl/loaders/fhir/ndjson_loader.py index cdb9eb17..44bb2962 100644 --- a/cumulus_etl/loaders/fhir/ndjson_loader.py +++ b/cumulus_etl/loaders/fhir/ndjson_loader.py @@ -1,6 +1,5 @@ """Ndjson FHIR loader""" -import logging import tempfile from cumulus_etl import cli_utils, common, errors, fhir, store @@ -72,11 +71,9 @@ async def load_all(self, resources: list[str]) -> common.Directory: print("Copying ndjson input files…") tmpdir = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with for resource in resources: - filenames = common.ls_resources(input_root, resource) + filenames = common.ls_resources(input_root, resource, warn_if_empty=True) for filename in filenames: input_root.get(filename, f"{tmpdir.name}/") - if not filenames: - logging.warning("No resources found for %s", resource) return tmpdir async def _load_from_bulk_export(self, resources: list[str]) -> common.Directory: diff --git a/cumulus_etl/upload_notes/cli.py b/cumulus_etl/upload_notes/cli.py index 0d069e9d..d99952d2 100644 --- a/cumulus_etl/upload_notes/cli.py +++ b/cumulus_etl/upload_notes/cli.py @@ -223,7 +223,7 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt def push_to_label_studio( notes: Collection[LabelStudioNote], access_token: str, labels: dict, args: argparse.Namespace ) -> None: - common.print_header("Pushing notes to Label Studio...") + common.print_header(f"Pushing {len(notes)} charts to Label Studio...") ls_client = LabelStudioClient(args.label_studio_url, access_token, args.ls_project, labels) ls_client.push_tasks(notes, overwrite=args.overwrite) diff --git a/cumulus_etl/upload_notes/labelstudio.py b/cumulus_etl/upload_notes/labelstudio.py index c5c8189b..d766e2b7 100644 --- a/cumulus_etl/upload_notes/labelstudio.py +++ b/cumulus_etl/upload_notes/labelstudio.py @@ -193,7 +193,16 @@ def _format_philter_span(self, count: int, start: int, end: int, note: LabelStud } def _update_used_labels(self, task: dict, used_labels: Iterable[str]) -> None: + uses_dynamic_labels = False if self._labels_config.get("dynamic_labels"): + # Old versions of Label Studio set this nice field for us. + uses_dynamic_labels = True + elif self._labels_config.get("labels") == []: + # Newer versions of Label Studio seem to just pass an empty labels list. + # (An empty list would not normally be allowed in a static setup.) + uses_dynamic_labels = True + + if uses_dynamic_labels: # This path supports configs like where the labels # can be dynamically set by us. # diff --git a/cumulus_etl/upload_notes/selector.py b/cumulus_etl/upload_notes/selector.py index ff898deb..f4bdd2ce 100644 --- a/cumulus_etl/upload_notes/selector.py +++ b/cumulus_etl/upload_notes/selector.py @@ -24,7 +24,7 @@ def select_docrefs_from_files( # Read all input documents, filtering along the way with common.NdjsonWriter(output_file_path) as output_file: - resources = common.read_resource_ndjson(root_input, "DocumentReference") + resources = common.read_resource_ndjson(root_input, "DocumentReference", warn_if_empty=True) for docref in docref_filter(resources): output_file.write(docref) diff --git a/docs/chart-review.md b/docs/chart-review.md index 293b3ed4..a49dad74 100644 --- a/docs/chart-review.md +++ b/docs/chart-review.md @@ -38,7 +38,7 @@ Launch those before you begin: ```shell export UMLS_API_KEY=your-umls-api-key -docker compose --profile upload-notes up -d +docker compose -f $CUMULUS_REPO_PATH/compose.yaml --profile upload-notes up -d ``` Or if you have access to a GPU, @@ -64,7 +64,8 @@ Additionally, there are two required Label Studio parameters: Taken altogether, here is an example minimal `upload-notes` command: ```sh -docker compose run \ +docker compose -f $CUMULUS_REPO_PATH/compose.yaml \ + run --rm \ --volume /local/path:/in \ cumulus-etl \ upload-notes \ diff --git a/tests/upload_notes/test_upload_labelstudio.py b/tests/upload_notes/test_upload_labelstudio.py index 191e3e80..89532e8d 100644 --- a/tests/upload_notes/test_upload_labelstudio.py +++ b/tests/upload_notes/test_upload_labelstudio.py @@ -172,7 +172,8 @@ def test_no_predictions(self): ) @ddt.data("Choices", "Labels") - def test_dynamic_labels(self, label_type): + def test_dynamic_labels_old(self, label_type): + """Verify old-style dynamic labels config""" self.ls_project.parsed_label_config = { "mylabel": {"type": label_type, "to_name": ["mytext"], "dynamic_labels": True}, } @@ -192,9 +193,31 @@ def test_dynamic_labels(self, label_type): self.get_pushed_task()["data"], ) + @ddt.data("Choices", "Labels") + def test_dynamic_labels_new(self, label_type): + """Verify new-style dynamic labels config""" + self.ls_project.parsed_label_config = { + "mylabel": {"type": label_type, "to_name": ["mytext"], "labels": []}, + } + self.push_tasks(self.make_note()) + self.assertEqual( + { + "text": "Normal note text", + "enc_id": "enc", + "anon_id": "enc-anon", + "docref_mappings": {"doc": "doc-anon"}, + "docref_spans": {"doc": [0, 16]}, + "mylabel": [ + {"value": "Itch"}, + {"value": "Nausea"}, + ], + }, + self.get_pushed_task()["data"], + ) + def test_dynamic_labels_no_predictions(self): self.ls_project.parsed_label_config = { - "mylabel": {"type": "Labels", "to_name": ["mytext"], "dynamic_labels": True}, + "mylabel": {"type": "Labels", "to_name": ["mytext"], "labels": []}, } self.push_tasks(self.make_note(ctakes=False, philter_label=False)) self.assertEqual(