Skip to content

Commit

Permalink
fix(upload-notes): allow uploading to dynamic label setups again
Browse files Browse the repository at this point in the history
Newer versions of Label Studio seem to flag dynamic labels differently
than previous ones. So detect dynamic labels a different way.

Also:
- Print how many notes we are attempting to upload.
- Warn if we don't see any DocumentReference files.
- Add "-f compose.yaml" to the upload-notes example commands in the
  docs as a reminder that you need to be running off a compose file.
  • Loading branch information
mikix committed Jun 7, 2024
1 parent fa64d81 commit c0c4a71
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 14 deletions.
11 changes: 7 additions & 4 deletions cumulus_etl/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,13 @@ def get_temp_dir(subdir: str) -> str:
###############################################################################


def ls_resources(root: store.Root, resource: str) -> list[str]:
def ls_resources(root: store.Root, resource: str, warn_if_empty: bool = False) -> list[str]:
pattern = re.compile(rf".*/([0-9]+\.)?{resource}(\.[^/]+)?\.ndjson")
all_files = root.ls()
return sorted(filter(pattern.match, all_files))
found_files = sorted(filter(pattern.match, all_files))
if not found_files and warn_if_empty:
logging.warning("No %s files found in %s", resource, root.path)
return found_files


###############################################################################
Expand Down Expand Up @@ -166,13 +169,13 @@ def read_ndjson(path: str) -> Iterator[dict]:
yield json.loads(line)


def read_resource_ndjson(root: store.Root, resource: str) -> Iterator[dict]:
def read_resource_ndjson(root: store.Root, resource: str, warn_if_empty: bool = False) -> Iterator[dict]:
"""
Grabs all ndjson files from a folder, of a particular resource type.
Supports filenames like Condition.ndjson, Condition.000.ndjson, or 1.Condition.ndjson.
"""
for filename in ls_resources(root, resource):
for filename in ls_resources(root, resource, warn_if_empty=warn_if_empty):
yield from read_ndjson(filename)


Expand Down
5 changes: 1 addition & 4 deletions cumulus_etl/loaders/fhir/ndjson_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Ndjson FHIR loader"""

import logging
import tempfile

from cumulus_etl import cli_utils, common, errors, fhir, store
Expand Down Expand Up @@ -72,11 +71,9 @@ async def load_all(self, resources: list[str]) -> common.Directory:
print("Copying ndjson input files…")
tmpdir = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with
for resource in resources:
filenames = common.ls_resources(input_root, resource)
filenames = common.ls_resources(input_root, resource, warn_if_empty=True)
for filename in filenames:
input_root.get(filename, f"{tmpdir.name}/")
if not filenames:
logging.warning("No resources found for %s", resource)
return tmpdir

async def _load_from_bulk_export(self, resources: list[str]) -> common.Directory:
Expand Down
2 changes: 1 addition & 1 deletion cumulus_etl/upload_notes/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
def push_to_label_studio(
notes: Collection[LabelStudioNote], access_token: str, labels: dict, args: argparse.Namespace
) -> None:
common.print_header("Pushing notes to Label Studio...")
common.print_header(f"Pushing {len(notes)} charts to Label Studio...")
ls_client = LabelStudioClient(args.label_studio_url, access_token, args.ls_project, labels)
ls_client.push_tasks(notes, overwrite=args.overwrite)

Expand Down
9 changes: 9 additions & 0 deletions cumulus_etl/upload_notes/labelstudio.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,16 @@ def _format_philter_span(self, count: int, start: int, end: int, note: LabelStud
}

def _update_used_labels(self, task: dict, used_labels: Iterable[str]) -> None:
uses_dynamic_labels = False
if self._labels_config.get("dynamic_labels"):
# Old versions of Label Studio set this nice field for us.
uses_dynamic_labels = True
elif self._labels_config.get("labels") == []:
# Newer versions of Label Studio seem to just pass an empty labels list.
# (An empty list would not normally be allowed in a static setup.)
uses_dynamic_labels = True

if uses_dynamic_labels:
# This path supports configs like <Labels name="label" toName="text" value="$label"/> where the labels
# can be dynamically set by us.
#
Expand Down
2 changes: 1 addition & 1 deletion cumulus_etl/upload_notes/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def select_docrefs_from_files(

# Read all input documents, filtering along the way
with common.NdjsonWriter(output_file_path) as output_file:
resources = common.read_resource_ndjson(root_input, "DocumentReference")
resources = common.read_resource_ndjson(root_input, "DocumentReference", warn_if_empty=True)
for docref in docref_filter(resources):
output_file.write(docref)

Expand Down
5 changes: 3 additions & 2 deletions docs/chart-review.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ Launch those before you begin:

```shell
export UMLS_API_KEY=your-umls-api-key
docker compose --profile upload-notes up -d
docker compose -f $CUMULUS_REPO_PATH/compose.yaml --profile upload-notes up -d
```

Or if you have access to a GPU,
Expand All @@ -64,7 +64,8 @@ Additionally, there are two required Label Studio parameters:

Taken altogether, here is an example minimal `upload-notes` command:
```sh
docker compose run \
docker compose -f $CUMULUS_REPO_PATH/compose.yaml \
run --rm \
--volume /local/path:/in \
cumulus-etl \
upload-notes \
Expand Down
27 changes: 25 additions & 2 deletions tests/upload_notes/test_upload_labelstudio.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ def test_no_predictions(self):
)

@ddt.data("Choices", "Labels")
def test_dynamic_labels(self, label_type):
def test_dynamic_labels_old(self, label_type):
"""Verify old-style dynamic labels config"""
self.ls_project.parsed_label_config = {
"mylabel": {"type": label_type, "to_name": ["mytext"], "dynamic_labels": True},
}
Expand All @@ -192,9 +193,31 @@ def test_dynamic_labels(self, label_type):
self.get_pushed_task()["data"],
)

@ddt.data("Choices", "Labels")
def test_dynamic_labels_new(self, label_type):
"""Verify new-style dynamic labels config"""
self.ls_project.parsed_label_config = {
"mylabel": {"type": label_type, "to_name": ["mytext"], "labels": []},
}
self.push_tasks(self.make_note())
self.assertEqual(
{
"text": "Normal note text",
"enc_id": "enc",
"anon_id": "enc-anon",
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
"mylabel": [
{"value": "Itch"},
{"value": "Nausea"},
],
},
self.get_pushed_task()["data"],
)

def test_dynamic_labels_no_predictions(self):
self.ls_project.parsed_label_config = {
"mylabel": {"type": "Labels", "to_name": ["mytext"], "dynamic_labels": True},
"mylabel": {"type": "Labels", "to_name": ["mytext"], "labels": []},
}
self.push_tasks(self.make_note(ctakes=False, philter_label=False))
self.assertEqual(
Expand Down

0 comments on commit c0c4a71

Please sign in to comment.