Skip to content

Commit

Permalink
Merge pull request #321 from smart-on-fhir/mikix/upload-notes-fix
Browse files Browse the repository at this point in the history
fix(upload-notes): allow uploading to dynamic label setups again
  • Loading branch information
mikix authored Jun 7, 2024
2 parents fa64d81 + c0c4a71 commit d3bf8ea
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 14 deletions.
11 changes: 7 additions & 4 deletions cumulus_etl/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,13 @@ def get_temp_dir(subdir: str) -> str:
###############################################################################


def ls_resources(root: store.Root, resource: str) -> list[str]:
def ls_resources(root: store.Root, resource: str, warn_if_empty: bool = False) -> list[str]:
pattern = re.compile(rf".*/([0-9]+\.)?{resource}(\.[^/]+)?\.ndjson")
all_files = root.ls()
return sorted(filter(pattern.match, all_files))
found_files = sorted(filter(pattern.match, all_files))
if not found_files and warn_if_empty:
logging.warning("No %s files found in %s", resource, root.path)
return found_files


###############################################################################
Expand Down Expand Up @@ -166,13 +169,13 @@ def read_ndjson(path: str) -> Iterator[dict]:
yield json.loads(line)


def read_resource_ndjson(root: store.Root, resource: str) -> Iterator[dict]:
def read_resource_ndjson(root: store.Root, resource: str, warn_if_empty: bool = False) -> Iterator[dict]:
"""
Grabs all ndjson files from a folder, of a particular resource type.
Supports filenames like Condition.ndjson, Condition.000.ndjson, or 1.Condition.ndjson.
"""
for filename in ls_resources(root, resource):
for filename in ls_resources(root, resource, warn_if_empty=warn_if_empty):
yield from read_ndjson(filename)


Expand Down
5 changes: 1 addition & 4 deletions cumulus_etl/loaders/fhir/ndjson_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Ndjson FHIR loader"""

import logging
import tempfile

from cumulus_etl import cli_utils, common, errors, fhir, store
Expand Down Expand Up @@ -72,11 +71,9 @@ async def load_all(self, resources: list[str]) -> common.Directory:
print("Copying ndjson input files…")
tmpdir = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with
for resource in resources:
filenames = common.ls_resources(input_root, resource)
filenames = common.ls_resources(input_root, resource, warn_if_empty=True)
for filename in filenames:
input_root.get(filename, f"{tmpdir.name}/")
if not filenames:
logging.warning("No resources found for %s", resource)
return tmpdir

async def _load_from_bulk_export(self, resources: list[str]) -> common.Directory:
Expand Down
2 changes: 1 addition & 1 deletion cumulus_etl/upload_notes/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
def push_to_label_studio(
notes: Collection[LabelStudioNote], access_token: str, labels: dict, args: argparse.Namespace
) -> None:
common.print_header("Pushing notes to Label Studio...")
common.print_header(f"Pushing {len(notes)} charts to Label Studio...")
ls_client = LabelStudioClient(args.label_studio_url, access_token, args.ls_project, labels)
ls_client.push_tasks(notes, overwrite=args.overwrite)

Expand Down
9 changes: 9 additions & 0 deletions cumulus_etl/upload_notes/labelstudio.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,16 @@ def _format_philter_span(self, count: int, start: int, end: int, note: LabelStud
}

def _update_used_labels(self, task: dict, used_labels: Iterable[str]) -> None:
uses_dynamic_labels = False
if self._labels_config.get("dynamic_labels"):
# Old versions of Label Studio set this nice field for us.
uses_dynamic_labels = True
elif self._labels_config.get("labels") == []:
# Newer versions of Label Studio seem to just pass an empty labels list.
# (An empty list would not normally be allowed in a static setup.)
uses_dynamic_labels = True

if uses_dynamic_labels:
# This path supports configs like <Labels name="label" toName="text" value="$label"/> where the labels
# can be dynamically set by us.
#
Expand Down
2 changes: 1 addition & 1 deletion cumulus_etl/upload_notes/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def select_docrefs_from_files(

# Read all input documents, filtering along the way
with common.NdjsonWriter(output_file_path) as output_file:
resources = common.read_resource_ndjson(root_input, "DocumentReference")
resources = common.read_resource_ndjson(root_input, "DocumentReference", warn_if_empty=True)
for docref in docref_filter(resources):
output_file.write(docref)

Expand Down
5 changes: 3 additions & 2 deletions docs/chart-review.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ Launch those before you begin:

```shell
export UMLS_API_KEY=your-umls-api-key
docker compose --profile upload-notes up -d
docker compose -f $CUMULUS_REPO_PATH/compose.yaml --profile upload-notes up -d
```

Or if you have access to a GPU,
Expand All @@ -64,7 +64,8 @@ Additionally, there are two required Label Studio parameters:

Taken altogether, here is an example minimal `upload-notes` command:
```sh
docker compose run \
docker compose -f $CUMULUS_REPO_PATH/compose.yaml \
run --rm \
--volume /local/path:/in \
cumulus-etl \
upload-notes \
Expand Down
27 changes: 25 additions & 2 deletions tests/upload_notes/test_upload_labelstudio.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ def test_no_predictions(self):
)

@ddt.data("Choices", "Labels")
def test_dynamic_labels(self, label_type):
def test_dynamic_labels_old(self, label_type):
"""Verify old-style dynamic labels config"""
self.ls_project.parsed_label_config = {
"mylabel": {"type": label_type, "to_name": ["mytext"], "dynamic_labels": True},
}
Expand All @@ -192,9 +193,31 @@ def test_dynamic_labels(self, label_type):
self.get_pushed_task()["data"],
)

@ddt.data("Choices", "Labels")
def test_dynamic_labels_new(self, label_type):
"""Verify new-style dynamic labels config"""
self.ls_project.parsed_label_config = {
"mylabel": {"type": label_type, "to_name": ["mytext"], "labels": []},
}
self.push_tasks(self.make_note())
self.assertEqual(
{
"text": "Normal note text",
"enc_id": "enc",
"anon_id": "enc-anon",
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
"mylabel": [
{"value": "Itch"},
{"value": "Nausea"},
],
},
self.get_pushed_task()["data"],
)

def test_dynamic_labels_no_predictions(self):
self.ls_project.parsed_label_config = {
"mylabel": {"type": "Labels", "to_name": ["mytext"], "dynamic_labels": True},
"mylabel": {"type": "Labels", "to_name": ["mytext"], "labels": []},
}
self.push_tasks(self.make_note(ctakes=False, philter_label=False))
self.assertEqual(
Expand Down

0 comments on commit d3bf8ea

Please sign in to comment.