diff --git a/cumulus_etl/deid/philter.py b/cumulus_etl/deid/philter.py index 1dd7acc..92304a0 100644 --- a/cumulus_etl/deid/philter.py +++ b/cumulus_etl/deid/philter.py @@ -16,7 +16,7 @@ class Philter: def __init__(self): # Ensure all the nltk data that our filter_config (below) needs is available. # In docker deployments, these should already be shipped with our docker image. - nltk.download("averaged_perceptron_tagger", quiet=True) + nltk.download("averaged_perceptron_tagger_eng", quiet=True) # philter-lite does not seem to have any easy way to reference this default config...? filter_config = os.path.join(os.path.dirname(__file__), "philter-config.toml") diff --git a/docs/nlp.md b/docs/nlp.md index deebb93..d5474bb 100644 --- a/docs/nlp.md +++ b/docs/nlp.md @@ -116,10 +116,6 @@ One additional challenge with cloud LLMs is reproducibility, but recording metadata like the current time and vendor version in the database along with the results can at least help explain changes over time. -{: .note } -Cloud LLM support has not yet been prioritized, and none are currently supported. -But if a new study did need to talk to a specific vendor, we know how we would integrate it. - ### cTAKES [Apache cTAKES](https://ctakes.apache.org/) is a tried and true method of tagging symptoms in text. diff --git a/pyproject.toml b/pyproject.toml index 5ed98e8..4a303aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "inscriptis < 3", "jwcrypto < 2", "label-studio-sdk < 2", + "nltk >= 3.9, < 4", "openai < 2", "oracledb < 3", "philter-lite < 1",