From 6331ef2770a39f7948451c3c868a112886f01fc8 Mon Sep 17 00:00:00 2001
From: Joe Flack
Date: Wed, 24 Apr 2024 18:32:39 -0400
Subject: [PATCH] Bugfix: DB refresh - Python version

- Python 3.9-3.10 were dropped from the macOS runner images, so the DB
  refresh (objects API) workflow now runs on Ubuntu instead. It hasn't needed
  macOS in a long time, if ever.
- The DB refresh (datasets API) workflow was moved to Python 3.11. It remains
  on the macOS runner.
- Deleted some obsolete comments.

General
- Code style, documentation, and comment/todo tweaks in dataset_upload.py
---
 .github/workflows/backend_dev.yml             |  173 +-
 .github/workflows/backend_prod.yml            |  175 +-
 .github/workflows/kill_idle_cons.yml          |    4 +-
 .github/workflows/refresh_counts.yml          |    1 -
 .github/workflows/refresh_db.yml              |    7 +-
 .github/workflows/refresh_from_datasets.yml   |    4 +-
 .github/workflows/refresh_voc.yml             |    1 -
 .../resolve_fetch_failures_0_members.yml      |    1 -
 .../resolve_fetch_failures_excess_items.yml   |    1 -
 enclave_wrangler/dataset_upload.py            | 2401 ++++++++----------
 10 files changed, 1377 insertions(+), 1391 deletions(-)

diff --git a/.github/workflows/backend_dev.yml b/.github/workflows/backend_dev.yml
index a6e0326fc..8bfa8508c 100644
--- a/.github/workflows/backend_dev.yml
+++ b/.github/workflows/backend_dev.yml
@@ -1,88 +1,87 @@
-# Docs for the Azure Web Apps Deploy action: https://github.com/Azure/webapps-deploy
-# More GitHub Actions for Azure: https://github.com/Azure/actions
-# More info on Python, GitHub Actions, and Azure App Service: https://aka.ms/python-webapps-actions
-# A good guide for Python Azure action: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html
-# The actual command that runs to initiate our servers on dev/prod isn't shown in the GH action. Instead, go to the following URL, and then click the "General Settings" tab:
-# Dev: https://portal.azure.com/#@live.johnshopkins.edu/resource/subscriptions/fe24df19-d251-4821-9a6f-f037c93d7e47/resourceGroups/jh-termhub-webapp-rg/providers/Microsoft.Web/sites/termhub/slots/dev/configuration
-# Prod: https://portal.azure.com/#@live.johnshopkins.edu/resource/subscriptions/fe24df19-d251-4821-9a6f-f037c93d7e47/resourceGroups/JH-TERMHUB-WEBAPP-RG/providers/Microsoft.Web/sites/termhub/configuration
-name: Backend dev - build and deploy
-
-on:
-#  push:
-#    branches:
-#      - develop
-#  pull_request:
-#    types: [opened, synchronize, reopened, closed]
-#    branches:
-#      - main
-#      - develop
-  workflow_dispatch:
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          submodules: true
-      - name: Set up Python version
-        uses: actions/setup-python@v2
-        with:
-          # Consider '3.10' or 'v3.10.0': https://github.com/actions/setup-python/issues/160
-          python-version: '3.9'
-      - name: Print commit hash & branch for rollbacks & troubleshooting
-        run: |
-          echo "Commit hash: ${{ github.sha }}"
-          echo "Branch: ${{ github.ref }}"
-      - name: 'Create env file'
-        run: |
-          mkdir env
-          echo "${{ secrets.ENV_FILE }}" > env/.env
-          echo "HOSTENV=dev" >> env/.env
-      - name: Create and start virtual environment
-        run: |
-          python3 -m venv venv
-          source venv/bin/activate
-      - name: Git Large File Store
-        run: |
-          git lfs install
-          cd termhub-vocab
-          git lfs pull
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install --upgrade wheel
-          pip install --upgrade setuptools
-          pip install -r requirements.txt
-# todo: optional: run tests
-#      - name: Run tests
-#        run: python -m unittest discover -v
-      # About:
https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html - - name: Upload artifact for deployment jobs - uses: actions/upload-artifact@v2 - with: - name: python-app - path: | - . - !venv/ - - deploy: - runs-on: ubuntu-latest - needs: build - environment: - name: 'Backend: Development' - url: ${{ steps.deploy-to-webapp.outputs.webapp-url }} - steps: - # About: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html - - name: Download artifact from build job - uses: actions/download-artifact@v2 - with: - name: python-app - path: . - - name: 'Deploy to Azure Web App' - uses: azure/webapps-deploy@v2 - id: deploy-to-webapp - with: - app-name: 'termhub' - slot-name: 'dev' +# Docs for the Azure Web Apps Deploy action: https://github.com/Azure/webapps-deploy +# More GitHub Actions for Azure: https://github.com/Azure/actions +# More info on Python, GitHub Actions, and Azure App Service: https://aka.ms/python-webapps-actions +# A good guide for Python Azure action: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html +# The actual command that runs to initiate our servers on dev/prod isn't shown in the GH action. Instead, go to the following URL, and then click the "General Settings" tab: +# Dev: https://portal.azure.com/#@live.johnshopkins.edu/resource/subscriptions/fe24df19-d251-4821-9a6f-f037c93d7e47/resourceGroups/jh-termhub-webapp-rg/providers/Microsoft.Web/sites/termhub/slots/dev/configuration +# Prod: https://portal.azure.com/#@live.johnshopkins.edu/resource/subscriptions/fe24df19-d251-4821-9a6f-f037c93d7e47/resourceGroups/JH-TERMHUB-WEBAPP-RG/providers/Microsoft.Web/sites/termhub/configuration +name: Backend dev - build and deploy + +on: +# push: +# branches: +# - develop +# pull_request: +# types: [opened, synchronize, reopened, closed] +# branches: +# - main +# - develop + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Set up Python version + uses: actions/setup-python@v2 + with: + python-version: '3.9' + - name: Print commit hash & branch for rollbacks & troubleshooting + run: | + echo "Commit hash: ${{ github.sha }}" + echo "Branch: ${{ github.ref }}" + - name: 'Create env file' + run: | + mkdir env + echo "${{ secrets.ENV_FILE }}" > env/.env + echo "HOSTENV=dev" >> env/.env + - name: Create and start virtual environment + run: | + python3 -m venv venv + source venv/bin/activate + - name: Git Large File Store + run: | + git lfs install + cd termhub-vocab + git lfs pull + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install --upgrade wheel + pip install --upgrade setuptools + pip install -r requirements.txt +# todo: optional: run tests +# - name: Run tests +# run: python -m unittest discover -v + # About: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html + - name: Upload artifact for deployment jobs + uses: actions/upload-artifact@v2 + with: + name: python-app + path: | + . + !venv/ + + deploy: + runs-on: ubuntu-latest + needs: build + environment: + name: 'Backend: Development' + url: ${{ steps.deploy-to-webapp.outputs.webapp-url }} + steps: + # About: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html + - name: Download artifact from build job + uses: actions/download-artifact@v2 + with: + name: python-app + path: . 
+ - name: 'Deploy to Azure Web App' + uses: azure/webapps-deploy@v2 + id: deploy-to-webapp + with: + app-name: 'termhub' + slot-name: 'dev' publish-profile: ${{ secrets.AZUREAPPSERVICE_PUBLISHPROFILE_DEV }} \ No newline at end of file diff --git a/.github/workflows/backend_prod.yml b/.github/workflows/backend_prod.yml index aed0a4344..3024cff12 100644 --- a/.github/workflows/backend_prod.yml +++ b/.github/workflows/backend_prod.yml @@ -1,88 +1,87 @@ -# Docs for the Azure Web Apps Deploy action: https://github.com/Azure/webapps-deploy -# More GitHub Actions for Azure: https://github.com/Azure/actions -# More info on Python, GitHub Actions, and Azure App Service: https://aka.ms/python-webapps-actions -# A good guide for Python Azure action: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html -# The actual command that runs to initiate our servers on dev/prod isn't shown in the GH action. Instead, go to the following URL, and then click the "General Settings" tab: -# Dev: https://portal.azure.com/#@live.johnshopkins.edu/resource/subscriptions/fe24df19-d251-4821-9a6f-f037c93d7e47/resourceGroups/jh-termhub-webapp-rg/providers/Microsoft.Web/sites/termhub/slots/dev/configuration -# Prod: https://portal.azure.com/#@live.johnshopkins.edu/resource/subscriptions/fe24df19-d251-4821-9a6f-f037c93d7e47/resourceGroups/JH-TERMHUB-WEBAPP-RG/providers/Microsoft.Web/sites/termhub/configuration -name: Backend prod - build and deploy - -# I believe this will do the deploy when merged. We don't want it to deploy prod when PR opened -on: - push: - branches: - - main -# pull_request: -# types: [opened, synchronize, reopened, closed] -# branches: -# - main - workflow_dispatch: - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - with: - submodules: true - - name: Set up Python version - uses: actions/setup-python@v2 - with: - # Consider '3.10' or 'v3.10.0': https://github.com/actions/setup-python/issues/160 - python-version: '3.9' - - name: Print commit hash & branch for rollbacks & troubleshooting - run: | - echo "Commit hash: ${{ github.sha }}" - echo "Branch: ${{ github.ref }}" - - name: 'Create env file' - run: | - mkdir env - echo "${{ secrets.ENV_FILE }}" > env/.env - echo "HOSTENV=prod" >> env/.env - - name: Create and start virtual environment - run: | - python3 -m venv venv - source venv/bin/activate - - name: Git Large File Store - run: | - git lfs install - cd termhub-vocab - git lfs pull - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install --upgrade wheel - pip install --upgrade setuptools - pip install -r requirements.txt -# todo: optional: run tests -# - name: Run tests -# run: python -m unittest discover -v - # About: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html - - name: Upload artifact for deployment jobs - uses: actions/upload-artifact@v2 - with: - name: python-app - path: | - . - !venv/ - - deploy: - runs-on: ubuntu-latest - needs: build - environment: - name: 'Backend: Production' - url: ${{ steps.deploy-to-webapp.outputs.webapp-url }} - steps: - # About: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html - - name: Download artifact from build job - uses: actions/download-artifact@v2 - with: - name: python-app - path: . 
- - name: 'Deploy to Azure Web App' - uses: azure/webapps-deploy@v2 - id: deploy-to-webapp - with: - app-name: 'termhub' - slot-name: 'production' - publish-profile: ${{ secrets.AZUREAPPSERVICE_PUBLISHPROFILE_01B978E0A1074AF5B9757FB5907ED5D2 }} +# Docs for the Azure Web Apps Deploy action: https://github.com/Azure/webapps-deploy +# More GitHub Actions for Azure: https://github.com/Azure/actions +# More info on Python, GitHub Actions, and Azure App Service: https://aka.ms/python-webapps-actions +# A good guide for Python Azure action: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html +# The actual command that runs to initiate our servers on dev/prod isn't shown in the GH action. Instead, go to the following URL, and then click the "General Settings" tab: +# Dev: https://portal.azure.com/#@live.johnshopkins.edu/resource/subscriptions/fe24df19-d251-4821-9a6f-f037c93d7e47/resourceGroups/jh-termhub-webapp-rg/providers/Microsoft.Web/sites/termhub/slots/dev/configuration +# Prod: https://portal.azure.com/#@live.johnshopkins.edu/resource/subscriptions/fe24df19-d251-4821-9a6f-f037c93d7e47/resourceGroups/JH-TERMHUB-WEBAPP-RG/providers/Microsoft.Web/sites/termhub/configuration +name: Backend prod - build and deploy + +# I believe this will do the deploy when merged. We don't want it to deploy prod when PR opened +on: + push: + branches: + - main +# pull_request: +# types: [opened, synchronize, reopened, closed] +# branches: +# - main + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Set up Python version + uses: actions/setup-python@v2 + with: + python-version: '3.9' + - name: Print commit hash & branch for rollbacks & troubleshooting + run: | + echo "Commit hash: ${{ github.sha }}" + echo "Branch: ${{ github.ref }}" + - name: 'Create env file' + run: | + mkdir env + echo "${{ secrets.ENV_FILE }}" > env/.env + echo "HOSTENV=prod" >> env/.env + - name: Create and start virtual environment + run: | + python3 -m venv venv + source venv/bin/activate + - name: Git Large File Store + run: | + git lfs install + cd termhub-vocab + git lfs pull + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install --upgrade wheel + pip install --upgrade setuptools + pip install -r requirements.txt +# todo: optional: run tests +# - name: Run tests +# run: python -m unittest discover -v + # About: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html + - name: Upload artifact for deployment jobs + uses: actions/upload-artifact@v2 + with: + name: python-app + path: | + . + !venv/ + + deploy: + runs-on: ubuntu-latest + needs: build + environment: + name: 'Backend: Production' + url: ${{ steps.deploy-to-webapp.outputs.webapp-url }} + steps: + # About: https://azure.github.io/AppService/2020/12/11/cicd-for-python-apps.html + - name: Download artifact from build job + uses: actions/download-artifact@v2 + with: + name: python-app + path: . 
+ - name: 'Deploy to Azure Web App' + uses: azure/webapps-deploy@v2 + id: deploy-to-webapp + with: + app-name: 'termhub' + slot-name: 'production' + publish-profile: ${{ secrets.AZUREAPPSERVICE_PUBLISHPROFILE_01B978E0A1074AF5B9757FB5907ED5D2 }} diff --git a/.github/workflows/kill_idle_cons.yml b/.github/workflows/kill_idle_cons.yml index 0225cba24..6f4198eac 100644 --- a/.github/workflows/kill_idle_cons.yml +++ b/.github/workflows/kill_idle_cons.yml @@ -16,9 +16,7 @@ jobs: - name: Set up Python version uses: actions/setup-python@v2 with: - # Consider '3.10' or 'v3.10.0': https://github.com/actions/setup-python/issues/160 - python-version: '3.9' # Not sure why 3.9.7 here and 3.9 elsewhere. the .7 works on Mac, but not on Ubuntu - + python-version: '3.9' - name: 'Create env file' run: | mkdir env diff --git a/.github/workflows/refresh_counts.yml b/.github/workflows/refresh_counts.yml index cfc848bc1..4e2c7477c 100644 --- a/.github/workflows/refresh_counts.yml +++ b/.github/workflows/refresh_counts.yml @@ -21,7 +21,6 @@ jobs: - name: Set up Python version uses: actions/setup-python@v2 with: - # Consider '3.10' or 'v3.10.0': https://github.com/actions/setup-python/issues/160 python-version: '3.9' - name: Print commit hash & branch for rollbacks & troubleshooting run: | diff --git a/.github/workflows/refresh_db.yml b/.github/workflows/refresh_db.yml index 26cd2e076..d83355614 100644 --- a/.github/workflows/refresh_db.yml +++ b/.github/workflows/refresh_db.yml @@ -20,9 +20,7 @@ on: jobs: refresh-db: # runs-on: ubuntu-18.04 -# runs-on: ubuntu-latest # not sure which is more current when resolving merge conflicts - # todo: macos-latest: I think was just needed for datasets API because of high memory. should be able to switch back to ubuntu - runs-on: macos-latest + runs-on: ubuntu-latest steps: # Set up - name: Checkout repository and submodules @@ -30,8 +28,7 @@ jobs: - name: Set up Python version uses: actions/setup-python@v2 with: - # Consider '3.10' or 'v3.10.0': https://github.com/actions/setup-python/issues/160 - python-version: '3.9.7' # Not sure why 3.9.7 here and 3.9 elsewhere. 
the .7 works on mac, but not on ubuntu + python-version: '3.9' - name: Print commit hash & branch for rollbacks & troubleshooting run: | echo "Commit hash: ${{ github.sha }}" diff --git a/.github/workflows/refresh_from_datasets.yml b/.github/workflows/refresh_from_datasets.yml index 605d78c74..1f816edbb 100644 --- a/.github/workflows/refresh_from_datasets.yml +++ b/.github/workflows/refresh_from_datasets.yml @@ -15,9 +15,7 @@ jobs: - name: Set up Python version uses: actions/setup-python@v2 with: - # Consider '3.10' or 'v3.10.0': https://github.com/actions/setup-python/issues/160 - python-version: '3.9.7' - + python-version: '3.11' - name: Print commit hash & branch for rollbacks & troubleshooting run: | echo "Commit hash: ${{ github.sha }}" diff --git a/.github/workflows/refresh_voc.yml b/.github/workflows/refresh_voc.yml index 9fc2dba51..03fdc7ce0 100644 --- a/.github/workflows/refresh_voc.yml +++ b/.github/workflows/refresh_voc.yml @@ -21,7 +21,6 @@ jobs: - name: Set up Python version uses: actions/setup-python@v2 with: - # Consider '3.10' or 'v3.10.0': https://github.com/actions/setup-python/issues/160 python-version: '3.9' - name: Print commit hash & branch for rollbacks & troubleshooting run: | diff --git a/.github/workflows/resolve_fetch_failures_0_members.yml b/.github/workflows/resolve_fetch_failures_0_members.yml index 37fd6766d..9ca6c7fed 100644 --- a/.github/workflows/resolve_fetch_failures_0_members.yml +++ b/.github/workflows/resolve_fetch_failures_0_members.yml @@ -21,7 +21,6 @@ jobs: - name: Set up Python version uses: actions/setup-python@v2 with: - # Consider '3.10' or 'v3.10.0': https://github.com/actions/setup-python/issues/160 python-version: '3.9' - name: Print commit hash & branch for rollbacks & troubleshooting run: | diff --git a/.github/workflows/resolve_fetch_failures_excess_items.yml b/.github/workflows/resolve_fetch_failures_excess_items.yml index 793aa7abe..40cf1737b 100644 --- a/.github/workflows/resolve_fetch_failures_excess_items.yml +++ b/.github/workflows/resolve_fetch_failures_excess_items.yml @@ -17,7 +17,6 @@ jobs: - name: Set up Python version uses: actions/setup-python@v2 with: - # Consider '3.10' or 'v3.10.0': https://github.com/actions/setup-python/issues/160 python-version: '3.9' - name: Print commit hash & branch for rollbacks & troubleshooting run: | diff --git a/enclave_wrangler/dataset_upload.py b/enclave_wrangler/dataset_upload.py index ce76bdd3a..991fd8df6 100644 --- a/enclave_wrangler/dataset_upload.py +++ b/enclave_wrangler/dataset_upload.py @@ -1,1201 +1,1200 @@ -"""Upload datasets to Foundry""" -import json -import os -import urllib.parse -from argparse import ArgumentParser -from typing import Dict, List, Set, Union -from uuid import uuid4 -from time import sleep -from datetime import date, datetime -import pytz - -import pandas as pd -from requests import Response - -from enclave_wrangler.models import CsetContainer -from backend.db.utils import insert_from_dict, get_db_connection - -# TODO: See if sys.path(0) thing works; doesn't require maintenance. 
Look at unit tests
-try:
-    from enclave_wrangler.config import CSET_UPLOAD_REGISTRY_PATH, CSET_VERSION_MIN_ID, ENCLAVE_PROJECT_NAME, MOFFIT_PREFIX, \
-        MOFFIT_SOURCE_ID_TYPE, MOFFIT_SOURCE_URL, PALANTIR_ENCLAVE_USER_ID_1, UPLOADS_DIR, VALIDATE_FIRST, config, \
-        PROJECT_ROOT, \
-        TERMHUB_CSETS_DIR
-    from enclave_wrangler.actions_old_palantir3file_api import get_cs_container_data, get_cs_version_data, get_cs_version_expression_data, \
-        post_request_enclave_api_addExpressionItems, post_request_enclave_api_create_container, \
-        post_request_enclave_api_create_version, update_cs_version_expression_data_with_codesetid
-    from enclave_wrangler.actions_api import add_concepts_to_cset, finalize_concept_set_version, \
-        upload_concept_set_container, \
-        upload_concept_set_version_draft, get_concept_set_version_expression_items
-    from enclave_wrangler.utils import EnclaveWranglerErr, _datetime_palantir_format, log_debug_info, make_actions_request, get_random_codeset_id
-    from enclave_wrangler.objects_api import fetch_object_by_id, get_object_links, fetch_cset_version, fetch_cset_container, make_objects_request, get_researcher
-except ModuleNotFoundError:
-    # TODO: this isn't up to date, is it? (2023-10-16)
-    from config import CSET_UPLOAD_REGISTRY_PATH, ENCLAVE_PROJECT_NAME, MOFFIT_PREFIX, \
-        MOFFIT_SOURCE_ID_TYPE, MOFFIT_SOURCE_URL, PALANTIR_ENCLAVE_USER_ID_1, UPLOADS_DIR, config, PROJECT_ROOT, \
-        TERMHUB_CSETS_DIR, CSET_VERSION_MIN_ID
-    from actions_old_palantir3file_api import get_cs_container_data, get_cs_version_data, get_cs_version_expression_data, \
-        post_request_enclave_api_addExpressionItems, post_request_enclave_api_create_container, \
-        post_request_enclave_api_create_version, update_cs_version_expression_data_with_codesetid
-    from actions_api import add_concepts_to_cset, upload_concept_set_container, \
-        upload_concept_set_version_draft, get_concept_set_version_expression_items
-    from utils import _datetime_palantir_format, log_debug_info
-
-
-DEBUG = False
-
-
-# todo: separate function for validate?
-def process_csv():
-    """Some logic to consider:
-    - Imagine a CSV with 1 new container and 3 new versions. We can give them info about versions that exist, link to
-    the enclave, and confirm that they want to create the new versions. Then they can confirm or make changes and
-    revalidate.
-    """
-    pass
-
-
-# TODO: return 'outcome': success / error for (i) all the csets, (ii) each cset based on all response.status_code
-def upload_new_cset_version_with_concepts_from_csv(
-    path: str = None, df: pd.DataFrame = None, validate_first=False
-) -> Dict:
-    """
-    Upload from CSV
-    file format docs:
-    https://github.com/jhu-bids/TermHub/tree/develop/enclave_wrangler
-
-    testing from test/test_enclave_wrangler.py
-    using:
-    https://github.com/jhu-bids/TermHub/blob/develop/test/input/test_enclave_wrangler/test_dataset_upload/type-2-diabetes-mellitus.csv
-
-    :return A dictionary with cset name as key, and values are the responses from each enclave API call to fulfill
-    the request.
- """ - if not path and df is None: - raise RuntimeError('upload_new_cset_version_with_concepts_from_csv: Must provide path or dataframe') - df = df if df is not None else pd.read_csv(path).fillna('') - df = df[~df.eq('').all(axis=1)] # drop all empty rows - - cset_group_cols = ['concept_set_name', 'parent_version_codeset_id'] - more_cset_cols = list( - {'multipassId', 'current_max_version', 'domain_team', 'provenance', 'limitations', 'intention', - 'intended_research_project', 'authority'}.intersection(df.columns)) - concept_cols_required = ['concept_id', 'includeDescendants', 'isExcluded', 'includeMapped'] - concept_cols_optional = ['annotation'] - - responses = {} - csets = df.groupby(cset_group_cols) - for cset in csets: - key: tuple - key, csetdf = cset - - # TODO: From right about here, abstract this logic into new fun: cset_version_df_to_dict() - new_version = {} - cset_name = key[0] - new_version['concept_set_name'] = cset_name - new_version['parent_version_codeset_id'] = int(key[1]) - - first_row = csetdf[more_cset_cols].to_dict(orient='records')[0] - for c in more_cset_cols: - new_version[c] = first_row[c] - - new_version['on_behalf_of'] = new_version['multipassId'] - del new_version['multipassId'] - - selectable_cols = concept_cols_required + [x for x in concept_cols_optional if x in list(csetdf.columns)] - try: - new_version['omop_concepts'] = csetdf[selectable_cols].to_dict(orient='records') - except KeyError as e: - raise EnclaveWranglerErr(str(e)) - for x in new_version['omop_concepts']: - x['concept_id'] = int(x['concept_id']) - - # TODO: for now going to assume that if the concept set exists, they'll give us a parent_version_codeset_id - # but they might give us the name and not the parent id. if they do give us the name but the - # name already exists, two possibilities: 1) they just want to add a new version using the most - # recent version as the parent but didn't bother to find out what it was and include in csv, or - # 2) it's a mistaken duplicate name and they should get an error and have to change the name - # FIX SOMEHOW!!! - - if new_version['parent_version_codeset_id']: # creating new version, setting parent to existing version - responses_i: Dict = upload_new_cset_version_with_concepts(**new_version, validate_first=validate_first) - # TODO: since test/test_enclave_wrangler.py:test_upload() is only expecting one - # result, just returning the first result. but csvs allow multiple, so FIX THIS!!!! - else: - responses_i: Dict = upload_new_container_with_concepts(**new_version, validate_first=validate_first) - responses[cset_name] = responses_i - - # TODO: put something helpful back here but that doesn't cause errors - # print('INFO: ' + cset_name + ': ', str(responses_i)) - print('INFO: ' + cset_name) - return responses - - -# TODO -def upload_new_cset_container_with_concepts_from_csv( - path: str = None, df: pd.DataFrame = None, validate_first=False -) -> Dict[str, Dict[str, Union[Response, List[Response]]]]: - """Upload new container(s) from CSV - - todo's: - - Validate / throw err if multiple `intention`, `research_project`, `assigned_sme`, `assigned_informatician` - """ - if not path and df is None: - raise RuntimeError('upload_new_cset_version_with_concepts_from_csv: Must provide path or dataframe') - df = df if df is not None else pd.read_csv(path).fillna('') - df = df[~df.eq('').all(axis=1)] # drop all empty rows - - # TODO: how about creating a dict or class of containers and versions from this logic? 
-    # implementation 2
-    cset_group_cols = ['concept_set_name', 'parent_version_codeset_id']
-    csets = df.groupby(cset_group_cols)
-    containers_with_versions: List[CsetContainer] = []
-    for cset in csets:
-        containers_with_versions.append(CsetContainer().from_dataframe(cset))
-
-    # implementation 1
-    container_dfs = {
-        cset_name: df[df['concept_set_name'] == cset_name]
-        for cset_name in df['concept_set_name'].unique()}
-    responses = {}
-    for cset_name, df in container_dfs.items():
-        first_row = dict(df.iloc[0])
-        # todo?: Convert DF rows to List[Dict], properly formatted
-        # todo: versions_with_concepts.omop_concepts: I say below I expect list[int], but that has changed to list[dict]
-        # todo: are these fields really all required? Probably not if they're not in the current csv. Are they there?
-        # 'omop_concept_ids': (List[int]) (required), <---- x
-        # 'provenance' (str) (required):
-        # 'concept_set_name' (str) (required):
-        # 'annotation' (str) (optional): Default:`'Curated value set: ' + version['concept_set_name']`
-        # 'limitations' (str) (required):
-        # 'intention' (str) (required):
-        # 'intended_research_project' (str) (optional): Default:`ENCLAVE_PROJECT_NAME`
-        # 'codeset_id' (int) (required):
-
-        # todo: just testing this logic
-        df['parent_version_codeset_id'] = ''
-        upload_new_cset_version_with_concepts_from_csv(df=df)
-
-        rows = df.to_dict(orient='records')
-        responses[cset_name] = upload_new_container_with_concepts(
-            concept_set_name=cset_name,
-            intention=first_row['intention'],
-            research_project=first_row['research_project'],
-            assigned_sme=first_row['assigned_sme'],
-            assigned_informatician=first_row['assigned_informatician'],
-            versions_with_concepts=[],  # TODOx
-            validate_first=validate_first)
-    return responses
-
-
-# TODO: What if this fails halfway through? Can we tear down any of the steps? (need to store random `codeset_id` too)
-# TODO: Need to do proper codeset_id assignment: (i) look up registry and get next available ID, (ii) assign it here,
-#  (iii) persist new ID / set to registry, (iv) persist new ID to any files passed through CLI, (v) return the new ID
-# todo: @Siggie: Do we want to change this to accept named params instead of a dictionary? - Joe 2022/12/05
-def upload_new_cset_version_with_concepts(
-    concept_set_name: str, omop_concepts: List[Dict],
-    provenance: str = "", limitations: str = "", intention: str = "", annotation: str = "",
-    parent_version_codeset_id: int = None, current_max_version: float = None,
-    intended_research_project: str = None, on_behalf_of: str = None, codeset_id: int = None,
-    validate_first=VALIDATE_FIRST, finalize=True  # maybe finalize should default to False?
-) -> Dict:
-    """Upload a concept set version along with its concepts.
-
-    # todo: Update this slightly now that this function accepts named params instead of a dict
-
-    'omop_concepts': [
-        {
-            'concept_id' (int) (required):
-            'includeDescendants' (bool) (required):
-            'isExcluded' (bool) (required):
-            'includeMapped' (bool) (required):
-            'annotation' (str) (optional):
-        }
-    ],
-    'provenance' (str) (required):
-    'concept_set_name' (str) (required):
-    'limitations' (str) (required):
-    'intention' (str) (required):
-    'annotation' (str) (optional): Default:`'Curated value set: ' + version['concept_set_name']`
-    'intended_research_project' (str) (optional): Default:`ENCLAVE_PROJECT_NAME`
-    'codeset_id' (int) (required): Default: Will be generated if not passed.
- } - - Example: - { - "omop_concepts": [ - { - "concept_id": 45259000, - "includeDescendants": true, - "isExcluded": false, - "includeMapped": true, - "annotation": "This is my concept annotation." - } - ], - "provenance": "Created through TermHub.", - "concept_set_name": "My test concept set", - "limitations": "", - "intention": "" - } - """ - # Handle missing params - if not codeset_id: - # todo: this is temporary until I handle registry persistence - codeset_id = get_random_codeset_id() - if not intended_research_project: - intended_research_project = ENCLAVE_PROJECT_NAME - - # Upload - response_upload_draft_concept_set: Response = upload_concept_set_version_draft( # code_set - base_version=parent_version_codeset_id, - current_max_version=current_max_version, - provenance=provenance, - concept_set=concept_set_name, # == container_d['concept_set_name'] - annotation=annotation, - limitations=limitations, - intention=intention, - intended_research_project=intended_research_project, - version_id=codeset_id, - on_behalf_of=on_behalf_of, - validate_first=validate_first) # == code_sets.codeset_id - sleep(8) - - response_upload_concepts: List[Response] = add_concepts_to_cset( - omop_concepts=omop_concepts, - version__codeset_id=codeset_id, - validate_first=validate_first) # == code_sets.codeset_id - - responses = { - 'upload_concept_set_version': response_upload_draft_concept_set, - 'add_concepts_to_cset': response_upload_concepts, - } - if finalize: - response_finalize_concept_set_version: Response = finalize_concept_set_version( - concept_set=concept_set_name, # == container_d['concept_set_name'] - version_id=codeset_id, - current_max_version=current_max_version, - provenance=provenance, - limitations=limitations, - validate_first=validate_first, - on_behalf_of=on_behalf_of) - responses['finalize_concept_set_version'] = response_finalize_concept_set_version - return { - 'responses': responses, - 'versionId': codeset_id - } - - -# TODO: support concept params, e.g. exclude_children -def upload_new_container_with_concepts( - on_behalf_of: str, concept_set_name: str, intention: str, versions_with_concepts: List[Dict], - research_project: str = ENCLAVE_PROJECT_NAME, assigned_sme: str = PALANTIR_ENCLAVE_USER_ID_1, - assigned_informatician: str = PALANTIR_ENCLAVE_USER_ID_1, validate_first=VALIDATE_FIRST, - fail_if_exists = True -) -> Dict[str, Union[Response, List[Response]]]: - """Upload a new concept set container, and 1+ concept set versions, along with their concepts. - - :param container (Dict): Has the following keys: - concept_set_name (str) (required): - intention (str) (required): - research_project (str) (required): Default:`ENCLAVE_PROJECT_NAME` - assigned_sme (str) (optional): Default:`PALANTIR_ENCLAVE_USER_ID_1` - assigned_informatician (str) (optional): Default:`PALANTIR_ENCLAVE_USER_ID_1` - :param versions_with_concepts (List[Dict]): Has the following schema: [ - TODO: @jflack, this doesn't seem to be what upload_new_cset_version_with_concepts - is really expecting. 
I'm changing omop_concept_ids to omop_concepts in upload_and_sync_rxnorm_cset - { - 'omop_concept_ids': (List[int]) (required), - 'provenance' (str) (required): - 'concept_set_name' (str) (required): - 'annotation' (str) (optional): Default:`'Curated value set: ' + version['concept_set_name']` - 'limitations' (str) (required): - 'intention' (str) (required): - 'intended_research_project' (str) (optional): Default:`ENCLAVE_PROJECT_NAME` - 'codeset_id' (int) (required): - } - ] - """ - if len(versions_with_concepts) != 1: - # code was written to allow multiple versions, but there's no good reason - # for that. So, just raising error if not exactly one. - raise ValueError("expecting a single version to upload with container") - - # Upload container - existing_container = fetch_cset_container(concept_set_name, fail_on_error=False) - if existing_container: - if fail_if_exists: - raise RuntimeError(f'Error attempting to upload container. Container [{concept_set_name}] already exists.') - version = get_object_links( - object_type='OMOPConceptSetContainer', - object_id=concept_set_name, - link_type='OMOPConceptSet', - return_type='data', - expect_single_item=True, - handle_paginated=False, - retry_if_empty=False) - # TODO: check that already existing version matches what was to be uploaded...? - if not version: # if container was already uploaded but not version, upload version now - version: Dict[str, Union[Response, List[Response]]] = \ - upload_new_cset_version_with_concepts(**(versions_with_concepts[0]), - validate_first=validate_first) - return {'already_exists': True, - 'upload_concept_set_container': existing_container, - 'upload_new_cset_version_with_concepts': version} - - response_upload_concept_set_container: Response = upload_concept_set_container( - on_behalf_of=on_behalf_of, - concept_set_id=concept_set_name, - intention=intention, - research_project=research_project, - assigned_sme=assigned_sme, - assigned_informatician=assigned_informatician, - validate_first=validate_first) - - # Upload version - version_response: Dict[str, Union[Response, List[Response]]] = \ - upload_new_cset_version_with_concepts(**(versions_with_concepts[0]), - validate_first=validate_first) - - return { - 'upload_concept_set_container': response_upload_concept_set_container, - 'upload_new_cset_version_with_concepts': version_response} - - - -# todo: need to have 2 variations: 1 for creates, and 1 for updates. w/ exception handling and proper error messaging -# todo: need to allow 'upload' based on if it is / isn't in the local termhub-csets/datasets/cset_upload_registry.csv -# todo: use CSET_VERSION_MIN_ID? right now I'm not sure. makes more sense to pre-assign before running this so that -# ...I can JOIN together. would need a separate function just to accept a single `linked_cset` as opposed to files -# ...that have multiple. otherwise, with multiple, there's no way to tell what concepts for what cset -def post_to_enclave_from_3csv( - input_csv_folder_path: str, enclave_api=['old_rid_based', 'default'][1], create_cset_container=True, - create_cset_versions=True -) -> Union[Dict, pd.DataFrame]: - """Uploads data to enclave and updates the following column in the input's code_sets.csv: - # todo: Siggie: updating these files is probably not worth / necessary right now. Joe: We might want to do at some point - - enclave_codeset_id - - enclave_codeset_id_updated_at - - concept_set_name - - :param enclave_api (str): One of 'old_rid_based' or 'default'. 
The 'old_rid_based' Enclave API is the one which - required RIDs for params rather than the param name, and there may be other differences. 'default' refers to the - newer Enclave API (as of 2022/10). - - # todo: after adding 'update' functionality, add params / update these docs - :param create_cset_container (bool): If true, will identify concept set containers from `input_csv_folder_path` and - make a POST request to enclave to create them. Only applicable to the new 'default' API. - :param create_cset_versions (bool): If true, will identify concept set versions from `input_csv_folder_path` and - make a POST request to enclave to create them. Only applicable to the new 'default' API. - """ - # Settings - if DEBUG: - log_debug_info() - # Validate - msg = 'post_to_enclave_from_3csv() has not yet been set up to only add ' \ - 'concepts. Must either pass `create_cset_container=True` to create ' \ - 'a new concept set container, `create_cset_versions=True` to create' \ - ' new concept set versions on an existing container, or both.' \ - ' Doing nothing.' - if not create_cset_container or create_cset_versions: - print(msg) - return {} - # Routing - if enclave_api == 'old_rid_based': - # todo: this doesn't return Response, so might want to change this functions return sig / val - df: pd.DataFrame = post_to_enclave_old_api(input_csv_folder_path) - return df - - # Read data - concept_set_container_edited_df = pd.read_csv( - os.path.join(input_csv_folder_path, 'concept_set_container_edited.csv')).fillna('') - concept_set_container_edited_d = json.loads(concept_set_container_edited_df.to_json(orient='records')) - code_sets_df = _load_standardized_input_df(os.path.join(input_csv_folder_path, 'code_sets.csv')) # .fillna('') - concept_set_version_item_rv_edited_df = pd.read_csv( - os.path.join(input_csv_folder_path, 'concept_set_version_item_rv_edited.csv')).fillna('') - - # Link data into single iterable - # temp: @Siggie: You mentioned to delete the internal_id we assigned, but I don't know how to link these together otherwise - linked_csets: Dict = {} - for container_d in concept_set_container_edited_d: - cset_name = container_d['concept_set_id'] - linked_csets[cset_name] = {'versions': {}} - linked_csets[cset_name]['container'] = container_d - code_sets_df_i = code_sets_df[code_sets_df['concept_set_name'] == cset_name] - code_sets_d_i = json.loads(code_sets_df_i.to_json(orient='records')) - for version_d in code_sets_d_i: - version_id = version_d['codeset_id'] # == code_sets.codeset_id - linked_csets[cset_name]['versions'][version_id] = {'items': []} - linked_csets[cset_name]['versions'][version_id]['version'] = version_d - concept_set_version_item_rv_edited_df_i = concept_set_version_item_rv_edited_df[ - concept_set_version_item_rv_edited_df['codeset_id'] == version_id] # == code_sets.codeset_id - concept_set_version_item_rv_edited_d_i = json.loads(concept_set_version_item_rv_edited_df_i.to_json(orient='records')) - for item_d in concept_set_version_item_rv_edited_d_i: - linked_csets[cset_name]['versions'][version_id]['omop_concept_ids'].append(item_d) - - # Upload to enclave - response_upload_new_container_with_concepts = {} - response_upload_concept_set: Union[None, Response] = None - response_upload_new_cset_version_with_concepts = {} - for linked_cset in list(linked_csets.values()): - container_d = linked_cset['container'] - if create_cset_container and create_cset_versions: - response_upload_new_container_with_concepts: Dict = upload_new_container_with_concepts( - container=container_d, - 
versions_with_concepts=linked_cset['versions'].values())
-    elif create_cset_container:
-        response_upload_concept_set: Response = upload_concept_set_container(  # concept_set_container
-            concept_set_id=container_d['concept_set_name'],
-            intention=container_d['intention'],
-            research_project=ENCLAVE_PROJECT_NAME,
-            assigned_sme=PALANTIR_ENCLAVE_USER_ID_1,
-            assigned_informatician=PALANTIR_ENCLAVE_USER_ID_1)
-    elif create_cset_versions:
-        response_upload_new_cset_version_with_concepts = []
-        for version in linked_cset['versions'].values():
-            version_d = version['version']
-            version_d['omop_concept_ids'] = version['omop_concept_ids']
-            response_i: Dict[str, Union[Response, List[Response]]] = upload_new_cset_version_with_concepts(**version_d)
-            response_upload_new_cset_version_with_concepts.append(response_i)
-
-    return {
-        'upload_new_container_with_concepts': response_upload_new_container_with_concepts,
-        'upload_concept_set': response_upload_concept_set,
-        'upload_new_cset_version_with_concepts': response_upload_new_cset_version_with_concepts}
-
-
-def post_to_enclave_old_api(input_csv_folder_path: str) -> pd.DataFrame:
-    """Uploads data to enclave and updates the following columns in the input's code_sets.csv:
-    - enclave_codeset_id
-    - enclave_codeset_id_updated_at
-    - concept_set_name
-    """
-    if DEBUG:
-        log_debug_info()
-
-    # Read data
-    code_sets_df = _load_standardized_input_df(os.path.join(input_csv_folder_path, 'code_sets.csv'))  # .fillna('')
-
-    # 0.1 Create mappings between
-    # - concept_set_container_edited.csv[concept_set_name], and...
-    # - code_sets.csv[codeset_id]
-    # use the concept_set_name as key to store the pre-made codeset_ids,
-    # store the codeset_ids in the premade_codeset_ids
-    cs_name_id_mappings = {}
-    for index, row in code_sets_df.iterrows():
-        cs_id = row['codeset_id']
-        cs_name = row['concept_set_name']
-        cs_name_id_mappings[cs_name] = cs_id
-    # 0.2 create a list of premade codeset ids
-    premade_codeset_ids = []
-    for index, row in code_sets_df.iterrows():
-        premade_codeset_ids.append(row['codeset_id'])
-
-    # I. Create structures
-    # I.0. concept_set_version_item_dict; key=codeset_id
-    concept_set_version_item_dict = {}
-    concept_set_version_item_rv_edited_df = pd.read_csv(
-        os.path.join(input_csv_folder_path, 'concept_set_version_item_rv_edited.csv')).fillna('')
-    for index, row in concept_set_version_item_rv_edited_df.iterrows():
-        key = row['codeset_id']
-        if key not in concept_set_version_item_dict:
-            concept_set_version_item_dict[key] = []
-        concept_set_version_item_dict[key].append(row)
-    # cs_result = pd.merge(code_sets_df, concept_set_version_item_rv_edited_df, on=['codeset_id'])
-
-    # I.1. build the list of container json data; key=codeset_id
-    # I.1.ii. Get the actual container data
-    concept_set_container_edited_json_all_rows = {}
-    concept_set_container_edited_df = pd.read_csv(
-        os.path.join(input_csv_folder_path, 'concept_set_container_edited.csv')).fillna('')
-    for index, row in concept_set_container_edited_df.iterrows():
-        cs_name = row['concept_set_name'].strip()
-        single_row = get_cs_container_data(cs_name)
-        cs_id = cs_name_id_mappings[cs_name]
-        concept_set_container_edited_json_all_rows[cs_id] = single_row
-
-    # I.2. build the list of cs version json data; key=codeset_id
-    code_set_version_json_all_rows = {}
-    # code_sets_df = pd.read_csv(os.path.join(input_csv_folder_path, 'code_sets.csv')).fillna('')
-    for index, row in code_sets_df.iterrows():
-        cs_id = row['codeset_id']
-        cs_name = row['concept_set_name'].strip()
-        cs_intention = row['intention'].strip()
-        cs_limitations = row['limitations'].strip()
-        cs_update_msg = row['update_message'].strip()
-        cs_authority = row.get('authority', '').strip()
-        ##cs_authority = "Mathematica"  ## TODO: shong, 4/26/22, code_sets.csv needs to build the authority value; uncomment when available
-        cs_provenance = row['provenance'].strip()
-        single_row = get_cs_version_data(cs_name, cs_id, cs_intention, cs_limitations, cs_update_msg, cs_provenance, cs_authority)
-        # cs_name, cs_id, intention, limitation, update_msg, status, provenance
-        code_set_version_json_all_rows[cs_id] = single_row
-        # code_set_version_json_all_rows_dict[codeset_id] = single_row
-
-    # I.3. Build all of the Expression items to add to a CS version; key=codeset_id,
-    # codeset_id = pre-made codeset_id to iterate through the codesets
-    code_set_expression_items_json_all_rows = {}
-    for index, row in code_sets_df.iterrows():
-        current_code_set_id = row['codeset_id']
-        # build the code and codeSystem list for the current codeSet
-        # reset the code list
-        code_list = []
-        cs_name = row['concept_set_name'].strip()
-        # code and code system list
-        concept_set_version_item_rows = concept_set_version_item_dict[current_code_set_id]
-        # always use the same entry from the first set, as currently
-        # we do not have support to save these flags per expressionItem
-        concept_set_version_item_row1 = concept_set_version_item_rows[0]
-        exclude = concept_set_version_item_row1['isExcluded']
-        descendents = concept_set_version_item_row1['includeDescendants']
-        mapped = concept_set_version_item_row1['includeMapped']
-        annotation = concept_set_version_item_row1['annotation'].strip()
-
-        for concept_set_version_item_row in concept_set_version_item_rows:
-            code_codesystem_pair = concept_set_version_item_row['codeSystem'] + ":" + str(concept_set_version_item_row['code'])
-            code_list.append(code_codesystem_pair)
-            # to-do(future): Right now, the API doesn't expect variation between the following 4 values among
-            # ...concept set items, so right now we can just take any of the rows and use its values. But, in
-            # ...the future, when there is variation, we may need to do some update here. - Joe 2022/02/04
-            # this is the same limitation that the OMOP concept expression has, so for now it is sufficient
-            # we can explore more granular control later if necessary -Stephanie 02/05/2022
-
-        # now that we have the code list, generate the json for the versionExpression data
-        single_row = get_cs_version_expression_data(
-            current_code_set_id, cs_name, code_list, exclude, descendents, mapped, annotation)
-        code_set_expression_items_json_all_rows[current_code_set_id] = single_row
-        # code_set_expression_items_json_all_rows_dict[codeset_id] = single_row
-
-    # --- Steph: check the failed cases 2/15/22-----------------------------
-    # @Steph: Can we remove this block yet? I don't mind keeping it for a while for convenience, but I like the practice
-    # ...of keeping code like this inside a copy of the file in a git-ignored folder. - Joe 2022/03/15
-    # begin test code
-    # for premade_codeset_id in premade_codeset_ids:
-    #     # we have problem with very large code list - 16(3623) and 39(5104) 58(1820) 73(6791) 74(1501)
-    #     upd_cs_ver_expression_items_dict1 = code_set_expression_items_json_all_rows[premade_codeset_id]
-    #     codeStringList1 = upd_cs_ver_expression_items_dict1['parameters'][
-    #         'ri.actions.main.parameter.c9a1b531-86ef-4f80-80a5-cc774d2e4c33']['stringList']['strings']
-    #     print( str(premade_codeset_id) + "codelistLength: " + str(len(codeStringList1)))
-    #     print('------')
-    # end test code ------------------------------- #
-
-    # II. call the REST APIs to create them on the Enclave
-    # ...now that we have all the data for the concept sets created
-    # temp_testing_cset_id = 1000000326  # Stephanie said this was a draft or archived set - Joe 2022/03/15
-    for premade_codeset_id in premade_codeset_ids:
-        # TODO: temporary debug code to look for missing concept container not showing in the UI
-        # TODO: debug code for adding expressionItems to missing container from UI, l162,l163
-
-        # if premade_codeset_id != temp_testing_cset_id:
-        #     continue
-
-        # Do a test first using 'validate'
-        header = {
-            "authorization": f"Bearer {config['PALANTIR_ENCLAVE_AUTHENTICATION_BEARER_TOKEN']}",
-            'content-type': 'application/json'
-        }
-
-        container_data_dict = concept_set_container_edited_json_all_rows[premade_codeset_id]
-        # noinspection PyUnusedLocal
-        # response_json = post_request_enclave_api(api_url, header, test_data_dict)
-        # create concept set container:
-        # ----- container_data_dict['parameters']['ri.actions.main.parameter.1b5cd6e9-b220-4551-b97d-245b9fa86807']
-        # 'ri.actions.main.parameter.1b5cd6e9-b220-4551-b97d-245b9fa86807': {
-        #     'type': 'string', 'string': '[VSAC] Eclampsia'},
-        # if DEBUG:
-        #     csContainerName = container_data_dict[
-        #         'parameters']['ri.actions.main.parameter.1b5cd6e9-b220-4551-b97d-245b9fa86807']['string']
-        #     print(csContainerName)
-        #     print('------------------------------')
-        response_json = post_request_enclave_api_create_container(header, container_data_dict)
-        # i.e. the container object may already exist, but we can create another version within a container:
-        # {'errorCode': 'INVALID_ARGUMENT', 'errorName': 'Actions:ObjectsAlreadyExist',
-        # 'errorInstanceId': '96fb2188-1947-4004-a7b3-d0572a5a0008', 'parameters': {'objectLocators':
-        # '[ObjectLocator{objectTypeId: omop-concept-set-container, primaryKey: {concept_set_id=PrimaryKeyValue{
-        # value: StringWrapper{value: [VSAC] Mental Behavioral and Neurodevelopmental Disorders}}}}]'}}
-        # Validate 2: Concept set version item
-        # noinspection PyUnusedLocal
-        # DEBUG:urllib3.connectionpool:https://unite.nih.gov:443 "POST /actions/api/actions HTTP/1.1" 200 107
-        # {'actionRid': 'ri.actions.main.action.9dea4a02-fb9c-4009-b623-b91ad6a0192b', 'synchronouslyPropagated': False}
-        # Actually create a version so that we can test the api to add the expression items
-
-        # cs_version_data_dict = code_set_version_json_all_rows[0]
-        cs_version_data_dict = code_set_version_json_all_rows[premade_codeset_id]
-        # noinspection PyUnusedLocal
-        # create the version and ask Enclave for the codeset_id that can be used to addCodeExpressionItems
-        # create version -----
-        codeset_id = post_request_enclave_api_create_version(header, cs_version_data_dict)
-        # TODO begin ------------------------------------------------------------
-        # 3/14/22, stephanie, save the codeset_id with container name in csContainerName
-        # save codeset_id of a draft version with the container name saved in container_name= container_data_dict[
-        # 'parameters']['ri.actions.main.parameter.1b5cd6e9-b220-4551-b97d-245b9fa86807']['string']
-        # premade_codeset_id = dih internal id
-        # csContainerName = container name
-        # codeset_id = version id
-        # --persist the data in the output folder = input_csv_folder_path
-        # end TODO---------------------------------------------------------------
-        # upd_cs_ver_expression_items_dict = code_set_expression_items_json_all_rows[item]
-        upd_cs_ver_expression_items_dict: Dict = code_set_expression_items_json_all_rows[premade_codeset_id]
-        # update the payload with the codeset_id returned from the enclave
-
-        # DEBUG: Can use this to check to make sure code list is OK:
-        # if DEBUG:  # updated json data is saved in upd_cs_ver_expression_items_dict
-        cs_container_name = container_data_dict[
-            'parameters']['ri.actions.main.parameter.1b5cd6e9-b220-4551-b97d-245b9fa86807']['string']
-        print('csContainerName: ' + str(cs_container_name))
-        string_list = upd_cs_ver_expression_items_dict[
-            'parameters']['ri.actions.main.parameter.c9a1b531-86ef-4f80-80a5-cc774d2e4c33']['stringList']['strings']
-        print('premade_codeset_id: ' + str(premade_codeset_id))
-        print('len(stringList): ' + str(len(string_list)))
-        print('codeset_id: ' + str(codeset_id))
-        print('------------------------------')
-
-        # update the json data with the correct codeset_id -----
-        upd_cs_ver_expression_items_dict = \
-            update_cs_version_expression_data_with_codesetid(codeset_id, upd_cs_ver_expression_items_dict)
-        # Validate 3: add the concept set expressions to draft version by passing as code and code system
-        # third api
-        # https://unite.nih.gov/workspace/ontology/action-type/add-code-system-codes-as-omop-version-expressions/overview
-        # action type rid: ri.actions.main.action-type.e07f2503-c7c9-47b9-9418-225544b56b71
-        # noinspection PyUnusedLocal
-        # add expressionItems to version -----
-        response_json = post_request_enclave_api_addExpressionItems(header, upd_cs_ver_expression_items_dict)
-        print('post request to add expressionItems returned: ----------' + json.dumps(response_json))
-        # Once the expression items have been added, save the enclave codeset_id so that we can update the code_sets.csv
-        # file: update code_sets_df, setting the enclave_codeset_id column to the codeset_id returned from the
-        # enclave; if needed, we can also save the json data in upd_cs_ver_expression_items_dict. premade_codeset_id is
-        # stored in the codeset_id column in the csv files; save the id in the enclave_codeset_id column, and update
-        # when it was uploaded as well. - Stephanie 3/15/22
-        try:
-            code_sets_df.set_index('codeset_id', inplace=True)
-        except Exception as e:
-            print(e)
-
-        code_sets_df.at[premade_codeset_id, 'enclave_codeset_id'] = codeset_id
-        code_sets_df.at[premade_codeset_id, 'enclave_codeset_id_updated_at'] = _datetime_palantir_format()
-        code_sets_df = code_sets_df.reset_index()
-        # return response_json
-
-    # write out the updated csv file with the enclave_codeset_id
-    # print('before terminating, write out the updated code_sets.csv file here')
-    # date_str = datetime.now().strftime('%Y_%m_%d_%H_%M')
-    # output_filename = 'code_sets_updated_' + date_str + '.csv'
-    # TODO: fix: creates these columns depending on how we're doing indexing: Unnamed: 0, Unnamed: 0.1, etc
-    output_filename =
'code_sets.csv' - code_sets_df.to_csv(os.path.join(input_csv_folder_path, output_filename), index=False, encoding='utf-8') - - return code_sets_df - - -def _load_standardized_input_df( - path: str, integer_id_fields=['enclave_codeset_id', 'codeset_id', 'internal_id'] -) -> pd.DataFrame: - """Loads known input dataframe in a standardized way: - - Sets data types - - Fill na values""" - df: pd.DataFrame = pd.read_csv(path).fillna('') - - # Strip: remove the spaces in the beginning and the end of the name - df.columns.str.lstrip() - df.columns.str.rstrip() - - # Float->Int: For some reason, was being read with .0's at the end. - for field in [x for x in integer_id_fields if x in list(df.columns)]: - df[field] = pd.to_numeric(df[field], errors='coerce')\ - .astype('Int64') - - return df - - -def left_join_update(df, df2): - """Does a left join, but where column names are the same, keeps right value if exists, else left value.""" - new_df = df.merge( - df2, how='left', left_on='internal_id', right_on='codeset_id') - - xy_col_counts: Dict[str, int] = {} - for col in list(new_df.columns): - if any([col.endswith(x) for x in ['_x', '_y']]): - common_col = col[:-2] - if common_col not in xy_col_counts: - xy_col_counts[common_col] = 0 - xy_col_counts[common_col] += 1 - - xy_cols: List[str] = [k for k, v in xy_col_counts.items() if v == 2] - for col in xy_cols: - new_df[col] = new_df[col + '_y'].fillna(new_df[col + '_x']) - new_df = new_df.drop([col + '_x', col + '_y'], axis=1) - - return new_df - - -def persist_to_db(code_sets_df: pd.DataFrame) -> pd.DataFrame: - """Save updated items to persistence layer""" - # Vars - join_col = 'codeset_id' - update_cols = ['enclave_codeset_id', 'enclave_codeset_id_updated_at', 'concept_set_name'] - - # Read - persistence_df = _load_standardized_input_df(CSET_UPLOAD_REGISTRY_PATH) - - # Subset - try: - code_sets_df_limited = code_sets_df[[join_col] + update_cols] - except KeyError: # if KeyError, `join_col` is an index - code_sets_df_limited = code_sets_df[update_cols] - # Join - persistence_df_new = left_join_update(persistence_df, code_sets_df_limited) - - # Save and return - persistence_df_new.to_csv(CSET_UPLOAD_REGISTRY_PATH, index=False) - return persistence_df_new - - -def _save_csv( - df: pd.DataFrame, out_format_name: str, in_format_name: str, inpath: str, output_filename: str, - field_delimiter=',' -): - """Side effects: Save CSV""" - # date_str = datetime.now().strftime('%Y.%m.%d') - # out_dir = os.path.join(UPLOADS_DIR, output_name, source_name, date_str, 'output') - infile_stem = os.path.basename(inpath).replace('.csv', '').replace('.tsv', '') - out_dir = os.path.join(UPLOADS_DIR, in_format_name, infile_stem, 'transforms', out_format_name) - os.makedirs(out_dir, exist_ok=True) - output_format = 'csv' if field_delimiter == ',' else 'tsv' if field_delimiter == '\t' else 'txt' - outpath = os.path.join(out_dir, f'{output_filename}.{output_format}') - df.to_csv(outpath, sep=field_delimiter, index=False) - - -def update_cset_upload_registry_moffit( - moffit_path: str, registry_path: str = CSET_UPLOAD_REGISTRY_PATH -) -> pd.DataFrame: - """Update concept set registry file, specifically for moffit entries. 
- Side effects: Writes update to registry file w/ any new entries""" - moffit_df = pd.read_csv(moffit_path).fillna('') - registry_df = pd.read_csv(registry_path).fillna('') - registered_moffit_cset_df = registry_df[registry_df['source_id_type'] == MOFFIT_SOURCE_ID_TYPE] - registered_moffit_ids = [str(x) for x in registered_moffit_cset_df['source_id']] - moffit_dataset_cset_ids: Set[str] = set([str(int(x)) for x in moffit_df['concept_set_id'].unique() if x != '']) - new_moffit_cset_ids: List = list(set([x for x in moffit_dataset_cset_ids if x not in registered_moffit_ids])) - try: # integers if possible - new_moffit_cset_ids = [int(x) for x in new_moffit_cset_ids] - except TypeError: - pass - new_moffit_cset_ids.sort() - - new_rows = [] - next_internal_id: int = max(registry_df['internal_id']) + 1 - for _id in new_moffit_cset_ids: - new_rows.append({ - 'source_id_type': 'moffit', - 'source_id': _id, - # 'source_id_field': '', - # 'oid': '', - # 'ccsr_code': '', - 'internal_id': next_internal_id, - 'internal_source': MOFFIT_SOURCE_URL, - # 'cset_source': '', - # 'grouped_by_bids': '', - # 'concept_id': '', - # 'codeset_id': '', - # 'enclave_codeset_id': '', - # 'enclave_codeset_id_updated_at': '', - # 'concept_set_name': '', - }) - next_internal_id += 1 - - if len(new_rows) > 0: - new_entries_df = pd.DataFrame(new_rows).fillna('') - new_registry_df = pd.concat([registry_df, new_entries_df]).fillna('') - new_registry_df.to_csv(registry_path, index=False) - return new_registry_df - return registry_df # if no new updates - - -# TODO: repurpose to moffit -# TODO: Make sure all cols are being used -def transform_moffit_to_palantir3file(inpath: str) -> str: - """Transform Moffit format to Palantir 3 File format.""" - # Vars - field_delimiter = ',' - out_format_name = 'palantir-three-file' - in_format_name = 'moffit' - out_filename1 = 'concept_set_version_item_rv_edited' - out_filename2 = 'code_sets' - out_filename3 = 'concept_set_container_edited' - infile_stem = os.path.basename(inpath).replace('.csv', '').replace('.tsv', '') - out_dir = os.path.join(UPLOADS_DIR, in_format_name, infile_stem, 'transforms', out_format_name) - - # Read inputs - inpath = os.path.join(PROJECT_ROOT, inpath) if inpath.startswith('termhub-csets') else inpath - # df = pd.read_csv(inpath, converters={'concept_set_id': int}).fillna('') # fails because one row has '' for id - df = pd.read_csv(inpath).fillna('') - df = df[df['concept_set_id'] != ''] - df['concept_set_id'] = df['concept_set_id'].astype(int) - df['concept_set_id'] = df['concept_set_id'].astype(str) - df = df.applymap(lambda x: x.strip()) - moffit_cset_ids: Set[str] = set([str(int(x)) for x in df['concept_set_id'].unique() if x]) - - # Read/update registry - cset_upload_registry_df: pd.DataFrame = update_cset_upload_registry_moffit(inpath) - registered_moffit_cset_df = cset_upload_registry_df[ - cset_upload_registry_df['source_id_type'] == MOFFIT_SOURCE_ID_TYPE] - moffit_id_internal_id_map: Dict[str, str] = dict(zip( - [str(x) for x in registered_moffit_cset_df['source_id']], - [str(x) for x in registered_moffit_cset_df['internal_id']])) - - # Transform - # II. Create & save exports - _all = {} - # 1. 
Palantir enclave table: concept_set_version_item_rv_edited - rows1 = [] - codeset_id__code__map = {} - for i, concept_row in df.iterrows(): - internal_id = moffit_id_internal_id_map[str(concept_row['concept_set_id'])] - code = concept_row['concept_code'] - code_system = concept_row['code_system'] - - # This will help us avoid duplicate codes in single concept set - if internal_id not in codeset_id__code__map: - codeset_id__code__map[internal_id] = [] - if code not in codeset_id__code__map[internal_id]: - codeset_id__code__map[internal_id].append(code) - else: - continue - - # The 3 fields isExcluded, includeDescendants, and includeMapped, are from OMOP but also in VSAC. If it has - # ...these 3 options, it is intensional. And when you execute these 3, it is now extensional / expansion. - # todo: Don't need concept_name? - row = { - 'codeset_id': internal_id, - 'concept_id': '', # leave blank for now - # - 'code': code, - 'codeSystem': code_system, - # - 'isExcluded': False, - 'includeDescendants': True if code_system == 'SNOMED' else False, - 'includeMapped': False, - 'item_id': str(uuid4()), # will let palantir verify ID is indeed unique - 'annotation': f'Curated value set: {MOFFIT_PREFIX}', - # 'created_by': 'DI&H Bulk Import', - 'created_by': PALANTIR_ENCLAVE_USER_ID_1, - 'created_at': _datetime_palantir_format() - } - row2 = {} - for k, v in row.items(): - row2[k] = v.replace('\n', ' - ') if type(v) == str else v - row = row2 - rows1.append(row) - df_code_set_members = pd.DataFrame(rows1) - _all[out_filename1] = df_code_set_members - _save_csv(df_code_set_members, out_format_name, in_format_name, inpath, out_filename1, field_delimiter) - - # 2. Palantir enclave table: code_sets - rows2 = [] - for moffit_id in moffit_cset_ids: - v = 1 - internal_id = moffit_id_internal_id_map[moffit_id] - cset_row = df[df['concept_set_id'] == moffit_id].iloc[0] - concept_set_name = f'[{MOFFIT_PREFIX}] ' + cset_row['concept_set_name'] - row = { - 'codeset_id': internal_id, - 'concept_set_name': concept_set_name, - 'concept_set_version_title': concept_set_name + f' (v{str(v)})', - 'project': ENCLAVE_PROJECT_NAME, # always use this project id for bulk import - 'source_application': '', - 'source_application_version': '', # nullable - 'created_at': _datetime_palantir_format(), - 'atlas_json': '', # nullable - 'is_most_recent_version': True, - 'version': v, - 'comments': '', - 'intention': '', # nullable - 'limitations': '', # nullable - 'issues': '', # nullable - 'update_message': 'Initial version.' if v == 1 else '', # nullable (maybe?) - # status field stats as appears in the code_set table 2022/01/12: - # 'status': [ - # '', # null - # 'Finished', - # 'In Progress', - # 'Awaiting Review', - # 'In progress', - # ][2], - # status field doesn't show this in stats in code_set table, but UI uses this value by default: - 'status': 'Under Construction', - 'has_review': '', # boolean (nullable) - 'reviewed_by': '', # nullable - 'created_by': PALANTIR_ENCLAVE_USER_ID_1, - 'provenance': MOFFIT_SOURCE_URL, - 'atlas_json_resource_url': '', # nullable - # null, initial version will not have the parent version so this field would be always null: - 'parent_version_id': '', # nullable - # True ( after the import view it from the concept set editor to review the concept set and click done. - # We can add the comments like we imported from VSAC and reviewed it from the concept set editor. ) - # 1. import 2. manual check 3 click done to finish the definition. 
- if we want to manually review them - # first and click Done: - 'is_draft': True, - } - rows2.append(row) - df_code_sets = pd.DataFrame(rows2) - _all[out_filename2] = df_code_sets - _save_csv(df_code_sets, out_format_name, in_format_name, inpath, out_filename2, field_delimiter) - - # 3. Palantir enclave table: concept_set_container_edited - rows3 = [] - for moffit_id in moffit_cset_ids: - internal_id = moffit_id_internal_id_map[moffit_id] - cset_row = df[df['concept_set_id'] == moffit_id].iloc[0] - concept_set_name = f'[{MOFFIT_PREFIX}] ' + cset_row['concept_set_name'] - row = { - 'concept_set_id': internal_id, - 'concept_set_name': concept_set_name, - 'project_id': '', # nullable - 'assigned_informatician': PALANTIR_ENCLAVE_USER_ID_1, # nullable - 'assigned_sme': PALANTIR_ENCLAVE_USER_ID_1, # nullable - 'status': ['Finished', 'Under Construction', 'N3C Validation Complete'][1], - 'stage': [ - 'Finished', - 'Awaiting Editing', - 'Candidate for N3C Review', - 'Awaiting N3C Committee Review', - 'Awaiting SME Review', - 'Under N3C Committee Review', - 'Under SME Review', - 'N3C Validation Complete', - 'Awaiting Informatician Review', - 'Under Informatician Review', - ][1], - 'intention': '', # nullable - 'n3c_reviewer': '', # nullable - 'alias': None, # '' better? - 'archived': False, - # 'created_by': 'DI&H Bulk Import', - 'created_by': PALANTIR_ENCLAVE_USER_ID_1, - 'created_at': _datetime_palantir_format() - } - - row2 = {} - for k, v in row.items(): - row2[k] = v.replace('\n', ' - ') if type(v) == str else v - row = row2 - - rows3.append(row) - df_code_sets__container_variation = pd.DataFrame(rows3) - _all[out_filename3] = df_code_sets__container_variation - _save_csv(df_code_sets__container_variation, out_format_name, in_format_name, inpath, out_filename3, field_delimiter) - - return out_dir - - -def upload_dataset(input_path: str, format='palantir-three-file', use_cache=False): - """Main function""" - if format == 'moffit': - input_path = transform_moffit_to_palantir3file(input_path) - code_sets_df: pd.DataFrame = post_to_enclave_from_3csv(input_path) - persist_to_db(code_sets_df) - - -def cli(): - """Command line interface for package. - - Side Effects: Executes program.""" - package_description = 'Tool for uploading to the Palantir Foundry enclave.' - parser = ArgumentParser(description=package_description) - - parser.add_argument( - '-n', '--n3c', default=False, action='store_true', required=False, - help='Create new versions of N3C Recommended for comparison after vocab updates') - parser.add_argument( - '-p', '--input-path', - help='Path to file or folder to be parsed and uploaded.') - parser.add_argument( - '-f', '--format', - choices=['palantir-three-file', 'moffit'], - default='palantir-three-file', - help='The format of the file(s) to be uploaded.\n' - '- palantir-three-file: Path to folder with 3 files that have specific columns that adhere to concept table data model. These ' - 'files must have the following names: i. `code_sets.csv`, ii. `concept_set_container_edited.csv`, iii. ' - '`concept_set_version_item_rv_edited.csv`.\n' - '- moffit: Has columns concept_set_id, concept_set_name, concept_code, concept_name, code_system.') - # parser.add_argument( - # '-c', '--use-cache', - # action='store_true', - # help='If present, will check the input file and look at the `enclave_codeset_id` column. If no empty values are' - # ' present, this indicates that the `enclave_wrangler` has already been run and that the input file itself ' - # 'can be used as cached data. 
The only thing that will happen is an update to the persistence layer, ' - # '(`data/cset.csv` as of 2022/03/18).'), - kwargs = parser.parse_args() - kwargs_dict: Dict = vars(kwargs) - - if kwargs.n3c: - make_new_versions_of_csets() - else: - # TODO: @joeflack4, can we get rid of this and the things it calls? - upload_dataset(**kwargs_dict) - - -# going to do new container instead. just keeping this code for a bit in case -# we want to revert to just new version -# def upload_cset_as_new_version_of_itself( -# codeset_id: int, -# add_to_field: Dict = {'provenance': f'Version for comparison to N3C-Rec on {date.today().isoformat()}'} -# ) -> Dict: -# ov = fetch_cset_version(codeset_id, False) -# -# vi = [i['properties'] for i in get_concept_set_version_expression_items(codeset_id, 'full')] -# concepts = [] -# for item in vi: -# c = {'concept_id': item['conceptId']} -# for p in ['includeDescendants', 'isExcluded', 'includeMapped']: -# c[p] = item[p] -# concepts.append(c) -# -# upload_args = { -# 'on_behalf_of': ov.get('createdBy', ''), # not allowing this for some csets -# # 'on_behalf_of': config['SERVICE_USER_ID'], -# 'concept_set_name': ov.get('conceptSetNameOMOP', ''), -# 'provenance': ov.get('provenance', ''), -# 'limitations': ov.get('limitations', ''), -# 'intention': ov.get('intention', ''), -# 'parent_version_codeset_id': ov.get('codesetId', ''), -# 'current_max_version': ov.get('version', ''), # probably -# # codeset_id': None, will be assigned -# 'validate_first': True, -# 'omop_concepts': concepts, -# 'finalize': True, -# # annotation, -# # intended_research_project, -# } -# -# for key, value in add_to_field.items(): -# val = '. ' + ov.get(key, '') -# val = value + val -# upload_args[key] = val -# -# # upload_new_cset_version_with_concepts( concept_set_name, omop_concepts, provenance, limitations, intention, annotation, parent_version_codeset_id, current_max_version, intended_research_project, on_behalf_of, codeset_id, validate_first, finalize ) -# # pass_on_args = ['conceptSetNameOMOP'] not sure what this was for -# d = upload_new_cset_version_with_concepts(**upload_args) # {'responses': [...], 'codeset_id': 123} -# -# return d['versionId'] - - -def upload_cset_copy_in_new_container( - codeset_id: int, -) -> Dict: - ov = fetch_cset_version(codeset_id, False) - - concept_set_name = ov.get('conceptSetNameOMOP') - oc = fetch_cset_container(concept_set_name) - - vi = [i['properties'] for i in get_concept_set_version_expression_items(codeset_id, 'full')] - concepts = [] - for item in vi: - c = {'concept_id': item['conceptId']} - for p in ['includeDescendants', 'isExcluded', 'includeMapped']: - c[p] = item[p] - concepts.append(c) - - concept_set_name = ov.get('conceptSetNameOMOP', '') - concept_set_name = f'{concept_set_name} -- Copy of {codeset_id} for comparison {date.today().isoformat()}' - - try: - c_creator = 'unknown' - c_created_by = oc.get('createdBy') - if c_created_by == config['SERVICE_USER_ID']: - c_creator = 'UNITEConceptSetBulk@nih.gov' - elif c_created_by: - c_creator = get_researcher(c_created_by).get('name', 'unknown') - except Exception as e: - raise e - - try: - v_creator = 'unknown' - v_created_by = ov.get('createdBy') - if v_created_by == config['SERVICE_USER_ID']: - v_creator = 'UNITEConceptSetBulk@nih.gov' - elif v_created_by: - v_creator = get_researcher(v_created_by).get('name', 'unknown') - except Exception as e: - raise e - - - container_args = { - 'on_behalf_of': config['SERVICE_USER_ID'], - # 'on_behalf_of': c_created_by, - 'concept_set_name': 
concept_set_name, - 'intention': f"Copy for comparison of container created by {c_creator} on {oc.get('createdAt', '**missing**')}. Orig intention: {oc.get('intention', '')}", - 'research_project': oc.get('project', config['DIH_PROJ_ID']), - 'assigned_sme': oc.get('assigned_sme', ''), - 'assigned_informatician': oc.get('assigned_informatician', ''), - } - version_args = { - 'on_behalf_of': config['SERVICE_USER_ID'], - # 'on_behalf_of': v_created_by, - 'concept_set_name': concept_set_name, - 'provenance': f"Copy for comparison of container created by {v_creator} on {ov.get('createdAt', '**missing**')}. Orig provenance: {ov.get('provenance', '')}", - 'limitations': ov.get('limitations', ''), - 'intention': ov.get('intention', ''), - # 'parent_version_codeset_id': ov.get('codesetId', ''), - # 'current_max_version': ov.get('version', ''), # probably - 'current_max_version': None, - 'codeset_id': None, # will be assigned - # 'codeset_id': codeset_id * 100, # to make it easy to see they're related -- was too big for api rules - 'omop_concepts': concepts, - 'finalize': True, - # annotation, - # intended_research_project, - } - - d = upload_new_container_with_concepts( - **container_args, versions_with_concepts=[version_args], - fail_if_exists=False, validate_first=True) - - container: Dict = d['upload_concept_set_container'] - if not container: - raise Exception(f'Error uploading or fetching new container copy of {codeset_id}') - version: Dict = d['upload_new_cset_version_with_concepts'] - if not version: - raise Exception(f'Error uploading or fetching new version copy of {codeset_id}') - if 'versionId' in version: - return version['versionId'] - if 'codesetId' in version: - return version['codesetId'] - - raise Exception(f'Failed to find codeset_id of newly uploaded version of {codeset_id}') - - -def make_new_versions_of_csets(): - from backend.db.utils import sql_query_single_col, get_db_connection - from backend.routes.db import get_n3c_recommended_codeset_ids - with get_db_connection() as con: - comparisons_already_done = sql_query_single_col(con, 'select orig_codeset_id from public.codeset_comparison') - - n3c_codeset_ids = set(get_n3c_recommended_codeset_ids()).difference(comparisons_already_done) - - eastern = pytz.timezone('US/Eastern') - new_codeset_ids = [] - with get_db_connection() as con: - for codeset_id in n3c_codeset_ids: - print(f'Making new version of {codeset_id}') - new_version_codeset_id = upload_cset_copy_in_new_container(codeset_id) - if not new_version_codeset_id: - continue - print(f'{codeset_id}, {new_version_codeset_id}') - new_codeset_ids.append(new_version_codeset_id) - row = { - 'fetch_time': datetime.now(eastern).isoformat(), - 'orig_codeset_id': codeset_id, - 'new_codeset_id': new_version_codeset_id - } - insert_from_dict(con, 'public.codeset_comparison', row) - -if __name__ == '__main__': - # test_new_version_compare_codeset_ids = [27371375, 523378440, 490947789] - # test_new_version_compare_codeset_ids = [523378440] - # make_new_versions_of_csets(codeset_ids=test_new_version_compare_codeset_ids) - # pass - cli() \ No newline at end of file +"""Upload datasets to Foundry""" +import json +import os +import urllib.parse +from argparse import ArgumentParser +from typing import Dict, List, Set, Union +from uuid import uuid4 +from time import sleep +from datetime import date, datetime +import pytz + +import pandas as pd +from requests import Response + +from enclave_wrangler.models import CsetContainer +from backend.db.utils import insert_from_dict, get_db_connection + 
+# TODO: See if sys.path(0) thing works; doesn't require maintenance. Look at unit tests
+try:
+    from enclave_wrangler.config import CSET_UPLOAD_REGISTRY_PATH, CSET_VERSION_MIN_ID, ENCLAVE_PROJECT_NAME, MOFFIT_PREFIX, \
+        MOFFIT_SOURCE_ID_TYPE, MOFFIT_SOURCE_URL, PALANTIR_ENCLAVE_USER_ID_1, UPLOADS_DIR, VALIDATE_FIRST, config, \
+        PROJECT_ROOT, \
+        TERMHUB_CSETS_DIR
+    from enclave_wrangler.actions_old_palantir3file_api import get_cs_container_data, get_cs_version_data, get_cs_version_expression_data, \
+        post_request_enclave_api_addExpressionItems, post_request_enclave_api_create_container, \
+        post_request_enclave_api_create_version, update_cs_version_expression_data_with_codesetid
+    from enclave_wrangler.actions_api import add_concepts_to_cset, finalize_concept_set_version, \
+        upload_concept_set_container, \
+        upload_concept_set_version_draft, get_concept_set_version_expression_items
+    from enclave_wrangler.utils import EnclaveWranglerErr, _datetime_palantir_format, log_debug_info, make_actions_request, get_random_codeset_id
+    from enclave_wrangler.objects_api import fetch_object_by_id, get_object_links, fetch_cset_version, fetch_cset_container, make_objects_request, get_researcher
+except ModuleNotFoundError:
+    # TODO: this isn't up-to-date, is it? (2023-10-16)
+    from config import CSET_UPLOAD_REGISTRY_PATH, ENCLAVE_PROJECT_NAME, MOFFIT_PREFIX, \
+        MOFFIT_SOURCE_ID_TYPE, MOFFIT_SOURCE_URL, PALANTIR_ENCLAVE_USER_ID_1, UPLOADS_DIR, config, PROJECT_ROOT, \
+        TERMHUB_CSETS_DIR, CSET_VERSION_MIN_ID
+    from actions_old_palantir3file_api import get_cs_container_data, get_cs_version_data, get_cs_version_expression_data, \
+        post_request_enclave_api_addExpressionItems, post_request_enclave_api_create_container, \
+        post_request_enclave_api_create_version, update_cs_version_expression_data_with_codesetid
+    from actions_api import add_concepts_to_cset, upload_concept_set_container, \
+        upload_concept_set_version_draft, get_concept_set_version_expression_items
+    from utils import _datetime_palantir_format, log_debug_info
+
+
+DEBUG = False
+
+
+# todo: sep fnc for validate?
+def process_csv():
+    """Some logic to consider:
+    - imagine a csv with 1 container new and 3 new versions. we can give them info about versions that exist, link to
+      the enclave, and confirm that they want to create the new versions. then they can confirm or make changes and
+      revalidate
+    """
+    pass
+
+
+# TODO: return 'outcome': success / error for (i) all the csets, (ii) each cset based on all response.status_code
+def upload_new_cset_version_with_concepts_from_csv(
+    path: str = None, df: pd.DataFrame = None, validate_first=False
+) -> Dict:
+    """
+    Upload from CSV
+    file format docs:
+    https://github.com/jhu-bids/TermHub/tree/develop/enclave_wrangler
+
+    testing from test/test_enclave_wrangler.py
+    using:
+    https://github.com/jhu-bids/TermHub/blob/develop/test/input/test_enclave_wrangler/test_dataset_upload/type-2-diabetes-mellitus.csv
+
+    :return: A dictionary with cset name as key, and values are the responses from each enclave API call to fulfill
+    the request.
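+
+    Illustrative input shape (a sketch inferred from the columns this function reads; the format docs above are
+    authoritative). Rows are grouped by `concept_set_name` + `parent_version_codeset_id`; optional cset-level columns
+    (e.g. `multipassId`, `provenance`, `limitations`, `intention`, `authority`) are read from each group's first row:
+
+        concept_set_name,parent_version_codeset_id,multipassId,concept_id,includeDescendants,isExcluded,includeMapped,annotation
+        My test concept set,123,<user multipass id>,45259000,True,False,True,example item
+        My test concept set,123,<user multipass id>,45259001,False,False,False,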
+    """
+    if not path and df is None:
+        raise RuntimeError('upload_new_cset_version_with_concepts_from_csv: Must provide path or dataframe')
+    df = df if df is not None else pd.read_csv(path).fillna('')
+    df = df[~df.eq('').all(axis=1)]  # drop all empty rows
+
+    cset_group_cols = ['concept_set_name', 'parent_version_codeset_id']
+    more_cset_cols = list(
+        {'multipassId', 'current_max_version', 'domain_team', 'provenance', 'limitations', 'intention',
+         'intended_research_project', 'authority'}.intersection(df.columns))
+    concept_cols_required = ['concept_id', 'includeDescendants', 'isExcluded', 'includeMapped']
+    concept_cols_optional = ['annotation']
+
+    responses = {}
+    csets = df.groupby(cset_group_cols)
+    for cset in csets:
+        key: tuple
+        key, csetdf = cset
+
+        # TODO: From right about here, abstract this logic into new func: cset_version_df_to_dict()
+        new_version = {}
+        cset_name = key[0]
+        new_version['concept_set_name'] = cset_name
+        new_version['parent_version_codeset_id'] = int(key[1])
+
+        first_row = csetdf[more_cset_cols].to_dict(orient='records')[0]
+        for c in more_cset_cols:
+            new_version[c] = first_row[c]
+
+        # `multipassId` is only present if it was a CSV column; fall back to None (`on_behalf_of` is optional below)
+        new_version['on_behalf_of'] = new_version.pop('multipassId', None)
+
+        selectable_cols = concept_cols_required + [x for x in concept_cols_optional if x in list(csetdf.columns)]
+        try:
+            new_version['omop_concepts'] = csetdf[selectable_cols].to_dict(orient='records')
+        except KeyError as e:
+            raise EnclaveWranglerErr(str(e))
+        for x in new_version['omop_concepts']:
+            x['concept_id'] = int(x['concept_id'])
+
+        # TODO: for now going to assume that if the concept set exists, they'll give us a parent_version_codeset_id
+        #  but they might give us the name and not the parent id. if they do give us the name but the
+        #  name already exists, two possibilities: 1) they just want to add a new version using the most
+        #  recent version as the parent but didn't bother to find out what it was and include in csv, or
+        #  2) it's a mistaken duplicate name and they should get an error and have to change the name
+        #  FIX SOMEHOW!!!
+
+        if new_version['parent_version_codeset_id']:  # creating new version, setting parent to existing version
+            responses_i: Dict = upload_new_cset_version_with_concepts(**new_version, validate_first=validate_first)
+            # TODO: since test/test_enclave_wrangler.py:test_upload() is only expecting one
+            #  result, just returning the first result. but csvs allow multiple, so FIX THIS!!!!
+        else:
+            responses_i: Dict = upload_new_container_with_concepts(**new_version, validate_first=validate_first)
+        responses[cset_name] = responses_i
+
+        # TODO: put something helpful back here but that doesn't cause errors
+        # print('INFO: ' + cset_name + ': ', str(responses_i))
+        print('INFO: ' + cset_name)
+    return responses
+
+
+# TODO
+def upload_new_cset_container_with_concepts_from_csv(
+    path: str = None, df: pd.DataFrame = None, validate_first=False
+) -> Dict[str, Dict[str, Union[Response, List[Response]]]]:
+    """Upload new container(s) from CSV
+
+    todo's:
+    - Validate / throw err if multiple `intention`, `research_project`, `assigned_sme`, `assigned_informatician`
+    """
+    if not path and df is None:
+        raise RuntimeError('upload_new_cset_container_with_concepts_from_csv: Must provide path or dataframe')
+    df = df if df is not None else pd.read_csv(path).fillna('')
+    df = df[~df.eq('').all(axis=1)]  # drop all empty rows
+
+    # TODO: how about creating a dict or class of containers and versions from this logic?
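+    # One possible shape for such a structure (a sketch only; names are hypothetical, echoing the docstrings below):
+    # containers: Dict[str, Dict] = {
+    #     '<concept_set_name>': {
+    #         'container': {'concept_set_name': ..., 'intention': ..., 'research_project': ...},
+    #         'versions_with_concepts': [{'concept_set_name': ..., 'omop_concepts': [...], ...}, ...]}}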
+    # implementation 2
+    cset_group_cols = ['concept_set_name', 'parent_version_codeset_id']
+    csets = df.groupby(cset_group_cols)
+    containers_with_versions: List[CsetContainer] = []
+    for cset in csets:
+        containers_with_versions.append(CsetContainer().from_dataframe(cset))
+
+    # implementation 1
+    container_dfs = {
+        cset_name: df[df['concept_set_name'] == cset_name]
+        for cset_name in df['concept_set_name'].unique()}
+    responses = {}
+    for cset_name, df in container_dfs.items():
+        first_row = dict(df.iloc[0])
+        # todo?: Convert DF rows to List[Dict], properly formatted
+        # todo: versions_with_concepts.omop_concepts: i say below i expect list[int] but that has changed to list[dict]
+        # todo: are these fields really all required? prolly not if not in current csv. are they there?
+        #  'omop_concept_ids': (List[int]) (required), <---- x
+        #  'provenance' (str) (required):
+        #  'concept_set_name' (str) (required):
+        #  'annotation' (str) (optional): Default:`'Curated value set: ' + version['concept_set_name']`
+        #  'limitations' (str) (required):
+        #  'intention' (str) (required):
+        #  'intended_research_project' (str) (optional): Default:`ENCLAVE_PROJECT_NAME`
+        #  'codeset_id' (int) (required):
+
+        # todo: just testing this logic
+        df['parent_version_codeset_id'] = ''
+        upload_new_cset_version_with_concepts_from_csv(df=df)
+
+        rows = df.to_dict(orient='records')
+        responses[cset_name] = upload_new_container_with_concepts(
+            # todo: verify source for `on_behalf_of` (required param; assuming the CSV's `multipassId`, may be absent)
+            on_behalf_of=first_row.get('multipassId'),
+            concept_set_name=cset_name,
+            intention=first_row['intention'],
+            research_project=first_row['research_project'],
+            assigned_sme=first_row['assigned_sme'],
+            assigned_informatician=first_row['assigned_informatician'],
+            versions_with_concepts=[],  # TODO
+            validate_first=validate_first)
+    return responses
+
+
+# TODO: What if this fails halfway through? Can we teardown any of the steps? (need to store random `codeset_id` too)
+# TODO: Need to do proper codeset_id assignment: (i) look up registry and get next available ID, (ii) assign it here,
+#  (iii) persist new ID / set to registry, (iv) persist new ID to any files passed through CLI, (v), return the new ID
+# todo: @Siggie: Do we want to change this to accept named params instead of a dictionary? - Joe 2022/12/05
+def upload_new_cset_version_with_concepts(
+    concept_set_name: str, omop_concepts: List[Dict],
+    provenance: str = "", limitations: str = "", intention: str = "", annotation: str = "",
+    parent_version_codeset_id: int = None, current_max_version: float = None,
+    intended_research_project: str = None, on_behalf_of: str = None, codeset_id: int = None,
+    validate_first=VALIDATE_FIRST, finalize=True  # maybe finalize should default to False?
+) -> Dict:
+    """Upload a concept set version along with its concepts.
+
+    # todo: Update this slightly now that this function accepts named params instead of a dict
+
+    'omop_concepts': [
+        {
+            'concept_id' (int) (required):
+            'includeDescendants' (bool) (required):
+            'isExcluded' (bool) (required):
+            'includeMapped' (bool) (required):
+            'annotation' (str) (optional):
+        }
+    ],
+    'provenance' (str) (required):
+    'concept_set_name' (str) (required):
+    'limitations' (str) (required):
+    'intention' (str) (required):
+    'annotation' (str) (optional): Default:`'Curated value set: ' + version['concept_set_name']`
+    'intended_research_project' (str) (optional): Default:`ENCLAVE_PROJECT_NAME`
+    'codeset_id' (int) (optional): Default: Will be generated if not passed.
+ } + + Example: + { + "omop_concepts": [ + { + "concept_id": 45259000, + "includeDescendants": true, + "isExcluded": false, + "includeMapped": true, + "annotation": "This is my concept annotation." + } + ], + "provenance": "Created through TermHub.", + "concept_set_name": "My test concept set", + "limitations": "", + "intention": "" + } + """ + # Handle missing params + if not codeset_id: + # todo: this is temporary until I handle registry persistence + codeset_id = get_random_codeset_id() + if not intended_research_project: + intended_research_project = ENCLAVE_PROJECT_NAME + + # Upload + response_upload_draft_concept_set: Response = upload_concept_set_version_draft( # code_set + base_version=parent_version_codeset_id, + current_max_version=current_max_version, + provenance=provenance, + concept_set=concept_set_name, # == container_d['concept_set_name'] + annotation=annotation, + limitations=limitations, + intention=intention, + intended_research_project=intended_research_project, + version_id=codeset_id, + on_behalf_of=on_behalf_of, + validate_first=validate_first) # == code_sets.codeset_id + sleep(8) + + response_upload_concepts: List[Response] = add_concepts_to_cset( + omop_concepts=omop_concepts, + version__codeset_id=codeset_id, + validate_first=validate_first) # == code_sets.codeset_id + + responses = { + 'upload_concept_set_version': response_upload_draft_concept_set, + 'add_concepts_to_cset': response_upload_concepts, + } + if finalize: + response_finalize_concept_set_version: Response = finalize_concept_set_version( + concept_set=concept_set_name, # == container_d['concept_set_name'] + version_id=codeset_id, + current_max_version=current_max_version, + provenance=provenance, + limitations=limitations, + validate_first=validate_first, + on_behalf_of=on_behalf_of) + responses['finalize_concept_set_version'] = response_finalize_concept_set_version + return { + 'responses': responses, + 'versionId': codeset_id + } + + +# TODO: support concept params, e.g. exclude_children +def upload_new_container_with_concepts( + on_behalf_of: str, concept_set_name: str, intention: str, versions_with_concepts: List[Dict], + research_project: str = ENCLAVE_PROJECT_NAME, assigned_sme: str = PALANTIR_ENCLAVE_USER_ID_1, + assigned_informatician: str = PALANTIR_ENCLAVE_USER_ID_1, validate_first=VALIDATE_FIRST, + fail_if_exists = True +) -> Dict[str, Union[Response, List[Response]]]: + """Upload a new concept set container, and 1+ concept set versions, along with their concepts. + + :param container (Dict): Has the following keys: + concept_set_name (str) (required): + intention (str) (required): + research_project (str) (required): Default:`ENCLAVE_PROJECT_NAME` + assigned_sme (str) (optional): Default:`PALANTIR_ENCLAVE_USER_ID_1` + assigned_informatician (str) (optional): Default:`PALANTIR_ENCLAVE_USER_ID_1` + :param versions_with_concepts (List[Dict]): Has the following schema: [ + TODO: @jflack, this doesn't seem to be what upload_new_cset_version_with_concepts + is really expecting. 
I'm changing omop_concept_ids to omop_concepts in upload_and_sync_rxnorm_cset + { + 'omop_concept_ids': (List[int]) (required), + 'provenance' (str) (required): + 'concept_set_name' (str) (required): + 'annotation' (str) (optional): Default:`'Curated value set: ' + version['concept_set_name']` + 'limitations' (str) (required): + 'intention' (str) (required): + 'intended_research_project' (str) (optional): Default:`ENCLAVE_PROJECT_NAME` + 'codeset_id' (int) (required): + } + ] + """ + if len(versions_with_concepts) != 1: + # code was written to allow multiple versions, but there's no good reason + # for that. So, just raising error if not exactly one. + raise ValueError("expecting a single version to upload with container") + + # Upload container + existing_container = fetch_cset_container(concept_set_name, fail_on_error=False) + if existing_container: + if fail_if_exists: + raise RuntimeError(f'Error attempting to upload container. Container [{concept_set_name}] already exists.') + version = get_object_links( + object_type='OMOPConceptSetContainer', + object_id=concept_set_name, + link_type='OMOPConceptSet', + return_type='data', + expect_single_item=True, + handle_paginated=False, + retry_if_empty=False) + # TODO: check that already existing version matches what was to be uploaded...? + if not version: # if container was already uploaded but not version, upload version now + version: Dict[str, Union[Response, List[Response]]] = \ + upload_new_cset_version_with_concepts(**(versions_with_concepts[0]), + validate_first=validate_first) + return {'already_exists': True, + 'upload_concept_set_container': existing_container, + 'upload_new_cset_version_with_concepts': version} + + response_upload_concept_set_container: Response = upload_concept_set_container( + on_behalf_of=on_behalf_of, + concept_set_id=concept_set_name, + intention=intention, + research_project=research_project, + assigned_sme=assigned_sme, + assigned_informatician=assigned_informatician, + validate_first=validate_first) + + # Upload version + version_response: Dict[str, Union[Response, List[Response]]] = \ + upload_new_cset_version_with_concepts(**(versions_with_concepts[0]), + validate_first=validate_first) + + return { + 'upload_concept_set_container': response_upload_concept_set_container, + 'upload_new_cset_version_with_concepts': version_response} + + + +# todo: need to have 2 variations: 1 for creates, and 1 for updates. w/ exception handling and proper error messaging +# todo: need to allow 'upload' based on if it is / isn't in the local termhub-csets/datasets/cset_upload_registry.csv +# todo: use CSET_VERSION_MIN_ID? right now I'm not sure. makes more sense to pre-assign before running this so that +# ...I can JOIN together. would need a separate function just to accept a single `linked_cset` as opposed to files +# ...that have multiple. otherwise, with multiple, there's no way to tell what concepts for what cset +def post_to_enclave_from_3csv( + input_csv_folder_path: str, enclave_api=['old_rid_based', 'default'][1], create_cset_container=True, + create_cset_versions=True +) -> Union[Dict, pd.DataFrame]: + """Uploads data to enclave and updates the following column in the input's code_sets.csv: + # todo: Siggie: updating these files is probably not worth / necessary right now. Joe: We might want to do at some point + - enclave_codeset_id + - enclave_codeset_id_updated_at + - concept_set_name + + :param enclave_api (str): One of 'old_rid_based' or 'default'. 
The 'old_rid_based' Enclave API is the one which
+    required RIDs for params rather than the param name, and there may be other differences. 'default' refers to the
+    newer Enclave API (as of 2022/10).
+
+    # todo: after adding 'update' functionality, add params / update these docs
+    :param create_cset_container (bool): If true, will identify concept set containers from `input_csv_folder_path` and
+    make a POST request to enclave to create them. Only applicable to the new 'default' API.
+    :param create_cset_versions (bool): If true, will identify concept set versions from `input_csv_folder_path` and
+    make a POST request to enclave to create them. Only applicable to the new 'default' API.
+    """
+    # Settings
+    if DEBUG:
+        log_debug_info()
+    # Validate
+    msg = 'post_to_enclave_from_3csv() has not yet been set up to only add ' \
+          'concepts. Must either pass `create_cset_container=True` to create ' \
+          'a new concept set container, `create_cset_versions=True` to create' \
+          ' new concept set versions on an existing container, or both.' \
+          ' Doing nothing.'
+    if not (create_cset_container or create_cset_versions):
+        print(msg)
+        return {}
+    # Routing
+    if enclave_api == 'old_rid_based':
+        # todo: this doesn't return Response, so might want to change this function's return sig / val
+        df: pd.DataFrame = post_to_enclave_old_api(input_csv_folder_path)
+        return df
+
+    # Read data
+    concept_set_container_edited_df = pd.read_csv(
+        os.path.join(input_csv_folder_path, 'concept_set_container_edited.csv')).fillna('')
+    concept_set_container_edited_d = json.loads(concept_set_container_edited_df.to_json(orient='records'))
+    code_sets_df = _load_standardized_input_df(os.path.join(input_csv_folder_path, 'code_sets.csv'))  # .fillna('')
+    concept_set_version_item_rv_edited_df = pd.read_csv(
+        os.path.join(input_csv_folder_path, 'concept_set_version_item_rv_edited.csv')).fillna('')
+
+    # Link data into single iterable
+    # temp: @Siggie: You mentioned to delete the internal_id we assigned, but I don't know how to link these together otherwise
+    linked_csets: Dict = {}
+    for container_d in concept_set_container_edited_d:
+        cset_name = container_d['concept_set_id']
+        linked_csets[cset_name] = {'versions': {}}
+        linked_csets[cset_name]['container'] = container_d
+        code_sets_df_i = code_sets_df[code_sets_df['concept_set_name'] == cset_name]
+        code_sets_d_i = json.loads(code_sets_df_i.to_json(orient='records'))
+        for version_d in code_sets_d_i:
+            version_id = version_d['codeset_id']  # == code_sets.codeset_id
+            linked_csets[cset_name]['versions'][version_id] = {'omop_concept_ids': []}
+            linked_csets[cset_name]['versions'][version_id]['version'] = version_d
+            concept_set_version_item_rv_edited_df_i = concept_set_version_item_rv_edited_df[
+                concept_set_version_item_rv_edited_df['codeset_id'] == version_id]  # == code_sets.codeset_id
+            concept_set_version_item_rv_edited_d_i = json.loads(concept_set_version_item_rv_edited_df_i.to_json(orient='records'))
+            for item_d in concept_set_version_item_rv_edited_d_i:
+                linked_csets[cset_name]['versions'][version_id]['omop_concept_ids'].append(item_d)
+
+    # Upload to enclave
+    response_upload_new_container_with_concepts = {}
+    response_upload_concept_set: Union[None, Response] = None
+    response_upload_new_cset_version_with_concepts = {}
+    for linked_cset in list(linked_csets.values()):
+        container_d = linked_cset['container']
+        if create_cset_container and create_cset_versions:
+            response_upload_new_container_with_concepts: Dict = upload_new_container_with_concepts(
+                # todo: verify `created_by` is the right multipass ID source for `on_behalf_of`
+                on_behalf_of=container_d.get('created_by', ''),
+                concept_set_name=container_d['concept_set_name'],
+                intention=container_d['intention'],
+                versions_with_concepts=list(linked_cset['versions'].values()))
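+            # Illustrative shape of each `linked_cset` consumed here (a sketch of the structure built above):
+            # {'container': {<concept_set_container_edited.csv row>},
+            #  'versions': {<codeset_id>: {'version': {<code_sets.csv row>},
+            #                              'omop_concept_ids': [<concept_set_version_item_rv_edited.csv rows>]}}}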
+        elif create_cset_container:
+            response_upload_concept_set: Response = upload_concept_set_container(  # concept_set_container
+                concept_set_id=container_d['concept_set_name'],
+                intention=container_d['intention'],
+                research_project=ENCLAVE_PROJECT_NAME,
+                assigned_sme=PALANTIR_ENCLAVE_USER_ID_1,
+                assigned_informatician=PALANTIR_ENCLAVE_USER_ID_1)
+        elif create_cset_versions:
+            response_upload_new_cset_version_with_concepts = []
+            for version in linked_cset['versions'].values():
+                version_d = version['version']
+                # todo: other code_sets.csv columns may still not match this function's params
+                version_d['omop_concepts'] = version['omop_concept_ids']
+                response_i: Dict[str, Union[Response, List[Response]]] = upload_new_cset_version_with_concepts(**version_d)
+                response_upload_new_cset_version_with_concepts.append(response_i)
+
+    return {
+        'upload_new_container_with_concepts': response_upload_new_container_with_concepts,
+        'upload_concept_set': response_upload_concept_set,
+        'upload_new_cset_version_with_concepts': response_upload_new_cset_version_with_concepts}
+
+
+def post_to_enclave_old_api(input_csv_folder_path: str) -> pd.DataFrame:
+    """Uploads data to enclave and updates the following columns in the input's code_sets.csv:
+    - enclave_codeset_id
+    - enclave_codeset_id_updated_at
+    - concept_set_name
+    """
+    if DEBUG:
+        log_debug_info()
+
+    # Read data
+    code_sets_df = _load_standardized_input_df(os.path.join(input_csv_folder_path, 'code_sets.csv'))  # .fillna('')
+
+    # 0.1 Create mappings between
+    #   - concept_set_container_edited.csv[concept_set_name], and...
+    #   - code_sets.csv[codeset_id]
+    # use the concept_set_name as key to store the pre-made codeset_ids,
+    # store the codeset_ids in the premade_codeset_ids
+    cs_name_id_mappings = {}
+    for index, row in code_sets_df.iterrows():
+        cs_id = row['codeset_id']
+        cs_name = row['concept_set_name']
+        cs_name_id_mappings[cs_name] = cs_id
+    # 0.2 create a list of premade codeset ids
+    premade_codeset_ids = []
+    for index, row in code_sets_df.iterrows():
+        premade_codeset_ids.append(row['codeset_id'])
+
+    # I. Create structures
+    # I.0. concept_set_version_item_dict; key=codeset_id
+    concept_set_version_item_dict = {}
+    concept_set_version_item_rv_edited_df = pd.read_csv(
+        os.path.join(input_csv_folder_path, 'concept_set_version_item_rv_edited.csv')).fillna('')
+    for index, row in concept_set_version_item_rv_edited_df.iterrows():
+        key = row['codeset_id']
+        if key not in concept_set_version_item_dict:
+            concept_set_version_item_dict[key] = []
+        concept_set_version_item_dict[key].append(row)
+    # cs_result = pd.merge(code_sets_df, concept_set_version_item_rv_edited_df, on=['codeset_id'])
+
+    # I.1. build the list of container json data; key=codeset_id
+    # I.1.ii. Get the actual container data
+    concept_set_container_edited_json_all_rows = {}
+    concept_set_container_edited_df = pd.read_csv(
+        os.path.join(input_csv_folder_path, 'concept_set_container_edited.csv')).fillna('')
+    for index, row in concept_set_container_edited_df.iterrows():
+        cs_name = row['concept_set_name'].strip()
+        single_row = get_cs_container_data(cs_name)
+        cs_id = cs_name_id_mappings[cs_name]
+        concept_set_container_edited_json_all_rows[cs_id] = single_row
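+    # e.g. (hypothetical values): cs_name_id_mappings['[VSAC] Eclampsia'] -> 1000000001, so
+    # concept_set_container_edited_json_all_rows[1000000001] would hold that container's action payload
+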
+    # I.2. build the list of cs version json data; key=codeset_id
+    code_set_version_json_all_rows = {}
+    # code_sets_df = pd.read_csv(os.path.join(input_csv_folder_path, 'code_sets.csv')).fillna('')
+    for index, row in code_sets_df.iterrows():
+        cs_id = row['codeset_id']
+        cs_name = row['concept_set_name'].strip()
+        cs_intention = row['intention'].strip()
+        cs_limitations = row['limitations'].strip()
+        cs_update_msg = row['update_message'].strip()
+        cs_authority = row.get('authority', '').strip()
+        ##cs_authority = "Mathematica"  ## TODO: shong, 4/26/22, code_sets.csv needs to build the authority value, uncomment when available
+        cs_provenance = row['provenance'].strip()
+        single_row = get_cs_version_data(cs_name, cs_id, cs_intention, cs_limitations, cs_update_msg, cs_provenance, cs_authority)
+        # cs_name, cs_id, intention, limitation, update_msg, status, provenance
+        code_set_version_json_all_rows[cs_id] = single_row
+        # code_set_version_json_all_rows_dict[codeset_id] = single_row
+
+    # I.3. Build all of the Expression items to add to a CS version; key=codeset_id,
+    # codeset_id = pre-made codeset_id to iterate through the codesets
+    code_set_expression_items_json_all_rows = {}
+    for index, row in code_sets_df.iterrows():
+        current_code_set_id = row['codeset_id']
+        # build the code and codeSystem list for the current codeSet
+        # reset the code list
+        code_list = []
+        cs_name = row['concept_set_name'].strip()
+        # code and code system list
+        concept_set_version_item_rows = concept_set_version_item_dict[current_code_set_id]
+        # always use the same entry from the first set as currently
+        # we do not have support to save these flags per expressionItems
+        concept_set_version_item_row1 = concept_set_version_item_rows[0]
+        exclude = concept_set_version_item_row1['isExcluded']
+        descendents = concept_set_version_item_row1['includeDescendants']
+        mapped = concept_set_version_item_row1['includeMapped']
+        annotation = concept_set_version_item_row1['annotation'].strip()
+
+        for concept_set_version_item_row in concept_set_version_item_rows:
+            code_codesystem_pair = concept_set_version_item_row['codeSystem'] + ":" + str(concept_set_version_item_row['code'])
+            code_list.append(code_codesystem_pair)
+            # to-do(future): Right now, the API doesn't expect variation between the following 4 values among
+            # ...concept set items, so right now we can just take any of the rows and use its values. But, in
+            # ...the future, when there is variation, we may need to do some update here. - Joe 2022/02/04
+            # this is the same limitation the OMOP concept expression has, so for now it is sufficient
+            # we can explore more granular control later if necessary -Stephanie 02/05/2022
+
+        # now that we have the code list, generate the json for the versionExpression data
+        single_row = get_cs_version_expression_data(
+            current_code_set_id, cs_name, code_list, exclude, descendents, mapped, annotation)
+        code_set_expression_items_json_all_rows[current_code_set_id] = single_row
+        # code_set_expression_items_json_all_rows_dict[codeset_id] = single_row
+
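+    # Illustrative `code_list` entries (hypothetical codes): ['SNOMED:73211009', 'ICD10CM:E11.9'];
+    # each entry is '<codeSystem>:<code>' as built in the loop above
+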
+    # --- Steph: check the failed cases 2/15/22-----------------------------
+    # @Steph: Can we remove this block yet? I don't mind keeping for a while for convenience, but I like the practice
+    # ...of keeping code like this inside a copy of the file in a git ignored folder. - Joe 2022/03/15
+    # begin test code
+    # for premade_codeset_id in premade_codeset_ids:
+    #     # we have problem with very large code list - 16(3623) and 39(5104) 58(1820) 73(6791) 74(1501)
+    #     upd_cs_ver_expression_items_dict1 = code_set_expression_items_json_all_rows[premade_codeset_id]
+    #     codeStringList1 = upd_cs_ver_expression_items_dict1['parameters'][
+    #         'ri.actions.main.parameter.c9a1b531-86ef-4f80-80a5-cc774d2e4c33']['stringList']['strings']
+    #     print( str(premade_codeset_id) + "codelistLength: " + str(len(codeStringList1)))
+    #     print('------')
+    # end test code ------------------------------- #
+
+    # II. call the REST APIs to create them on the Enclave
+    # ...now that we have all the data from concept set are created
+    # temp_testing_cset_id = 1000000326  # Stephanie said this was a draft or archived set - Joe 2022/03/15
+    for premade_codeset_id in premade_codeset_ids:
+        # TODO: temporary debug code to look for missing concept container not showing in the UI
+        # TODO: debug code for adding expressionItems to missing container from UI, l162,l163
+
+        # if premade_codeset_id != temp_testing_cset_id:
+        #     continue
+
+        # Do a test first using 'validate'
+        header = {
+            "authorization": f"Bearer {config['PALANTIR_ENCLAVE_AUTHENTICATION_BEARER_TOKEN']}",
+            'content-type': 'application/json'
+        }
+
+        container_data_dict = concept_set_container_edited_json_all_rows[premade_codeset_id]
+        # noinspection PyUnusedLocal
+        # response_json = post_request_enclave_api(api_url, header, test_data_dict)
+        # create concept set container:
+        # ----- container_data_dict['parameters']['ri.actions.main.parameter.1b5cd6e9-b220-4551-b97d-245b9fa86807']
+        # 'ri.actions.main.parameter.1b5cd6e9-b220-4551-b97d-245b9fa86807': {
+        #     'type': 'string', 'string': '[VSAC] Eclampsia'},
+        # if DEBUG:
+        #     csContainerName = container_data_dict[
+        #         'parameters']['ri.actions.main.parameter.1b5cd6e9-b220-4551-b97d-245b9fa86807']['string']
+        #     print(csContainerName)
+        #     print('------------------------------')
+        response_json = post_request_enclave_api_create_container(header, container_data_dict)
+        # i.e. container object may already exist but we can create another version within a container:
+        # {'errorCode': 'INVALID_ARGUMENT', 'errorName': 'Actions:ObjectsAlreadyExist',
+        # 'errorInstanceId': '96fb2188-1947-4004-a7b3-d0572a5a0008', 'parameters': {'objectLocators':
+        # '[ObjectLocator{objectTypeId: omop-concept-set-container, primaryKey: {concept_set_id=PrimaryKeyValue{
+        # value: StringWrapper{value: [VSAC] Mental Behavioral and Neurodevelopmental Disorders}}}}]'}}
+        # Validate 2: Concept set version item
+        # noinspection PyUnusedLocal
+        # DEBUG:urllib3.connectionpool:https://unite.nih.gov:443 "POST /actions/api/actions HTTP/1.1" 200 107
+        # {'actionRid': 'ri.actions.main.action.9dea4a02-fb9c-4009-b623-b91ad6a0192b', 'synchronouslyPropagated': False}
+        # Actually create a version so that we can test the api to add the expression items
+
+        # cs_version_data_dict = code_set_version_json_all_rows[0]
+        cs_version_data_dict = code_set_version_json_all_rows[premade_codeset_id]
+        # noinspection PyUnusedLocal
+        # create the version and ask Enclave for the codeset_id that can be used to addCodeExpressionItems
+        # create version -----
+        codeset_id = post_request_enclave_api_create_version(header, cs_version_data_dict)
+        # TODO begin ------------------------------------------------------------
+        #  3/14/22, stephanie, save the codeset_id with container name in csContainerName
+        #  save codeset_id of a draft version with the container name saved in container_name= container_data_dict[
+        #  'parameters']['ri.actions.main.parameter.1b5cd6e9-b220-4551-b97d-245b9fa86807']['string']
+        #  premade_codeset_id = dih internal id
+        #  csContainerName = container name
+        #  codeset_id = version id
+        #  --persist the data in the output folder = input_csv_folder_path
+        # end TODO---------------------------------------------------------------
+        # upd_cs_ver_expression_items_dict = code_set_expression_items_json_all_rows[item]
+        upd_cs_ver_expression_items_dict: Dict = code_set_expression_items_json_all_rows[premade_codeset_id]
+        # update the payload with the codeset_id returned from the enclave
+
+        # DEBUG: Can use this to check to make sure code list is OK:
+        # if DEBUG:  # updated json data is saved in upd_cs_ver_expression_items_dict
+        cs_container_name = container_data_dict[
+            'parameters']['ri.actions.main.parameter.1b5cd6e9-b220-4551-b97d-245b9fa86807']['string']
+        print('csContainerName: ' + str(cs_container_name))
+        string_list = upd_cs_ver_expression_items_dict[
+            'parameters']['ri.actions.main.parameter.c9a1b531-86ef-4f80-80a5-cc774d2e4c33']['stringList']['strings']
+        print('premade_codeset_id: ' + str(premade_codeset_id))
+        print('len(stringList): ' + str(len(string_list)))
+        print('codeset_id: ' + str(codeset_id))
+        print('------------------------------')
+
+        # update the json data with the correct codeset_id -----
+        upd_cs_ver_expression_items_dict = \
+            update_cs_version_expression_data_with_codesetid(codeset_id, upd_cs_ver_expression_items_dict)
+        # Validate 3: add the concept set expressions to draft version by passing as code and code system
+        # third api
+        # https://unite.nih.gov/workspace/ontology/action-type/add-code-system-codes-as-omop-version-expressions/overview
+        # action type rid: ri.actions.main.action-type.e07f2503-c7c9-47b9-9418-225544b56b71
+        # noinspection PyUnusedLocal
+        # add expressionItems to version -----
+        response_json = post_request_enclave_api_addExpressionItems(header, upd_cs_ver_expression_items_dict)
+        print('post request to add expressionItems returned: ----------' + json.dumps(response_json))
+        # Once the expression items have been added, save the enclave codeset_id so that we can update the
+        # code_sets.csv file: update code_sets_df's enclave_codeset_id column with the codeset_id returned from the
+        # enclave. If needed, we can also save the json data in upd_cs_ver_expression_items_dict. premade_codeset_id
+        # is stored in the codeset_id column in the csv files; save the id in the enclave_codeset_id column, and
+        # update when it was uploaded as well. - Stephanie 3/15/22
+        try:
+            code_sets_df.set_index('codeset_id', inplace=True)
+        except Exception as e:
+            print(e)
+
+        code_sets_df.at[premade_codeset_id, 'enclave_codeset_id'] = codeset_id
+        code_sets_df.at[premade_codeset_id, 'enclave_codeset_id_updated_at'] = _datetime_palantir_format()
+        code_sets_df = code_sets_df.reset_index()
+        # return response_json
+
+    # write out the updated csv file with the enclave_codeset_id
+    # print('before terminating write out the updated code_sets.csv file here')
+    # date_str = datetime.now().strftime('%Y_%m_%d_%H_%M')
+    # output_filename = 'code_sets_updated_' + date_str + '.csv'
+    # TODO: fix: creates these columns depending on how we're doing indexing: Unnamed: 0, Unnamed: 0.1, etc
+    output_filename = 'code_sets.csv'
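+    # Note (sketch of the round trip): at this point code_sets_df carries both the local `codeset_id` and the
+    # enclave-assigned `enclave_codeset_id`; persist_to_db() later joins these back into the upload registry.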
+    code_sets_df.to_csv(os.path.join(input_csv_folder_path, output_filename), index=False, encoding='utf-8')
+
+    return code_sets_df
+
+
+def _load_standardized_input_df(
+    path: str, integer_id_fields=['enclave_codeset_id', 'codeset_id', 'internal_id']
+) -> pd.DataFrame:
+    """Loads known input dataframe in a standardized way:
+    - Sets data types
+    - Fill na values"""
+    df: pd.DataFrame = pd.read_csv(path).fillna('')
+
+    # Strip: remove the spaces at the beginning and the end of the column names
+    df.columns = df.columns.str.strip()
+
+    # Float->Int: For some reason, was being read with .0's at the end.
+    for field in [x for x in integer_id_fields if x in list(df.columns)]:
+        df[field] = pd.to_numeric(df[field], errors='coerce')\
+            .astype('Int64')
+
+    return df
+
+
+def left_join_update(df, df2):
+    """Does a left join, but where column names are the same, keeps right value if exists, else left value."""
+    new_df = df.merge(
+        df2, how='left', left_on='internal_id', right_on='codeset_id')
+
+    xy_col_counts: Dict[str, int] = {}
+    for col in list(new_df.columns):
+        if any([col.endswith(x) for x in ['_x', '_y']]):
+            common_col = col[:-2]
+            if common_col not in xy_col_counts:
+                xy_col_counts[common_col] = 0
+            xy_col_counts[common_col] += 1
+
+    xy_cols: List[str] = [k for k, v in xy_col_counts.items() if v == 2]
+    for col in xy_cols:
+        new_df[col] = new_df[col + '_y'].fillna(new_df[col + '_x'])
+        new_df = new_df.drop([col + '_x', col + '_y'], axis=1)
+
+    return new_df
+
+
+def persist_to_db(code_sets_df: pd.DataFrame) -> pd.DataFrame:
+    """Save updated items to persistence layer"""
+    # Vars
+    join_col = 'codeset_id'
+    update_cols = ['enclave_codeset_id', 'enclave_codeset_id_updated_at', 'concept_set_name']
+
+    # Read
+    persistence_df = _load_standardized_input_df(CSET_UPLOAD_REGISTRY_PATH)
+
+    # Subset
+    try:
+        code_sets_df_limited = code_sets_df[[join_col] + update_cols]
+    except KeyError:  # if KeyError, `join_col` is an index
+        code_sets_df_limited = code_sets_df[update_cols]
+    # Join
+    persistence_df_new = left_join_update(persistence_df, code_sets_df_limited)
+
+    # Save and return
+    persistence_df_new.to_csv(CSET_UPLOAD_REGISTRY_PATH, index=False)
+    return persistence_df_new
+
+
+def _save_csv(
+    df: pd.DataFrame, out_format_name: str, in_format_name: str, inpath: str, output_filename: str,
+    field_delimiter=','
+):
+    """Side effects: Save CSV"""
+    # date_str = datetime.now().strftime('%Y.%m.%d')
+    # out_dir = os.path.join(UPLOADS_DIR, output_name, source_name, date_str, 'output')
+    infile_stem = os.path.basename(inpath).replace('.csv', '').replace('.tsv', '')
+    out_dir = os.path.join(UPLOADS_DIR, in_format_name, infile_stem, 'transforms', out_format_name)
+    os.makedirs(out_dir, exist_ok=True)
+    output_format = 'csv' if field_delimiter == ',' else 'tsv' if field_delimiter == '\t' else 'txt'
+    outpath = os.path.join(out_dir, f'{output_filename}.{output_format}')
+    df.to_csv(outpath, sep=field_delimiter, index=False)
+
+
+def update_cset_upload_registry_moffit(
+    moffit_path: str, registry_path: str = CSET_UPLOAD_REGISTRY_PATH
+) -> pd.DataFrame:
+    """Update concept set registry file, specifically for moffit entries.
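+
+    Illustrative new row appended for an unregistered source id (values hypothetical; see the field list in the
+    body below): {'source_id_type': 'moffit', 'source_id': 42, 'internal_id': <max internal_id + 1>,
+    'internal_source': MOFFIT_SOURCE_URL}
+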
+    Side effects: Writes update to registry file w/ any new entries"""
+    moffit_df = pd.read_csv(moffit_path).fillna('')
+    registry_df = pd.read_csv(registry_path).fillna('')
+    registered_moffit_cset_df = registry_df[registry_df['source_id_type'] == MOFFIT_SOURCE_ID_TYPE]
+    registered_moffit_ids = [str(x) for x in registered_moffit_cset_df['source_id']]
+    moffit_dataset_cset_ids: Set[str] = set([str(int(x)) for x in moffit_df['concept_set_id'].unique() if x != ''])
+    new_moffit_cset_ids: List = list(set([x for x in moffit_dataset_cset_ids if x not in registered_moffit_ids]))
+    try:  # integers if possible
+        new_moffit_cset_ids = [int(x) for x in new_moffit_cset_ids]
+    except (TypeError, ValueError):
+        pass
+    new_moffit_cset_ids.sort()
+
+    new_rows = []
+    next_internal_id: int = max(registry_df['internal_id']) + 1
+    for _id in new_moffit_cset_ids:
+        new_rows.append({
+            'source_id_type': 'moffit',
+            'source_id': _id,
+            # 'source_id_field': '',
+            # 'oid': '',
+            # 'ccsr_code': '',
+            'internal_id': next_internal_id,
+            'internal_source': MOFFIT_SOURCE_URL,
+            # 'cset_source': '',
+            # 'grouped_by_bids': '',
+            # 'concept_id': '',
+            # 'codeset_id': '',
+            # 'enclave_codeset_id': '',
+            # 'enclave_codeset_id_updated_at': '',
+            # 'concept_set_name': '',
+        })
+        next_internal_id += 1
+
+    if len(new_rows) > 0:
+        new_entries_df = pd.DataFrame(new_rows).fillna('')
+        new_registry_df = pd.concat([registry_df, new_entries_df]).fillna('')
+        new_registry_df.to_csv(registry_path, index=False)
+        return new_registry_df
+    return registry_df  # if no new updates
+
+
+# todo: repurpose to moffit (if not defunct)
+#  - Make sure all cols are being used
+def transform_moffit_to_palantir3file(inpath: str) -> str:
+    """Transform Moffit format to Palantir 3 File format."""
+    # Vars
+    field_delimiter = ','
+    out_format_name = 'palantir-three-file'
+    in_format_name = 'moffit'
+    out_filename1 = 'concept_set_version_item_rv_edited'
+    out_filename2 = 'code_sets'
+    out_filename3 = 'concept_set_container_edited'
+    infile_stem = os.path.basename(inpath).replace('.csv', '').replace('.tsv', '')
+    out_dir = os.path.join(UPLOADS_DIR, in_format_name, infile_stem, 'transforms', out_format_name)
+
+    # Read inputs
+    inpath = os.path.join(PROJECT_ROOT, inpath) if inpath.startswith('termhub-csets') else inpath
+    # df = pd.read_csv(inpath, converters={'concept_set_id': int}).fillna('')  # fails because one row has '' for id
+    df = pd.read_csv(inpath).fillna('')
+    df = df[df['concept_set_id'] != '']
+    df['concept_set_id'] = df['concept_set_id'].astype(int)
+    df['concept_set_id'] = df['concept_set_id'].astype(str)
+    df = df.applymap(lambda x: x.strip())
+    moffit_cset_ids: Set[str] = set([str(int(x)) for x in df['concept_set_id'].unique() if x])
+
+    # Read/update registry
+    cset_upload_registry_df: pd.DataFrame = update_cset_upload_registry_moffit(inpath)
+    registered_moffit_cset_df = cset_upload_registry_df[
+        cset_upload_registry_df['source_id_type'] == MOFFIT_SOURCE_ID_TYPE]
+    moffit_id_internal_id_map: Dict[str, str] = dict(zip(
+        [str(x) for x in registered_moffit_cset_df['source_id']],
+        [str(x) for x in registered_moffit_cset_df['internal_id']]))
+
+    # Transform
+    # II. Create & save exports
+    _all = {}
+    # 1.
Palantir enclave table: concept_set_version_item_rv_edited + rows1 = [] + codeset_id__code__map = {} + for i, concept_row in df.iterrows(): + internal_id = moffit_id_internal_id_map[str(concept_row['concept_set_id'])] + code = concept_row['concept_code'] + code_system = concept_row['code_system'] + + # This will help us avoid duplicate codes in single concept set + if internal_id not in codeset_id__code__map: + codeset_id__code__map[internal_id] = [] + if code not in codeset_id__code__map[internal_id]: + codeset_id__code__map[internal_id].append(code) + else: + continue + + # The 3 fields isExcluded, includeDescendants, and includeMapped, are from OMOP but also in VSAC. If it has + # ...these 3 options, it is intensional. And when you execute these 3, it is now extensional / expansion. + # todo: Don't need concept_name? + row = { + 'codeset_id': internal_id, + 'concept_id': '', # leave blank for now + # + 'code': code, + 'codeSystem': code_system, + # + 'isExcluded': False, + 'includeDescendants': True if code_system == 'SNOMED' else False, + 'includeMapped': False, + 'item_id': str(uuid4()), # will let palantir verify ID is indeed unique + 'annotation': f'Curated value set: {MOFFIT_PREFIX}', + # 'created_by': 'DI&H Bulk Import', + 'created_by': PALANTIR_ENCLAVE_USER_ID_1, + 'created_at': _datetime_palantir_format() + } + row2 = {} + for k, v in row.items(): + row2[k] = v.replace('\n', ' - ') if type(v) == str else v + row = row2 + rows1.append(row) + df_code_set_members = pd.DataFrame(rows1) + _all[out_filename1] = df_code_set_members + _save_csv(df_code_set_members, out_format_name, in_format_name, inpath, out_filename1, field_delimiter) + + # 2. Palantir enclave table: code_sets + rows2 = [] + for moffit_id in moffit_cset_ids: + v = 1 + internal_id = moffit_id_internal_id_map[moffit_id] + cset_row = df[df['concept_set_id'] == moffit_id].iloc[0] + concept_set_name = f'[{MOFFIT_PREFIX}] ' + cset_row['concept_set_name'] + row = { + 'codeset_id': internal_id, + 'concept_set_name': concept_set_name, + 'concept_set_version_title': concept_set_name + f' (v{str(v)})', + 'project': ENCLAVE_PROJECT_NAME, # always use this project id for bulk import + 'source_application': '', + 'source_application_version': '', # nullable + 'created_at': _datetime_palantir_format(), + 'atlas_json': '', # nullable + 'is_most_recent_version': True, + 'version': v, + 'comments': '', + 'intention': '', # nullable + 'limitations': '', # nullable + 'issues': '', # nullable + 'update_message': 'Initial version.' if v == 1 else '', # nullable (maybe?) + # status field stats as appears in the code_set table 2022/01/12: + # 'status': [ + # '', # null + # 'Finished', + # 'In Progress', + # 'Awaiting Review', + # 'In progress', + # ][2], + # status field doesn't show this in stats in code_set table, but UI uses this value by default: + 'status': 'Under Construction', + 'has_review': '', # boolean (nullable) + 'reviewed_by': '', # nullable + 'created_by': PALANTIR_ENCLAVE_USER_ID_1, + 'provenance': MOFFIT_SOURCE_URL, + 'atlas_json_resource_url': '', # nullable + # null, initial version will not have the parent version so this field would be always null: + 'parent_version_id': '', # nullable + # True ( after the import view it from the concept set editor to review the concept set and click done. + # We can add the comments like we imported from VSAC and reviewed it from the concept set editor. ) + # 1. import 2. manual check 3 click done to finish the definition. 
+    # 3. Palantir enclave table: concept_set_container_edited
+    rows3 = []
+    for moffit_id in moffit_cset_ids:
+        internal_id = moffit_id_internal_id_map[moffit_id]
+        cset_row = df[df['concept_set_id'] == moffit_id].iloc[0]
+        concept_set_name = f'[{MOFFIT_PREFIX}] ' + cset_row['concept_set_name']
+        row = {
+            'concept_set_id': internal_id,
+            'concept_set_name': concept_set_name,
+            'project_id': '',  # nullable
+            'assigned_informatician': PALANTIR_ENCLAVE_USER_ID_1,  # nullable
+            'assigned_sme': PALANTIR_ENCLAVE_USER_ID_1,  # nullable
+            'status': ['Finished', 'Under Construction', 'N3C Validation Complete'][1],
+            'stage': [
+                'Finished',
+                'Awaiting Editing',
+                'Candidate for N3C Review',
+                'Awaiting N3C Committee Review',
+                'Awaiting SME Review',
+                'Under N3C Committee Review',
+                'Under SME Review',
+                'N3C Validation Complete',
+                'Awaiting Informatician Review',
+                'Under Informatician Review',
+            ][1],
+            'intention': '',  # nullable
+            'n3c_reviewer': '',  # nullable
+            'alias': None,  # '' better?
+            'archived': False,
+            # 'created_by': 'DI&H Bulk Import',
+            'created_by': PALANTIR_ENCLAVE_USER_ID_1,
+            'created_at': _datetime_palantir_format()
+        }
+        row = {k: v.replace('\n', ' - ') if isinstance(v, str) else v for k, v in row.items()}
+        rows3.append(row)
+    df_code_sets__container_variation = pd.DataFrame(rows3)
+    _all[out_filename3] = df_code_sets__container_variation
+    _save_csv(
+        df_code_sets__container_variation, out_format_name, in_format_name, inpath, out_filename3, field_delimiter)
+
+    return out_dir
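+
+
+# Usage sketch (illustrative; hypothetical input path):
+#   out_dir = transform_moffit_to_palantir3file('termhub-csets/datasets/moffit/moffit.csv')
+#   # out_dir will contain code_sets.csv, concept_set_container_edited.csv, and
+#   # concept_set_version_item_rv_edited.csv, suitable for post_to_enclave_from_3csv(out_dir).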
+
+
+# todo: clean up these comments; can probably remove
+# Going to do a new container instead; just keeping this code for a bit in case we want to revert to just a new
+# version.
+# def upload_cset_as_new_version_of_itself(
+#     codeset_id: int,
+#     add_to_field: Dict = {'provenance': f'Version for comparison to N3C-Rec on {date.today().isoformat()}'}
+# ) -> Dict:
+#     ov = fetch_cset_version(codeset_id, False)
+#
+#     vi = [i['properties'] for i in get_concept_set_version_expression_items(codeset_id, 'full')]
+#     concepts = []
+#     for item in vi:
+#         c = {'concept_id': item['conceptId']}
+#         for p in ['includeDescendants', 'isExcluded', 'includeMapped']:
+#             c[p] = item[p]
+#         concepts.append(c)
+#
+#     upload_args = {
+#         'on_behalf_of': ov.get('createdBy', ''),  # not allowing this for some csets
+#         # 'on_behalf_of': config['SERVICE_USER_ID'],
+#         'concept_set_name': ov.get('conceptSetNameOMOP', ''),
+#         'provenance': ov.get('provenance', ''),
+#         'limitations': ov.get('limitations', ''),
+#         'intention': ov.get('intention', ''),
+#         'parent_version_codeset_id': ov.get('codesetId', ''),
+#         'current_max_version': ov.get('version', ''),  # probably
+#         # 'codeset_id': None,  # will be assigned
+#         'validate_first': True,
+#         'omop_concepts': concepts,
+#         'finalize': True,
+#         # annotation,
+#         # intended_research_project,
+#     }
+#
+#     for key, value in add_to_field.items():
+#         val = '. ' + ov.get(key, '')
+#         val = value + val
+#         upload_args[key] = val
+#
+#     # upload_new_cset_version_with_concepts(concept_set_name, omop_concepts, provenance, limitations, intention,
+#     #   annotation, parent_version_codeset_id, current_max_version, intended_research_project, on_behalf_of,
+#     #   codeset_id, validate_first, finalize)
+#     # pass_on_args = ['conceptSetNameOMOP']  # not sure what this was for
+#     d = upload_new_cset_version_with_concepts(**upload_args)  # {'responses': [...], 'codeset_id': 123}
+#
+#     return d['versionId']
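+
+
+# A minimal sketch (hypothetical helper, not defined elsewhere in this module) of the
+# expression-item -> concept projection used both in the commented-out function above and in
+# upload_cset_copy_in_new_container() below:
+#
+#   def _items_to_concepts(codeset_id: int) -> List[Dict]:
+#       items = [i['properties'] for i in get_concept_set_version_expression_items(codeset_id, 'full')]
+#       return [{'concept_id': item['conceptId'],
+#                **{p: item[p] for p in ('includeDescendants', 'isExcluded', 'includeMapped')}}
+#               for item in items]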
+
+
+def upload_cset_copy_in_new_container(
+    codeset_id: int,
+) -> int:
+    """Upload a copy of a concept set version in a new container.
+
+    Returns the codeset_id (aka versionId) of the newly created version."""
+    ov = fetch_cset_version(codeset_id, False)
+
+    concept_set_name = ov.get('conceptSetNameOMOP', '')
+    oc = fetch_cset_container(concept_set_name)
+
+    vi = [i['properties'] for i in get_concept_set_version_expression_items(codeset_id, 'full')]
+    concepts = []
+    for item in vi:
+        c = {'concept_id': item['conceptId']}
+        for p in ['includeDescendants', 'isExcluded', 'includeMapped']:
+            c[p] = item[p]
+        concepts.append(c)
+
+    concept_set_name = f'{concept_set_name} -- Copy of {codeset_id} for comparison {date.today().isoformat()}'
+
+    c_creator = 'unknown'
+    c_created_by = oc.get('createdBy')
+    if c_created_by == config['SERVICE_USER_ID']:
+        c_creator = 'UNITEConceptSetBulk@nih.gov'
+    elif c_created_by:
+        c_creator = get_researcher(c_created_by).get('name', 'unknown')
+
+    v_creator = 'unknown'
+    v_created_by = ov.get('createdBy')
+    if v_created_by == config['SERVICE_USER_ID']:
+        v_creator = 'UNITEConceptSetBulk@nih.gov'
+    elif v_created_by:
+        v_creator = get_researcher(v_created_by).get('name', 'unknown')
+
+    container_args = {
+        'on_behalf_of': config['SERVICE_USER_ID'],
+        # 'on_behalf_of': c_created_by,
+        'concept_set_name': concept_set_name,
+        'intention': f"Copy for comparison of container created by {c_creator} on "
+                     f"{oc.get('createdAt', '**missing**')}. Orig intention: {oc.get('intention', '')}",
+        'research_project': oc.get('project', config['DIH_PROJ_ID']),
+        'assigned_sme': oc.get('assigned_sme', ''),
+        'assigned_informatician': oc.get('assigned_informatician', ''),
+    }
+    version_args = {
+        'on_behalf_of': config['SERVICE_USER_ID'],
+        # 'on_behalf_of': v_created_by,
+        'concept_set_name': concept_set_name,
+        'provenance': f"Copy for comparison of version created by {v_creator} on "
+                      f"{ov.get('createdAt', '**missing**')}. Orig provenance: {ov.get('provenance', '')}",
+        'limitations': ov.get('limitations', ''),
+        'intention': ov.get('intention', ''),
+        # 'parent_version_codeset_id': ov.get('codesetId', ''),
+        # 'current_max_version': ov.get('version', ''),  # probably
+        'current_max_version': None,
+        'codeset_id': None,  # will be assigned
+        # 'codeset_id': codeset_id * 100,  # to make it easy to see they're related -- was too big for api rules
+        'omop_concepts': concepts,
+        'finalize': True,
+        # annotation,
+        # intended_research_project,
+    }
+
+    d = upload_new_container_with_concepts(
+        **container_args, versions_with_concepts=[version_args],
+        fail_if_exists=False, validate_first=True)
+
+    container: Dict = d['upload_concept_set_container']
+    if not container:
+        raise Exception(f'Error uploading or fetching new container copy of {codeset_id}')
+    version: Dict = d['upload_new_cset_version_with_concepts']
+    if not version:
+        raise Exception(f'Error uploading or fetching new version copy of {codeset_id}')
+    if 'versionId' in version:
+        return version['versionId']
+    if 'codesetId' in version:
+        return version['codesetId']
+
+    raise Exception(f'Failed to find codeset_id of newly uploaded version of {codeset_id}')
+
+
+def make_new_versions_of_csets():
+    """Make copies of N3C Recommended csets, expanded against the newest vocabulary, for comparison."""
+    # insert_from_dict assumed to live alongside these DB helpers:
+    from backend.db.utils import insert_from_dict, sql_query_single_col, get_db_connection
+    from backend.routes.db import get_n3c_recommended_codeset_ids
+    with get_db_connection() as con:
+        comparisons_already_done = sql_query_single_col(con, 'select orig_codeset_id from public.codeset_comparison')
+
+    n3c_codeset_ids = set(get_n3c_recommended_codeset_ids()).difference(comparisons_already_done)
+
+    eastern = pytz.timezone('US/Eastern')
+    new_codeset_ids = []
+    with get_db_connection() as con:
+        for codeset_id in n3c_codeset_ids:
+            print(f'Making new version of {codeset_id}')
+            new_version_codeset_id = upload_cset_copy_in_new_container(codeset_id)
+            if not new_version_codeset_id:
+                continue
+            print(f'{codeset_id}, {new_version_codeset_id}')
+            new_codeset_ids.append(new_version_codeset_id)
+            row = {
+                'fetch_time': datetime.now(eastern).isoformat(),
+                'orig_codeset_id': codeset_id,
+                'new_codeset_id': new_version_codeset_id
+            }
+            insert_from_dict(con, 'public.codeset_comparison', row)
+
+
+def upload_dataset(input_path: str, format='palantir-three-file', use_cache=False):
+    """Upload a dataset to the enclave. (Main function.)"""
+    # todo: use_cache is currently unused; see the commented-out --use-cache CLI flag below
+    if format == 'moffit':
+        input_path = transform_moffit_to_palantir3file(input_path)
+    code_sets_df: pd.DataFrame = post_to_enclave_from_3csv(input_path)
+    persist_to_db(code_sets_df)
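+
+
+# Usage sketch (illustrative; hypothetical paths):
+#   upload_dataset('output/uploads/moffit/my_csets/transforms/palantir-three-file')  # already-transformed 3-file dir
+#   upload_dataset('termhub-csets/datasets/moffit/moffit.csv', format='moffit')  # transform first, then upload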
+
+
+def cli():
+    """Command line interface for package.
+
+    Side Effects: Executes program."""
+    package_description = 'Tool for uploading to the Palantir Foundry enclave.'
+    parser = ArgumentParser(description=package_description)
+
+    parser.add_argument(
+        '-n', '--n3c', default=False, action='store_true', required=False,
+        help='Create new versions of N3C Recommended csets for comparison after vocabulary updates.')
+    parser.add_argument(
+        '-p', '--input-path',
+        help='Path to file or folder to be parsed and uploaded.')
+    parser.add_argument(
+        '-f', '--format',
+        choices=['palantir-three-file', 'moffit'],
+        default='palantir-three-file',
+        help='The format of the file(s) to be uploaded.\n'
+             '- palantir-three-file: Path to a folder with 3 files whose columns adhere to the concept table data '
+             'model. These files must have the following names: i. `code_sets.csv`, ii. '
+             '`concept_set_container_edited.csv`, iii. `concept_set_version_item_rv_edited.csv`.\n'
+             '- moffit: Has columns concept_set_id, concept_set_name, concept_code, concept_name, code_system.')
+    # parser.add_argument(
+    #     '-c', '--use-cache',
+    #     action='store_true',
+    #     help='If present, will check the input file and look at the `enclave_codeset_id` column. If no empty values'
+    #          ' are present, this indicates that the `enclave_wrangler` has already been run and that the input file'
+    #          ' itself can be used as cached data. The only thing that will happen is an update to the persistence'
+    #          ' layer (`data/cset.csv` as of 2022/03/18).')
+    kwargs = parser.parse_args()
+    kwargs_dict: Dict = vars(kwargs)
+
+    if kwargs.n3c:
+        make_new_versions_of_csets()
+    else:
+        del kwargs_dict['n3c']  # upload_dataset() doesn't accept this arg; passing it would raise a TypeError
+        upload_dataset(**kwargs_dict)
+
+
+if __name__ == '__main__':
+    cli()
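+
+
+# Example invocations (illustrative; flag names per the parser above):
+#   python enclave_wrangler/dataset_upload.py --input-path <3-file folder> --format palantir-three-file
+#   python enclave_wrangler/dataset_upload.py --input-path <moffit csv> --format moffit
+#   python enclave_wrangler/dataset_upload.py --n3c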