diff --git a/.github/workflows/mopper-conda-release.yaml b/.github/workflows/mopper-conda-release.yaml
new file mode 100644
index 0000000..7e070d2
--- /dev/null
+++ b/.github/workflows/mopper-conda-release.yaml
@@ -0,0 +1,36 @@
+name: Build of mopper conda package for new release
+
+# Controls when the action will run.
+on:
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  conda_deployment_with_new_tag:
+    name: Test conda deployment of package with Python ${{ matrix.python-version }}
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4.1.7
+        with:
+          fetch-depth: 0
+      - name: Set env
+        run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+      - name: Conda environment creation and activation
+        uses: conda-incubator/setup-miniconda@v3.0.4
+        with:
+          python-version: ${{ matrix.python-version }}
+          activate-environment: mopper_env
+          environment-file: conda/environment.yaml    # Path to the build conda environment
+          show-channel-urls: true
+      - name: Build and upload the conda packages
+        uses: uibcdf/action-build-and-upload-conda-packages@v1.3.0
+        with:
+          meta_yaml_dir: conda
+          python-version: ${{ matrix.python-version }} # Values previously defined in `matrix`
+          user: coecms
+          label: auto
+          upload: true
+          token: ${{ secrets.ANACONDA_TOKEN }} # Replace with the right name of your secret
diff --git a/.github/workflows/mopper-pytest.yaml b/.github/workflows/mopper-pytest.yaml
new file mode 100644
index 0000000..44e2b5e
--- /dev/null
+++ b/.github/workflows/mopper-pytest.yaml
@@ -0,0 +1,61 @@
+name: mopper-all-tests
+
+#on: [push]
+on:
+  push:
+    branches:
+      - prerelease
+      - main
+  pull_request:
+    branches:
+      - main
+      - prerelease
+
+
+jobs:
+  build-linux:
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    strategy:
+      max-parallel: 5
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+
+    steps:
+      - uses: actions/checkout@v4.1.7
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5.1.1
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install Miniconda
+        uses: conda-incubator/setup-miniconda@v3.0.4
+        with:
+          activate-environment: testenv
+          environment-file: conda/testenv.yaml
+          python-version: ${{ matrix.python-version }}
+          channels: conda-forge
+      - name: Lint with ruff
+        shell: bash -el {0}
+        run: |
+          ruff check --output-format=github .
+        continue-on-error: true
+
+      # making sure we are testing the installed package
+      - name: Install package
+        shell: bash -el {0}
+        run: |
+          conda activate testenv
+          pip install -e .
+      - name: Test with pytest
+        shell: bash -el {0}
+        run: |
+          conda run python -m pytest
+          #conda run coverage run --source src -m py.test
+      # - name: Upload to codecov
+      #   shell: bash -el {0}
+      #   if: steps.build.outcome == 'success'
+      #   run: |
+      #     curl -Os https://uploader.codecov.io/latest/linux/codecov
+      #     chmod +x codecov
+      #     ./codecov
+
diff --git a/.github/workflows/mopper-test-calcs.yaml b/.github/workflows/mopper-test-calcs.yaml
new file mode 100644
index 0000000..0c34152
--- /dev/null
+++ b/.github/workflows/mopper-test-calcs.yaml
@@ -0,0 +1,49 @@
+# this workflow can be used as a template for a workflow
+# that runs automatically only specific tests when pushing
+# to a selected branch.
+name: mopper-specific-tests
+
+on:
+  push:
+    branches:
+      - class
+
+
+jobs:
+  build-linux:
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    strategy:
+      max-parallel: 5
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+
+    steps:
+      - uses: actions/checkout@v4.1.7
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5.1.1
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install Miniconda
+        uses: conda-incubator/setup-miniconda@v3.0.4
+        with:
+          activate-environment: testenv
+          environment-file: conda/testenv.yaml
+          python-version: ${{ matrix.python-version }}
+          channels: conda-forge
+      - name: Lint with ruff
+        shell: bash -el {0}
+        run: |
+          ruff check --output-format=github .
+        continue-on-error: true
+
+      # making sure we are testing the installed package
+      - name: Install package
+        shell: bash -el {0}
+        run: |
+          conda activate testenv
+          pip install -e .
+      - name: Test with pytest
+        shell: bash -el {0}
+        run: |
+          conda run python -m pytest -q tests/test_calculations.py
diff --git a/.github/workflows/mopper-test-conda.yaml b/.github/workflows/mopper-test-conda.yaml
new file mode 100644
index 0000000..6ab53d2
--- /dev/null
+++ b/.github/workflows/mopper-test-conda.yaml
@@ -0,0 +1,42 @@
+name: Test Build of mopper conda package
+
+# Controls when the action will run.
+on:
+  # Triggers the workflow on push or pull request events but only for the prerelease branch
+  push:
+    branches: [ prerelease ]
+  pull_request:
+    branches: [ prerelease ]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  conda_deployment_with_new_tag:
+    name: Test conda deployment of package with Python ${{ matrix.python-version }}
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4.1.7
+        with:
+          fetch-depth: 0
+      - name: Set env
+        run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+      - name: Conda environment creation and activation
+        uses: conda-incubator/setup-miniconda@v3.0.4
+        with:
+          python-version: ${{ matrix.python-version }}
+          activate-environment: mopper_env
+          environment-file: conda/environment.yaml    # Path to the build conda environment
+          show-channel-urls: true
+      - name: Build but do not upload the conda packages
+        uses: uibcdf/action-build-and-upload-conda-packages@v1.3.0
+        with:
+          meta_yaml_dir: conda
+          python-version: ${{ matrix.python-version }} # Values previously defined in `matrix`
+          user: coecms
+          label: auto
+          upload: false
+          token: ${{ secrets.ANACONDA_TOKEN }} # Replace with the right name of your secret
diff --git a/.gitignore b/.gitignore
index 3ba2e33..c0e81d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,10 @@
-custom_app4_*.sh
 __pycache__/
 build/
+mopper_venv/
+*.csv
+*.yaml
+*.json
+localdata/
+src/mopper.egg-info/
+extras/
+*.txt
diff --git a/ACDD_conf.yaml b/ACDD_conf.yaml
index d75312a..a86f373 100755
--- a/ACDD_conf.yaml
+++ b/ACDD_conf.yaml
@@ -83,16 +83,25 @@ cmor:
     grids: "ACDD_grids.json"
     # Additional NCI information:
     # NCI project to charge compute; $PROJECT = your default project
-    # NCI queue to use; hugemem is recommended
     project: v45
     # additional NCI projects to be included in the storage flags
     addprojs: []
-    # queue and memory (GB) per CPU (depends on queue)
+    # queue and memory (GB) per CPU (depends on queue),
+    # hugemem is recommended for high resolution data and/or derived variables
+    # hugemem requires a minimum of 6 CPUs; this is handled by the code
     queue: hugemem
     mem_per_cpu: 32
     # walltime in "hh:mm:ss"
     walltime: '8:00:00'
    mode: custom
+    # conda_env to use; by default hh5 analysis3-unstable
+    # as this has the code and all dependencies installed
+    # you can override that by supplying the env to pass to "source"
+    # e.g.
+    # conda_env: <path-to-env>/bin/activate
+    # or you can set "test: true" and modify mopper_job.sh manually
+    conda_env: default
+    #
 
 # Global attributes: these will be added to each files comment unwanted ones
 # Using ACDD CV vocab to check validity of global attributes
@@ -165,4 +174,4 @@ attrs:
     parent: !!bool false
     # CMOR will add a tracking_id if you want to define a prefix add here
     tracking_id_prefix:
-    comment: "post-processed using ACCESS-MOPPeR v0.6.0 https://doi.org/10.5281/zenodo.10346216"
+    comment: "post-processed using ACCESS-MOPPeR v1.0.0 https://doi.org/10.5281/zenodo.10346216"
diff --git a/CMIP6_conf.yaml b/CMIP6_conf.yaml
index fd5f14b..c421df0 100755
--- a/CMIP6_conf.yaml
+++ b/CMIP6_conf.yaml
@@ -84,11 +84,15 @@ cmor:
     # additional NCI projects to be included in the storage flags
     addprojs: []
     # queue and memory (GB) per CPU (depends on queue)
+    # hugemem is recommended for high resolution data and/or derived variables
+    # hugemem requires a minimum of 6 CPUs; this is handled by the code
     queue: hugemem
     mem_per_cpu: 30
     # walltime in "hh:mm:ss"
     walltime: '8:00:00'
     mode: cmip6
+    # conda_env: <path-to-env>/bin/activate
+    conda_env: default
 
 # Global attributes: these will be added to each files comment unwanted ones
 # the labels CMIP6/ACDD indicates which ones are necessary to comply with respective standards
@@ -159,4 +163,4 @@ attrs:
     #CMOR will add a tracking_id if you want to define a prefix add here
     tracking_id_prefix:
     Conventions: "CF-1.7 CMIP-6.2"
-    comment: "post-processed using ACCESS-MOPPeR v0.6.0 https://doi.org/10.5281/zenodo.10346216"
+    comment: "post-processed using ACCESS-MOPPeR v1.0.0 https://doi.org/10.5281/zenodo.10346216"
diff --git a/README.md b/README.md
index 159a3c5..12b0d94 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # [ACCESS Model Output Post-Processor (MOPPeR)](https://access-mopper.readthedocs.io/en/latest)
 [![Read the docs](https://readthedocs.org/projects/access-mopper/badge/?version=latest)](https://access-mopper.readthedocs.io/en/latest/)
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10976467.svg)](https://doi.org/10.5281/zenodo.10976467)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12747219.svg)](https://doi.org/10.5281/zenodo.12747219)
 
 This code is derived from the [APP4](https://doi.org/10.5281/zenodo.7703469), initially created by Peter Uhe for CMIP5, and further developed for CMIP6-era by Chloe Mackallah from CSIRO, O&A Aspendale.
 
@@ -15,12 +15,13 @@
 Designed for use on ACCESS model output that has been archived using the [ACCESS Archiver](https://github.com/ACCESS-Hive/ACCESS-Archiver).
 
 Although we retained a differentiation between `custom` and `cmip` mode the main workflow is the same and `mode` is now only another field in the main configuration file.
+See [MOPPeR ReadtheDocs](https://access-mopper.readthedocs.io/en/stable/) for the full documentation.
 
 ### Install
 
-You can install the latest version of `mopper` directly from conda (accessnri channel)::
+You can install the latest version of `mopper` directly from conda (coecms channel)::
 
-    conda install -c accessnri mopper
+    conda install -c coecms mopper
 
 If you want to install an unstable version or a different branch:
 
@@ -35,6 +36,6 @@ If you want to install an unstable version or a different branch:
 MOPPeR is pre-installed into a Conda environment at NCI. Load it with::
 
     module use /g/data3/hh5/public/modules
-    module load conda/analysis3-unstable
+    module load conda/analysis3
 
 NB. You need to be a member of the hh5 project to load the modules.
diff --git a/conda/environment.yaml b/conda/environment.yaml
new file mode 100644
index 0000000..62adf87
--- /dev/null
+++ b/conda/environment.yaml
@@ -0,0 +1,10 @@
+name: mopper_env
+channels:
+  - conda-forge
+  - coecms
+  - defaults
+
+dependencies:
+  - anaconda-client
+  - conda-build
+  - conda-verify
diff --git a/conda/meta.yaml b/conda/meta.yaml
index b0cb321..fff9987 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,7 +1,6 @@
-{% set version = "0.6.1" %}
 package:
   name: mopper
-  version: {{ version }}
+  version: "{{ environ.get('GIT_DESCRIBE_TAG', '1.0') }}"
 
 #source:
 #  path: ./
@@ -9,11 +8,12 @@ package:
 source:
   #url: https://github.com/ACCESS-Hive/ACCESS-MOPPeR/archive/refs/tags/{{version}}.tar.gz
   git_url: https://github.com/ACCESS-Hive/ACCESS-MOPPeR.git
-  git_rev: {{ version }}
-  git_depth: 1 # (Defaults to -1/not shallow)
+  #git_tag: prerelease
+  git_rev: "{{ version }}"
+  #git_depth: 1 # (Defaults to -1/not shallow)
 
 build:
-  number: 0
+  number: 1
   noarch: python
   script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed"
   entry_points:
@@ -24,7 +24,6 @@ requirements:
   host:
     - python
     - pip
-    - pbr
   run:
     - python
     - click
@@ -35,12 +34,29 @@ requirements:
     - pyyaml
     - cftime
     - python-dateutil
+test:
+  #imports:
+  #  - mopdb
+  #  - mopper
  source_files:
+    - tests
     - tests/testdata
+  requires:
+    - cmor
+    - xarray
+    - numpy
+    - dask
+    - pyyaml
+    - cftime
+    - python-dateutil
+    - pytest
+    - pyfakefs
+    - coverage
+    - codecov
 
 about:
   home: https://github.com/ACCESS-Hive/ACCESS-MOPPeR
   license: Apache 2.0
   #license_file: LICENSE.txt
-  summary: 'ACCESS-MOPPeR post-process ACCESS raw model output to ESGF data standards'
+  summary: 'ACCESS-MOPPeR post-processes ACCESS raw model output using CMOR and pre-defined data standards'
diff --git a/conda/run_test.sh b/conda/run_test.sh
index 21da92a..1c3c29b 100644
--- a/conda/run_test.sh
+++ b/conda/run_test.sh
@@ -1,2 +1,3 @@
 #!/bin/bash
-py.test
+echo 'calling run_test'
+python -m pytest
diff --git a/conda/run_test_coverage.sh b/conda/run_test_coverage.sh
index 4da4a69..476fc38 100644
--- a/conda/run_test_coverage.sh
+++ b/conda/run_test_coverage.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 pip install coverage pytest-cov
-py.test --cov=mopper --cov-report xml:/tmp/artefacts/tests/pytest/coverage.xml --junit-xml /tmp/artefacts/tests/pytest/results.xml
-py.test --cov=mopdb --cov-report xml:/tmp/artefacts/tests/pytest/coverage.xml --junit-xml /tmp/artefacts/tests/pytest/results.xml
+python -m pytest --cov=mopper --cov-report xml:/tmp/artefacts/tests/pytest/coverage.xml --junit-xml /tmp/artefacts/tests/pytest/results.xml
+python -m pytest --cov=mopdb --cov-report xml:/tmp/artefacts/tests/pytest/coverage.xml --junit-xml /tmp/artefacts/tests/pytest/results.xml
diff --git a/conda/testenv.yaml b/conda/testenv.yaml
new file mode 100644
index 0000000..7fa8dd5
--- /dev/null
+++ b/conda/testenv.yaml
@@ -0,0 +1,18 @@
+name: testenv
+channels:
+  - conda-forge
+
+dependencies:
+  - cmor
+  - click
+  - xarray
+  - numpy
+  - dask
+  - pyyaml
+  - cftime
+  - python-dateutil
+  - pytest
+  - coverage
+  - codecov
+  - pyfakefs
+  - ruff
diff --git a/docs/cmor_conf.yaml b/docs/cmor_conf.yaml
index 8ff286d..5c93d65 100755
--- a/docs/cmor_conf.yaml
+++ b/docs/cmor_conf.yaml
@@ -93,3 +93,4 @@ cmor:
     # walltime in "hh:mm:ss"
     walltime: '8:00:00'
     mode: custom
+    conda_env: /g/data/.../mopper_env/bin/activate
diff --git a/docs/conf.py b/docs/conf.py
index 1f21a6f..bd47e1c 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -12,9 +12,6 @@
 # All configuration values have a default; values that are commented out
 # serve to show the default.
 
-import sys
-import os
-
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 
diff --git a/docs/gettingstarted.rst b/docs/gettingstarted.rst
index 397652e..62aabd1 100644
--- a/docs/gettingstarted.rst
+++ b/docs/gettingstarted.rst
@@ -1,48 +1,42 @@
 Starting with MOPPeR
 ====================
 
-A typical workflow to post-process an ACCESS or UM model output requires three steps.
-
-Step1: get a list of variables from the raw output
---------------------------------------------------
-
-   *mopdb varlist -i <output path> -d <date>*
-
-`mopdb varlist` will output one or more `csv` files with a detailed list of variables, one list for each pattern of output files.
-
-.. code-block:: console
-
-   $ mopdb varlist -i /scratch/../exp -d 20120101
-   Opened database ~/.local/lib/python3.10/site-packages/data/access.db successfully
-   Variable list for ocean_scalar.nc- successfully written
-   Variable list for ocean_month.nc- successfully written
-   Variable list for ocean_daily.nc- successfully written
-
-.. csv-table:: Example of varlist output
-   :file: varlist_example.csv
-   :delim: ;
-
-The <date> argument is used to reduce the number of files to check. The tool will recognise anyway a repeated pattern and only add a list of variable for the same pattern once.
+A typical workflow to post-process an ACCESS or UM model output requires two steps.
+The first step creates the mapping for a specific simulation; it is done only once for an experiment.
+The second step is to set up and run the actual post-processing.
 
-Step2: create a template for a mapping file
+Step1: create a template for a mapping file
 -------------------------------------------
 
-   *mopdb template -i <varlist output> -v <access version> -a <alias>*
+   *mopdb template -f <output path> -m <match> -v <access version> -a <alias>*
 
.. code-block:: console
 
-   $ mopdb template -f ocean.csv -v OM2 -a ocnmon
-   Opened database ~/.local/lib/python3.10/site-packages/data/access.db successfully
-   Derived variables: {'msftyrho', 'msftmrho', 'hfds', 'msftmz', 'msftyz'}
-   Changing advectsweby-CM2_mon units from Watts/m^2 to W m-2
-   Changing areacello-CMIP6_Ofx units from m^2 to m2
-   Variable difvho-CM2_Omon not found in cmor table
+   $ mopdb template -f /scratch/.../exp1/atmos -m 095101 -v CM2 -a exp1
+   Opened database /home/581/pxp581/.local/lib/python3.10/site-packages/data/access.db successfully
+   Found more than 1 definition for fld_s16i222:
+   [('psl', 'AUS2200', 'AUS2200_A10min', '10minPt'), ('psl', 'AUS2200', 'AUS2200_A1hr', '1hr')]
+   Using psl from AUS2200_A10min
+   Variable list for cw323a.pm successfully written
+   Opened database /home/581/pxp581/.local/lib/python3.10/site-packages/data/access.db successfully
+   Derived variables: {'treeFracBdlEvg', 'grassFracC4', 'shrubFrac', 'prc', 'mrsfl', 'landCoverFrac', 'mmrbc', 'mmrso4', 'theta24', 'sftgif', 'treeFracNdlEvg', 'snw', 'rtmt', 'nwdFracLut', 'sifllatstop', 'prw', 'mrfso', 'rlus', 'mrsll', 'baresoilFrac', 'c4PftFrac', 'wetlandFrac', 'mrro', 'c3PftFrac', 'treeFracBdlDcd', 'od550lt1aer', 'treeFracNdlDcd', 'residualFrac', 'wetss', 'sbl', 'vegFrac', 'rsus', 'cropFrac', 'mmrdust', 'grassFrac', 'mmrss', 'od550aer', 'hus24', 'dryss', 'fracLut', 'mrlso', 'mc', 'od440aer', 'grassFracC3', 'nep', 'mmroa', 'cropFracC3', 'snm', 'agesno'}
+   Changing cl-CMIP6_Amon units from 1 to %
+   Changing cli-CMIP6_Amon units from 1 to kg kg-1
+   Changing clt-CMIP6_Amon units from 1 to %
+   Changing clw-CMIP6_Amon units from 1 to kg kg-1
+   Variable husuvgrid-CM2_mon not found in cmor table
+   ...
 
 `mopdb template` takes as input:
- * the output/s of `varlist`
-   To get one template for the all variable concatenate the output on `varlist` into one file first.
- * the access version to use as preferred
- * an optional alias, if omitted the varlist filename will be used. Based on the example: `map_ocnmon.csv` or `map_ocean.csv` if omitted.
+ * -f/--fpath <path>: the path to the model output
+ * -m/--match <string>: used to identify files' patterns. The tool will only add a list of variables for the same pattern once.
+ * -v/--version <version>: the access version to use as preferred mapping. ESM1.5, CM2, OM2 and AUS2200 are currently available.
+ * -a/--alias <alias>: an optional alias; if omitted, default names will be used for the output files.
+
+Alternatively a list of variables can be created separately using the *varlist* command and its output can be passed directly to template via the *fpath* option.
+
+   *mopdb template -f <varlist file> -v <access version> -a <alias>*
 
 It produces a csv file with a list of all the variables from raw output mapped to cmip style variables.
 These mappings also take into account the frequency and include variables that can be potentially calculated with the listed fields. The console output lists these, as shown above.
 The mappings can be different between different version and/or configurations of the model.
 Starting with version 0.6 the list includes matches based on the standard_name, as these rows often list more than one option per field, it's important to either edit or remove these rows before using the mapping file.
 The :doc:`Customing section <custom>` covers what to do for an experiment using a new configuration which is substantially different from the ones which are available.
+It also provides an intermediate varlist_<alias>.csv file that shows the information derived directly from the files.
+This can be useful to debug in case of issues with the mapping. This file is checked before the mapping step to make sure the tool has detected sensible frequency and realm; if the check fails, the mapping won't proceed, but the varlist file can be edited appropriately.
 
 .. warning::
     Always check that the resulting template is mapping the variables correctly. This is particularly true for derived variables.
     Comment lines are inserted to give some information on what assumptions were done for each group of mappings.
 
-Step3: Set up the working environment
+Step2: Set up the working environment
 -------------------------------------
 
-   *mop -c <conf yaml> setup*
+   *mop setup -c <conf yaml>*
 
 .. code-block:: console
 
-   $ mop -c exp_conf.yaml setup
+   $ mop setup -c exp_conf.yaml
    Simulation to process: cy286
    Setting environment and creating working directory
    Output directory '/scratch/v45/pxp581/MOPPER_output/cy286' exists.
diff --git a/docs/mopdb_command.rst b/docs/mopdb_command.rst
index 32d712c..f60d958 100644
--- a/docs/mopdb_command.rst
+++ b/docs/mopdb_command.rst
@@ -11,6 +11,7 @@ This module is used to manage the mapping of raw output to CMIP style variables.
 
 - **varlist** creates an initial list of variables and attributes based on actual files
 - **template** uses the above list to generate a template of mappings to use in the processing
+- **intake** uses the mappings to create an intake catalogue of the raw model output
 - **cmor** populates the database cmor variables table
 - **map** populates the database mappings table
 - **check** checks a variable list against the cmor database table to individuate variables without a definition
 
@@ -54,28 +55,19 @@ e.g. use aus2200 for mappings related to the AUS2200 configuration:
 
 A user that wants to create a mapping table for another AUS2200 simulation can use this value to select appropriate mappings (see how to do that below).
 
-Get a list of variables from the model output
----------------------------------------------
-.. code-block::
-
-   mopdb varlist -i <output path> -d <date>
-
-this will create for each output file a list of variables with useful attributes
-These can be concatenated into one or used to create separate mappings.
+Create a mapping file
+---------------------
 
+This can be done by providing the model output path and a pattern to match, or directly a varlist file.
 
-.. _varlist example:
-.. dropdown:: Example output of varlist
+From output path:
+
+.. code-block::
 
-   name;cmor_var;units;dimensions;frequency;realm;cell_methods;cmor_table;vtype;size;nsteps;filename;long_name;standard_name
-   fld_s00i004;theta;K;time model_theta_level_number lat lon;mon;atmos;area: time: mean;CM2_mon;float32;9400320;12;cw323a.pm;THETA AFTER TIMESTEP;air_potential_temperature
-   fld_s00i010;hus;1;time model_theta_level_number lat lon;mon;atmos;area: time: mean;CMIP6_Amon;float32;9400320;12;cw323a.pm;SPECIFIC HUMIDITY AFTER TIMESTEP;specific_humidity
-   fld_s00i024;ts;K;time lat lon;mon;atmos;area: time: mean;CMIP6_Amon;float32;110592;12;cw323a.pm;SURFACE TEMPERATURE AFTER TIMESTEP;surface_temperature
-   fld_s00i030;;1;time lat lon;mon;atmos;area: time: mean;;float32;110592;12;cw323a.pm;LAND MASK (No halo) (LAND=TRUE);land_binary_mask
-   fld_s00i031;siconca;1;time lat lon;mon;atmos;area: time: mean;CMIP6_SImon;float32;110592;12;cw323a.pm;FRAC OF SEA ICE IN SEA AFTER TSTEP;sea_ice_area_fraction
-   ...
+   mopdb template -f <output path> -m <match> -v <access version>
+
+From varlist file:
 
-Create a mapping file starting from variable list
--------------------------------------------------
 .. code-block::
 
    mopdb template -f <varlist file> -v <access version>
 
@@ -119,6 +111,63 @@
 The other groups of records require checking, as either the version or the frequency don't correspond.
 
    ...
 
+
+Create an intake catalogue
+--------------------------
+
+This represents an extra step on top of the mapping, so it can be started directly from an existing mapping, or from scratch by providing the model output path and a match.
+
+From output path:
+
+.. code-block::
+
+   mopdb intake -f <output path> -m <match> -v <access version> { -a <alias> }
+
+From varlist file:
+
+.. code-block::
+
+   mopdb intake -f <output path> -fl <varlist file> -v <access version> { -a <alias> }
+
+From mapping file:
+
+.. code-block::
+
+   mopdb intake -f <output path> -fl <mapping file> -v <access version> { -a <alias> }
+
+NB the model output path is still needed even when passing an existing mapping or variable list.
+
+`intake` will generate:
+
+* intake_<alias>.yaml - the main intake catalogue;
+* intake_<alias>.json - the intake-esm catalogue;
+* catalogue.csv.xz - a csv file containing a list of the assets.
+
+The esm-catalogue is a multi-variable catalogue, which means that each file can have more than one variable, as is usual for raw model output. While each file contains a lot of variables, a user can select just one or a few, and only these will be loaded as an xarray dataset. This is helpful with the UM output, where variables with different dimensions can co-exist in a file. In such a case, it's necessary to use preprocess to select variables with consistent dimensions and avoid concatenation issues. As this is the standard behaviour for multi-variable intake-esm catalogues, the user doesn't need to worry about it.
+
+The esm-intake catalogue also lists separately each variable that can be mapped to a cmor name and/or standard_name. This makes it possible to use the cmor names and/or standard_names more effectively to query the data.
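+
+As a minimal sketch of how the generated catalogue might then be used (the
+file name ``intake_exp1.json`` and the search values below are hypothetical,
+not something the tool guarantees):
+
+.. code-block:: python
+
+    import intake
+
+    # open the intake-esm catalogue written by `mopdb intake`
+    cat = intake.open_esm_datastore("intake_exp1.json")
+    # query on the catalogue columns, e.g. mapped cmor name and frequency
+    subset = cat.search(variable="psl", frequency="1hr")
+    # load only the selected variable(s) as xarray datasets;
+    # a preprocess function could be passed here to subset variables
+    # with consistent dimensions, e.g. to_dataset_dict(preprocess=func)
+    dsets = subset.to_dataset_dict()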
+
+Get a list of variables from the model output
+---------------------------------------------
+
+.. code-block::
+
+   mopdb varlist -f <output path> -m <match>
+
+This will create a list of variables with useful attributes.
+
+.. _varlist example:
+.. dropdown:: Example output of varlist
+
+   name;cmor_var;units;dimensions;frequency;realm;cell_methods;cmor_table;vtype;size;nsteps;filename;long_name;standard_name
+   #cw323a.pm
+   fld_s00i004;theta;K;time model_theta_level_number lat lon;mon;atmos;area: time: mean;CM2_mon;float32;9400320;12;cw323a.pm;THETA AFTER TIMESTEP;air_potential_temperature
+   fld_s00i010;hus;1;time model_theta_level_number lat lon;mon;atmos;area: time: mean;CMIP6_Amon;float32;9400320;12;cw323a.pm;SPECIFIC HUMIDITY AFTER TIMESTEP;specific_humidity
+   fld_s00i024;ts;K;time lat lon;mon;atmos;area: time: mean;CMIP6_Amon;float32;110592;12;cw323a.pm;SURFACE TEMPERATURE AFTER TIMESTEP;surface_temperature
+   fld_s00i030;;1;time lat lon;mon;atmos;area: time: mean;;float32;110592;12;cw323a.pm;LAND MASK (No halo) (LAND=TRUE);land_binary_mask
+   fld_s00i031;siconca;1;time lat lon;mon;atmos;area: time: mean;CMIP6_SImon;float32;110592;12;cw323a.pm;FRAC OF SEA ICE IN SEA AFTER TSTEP;sea_ice_area_fraction
+   ...
+
+Doing this step separately can be useful if the model output uses a non-standard directory structure, as in that case important attributes like frequency and realm, which are used for the mapping, are more likely to be incorrect or missing. It can then be more efficient to process different kinds of files separately first, making sure frequency and realm are correct, and then to combine them into one file to pass to template, as sketched below.
+The template command will stop execution if it detects potentially wrong values for these fields and save the intermediate varlist file so that it can be corrected.
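+
+For example (the paths, match strings and alias below are purely illustrative):
+
+.. code-block:: console
+
+    $ mopdb varlist -f /scratch/.../exp1/atmos -m 095101
+    $ mopdb varlist -f /scratch/.../exp1/ocean -m ocean_month
+    $ # check frequency and realm in the varlist files, edit if needed, then combine
+    $ cat varlist_*.csv > varlist_exp1.csv
+    $ mopdb template -f varlist_exp1.csv -v CM2 -a exp1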
 
 Check which variables aren't yet defined
 ----------------------------------------
 .. code-block:: console
 
diff --git a/docs/overview.rst b/docs/overview.rst
index 908db06..f074224 100644
--- a/docs/overview.rst
+++ b/docs/overview.rst
@@ -1,25 +1,16 @@
 Install
 =======
 
-You can install the latest version of `mopper` directly from conda (accessnri channel)::
+We are planning to release ACCESS-MOPPeR on conda soon; it will then be available at NCI in our conda environments.
+In the meantime, you can create a custom environment and install mopper following these steps:
 
-    conda install -c accessnri mopper
+1. module load conda/analysis3
+2. python -m venv mopper_env --system-site-packages
+3. source <path>/mopper_env/bin/activate
+4. pip install git+https://github.com/ACCESS-Community-Hub/ACCESS-MOPPeR@main
+
+The source command will activate the environment you just created.
+Any time you want to use the tool in a new session repeat the first and third steps.
 
-If you want to install an unstable version or a different branch:
+The `pip` command above will install from the main branch; you can also indicate a different branch.
 
- * git clone
- * git checkout (if installing a a different branch from master)
- * cd mopper
- * pip install ./
-   use --user flag if you want to install it in ~/.local
-
-Working on the NCI server
--------------------------
-
-MOPPeR is pre-installed into a Conda environment at NCI. Load it with::
-
-    module use /g/data3/hh5/public/modules
-    module load conda/analysis3-unstable
-
-.. note::
-    You need to be a member of the hh5 project to load the modules.
diff --git a/mappings/map_AUS2200.csv b/mappings/map_AUS2200.csv
index 06e3df1..7f7aec5 100644
--- a/mappings/map_AUS2200.csv
+++ b/mappings/map_AUS2200.csv
@@ -1,4 +1,4 @@
-#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;filename;long_name;standard_name
+#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;fpattern;long_name;standard_name
 amdry;fld_s30i403;;kg m-2;time lat lon;10minPt;atmos;area: time: point;;AUS2200_A10min;AUS2200;;float32;22048000;2304;umnsa_spec;TOTAL COLUMN DRY MASS RHO GRID;
 amwet;fld_s30i404;;kg m-2;time lat lon;10minPt;atmos;area: time: point;;AUS2200_A10min;AUS2200;;float32;22048000;2304;umnsa_spec;TOTAL COLUMN WET MASS RHO GRID;atmosphere_mass_per_unit_area
 cl;fld_s00i265;level_to_height(var[0],levs=(0,66));1;time model_theta_level_number lat lon;1hrPt;atmos;area: time: point;;AUS2200_A1hr;AUS2200;float32;1543360000;384;umnsa_cldrad;AREA CLOUD FRACTION IN EACH LAYER;cloud_area_fraction_in_atmosphere_layer
@@ -100,5 +100,6 @@ wsgmax10m;fld_s03i463;;m s-1;time lat lon;10minPt;atmos;area: time: point;;AUS22
 wsgmax10m_max;fld_s03i463_max;;m s-1;time_0 lat lon;10min;atmos;area: time: maximum;;AUS2200_A10min;AUS2200;;float32;22048000;2304;umnsa_spec;WIND GUST;wind_speed_of_gust
 z0;fld_s00i026;;m;time lat lon;1hrPt;atmos;area: time: point;;AUS2200_A1hr;AUS2200;;float32;22048000;384;umnsa_slv;ROUGHNESS LENGTH AFTER TIMESTEP;surface_roughness_length
 zfull;fld_s15i101;level_to_height(var[0],levs=(0,66));m;time_0 model_theta_level_number lat lon_0;1hrPt;atmos;area: time: point;;AUS2200_A1hr;AUS2200;float32;1543360000;384;umnsa_mdl;H OF THETA MODEL LEVS FROM SEA LEVEL;height_above_reference_ellipsoid
+zg16;fld_s16i202;;m;time pressure lat lon;3hrPt;atmos;area: time: point;;AUS2200_A3hr;AUS2200;float32;352768000;114;flreduced_;GEOPOTENTIAL HEIGHT ON P LEV/P GRID;geopotential_height
 zmla;fld_s00i025;;m;time_0 lat lon;1hr;atmos;area: time:
mean;;AUS2200_A1hr;AUS2200;float32;22048000;384;umnsa_slv;BOUNDARY LAYER DEPTH AFTER TIMESTEP;atmosphere_boundary_layer_thickness zmla;fld_s00i025;;m;time lat lon;10minPt;atmos;area: time: point;;AUS2200_A10min;AUS2200;float32;22048000;2304;umnsa_spec;BOUNDARY LAYER DEPTH AFTER TIMESTEP;atmosphere_boundary_layer_thickness diff --git a/mappings/map_aerosol_CM2.csv b/mappings/map_aerosol_CM2.csv index 4c7b1b3..18d671a 100644 --- a/mappings/map_aerosol_CM2.csv +++ b/mappings/map_aerosol_CM2.csv @@ -1,4 +1,4 @@ -#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;filename;long_name;standard_name +#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;fpattern;long_name;standard_name abs550aer;fld_s02i240 fld_s02i241 fld_s02i242 fld_s02i243 fld_s02i585;optical_depth(var,3);1;time pseudo_level_0 lat lon;mon;aerosol;area: time: mean;;CMIP6_AERmon;CM2;float32;663552;12;cw323a.pm;Ambient Aerosol Absorption Optical Thickness at 550nm;atmosphere_absorption_optical_thickness_due_to_ambient_aerosol_particles dryss;fld_s38i218 fld_s38i219;calc_depositions(var);kg m-2 s-1;time model_theta_level_number lat lon;mon;aerosol;area: time: mean;;CMIP6_AERmon;CM2;float32;9400320;12;cw323a.pm;Dry Deposition Rate of Sea-Salt Aerosol;minus_tendency_of_atmosphere_mass_content_of_sea_salt_dry_aerosol_particles_due_to_dry_deposition lwp;fld_s30i405;;kg m-2;time lat lon;mon;aerosol;area: time: mean;;CMIP6_AERmon;CM2;float32;110592;12;cw323a.pm;TOTAL COLUMN QCL RHO GRID;atmosphere_cloud_liquid_water_content diff --git a/mappings/map_atmos_CM2.csv b/mappings/map_atmos_CM2.csv index 5d6f369..b2cc1c3 100644 --- a/mappings/map_atmos_CM2.csv +++ b/mappings/map_atmos_CM2.csv @@ -1,4 +1,4 @@ -#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;filename;long_name;standard_name +#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;fpattern;long_name;standard_name amdry;fld_s30i403;;kg m-2;time lat lon;mon;atmos;area: time: mean;;CM2_mon;CM2;float32;110592;12;cw323a.pm;TOTAL COLUMN DRY MASS RHO GRID;atmosphere_mass_per_unit_area amwet;fld_s30i404;;kg m-2;time lat lon;mon;atmos;area: time: mean;;CM2_mon;CM2;float32;110592;12;cw323a.pm;TOTAL COLUMN WET MASS RHO GRID;atmosphere_mass_per_unit_area ci;fld_s05i269;;1;time lat lon;mon;atmos;area: time: mean;;CMIP6_Amon;CM2;float32;110592;12;cw323a.pm;deep convection indicator; diff --git a/mappings/map_land_CM2.csv b/mappings/map_land_CM2.csv index 92da281..af1f539 100644 --- a/mappings/map_land_CM2.csv +++ b/mappings/map_land_CM2.csv @@ -1,4 +1,4 @@ -#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;filename;long_name;standard_name +#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;fpattern;long_name;standard_name agesno;fld_s03i832 fld_s03i317;average_tile(var[0],tilefrac=var[1]);mon;time pseudo_level_1 lat lon;mon;landIce land;area: time: mean;;CMIP6_LImon;CM2;float32;1880064;12;cw323a.pm;CABLE SNOW AGE ON TILES;age_of_surface_snow baresoilFrac;fld_s03i317 fld_s03i395;extract_tilefrac(var[0],14,landfrac=var[1],lev='typebare');1;time lat lon;mon;land;area: mean where land over all_area_types time: mean;;CMIP6_Lmon;CM2;float32;110592;12;cw323a.pm;Bare Soil 
Percentage Area Coverage;area_fraction
 c3PftFrac;fld_s03i317 fld_s03i395;extract_tilefrac(var[0],[1,2,3,4,5,6,8,9,11],landfrac=var[1],lev='typec3pft');1;time pseudo_level_1 lat lon;mon;land;area: mean where land over all_area_types time: mean;;CMIP6_Lmon;CM2;float32;1880064;12;cw323a.pm;Percentage Cover by C3 Plant Functional Type;area_fraction
diff --git a/mappings/map_land_ESM1.5.csv b/mappings/map_land_ESM1.5.csv
index 23b129a..56b59dd 100644
--- a/mappings/map_land_ESM1.5.csv
+++ b/mappings/map_land_ESM1.5.csv
@@ -1,4 +1,4 @@
-#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;filename;long_name;standard_name
+#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;fpattern;long_name;standard_name
 agesno;fld_s03i832 fld_s03i317;average_tile(var[0],tilefrac=var[1]);mon;time pseudo_level_1 lat lon;mon;landIce land;area: time: mean;;CMIP6_LImon;ESM1.5;float32;1880064;12;cw323a.pm;CABLE SNOW AGE ON TILES;age_of_surface_snow
 baresoilFrac;fld_s03i317 fld_s03i395;extract_tilefrac(var[0],14,landfrac=var[1],lev='typebare');1;time lat lon;mon;land;area: mean where land over all_area_types time: mean;;CMIP6_Lmon;ESM1.5;float32;110592;12;cw323a.pm;Bare Soil Percentage Area Coverage;area_fraction
 c3PftFrac;fld_s03i317 fld_s03i395;extract_tilefrac(var[0],[1,2,3,4,5,6,8,9,11],landfrac=var[1],lev='typec3pft');1;time pseudo_level_1 lat lon;mon;land;area: mean where land over all_area_types time: mean;;CMIP6_Lmon;ESM1.5;float32;1880064;12;cw323a.pm;Percentage Cover by C3 Plant Functional Type;area_fraction
diff --git a/mappings/map_ocean_OM2.csv b/mappings/map_ocean_OM2.csv
index 224677f..073e4b4 100644
--- a/mappings/map_ocean_OM2.csv
+++ b/mappings/map_ocean_OM2.csv
@@ -1,4 +1,4 @@
-#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;filename;long_name;standard_name
+#cmor_var;input_vars;calculation;units;dimensions;frequency;realm;cell_methods;positive;cmor_table;version;vtype;size;nsteps;fpattern;long_name;standard_name
 advectsweby;temp_sweby_advec;;W m-2;time st_ocean yt_ocean xt_ocean;mon;ocean;area: time: mean;;CM2_mon;OM2;float32;21600000;36;ocean_month.nc-;cp*rho*dzt*sweby advect tendency;
 agessc;age_global;;yr;time st_ocean yt_ocean xt_ocean;mon;ocean;area: time: mean;;CMIP6_Omon;OM2;float32;21600000;708;ocean_month.nc-;Age (global);sea_water_age_since_surface_contact
 areacello;dummy;get_areacello();1;time st_ocean yt_ocean xt_ocean;mon;ocean;area: sum;;CMIP6_Ofx;OM2;float32;21600000;708;ocean_month.nc-;t-cell thickness;cell_area
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..51aac13
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,49 @@
+[build-system]
+requires = ["setuptools>=64.0.0", "setuptools-scm"]
+#build-backend = "pbr.build"
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "ACCESS-MOPPeR"
+
+authors = [
+    {name = "Paola Petrelli", email = "paola.petrelli@utas.edu.au"},
+    {name = "Sam Green", email = "sam.green@unsw.edu.au"},
+]
+description = "ACCESS Model Output Post-Processor, maps raw model output to CMIP-style defined variables and produces post-processed output using CMOR3"
+readme = "README.md"
+requires-python = ">=3.8"
+keywords = ["ACCESS", "post-processing"]
+license = {text = "Apache-2.0"}
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Environment :: Console",
+    "Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] +dynamic = ["version", "dependencies", "optional-dependencies"] + +[tool.setuptools-git-versioning] +enabled = true + +[project.scripts] +mop = "mopper.mopper:mop_catch" +mopdb = "mopdb.mopdb:mopdb_catch" + +[tool.setuptools.dynamic] +dependencies = {file = "requirements.txt"} +optional-dependencies.test = { file = ["test-requirements.txt"] } + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +mopdata = ["*.json", "*.yaml", "*.db", "*.csv", "update_db.py"] + + +# ... other project metadata fields as listed in: +# https://packaging.python.org/en/latest/guides/writing-pyproject-toml/ diff --git a/requirements.txt b/requirements.txt index 233e9da..0953f2f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,10 @@ # Add general dependencies here -# Optional dependencies e.g. [dev] are added in `setup.cfg` +# Optional dependencies e.g. [dev] are added in `test-requirements` click cmor xarray numpy pyyaml +dask +python-dateutil cftime diff --git a/setup.cfg b/setup.cfg index e2d1814..48b922f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,34 +1,3 @@ -[metadata] -name = mopper -url = https://github.com/ACCESS-Hive/ACCESS-MOPPeR -author = Paola Petrelli, Sam Green -author_email = paola.petrelli@utas.edu.au, sam.green@unsw.edu.au -summary = 'ACCESS Model Output Post-Processor, maps raw model output to CMIP-style defined variables and produce post-processed output using CMOR3' -description_file = README.md -licence = 'Apache-2.0' -keywords = 'ACCESS model' -classifier = - Development Status :: 3 - Alpha - Environment :: Console - Intended Audience :: Science/Research - License :: OSI Approved :: Apache Software License - Operating System :: POSIX :: Linux - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - -[options] -packages = find_namespace: -package_dir = - = src -include_package_data = True - -[options.packages.find] -where = src - -[options.package_data] -data = *.json, *.yaml, *.db, *.csv -mopper = update_db.py - [pbr] autodoc_tree_index_modules = True autodoc_tree_excludes = @@ -36,11 +5,6 @@ autodoc_tree_excludes = test docs/conf.py -[entry_points] -console_scripts = - mop = mopper.mopper:mop_catch - mopdb = mopdb.mopdb:mopdb_catch - [build_sphinx] source_dir = docs build_dir = docs/_build diff --git a/setup.py b/setup.py index 52f1ab9..ba5a4e2 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,6 @@ from setuptools import setup setup( - setup_requires=['pbr', 'setuptools'], - pbr=True, + setup_requires=['setuptools-scm', 'setuptools'], ) diff --git a/src/data/access.db b/src/mopdata/access.db similarity index 99% rename from src/data/access.db rename to src/mopdata/access.db index 2232869..70be17f 100644 Binary files a/src/data/access.db and b/src/mopdata/access.db differ diff --git a/src/data/access_dump.sql b/src/mopdata/access_dump.sql similarity index 99% rename from src/data/access_dump.sql rename to src/mopdata/access_dump.sql index de52ba2..8d4a090 100644 --- a/src/data/access_dump.sql +++ b/src/mopdata/access_dump.sql @@ -2279,6 +2279,7 @@ INSERT INTO cmorvar VALUES('lmask-AUS2200_fx','fx','land','land_binary_mask','%' INSERT INTO cmorvar VALUES('omldamax-CM2_mon','mon','ocean','ocean_mixed_layer_thickness_defined_by_mixing_scheme','m','area: mean time: maximum','area: areacello','Mean 
Monthly Maximum Ocean Mixed Layer Thickness Defined by Mixing Scheme','The ocean mixed layer is the upper part of the ocean, regarded as being well-mixed. The base of the mixed layer defined by the mixing scheme is a diagnostic of ocean models. ''Thickness'' means the vertical extent of a layer.','longitude latitude time','omldamax','real','','','','','','',''); INSERT INTO cmorvar VALUES('difvho-CM2_mon','mon','ocean','ocean_vertical_heat_diffusivity','m2 s-1','area: mean time: mean','area: areacello volume: volcello','Ocean Vertical Heat Diffusivity','Vertical/dianeutral diffusivity applied to prognostic temperature field.','longitude latitude olevel time','difvho','real','','','','','','',''); INSERT INTO cmorvar VALUES('rho0-CM2_mon','mon','ocean','sea_water_potential_density','kg m-3','area: mean time: mean','area: areacello volume: volcello','Potential Density referenced to 0 dbar','','longitude latitude olevel time','rho0','real','','','','','','',''); +INSERT INTO cmorvar VALUES('zg16-AUS2200_A3hr','3hrPt','atmos','geopotential_height','m','area: mean time: point','area: areacella','Geopotential Height on pressure levels','Geopotential is the sum of the specific gravitational potential energy relative to the geoid and the specific centripetal potential energy. Geopotential height is the geopotential divided by the standard acceleration due to gravity. It is numerically similar to the altitude (or geometric height) and not to the quantity with standard name height, which is relative to the surface.','longitude latitude plev16 time1','zg','real','','','','','','',''); CREATE TABLE mapping ( cmor_var TEXT, input_vars TEXT, @@ -2734,6 +2735,7 @@ INSERT INTO mapping VALUES('zfull','fld_s15i101','','m','time model_theta_level_ INSERT INTO mapping VALUES('zg','fld_s30i297','','m','time pressure lat lon','mon','atmos','area: time: mean','','CMIP6_Amon','CM2','geopotential_height','map_atmos_CM2'); INSERT INTO mapping VALUES('zg','fld_s30i297','','m','time pressure lat lon','day','atmos','area: time: mean','','CMIP6_Eday','CM2','geopotential_height','map_atmos_CM2'); INSERT INTO mapping VALUES('zg','fld_s30i297','','m','time pressure lat lon','day','atmos','area: time: mean','','CMIP6_day','CM2','geopotential_height','map_atmos_CM2'); +INSERT INTO mapping VALUES('zg16','fld_s16i202','','m','time pressure lat lon','3hrPt','atmos','area: time: point','','AUS2200_A3hr','AUS2200','geopotential_height','AUS2200'); INSERT INTO mapping VALUES('zg500','fld_s30i297','','m','time pressure lat lon','6hrPt','atmos','area: time: point','','CMIP6_6hrPlevPt','CM2','geopotential_height','map_atmos_CM2'); INSERT INTO mapping VALUES('zg500','fld_s30i297','var[0].sel(pressure=500)','m','time pressure lat lon','day','aerosol','area: time: mean','','CMIP6_AERday','CM2','geopotential_height','map_aerosol_CM2'); INSERT INTO mapping VALUES('zguvgrid','fld_s30i207','','m','time pressure lat_v lon_u','mon','atmos','area: time: mean','','CM2_mon','CM2','geopotential_height','map_atmos_CM2'); diff --git a/src/data/cmor_tables/ACDD_CV.json b/src/mopdata/cmor_tables/ACDD_CV.json similarity index 100% rename from src/data/cmor_tables/ACDD_CV.json rename to src/mopdata/cmor_tables/ACDD_CV.json diff --git a/src/data/cmor_tables/ACDD_coordinate.json b/src/mopdata/cmor_tables/ACDD_coordinate.json similarity index 100% rename from src/data/cmor_tables/ACDD_coordinate.json rename to src/mopdata/cmor_tables/ACDD_coordinate.json diff --git a/src/data/cmor_tables/ACDD_formula_terms.json 
b/src/mopdata/cmor_tables/ACDD_formula_terms.json similarity index 100% rename from src/data/cmor_tables/ACDD_formula_terms.json rename to src/mopdata/cmor_tables/ACDD_formula_terms.json diff --git a/src/data/cmor_tables/ACDD_grids.json b/src/mopdata/cmor_tables/ACDD_grids.json similarity index 100% rename from src/data/cmor_tables/ACDD_grids.json rename to src/mopdata/cmor_tables/ACDD_grids.json diff --git a/src/data/cmor_tables/AUS2200_A10min.json b/src/mopdata/cmor_tables/AUS2200_A10min.json similarity index 100% rename from src/data/cmor_tables/AUS2200_A10min.json rename to src/mopdata/cmor_tables/AUS2200_A10min.json diff --git a/src/data/cmor_tables/AUS2200_A1hr.json b/src/mopdata/cmor_tables/AUS2200_A1hr.json similarity index 100% rename from src/data/cmor_tables/AUS2200_A1hr.json rename to src/mopdata/cmor_tables/AUS2200_A1hr.json diff --git a/src/data/cmor_tables/AUS2200_A1hrPlev.json b/src/mopdata/cmor_tables/AUS2200_A1hrPlev.json similarity index 100% rename from src/data/cmor_tables/AUS2200_A1hrPlev.json rename to src/mopdata/cmor_tables/AUS2200_A1hrPlev.json diff --git a/src/data/cmor_tables/AUS2200_A3hr.json b/src/mopdata/cmor_tables/AUS2200_A3hr.json similarity index 80% rename from src/data/cmor_tables/AUS2200_A3hr.json rename to src/mopdata/cmor_tables/AUS2200_A3hr.json index d9108a9..99e6833 100644 --- a/src/data/cmor_tables/AUS2200_A3hr.json +++ b/src/mopdata/cmor_tables/AUS2200_A3hr.json @@ -103,6 +103,24 @@ "valid_max": "", "ok_min_mean_abs": "", "ok_max_mean_abs": "" + }, + "zg16": { + "frequency": "3hrPt", + "modeling_realm": "atmos", + "standard_name": "geopotential_height", + "units": "m", + "cell_methods": "area: mean time: point", + "cell_measures": "area: areacella", + "long_name": "Geopotential Height on pressure levels", + "comment": "Geopotential is the sum of the specific gravitational potential energy relative to the geoid and the specific centripetal potential energy. Geopotential height is the geopotential divided by the standard acceleration due to gravity. 
It is numerically similar to the altitude (or geometric height) and not to the quantity with standard name height, which is relative to the surface.", + "dimensions": "longitude latitude plev16 time1", + "out_name": "zg", + "type": "real", + "positive": "", + "valid_min": "", + "valid_max": "", + "ok_min_mean_abs": "", + "ok_max_mean_abs": "" } } } diff --git a/src/data/cmor_tables/AUS2200_A6hr.json b/src/mopdata/cmor_tables/AUS2200_A6hr.json similarity index 100% rename from src/data/cmor_tables/AUS2200_A6hr.json rename to src/mopdata/cmor_tables/AUS2200_A6hr.json diff --git a/src/data/cmor_tables/AUS2200_Aday.json b/src/mopdata/cmor_tables/AUS2200_Aday.json similarity index 100% rename from src/data/cmor_tables/AUS2200_Aday.json rename to src/mopdata/cmor_tables/AUS2200_Aday.json diff --git a/src/data/cmor_tables/AUS2200_fx.json b/src/mopdata/cmor_tables/AUS2200_fx.json similarity index 100% rename from src/data/cmor_tables/AUS2200_fx.json rename to src/mopdata/cmor_tables/AUS2200_fx.json diff --git a/src/data/cmor_tables/CM2_3hr.json b/src/mopdata/cmor_tables/CM2_3hr.json similarity index 99% rename from src/data/cmor_tables/CM2_3hr.json rename to src/mopdata/cmor_tables/CM2_3hr.json index dd13bab..34584c2 100644 --- a/src/data/cmor_tables/CM2_3hr.json +++ b/src/mopdata/cmor_tables/CM2_3hr.json @@ -104,10 +104,7 @@ "valid_max": "", "ok_min_mean_abs": "", "ok_max_mean_abs": "" - }, - - - + } } } diff --git a/src/data/cmor_tables/CM2_6hr.json b/src/mopdata/cmor_tables/CM2_6hr.json similarity index 100% rename from src/data/cmor_tables/CM2_6hr.json rename to src/mopdata/cmor_tables/CM2_6hr.json diff --git a/src/data/cmor_tables/CM2_day.json b/src/mopdata/cmor_tables/CM2_day.json similarity index 100% rename from src/data/cmor_tables/CM2_day.json rename to src/mopdata/cmor_tables/CM2_day.json diff --git a/src/data/cmor_tables/CM2_mon.json b/src/mopdata/cmor_tables/CM2_mon.json similarity index 100% rename from src/data/cmor_tables/CM2_mon.json rename to src/mopdata/cmor_tables/CM2_mon.json diff --git a/src/data/cmor_tables/CMIP6_3hr.json b/src/mopdata/cmor_tables/CMIP6_3hr.json similarity index 100% rename from src/data/cmor_tables/CMIP6_3hr.json rename to src/mopdata/cmor_tables/CMIP6_3hr.json diff --git a/src/data/cmor_tables/CMIP6_6hrLev.json b/src/mopdata/cmor_tables/CMIP6_6hrLev.json similarity index 100% rename from src/data/cmor_tables/CMIP6_6hrLev.json rename to src/mopdata/cmor_tables/CMIP6_6hrLev.json diff --git a/src/data/cmor_tables/CMIP6_6hrPlev.json b/src/mopdata/cmor_tables/CMIP6_6hrPlev.json similarity index 100% rename from src/data/cmor_tables/CMIP6_6hrPlev.json rename to src/mopdata/cmor_tables/CMIP6_6hrPlev.json diff --git a/src/data/cmor_tables/CMIP6_6hrPlevPt.json b/src/mopdata/cmor_tables/CMIP6_6hrPlevPt.json similarity index 100% rename from src/data/cmor_tables/CMIP6_6hrPlevPt.json rename to src/mopdata/cmor_tables/CMIP6_6hrPlevPt.json diff --git a/src/data/cmor_tables/CMIP6_AERday.json b/src/mopdata/cmor_tables/CMIP6_AERday.json similarity index 100% rename from src/data/cmor_tables/CMIP6_AERday.json rename to src/mopdata/cmor_tables/CMIP6_AERday.json diff --git a/src/data/cmor_tables/CMIP6_AERhr.json b/src/mopdata/cmor_tables/CMIP6_AERhr.json similarity index 100% rename from src/data/cmor_tables/CMIP6_AERhr.json rename to src/mopdata/cmor_tables/CMIP6_AERhr.json diff --git a/src/data/cmor_tables/CMIP6_AERmon.json b/src/mopdata/cmor_tables/CMIP6_AERmon.json similarity index 100% rename from src/data/cmor_tables/CMIP6_AERmon.json rename to 
src/mopdata/cmor_tables/CMIP6_AERmon.json diff --git a/src/data/cmor_tables/CMIP6_AERmonZ.json b/src/mopdata/cmor_tables/CMIP6_AERmonZ.json similarity index 100% rename from src/data/cmor_tables/CMIP6_AERmonZ.json rename to src/mopdata/cmor_tables/CMIP6_AERmonZ.json diff --git a/src/data/cmor_tables/CMIP6_Amon.json b/src/mopdata/cmor_tables/CMIP6_Amon.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Amon.json rename to src/mopdata/cmor_tables/CMIP6_Amon.json diff --git a/src/data/cmor_tables/CMIP6_CF3hr.json b/src/mopdata/cmor_tables/CMIP6_CF3hr.json similarity index 100% rename from src/data/cmor_tables/CMIP6_CF3hr.json rename to src/mopdata/cmor_tables/CMIP6_CF3hr.json diff --git a/src/data/cmor_tables/CMIP6_CFday.json b/src/mopdata/cmor_tables/CMIP6_CFday.json similarity index 100% rename from src/data/cmor_tables/CMIP6_CFday.json rename to src/mopdata/cmor_tables/CMIP6_CFday.json diff --git a/src/data/cmor_tables/CMIP6_CFmon.json b/src/mopdata/cmor_tables/CMIP6_CFmon.json similarity index 100% rename from src/data/cmor_tables/CMIP6_CFmon.json rename to src/mopdata/cmor_tables/CMIP6_CFmon.json diff --git a/src/data/cmor_tables/CMIP6_CFsubhr.json b/src/mopdata/cmor_tables/CMIP6_CFsubhr.json similarity index 100% rename from src/data/cmor_tables/CMIP6_CFsubhr.json rename to src/mopdata/cmor_tables/CMIP6_CFsubhr.json diff --git a/src/data/cmor_tables/CMIP6_CV.json b/src/mopdata/cmor_tables/CMIP6_CV.json similarity index 100% rename from src/data/cmor_tables/CMIP6_CV.json rename to src/mopdata/cmor_tables/CMIP6_CV.json diff --git a/src/data/cmor_tables/CMIP6_E1hr.json b/src/mopdata/cmor_tables/CMIP6_E1hr.json similarity index 100% rename from src/data/cmor_tables/CMIP6_E1hr.json rename to src/mopdata/cmor_tables/CMIP6_E1hr.json diff --git a/src/data/cmor_tables/CMIP6_E1hrClimMon.json b/src/mopdata/cmor_tables/CMIP6_E1hrClimMon.json similarity index 100% rename from src/data/cmor_tables/CMIP6_E1hrClimMon.json rename to src/mopdata/cmor_tables/CMIP6_E1hrClimMon.json diff --git a/src/data/cmor_tables/CMIP6_E3hr.json b/src/mopdata/cmor_tables/CMIP6_E3hr.json similarity index 100% rename from src/data/cmor_tables/CMIP6_E3hr.json rename to src/mopdata/cmor_tables/CMIP6_E3hr.json diff --git a/src/data/cmor_tables/CMIP6_E3hrPt.json b/src/mopdata/cmor_tables/CMIP6_E3hrPt.json similarity index 100% rename from src/data/cmor_tables/CMIP6_E3hrPt.json rename to src/mopdata/cmor_tables/CMIP6_E3hrPt.json diff --git a/src/data/cmor_tables/CMIP6_E6hrZ.json b/src/mopdata/cmor_tables/CMIP6_E6hrZ.json similarity index 100% rename from src/data/cmor_tables/CMIP6_E6hrZ.json rename to src/mopdata/cmor_tables/CMIP6_E6hrZ.json diff --git a/src/data/cmor_tables/CMIP6_Eday.json b/src/mopdata/cmor_tables/CMIP6_Eday.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Eday.json rename to src/mopdata/cmor_tables/CMIP6_Eday.json diff --git a/src/data/cmor_tables/CMIP6_EdayZ.json b/src/mopdata/cmor_tables/CMIP6_EdayZ.json similarity index 100% rename from src/data/cmor_tables/CMIP6_EdayZ.json rename to src/mopdata/cmor_tables/CMIP6_EdayZ.json diff --git a/src/data/cmor_tables/CMIP6_Efx.json b/src/mopdata/cmor_tables/CMIP6_Efx.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Efx.json rename to src/mopdata/cmor_tables/CMIP6_Efx.json diff --git a/src/data/cmor_tables/CMIP6_Emon.json b/src/mopdata/cmor_tables/CMIP6_Emon.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Emon.json rename to src/mopdata/cmor_tables/CMIP6_Emon.json diff --git 
a/src/data/cmor_tables/CMIP6_EmonZ.json b/src/mopdata/cmor_tables/CMIP6_EmonZ.json similarity index 100% rename from src/data/cmor_tables/CMIP6_EmonZ.json rename to src/mopdata/cmor_tables/CMIP6_EmonZ.json diff --git a/src/data/cmor_tables/CMIP6_Esubhr.json b/src/mopdata/cmor_tables/CMIP6_Esubhr.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Esubhr.json rename to src/mopdata/cmor_tables/CMIP6_Esubhr.json diff --git a/src/data/cmor_tables/CMIP6_Eyr.json b/src/mopdata/cmor_tables/CMIP6_Eyr.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Eyr.json rename to src/mopdata/cmor_tables/CMIP6_Eyr.json diff --git a/src/data/cmor_tables/CMIP6_IfxAnt.json b/src/mopdata/cmor_tables/CMIP6_IfxAnt.json similarity index 100% rename from src/data/cmor_tables/CMIP6_IfxAnt.json rename to src/mopdata/cmor_tables/CMIP6_IfxAnt.json diff --git a/src/data/cmor_tables/CMIP6_IfxGre.json b/src/mopdata/cmor_tables/CMIP6_IfxGre.json similarity index 100% rename from src/data/cmor_tables/CMIP6_IfxGre.json rename to src/mopdata/cmor_tables/CMIP6_IfxGre.json diff --git a/src/data/cmor_tables/CMIP6_ImonAnt.json b/src/mopdata/cmor_tables/CMIP6_ImonAnt.json similarity index 100% rename from src/data/cmor_tables/CMIP6_ImonAnt.json rename to src/mopdata/cmor_tables/CMIP6_ImonAnt.json diff --git a/src/data/cmor_tables/CMIP6_ImonGre.json b/src/mopdata/cmor_tables/CMIP6_ImonGre.json similarity index 100% rename from src/data/cmor_tables/CMIP6_ImonGre.json rename to src/mopdata/cmor_tables/CMIP6_ImonGre.json diff --git a/src/data/cmor_tables/CMIP6_IyrAnt.json b/src/mopdata/cmor_tables/CMIP6_IyrAnt.json similarity index 100% rename from src/data/cmor_tables/CMIP6_IyrAnt.json rename to src/mopdata/cmor_tables/CMIP6_IyrAnt.json diff --git a/src/data/cmor_tables/CMIP6_IyrGre.json b/src/mopdata/cmor_tables/CMIP6_IyrGre.json similarity index 100% rename from src/data/cmor_tables/CMIP6_IyrGre.json rename to src/mopdata/cmor_tables/CMIP6_IyrGre.json diff --git a/src/data/cmor_tables/CMIP6_LImon.json b/src/mopdata/cmor_tables/CMIP6_LImon.json similarity index 100% rename from src/data/cmor_tables/CMIP6_LImon.json rename to src/mopdata/cmor_tables/CMIP6_LImon.json diff --git a/src/data/cmor_tables/CMIP6_Lmon.json b/src/mopdata/cmor_tables/CMIP6_Lmon.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Lmon.json rename to src/mopdata/cmor_tables/CMIP6_Lmon.json diff --git a/src/data/cmor_tables/CMIP6_Oclim.json b/src/mopdata/cmor_tables/CMIP6_Oclim.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Oclim.json rename to src/mopdata/cmor_tables/CMIP6_Oclim.json diff --git a/src/data/cmor_tables/CMIP6_Oday.json b/src/mopdata/cmor_tables/CMIP6_Oday.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Oday.json rename to src/mopdata/cmor_tables/CMIP6_Oday.json diff --git a/src/data/cmor_tables/CMIP6_Odec.json b/src/mopdata/cmor_tables/CMIP6_Odec.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Odec.json rename to src/mopdata/cmor_tables/CMIP6_Odec.json diff --git a/src/data/cmor_tables/CMIP6_Ofx.json b/src/mopdata/cmor_tables/CMIP6_Ofx.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Ofx.json rename to src/mopdata/cmor_tables/CMIP6_Ofx.json diff --git a/src/data/cmor_tables/CMIP6_Omon.json b/src/mopdata/cmor_tables/CMIP6_Omon.json similarity index 100% rename from src/data/cmor_tables/CMIP6_Omon.json rename to src/mopdata/cmor_tables/CMIP6_Omon.json diff --git a/src/data/cmor_tables/CMIP6_Oyr.json b/src/mopdata/cmor_tables/CMIP6_Oyr.json 
similarity index 100% rename from src/data/cmor_tables/CMIP6_Oyr.json rename to src/mopdata/cmor_tables/CMIP6_Oyr.json diff --git a/src/data/cmor_tables/CMIP6_SIday.json b/src/mopdata/cmor_tables/CMIP6_SIday.json similarity index 100% rename from src/data/cmor_tables/CMIP6_SIday.json rename to src/mopdata/cmor_tables/CMIP6_SIday.json diff --git a/src/data/cmor_tables/CMIP6_SImon.json b/src/mopdata/cmor_tables/CMIP6_SImon.json similarity index 100% rename from src/data/cmor_tables/CMIP6_SImon.json rename to src/mopdata/cmor_tables/CMIP6_SImon.json diff --git a/src/data/cmor_tables/CMIP6_coordinate.json b/src/mopdata/cmor_tables/CMIP6_coordinate.json similarity index 100% rename from src/data/cmor_tables/CMIP6_coordinate.json rename to src/mopdata/cmor_tables/CMIP6_coordinate.json diff --git a/src/data/cmor_tables/CMIP6_day.json b/src/mopdata/cmor_tables/CMIP6_day.json similarity index 100% rename from src/data/cmor_tables/CMIP6_day.json rename to src/mopdata/cmor_tables/CMIP6_day.json diff --git a/src/data/cmor_tables/CMIP6_formula_terms.json b/src/mopdata/cmor_tables/CMIP6_formula_terms.json similarity index 100% rename from src/data/cmor_tables/CMIP6_formula_terms.json rename to src/mopdata/cmor_tables/CMIP6_formula_terms.json diff --git a/src/data/cmor_tables/CMIP6_fx.json b/src/mopdata/cmor_tables/CMIP6_fx.json similarity index 100% rename from src/data/cmor_tables/CMIP6_fx.json rename to src/mopdata/cmor_tables/CMIP6_fx.json diff --git a/src/data/cmor_tables/CMIP6_grids.json b/src/mopdata/cmor_tables/CMIP6_grids.json similarity index 100% rename from src/data/cmor_tables/CMIP6_grids.json rename to src/mopdata/cmor_tables/CMIP6_grids.json diff --git a/src/data/dreq/cmvme_all_piControl_3_3.csv b/src/mopdata/dreq/cmvme_all_piControl_3_3.csv similarity index 100% rename from src/data/dreq/cmvme_all_piControl_3_3.csv rename to src/mopdata/dreq/cmvme_all_piControl_3_3.csv diff --git a/src/mopdata/intake_cat_template.json b/src/mopdata/intake_cat_template.json new file mode 100644 index 0000000..f3395ba --- /dev/null +++ b/src/mopdata/intake_cat_template.json @@ -0,0 +1,60 @@ +{ + "id": "", + "title": " model output.", + "description": " raw model output. 
\\nProject: \\nMaintained By: \\nContact: \\nDocumentation:\\nLicense: https://creativecommons.org/licenses/by/4.0/\\nCitation:\\nReferences:\\n", + "assets": { + "column_name": "path", + "format": "netcdf" + }, + "aggregation_control": { + "variable_column_name": "variable", + "groupby_attrs": [ + "realm", + "frequency", + "mapvar" + ], + "aggregations": [ + { + "type": "join_existing", + "attribute_name": "date", + "options": { + "dim": "time" + } + } + ] + }, + "esmcat_version": "0.1.0", + "catalog_file": "catalogue.csv.xz", + "attributes": [ + { + "column_name": "experiment" + }, + { + "column_name": "realm" + }, + { + "column_name": "frequency" + }, + { + "column_name": "variable" + }, + { + "column_name": "mapvar" + }, + { + "column_name": "standard_name" + }, + { + "column_name": "date" + }, + { + "column_name": "units" + }, + { + "column_name": "calculation" + }, + { + "column_name": "cell_methods" + } + ] +} diff --git a/src/mopdata/intake_cat_template.yaml b/src/mopdata/intake_cat_template.yaml new file mode 100644 index 0000000..0f94eeb --- /dev/null +++ b/src/mopdata/intake_cat_template.yaml @@ -0,0 +1,18 @@ +metadata: + version: 1 +sources: + : + description: "Intake catalogue to load ACCESS model output" + Project: "" + Maintained By: "" + Contact: "" + Documentation: "" + License: "https://creativecommons.org/licenses/by/4.0/" + Citation: "" + References: "" + driver: intake_esm.core.esm_datastore + args: + columns_with_iterables: + - variable + read_csv_kwargs: {"dtype": {"date": str}} + obj: "{{CATALOG_DIR}}/intake_.json" diff --git a/src/mopdata/interval2frq.yaml b/src/mopdata/interval2frq.yaml new file mode 100644 index 0000000..37189ec --- /dev/null +++ b/src/mopdata/interval2frq.yaml @@ -0,0 +1,25 @@ +# This file contains the dictionary needed to associate a time step interval +# to a frequency.
There can be more than one depending on the units used by +# the time axis +days: + dec: 3652.0 + yr: 365.0 + mon: 30.0 + day: 1.0 + 6hr: 0.25 + 3hr: 0.125 + 1hr: 0.041667 + 30min: 0.020833 + 10min: 0.006944 + +hours: + dec: 87648.0 + yr: 8760.0 + mon: 720.0 + day: 24.0 + 6hr: 6.0 + 3hr: 3.0 + 1hr: 1.0 + 30min: 0.5 + 10min: 0.167 + diff --git a/src/data/land_tiles.yaml b/src/mopdata/land_tiles.yaml similarity index 100% rename from src/data/land_tiles.yaml rename to src/mopdata/land_tiles.yaml diff --git a/src/data/landtype.yaml b/src/mopdata/landtype.yaml similarity index 100% rename from src/data/landtype.yaml rename to src/mopdata/landtype.yaml diff --git a/src/data/latlon_vertices.yaml b/src/mopdata/latlon_vertices.yaml similarity index 100% rename from src/data/latlon_vertices.yaml rename to src/mopdata/latlon_vertices.yaml diff --git a/src/data/model_levels.yaml b/src/mopdata/model_levels.yaml similarity index 100% rename from src/data/model_levels.yaml rename to src/mopdata/model_levels.yaml diff --git a/src/data/notes.yaml b/src/mopdata/notes.yaml similarity index 100% rename from src/data/notes.yaml rename to src/mopdata/notes.yaml diff --git a/src/data/transport_lines.yaml b/src/mopdata/transport_lines.yaml similarity index 100% rename from src/data/transport_lines.yaml rename to src/mopdata/transport_lines.yaml diff --git a/src/mopper/update_db.py b/src/mopdata/update_db.py.txt similarity index 94% rename from src/mopper/update_db.py rename to src/mopdata/update_db.py.txt index ddcd6a5..c1cb17d 100644 --- a/src/mopper/update_db.py +++ b/src/mopdata/update_db.py.txt @@ -37,7 +37,8 @@ def update_map(conn, varid, ctable): """Read mappings for variable from map file and update them in filelist """ - keys = ['frequency','realm','timeshot','calculation', 'positive', 'resample'] + keys = ['frequency','realm','timeshot','calculation', + 'positive', 'resample'] keys2 = {'vin': 'input_vars', 'in_units': 'units'} fname = f"maps/{ctable}.json" with open(fname, 'r') as f: @@ -48,6 +49,12 @@ def update_map(conn, varid, ctable): args = {k: row[k] for k in keys} for k,v in keys2.items(): args[k] = row[v] + if 'datadir' in row.keys(): + paths = row['file_structure'].split() + infile = '' + for x in paths: + infile += f"{row['datadir']}/{x} " + args['infile'] = infile cur = conn.cursor() sql = f"UPDATE filelist SET" for k,v in args.items(): diff --git a/src/mopdb/__init__.py b/src/mopdb/__init__.py index 2a413df..e69de29 100644 --- a/src/mopdb/__init__.py +++ b/src/mopdb/__init__.py @@ -1 +0,0 @@ -from mopdb import * diff --git a/src/mopdb/mopdb.py b/src/mopdb/mopdb.py index fbc5df8..7e50bed 100644 --- a/src/mopdb/mopdb.py +++ b/src/mopdb/mopdb.py @@ -19,16 +19,18 @@ # last updated 08/04/2024 import click -import sqlite3 import logging import sys -import csv import json from importlib.resources import files as import_files +from pathlib import Path -from mopdb.mopdb_utils import * - +from mopdb.mopdb_utils import (mapping_sql, cmorvar_sql, read_map, + read_map_app4, create_table, write_cmor_table, update_db) +from mopdb.utils import (config_log, db_connect, query, delete_record) +from mopdb.mopdb_map import (write_varlist, write_map_template, + write_catalogue, map_variables, load_vars, get_map_obj) def mopdb_catch(): """ @@ -43,18 +45,56 @@ def mopdb_catch(): sys.exit(1) +def require_date(ctx, param, value): + """Changes match option in template command from optional to + required if fpath is a directory. 
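+ For example (hypothetical values): passing '-f /exp1/atmos/' (a directory) makes '--match' required, while '-f varlist_exp1.csv' (a file) leaves it optional.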
+ """ + names = [] + for i in range(len(ctx.command.params)): + names.append(ctx.command.params[i].name) + idx = names.index('match') + if Path(value).is_dir() and 'filelist' not in names: + ctx.command.params[idx].required = True + return value + + def db_args(f): - """Define database click arguments + """Define database click options """ constraints = [ - click.option('--fname', '-f', type=str, required=True, - help='Input file: used to update db table (mapping/cmor),' + - 'or to pass output model variables (list)'), + click.option('--fname', '-f', type=click.Path(exists=True), + required=True, + help='Input file: used to update db table (mapping/cmor)'), click.option('--dbname', type=str, required=False, default='default', help='Database relative path by default is package access.db'), - click.option('--alias', '-a', type=str, required=False, default=None, - help='Table alias to use when updating cmor var table or creating map template with list' + - ' to keep track of variable definition origin. If none passed uses input filename')] + click.option('--alias', '-a', type=str, required=False, default='', + help='Table alias to track definitions origin in cmorvar table.')] + for c in reversed(constraints): + f = c(f) + return f + + +def map_args(f): + """Define mapping click options for varlist, template, intake + commands + """ + constraints = [ + click.option('--fpath', '-f', type=click.Path(exists=True), + required=True, callback=require_date, + help=("""Model output files path. For 'template' + command can also be file generated by varlist step""")), + click.option('--match', '-m', type=str, required=False, + help=("""String to match output files. Most often + the timestamp from one of the output files""")), + click.option('--version', '-v', required=True, + type=click.Choice(['ESM1.5', 'CM2', 'AUS2200', 'OM2']), + show_default=True, + help="ACCESS version currently only CM2, ESM1.5, AUS2200, OM2"), + click.option('--dbname', type=str, required=False, default='default', + help="Database relative path by default is package access.db"), + click.option('--alias', '-a', type=str, required=False, default='', + help="""Alias to use to keep track of variable definition origin. 
+ If none passed uses input filename""")] for c in reversed(constraints): f = c(f) return f @@ -70,7 +110,7 @@ def mopdb(ctx, debug): ctx.obj={} # set up a default value for flow if none selected for logging ctx.obj['debug'] = debug - ctx.obj['log'] = config_log(debug) + config_log(debug, logname='mopdb_log') @mopdb.command(name='check') @@ -89,14 +129,14 @@ def check_cmor(ctx, dbname): dbname : str Database relative path (default is data/access.db) """ - db_log = ctx.obj['log'] + mopdb_log = logging.getLogger('mopdb_log') # connect to db, this will create one if not existing if dbname == 'default': - dbname = import_files('data').joinpath('access.db') - conn = db_connect(dbname, db_log) + dbname = import_files('mopdata').joinpath('access.db') + conn = db_connect(dbname, logname='mopdb_log') # get list of variables already in db sql = 'SELECT name, out_name FROM cmorvar' - results = query(conn, sql, first=False) + results = query(conn, sql, first=False, logname='mopdb_log') # first set is the actual cmip variable name # second set is the name used in tables to distinguish different dims/freq # original maps files use the second style @@ -105,12 +145,12 @@ cmor_vars.update(cmor_vars2) sql = 'SELECT cmor_var FROM mapping' - results = query(conn, sql, first=False) + results = query(conn, sql, first=False, logname='mopdb_log') map_vars = [x[0] for x in results] missing = set(map_vars) - set(cmor_vars) - db_log.info("Variables not yet defined in cmorvar table:") + mopdb_log.info("Variables not yet defined in cmorvar table:") for v in missing: - db_log.info(f"{v}") + mopdb_log.info(f"{v}") conn.close() return @@ -118,8 +158,10 @@ @mopdb.command(name='table') @db_args @click.option('--label', '-l', required=False, default='CMIP6', - type=click.Choice(['CMIP6', 'AUS2200', 'CM2']), show_default=True, - help='Label indicating origin of CMOR variable definitions. Currently only CMIP6, AUS2200 and CM2') + type=click.Choice(['CMIP6', 'AUS2200', 'CM2', 'OM2']), + show_default=True, + help='''Label indicating origin of CMOR variable definitions. + Currently only CMIP6, AUS2200, CM2 and OM2''') @click.pass_context def cmor_table(ctx, dbname, fname, alias, label): """Create CMIP style table containing new variable definitions @@ -136,35 +178,35 @@ fname : str Mapping file??? alias : str - not used here + ??? it is used, so what happens when not passed?
label : str Label indicating preferred cmor variable definitions """ - db_log = ctx.obj['log'] + mopdb_log = logging.getLogger('mopdb_log') # connect to db, this will create one if not existing if dbname == 'default': - dbname = import_files('data').joinpath('access.db') - conn = db_connect(dbname, db_log) + dbname = import_files('mopdata').joinpath('access.db') + conn = db_connect(dbname, logname='mopdb_log') # get list of variables already in db sql = "SELECT out_name, frequency, modeling_realm FROM cmorvar" - results = query(conn, sql, first=False) + results = query(conn, sql, first=False, logname='mopdb_log') # cmor_vars is the actual cmip variable name # this sometimes differs from name used in tables that can distinguish different dims/freq cmor_vars = set(x[0] for x in results) # read variable list from map_ file - vlist = read_map(fname, alias, db_log) + vlist = read_map(fname, alias) # extract cmor_var,units,dimensions,frequency,realm,cell_methods var_list = [] for v in vlist[1:]: - vid = (v[0], v[5], v[6]) + #vid = (v[0], v[5], v[6]) # This was adding variables to the table just if they didn't exist in other tables if v[0][:4] != 'fld_': if v[0] not in cmor_vars: - db_log.warning(f"Variable {v[0]} not defined in cmorvar table") + mopdb_log.warning(f"Variable {v[0]} not defined in cmorvar table") else: sql = f"SELECT * FROM cmorvar WHERE out_name='{v[0]}'" - records = query(conn, sql, first=False) + records = query(conn, sql, first=False, logname='mopdb_log') record = records[0] if len(records) > 1: for r in records: @@ -178,13 +220,13 @@ definition[2] = v[6] # if units are different print warning! if v[3] != record[4]: - db_log.warning(f"Variable {v[0]} units orig/table are different: {v[3]}/{record[4]}") + mopdb_log.warning(f"Variable {v[0]} units orig/table are different: {v[3]}/{record[4]}") if v[7] != '' and v[7] != record[5]: - db_log.warning(f"Variable {v[0]} cell_methods orig/table are different: {v[7]}/{record[5]}") + mopdb_log.warning(f"Variable {v[0]} cell_methods orig/table are different: {v[7]}/{record[5]}") if len(v[4].split()) != len(record[9].split()): - db_log.warning(f"Variable {v[0]} number of dims orig/table are different: {v[4]}/{record[9]}") + mopdb_log.warning(f"Variable {v[0]} number of dims orig/table are different: {v[4]}/{record[9]}") var_list.append(definition) - write_cmor_table(var_list, alias, db_log) + write_cmor_table(var_list, alias) conn.close() return @@ -206,33 +248,33 @@ def update_cmor(ctx, dbname, fname, alias): fname : str Name of json input file with records to add alias : str - Indicates origin of records to add, if None json filename - base is used instead + Indicates origin of records to add, if '' (default) json + filename base is used instead Returns ------- """ - db_log = ctx.obj['log'] - if alias is None: + mopdb_log = logging.getLogger('mopdb_log') + if alias == '': alias = fname.split("/")[-1] alias = alias.replace('.json', '') - db_log.info(f"Adding {alias} to variable name to track origin") + mopdb_log.info(f"Adding {alias} to variable name to track origin") # connect to db, this will create one if not existing - dbcentral = import_files('data').joinpath('access.db') + dbcentral = import_files('mopdata').joinpath('access.db') if dbname in [dbcentral, 'default']: - db_log.error("The package database cannot be updated") + mopdb_log.error("The package database cannot be updated") sys.exit() - conn = db_connect(dbname, db_log) + conn = db_connect(dbname, logname='mopdb_log') + # 
create table if not existing table_sql = cmorvar_sql() - create_table(conn, table_sql, db_log) + create_table(conn, table_sql, logname='mopdb_log') # get list of variables already in db in debug mode if ctx.obj['debug']: sql = 'SELECT name FROM cmorvar' - results = query(conn, sql, first=False) + results = query(conn, sql, first=False, logname='mopdb_log') existing_vars = [x[0] for x in results] - db_log.debug(f"Variables already in db: {existing_vars}") + mopdb_log.debug(f"Variables already in db: {existing_vars}") # read list of vars from file with open(fname, 'r') as fj: @@ -247,81 +289,143 @@ def update_cmor(ctx, dbname, fname, alias): if 'flag_values' not in row.keys(): values = values[:-2] + ['',''] + values[-2:] vars_list.append(tuple([name] + values)) - db_log.debug(f"Variables list: {vars_list}") + mopdb_log.debug(f"Variables list: {vars_list}") # check that all tuples have len == 19 for r in vars_list: if len(r) != 19: - db_log.error(r) + mopdb_log.error(r) sys.exit() # insert new vars and update existing ones - update_db(conn, 'cmorvar', vars_list, db_log) + update_db(conn, 'cmorvar', vars_list) + conn.close() return @mopdb.command(name='template') -@db_args -@click.option('--version', '-v', required=True, - type=click.Choice(['ESM1.5', 'CM2', 'AUS2200', 'OM2']), show_default=True, - help='ACCESS version currently only CM2, ESM1.5, AUS2200, OM2') +@map_args @click.pass_context -def map_template(ctx, dbname, fname, alias, version): +def map_template(ctx, fpath, match, dbname, version, alias): """Writes a template of mapping file needed to run setup. First opens database and check if variables match any in mapping table. If not tries to partially match them. + It can get as input the directory containing the output in + which case it will first call write_varlist() + or the file output of the same if already available. 
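+ Example invocation (hypothetical paths and alias): + 'mopdb template -f /scratch/exp1/history/ -m 095101 -v CM2 -a exp1'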
+ Parameters ---------- ctx : obj Click context object + fpath : str + Path of csv input file with output variables to map or + of directory containing output files to scan + match : str + Date or other string to match to individuate one file per type dbname : str Database relative path (default is data/access.db) - fname : str - Name of csv input file with output variables to map + version : str + Version of ACCESS model used to generate variables alias : str - Indicates origin of records to add, if None csv filename + Indicates origin of records to add, if '' csv filename base is used instead + + Returns + ------- + """ + mopdb_log = logging.getLogger('mopdb_log') + # connect to db, this will create one if not existing + if dbname == 'default': + dbname = import_files('mopdata').joinpath('access.db') + conn = db_connect(dbname, logname='mopdb_log') + # work out if fpath is varlist or path to output + fpath = Path(fpath) + if fpath.is_file(): + mopdb_log.debug(f"{fpath} is file") + map_file, vobjs, fobjs = load_vars(fpath) + fname = fpath.name + mopdb_log.debug(f"Imported {len(vobjs)} objects from file {fpath}") + mopdb_log.debug(f"File is mapping: {map_file}") + else: + mopdb_log.debug(f"Calling write_varlist() from template: {fpath}") + fname, vobjs, fobjs = write_varlist(conn, fpath, match, version, alias) + if alias == '': + alias = fname.split(".")[0] + parsed = map_variables(conn, vobjs, version) + # potential vars have always duplicates: 1 for each input_var + write_map_template(conn, parsed, alias) + conn.close() + return + + +@mopdb.command(name='intake') +@map_args +@click.option('--filelist','-fl', type=click.Path(exists=True), + required=False, default=None, + help='Map or varlist csv file relative path') +@click.pass_context +def write_intake(ctx, fpath, match, filelist, dbname, version, alias): + """Writes an intake-esm catalogue. + + It can get as input the directory containing the output in + which case it will first call write_varlist() (varlist command) + or the file output of the same if already available. + + Parameters + ---------- + ctx : obj + Click context object + fpath : str + Path of directory containing output files to scan + match : str + Date or other string to match to individuate one file per type + filelist : str + Map or varlist csv file path, optional (default is None) + dbname : str + Database relative path (default is data/access.db) version : str Version of ACCESS model used to generate variables + alias : str + Indicates origin of records to add, if '' csv filename + base is used instead Returns ------- """ - db_log = ctx.obj['log'] - if alias is None: - alias = fname.split(".")[0] + mopdb_log = logging.getLogger('mopdb_log') # connect to db, check first if db exists or exit if dbname == 'default': - dbname = import_files('data').joinpath('access.db') - conn = db_connect(dbname, db_log) - # read list of vars from file - with open(fname, 'r') as csvfile: - reader = csv.DictReader(csvfile, delimiter=';') - rows = list(reader) + dbname = import_files('mopdata').joinpath('access.db') + conn = db_connect(dbname, logname='mopdb_log') + # work out if fpath is varlist or path to output + fpath = Path(fpath) + if fpath.is_file(): + mopdb_log.error(f""" {fpath} + should be absolute or relative path to model output. 
+ To pass a varlist or map file use --filelist/-fl""") + elif filelist is None: + mopdb_log.debug(f"Calling write_varlist() from intake: {fpath}") + fname, vobjs, fobjs = write_varlist(conn, fpath, match, version, alias) + map_file = False + else: + flist = Path(filelist) + fname = flist.name + map_file, vobjs, fobjs = load_vars(flist, indir=fpath) + if alias == '': + alias = fname.split(".")[0] # return lists of fully/partially matching variables and stash_vars # these are input_vars for calculation defined in already in mapping db - full, no_ver, no_frq, stdn, no_match, stash_vars = parse_vars(conn, - rows, version, db_log) - - # remove duplicates from partially matched variables - no_ver = remove_duplicate(no_ver, db_log) - no_frq = remove_duplicate(no_frq, db_log, strict=False) - no_match = remove_duplicate(no_match, db_log, strict=False) - - # check if more derived variables can be added based on all - # input_vars being available - pot_full, pot_part, pot_varnames = potential_vars(conn, rows, - stash_vars, version, db_log) + if map_file is False: + parsed = map_variables(conn, vobjs, version) + vobjs = get_map_obj(parsed) + write_map_template(conn, parsed, alias) # potential vars have always duplicates: 1 for each input_var - pot_full = remove_duplicate(pot_full, db_log, strict=False) - pot_part = remove_duplicate(pot_part, db_log, extra=pot_full, - strict=False) - db_log.info(f"Derived variables: {pot_varnames}") - write_map_template(conn, full, no_ver, no_frq, stdn, - no_match, pot_full, pot_part, alias, db_log) + cat_name, fcsv = write_catalogue(conn, vobjs, fobjs, alias) + mopdb_log.info(f"""Intake-esm and intake catalogues written to + {cat_name} and {cat_name.replace('json','yaml')}. File list saved to {fcsv}""") conn.close() - - return + return None @mopdb.command(name='map') @@ -341,83 +445,78 @@ def update_map(ctx, dbname, fname, alias): fname : str Name of csv input file with mapping records alias : str - Indicates origin of records to add, if None csv filename + Indicates origin of records to add, if '' csv filename base is used instead Returns ------- """ - db_log = ctx.obj['log'] + mopdb_log = logging.getLogger('mopdb_log') # connect to db, this will create one if not existing - dbcentral = import_files('data').joinpath('access.db') + dbcentral = import_files('mopdata').joinpath('access.db') if dbname in [dbcentral, 'default']: - db_log.error("The package database cannot be updated") + mopdb_log.error("The package database cannot be updated") sys.exit() - conn = db_connect(dbname, db_log) + conn = db_connect(dbname, logname='mopdb_log') # create table if not existing table_sql = mapping_sql() - create_table(conn, table_sql, db_log) + create_table(conn, table_sql, logname='mopdb_log') # get list of variables already in db in debug mode if ctx.obj['debug']: sql = 'SELECT cmor_var FROM mapping' - results = query(conn, sql, first=False) + results = query(conn, sql, first=False, logname='mopdb_log') existing_vars = [x[0] for x in results] - db_log.debug(f"Variables already in db: {existing_vars}") + mopdb_log.debug(f"Variables already in db: {existing_vars}") # read list of vars from file if alias == 'app4': var_list = read_map_app4(fname) else: - var_list = read_map(fname, alias, db_log) + var_list = read_map(fname, alias) # update mapping table - update_db(conn, 'mapping', var_list, db_log) - return + update_db(conn, 'mapping', var_list) + conn.close() + return None @mopdb.command(name='varlist') -@click.option('--indir', '-i', type=str, required=True, - help='Converted 
model output directory') -@click.option('--startdate', '-d', type=str, required=True, - help='Start date of model run as YYYYMMDD') -@click.option('--dbname', type=str, required=False, default='default', - help='Database relative path by default is package access.db') -@click.option('--version', '-v', required=False, default='CM2', - type=click.Choice(['ESM1.5', 'CM2', 'AUS2200', 'OM2']), show_default=True, - help='ACCESS version currently only CM2, ESM1.5, AUS2200, OM2') +@map_args @click.pass_context -def model_vars(ctx, indir, startdate, dbname, version): +def model_vars(ctx, fpath, match, dbname, version, alias): """Read variables from model output opens one file for each kind, save variable list as csv file Parameters ---------- ctx : obj Click context object - indir : str + fpath : str Path for model output files - startdate : str + match : str Date or other string to match to individuate one file per type dbname : str Database relative path (default is data/access.db) version : str Version of ACCESS model to use as preferred mapping + alias : str + Used for output filename: 'varlist_<alias>'. If '', + 'varlist_mopdb' is used instead Returns ------- """ - db_log = ctx.obj['log'] - # connect to db, this will create one if not existing + # connect to db, check first if db exists or exit if dbname == 'default': - dbname = import_files('data').joinpath('access.db') - conn = db_connect(dbname, db_log) - write_varlist(conn, indir, startdate, version, db_log) + dbname = import_files('mopdata').joinpath('access.db') + conn = db_connect(dbname, logname='mopdb_log') + #mopdb_log = logging.getLogger('mopdb_log') + fname, vobjs, fobjs = write_varlist(conn, fpath, match, version, alias) conn.close() - return + return None @mopdb.command(name='del') -@click.option('--dbname', type=str, required=True, - help='Database relative path') +@click.option('--dbname', type=click.Path(exists=True), + required=True, help='Database relative path') @click.option('--table', '-t', type=str, required=True, help='DB table to remove records from') @click.option('--pair', '-p', type=(str, str), required=True, @@ -442,19 +541,20 @@ def remove_record(ctx, dbname, table, pair): Returns ------- """ - db_log = ctx.obj['log'] + mopdb_log = logging.getLogger('mopdb_log') # connect to db, this will create one if not existing - dbcentral = import_files('data').joinpath('access.db') + dbcentral = import_files('mopdata').joinpath('access.db') if dbname == dbcentral: - db_log.error("The package database cannot be updated") + mopdb_log.error("The package database cannot be updated") sys.exit() - conn = db_connect(dbname, db_log) + conn = db_connect(dbname, logname='mopdb_log') # set which columns to show based on table if table == 'cmorvar': col = "name" elif table == 'mapping': col = "cmor_var,frequency,realm,cmor_table" # select, confirm, delete record/s - delete_record(conn, table, col, pair, db_log) + delete_record(conn, table, col, pair, logname='mopdb_log') + conn.close() return - diff --git a/src/mopdb/mopdb_class.py b/src/mopdb/mopdb_class.py new file mode 100644 index 0000000..a36a6a4 --- /dev/null +++ b/src/mopdb/mopdb_class.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python +# Copyright 2024 ARC Centre of Excellence for Climate Extremes (CLEX) +# Author: Paola Petrelli for CLEX +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# contact: paola.petrelli@utas.edu.au +# +# last updated 06/07/2024 + +from pathlib import Path + +class FPattern(): + """This class represents a file pattern with a set list of variables; + its attributes represent features that the variables share. + """ + + def __init__(self, fpattern: str, fpath: Path | None) -> None: + self.fpattern = fpattern + self.fpath = fpath + self.files = self.get_files() + self.realm = self.get_realm() + self.frequency = self.get_frequency() + self.version = '' + self.multiple_frq = False + self.varlist = [] + + def get_frequency(self): + frequency = 'NAfrq' + if len(self.files) > 0 and self.realm != 'NArealm': + fname = str(self.files[0]) + if self.realm == 'atmos': + fbits = fname.split("_") + frequency = fbits[-1].replace(".nc", "") + elif self.realm == 'ocean': + if any(x in fname for x in ['scalar', 'month']): + frequency = 'mon' + elif 'daily' in fname: + frequency = 'day' + elif self.realm == 'seaIce': + if '_m.' in fname: + frequency = 'mon' + elif '_d.' in fname: + frequency = 'day' + return frequency + + def get_realm(self): + realm = 'NArealm' + if self.fpath is not None: + realm = next((x for x in ['atmos', 'ocean', 'ice', 'ocn','atm'] + if x in self.fpath.parts), 'NArealm') + fix_realm = {'atm': 'atmos', 'ice': 'seaIce', 'ocn': 'ocean'} + if realm in fix_realm.keys(): + realm = fix_realm[realm] + return realm + + def get_files(self): + if self.fpath is None: + files = [] + else: + files = self.list_files(self.fpath, self.fpattern) + return files + + @staticmethod + def list_files(indir, match): + """Returns list of files matching input directory and match""" + files = [x for x in Path(indir).rglob(f"*{match}*") + if x.is_file() and '.nc' in str(x)] + files.sort(key=lambda x:x.name) + return files + + +class Variable(): + """This class represents a single variable, with attributes derived + from file and the ones added by mapping.
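+ + A minimal usage sketch (hypothetical pattern name; with fpath=None + no files are read): + + >>> fobj = FPattern('ocean_month.nc-', None) + >>> v = Variable('tas', fobj) + >>> v.frequency = '3hPt' # the setter normalises 'hPt' to 'hrPt' + >>> v.frequency + '3hrPt'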
+ """ + + def __init__(self, varname: str, fobj: FPattern): + self.name = varname + # path object + self.fpattern = fobj.fpattern + # mapping attributes + self._frequency = fobj.frequency + self._realm = fobj.realm + self.cmor_var = '' + self.cmor_table = '' + #self.input_vars = varname + self.calculation = '' + self.version = fobj.version + self.match = False + # descriptive attributes + self.units = '' + self.dimensions = '' + self.cell_methods = '' + self.positive = '' + self.long_name = '' + self.standard_name = '' + # type and size attributes + self.vtype = '' + self.size = 0 + self.nsteps = 0 + + + @property + def frequency(self): + return self._frequency + + + @frequency.setter + def frequency(self, value): + value = value.replace('hPt', 'hrPt') + if not any(x in value for x in + ['fx', 'min', 'hr', 'day', 'mon', 'yr']): + value = 'NAfrq' + self._frequency = value + + + @property + def realm(self): + return self._realm + + @realm.setter + def realm(self, value): + if not any(x in value for x in + ['atmos', 'seaIce', 'ocean', 'land', 'landIce']): + value = 'NArealm' + self.realm = value + + def get_match(self): + """Returns the attributes that mimic + a database match""" + if self.cmor_var != '': + cmor_var = self.cmor_var + else: + cmor_var = self.name + match = (cmor_var, self.name, '', self.frequency, + self.realm, self.version, '', self.positive, self.units) + return match + + +class MapVariable(): + """This class represent a mapping for variable + It's similar but from a cmor_name point of view + """ + + def __init__(self, match: list, vobj: Variable): + # path object + self.fpattern = vobj.fpattern + # mapping attributes + self.frequency = vobj.frequency + self.realm = match[4] + self.cmor_var = match[0] + self.cmor_table = match[6] + self.input_vars = match[1] + self.calculation = match[2] + self.version = match[5] + # could change this to nomatch found or + # kind of match + self.match = True + # descriptive attributes + self.units = vobj.units + if self.units == '': + self.units = match[8] + self.dimensions = vobj.dimensions + self.cell_methods = vobj.cell_methods + self.positive = match[7] + self.long_name = vobj.long_name + self.standard_name = vobj.standard_name + # type and size attributes + self.vtype = vobj.vtype + self.size = vobj.size + self.nsteps = vobj.nsteps + + def attrs(self): + attrs = [] + for k in self.__dict__.keys(): + if k not in ['match']: + attrs.append(k) + return attrs diff --git a/src/mopdb/mopdb_map.py b/src/mopdb/mopdb_map.py new file mode 100644 index 0000000..cd26112 --- /dev/null +++ b/src/mopdb/mopdb_map.py @@ -0,0 +1,641 @@ +#!/usr/bin/env python +# Copyright 2023 ARC Centre of Excellence for Climate Extremes (CLEX) +# Author: Paola Petrelli for CLEX +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# contact: paola.petrelli@utas.edu.au +# +# last updated 10/04/2024 +# + +import logging +import csv +import json +import lzma +import math +import xarray as xr + +from operator import itemgetter, attrgetter +from pathlib import Path +from itertools import compress +from importlib.resources import files as import_files +#from access_nri_intake.source.builders import AccessEsm15Builder + +from mopdb.mopdb_class import FPattern, Variable, MapVariable +from mopdb.utils import query, read_yaml +from mopdb.mopdb_utils import (get_cell_methods, remove_duplicate, + get_realm, check_realm_units, get_date_pattern) + + +def get_cmorname(conn, vobj, version): + """Queries mapping table for cmip name given variable name as output + by the model + """ + mopdb_log = logging.getLogger('mopdb_log') + sql = f"""SELECT cmor_var,model,cmor_table,frequency FROM mapping + WHERE input_vars='{vobj.name}' and (calculation='' + or calculation IS NULL)""" + results = query(conn, sql, first=False, logname='mopdb_log') + names = list(x[0] for x in results) + tables = list(x[2] for x in results) + mopdb_log.debug(f"In get_cmorname query results: {results}") + if len(names) == 0: + vobj.cmor_var = '' + vobj.cmor_table = '' + elif len(names) == 1: + vobj.cmor_var = names[0] + vobj.cmor_table = tables[0] + elif len(names) > 1: + mopdb_log.debug(f"Found more than 1 definition for {vobj.name}:\n" + + f"{results}") + match_found = False + for r in results: + if r[1] == version and r[3] == vobj.frequency: + vobj.cmor_var, vobj.cmor_table = r[0], r[2] + match_found = True + break + if not match_found: + for r in results: + if r[3] == vobj.frequency: + vobj.cmor_var, vobj.cmor_table = r[0], r[2] + match_found = True + break + if not match_found: + for r in results: + if r[1] == version: + vobj.cmor_var, vobj.cmor_table = r[0], r[2] + match_found = True + break + if not match_found: + vobj.cmor_var = names[0] + vobj.cmor_table = tables[0] + mopdb_log.info(f"Found more than 1 definition for {vobj.name}:\n"+ + f"{results}\n Using {vobj.cmor_var} from {vobj.cmor_table}") + return vobj + +def get_file_frq(ds, fnext, int2frq): + """Return a dictionary with frequency for each time axis. + + Frequency is inferred by comparing the interval between two consecutive + timesteps with the expected interval at a given frequency. + Time axes are ordered so that ones with only one step are last, so we + can use the file frequency (interval_file) inferred from the other + time axes. + This is called if there is more than one time axis in a file + (usually only UM) or if the frequency cannot be derived from the filename.
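+ + A minimal sketch of the matching step, assuming a time axis in days and + the intervals listed in mopdata/interval2frq.yaml: + + >>> import math + >>> int2frq = {'mon': 30.0, 'day': 1.0} + >>> interval = 1.0 # days between two consecutive timesteps + >>> next(k for k, v in int2frq.items() if math.isclose(interval, v, rel_tol=0.05)) + 'day'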
+ """ + mopdb_log = logging.getLogger('mopdb_log') + mopdb_log.debug(f"in get_file_frq fnext: {fnext}") + frq = {} + # retrieve all time axes + time_axs = [d for d in ds.dims if 'time' in d] + #time_axs_len = set(len(ds[d]) for d in time_axs) + time_axs.sort(key=lambda x: len(ds[x]), reverse=True) + mopdb_log.debug(f"in get_file_frq, time_axs: {time_axs}") + if len(time_axs) > 0: + max_len = len(ds[time_axs[0]]) + else: + max_len = 0 + frq = {'time': 'fx'} + # if all time axes have only 1 timestep we cannot infer frequency + # so we open also next file but get only time axs + if max_len == 1: + if fnext is None: + mopdb_log.info("Only 1 file with 1 tstep cannot determine frequency") + else: + dsnext = xr.open_dataset(fnext, decode_times = False) + time_axs2 = [d for d in dsnext.dims if 'time' in d] + ds = xr.concat([ds[time_axs], dsnext[time_axs2]], dim='time') + time_axs = [d for d in ds.dims if 'time' in d] + time_axs.sort(key=lambda x: len(ds[x]), reverse=True) + if max_len > 0: + for t in time_axs: + mopdb_log.debug(f"len of time axis {t}: {len(ds[t])}") + if len(ds[t]) > 1: + interval = (ds[t][1]-ds[t][0]).values + interval_file = (ds[t][-1] -ds[t][0]).values + else: + interval = interval_file + mopdb_log.debug(f"interval 2 timesteps for {t}: {interval}") + for k,v in int2frq.items(): + if math.isclose(interval, v, rel_tol=0.05): + frq[t] = k + break + return frq + +def write_varlist(conn, indir, match, version, alias): + """Based on model output files create a variable list and save it + to a csv file. Main attributes needed to map output are provided + for each variable + """ + mopdb_log = logging.getLogger('mopdb_log') + line_cols = ['name','cmor_var','units','dimensions','_frequency', + '_realm','cell_methods','cmor_table','vtype','size', + 'nsteps','fpattern','long_name','standard_name'] + vobj_list = [] + fobj_list = [] + patterns = [] + files = FPattern.list_files(indir, match) + mopdb_log.debug(f"Files after sorting: {files}") + if alias == '': + alias = 'mopdb' + fname = f"varlist_{alias}.csv" + fcsv = open(fname, 'w') + fwriter = csv.writer(fcsv, delimiter=';') + fwriter.writerow(["name", "cmor_var", "units", "dimensions", + "frequency", "realm", "cell_methods", "cmor_table", "vtype", + "size", "nsteps", "fpattern", "long_name", "standard_name"]) + for fpath in files: + # get filename pattern until date match + mopdb_log.debug(f"Filename: {fpath.name}") + fpattern = fpath.name.split(match)[0] + if fpattern in patterns: + continue + patterns.append(fpattern) + fobj = FPattern(fpattern, fpath.parent) + #pattern_list = list_files(indir, f"{fpattern}*") + nfiles = len(fobj.files) + mopdb_log.debug(f"File pattern, number of files: {fpattern}, {nfiles}") + #fwriter.writerow([f"#{fpattern}"]) + # get attributes for the file variables + ds = xr.open_dataset(str(fobj.files[0]), decode_times=False) + time_units = ds['time'].units.split()[0] + yfile = import_files('mopdata').joinpath('interval2frq.yaml') + fdata = read_yaml(yfile) + int2frq = fdata[time_units] + coords = [c for c in ds.coords] + ['latitude_longitude'] + #pass next file in case of 1 timestep per file and no frq in name + if len(fobj.files) == 1: + fnext = None + else: + fnext = str(fobj.files[1]) + if fobj.frequency == 'NAfrq' or fobj.realm == 'atmos': + frq_dict = get_file_frq(ds, fnext, int2frq) + # if only one frequency detected empty dict + if len(frq_dict) == 1: + fobj.frequency = frq_dict.popitem()[1] + else: + fobj.multiple_frq = True + fobj.frequency = frq_dict['time'] + mopdb_log.debug(f"Multiple frq: 
{fobj.multiple_frq}") + if fobj.realm == "NArealm": + fobj.realm = get_realm(version, ds) + pattern_var_list = [] + for vname in ds.variables: + vobj = Variable(vname, fobj) + if vname not in coords and all(x not in vname for x in ['_bnds','_bounds']): + v = ds[vname] + mopdb_log.debug(f"Variable: {vobj.name}") + # get size in bytes of grid for 1 timestep and number of timesteps + vobj.size = v[0].nbytes + vobj.nsteps = nfiles * v.shape[0] + # assign time axis frequency if more than one is available + if fobj.multiple_frq: + if 'time' in v.dims[0]: + vobj._frequency = frq_dict[v.dims[0]] + else: + mopdb_log.info(f"Could not detect frequency for variable: {v}") + attrs = v.attrs + vobj.cell_methods, frqmod = get_cell_methods(attrs, v.dims) + vobj.frequency = vobj.frequency + frqmod + mopdb_log.debug(f"Frequency var: {vobj.frequency}") + # try to retrieve cmip name + vobj = get_cmorname(conn, vobj, version) + vobj.units = attrs.get('units', "") + vobj.long_name = attrs.get('long_name', "") + vobj.standard_name = attrs.get('standard_name', "") + vobj.dimensions = " ".join(v.dims) + vobj.vtype = v.dtype + line = [attrgetter(k)(vobj) for k in line_cols] + fwriter.writerow(line) + vobj_list.append(vobj) + pattern_var_list.append(vobj) + fobj.varlist = pattern_var_list + fobj_list.append(fobj) + mopdb_log.info(f"Variable list for {fpattern} successfully written") + fcsv.close() + return fname, vobj_list, fobj_list + +def match_stdname(conn, vobj, stdn): + """Returns an updated stdn list if finds one or more variables + in cmorvar table that match the standard name passed as input. + It also return a False/True found_match boolean. + """ + #mopdb_log = logging.getLogger('mopdb_log') + found_match = False + sql = f"""SELECT name FROM cmorvar where + standard_name='{vobj.standard_name}'""" + results = query(conn, sql, first=False, logname='mopdb_log') + matches = [x[0] for x in results] + if len(matches) > 0: + vmatch = vobj.get_match() + stdn = add_var(stdn, vobj, tuple([matches]+list(vmatch[1:])), + stdnm=True) + found_match = True + return stdn, found_match + +def match_var(vobj, version, mode, conn, records): + """Returns match for variable if found after looping + variables already mapped in database + Parameters + + """ + mopdb_log = logging.getLogger('mopdb_log') + found_match = False + # build sql query based on mode + sql_base = f"""SELECT cmor_var,input_vars,calculation,frequency, + realm,model,cmor_table,positive,units FROM mapping where + input_vars='{vobj.name}'""" + sql_frq = f" and frequency='{vobj.frequency}'" + sql_ver = f" and model='{version}'" + if mode == 'full': + sql = sql_base + sql_frq + sql_ver + elif mode == 'no_frq': + sql = sql_base + sql_ver + elif mode == 'no_ver': + sql = sql_base + sql_frq + # execute query and process results + result = query(conn, sql, first=False, logname='mopdb_log') + mopdb_log.debug(f"match_var: {result}, sql: {sql[114:]}") + if result is not None and result != []: + for x in result: + mopdb_log.debug(f"match: {x}") + records = add_var(records, vobj, x) + found_match = True + return records, found_match + +def parse_vars(conn, vobjs, version): + """Returns records of variables to include in template mapping file, + a list of all stash variables + frequency available in model output + and a list of variables already defined in db + + Parameters + ---------- + conn : connection object + rows : list(dict) + list of variables to match + version : str + model version to use to match variables + + Returns + ------- + stash_vars : list + 
varname-frequency for each listed variable, varname is from model output + """ + mopdb_log = logging.getLogger('mopdb_log') + full = [] + no_ver = [] + no_frq = [] + stdn = [] + no_match = [] + stash_vars = [] + + # looping through variables from file and attempt matches to db + for v in vobjs: + #if row['name'][0] == "#" or row['name'] == 'name': + # continue + #else: + full, found = match_var(v, version, 'full', conn, full) + # if no match, ignore model version first and then frequency + #mopdb_log.debug(f"found perfect match: {found}") + if not found: + no_ver, found = match_var(v, version, 'no_ver', conn, no_ver) + mopdb_log.debug(f"found no ver match: {found}") + if not found: + no_frq, found = match_var(v, version, 'no_frq', conn, no_frq) + mopdb_log.debug(f"found no frq match: {found}") + # make a last attempt to match using standard_name + if not found: + if v.standard_name != '': + stdn, found = match_stdname(conn, v, stdn) + mopdb_log.debug(f"found stdnm match: {found}") + if not found: + # use original var values for match + vmatch = v.get_match() + mopdb_log.debug(f"Getting match from variable: {vmatch}") + no_match = add_var(no_match, v, vmatch) + stash_vars.append(f"{v.name}-{v.frequency}") + + return full, no_ver, no_frq, stdn, no_match, stash_vars + +def add_var(vlist, vobj, match, stdnm=False): + """Add information from match to variable list and re-order + fields so they correspond to final mapping output. + + Parameters + match : tuple + match values (cmor_var,input_vars,calculation,frequency, + realm,model(version),cmor_table,positive,units) + """ + mopdb_log = logging.getLogger('mopdb_log') + # assign cmor_var from match and swap place with input_vars + mopdb_log.debug(f"Assign cmor_var: {match}") + mopdb_log.debug(f"initial variable definition: {vobj}") + var = MapVariable(match, vobj) + if stdnm: + var.input_vars = vobj.name + if len(var.cmor_var) == 1: + cmor_var, table = var.cmor_var[0].split("-") + var.cmor_var = cmor_var + var.cmor_table = table + vlist.append(var) + return vlist + +def potential_vars(conn, vobjs, stash_vars, version): + """Returns list of variables that can be potentially derived from + model output. 
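+ For example (hypothetical names): a mapping whose input_vars is 'u10 v10' is only proposed when both 'u10-1hr' and 'v10-1hr' appear in stash_vars.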
+ + Loop across all model variables to match. + Select any mapping that contains the variable in its input_vars and + has a calculation defined. + NB records are modified by add_var when assigning cmor name and positive values + + Parameters + ---------- + conn : connection object + vobjs : list(Variable) + list of variable objects to match + stash_vars : list + varname-frequency for each listed variable, varname is from model output + version : str + model version to use to match variables + + Returns + ------- + """ + mopdb_log = logging.getLogger('mopdb_log') + pot_full = [] + pot_part = [] + pot_varnames = set() + for v in vobjs: + sql = f"""SELECT cmor_var,input_vars,calculation,frequency, + realm,model,cmor_table,positive,units FROM mapping + WHERE input_vars like '%{v.name}%'""" + results = query(conn, sql, first=False, logname='mopdb_log') + mopdb_log.debug(f"In potential: var {v.name}, db results {results}") + for r in results: + allinput = r[1].split(" ") + mopdb_log.debug(f"{len(allinput)> 1}") + mopdb_log.debug(all(f"{x}-{v.frequency}" in stash_vars for x in allinput)) + if len(allinput) > 1 and all(f"{x}-{v.frequency}" in stash_vars for x in allinput): + # if both version and frequency of applied mapping match + # consider this a full matching potential var + if r[5] == version and r[3] == v.frequency: + pot_full = add_var(pot_full, v, r) + else: + pot_part = add_var(pot_part, v, r) + pot_varnames.add(r[0]) + return pot_full, pot_part, pot_varnames + + +def write_map_template(conn, parsed, alias): + """Write mapping csv file template based on list of variables to define + + Input varlist file order: + name, cmor_var, units, dimensions, frequency, realm, cell_methods, + cmor_table, vtype, size, nsteps, fpattern, long_name, standard_name + Mapping db order: + cmor_var, input_vars, calculation, units, dimensions, frequency, realm, + cell_methods, positive, cmor_table, model, notes, origin + for pot vars + vtype, size, nsteps, fpattern + Final template order: + cmor_var, input_vars, calculation, units, dimensions, frequency, realm, + cell_methods, positive, cmor_table, version, vtype, size, nsteps, fpattern, + long_name, standard_name + """ + + mopdb_log = logging.getLogger('mopdb_log') + full, no_ver, no_frq, stdn, no_match, pot_full, pot_part = parsed + keys = ['cmor_var', 'input_vars', 'calculation', 'units', + 'dimensions', 'frequency', 'realm', 'cell_methods', + 'positive', 'cmor_table', 'version', 'vtype', 'size', + 'nsteps', 'fpattern', 'long_name', 'standard_name'] + + with open(f"map_{alias}.csv", 'w') as fcsv: + fwriter = csv.DictWriter(fcsv, keys, delimiter=';') + write_vars(full, fwriter, keys, conn=conn) + # write header as write_vars skips it if full is empty + if len(full) == 0: + fwriter.writerow({x:x for x in keys}) + div = ("# Derived variables with matching version and " + + "frequency: Use with caution!") + write_vars(pot_full, fwriter, div, conn=conn) + div = ("# Variables definitions coming from different " + + "version") + write_vars(no_ver, fwriter, div, conn=conn) + div = ("# Variables with different frequency: Use with" + + " caution!") + write_vars(no_frq, fwriter, div, conn=conn) + div = ("# Variables matched using standard_name: Use " + + "with caution!") + write_vars(stdn, fwriter, div, sortby='input_vars') + div = "# Derived variables: Use with caution!"
+ write_vars(pot_part, fwriter, div, conn=conn) + div = "# Variables without mapping" + write_vars(no_match, fwriter, div) + mopdb_log.debug("Finished writing variables to mapping template") + fcsv.close() + return + +def write_vars(vlist, fwriter, div, conn=None, sortby='cmor_var'): + """ + """ + + #mopdb_log = logging.getLogger('mopdb_log') + if len(vlist) > 0: + if type(div) is str: + divrow = {x:'' for x in vlist[0].attrs()} + divrow['cmor_var'] = div + elif type(div) is list: + divrow = {x:x for x in div} + fwriter.writerow(divrow) + dlist = [] + for var in vlist: + if conn: + var = check_realm_units(conn, var) + dlist.append( var.__dict__ ) + for dvar in sorted(dlist, key=itemgetter(sortby)): + if 'match' in dvar.keys(): + dvar.pop('match') + fwriter.writerow(dvar) + return + +def map_variables(conn, vobjs, version): + """ + """ + mopdb_log = logging.getLogger('mopdb_log') + # return lists of fully/partially matching variables and stash_vars + # these are input_vars for calculation defined in already in mapping db + full, no_ver, no_frq, stdn, no_match, stash_vars = parse_vars(conn, + vobjs, version) + # remove duplicates from partially matched variables + no_ver = remove_duplicate(no_ver) + no_frq = remove_duplicate(no_frq, strict=False) + no_match = remove_duplicate(no_match, strict=False) + # check if more derived variables can be added based on all + # input_vars being available + pot_full, pot_part, pot_varnames = potential_vars(conn, vobjs, + stash_vars, version) + # potential vars have always duplicates: 1 for each input_var + pot_full = remove_duplicate(pot_full, strict=False) + pot_part = remove_duplicate(pot_part, extra=pot_full, strict=False) + mopdb_log.info(f"Derived variables: {pot_varnames}") + return full, no_ver, no_frq, stdn, no_match, pot_full, pot_part + +def get_map_obj(parsed): + """Returns list of variable objects to pass to intake""" + full, no_ver, no_frq, stdn, no_match, pot_full, pot_part = parsed + vobjs = [] + select = full + no_ver + no_frq + for v in select: + vobjs.append(v) + return vobjs + +def write_catalogue(conn, vobjs, fobjs, alias): + """Write intake-esm catalogue and returns name + """ + + mopdb_log = logging.getLogger('mopdb_log') + # read template json file + jfile = import_files('mopdata').joinpath('intake_cat_template.json') + with open(jfile, 'r') as f: + template = json.load(f) + # write updated json to file + for k,v in template.items(): + if type(v) is str: + template[k] = v.replace("", alias) + jout = f"intake_{alias}.json" + with open(jout, 'w') as f: + json.dump(template, f, indent=4) + # read template yaml file + yfile = import_files('mopdata').joinpath('intake_cat_template.yaml') + with open(yfile, "r") as f: + maincat = f.read() + maincat = maincat.replace("", alias) + mopdb_log.debug("Opened intake template files") + # write updated yaml to file + yout = f"intake_{alias}.yaml" + with open(yout, 'w') as f: + f.writelines(maincat) + # create a dictionary for each file to list + lines = create_file_dict(fobjs, alias) + # write csv file + cols = [x['column_name'] for x in template['attributes']] + cols = ['path'] + cols + csvname = template['catalog_file'] + with lzma.open(csvname, 'wt') as fcsv: + fwriter = csv.DictWriter(fcsv, cols) + fwriter.writeheader() + for fd in lines: + fwriter.writerow(fd) + fcsv.close() + return jout, csvname + +def create_file_dict(fobjs, alias): + """ + """ + #mopdb_log = logging.getLogger('mopdb_log') + lines = [] + for pat_obj in fobjs: + var_list = [v.name for v in pat_obj.varlist] + # set to remove 
'' duplicates + base_dict = {'experiment': alias, + 'realm': pat_obj.realm, + 'frequency': pat_obj.frequency, + 'variable': str(var_list), + 'mapvar': "NAV", + 'standard_name': "", + 'units': "", + 'calculation': "", + 'cell_methods': ""} + # work out date_pattern in filename + fname = pat_obj.files[0].name + date_pattern = get_date_pattern(fname, pat_obj.fpattern) + # add date and path for each file + path_list = [] + for fpath in pat_obj.files: + f = fpath.name + fd = base_dict.copy() + fd['path'] = str(fpath) + fd['date'] = ''.join(c for c in compress(f, date_pattern)) + lines.append(fd) + path_list.append((fd['path'],fd['date'])) + lines = add_mapvars(pat_obj.varlist, lines, path_list, alias) + return lines + +def add_mapvars(vobjs, lines, path_list, alias): + """ + """ + #mopdb_log = logging.getLogger('mopdb_log') + for vobj in vobjs: + if vobj.cmor_var != "" or vobj.standard_name != "": + mapvar = vobj.cmor_var + base_dict = {'experiment': alias, + 'realm': vobj.realm, + 'frequency': vobj.frequency, + 'variable': str([vobj.name]), + 'mapvar': mapvar if mapvar else "NAV", + 'standard_name': vobj.standard_name, + 'units': vobj.units, + 'calculation': vobj.calculation, + 'cell_methods': vobj.cell_methods} + # use path_list to add path and date for all files + for fpath, date in path_list: + fd = base_dict.copy() + fd['path'] = fpath + fd['date'] = date + lines.append(fd) + return lines + +def load_vars(fname, indir=None): + """Returns Variable and FPattern objs from varlist or map file. + """ + mopdb_log = logging.getLogger('mopdb_log') + vobjs = [] + fobjs = {} + if indir is not None: + indir = Path(indir) + # distinguish between varlist and mapping file based on header + with open(fname, 'r') as csvfile: + reader = csv.DictReader(csvfile, delimiter=';') + rows = list(reader) + mopdb_log.debug(f"Loaded file with {len(rows)} rows") + # set fobjs + patterns = list(set(x['fpattern'] for x in rows)) + for pat in patterns: + if pat != "": + fo = FPattern(pat, indir) + fobjs[pat] = fo + if 'calculation' in rows[0].keys(): + map_file = True + colname = 'input_vars' + else: + map_file = False + colname = 'name' + for row in rows: + fo = fobjs[row['fpattern']] + vo = Variable(row[colname], fo) + for k,v in row.items(): + if k in ['realm', 'frequency']: + k = '_' + k + vo.__dict__[k] = v + if fo.realm == 'NArealm': + fo.realm = vo.realm + if fo.frequency == 'NAfrq': + fo.frequency = vo.frequency + fo.varlist.append(vo) + if map_file is True: + mvo = MapVariable(list(vo.get_match()), vo) + vobjs.append(mvo) + else: + vobjs.append(vo) + return map_file, vobjs, [x for x in fobjs.values()] diff --git a/src/mopdb/mopdb_utils.py b/src/mopdb/mopdb_utils.py index 7d4ab62..b3f8bf4 100644 --- a/src/mopdb/mopdb_utils.py +++ b/src/mopdb/mopdb_utils.py @@ -19,65 +19,15 @@ # last updated 10/04/2024 # -import sqlite3 import logging import sys -import os import csv -import glob import json -import stat -import xarray as xr -import math -from datetime import datetime, date -from collections import Counter -from operator import itemgetter -from pathlib import Path - - -def config_log(debug): - """Configures log file""" - # start a logger - logger = logging.getLogger('db_log') - # set a formatter to manage the output format of our handler - formatter = logging.Formatter('%(asctime)s; %(message)s',"%Y-%m-%d %H:%M:%S") - # set the level for the logger, has to be logging.LEVEL not a string - level = logging.INFO - flevel = logging.WARNING - if debug: - level = logging.DEBUG - flevel = logging.DEBUG - 
logger.setLevel(level) - - # add a handler to send WARNING level messages to console - # or DEBUG level if debug is on - clog = logging.StreamHandler() - clog.setLevel(level) - logger.addHandler(clog) - - # add a handler to send INFO level messages to file - # the messages will be appended to the same file - # create a new log file every month - day = date.today().strftime("%Y%m%d") - logname = 'mopdb_log_' + day + '.txt' - flog = logging.FileHandler(logname) - try: - os.chmod(logname, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO); - except OSError: - pass - flog.setLevel(flevel) - flog.setFormatter(formatter) - logger.addHandler(flog) - # return the logger object - return logger +from datetime import date +from collections import Counter -def db_connect(db, db_log): - """Connects to ACCESS mapping sqlite database""" - conn = sqlite3.connect(db, timeout=10, isolation_level=None) - if conn.total_changes == 0: - db_log.info(f"Opened database {db} successfully") - return conn +from mopdb.utils import query def mapping_sql(): @@ -106,7 +56,6 @@ ) WITHOUT ROWID;""") return sql - def cmorvar_sql(): """Returns sql definition of cmorvar table @@ -137,7 +86,6 @@ ok_max_mean_abs TEXT);""") return sql - def map_update_sql(): """Returns sql needed to update mapping table @@ -155,7 +103,6 @@ {', '.join(x+' = excluded.'+x for x in cols)}""" return sql - def cmor_update_sql(): """Returns sql needed to update cmorvar table @@ -174,8 +121,7 @@ {', '.join(x+' = excluded.'+x for x in cols)}""" return sql - -def create_table(conn, sql, db_log): +def create_table(conn, sql): """Creates table if database is empty Parameters @@ -183,17 +129,16 @@ conn : connection object sql : str SQL style string defining table to create - db_log: logger obj """ + mopdb_log = logging.getLogger('mopdb_log') try: c = conn.cursor() c.execute(sql) except Exception as e: - db_log.error(e) + mopdb_log.error(e) return - -def update_db(conn, table, rows_list, db_log): +def update_db(conn, table, rows_list): """Adds to table new variables definitions Parameters @@ -203,114 +148,26 @@ conn : connection object table : str Name of database table to use rows_list : list List of str representing rows to add to table - db_log: logger obj """ + mopdb_log = logging.getLogger('mopdb_log') # insert into db if table == 'cmorvar': sql = cmor_update_sql() elif table == 'mapping': sql = map_update_sql() else: - db_log.error("Provide an insert sql statement for table: {table}") + mopdb_log.error(f"Provide an insert sql statement for table: {table}") if len(rows_list) > 0: - db_log.info('Updating db ...') + mopdb_log.info('Updating db ...') with conn: c = conn.cursor() - db_log.debug(sql) + mopdb_log.debug(sql) c.executemany(sql, rows_list) nmodified = c.rowcount - db_log.info(f"Rows modified: {nmodified}") - conn.close() - db_log.info('--- Done ---') + mopdb_log.info(f"Rows modified: {nmodified}") + mopdb_log.info('--- Done ---') return - -def query(conn, sql, tup=(), first=True): - """Executes generic sql query and returns row/s - - Parameters - ---------- - conn : connection object - Connection to sqlite database - sql : str - sql string representing query - tup : tuple - By default empty, used to pass values when placeholder ?
is used
-        in sql string
-    first : boolean
-        By default True will return only first record found, set to False
-        to return all matching records
-
-    Returns
-    -------
-    result : tuple/list(tuple)
-        tuple or a list of, representing row/s returned by query
-    """
-    with conn:
-        c = conn.cursor()
-        c.execute(sql, tup)
-        if first:
-            result = c.fetchone()
-        else:
-            result = [ x for x in c.fetchall() ]
-        #columns = [description[0] for description in c.description]
-    return result
-
-
-def get_columns(conn, table):
-    """Gets list of columns form db table
-    """
-    sql = f'PRAGMA table_info({table});'
-    table_data = query(conn, sql, first=False)
-    columns = [x[1] for x in table_data]
-    return columns
-
-
-def get_cmorname(conn, varname, version, frequency, db_log):
-    """Queries mapping table for cmip name given variable name as output
-    by the model
-    """
-    sql = f"""SELECT cmor_var,model,cmor_table,frequency FROM mapping
-        WHERE input_vars='{varname}' and (calculation=''
-        or calculation IS NULL)"""
-    results = query(conn, sql, first=False)
-    names = list(x[0] for x in results)
-    tables = list(x[2] for x in results)
-    if len(names) == 0:
-        cmor_var = ''
-        cmor_table = ''
-    elif len(names) == 1:
-        cmor_var = names[0]
-        cmor_table = tables[0]
-    elif len(names) > 1:
-        db_log.debug(f"Found more than 1 definition for {varname}:\n" +
-                     f"{results}")
-        match_found = False
-        for r in results:
-            if r[1] == version and r[3] == frequency:
-                cmor_var, cmor_table = r[0], r[2]
-                match_found = True
-                break
-        if not match_found:
-            for r in results:
-                if r[3] == frequency:
-                    cmor_var, cmor_table = r[0], r[2]
-                    match_found = True
-                    break
-        if not match_found:
-            for r in results:
-                if r[1] == version:
-                    cmor_var, cmor_table = r[0], r[2]
-                    match_found = True
-                    break
-        if not match_found:
-            cmor_var = names[0]
-            cmor_table = tables[0]
-            db_log.info(f"Found more than 1 definition for {varname}:\n"+
-                        f"{results}\n Using {cmor_var} from {cmor_table}")
-    return cmor_var, cmor_table
-
-
 def cmor_table_header(name, realm, frequency):
    """
    """
@@ -334,24 +191,24 @@ def cmor_table_header(name, realm, frequency):
    }
    return header

-
-def write_cmor_table(var_list, name, db_log):
+def write_cmor_table(var_list, name):
    """
    """
+    mopdb_log = logging.getLogger('mopdb_log')
    realms = [v[2] for v in var_list]
    setr = set(realms)
    if len(setr) > 1:
        realm = Counter(realms).most_common(1)[0][0]
-        db_log.info(f"More than one realms found for variables: {setr}")
-        db_log.info(f"Using: {realm}")
+        mopdb_log.info(f"More than one realm found for variables: {setr}")
+        mopdb_log.info(f"Using: {realm}")
    else:
        realm = realms[0]
    freqs = [v[1] for v in var_list]
    setf = set(freqs)
    if len(setf) > 1:
        frequency = Counter(freqs).most_common(1)[0][0]
-        db_log.info(f"More than one freqs found for variables: {setf}")
-        db_log.info(f"Using: {frequency}")
+        mopdb_log.info(f"More than one frequency found for variables: {setf}")
+        mopdb_log.info(f"Using: {frequency}")
    else:
        frequency = freqs[0]
    header = cmor_table_header(name, realm, frequency)
@@ -372,134 +229,13 @@ def write_cmor_table(var_list, name, db_log):
        json.dump(out, f, indent=4)
    return

-
-def delete_record(conn, table, col, pairs, db_log):
-    """Deletes record from table based on pairs of column and
-    value passed for selection
-
-    Parameters
-    ----------
-    conn : connection object
-        connection to db
-    table: str
-        db table name
-    col: str
-        name of column to return with query
-    pairs : list[tuple(str, str)]
-        pairs of columns, values to select record/s
-    db_log: logger obj
-        logger connection
-    """
-    # Set up
query - sqlwhere = f"FROM {table} WHERE " - for c,v in pairs: - sqlwhere += f"{c}='{v}' AND " - sql = f"SELECT {col} " + sqlwhere[:-4] - db_log.debug(f"Delete query: {sql}") - xl = query(conn, sql, first=False) - # Delete from db - if xl is not None: - db_log.info(f"Found {len(xl)} records") - for x in xl: - db_log.info(f"{x}") - confirm = input('Confirm deletion from database: Y/N ') - if confirm == 'Y': - db_log.info('Updating db ...') - with conn: - c = conn.cursor() - sql = "DELETE " + sqlwhere[:-4] - db_log.debug(f"Delete sql: {sql}") - c.execute(sql) - c.execute('select total_changes()') - db_log.info(f"Rows modified: {c.fetchall()[0][0]}") - else: - db_log.info("The query did not return any records") - conn.close() - return - - -def list_files(indir, match, db_log): - """Returns list of files matching input directory and match""" - files = [x for x in Path(indir).rglob(f"{match}") if x.is_file()] - db_log.debug(f"{indir}/**/*{match}*") - return files - - -def build_umfrq(time_axs, ds, db_log): - """ - """ - umfrq = {} - #PPfirst_step = {} - int2frq = {'dec': 3652.0, 'yr': 365.0, 'mon': 30.0, - 'day': 1.0, '6hr': 0.25, '3hr': 0.125, - '1hr': 0.041667, '10min': 0.006944} - for t in time_axs: - #PPfirst_step[t] = ds[t][0].values - if len(ds[t]) > 1: - interval = (ds[t][1]-ds[t][0]).values - interval_file = (ds[t][-1] -ds[t][0]).values - for k,v in int2frq.items(): - if math.isclose(interval, v, rel_tol=0.05): - umfrq[t] = k - break - else: - umfrq[t] = 'file' - # use other time_axis info to work out frq of time axis with 1 step - db_log.debug(f"umfrq in function {umfrq}") - for t,frq in umfrq.items(): - if frq == 'file': - for k,v in int2frq.items(): - if math.isclose(interval_file, v, rel_tol=0.05): - umfrq[t] = k - break - return umfrq - - -def get_frequency(realm, fname, ds, db_log): - """Return frequency based on realm and filename - For UM files checks if more than one time axis is present and if so - returns dictionary with frequency: variable list - """ - umfrq = {} - frequency = 'NA' - if realm == 'atmos': - fbits = fname.split("_") - frequency = fbits[-1].replace(".nc", "") - if frequency == 'dai': - frequency = 'day' - elif frequency == '3h': - frequency = '3hr' - elif frequency == '6h': - frequency = '6hr' - else: - frequency = frequency.replace('hPt', 'hrPt') - time_axs = [d for d in ds.dims if 'time' in d] - time_axs_len = set(len(ds[d]) for d in time_axs) - if len(time_axs_len) == 1: - umfrq = {} - else: - umfrq = build_umfrq(time_axs, ds, db_log) - elif realm == 'ocean': - # if I found scalar or monthly in any of fbits - if any(x in fname for x in ['scalar', 'month']): - frequency = 'mon' - elif 'daily' in fname: - frequency = 'day' - elif realm == 'ice': - if '_m.' in fname: - frequency = 'mon' - elif '_d.' in fname: - frequency = 'day' - db_log.debug(f"Frequency: {frequency}") - return frequency, umfrq - - def get_cell_methods(attrs, dims): """Get cell_methods from variable attributes. If cell_methods is not defined assumes values are instantaneous `time: point` If `area` not specified is added at start of string as `area: ` """ + #mopdb_log = logging.getLogger('mopdb_log') frqmod = '' val = attrs.get('cell_methods', "") if 'area' not in val: @@ -513,93 +249,9 @@ def get_cell_methods(attrs, dims): val = val.replace(time_axs[0], 'time') return val, frqmod - -def write_varlist(conn, indir, startdate, version, db_log): - """Based on model output files create a variable list and save it - to a csv file. 
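A hedged sketch of the conventions get_cell_methods() applies (its full body sits outside this hunk); the 'Pt' frequency modifier is an assumption inferred from the 'hrPt' frequencies used elsewhere in this file:

def cell_methods_sketch(attrs):
    # missing cell_methods are treated as instantaneous values
    frqmod = ''
    val = attrs.get('cell_methods', "")
    if 'area' not in val:
        val = 'area: ' + val          # prepend an area entry when absent
    if 'time' not in val:
        val += 'time: point'          # instantaneous -> 'Pt' modifier
        frqmod = 'Pt'
    return val.strip(), frqmod

print(cell_methods_sketch({}))        # ('area: time: point', 'Pt')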
Main attributes needed to map output are provided - for each variable - """ - #PP temporarily remove .nc as ocean files sometimes have pattern.nc-datestamp - #sdate = f"*{startdate}*.nc" - sdate = f"*{startdate}*" - files = list_files(indir, sdate, db_log) - db_log.debug(f"Found files: {files}") - patterns = [] - for fpath in files: - # get filename pattern until date match - db_log.debug(f"Filename: {fpath.name}") - fpattern = fpath.name.split(startdate)[0] - # adding this in case we have a mix of yyyy/yyyymn date stamps - # as then a user would have to pass yyyy only and would get 12 files for some of the patterns - if fpattern in patterns: - continue - patterns.append(fpattern) - pattern_list = list_files(indir, f"{fpattern}*", db_log) - nfiles = len(pattern_list) - db_log.debug(f"File pattern: {fpattern}") - fcsv = open(f"{fpattern}.csv", 'w') - fwriter = csv.writer(fcsv, delimiter=';') - fwriter.writerow(["name", "cmor_var", "units", "dimensions", - "frequency", "realm", "cell_methods", "cmor_table", - "vtype", "size", "nsteps", "filename", "long_name", - "standard_name"]) - # get attributes for the file variables - try: - if version == 'AUS2200': - realm = '/atmos/' - else: - realm = [x for x in ['/atmos/', '/ocean/', '/ice/'] if x in str(fpath)][0] - except: - realm = [x for x in ['/atm/', '/ocn/', '/ice/'] if x in str(fpath)][0] - realm = realm[1:-1] - if realm == 'atm': - realm = 'atmos' - elif realm == 'ocn': - realm = 'ocean' - db_log.debug(realm) - ds = xr.open_dataset(fpath, decode_times=False) - coords = [c for c in ds.coords] + ['latitude_longitude'] - frequency, umfrq = get_frequency(realm, fpath.name, ds, db_log) - db_log.debug(f"Frequency: {frequency}") - db_log.debug(f"umfrq: {umfrq}") - multiple_frq = False - if umfrq != {}: - multiple_frq = True - db_log.debug(f"Multiple frq: {multiple_frq}") - for vname in ds.variables: - if vname not in coords and all(x not in vname for x in ['_bnds','_bounds']): - v = ds[vname] - db_log.debug(f"Variable: {v.name}") - # get size in bytes of grid for 1 timestep and number of timesteps - vsize = v[0].nbytes - nsteps = nfiles * v.shape[0] - # assign specific frequency if more than one is available - if multiple_frq: - if 'time' in v.dims[0]: - frequency = umfrq[v.dims[0]] - else: - frequency = 'NA' - db_log.info(f"Could not detect frequency for variable: {v}") - attrs = v.attrs - cell_methods, frqmod = get_cell_methods(attrs, v.dims) - varfrq = frequency + frqmod - db_log.debug(f"Frequency x var: {varfrq}") - # try to retrieve cmip name - cmor_var, cmor_table = get_cmorname(conn, vname, - version, varfrq, db_log) - line = [v.name, cmor_var, attrs.get('units', ""), - " ".join(v.dims), varfrq, realm, - cell_methods, cmor_table, v.dtype, vsize, - nsteps, fpattern, attrs.get('long_name', ""), - attrs.get('standard_name', "")] - fwriter.writerow(line) - fcsv.close() - db_log.info(f"Variable list for {fpattern} successfully written") - return - - def read_map_app4(fname): """Reads APP4 style mapping """ + #mopdb_log = logging.getLogger('mopdb_log') # old order #cmor_var,definable,input_vars,calculation,units,axes_mod,positive,ACCESS_ver[CM2/ESM/both],realm,notes var_list = [] @@ -622,18 +274,19 @@ def read_map_app4(fname): return var_list -def read_map(fname, alias, db_log): +def read_map(fname, alias): """Reads complete mapping csv file and extract info necessary to create new records for the mapping table in access.db Fields from file: cmor_var, input_vars, calculation, units, dimensions, frequency, realm, cell_methods, positive, 
cmor_table, version, vtype, size, nsteps, - filename, long_name, standard_name + fpattern, long_name, standard_name Fields in table: cmor_var, input_vars, calculation, units, dimensions, frequency, realm, cell_methods, positive, model, notes, origin NB model and version are often the same but version should eventually be defined in a CV """ + mopdb_log = logging.getLogger('mopdb_log') var_list = [] with open(fname, 'r') as csvfile: reader = csv.reader(csvfile, delimiter=';') @@ -642,161 +295,18 @@ def read_map(fname, alias, db_log): if row[0][0] == "#": continue else: - db_log.debug(f"In read_map: {row[0]}") - db_log.debug(f"In read_map row length: {len(row)}") + mopdb_log.debug(f"In read_map: {row[0]}") + mopdb_log.debug(f"In read_map row length: {len(row)}") if row[16] != '': notes = row[16] else: notes = row[15] - if alias is None: + if alias == '': alias = fname.replace(".csv","") var_list.append(row[:11] + [notes, alias]) return var_list - -def match_stdname(conn, row, stdn, db_log): - """Returns an updated stdn list if finds one or more variables - in cmorvar table that match the standard name passed as input. - It also return a False/True found_match boolean. - """ - found_match = False - sql = f"""SELECT name FROM cmorvar where - standard_name='{row['standard_name']}'""" - results = query(conn, sql, first=False) - matches = [x[0] for x in results] - if len(matches) > 0: - stdn = add_var(stdn, row, tuple([matches]+['']*7), db_log, - stdnm=True) - found_match = True - - return stdn, found_match - - -def match_var(row, version, mode, conn, records, db_log): - """Returns match for variable if found after looping - variables already mapped in database - Parameters - - """ - found_match = False - # build sql query based on mode - sql_base = f"""SELECT cmor_var,input_vars,calculation,frequency, - realm,model,cmor_table,positive,units FROM mapping where - input_vars='{row['name']}'""" - sql_frq = f" and frequency='{row['frequency']}'" - sql_ver = f" and model='{version}'" - if mode == 'full': - sql = sql_base + sql_frq + sql_ver - elif mode == 'no_frq': - sql = sql_base + sql_ver - elif mode == 'no_ver': - sql = sql_base + sql_frq - # execute query and process results - result = query(conn, sql, first=False) - db_log.debug(f"match_var: {result}, sql: {sql[110:]}") - if result is not None and result != []: - for x in result: - db_log.debug(f"match: {x}") - records = add_var(records, row, x, db_log) - found_match = True - - return records, found_match - - -def parse_vars(conn, rows, version, db_log): - """Returns records of variables to include in template mapping file, - a list of all stash variables + frequency available in model output - and a list of variables already defined in db - - Parameters - ---------- - conn : connection object - rows : list(dict) - list of variables to match - version : str - model version to use to match variables - db_log: logger obj - - Returns - ------- - stash_vars : list - varname-frequency for each listed variable, varname is from model output - """ - full = [] - no_ver = [] - no_frq = [] - stdn = [] - no_match = [] - stash_vars = [] - - # looping through variables from file and attempt matches to db - for row in rows: - if row['name'][0] == "#" or row['name'] == 'name': - continue - else: - full, found = match_var(row, version, 'full', conn, full, db_log) - # if no match, ignore model version first and then frequency - db_log.debug(f"found perfect match: {found}") - if not found: - no_ver, found = match_var(row, version, 'no_ver', conn, no_ver, db_log) - 
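The matching cascade in parse_vars tries progressively looser constraints before giving up; a condensed, self-contained sketch with stand-in matchers in place of the SQL-backed match_var/match_stdname (field values here are made up):

def cascade(row, stages):
    # return the first stage that produces a match, else 'no_match'
    for name, matcher in stages:
        hit = matcher(row)
        if hit:
            return name, hit
    return 'no_match', row

stages = [
    ('full',   lambda r: r if r['frequency'] == 'mon' and r['model'] == 'CM2' else None),
    ('no_ver', lambda r: r if r['frequency'] == 'mon' else None),
    ('no_frq', lambda r: r if r['model'] == 'CM2' else None),
    ('stdn',   lambda r: r if r['standard_name'] else None),
]
row = {'frequency': 'day', 'model': 'CM2', 'standard_name': 'air_temperature'}
print(cascade(row, stages))   # -> ('no_frq', {...})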
db_log.debug(f"found no ver match: {found}") - if not found: - no_frq, found = match_var(row, version, 'no_frq', conn, no_frq, db_log) - db_log.debug(f"found no frq match: {found}") - # make a last attempt to match using standard_name - if not found: - if row['standard_name'] != '': - stdn, found = match_stdname(conn, row, stdn, db_log) - db_log.debug(f"found stdnm match: {found}") - if not found: - no_match = add_var(no_match, row, tuple([row['name']]+['']*8), - db_log) - stash_vars.append(f"{row['name']}-{row['frequency']}") - - return full, no_ver, no_frq, stdn, no_match, stash_vars - - -def add_var(vlist, row, match, db_log, stdnm=False): - """Add information from match to variable list and re-order - fields so they correspond to final mapping output. - - Parameters - match : tuple - match values (cmor_var,input_vars,calculation,frequency, - realm,model(version),cmor_table,positive,units) - """ - # assign cmor_var from match and swap place with input_vars - db_log.debug(f"Assign cmor_var: {match}") - db_log.debug(f"initial row: {row}") - var = row.copy() - var['cmor_var'] = match[0] - var['input_vars'] = match[1] - orig_name = var.pop('name') - # assign realm from match - var['realm'] = match[4] - # with stdn assign cmorvar and table if only 1 match returned - # otherwise assign table from match - if stdnm: - var['input_vars'] = orig_name - if len(var['cmor_var']) == 1: - cmor_var, table = var['cmor_var'][0].split("-") - var['cmor_var'] = cmor_var - var['cmor_table'] = table - else: - var['cmor_table'] = match[6] - # add calculation, positive and version - var['calculation'] = match[2] - var['positive'] = match[7] - var['version'] = match[5] - # maybe we should override units here rather than in check_realm_units - # if units missing get them from match - if var['units'] is None or var['units'] == '': - var['units'] = match[8] - vlist.append(var) - return vlist - - -def remove_duplicate(vlist, db_log, extra=[], strict=True): +def remove_duplicate(vlist, extra=[], strict=True): """Returns list without duplicate variable definitions. Define unique definition for variable as tuple (cmor_var, input_vars, @@ -805,159 +315,98 @@ def remove_duplicate(vlist, db_log, extra=[], strict=True): If extra is defined if a variable exists in this additional set it is a duplicate """ - db_log.debug(f'in duplicate, vlist {vlist}') + mopdb_log = logging.getLogger('mopdb_log') + mopdb_log.debug(f'in duplicate, vlist {vlist}') vid_list = [] keys = ['cmor_var', 'input_vars', 'calculation'] if strict is True: keys += ['frequency', 'realm'] if extra: - vid_list = [tuple(x[k] for k in keys) for x in extra] - db_log.debug(f"vid_list: {vid_list}") + vid_list = [tuple(getattr(x,k) for k in keys) for x in extra] + mopdb_log.debug(f"vid_list: {vid_list}") final = [] for v in vlist: - vid = tuple(v[k] for k in keys) - db_log.debug(f"var and vid: {v['cmor_var']}, {vid}") + vid = tuple(getattr(v,k) for k in keys) + mopdb_log.debug(f"var and vid: {v.cmor_var}, {vid}") if vid not in vid_list: final.append(v) vid_list.append(vid) return final - -def potential_vars(conn, rows, stash_vars, version, db_log): - """Returns list of variables that can be potentially derived from - model output. 
- - Loop across all model variables to match - Select any mapping that contains the variable and if there's a calculation - NB rows modified by add_row when assigning cmorname and positive values - - Parameters - ---------- - conn : connection object - rows : list(dict) - list of variables to match - stash_vars : list - varname-frequency for each listed variable, varname is from model output - version : str - model version to use to match variables - db_log: logger obj - - Returns - ------- - """ - pot_full = [] - pot_part = [] - pot_varnames = set() - for row in rows: - sql = f"""SELECT cmor_var,input_vars,calculation,frequency, - realm,model,cmor_table,positive,units FROM mapping - WHERE input_vars like '%{row['name']}%'""" - results = query(conn, sql, first=False) - db_log.debug(f"In potential: var {row['name']}, db results {results}") - for r in results: - allinput = r[1].split(" ") - db_log.debug(f"{len(allinput)> 1}") - db_log.debug(all(f"{x}-{row['frequency']}" in stash_vars for x in allinput)) - if len(allinput) > 1 and all(f"{x}-{row['frequency']}" in stash_vars for x in allinput): - # if both version and frequency of applied mapping match - # consider this a full matching potential var - if r[5] == version and r[3] == row['frequency']: - pot_full = add_var(pot_full, row, r, db_log) - else: - pot_part = add_var(pot_part, row, r, db_log) - pot_varnames.add(r[0]) - return pot_full, pot_part, pot_varnames - - -def write_map_template(conn, full, no_ver, no_frq, stdn, - no_match, pot_full, pot_part, alias, db_log): - """Write mapping csv file template based on list of variables to define - - Input varlist file order: - name, cmor_var, units, dimensions, frequency, realm, cell_methods, - cmor_table, vtype, size, nsteps, filename, long_name, standard_name - Mapping db order: - cmor_var, input_vars, calculation, units, dimensions, frequency, realm, - cell_methods, positive, cmor_table, model, notes, origin - for pot vars + vtype, size, nsteps, filename - Final template order: - cmor_var, input_vars, calculation, units, dimensions, frequency, realm, - cell_methods, positive, cmor_table, version, vtype, size, nsteps, filename, - long_name, standard_name - """ - keys = ['cmor_var', 'input_vars', 'calculation', 'units', - 'dimensions', 'frequency', 'realm', 'cell_methods', - 'positive', 'cmor_table', 'version', 'vtype', 'size', - 'nsteps', 'filename', 'long_name', 'standard_name'] - - with open(f"map_{alias}.csv", 'w') as fcsv: - fwriter = csv.DictWriter(fcsv, keys, delimiter=';') - write_vars(full, fwriter, keys, db_log, conn=conn) - div = ("# Derived variables with matching version and " + - "frequency: Use with caution!") - write_vars(pot_full, fwriter, div, db_log, conn=conn) - #pot=True, conn=conn, sortby=0) - div = ("# Variables definitions coming from different " + - "version") - write_vars(no_ver, fwriter, div, db_log, conn=conn) - div = ("# Variables with different frequency: Use with" - + " caution!") - write_vars(no_ver, fwriter, div, db_log, conn=conn) - div = ("# Variables matched using standard_name: Use " + - "with caution!") - write_vars(stdn, fwriter, div, db_log, sortby='input_vars') - div = "# Derived variables: Use with caution!" 
-        write_vars(pot_part, fwriter, div, db_log, conn=conn)
-            #pot=True, conn=conn, sortby=0)
-        div = "# Variables without mapping"
-        write_vars(no_match, fwriter, div, db_log)
-        db_log.debug("Finished writing variables to mapping template")
-        fcsv.close()
-
-    return
-
-
-def write_vars(vlist, fwriter, div, db_log, conn=None, sortby='cmor_var'):
-    """
-    """
-    if len(vlist) > 0:
-        if type(div) is str:
-            divrow = {x:'' for x in vlist[0].keys()}
-            divrow['cmor_var'] = div
-        elif type(div) is list:
-            divrow = {x:x for x in div}
-        fwriter.writerow(divrow)
-        for var in sorted(vlist, key=itemgetter(sortby)):
-            if conn:
-                var = check_realm_units(conn, var, db_log)
-            fwriter.writerow(var)
-    return
-
-
-def check_realm_units(conn, var, db_log):
+def check_realm_units(conn, var):
    """Checks that realm and units are consistent with values in
    cmor table.
    """
-    vname = f"{var['cmor_var']}-{var['cmor_table']}"
-    if var['cmor_table'] is None or var['cmor_table'] == "":
-        db_log.warning(f"Variable: {vname} has no associated cmor_table")
+
+    mopdb_log = logging.getLogger('mopdb_log')
+    vname = f"{var.cmor_var}-{var.cmor_table}"
+    if var.cmor_table is None or var.cmor_table == "":
+        mopdb_log.warning(f"Variable: {vname} has no associated cmor_table")
    else:
        # retrieve modeling_realm, units from db cmor table
        sql = f"""SELECT modeling_realm, units FROM cmorvar
            WHERE name='{vname}' """
-        result = query(conn, sql)
-        db_log.debug(f"In check_realm_units: {vname}, {result}")
+        result = query(conn, sql, logname='mopdb_log')
+        mopdb_log.debug(f"In check_realm_units: {vname}, {result}")
        if result is not None:
            dbrealm = result[0]
            dbunits = result[1]
            # dbrealm could have two realms
-            if var['realm'] not in [dbrealm] + dbrealm.split():
-                db_log.info(f"Changing {vname} realm from {var['realm']} to {dbrealm}")
-                var['realm'] = dbrealm
-            if var['units'] != dbunits :
-                db_log.info(f"Changing {vname} units from {var['units']} to {dbunits}")
-                var['units'] = dbunits
+            if var.realm not in [dbrealm] + dbrealm.split():
+                mopdb_log.info(f"Changing {vname} realm from {var.realm} to {dbrealm}")
+                var.realm = dbrealm
+            if var.units != dbunits:
+                mopdb_log.info(f"Changing {vname} units from {var.units} to {dbunits}")
+                var.units = dbunits
        else:
-            db_log.warning(f"Variable {vname} not found in cmor table")
+            mopdb_log.warning(f"Variable {vname} not found in cmor table")
    return var

+def get_realm(version, ds):
+    '''Try to retrieve realm if using path failed'''
+    realm = 'NArealm'
+    mopdb_log = logging.getLogger('mopdb_log')
+    if version == 'AUS2200':
+        realm = 'atmos'
+    elif 'um_version' in ds.attrs.keys():
+        realm = 'atmos'
+    mopdb_log.debug(f"Realm is {realm}")
+    return realm
+
+def check_varlist(rows, fname):
+    """Checks that the varlist written to file has sensible frequency and
+    realm values, so that an incorrect mapping is not produced.
+
+    At the moment we're checking only frequency and realm as they can be
+    missed or wrong depending on the file structure.
+
+    Parameters
+    ----------
+    rows : list(dict)
+        list of variables to match
+    """
+
+    mopdb_log = logging.getLogger('mopdb_log')
+    frq_list = ['min', 'hr', 'day', 'mon', 'yr']
+    realm_list = ['ice', 'ocean', 'atmos', 'land']
+    for row in rows:
+        if row['name'][0] == "#" or row['name'] == 'name':
+            continue
+        elif (not any( x in row['frequency'] for x in frq_list)
+            or row['realm'] not in realm_list):
+            mopdb_log.error(f""" Check frequency and realm in {fname}.
+        Some values might be invalid and need fixing""")
+            sys.exit()
+    return
+
+def get_date_pattern(fname, fpattern):
+    """Try to build a date range for each file pattern based
+    on its filename
+    """
+    #mopdb_log = logging.getLogger('mopdb_log')
+    # assign False to any character which is not a digit
+    date_pattern = [True if c.isdigit() else False for c in fname]
+    # assign False to fpattern
+    n = len(fpattern)
+    date_pattern[:n] = [False] * n
+    return date_pattern
diff --git a/src/mopdb/utils.py b/src/mopdb/utils.py
new file mode 100644
index 0000000..c71dc71
--- /dev/null
+++ b/src/mopdb/utils.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+# Copyright 2024 ARC Centre of Excellence for Climate Extremes (CLEX)
+# Author: Paola Petrelli for CLEX
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# contact: paola.petrelli@utas.edu.au
+#
+# last updated 12/07/2024
+#

+import sqlite3
+import logging
+import os
+import stat
+import yaml
+
+from datetime import date
+
+
+def config_log(debug, logname):
+    """Configures log file"""
+    # start a logger
+    logger = logging.getLogger(logname)
+    # set a formatter to manage the output format of our handler
+    formatter = logging.Formatter('%(asctime)s; %(message)s',"%Y-%m-%d %H:%M:%S")
+    # set the level for the logger, has to be logging.LEVEL not a string
+    level = logging.INFO
+    flevel = logging.WARNING
+    if debug:
+        level = logging.DEBUG
+        flevel = logging.DEBUG
+    logger.setLevel(level)
+
+    # add a handler to send WARNING level messages to console
+    # or DEBUG level if debug is on
+    clog = logging.StreamHandler()
+    clog.setLevel(level)
+    logger.addHandler(clog)
+
+    # add a handler to send INFO level messages to file
+    # the messages will be appended to the same file
+    # create a new log file every day
+    day = date.today().strftime("%Y%m%d")
+    logname = f"{logname}_{day}.txt"
+    flog = logging.FileHandler(logname)
+    try:
+        os.chmod(logname, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
+    except OSError:
+        pass
+    flog.setLevel(flevel)
+    flog.setFormatter(formatter)
+    logger.addHandler(flog)
+    # return the logger object
+    return logger
+
+def db_connect(db, logname='__name__'):
+    """Connects to ACCESS mapping sqlite database"""
+    log = logging.getLogger(logname)
+    conn = sqlite3.connect(db, timeout=10, isolation_level=None)
+    if conn.total_changes == 0:
+        log.info(f"Opened database {db} successfully")
+    return conn
+
+def create_table(conn, sql, logname='__name__'):
+    """Creates table if database is empty
+
+    Parameters
+    ----------
+    conn : connection object
+    sql : str
+        SQL style string defining table to create
+    """
+    log = logging.getLogger(logname)
+    try:
+        c = conn.cursor()
+        c.execute(sql)
+    except Exception as e:
+        log.error(e)
+    return
+
+def query(conn, sql, tup=(), first=True, logname='__name__'):
+    """Executes generic sql query and returns row/s
+
+    Parameters
+    ----------
+    conn : connection object
+        Connection to sqlite database
+    sql : str
+        sql string representing query
+    tup : tuple
+        By default empty, used to pass values when placeholder ?
is used
+        in sql string
+    first : boolean
+        By default True will return only first record found, set to False
+        to return all matching records
+
+    Returns
+    -------
+    result : tuple/list(tuple)
+        a tuple, or a list of tuples, representing row/s returned by the query
+    """
+    #log = logging.getLogger(logname)
+    with conn:
+        c = conn.cursor()
+        c.execute(sql, tup)
+        if first:
+            result = c.fetchone()
+        else:
+            result = [ x for x in c.fetchall() ]
+        #columns = [description[0] for description in c.description]
+    return result
+
+
+def get_columns(conn, table, logname='__name__'):
+    """Gets list of columns from db table
+    """
+    #log = logging.getLogger(logname)
+    sql = f'PRAGMA table_info({table});'
+    table_data = query(conn, sql, first=False, logname=logname)
+    columns = [x[1] for x in table_data]
+    return columns
+
+
+def delete_record(conn, table, col, pairs, logname='__name__'):
+    """Deletes record from table based on pairs of column and
+    value passed for selection
+
+    Parameters
+    ----------
+    conn : connection object
+        connection to db
+    table: str
+        db table name
+    col: str
+        name of column to return with query
+    pairs : list[tuple(str, str)]
+        pairs of columns, values to select record/s
+    """
+    log = logging.getLogger(logname)
+    # Set up query
+    sqlwhere = f"FROM {table} WHERE "
+    for c,v in pairs:
+        sqlwhere += f"{c}='{v}' AND "
+    sql = f"SELECT {col} " + sqlwhere[:-4]
+    log.debug(f"Delete query: {sql}")
+    xl = query(conn, sql, first=False, logname=logname)
+    # Delete from db
+    if xl is not None:
+        log.info(f"Found {len(xl)} records")
+        for x in xl:
+            log.info(f"{x}")
+        confirm = input('Confirm deletion from database: Y/N ')
+        if confirm == 'Y':
+            log.info('Updating db ...')
+            with conn:
+                c = conn.cursor()
+                sql = "DELETE " + sqlwhere[:-4]
+                log.debug(f"Delete sql: {sql}")
+                c.execute(sql)
+                c.execute('select total_changes()')
+                log.info(f"Rows modified: {c.fetchall()[0][0]}")
+    else:
+        log.info("The query did not return any records")
+    return
+
+def read_yaml(fname, logname='__name__'):
+    """Read yaml file
+    """
+    log = logging.getLogger(logname)
+    try:
+        with fname.open(mode='r') as yfile:
+            data = yaml.safe_load(yfile)
+    except Exception as e:
+        log.error(f"Check that {fname} exists and is a valid yaml file")
+        log.error(f"Exception: {e}")
+    return data
+
+def write_yaml(data, fname, logname='__name__'):
+    """Write data to a yaml file
+
+    Parameters
+    ----------
+    data : dict
+        The file content as a dictionary
+    fname : str
+        Yaml filename
+
+    Returns
+    -------
+    """
+    log = logging.getLogger(logname)
+    try:
+        with open(fname, 'w') as f:
+            yaml.dump(data, f)
+    except Exception as e:
+        log.error(f"Exception: {e}")
+        log.error(f"Check that {data} is a valid yaml object")
+    return
diff --git a/src/mopper/__init__.py b/src/mopper/__init__.py
index 2c52bb9..e69de29 100644
--- a/src/mopper/__init__.py
+++ b/src/mopper/__init__.py
@@ -1 +0,0 @@
-from mopper import *
diff --git a/src/mopper/calculations.py b/src/mopper/calculations.py
index b8af723..ba68dc4 100644
--- a/src/mopper/calculations.py
+++ b/src/mopper/calculations.py
@@ -33,13 +33,13 @@
 import click
 import xarray as xr
 import os
-import yaml
 import json
 import numpy as np
 import dask
+import logging

-from importlib_resources import files as import_files
-from mopper.setup_utils import read_yaml
+from importlib.resources import files as import_files
+from mopdb.utils import read_yaml

 # Global Variables
 #----------------------------------------------------------------------
@@ -152,7 +152,7 @@ class IceTransportCalculations():
    @click.pass_context
    def __init__(self, ctx):
-        fname = import_files('data').joinpath('transport_lines.yaml')
+        fname = import_files('mopdata').joinpath('transport_lines.yaml')
        self.yaml_data = read_yaml(fname)['lines']

        self.gridfile = xr.open_dataset(f"{ctx.obj['ancils_path']}/"+
@@ -232,7 +232,7 @@ def transAcrossLine(self, var, i_start, i_end, j_start, j_end):
            #sum each axis apart from time (3d)
            #trans = var.isel(yu_ocean=slice(271, 271+1), xt_ocean=slice(292, 300+1))
            trans = var[..., j_start:j_end+1, i_start:i_end+1].sum(dim=['st_ocean', f'{y_ocean}', f'{x_ocean}']) #4D
-        except:
+        except Exception as e:
            trans = var[..., j_start:j_end+1, i_start:i_end+1].sum(dim=[f'{y_ocean}', f'{x_ocean}']) #3D
        return trans
@@ -567,7 +567,7 @@ class SeaIceCalculations():

    @click.pass_context
    def __init__(self, ctx):
-        fname = import_files('data').joinpath('transport_lines.yaml')
+        fname = import_files('mopdata').joinpath('transport_lines.yaml')
        self.yaml_data = read_yaml(fname)['lines']

        self.gridfile = xr.open_dataset(f"{ctx.obj['ancil_path']}/" +
@@ -664,25 +664,6 @@ def calc_hemi_seaice_extent(self, hemi):
    return vout.item()

-
-def ocean_floor(var):
-    """Not sure..
-
-    Parameters
-    ----------
-    var : Xarray dataset
-        pot_temp variable
-
-    Returns
-    -------
-    vout : Xarray dataset
-        ocean floor temperature?
-    """
-    lv = (~var.isnull()).sum(dim='st_ocean') - 1
-    vout = var.take(lv, dim='st_ocean').squeeze()
-    return vout
-
-
 def maskSeaIce(var, sic):
    """Mask seaice.

@@ -701,7 +682,6 @@ def maskSeaIce(var, sic):
    vout = var.where(sic != 0)
    return vout

-
 def sithick(hi, aice):
    """Calculate seaice thickness.

@@ -721,7 +701,6 @@ def sithick(hi, aice):
    vout = hi / aice
    return vout

-
 def sisnconc(sisnthick):
    """Calculate sea ice?

@@ -806,7 +785,7 @@ def calc_global_ave_ocean(var, rho_dzt, area_t):
    try:
        vnew = var.weighted(mass).mean(dim=('st_ocean', 'yt_ocean', 'xt_ocean'), skipna=True)
-    except:
+    except Exception as e:
        vnew = var.weighted(mass[:, 0, :, :]).mean(dim=('x', 'y'), skipna=True)

    return vnew
@@ -834,9 +813,11 @@ def get_plev(ctx, levnum):

 @click.pass_context
 def plevinterp(ctx, var, pmod, levnum):
    """Interpolating var from model levels to pressure levels
-
+
+    Based on function from Dale Roberts (currently ANU)
+
    Parameters
    ----------
    var : Xarray DataArray
@@ -852,9 +833,10 @@ def plevinterp(ctx, var, pmod, levnum):
    interp : Xarray DataArray
        The variable interpolated on pressure levels
    """
+
+    var_log = logging.getLogger(ctx.obj['var_log'])
    # avoid dask warning
    dask.config.set(**{'array.slicing.split_large_chunks': True})
-    var_log = ctx.obj['var_log']
    plev = get_plev(levnum)
    lev = var.dims[1]
    # if pmod is pressure on rho_level_0 and variable is on rho_level
@@ -928,7 +910,7 @@ def K_degC(ctx, var):
    vout : Xarray DataArray
        temperature array in degrees Celsius
    """
-    var_log = ctx.obj['var_log']
+    var_log = logging.getLogger(ctx.obj['var_log'])
    if 'K' in var.units:
        var_log.info("temp in K, converting to degC")
        vout = var - 273.15
@@ -937,7 +919,7 @@ def K_degC(ctx, var):

 def tos_3hr(var, landfrac):
-    """notes
+    """not sure this is needed??
Parameters ---------- @@ -948,7 +930,7 @@ def tos_3hr(var, landfrac): vout : Xarray dataset """ - v = tos_degC(var) + var = K_degC(var) vout = xr.zeros_like(var) t = len(var.time) @@ -1002,7 +984,7 @@ def extract_tilefrac(ctx, tilefrac, tilenum, landfrac=None, lev=None): vout = vout * landfrac if lev: - fname = import_files('data').joinpath('landtype.yaml') + fname = import_files('mopdata').joinpath('landtype.yaml') data = read_yaml(fname) type_dict = data['mod_mapping'] vout = vout.expand_dims(dim={lev: type_dict[lev]}) @@ -1145,14 +1127,15 @@ def average_tile(var, tilefrac=None, lfrac=1, landfrac=None, lev=None): vout = vout * landfrac if lev: - fname = import_files('data').joinpath('landtype.yaml') + fname = import_files('mopdata').joinpath('landtype.yaml') data = read_yaml(fname) type_dict = data['mod_mapping'] vout = vout.expand_dims(dim={lev: type_dict[lev]}) return vout -def calc_topsoil(soilvar): +@click.pass_context +def calc_topsoil(ctx, soilvar): """Returns the variable over the first 10cm of soil. Parameters @@ -1165,15 +1148,17 @@ def calc_topsoil(soilvar): Returns ------- topsoil : Xarray DataArray - Variable define don top 10cm of soil + Variable defined on top 10cm of soil """ + var_log = logging.getLogger(ctx.obj['var_log']) depth = soilvar.depth # find index of bottom depth level including the first 10cm of soil - maxlev = depth.where(depth >= 0.1).argmin().values + maxlev = np.nanargmin(depth.where(depth >= 0.1).values) + var_log.debug(f"Max level of soil used is {maxlev}") # calculate the fraction of maxlev which falls in first 10cm fraction = (0.1 - depth[maxlev -1])/(depth[maxlev] - depth[maxlev-1]) topsoil = soilvar.isel(depth=slice(0,maxlev)).sum(dim='depth') - topsoil = topsoil + fraction * topsoil.isel(depth=maxlev) + topsoil = topsoil + fraction * soilvar.isel(depth=maxlev) return topsoil #---------------------------------------------------------------------- @@ -1199,7 +1184,7 @@ def level_to_height(ctx, var, levs=None): vout : Xarray DataArray Same variable defined on model levels height """ - var_log = ctx.obj['var_log'] + var_log = logging.getLogger(ctx.obj['var_log']) if levs is not None and type(levs) not in [tuple, list]: var_log.error(f"level_to_height function: levs {levs} should be a tuple or list") zdim = var.dims[1] @@ -1265,7 +1250,7 @@ def calc_global_ave_ocean(ctx, var, rho_dzt): mass = rho_dzt * area_t try: vnew=np.average(var,axis=(1,2,3),weights=mass) - except: + except Exception as e: vnew=np.average(var,axis=(1,2),weights=mass[:,0,:,:]) return vnew @@ -1293,7 +1278,7 @@ def calc_overt(ctx, varlist, sv=False): overt: DataArray overturning mass streamfunction (time, basin, depth, gridlat) variable """ - var_log = ctx.obj['var_log'] + var_log = logging.getLogger(ctx.obj['var_log']) var1 = varlist[0] vlat, vlon = var1.dims[2:] mask = get_basin_mask(vlat, vlon) @@ -1344,6 +1329,7 @@ def get_basin_mask(ctx, lat, lon): basin_mask: DataArray basin_mask(lat,lon) """ + var_log = logging.getLogger(ctx.obj['var_log']) coords = ['t', 't'] if 'xu' in lon: coords[0] = 'u' @@ -1381,7 +1367,7 @@ def overturn_stream(ctx, varlist, sv=False): stream: DataArray The ocean overturning mass streamfunction in kg s-1 """ - var_log = ctx.obj['var_log'] + var_log = logging.getLogger(ctx.obj['var_log']) londim = varlist[0].dims[3] depdim = varlist[0].dims[1] var_log.debug(f"Streamfunct lon, dep dims: {londim}, {depdim}") @@ -1434,13 +1420,13 @@ def calc_depositions(ctx, var, weight=None): (personal communication from M. 
Woodhouse) """ - var_log = ctx.obj['var_log'] + #var_log = logging.getLogger(ctx.obj['var_log']) varlist = [] for v in var: v0 = v.sel(model_theta_level_number=1).squeeze(dim='model_theta_level_number') varlist.append(v0) if weight is None: weight = 0.05844 - deps = sum_vars(varlist) * mole_weight + deps = sum_vars(varlist) * weight return deps diff --git a/src/mopper/cmip_utils.py b/src/mopper/cmip_utils.py index dd7674c..161e55f 100755 --- a/src/mopper/cmip_utils.py +++ b/src/mopper/cmip_utils.py @@ -25,7 +25,6 @@ import json import csv import ast -import copy import click from collections import OrderedDict @@ -39,7 +38,7 @@ def find_cmip_tables(dreq): with dreq.open(mode='r') as f: reader = csv.reader(f, delimiter='\t') for row in reader: - if not row[0] in tables: + if row[0] not in tables: if (row[0] != 'Notes') and (row[0] != 'MIP table') and (row[0] != '0'): tables.append(f"CMIP6_{row[0]}") f.close() @@ -137,9 +136,9 @@ def read_dreq_vars(ctx, table_id, activity_id): years = ast.literal_eval(row[31]) years = reallocate_years(years, ctx.obj['reference_date']) years = f'"{years}"' - except: + except Exception as e: years = 'all' - except: + except Exception as e: years = 'all' dreq_variables[cmorname] = years f.close() diff --git a/src/mopper/mop_setup.py b/src/mopper/mop_setup.py index a639075..02132bb 100755 --- a/src/mopper/mop_setup.py +++ b/src/mopper/mop_setup.py @@ -24,18 +24,21 @@ import os import sys import shutil -import yaml import json import csv import click +import logging from pathlib import Path from json.decoder import JSONDecodeError from importlib.resources import files as import_files -from mopper.setup_utils import * +from mopper.setup_utils import (define_timeshot, adjust_nsteps, + find_custom_tables, write_var_map, write_table) +from mopper.cmip_utils import find_cmip_tables, read_dreq_vars +from mopdb.utils import read_yaml -def find_matches(table, var, realm, frequency, varlist, mop_log): +def find_matches(table, var, realm, frequency, varlist): """Finds variable matching constraints given by table and config settings and returns a dictionary with the variable specifications. 
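A usage sketch for find_cmip_tables() above; the data request file is tab-separated despite its .csv suffix, and rows labelled 'Notes', 'MIP table' or '0' are skipped. The path here is illustrative:

from pathlib import Path
from mopper.cmip_utils import find_cmip_tables

dreq = Path("cmvme_all_piControl_3_3.csv")   # a CMIP6 data request export
tables = find_cmip_tables(dreq)              # e.g. ['CMIP6_Amon', 'CMIP6_Omon', ...]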
@@ -59,14 +62,13 @@ def find_matches(table, var, realm, frequency, varlist, mop_log):
    varlist : list
        List of variables, each represented by a dictionary with mappings
        used to find a match to "var" passed
-    mop_log : logging object
-        Log

    Returns
    -------
    match : dict
        Dictionary containing matched variable specifications
        or None if no match is found
    """
+    mop_log = logging.getLogger('mop_log')
    near_matches = []
    found = False
    match = None
@@ -83,7 +85,7 @@ def find_matches(table, var, realm, frequency, varlist, mop_log):
                and v['realm'] in realm.split()):
            near_matches.append(v)
    if found is False and frequency != 'fx':
-        v = find_nearest(near_matches, frequency, mop_log)
+        v = find_nearest(near_matches, frequency)
        if v is not None:
            match = v
            found = True
@@ -98,11 +100,11 @@ def find_matches(table, var, realm, frequency, varlist, mop_log):
        match['timeshot'] = timeshot
        match['table'] = table
        match['frequency'] = frequency
-        if match['realm'] == 'land':
-            realmdir = 'atmos'
-        else:
-            realmdir = match['realm']
-        in_fname = match['filename'].split()
+        #if match['realm'] == 'land':
+        #    realmdir = 'atmos'
+        #else:
+        #    realmdir = match['realm']
+        in_fname = match['fpattern'].split()
        match['file_structure'] = ''
        for f in in_fname:
            #match['file_structure'] += f"/{realmdir}/{f}* "
@@ -110,7 +112,7 @@ def find_matches(table, var, realm, frequency, varlist, mop_log):
    return match


-def find_nearest(varlist, frequency, mop_log):
+def find_nearest(varlist, frequency):
    """If variable is present in file at different frequencies,
    finds the one with higher frequency nearest to desired frequency.
    Adds frequency to variable resample field.
@@ -124,8 +126,6 @@ def find_nearest(varlist, frequency, mop_log):
        frequency
    frequency : str
        Variable frequency to match
-    mop_log : logging object
-        Log

    Returns
    -------
    var : dict
        Dictionary containing matched variable specifications
        or None if no match is found
    """
+    mop_log = logging.getLogger('mop_log')
    var = None
    found = False
    freq = frequency
@@ -178,7 +179,7 @@ def setup_env(ctx):
        attributes for experiment
    """
-    mop_log = ctx.obj['log']
+    mop_log = logging.getLogger('mop_log')
    cdict = ctx.obj
    cdict['appdir'] = Path(cdict['appdir'])
    appdir = cdict['appdir']
@@ -197,6 +198,14 @@ def setup_env(ctx):
    else:
        cdict['tables_path'] = appdir / cdict['tables_path']
    cdict['ancils_path'] = appdir / cdict['ancils_path']
+    # conda env to run job
+    if cdict['conda_env'] == 'default':
+        cdict['conda_env'] = ''
+    else:
+        path = Path(cdict['conda_env'])
+        if not path.is_absolute():
+            path = appdir / path
+        cdict['conda_env'] = f"source {str(path)}"
    # Output subdirectories
    outpath = cdict['outpath']
    cdict['maps'] = outpath / "maps"
@@ -231,7 +240,7 @@ def setup_env(ctx):
 def var_map(ctx, activity_id=None):
    """
    """
-    mop_log = ctx.obj['log']
+    mop_log = logging.getLogger('mop_log')
    tables = ctx.obj.get('tables', 'all')
    subset = ctx.obj.get('var_subset', False)
    sublist = ctx.obj.get('var_subset_list', None)
@@ -245,11 +254,11 @@ def var_map(ctx, activity_id=None):
    else:
        sublist = ctx.obj['appdir'] / sublist
    # Custom mode vars
-    if ctx.obj['mode'].lower() == 'custom':
-        access_version = ctx.obj['access_version']
+    #if ctx.obj['mode'].lower() == 'custom':
+    #    access_version = ctx.obj['access_version']
    if ctx.obj['force_dreq'] is True:
        if ctx.obj['dreq'] == 'default':
-            ctx.obj['dreq'] = import_files('data').joinpath(
+            ctx.obj['dreq'] = import_files('mopdata').joinpath(
                'data/dreq/cmvme_all_piControl_3_3.csv' )
    with ctx.obj['master_map'].open(mode='r')
as f:
        reader = csv.DictReader(f, delimiter=';')
@@ -264,7 +273,7 @@ def var_map(ctx, activity_id=None):
            create_var_map(table, masters, selection=selection[table])
    elif tables.lower() == 'all':
        mop_log.info(f"Experiment {ctx.obj['exp']}: processing all tables")
-        if ctx.obj['force_dreq'] == True:
+        if ctx.obj['force_dreq']:
            tables = find_cmip_tables(ctx.obj['dreq'])
        else:
            tables = find_custom_tables()
@@ -289,11 +298,11 @@ def create_var_map(ctx, table, mappings, activity_id=None,
    Returns
    -------
    """
-    mop_log = ctx.obj['log']
+    mop_log = logging.getLogger('mop_log')
    matches = []
    fpath = ctx.obj['tables_path'] / f"{table}.json"
    if not fpath.exists():
-        fpath = import_files('data').joinpath(
+        fpath = import_files('mopdata').joinpath(
            f"cmor_tables/{table}.json")
    table_id = table.split('_')[1]
    mop_log.debug(f"Mappings: {mappings}")
@@ -325,7 +334,7 @@ def create_var_map(ctx, table, mappings, activity_id=None,
            years = dreq_years[var]
        if 'subhr' in frq:
            frq = ctx.obj['subhr'] + frq.split('subhr')[1]
-        match = find_matches(table, var, realm, frq, mappings, mop_log)
+        match = find_matches(table, var, realm, frq, mappings)
        if match is not None:
            match['years'] = years
            matches.append(match)
@@ -367,7 +376,7 @@ def archive_workdir(ctx):
 def manage_env(ctx):
    """Prepare output directories and removes pre-existing ones
    """
-    mop_log = ctx.obj['log']
+    mop_log = logging.getLogger('mop_log')
    # check if output path already exists
    outpath = ctx.obj['outpath']
    if outpath.exists() and ctx.obj['update'] is False:
@@ -399,7 +408,7 @@ def manage_env(ctx):
               '_control_vocabulary_file']:
        fpath = ctx.obj['tables_path'] / ctx.obj[f]
        if not fpath.exists():
-            fpath = import_files('data').joinpath(
+            fpath = import_files('mopdata').joinpath(
                f"cmor_tables/{ctx.obj[f]}")
        if f == '_control_vocabulary_file':
            fname = "CMIP6_CV.json"
@@ -409,6 +418,6 @@ def manage_env(ctx):
        else:
            fname = ctx.obj[f]
        shutil.copyfile(fpath, ctx.obj['tpath'] / fname)
-    update_code = import_files('mopper').joinpath("update_db.py")
+    update_code = import_files('mopdata').joinpath("update_db.py.txt")
    shutil.copyfile(update_code, ctx.obj['outpath'] / "update_db.py")
    return
diff --git a/src/mopper/mop_utils.py b/src/mopper/mop_utils.py
index 0f0fd42..3577e25 100755
--- a/src/mopper/mop_utils.py
+++ b/src/mopper/mop_utils.py
@@ -22,25 +22,23 @@
# last updated 15/05/2024

 import numpy as np
-import glob
 import re
-import os,sys
+import os
+import sys
 import stat
 import yaml
 import xarray as xr
 import cmor
-import calendar
 import click
 import logging
 import cftime
-import itertools
 import copy
+import json

 from functools import partial
 from pathlib import Path
 from mopper.calculations import *
-from mopper.setup_utils import read_yaml
-from importlib_resources import files as import_files
+from mopdb.utils import read_yaml
+from importlib.resources import files as import_files


 def config_log(debug, path, stream_level=logging.WARNING):
@@ -71,7 +69,7 @@ def config_log(debug, path, stream_level=logging.WARNING):
    logname = f"{path}/mopper_log.txt"
    flog = logging.FileHandler(logname)
    try:
-        os.chmod(logname, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO);
+        os.chmod(logname, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
    except OSError:
        pass
    flog.setLevel(level)
@@ -81,9 +79,9 @@ def config_log(debug, path, stream_level=logging.WARNING):
    return logger


-def config_varlog(debug, logname):
+def config_varlog(debug, logname, pid):
    """Configure varlog file: use this for specific var information"""
-    logger = logging.getLogger('var_log')
+    logger = logging.getLogger(f'{pid}_log')
    formatter = logging.Formatter('%(asctime)s; %(message)s',"%Y-%m-%d %H:%M:%S")
    if debug is True:
        level = logging.DEBUG
@@ -93,12 +91,14 @@ def config_varlog(debug, logname):
    logger.setLevel(level)
    flog = logging.FileHandler(logname)
    try:
-        os.chmod(logname, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO);
+        os.chmod(logname, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
    except OSError:
        pass
    flog.setLevel(level)
    flog.setFormatter(formatter)
    logger.addHandler(flog)
+    # Stop propagation
+    logger.propagate = False
    return logger

@@ -110,7 +110,8 @@ def _preselect(ds, varlist):
        if bounds is None:
            bounds = ds[c].attrs.get('edges', None)
        if bounds is not None:
-            bnds.extend(bounds.split())
+            bnds.extend([b for b in bounds.split() if b in ds.variables])
+            # check all bnds are in file
    varsel.extend(bnds)
    # remove attributes for boundaries
    for v in bnds:
@@ -118,9 +119,8 @@ def _preselect(ds, varlist):
    return ds[varsel]

-
 @click.pass_context
-def get_files(ctx, var_log):
+def get_files(ctx):
    """Returns all files in time range
    First identifies all files with pattern/s defined for invars
    Then retrieve time dimension and if multiple time axes are present
    last timestep from each file
    """
    # Returns file list for each input var and list of vars for each file pattern
-    all_files, path_vars = find_all_files(var_log)
+    var_log = logging.getLogger(ctx.obj['var_log'])
+    all_files, path_vars = find_all_files()
-    # PP FUNCTION END
-    return all_files, extra_files
    var_log.debug(f"access files from: {os.path.basename(all_files[0][0])}" +
                  f"to {os.path.basename(all_files[0][-1])}")
    ds = xr.open_dataset(all_files[0][0], decode_times=False)
-    time_dim, units, multiple_times = get_time_dim(ds, var_log)
+    time_dim, units, multiple_times = get_time_dim(ds)
    del ds
    try:
        inrange_files = []
        for i,paths in enumerate(all_files):
            if multiple_times is True:
-                inrange_files.append( check_in_range(paths, time_dim, var_log) )
+                inrange_files.append( check_in_range(paths, time_dim) )
            else:
-                inrange_files.append( check_timestamp(paths, var_log) )
-    except:
+                inrange_files.append( check_timestamp(paths) )
+    except Exception as e:
        for i,paths in enumerate(all_files):
-            inrange_files.append( check_in_range(paths, time_dim, var_log) )
+            inrange_files.append( check_in_range(paths, time_dim) )

    for i,paths in enumerate(inrange_files):
        if paths == []:
-            mop_log.error(f"No data in requested time range for: {ctx.obj['filename']}")
            var_log.error(f"No data in requested time range for: {ctx.obj['filename']}")
    return inrange_files, path_vars, time_dim, units


 @click.pass_context
-def find_all_files(ctx, var_log):
+def find_all_files(ctx):
    """Find all the ACCESS file names which match the pattern/s associated with invars.
    Sort the filenames, assuming that the sorted filenames will
    be in chronological order because there is usually some sort of date
    and/or time information in the filename.
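What _preselect() keeps, condensed into a hedged sketch: the requested variables plus any bounds/edges variables their coordinates reference, with the new guard against bounds names that are not actually present in the file. The function name below is a stand-in:

import xarray as xr

def preselect_sketch(ds, varlist):
    # ds is an xr.Dataset; varlist the names of variables to process
    keep = list(varlist)
    for c in ds[varlist].coords:
        bounds = ds[c].attrs.get('bounds', ds[c].attrs.get('edges'))
        if bounds is not None:
            # keep only bounds variables that exist in the file
            keep.extend(b for b in bounds.split() if b in ds.variables)
    return ds[keep]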
    Check that all variables needed are in file, otherwise add extra file pattern
    """
+    var_log = logging.getLogger(ctx.obj['var_log'])
    var_log.debug(f"Input file structure: {ctx.obj['infile']}")
    patterns = ctx.obj['infile'].split()
    var_log.debug(f"Input file patterns: {patterns}")
@@ -186,7 +187,7 @@ def find_all_files(ctx):
    while len(missing) > 0 and i < len(patterns):
        path_vars[i] = []
        f = files[i][0]
-        missing, found = check_vars_in_file(missing, f, var_log)
+        missing, found = check_vars_in_file(missing, f)
        if len(found) > 0:
            for v in found:
                path_vars[i].append(v)
@@ -198,10 +199,11 @@ def find_all_files(ctx):


 @click.pass_context
-def check_vars_in_file(ctx, invars, fname, var_log):
+def check_vars_in_file(ctx, invars, fname):
    """Check that all variables needed for calculation are in file
    else return extra filenames
    """
+    #var_log = logging.getLogger(ctx.obj['var_log'])
    ds = xr.open_dataset(fname, decode_times=False)
    tofind = [v for v in invars if v not in ds.variables]
    found = [v for v in invars if v not in tofind]
@@ -209,10 +211,11 @@ def check_vars_in_file(ctx, invars, fname):


 @click.pass_context
-def get_time_dim(ctx, ds, var_log):
+def get_time_dim(ctx, ds):
    """Find time info: time axis, reference time and set tstart and tend
    also return multiple_times True if more than one time axis
    """
+    var_log = logging.getLogger(ctx.obj['var_log'])
    time_dim = None
    multiple_times = False
    varname = [ctx.obj['vin'][0]]
@@ -236,11 +239,12 @@ def get_time_dim(ctx, ds):


 @click.pass_context
-def check_timestamp(ctx, all_files, var_log):
+def check_timestamp(ctx, all_files):
    """This function tries to guess the time coverage of a file based on its timestamp
    and returns the files in range. At the moment it does a lot of checks based on the realm
    and real examples; eventually it would make sense to make sure all files generated are consistent in naming
    """
+    var_log = logging.getLogger(ctx.obj['var_log'])
    inrange_files = []
    realm = ctx.obj['realm']
    var_log.info("checking files timestamp ...")
@@ -305,11 +309,12 @@ def check_timestamp(ctx, all_files):


 @click.pass_context
-def check_in_range(ctx, all_files, tdim, var_log):
+def check_in_range(ctx, all_files, tdim):
    """Return a list of files in time range
    Open each file and check based on time axis
    Use this function only if check_timestamp fails
    """
+    var_log = logging.getLogger(ctx.obj['var_log'])
    inrange_files = []
    var_log.info("loading files...")
    var_log.debug(f"time dimension: {tdim}")
@@ -337,7 +342,7 @@ def check_in_range(ctx, all_files, tdim):


 @click.pass_context
-def load_data(ctx, inrange_files, path_vars, time_dim, var_log):
+def load_data(ctx, inrange_files, path_vars, time_dim):
    """Returns a dictionary of input var: xarray dataset
    """
    # preprocessing to select only variables we need to avoid
@@ -345,6 +350,7 @@ def load_data(ctx, inrange_files, path_vars, time_dim):
    # temporarily opening file without decoding times, fixing
    # faulty time bounds units and decoding times
    # this is to prevent issues with ocean files
+    var_log = logging.getLogger(ctx.obj['var_log'])
    input_ds = {}
    for i, paths in enumerate(inrange_files):
        preselect = partial(_preselect, varlist=path_vars[i])
@@ -361,9 +367,10 @@ def load_data(ctx, inrange_files, path_vars, time_dim):


 @click.pass_context
-def get_cmorname(ctx, axis_name, axis, var_log, z_len=None):
+def get_cmorname(ctx, axis_name, axis, z_len=None):
    """Get time cmor name based on timeshot option
    """
+    var_log = logging.getLogger(ctx.obj['var_log'])
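The recurring pattern in this refactor, shown in isolation: the click context stores a logger name rather than a logger object, and each function re-acquires the logger from logging's registry (logging.getLogger returns the same instance for the same name). Names below are illustrative:

import logging
import click

@click.pass_context
def some_step(ctx):
    # 'var_log' in ctx.obj holds a string such as '12345_log'
    var_log = logging.getLogger(ctx.obj['var_log'])
    var_log.debug("same logger instance wherever the name is resolved")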
var_log.debug(f'axis_name, axis.name: {axis_name}, {axis.name}') ctx.obj['axes_modifier'] = [] if axis_name == 't': @@ -418,10 +425,11 @@ def get_cmorname(ctx, axis_name, axis, var_log, z_len=None): #PP this should eventually just be generated directly by defining the dimension using the same terms # in related calculation @click.pass_context -def pseudo_axis(axis, var_log): +def pseudo_axis(ctx, axis): """coordinates with axis_identifier other than X,Y,Z,T PP not sure if axis can be used to remove axes_mod """ + var_log = logging.getLogger(ctx.obj['var_log']) cmor_name = None p_vals = None p_len = None @@ -447,12 +455,13 @@ def pseudo_axis(axis, var_log): cmor_name = 'vegtype' return cmor_name, p_vals, p_len - #PP this should eventually just be generated directly by defining the dimension using the same terms # in calculation for meridional overturning -def create_axis(axis, table, var_log): +@click.pass_context +def create_axis(ctx, axis, table): """ """ + var_log = logging.getLogger(ctx.obj['var_log']) # maybe we can just create these axis as they're meant in calculations var_log.info(f"creating {axis.name} axis...") #func_dict = {'oline': getTransportLines(), @@ -468,10 +477,13 @@ def create_axis(axis, table, var_log): var_log.info(f"setup of {axis.name} axis complete") return axis_id - -def hybrid_axis(lev, z_ax_id, z_ids, var_log): +@click.pass_context +def hybrid_axis(ctx, lev, z_ax_id, z_ids): """Setting up additional hybrid axis information + PP this needs fixing can't possible work now without b_vals, b_bnds?? + lev is cmor_zName? """ + #var_log = logging.getLogger(ctx.obj['var_log']) hybrid_dict = {'hybrid_height': 'b', 'hybrid_height_half': 'b_half'} orog_vals = getOrog() @@ -490,33 +502,30 @@ def hybrid_axis(lev, z_ax_id, z_ids, var_log): zfactor_values=orog_vals) return zfactor_b_id, zfactor_orog_id - @click.pass_context -def ij_axis(ctx, ax, ax_name, table, var_log): +def ij_axis(ctx, ax, ax_name, table): """ """ + #var_log = logging.getLogger(ctx.obj['var_log']) cmor.set_table(table) ax_id = cmor.axis(table_entry=ax_name, units='1', coord_vals=ax.values) return ax_id - @click.pass_context -def ll_axis(ctx, ax, ax_name, ds, table, bounds_list, var_log): +def ll_axis(ctx, ax, ax_name, ds, table, bounds_list): """ """ - var_log.debug(f"n ll_axis") + var_log = logging.getLogger(ctx.obj['var_log']) + var_log.debug("in ll_axis") cmor.set_table(table) - cmor_aName = get_cmorname(ax_name, ax, var_log) - try: - ax_units = ax.units - except: - ax_units = 'degrees' + cmor_aName = get_cmorname(ax_name, ax) + ax_units = ax.attrs.get('units', 'degrees') a_bnds = None var_log.debug(f"got cmor name: {cmor_aName}") if cmor_aName in bounds_list: - a_bnds = get_bounds(ds, ax, cmor_aName, var_log) + a_bnds = get_bounds(ds, ax, cmor_aName) a_vals = ax.values var_log.debug(f"a_bnds: {a_bnds.shape}") var_log.debug(f"a_vals: {a_vals.shape}") @@ -533,10 +542,10 @@ def ll_axis(ctx, ax, ax_name, ds, table, bounds_list, var_log): return ax_id @click.pass_context -def define_grid(ctx, j_id, i_id, lat, lat_bnds, lon, lon_bnds, - var_log): +def define_grid(ctx, j_id, i_id, lat, lat_bnds, lon, lon_bnds): """If we are on a non-cartesian grid, Define the spatial grid """ + var_log = logging.getLogger(ctx.obj['var_log']) grid_id=None var_log.info("setting up grid") #Set grid id and append to axis and z ids @@ -548,19 +557,23 @@ def define_grid(ctx, j_id, i_id, lat, lat_bnds, lon, lon_bnds, var_log.info("setup of lat,lon grid complete") return grid_id - @click.pass_context -def get_coords(ctx, ovar, coords, 
var_log): +def get_coords(ctx, ovar, coords): """Get lat/lon and their boundaries from ancil file """ + var_log = logging.getLogger(ctx.obj['var_log']) # open ancil grid file to read vertices #PP be careful this is currently hardcoded which is not ok! - ancil_file = ctx.obj[f"grid_{ctx.obj['realm']}"] + ancil_dir = ctx.obj.get('ancils_path', '') + ancil_file = ancil_dir + "/" + ctx.obj.get(f"grid_{ctx.obj['realm']}", '') + if ancil_file == '' or not Path(ancil_file).exists(): + var_log.error(f"Ancil file {ancil_file} not set or does not exist") + sys.exit() var_log.debug(f"getting lat/lon and bnds from ancil file: {ancil_file}") - ds = xr.open_dataset(f"{ctx.obj['ancils_path']}/{ancil_file}") + ds = xr.open_dataset(ancil_file) var_log.debug(f"ancil ds: {ds}") # read lat/lon and vertices mapping - cfile = import_files('data').joinpath('latlon_vertices.yaml') + cfile = import_files('mopdata').joinpath('latlon_vertices.yaml') with open(cfile, 'r') as yfile: data = yaml.safe_load(yfile) ll_dict = data[ctx.obj['realm']] @@ -583,17 +596,18 @@ def get_coords(ctx, ovar, coords, var_log): @click.pass_context -def get_axis_dim(ctx, var, var_log): +def get_axis_dim(ctx, var): """ """ + var_log = logging.getLogger(ctx.obj['var_log']) axes = {'t_ax': None, 'z_ax': None, 'glat_ax': None, 'lat_ax': None, 'lon_ax': None, 'j_ax': None, 'i_ax': None, 'p_ax': None, 'e_ax': None} for dim in var.dims: - try: + if dim in var.coords: axis = var[dim] var_log.debug(f"axis found: {axis}") - except: + else: var_log.warning(f"No coordinate variable associated with the dimension {dim}") axis = None # need to file to give a value then??? @@ -612,6 +626,9 @@ def get_axis_dim(ctx, var, var_log): axes['lat_ax'] = axis elif any(x in dim.lower() for x in ['nj', 'yu_ocean', 'yt_ocean']): axes['j_ax'] = axis + # have to add this because a simulation didn't have the dimension variables + elif any(x in dim.lower() for x in ['nj', 'yu_ocean', 'yt_ocean']): + axes['j_ax'] = axis elif axis_name and 'X' in axis_name: if 'glon' in dim.lower(): axes['glon_ax'] = axis @@ -619,6 +636,9 @@ def get_axis_dim(ctx, var, var_log): axes['lon_ax'] = axis elif any(x in dim.lower() for x in ['ni', 'xu_ocean', 'xt_ocean']): axes['i_ax'] = axis + # have to add this because a simulation didn't have the dimension variables + elif any(x in dim.lower() for x in ['ni', 'xu_ocean', 'xt_ocean']): + axes['i_ax'] = axis elif axis_name == 'Z' or any(x in dim for x in ['lev', 'heigth', 'depth']): axes['z_ax'] = axis #z_ax.attrs['axis'] = 'Z' @@ -631,8 +651,10 @@ def get_axis_dim(ctx, var, var_log): return axes -def check_time_bnds(bnds, frequency, var_log): +@click.pass_context +def check_time_bnds(ctx, bnds, frequency): """Checks if dimension boundaries from file are wrong""" + var_log = logging.getLogger(ctx.obj['var_log']) var_log.debug(f"Time bnds 1,0: {bnds[:,1], bnds[:,0]}") diff = bnds[:,1] - bnds[:,0] #approx_int = [np.timedelta64(x, 'D').astype(float) for x in diff] @@ -650,10 +672,11 @@ def check_time_bnds(bnds, frequency, var_log): @click.pass_context -def require_bounds(ctx, var_log): +def require_bounds(ctx): """Returns list of coordinates that require bounds.
Reads the requirement directly from .._coordinate.json file """ + var_log = logging.getLogger(ctx.obj['var_log']) fpath = f"{ctx.obj['tpath']}/{ctx.obj['_AXIS_ENTRY_FILE']}" with open(fpath, 'r') as jfile: data = json.load(jfile) @@ -665,10 +688,11 @@ @click.pass_context -def bnds_change(ctx, axis, var_log): +def bnds_change(ctx, axis): """Returns True if calculation/resample changes bnds of specified dimension. """ + #var_log = logging.getLogger(ctx.obj['var_log']) dim = axis.name calculation = ctx.obj['calculation'] changed_bnds = False @@ -681,18 +705,18 @@ changed_bnds = True return changed_bnds - @click.pass_context -def get_bounds(ctx, ds, axis, cmor_name, var_log, ax_val=None): +def get_bounds(ctx, ds, axis, cmor_name, ax_val=None): """Returns bounds for input dimension; if bounds are not available, uses edges or tries to calculate them. If the variable goes through a calculation, bounds are potentially different from the input file, which forces re-calculating them """ + var_log = logging.getLogger(ctx.obj['var_log']) var_log.debug(f'in getting bounds: {axis}') dim = axis.name var_log.info(f"Getting bounds for axis: {dim}") - changed_bnds = bnds_change(axis, var_log) + changed_bnds = bnds_change(axis) var_log.debug(f"Bounds has changed: {changed_bnds}") #The default bounds assume that the grid cells are centred on #each grid point specified by the coordinate variable. @@ -702,10 +726,10 @@ if 'subhr' in frq: frq = ctx.obj['subhr'] + frq.split('subhr')[1] if 'bounds' in keys and not changed_bnds: - dim_bnds_val = ds[axis.bounds].values + calc, dim_bnds_val = get_bounds_values(ds, axis.bounds) var_log.info(f"Using dimension bounds: {axis.bounds}") elif 'edges' in keys and not changed_bnds: - dim_bnds_val = ds[axis.edges].values + calc, dim_bnds_val = get_bounds_values(ds, axis.edges) var_log.info(f"Using dimension edges as bounds: {axis.edges}") else: var_log.info(f"No bounds for {dim}") @@ -716,7 +740,7 @@ dim_bnds_val = cftime.date2num(dim_bnds_val, units=ctx.obj['reference_date'], calendar=ctx.obj['attrs']['calendar']) - inrange = check_time_bnds(dim_bnds_val, frq, var_log) + inrange = check_time_bnds(dim_bnds_val, frq) if not inrange: calc = True var_log.info(f"Inherited bounds for {dim} are incorrect") @@ -731,11 +755,12 @@ max_val = np.roll(min_val, -1) max_val[-1] = 1.5*ax_val[-1] - 0.5*ax_val[-2] dim_bnds_val = np.column_stack((min_val, max_val)) + var_log.debug(f"{axis.name} bnds: {dim_bnds_val}") except Exception as e: var_log.warning(f"dodgy bounds for dimension: {dim}") var_log.error(f"error: {e}") if 'time' in cmor_name: - inrange = check_time_bnds(dim_bnds_val, frq, var_log) + inrange = check_time_bnds(dim_bnds_val, frq) if inrange is False: var_log.error(f"Boundaries for {cmor_name} are " + "wrong even after calculation") @@ -764,11 +789,34 @@ var_log.info(f"setting minimum {cmor_name} bound to 0") return dim_bnds_val +@click.pass_context +def get_bounds_values(ctx, ds, bname): + """Return values of axis bounds; if they're not in the file, + tries to get them from the ancillary grid file instead.
+ """ + calc = False + var_log = logging.getLogger(ctx.obj['var_log']) + var_log.debug(f"Getting bounds values for {bname}") + ancil_file = ctx.obj.get(f"grid_{ctx.obj['realm']}", '') + if bname in ds.variables: + var_log.debug(f"Bounds for {bname} in file") + bnds_val = ds[bname].values + elif ancil_file != "": + fname = f"{ctx.obj['ancils_path']}/{ancil_file}" + ancil = xr.open_dataset(fname) + if bname in ancil.variables: + bnds_val = ancil[bname].values + else: + var_log.info(f"Can't locate {bname} in data or ancil file") + bnds_val = None + calc = True + return calc, bnds_val @click.pass_context -def get_attrs(ctx, infiles, var1, var_log): +def get_attrs(ctx, infiles, var1): """ """ + var_log = logging.getLogger(ctx.obj['var_log']) # open only first file so we can access encoding ds = xr.open_dataset(infiles[0][0]) var_attrs = ds[var1].attrs @@ -803,7 +851,7 @@ def get_attrs(ctx, infiles, var1, var_log): @click.pass_context -def extract_var(ctx, input_ds, tdim, in_missing, mop_log, var_log): +def extract_var(ctx, input_ds, tdim, in_missing): """ This function pulls the required variables from the Xarray dataset. If a calculation isn't needed then it just returns the variables to be saved. @@ -814,6 +862,8 @@ def extract_var(ctx, input_ds, tdim, in_missing, mop_log, var_log): input_ds - dict dictionary of input datasets for each variable """ + mop_log = logging.getLogger('mop_log') + var_log = logging.getLogger(ctx.obj['var_log']) failed = False # Save the variables if ctx.obj['calculation'] == '': @@ -850,7 +900,7 @@ def extract_var(ctx, input_ds, tdim, in_missing, mop_log, var_log): if array.dtype.kind == 'i': try: in_missing = int(in_missing) - except: + except Exception as e: in_missing = int(-999) else: array = array.fillna(in_missing) @@ -873,10 +923,11 @@ def define_attrs(ctx): listed in notes file, this is indicated by precending any function in file with a ~. For other fields it checks equality. """ + #var_log = logging.getLogger(ctx.obj['var_log']) attrs = ctx.obj['attrs'] notes = attrs.get('notes', '') # open file containing notes - fname = import_files('data').joinpath('notes.yaml') + fname = import_files('mopdata').joinpath('notes.yaml') data = read_yaml(fname)['notes'] # check all fields and if any of their keys (e.g. 
a specific variable) # match the field value for the file being processed diff --git a/src/mopper/mopper.py b/src/mopper/mopper.py index 554c7c7..a8f57d5 100644 --- a/src/mopper/mopper.py +++ b/src/mopper/mopper.py @@ -25,18 +25,24 @@ import click import logging -import sqlite3 import concurrent.futures -import os,sys +import os +import subprocess +import sys import warnings import yaml import cmor -import numpy as np -import xarray as xr - -from mopper.mop_utils import * -from mopper.mop_setup import * -from mopdb.mopdb_utils import db_connect, create_table, query +import cftime + +from mopper.mop_utils import (config_log, config_varlog, get_files, + load_data, get_cmorname, pseudo_axis, create_axis, hybrid_axis, + ij_axis, ll_axis, define_grid, get_coords, get_axis_dim, + require_bounds, get_bounds, get_attrs, extract_var, define_attrs) +from mopper.mop_setup import setup_env, var_map, manage_env +from mopper.setup_utils import (create_exp_json, write_config, + populate_db, count_rows, sum_file_sizes, filelist_sql, write_job) +from mopdb.utils import db_connect, create_table, query +from mopper.cmip_utils import edit_json_cv warnings.simplefilter(action='ignore', category=FutureWarning) warnings.simplefilter(action='ignore', category=UserWarning) @@ -53,13 +59,22 @@ def mop_catch(): sys.exit(1) +def mop_args(f): + """Define common click options + """ + constraints = [ + click.option('--debug', is_flag=True, default=False, + help="Show debug info"), + click.option('--cfile', '-c', type=str, required=True, + help='Experiment configuration as yaml file')] + for c in reversed(constraints): + f = c(f) + return f + + @click.group(context_settings=dict(help_option_names=['-h', '--help'])) -@click.option('--cfile', '-c', type=str, required=True, - help='Experiment configuration as yaml file') -@click.option('--debug', is_flag=True, default=False, - help="Show debug info") @click.pass_context -def mop(ctx, cfile, debug): +def mop(ctx): """Main command with 2 sub-commands: - setup to setup the job to run - run to execute the post-processing @@ -68,36 +83,41 @@ def mop(ctx, cfile, debug): ---------- ctx : obj Click context object + """ + #ctx.obj = {} + pass + + +@mop.command(name='run') +@mop_args +#@click.option('--cfile', '-c', type=str, required=True, +# help='Experiment configuration as yaml file') +@click.pass_context +def mop_run(ctx, cfile, debug): + """Subcommand that executes the processing. + + Use the configuration yaml file created in setup step as input. + + Parameters + ---------- cfile : str Name of yaml configuration file, run sub-command uses the configuration created by setup debug : bool If true set logging level to debug """ + + # load config file with open(cfile, 'r') as yfile: cfg = yaml.safe_load(yfile) ctx.obj = cfg['cmor'] ctx.obj['attrs'] = cfg['attrs'] - # set up main mop log - if ctx.invoked_subcommand == 'setup': - ctx.obj['log'] = config_log(debug, ctx.obj['appdir'], stream_level=logging.INFO) - else: - ctx.obj['log'] = config_log(debug, ctx.obj['appdir']) + # set up logger + mop_log = config_log(debug, ctx.obj['appdir']) ctx.obj['debug'] = debug - mop_log = ctx.obj['log'] mop_log.info(f"Simulation to process: {ctx.obj['exp']}") - - -@mop.command(name='run') -@click.pass_context -def mop_run(ctx): - """Subcommand that executes the processing. - - Use the configuration yaml file created in setup step as input. 
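Note on the reworked CLI above: the shared mop_args decorator moves --cfile/--debug onto each sub-command, so invocations take the form mop run -c <config> rather than mop -c <config> run. Below is a minimal sketch of exercising the new option placement with click's test runner; "exp_config.yaml" is a placeholder filename, and without a real config the command simply fails at the yaml-loading step, which is enough to show the parsing:

from click.testing import CliRunner
from mopper.mopper import mop

runner = CliRunner()
# options are parsed by the sub-command, not by the group
result = runner.invoke(mop, ["run", "-c", "exp_config.yaml", "--debug"])
print(result.exit_code, result.output)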
- """ - mop_log = ctx.obj['log'] # Open database and retrieve list of files to create - conn = db_connect(ctx.obj['database'], mop_log) + conn = db_connect(ctx.obj['database']) c = conn.cursor() sql = f"""select *,ROWID from filelist where status=='unprocessed' and exp_id=='{ctx.obj['exp']}'""" @@ -118,11 +138,12 @@ def mop_run(ctx): return +@mop.command(name='setup') +@mop_args @click.option('--update', is_flag=True, default=False, help="Update current settings, keeping db and logs") -@mop.command(name='setup') @click.pass_context -def mop_setup(ctx, update): +def mop_setup(ctx, cfile, debug, update): """Setup of mopper processing job and working environment. * Defines and creates paths @@ -132,8 +153,26 @@ def mop_setup(ctx, update): * creates/updates database filelist table to list files to create * finalises configuration and save in new yaml file * writes job executable file and submits (optional) to queue + + Parameters + ---------- + cfile : str + Name of yaml configuration file, run sub-command uses the + configuration created by setup + debug : bool + If True set logging level to debug + update : bool + If True update current workding directory (default is False) """ - mop_log = ctx.obj['log'] + + # load config file + with open(cfile, 'r') as yfile: + cfg = yaml.safe_load(yfile) + ctx.obj = cfg['cmor'] + ctx.obj['attrs'] = cfg['attrs'] + ctx.obj['debug'] = debug + # set up logger + mop_log = config_log(debug, ctx.obj['appdir'], stream_level=logging.INFO) # then add setup_env to config mop_log.info("Setting environment and creating working directory") ctx.obj['update'] = update @@ -152,11 +191,11 @@ def mop_setup(ctx, update): # setup database table database = ctx.obj['database'] mop_log.info(f"creating & using database: {database}") - conn = db_connect(database, mop_log) + conn = db_connect(database) table_sql = filelist_sql() - create_table(conn, table_sql, mop_log) + create_table(conn, table_sql) populate_db(conn) - nrows = count_rows(conn, ctx.obj['exp'], mop_log) + nrows = count_rows(conn, ctx.obj['exp']) tot_size = sum_file_sizes(conn) mop_log.info(f"Estimated total files size before compression is: {tot_size} GB") #write app_job.sh @@ -177,15 +216,16 @@ def mop_setup(ctx, update): @click.pass_context -def mop_process(ctx, mop_log, var_log): +def mop_process(ctx): """Main processing workflow Sets up CMOR dataset, tables and axis. Extracts and/or calculates variable and write to file using CMOR. Returns path of created file if successful or error code if not. """ - - default_cal = "gregorian" + + mop_log = logging.getLogger('mop_log') + var_log = logging.getLogger(ctx.obj['var_log']) logname = f"{ctx.obj['variable_id']}_{ctx.obj['table']}_{ctx.obj['tstart']}" # Setup CMOR @@ -210,15 +250,15 @@ def mop_process(ctx, mop_log, var_log): # Select files to use and associate a path to each input variable #P I might not need this! - inrange_files, path_vars, time_dim, t_units = get_files(var_log) + inrange_files, path_vars, time_dim, t_units = get_files() # Open input datasets based on input files, return dict= {var: ds} - dsin = load_data(inrange_files, path_vars, time_dim, var_log) + dsin = load_data(inrange_files, path_vars, time_dim) #Get the units and other attrs of first variable. 
var1 = ctx.obj['vin'][0] in_units, in_missing, positive, coords = get_attrs(inrange_files, - var1, var_log) + var1) var_log.debug(f"var just after reading {dsin[var1][var1]}") # Extract variable and calculation: @@ -226,7 +266,7 @@ var_log.info(f"calculation: {ctx.obj['calculation']}") var_log.info(f"resample: {ctx.obj['resample']}") try: - ovar, failed = extract_var(dsin, time_dim, in_missing, mop_log, var_log) + ovar, failed = extract_var(dsin, time_dim, in_missing) var_log.info("Calculation completed.") except Exception as e: mop_log.error(f"E: Unable to retrieve/calculate var for {ctx.obj['filename']}") @@ -239,25 +279,25 @@ # Define axis and variable for CMOR var_log.info("Defining axes...") # get list of coordinates that require bounds - bounds_list = require_bounds(var_log) + bounds_list = require_bounds() # get axis of each dimension - axes = get_axis_dim(ovar, var_log) + axes = get_axis_dim(ovar) var_log.debug(f"detected axes: {axes}") cmor.set_table(tables[1]) axis_ids = [] z_ids = [] setgrid = False if axes['t_ax'] is not None: - cmor_tName = get_cmorname('t', axes['t_ax'], var_log) + cmor_tName = get_cmorname('t', axes['t_ax']) ctx.obj['reference_date'] = f"days since {ctx.obj['reference_date']}" var_log.debug(f"{ctx.obj['reference_date']}") t_ax_val = cftime.date2num(axes['t_ax'], units=ctx.obj['reference_date'], calendar=ctx.obj['attrs']['calendar']) - var_log.debug(f"t_ax[3] {t_ax_val[3]}") + #var_log.debug(f"t_ax[3] {t_ax_val[3]}") t_bounds = None if cmor_tName in bounds_list: t_bounds = get_bounds(dsin[var1], axes['t_ax'], cmor_tName, - var_log, ax_val=t_ax_val) + ax_val=t_ax_val) t_ax_id = cmor.axis(table_entry=cmor_tName, units=ctx.obj['reference_date'], length=len(t_ax_val), @@ -266,14 +306,14 @@ interval=None) axis_ids.append(t_ax_id) if axes['e_ax'] is not None: - e_ax_id = create_axis(axes['e_ax'], tables[1], var_log) + e_ax_id = create_axis(axes['e_ax'], tables[1]) axis_ids.append(e_ax_id) if axes['z_ax'] is not None: zlen = len(axes['z_ax']) - cmor_zName = get_cmorname('z', axes['z_ax'], var_log, z_len=zlen) + cmor_zName = get_cmorname('z', axes['z_ax'], z_len=zlen) z_bounds = None if cmor_zName in bounds_list: - z_bounds = get_bounds(dsin[var1], axes['z_ax'], cmor_zName, var_log) + z_bounds = get_bounds(dsin[var1], axes['z_ax'], cmor_zName) z_ax_id = cmor.axis(table_entry=cmor_zName, units=axes['z_ax'].units, length=zlen, @@ -283,32 +323,32 @@ axis_ids.append(z_ax_id) # if both i and j are defined set up the grid; if only one, treat it as lat/lon if axes['i_ax'] is not None and axes['j_ax'] is not None: + var_log.debug(f"Setting grid with {axes}") setgrid = True - j_id = ij_axis(axes['j_ax'], 'j_index', tables[0], var_log) - i_id = ij_axis(axes['i_ax'], 'i_index', tables[0], var_log) + j_id = ij_axis(axes['j_ax'], 'j_index', tables[0]) + i_id = ij_axis(axes['i_ax'], 'i_index', tables[0]) elif axes['j_ax'] is not None: axes['lat_ax'] = axes['j_ax'] elif axes['i_ax'] is not None: axes['lon_ax'] = axes['i_ax'] # Define the spatial grid if non-cartesian grid if setgrid: - lat, lat_bnds, lon, lon_bnds = get_coords(ovar, coords, var_log) - grid_id = define_grid(j_id, i_id, lat, lat_bnds, lon, - lon_bnds, var_log) + lat, lat_bnds, lon, lon_bnds = get_coords(ovar, coords) + grid_id = define_grid(j_id, i_id, lat, lat_bnds, lon, lon_bnds) else: if axes['glat_ax'] is not None: - lat_id = ll_axis(axes['glat_ax'], 'glat', dsin[var1],
tables[1], - bounds_list, var_log) + lat_id = ll_axis(axes['glat_ax'], 'glat', dsin[var1], + tables[1], bounds_list) axis_ids.append(lat_id) #z_ids.append(lat_id) elif axes['lat_ax'] is not None: lat_id = ll_axis(axes['lat_ax'], 'lat', dsin[var1], tables[1], - bounds_list, var_log) + bounds_list) axis_ids.append(lat_id) z_ids.append(lat_id) if axes['lon_ax'] is not None: lon_id = ll_axis(axes['lon_ax'], 'lon', dsin[var1], tables[1], - bounds_list, var_log) + bounds_list) axis_ids.append(lon_id) z_ids.append(lon_id) if axes['p_ax'] is not None: @@ -324,7 +364,7 @@ def mop_process(ctx, mop_log, var_log): # Set up additional hybrid coordinate information if (axes['z_ax'] is not None and cmor_zName in ['hybrid_height', 'hybrid_height_half']): - zfactor_b_id, zfactor_orog_id = hybrid_axis(lev_name, z_ax_id, z_ids, var_log) + zfactor_b_id, zfactor_orog_id = hybrid_axis(cmor_zName, z_ax_id, z_ids) # Freeing up memory del dsin @@ -347,11 +387,11 @@ def mop_process(ctx, mop_log, var_log): mop_log.error(f"Unable to define the CMOR variable {ctx.obj['filename']}") var_log.error(f"Unable to define the CMOR variable {e}") return 2 - var_log.info('Writing...') + var_log.info("Writing...") var_log.info(f"Variable shape is {ovar.shape}") status = None # Write timesteps separately if variable potentially exceeding memory - if float(ctx.obj['file_size']) > 4000.0 and time_dim != None: + if float(ctx.obj['file_size']) > 4000.0 and time_dim is not None: for i in range(ovar.shape[0]): data = ovar.isel({time_dim: i}).values status = cmor.write(variable_id, data, ntimes_passed=1) @@ -360,10 +400,10 @@ def mop_process(ctx, mop_log, var_log): status = cmor.write(variable_id, ovar.values) if status != 0: mop_log.error(f"Unable to write the CMOR variable: {ctx.obj['filename']}\n") - var_log.error(f"Unable to write the CMOR variable to file\n" + var_log.error("Unable to write the CMOR variable to file\n" + f"See cmor log, status: {status}") return 2 - var_log.info(f"Finished writing") + var_log.info("Finished writing") # Close the CMOR file. path = cmor.close(variable_id, file_name=True) @@ -371,7 +411,7 @@ def mop_process(ctx, mop_log, var_log): @click.pass_context -def process_file(ctx, row, var_log): +def process_file(ctx, row): """Processes file from database if status is unprocessed. If override is true, re-writes existing files. Called by process_row() and calls mop_process() to extract and write variable. 
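The recurring pattern in this change set is that only the logger name is stored in the click context and the configured logger is re-fetched wherever it is needed: logging.getLogger(name) always returns the same logger object for a given name, so handlers set up once (e.g. in config_varlog) are reused without passing logger handles between functions. A small self-contained sketch of the idea; the names here are illustrative:

import logging

def configure(name):
    # one-off setup, analogous to config_varlog
    log = logging.getLogger(name)
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler())

def worker(ctx_obj):
    # elsewhere, retrieve the same configured logger by name only
    log = logging.getLogger(ctx_obj['var_log'])
    log.info("Start processing")

configure('varlog_1')
worker({'var_log': 'varlog_1'})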
@@ -382,15 +422,14 @@ def process_file(ctx, row, var_log): Click context object row : dict row from filelist db table describing one output file - var_log : logging handler - Logging file handler specific to the file to process Returns ------- out : tuple Output status message and code and db rowid for processed file """ - mop_log = ctx.obj['log'] + mop_log = logging.getLogger('mop_log') + var_log = logging.getLogger(ctx.obj['var_log']) row['vin'] = row['vin'].split() # Check that calculation is defined if more than one variable is passed as input if len(row['vin']) > 1 and row['calculation'] == '': @@ -411,7 +450,7 @@ def process_file(ctx, row, var_log): var_msg = f"{row['table']},{row['variable_id']},{row['tstart']},{row['tend']}" if ctx.obj['override'] or not os.path.exists(expected_file): try: - ret = mop_process(mop_log, var_log) + ret = mop_process() except Exception as e: #something has gone wrong in the processing ret = -1 mop_log.error(e) @@ -438,7 +477,7 @@ def process_file(ctx, row, var_log): #Check if output file matches what we expect var_log.info(f"Output file: {ret}") if ret == expected_file: - var_log.info(f"Expected and cmor file paths match") + var_log.info("Expected and cmor file paths match") msg = f"Successfully processed variable: {var_msg}\n" status = "processed" else : @@ -464,6 +503,7 @@ def process_row(ctx, row): Sets up variable log file, prepares dictionary with file details and calls process_file """ + pid = os.getpid() record = {} header = ['infile', 'filepath', 'filename', 'vin', 'variable_id', 'table', 'frequency', 'realm', 'timeshot', 'tstart', @@ -473,16 +513,14 @@ def process_row(ctx, row): 'json_file_path', 'reference_date', 'version', 'rowid'] for i,val in enumerate(header): record[val] = row[i] - table = record['table'].split('_')[1] # call logging - trange = record['filename'].replace('.nc.','').split("_")[-1] varlog_file = (f"{ctx.obj['var_logs']}/{record['variable_id']}" + f"_{record['table']}_{record['tstart']}.txt") - var_log = config_varlog(ctx.obj['debug'], varlog_file) - ctx.obj['var_log'] = var_log - var_log.info(f"Start processing") - var_log.debug(f"Process id: {os.getpid()}") - msg = process_file(record, var_log) + var_log = config_varlog(ctx.obj['debug'], varlog_file, pid) + ctx.obj['var_log'] = var_log.name + var_log.info("Start processing") + var_log.debug(f"Process id: {pid}") + msg = process_file(record) var_log.handlers[0].close() var_log.removeHandler(var_log.handlers[0]) return msg @@ -500,7 +538,7 @@ def pool_handler(ctx, rows, ncpus): list of process_row() outputs returned by futures, these are tuples with status message and code, and rowid """ - mop_log = ctx.obj['log'] + mop_log = logging.getLogger('mop_log') executor = concurrent.futures.ProcessPoolExecutor(max_workers=ncpus) futures = [] for row in rows: diff --git a/src/mopper/setup_utils.py b/src/mopper/setup_utils.py index 7981e0d..05dfaca 100755 --- a/src/mopper/setup_utils.py +++ b/src/mopper/setup_utils.py @@ -21,28 +21,19 @@ # # last updated 08/04/2024 -import os import sys -import shutil -import calendar -import yaml import json -import csv import sqlite3 -import subprocess -import ast import copy -import re import click import pathlib +import logging -from collections import OrderedDict from datetime import datetime#, timedelta from dateutil.relativedelta import relativedelta -from importlib_resources import files as import_files -from json.decoder import JSONDecodeError -from mopdb.mopdb_utils import query +from mopdb.utils import query, write_yaml +from 
mopper.cmip_utils import fix_years def write_var_map(outpath, table, matches): @@ -102,36 +93,6 @@ def adjust_nsteps(v, frq): new_nsteps = tot_days * nstep_day[frq] return new_nsteps - -def read_yaml(fname): - """Read yaml file - """ - with fname.open(mode='r') as yfile: - data = yaml.safe_load(yfile) - return data - - -def write_yaml(data, fname, logger): - """Write data to a yaml file - - Parameters - ---------- - data : dict - The file content as a dictioinary - fname : str - Yaml filename - - Returns - ------- - """ - try: - with open(fname, 'w') as f: - yaml.dump(data, f) - except: - logger.error(f"Check that {data} exists and it is an object compatible with json") - return - - @click.pass_context def write_config(ctx, fname='exp_config.yaml'): """Write data to a yaml file @@ -153,19 +114,17 @@ else: config['cmor'][k] = v config['attrs'] = config['cmor'].pop('attrs') - mop_log = config['cmor'].pop('log') - write_yaml(config, fname, mop_log) + write_yaml(config, fname, 'mop_log') return @click.pass_context -def find_custom_tables(ctx): +def find_custom_tables(ctx, cmip=False): """Returns list of table files in custom table path """ - mop_log = ctx.obj['log'] + mop_log = logging.getLogger('mop_log') tables = [] - path = ctx.obj['tables_path'] - tables = ctx.obj['tables_path'].rglob("*_*.json") + table_files = ctx.obj['tables_path'].rglob("*_*.json") for f in table_files: f = str(f).replace(".json", "") tables.append(f) @@ -237,7 +196,7 @@ def filelist_sql(): def write_job(ctx, nrows): """ """ - mop_log = ctx.obj['log'] + mop_log = logging.getLogger('mop_log') # define storage flag flag = "storage=gdata/hh5" projects = ctx.obj['addprojs'] + [ctx.obj['project']] @@ -282,7 +241,7 @@ fname : str Name of created experiment json file """ - mop_log = ctx.obj['log'] + mop_log = logging.getLogger('mop_log') fname = ctx.obj['outpath'] / f"{ctx.obj['exp']}.json" attrs = ctx.obj['attrs'] with json_cv.open(mode='r') as f: @@ -353,7 +312,7 @@ conn : obj DB connection object """ - mop_log = ctx.obj['log'] + mop_log = logging.getLogger('mop_log') cursor = conn.cursor() # process experiment information opts = {} @@ -388,7 +347,7 @@ return -def add_row(values, cursor, update, mop_log): +def add_row(values, cursor, update): """Add a row to the filelist database table one row specifies the information to produce one output cmip5 file Parameters ---------- Returns ------- """ + mop_log = logging.getLogger('mop_log') sql = '''insert into filelist (infile, filepath, filename, vin, variable_id, ctable, frequency, realm, timeshot, tstart, tend, sel_start, sel_end, @@ -437,13 +397,13 @@ def adjust_size(opts, insize): # volume,any vertical sum # resample will affect frequency but that should be already taken into account in mapping calc = opts['calculation'] - resample = opts['resample'] + #resample = opts['resample'] grid_size = insize if 'plevinterp' in calc: - try: + if "," in calc: plevnum = calc.split(',')[-1] - except: - raise('check plevinterp calculation definition plev probably missing') + else: + raise ValueError('check plevinterp calculation def plev probably missing') plevnum = float(plevnum.replace(')','')) grid_size = float(insize)/float(opts['levnum'])*plevnum return grid_size @@ -466,7 +426,7 @@ Returns ------- """ - mop_log = ctx.obj['log'] + #mop_log =
logging.getLogger('mop_log') # set small number for fx frequency so it always creates only one file nstep_day = {'10min': 144, '30min': 48, '1hr': 24, '3hr': 8, '6hr': 4, 'day': 1, '10day': 0.1, 'mon': 1/30, @@ -586,10 +546,6 @@ Returns ------- """ - tstep_dict = {'10min': 'minutes=10', '30min': 'minutes=30', - '1hr': 'hours=1', '3hr': 'hours=3', '6hr': 'hours=6', - 'day': 'days=1', '10day': 'days=10', 'mon': 'months=1', - 'yr': 'years=1', 'dec': 'years=10'} unchanged = ['frequency', 'realm', 'table', 'calculation', 'resample', 'positive', 'timeshot'] for mp in maps: @@ -617,16 +573,15 @@ time interval for each file. The latter is determined by the maximum file size. These and other file details are saved in the filelist db table. """ - mop_log = ctx.obj['log'] + mop_log = logging.getLogger('mop_log') update = ctx.obj['update'] exp_start = opts['exp_start'] exp_end = opts['exp_end'] if mp['years'] != 'all' and ctx.obj['dreq_years']: exp_start, exp_end = fix_years(mp['years'], exp_start[:4], exp_end[:4]) if exp_start is None: - mop_log.info("Years requested for variable are outside specified") - mop_log.info((f"period: {table_id}, {var},", f"{match['tstart']}, {match['tend']}")) + mop_log.info(f"""Years requested for variable are outside + specified period: {mp['years']}""") return tstep_dict = {'10min': ['minutes=10', 'minutes=5'], '30min': ['minutes=30', 'minutes=15'], @@ -650,7 +605,6 @@ finish = start + relativedelta(days=1) tstep_dict['fx'] = tstep_dict['day'] while (start < finish): - tstep = eval(f"relativedelta({tstep_dict[frq][0]})") half_tstep = eval(f"relativedelta({tstep_dict[frq][1]})") delta = eval(f"relativedelta({interval})") newtime = min(start+delta, finish) @@ -662,14 +616,16 @@ opts['sel_end'] = (newtime - half_tstep).strftime('%4Y%m%d%H%M') opts['filepath'], opts['filename'] = build_filename(opts, start, newtime, half_tstep) - rowid = add_row(opts, cursor, update, mop_log) + rowid = add_row(opts, cursor, update) + mop_log.debug(f"Last added row id: {rowid}") start = newtime return -def count_rows(conn, exp, mop_log): +def count_rows(conn, exp): """Returns number of files to process """ + mop_log = logging.getLogger('mop_log') sql = f"select * from filelist where status=='unprocessed' and exp_id=='{exp}'" rows = query(conn, sql, first=False) mop_log.info(f"Number of rows in filelist: {len(rows)}") @@ -697,8 +653,6 @@ cdict : dict Dictionary with cmor settings for experiment """ - # temporarily removing this as it only works for conda envs - #{os.path.dirname(sys.executable)}/mop -c {ctx.obj['exp']}_config.yaml run template = f"""#!/bin/bash #PBS -P {ctx.obj['project']} #PBS -q {ctx.obj['queue']} @@ -715,8 +669,9 @@ module use /g/data/hh5/public/modules module load conda/analysis3 +{ctx.obj['conda_env']} cd {ctx.obj['appdir']} -mop -c {ctx.obj['exp']}_config.yaml run +mop run -c {ctx.obj['exp']}_config.yaml echo 'APP completed for exp {ctx.obj['exp']}.'""" return template diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 0000000..f6d89a6 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1,3 @@ +pytest +pyfakefs + diff --git a/tests/conftest.py b/tests/conftest.py index 2f7fcbf..3770a94 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,18 +18,47 @@ import pytest import os import sqlite3
-import xarray as xr -import numpy as np -import pandas as pd -import datetime +import click +import logging +import csv +import pyfakefs +from pathlib import Path + from mopdb.mopdb_utils import mapping_sql, cmorvar_sql +from mopdb.mopdb_class import MapVariable, Variable, FPattern from mopper.setup_utils import filelist_sql TESTS_HOME = os.path.abspath(os.path.dirname(__file__)) TESTS_DATA = os.path.join(TESTS_HOME, "testdata") +# consecutive files with multiple time axes +dsmulti = os.path.join(TESTS_DATA, "multitime.nc") +dsmulti2 = os.path.join(TESTS_DATA, "multitime_next.nc") +# consecutive files with a 1-time step time axis +dsonestep = os.path.join(TESTS_DATA, "onetstep.nc") +dsonestep2 = os.path.join(TESTS_DATA, "onetstep_next.nc") +# varlist, map file examples + +@pytest.fixture +def fake_fs(fs): # pylint:disable=invalid-name + """Variable name 'fs' causes a pylint warning. Provide a longer name + acceptable to pylint for use in tests. + """ + yield fs +@pytest.fixture +def ctx(): + ctx = click.Context(click.Command('cmd'), + obj={'sel_start': '198302170600', 'sel_end': '198302181300', + 'realm': 'atmos', 'frequency': '1hr', 'var_log': 'varlog_1'}) + return ctx +@pytest.fixture +def vlistcsv(): + vlistcsv = os.path.join(TESTS_DATA, "varlist.csv") + return vlistcsv + +# setting up fixtures for databases: access.db and mopper.db @pytest.fixture def session(): connection = sqlite3.connect(':memory:') @@ -37,6 +66,15 @@ def session(): yield db_session connection.close() +@pytest.fixture +def input_dir(fake_fs): + dfrq = {'d': 'dai', '8': '3h', '7': '6h', 'm': 'mon'} + for date in ['201312', '201401', '201402']: + for k,v in dfrq.items(): + filebase = f"cm000a.p{k}{date}_{v}.nc" + fake_fs.create_file("/raw/atmos/"+ filebase) + assert os.path.exists("/raw/atmos/cm000a.p8201402_3h.nc") + @pytest.fixture def setup_access_db(session): @@ -58,16 +96,80 @@ def setup_access_db(session): @pytest.fixture def setup_mopper_db(session): - filelist_sql = mapping_sql() - session.execute(filelist_sql) + flist_sql = filelist_sql() + session.execute(flist_sql) session.execute('''INSERT INTO filelist VALUES ("/testdata/atmos/umnsa_spec_*.nc", "/testdata/mjo-elnino/v1-0/A10min/", "tas_AUS2200_mjo-elnino_subhrPt_20160101001000-20160102000000.nc", "fld_s03i236", "tas", "AUS2200_A10min", "subhrPt", "atmos", "point", "20160101T0005", "20160102T0000", "201601010000", "201601012355", "unprocessed", "3027.83203125", "mjo-elnino", "K", "AUS2200", "AUS2200", "/testdata/mjo-elnino/mjo-elnino.json", "1970-01-01", "v1-0")''') session.connection.commit() +def test_check_timestamp(caplog): + global ctx, logger + caplog.set_level(logging.DEBUG, logger='mop_log') + @pytest.fixture def varlist_rows(): - lines = ["fld_s03i236;tas;K;time_0 lat lon;1hr;atmos;area: time: mean;AUS2200_A1hr;float32;22048000;96;umnsa_slv_;TEMPERATURE AT 1.5M;air_temperature", - "fld_s03i236;;K;time_0 lat lon;1hr;atmos;area: time: mean;AUS2200_A1hr;float32;22048000;96;umnsa_slv_;TEMPERATURE AT 1.5M;air_temperature", - "fld_s03i236;tas;;time_0 lat lon;1hr;atmos;area: time: mean;AUS2200_A1hr;float32;22048000;96;umnsa_slv_;TEMPERATURE AT 1.5M;air_temperature"] - rows = [l.split(";") for l in lines] + # read list of vars from example file + with open('testdata/varlist_ex.csv', 'r') as csvfile: + reader = csv.DictReader(csvfile, delimiter=';') + rows = list(reader) return rows + +@pytest.fixture +def matches(): + matches = [("tas", "fld_s03i236", "", "1hr", "atmos", "AUS2200", "AUS2200_A1hr", "", "K"), + ("siconca", "fld_s00i031", "", "mon",
"ocean", "CM2", "CMIP6_OImon", "", "1"), + ("hfls", "fld_s03i234", "", "mon", "atmos", "CM2", "CMIP6_Amon", "up", "W/m2")] + return matches + +@pytest.fixture +def add_var_out(): + vlist = [{'cmor_var': '', 'input_vars': '', 'calculation': '', 'units': '' + ,'realm': '', 'positive': '', 'version': '', 'cmor_table': ''} + ] + return vlist + +@pytest.fixture +def map_rows(): + maps = [["fld_s03i236","tas","K","time_0 lat lon","1hr","atmos", + "area: time: mean","","AUS2200_A1hr","float32","22048000","96", + "umnsa_slv_","TEMPERATURE AT 1.5M","air_temperature"]] + return maps + +@pytest.fixture +def fobj(input_dir): + fobj = FPattern("cm000a.", Path("/raw/atmos/")) + return fobj + +@pytest.fixture +def var_obj(fobj): + vobj = Variable('tas', fobj) + return vobj + +@pytest.fixture +def mapvar_obj(var_obj): + match = ('','','','','','','','','') + mvobj = MapVariable(match, var_obj) + return mvobj + +@pytest.fixture +def varobjs(mapvar_obj): + mvobj = mapvar_obj + vobjs = [] + vobjs.append(mvobj) + mvobj.name = 'siconca' + vobjs.append(mvobj) + mvobj.name = 'hfls' + vobjs.append(mvobj) + return vobjs + + +@pytest.fixture +def output_file(tmp_path): + # create your file manually here using the tmp_path fixture + # or just import a static pre-built mock file + # something like : + target_output = os.path.join(tmp_path,'mydoc.csv') + with open(target_output, 'w+'): + pass + # write stuff here + return target_output diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 0000000..389e88b --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +log_cli = true +log_cli_level = DEBUG diff --git a/tests/test_calculations.py b/tests/test_calculations.py index dcd6398..8c70d28 100644 --- a/tests/test_calculations.py +++ b/tests/test_calculations.py @@ -15,18 +15,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import xarray.testing as xrtest import numpy.testing as nptest import xarray as xr +import xarray.testing as xrtest import numpy as np import pandas as pd import logging -from mopper.calculations import * +from mopper.calculations import (overturn_stream, calc_topsoil,) +from conftest import ctx -logger = logging.getLogger('var_log') -ctx = click.Context(click.Command('cmd'), - obj={'sel_start': '198302170600', 'sel_end': '198302181300', - 'realm': 'atmos', 'frequency': '1hr', 'var_log': logger}) def create_var(nlat, nlon, ntime=None, nlev=None, sdepth=False, seed=100): @@ -42,7 +39,7 @@ def create_var(nlat, nlon, ntime=None, nlev=None, sdepth=False, seed=100): dims.insert(0, 'lev') coords['lev'] = lev shape.insert(0, nlev) - elif sdepth is True: + if sdepth is True: depth = np.array([0.05, 0.2, 0.5, 1]) dims.insert(0, 'depth') coords['depth'] = depth @@ -59,17 +56,17 @@ def create_var(nlat, nlon, ntime=None, nlev=None, sdepth=False, seed=100): attrs={'name': 'random'}) return da -mrsol = create_var(2, 3, ntime=4, sdepth=True) -def test_calc_topsoil(): - global mrsol +def test_calc_topsoil(caplog, ctx): + caplog.set_level(logging.DEBUG, logger='varlog_1') + mrsol = create_var(2, 3, ntime=4, sdepth=True) expected = mrsol.isel(depth=0) + mrsol.isel(depth=1)/3.0 - out = calc_topsoil(mrsol) + with ctx: + out = calc_topsoil(mrsol) xrtest.assert_allclose(out, expected, rtol=1e-05) - -def test_overturn_stream(): - global ctx, logger +def test_overturn_stream(caplog, ctx): + caplog.set_level(logging.DEBUG, logger='varlog_1') # set up input dims = ['time', 'depth', 'lat', 'lon'] time = pd.date_range("2014-09-06", periods=1) diff --git a/tests/test_mop_utils.py b/tests/test_mop_utils.py index d006ca1..8cf28a2 100644 --- a/tests/test_mop_utils.py +++ b/tests/test_mop_utils.py @@ -15,51 +15,50 @@ # See the License for the specific language governing permissions and # limitations under the License. 
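The reworked tests above capture records from the named per-variable logger through pytest's caplog fixture: caplog.set_level(level, logger=name) both enables and captures that named logger for the duration of the test. A minimal sketch of the pattern, reusing the 'varlog_1' name from the fixtures:

import logging

def test_capture_named_logger(caplog):
    caplog.set_level(logging.DEBUG, logger='varlog_1')
    logging.getLogger('varlog_1').debug("checking files timestamp ...")
    assert "checking files timestamp" in caplog.text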
-import pytest +#import pytest +import click +import xarray as xr import numpy as np import pandas as pd -from mopper.mop_utils import * +import logging +from mopper.mop_utils import (check_timestamp, get_cmorname,) -#try: -# import unittest.mock as mock -#except ImportError: -# import mock ctx = click.Context(click.Command('cmd'), obj={'sel_start': '198302170600', 'sel_end': '198302181300', - 'realm': 'atmos', 'frequency': '1hr'}) -logger = logging.getLogger('mop_log') + 'realm': 'atmos', 'frequency': '1hr', 'var_log': 'varlog_1'}) def test_check_timestamp(caplog): - global ctx, logger + global ctx caplog.set_level(logging.DEBUG, logger='mop_log') + caplog.set_level(logging.DEBUG, logger='varlog_1') # test atmos files files = [f'obj_198302{d}T{str(h).zfill(2)}01_1hr.nc' for d in ['17','18','19'] for h in range(24)] inrange = files[6:37] with ctx: - out1 = check_timestamp(files, logger) + out1 = check_timestamp(files) assert out1 == inrange # get only first file if frequency is fx ctx.obj['frequency'] = 'fx' inrange = [files[0]] with ctx: - out2 = check_timestamp(files, logger) + out2 = check_timestamp(files) assert out2 == inrange # test ocn files - ctx.obj['frequency'] = 'mon' + ctx.obj['frequency'] = 'day' ctx.obj['realm'] = 'ocean' files = [f'ocn_daily.nc-198302{str(d).zfill(2)}' for d in range(1,29)] inrange = files[16:18] with ctx: - out3 = check_timestamp(files, logger) + out3 = check_timestamp(files) assert out3 == inrange def test_get_cmorname(caplog): - global ctx, logger + global ctx caplog.set_level(logging.DEBUG, logger='mop_log') - # axiis_name t + # axis_name t ctx.obj['calculation'] = "plevinterp(var[0], var[1], 24)" ctx.obj['variable_id'] = "ta24" ctx.obj['timeshot'] = 'mean' @@ -71,10 +70,10 @@ foo = xr.DataArray(data, coords=[levs, tdata, lats, lons], dims=["lev", "t", "lat", "lon"]) with ctx: - tname = get_cmorname('t', foo.t, logger, z_len=None) - iname = get_cmorname('i_index', foo.lon, logger, z_len=None) - jname = get_cmorname('j_index', foo.lat, logger, z_len=None) - zname = get_cmorname('z', foo.lev, logger, z_len=3) + tname = get_cmorname('t', foo.t, z_len=None) + iname = get_cmorname('lon', foo.lon, z_len=None) + jname = get_cmorname('lat', foo.lat, z_len=None) + zname = get_cmorname('z', foo.lev, z_len=3) assert tname == 'time' assert iname == 'longitude' assert jname == 'latitude' diff --git a/tests/test_mopdb.py b/tests/test_mopdb.py index 37f4232..b377077 100644 --- a/tests/test_mopdb.py +++ b/tests/test_mopdb.py @@ -17,43 +17,56 @@ import pytest import os -import sqlite3 -from mopdb.mopdb import * +import logging +import click +from mopdb.mopdb import mopdb from click.testing import CliRunner +#from conftest import vlistcsv +from pytest import CaptureFixture -@pytest.mark.parametrize('subcommand', ['varlist', 'template', 'check', 'cmor', 'table', 'map']) -def test_cmip(command, runner): - result = runner.invoke(mopdb, ['--help']) - assert result.exit_code == 0 - result = runner.invoke(mopdb, [subcommand, '--help']) - assert result.exit_code == 0 -@pytest.mark.usefixtures("setup_db") # 1 -def test_template(session): +@pytest.fixture(scope='module') +def runner(): + return CliRunner() - runner = CliRunner() +def test_command(runner): + result = runner.invoke(mopdb, ['--help']) + assert result.exit_code == 0 - with runner.isolated_filesystem(): - with open('varlist.txt', 'w') as f: - f.write('name;cmor_var;units;dimensions;frequency;realm;cell_methods;cmor_table;dtype;size;nsteps;file_name;long_name;standard_name')
f.write('fld_s03i236;tas;K;time lat lon,mon;atmos;area: time: mean;CMIP6_Amon;float32;110592;2081;cm000a.pm;TEMPERATURE AT 1.5M;air_temperature') - f.write('fld_s03i237;huss;1;time lat lon;mon;atmos;area: time: mean;CMIP6_Amon;float32;110592;2081;cm000a.pm;SPECIFIC HUMIDITY AT 1.5M;specific_humidity') - f.write('fld_s05i205;prrc;kg m-2 s-1;time_0 lat lon;3hr;atmos;area: time: mean;CMIP6_E3hr;float32;110592;578880;cm000a.p8;CONVECTIVE RAINFALL RATE KG/M2/S;convective_rainfall_flux') - f.write('fld_s03i236;tas;K;time lat lon;day;atmos;area: time: mean;CMIP6_day;float32;110592;74772;cm000a.pd;TEMPERATURE AT 1.5M;air_temperature') - result = runner.invoke(mopdb, ['template', '-f varlist.txt', '-vCM2']) - #assert result.exit_code == 0 - assert 'Opened database successfully' in result.output - assert 'Definable cmip var' in result.output -#Pass temp_dir to control where the temporary directory is created. The directory will not be removed by Click in this case. This is useful to integrate with a framework like Pytest that manages temporary files. +@pytest.mark.parametrize('subcommand', ['varlist', 'template', + 'intake', 'check', 'cmor', 'table', 'map', 'del']) +def test_subcmd(subcommand, runner): + ctx = click.Context(click.Command('mopdb'), obj={'prop': 'A Context'}) + with ctx: + result = runner.invoke(mopdb, ['--help']) + assert result.exit_code == 0 + result = runner.invoke(mopdb, [subcommand, '--help']) + assert result.exit_code == 0 -def test_keep_dir(tmp_path): -# runner = CliRunner() +@pytest.mark.usefixtures("setup_access_db") # 1 +def test_template(session, runner, tmp_path, caplog, + capsys: CaptureFixture): + caplog.set_level(logging.DEBUG, logger='mopdb_log') + with capsys.disabled() as disabled: + with runner.isolated_filesystem(temp_dir=tmp_path) as td: + os.mkdir("myfiles") + with open('myfiles/varlist.csv', 'w') as f: + f.write('name;cmor_var;units;dimensions;frequency;realm;cell_methods;cmor_table;dtype;size;nsteps;fpattern;long_name;standard_name\n') + f.write('fld_s03i236;tas;K;time lat lon;mon;atmos;area: time: mean;CMIP6_Amon;float32;110592;2081;cm000a.pm;TEMPERATURE AT 1.5M;air_temperature\n') + f.write('fld_s03i237;huss;1;time lat lon;mon;atmos;area: time: mean;CMIP6_Amon;float32;110592;2081;cm000a.pm;SPECIFIC HUMIDITY AT 1.5M;specific_humidity\n') + f.write('fld_s05i205;prrc;kg m-2 s-1;time_0 lat lon;3hr;atmos;area: time: mean;CMIP6_E3hr;float32;110592;578880;cm000a.p8;CONVECTIVE RAINFALL RATE KG/M2/S;convective_rainfall_flux\n') + f.write('fld_s03i236;tas;K;time lat lon;day;atmos;area: time: mean;CMIP6_day;float32;110592;74772;cm000a.pd;TEMPERATURE AT 1.5M;air_temperature\n') -# with runner.isolated_filesystem(temp_dir=tmp_path) as td: -# ...
+ args = ['--debug', 'template', '-f', 'myfiles/varlist.csv', '-v', 'CM2'] + result = runner.invoke(mopdb, args) + #assert result.exit_code == 0 + assert 'Opened database ' in caplog.messages[0] + assert 'myfiles/varlist.csv is file' in caplog.messages + #assert caplog.messages[-1] == 'Finished writing variables to mapping template' + #assert 'Definable cmip var' in result.output -def test_with_context(): - ctx = click.Context(click.Command('cmd'), obj={'prop': 'A Context'}) - with ctx: - process_cmd() +#def test_with_context(): +# ctx = click.Context(click.Command('cmd'), obj={'prop': 'A Context'}) +# with ctx: +# mopdb() diff --git a/tests/test_mopdb_map.py b/tests/test_mopdb_map.py new file mode 100644 index 0000000..9b66447 --- /dev/null +++ b/tests/test_mopdb_map.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python +# Copyright 2023 ARC Centre of Excellence for Climate Extremes +# author: Paola Petrelli +# author: Sam Green +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import os +import logging +import xarray as xr +from mopdb.mopdb_map import (add_var, get_file_frq) +#from mopdb.mopdb_class import MapVariable, Variable, FPattern +#from conftest import * + + +TESTS_HOME = os.path.abspath(os.path.dirname(__file__)) +TESTS_DATA = os.path.join(TESTS_HOME, "testdata") +# consecutive files with multiple time axes +dsmulti = os.path.join(TESTS_DATA, "multitime.nc") +dsmulti2 = os.path.join(TESTS_DATA, "multitime_next.nc") +# consecutive files with a 1-time step time axis +dsonestep = os.path.join(TESTS_DATA, "onetstep.nc") +dsonestep2 = os.path.join(TESTS_DATA, "onetstep_next.nc") + +@pytest.mark.parametrize('idx', [0,1,2]) +def test_add_var(varobjs, matches, idx, caplog): + caplog.set_level(logging.DEBUG, logger='mopdb_log') + vlist = [] + vlist = add_var(vlist, varobjs[idx], matches[idx]) + assert vlist[0].cmor_var == matches[idx][0] + + +def test_get_file_frq(caplog): + global dsmulti, dsmulti2, dsonestep, dsonestep2 + caplog.set_level(logging.DEBUG, logger='mopdb_log') + umfrq = {'time': 'day', 'time_0': '1hr', 'time_1': '30min'} + int2frq = {'day': 1.0, '1hr': 0.041667, '30min': 0.020833} + # multi time axes in file + ds = xr.open_dataset(dsmulti, decode_times=False) + out = get_file_frq(ds, dsmulti2, int2frq) + assert umfrq == out + # only one time axis in file with 1 value + ds = xr.open_dataset(dsonestep, decode_times=False) + out = get_file_frq(ds, dsonestep2, int2frq) + umfrq = {'time': 'day'} + assert umfrq == out + diff --git a/tests/test_mopdb_utils.py b/tests/test_mopdb_utils.py index 48aa87b..b5409b3 100644 --- a/tests/test_mopdb_utils.py +++ b/tests/test_mopdb_utils.py @@ -15,31 +15,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
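test_get_file_frq above checks that a frequency label is recovered from the spacing of each time axis: int2frq maps a label to its nominal interval in days, and the interval measured on the axis is matched against it. A rough sketch of that matching step, assuming a small absolute tolerance is used (the tolerance value here is illustrative, not the one in mopdb_map):

int2frq = {'day': 1.0, '1hr': 0.041667, '30min': 0.020833}

def guess_frq(interval_days, tol=1e-4):
    # pick the label whose nominal interval matches the axis spacing
    for frq, days in int2frq.items():
        if abs(interval_days - days) <= tol:
            return frq
    return None

assert guess_frq(1.0) == 'day'
assert guess_frq(0.020833) == '30min'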
-import pytest -import os -import sqlite3 -import click +#import pytest import logging -from mopdb.mopdb_utils import * +import itertools +from mopdb.mopdb_utils import (get_date_pattern, ) +#from mopdb.mopdb_class import MapVariable, Variable, FPattern -#from click.testing import CliRunner -@pytest.fixture -def db_log(): - return config_log(False) - -@pytest.fixture -def db_log_debug(): - return config_log(True) - - -@pytest.mark.parametrize('idx', [0,1,2]) -def test_add_var(varlist_rows, idx, db_log): - vlist = [] - vlistout = [["fld_s03i236","tas","K","time_0 lat lon","1hr","atmos", - "area: time: mean","","AUS2200_A1hr","float32","22048000","96", - "umnsa_slv_","TEMPERATURE AT 1.5M","air_temperature"]] - match = ("tas", "", "K") - vlist = add_var(vlist, varlist_rows[idx], match, db_log) - assert vlist == vlistout + +#@pytest.mark.parametrize('fname', [0,1,2]) +def test_get_date_pattern(caplog): + caplog.set_level(logging.DEBUG, logger='mopdb_log') + fname = 'ocean_month.nc-09961231' + fpattern = 'ocean_month.nc-' + dp = get_date_pattern(fname, fpattern) + date = ''.join(x for x in itertools.compress(fname,dp)) + assert date == '09961231' + fname = 'umnsa_cldrad_20160603T0000.nc' + fpattern = 'umnsa_cldrad_' + dp = get_date_pattern(fname, fpattern) + date = ''.join(x for x in itertools.compress(fname,dp)) + assert date == '201606030000' + fname = 'cw323a.pm095101_mon.nc' + fpattern = 'cw323a.pm' + dp = get_date_pattern(fname, fpattern) + date = ''.join(x for x in itertools.compress(fname,dp)) + assert date == '095101' diff --git a/tests/testdata/multitime.nc b/tests/testdata/multitime.nc new file mode 100644 index 0000000..fbee027 Binary files /dev/null and b/tests/testdata/multitime.nc differ diff --git a/tests/testdata/multitime_next.nc b/tests/testdata/multitime_next.nc new file mode 100644 index 0000000..5c76393 Binary files /dev/null and b/tests/testdata/multitime_next.nc differ diff --git a/tests/testdata/onetstep.nc b/tests/testdata/onetstep.nc new file mode 100644 index 0000000..ddeb565 Binary files /dev/null and b/tests/testdata/onetstep.nc differ diff --git a/tests/testdata/onetstep_next.nc b/tests/testdata/onetstep_next.nc new file mode 100644 index 0000000..ab9ffa4 Binary files /dev/null and b/tests/testdata/onetstep_next.nc differ diff --git a/tests/testdata/varlist.csv b/tests/testdata/varlist.csv new file mode 100644 index 0000000..f04cc58 --- /dev/null +++ b/tests/testdata/varlist.csv @@ -0,0 +1,6 @@ +name;cmor_var;units;dimensions;frequency;realm;cell_methods;cmor_table;vtype;size;nsteps;fpattern;long_name;standard_name +fld_s00i004;theta;K;time model_theta_level_number lat lon;mon;atmos;area: time: mean;CM2_mon;float32;9400320;12;cw323a.pm;THETA AFTER TIMESTEP;air_potential_temperature +fld_s00i010;hus;1;time model_theta_level_number lat lon;mon;atmos;area: time: mean;CMIP6_CFmon;float32;9400320;12;cw323a.pm;SPECIFIC HUMIDITY AFTER TIMESTEP;specific_humidity +fld_s00i024;ts;K;time lat lon;mon;atmos;area: time: mean;CMIP6_Amon;float32;110592;12;cw323a.pm;SURFACE TEMPERATURE AFTER TIMESTEP;surface_temperature +fld_s00i031;siconca;%;time lat lon;mon;atmos;area: time: mean;;float32;110592;12;cw323a.pm;FRAC OF SEA ICE IN SEA AFTER TSTEP;sea_ice_area_fraction +fld_s03i234;hfls;W m-2;time lat lon;mon;atmos;area: time: mean;CMIP6_Amon;float32;110592;12;cw323a.pm;SURFACE LATENT HEAT FLUX W/M2;surface_upward_latent_heat_flu