diff --git a/pyaerocom/scripts/testdata-minimal/TM5_subset.sh b/pyaerocom/scripts/testdata-minimal/TM5_subset.sh deleted file mode 100755 index b0e537e9f..000000000 --- a/pyaerocom/scripts/testdata-minimal/TM5_subset.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -x -# Extract a few lat/lon points to decrease filesize -LON=20,30 -LAT=20,30 - -ncks -d lat,"$LAT" -d lon,"$LON" aerocom3_TM5-met2010_AP3-CTRL2019_abs550aer_Column_2010_daily.nc ./aerocom3_TM5-met2010_AP3-CTRL2019_abs550aer_Column_2010_daily.nc - -ncks -d lat,"$LAT" -d lon,"$LON" aerocom3_TM5-met2010_AP3-CTRL2019_od550aer_Column_2010_daily.nc ./aerocom3_TM5-met2010_AP3-CTRL2019_od550aer_Column_2010_daily.nc diff --git a/pyaerocom/scripts/testdata-minimal/calc_example_coldata.py b/pyaerocom/scripts/testdata-minimal/calc_example_coldata.py deleted file mode 100755 index a5090544a..000000000 --- a/pyaerocom/scripts/testdata-minimal/calc_example_coldata.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import matplotlib.pyplot as plt - -import pyaerocom as pya - -plt.close("all") - -from pathlib import Path - -import pyaerocom.testdata_access as td -from pyaerocom.conftest import CHECK_PATHS - -tda = td.TestDataAccess() - -TESTDATADIR = Path(tda.testdatadir) - -OUTBASE = TESTDATADIR.joinpath("coldata") - -if not OUTBASE.exists(): - OUTBASE.mkdir() - -fpath = TESTDATADIR.joinpath(CHECK_PATHS["tm5aod"]) -if not fpath.exists(): - raise Exception("Unexpected error, please debug") -mod = pya.GriddedData(fpath) - -obs = pya.io.ReadAeronetSunV3("AeronetSunV3L2Subset.daily").read("od550aer") - -coldata = pya.colocation.colocate_gridded_ungridded(mod, obs) - -coldata.to_netcdf(OUTBASE) - -print(coldata.calc_statistics()) - -coldata.plot_coordinates() - -mod = mod.sel(latitude=(0, 3), longitude=(0, 4)) -cgg = pya.colocation.colocate_gridded_gridded(mod, mod) -cgg.data = cgg.data[:, :3] - -cgg.plot_scatter() - -cgg.to_netcdf(OUTBASE) - -pya.plot.mapping.plot_nmb_map_colocateddata(cgg) 
diff --git a/pyaerocom/scripts/testdata-minimal/create_subsets_emep.sh b/pyaerocom/scripts/testdata-minimal/create_subsets_emep.sh deleted file mode 100644 index aeffe1dcd..000000000 --- a/pyaerocom/scripts/testdata-minimal/create_subsets_emep.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -x - -INFILEPATH="/lustre/storeB/project/fou/kl/emep/ModelRuns/2019_REPORTING/EMEP01_L20EC_rv4_33.2017/Base_fullrun.nc" -VARIABLES="SURF_ug_O3,SURF_ppb_O3,SURF_ug_PM10_rh50,SURF_ug_PM25_rh50,SURF_ug_NO2" -TMPFILE="./tmp.nc" -LAT=50,52 -LON=10,12 - -ncks -d lat,"$LAT" -d lon,"$LON" -v "$VARIABLES" "$INFILEPATH" "$TMPFILE" - -# netcdf files with dimension set to unlimited takes up a lot of space. -# dump the file, change UNLIMITED to an integer and regenerate the file -OUTFILEPATH="./Base_fullrun.nc" -ncdump "$TMPFILE"| sed -e "s/UNLIMITED/1/" | ncgen -o "$OUTFILEPATH" -rm "$TMPFILE" - - -INFILEPATH="/lustre/storeB/project/fou/kl/emep/ModelRuns/2019_REPORTING/EMEP01_L20EC_rv4_33.2017/Base_month.nc" -ncks -d time,0,2 -d lat,"$LAT" -d lon,"$LON" -v "$VARIABLES" "$INFILEPATH" "$TMPFILE" -OUTFILEPATH="./Base_month.nc" -ncdump "$TMPFILE"| sed -e "s/UNLIMITED/3/" | ncgen -o "$OUTFILEPATH" -rm "$TMPFILE" - -INFILEPATH="/lustre/storeB/project/fou/kl/emep/ModelRuns/2019_REPORTING/EMEP01_L20EC_rv4_33.2017/Base_day.nc" -ncks -d time,0,2 -d lat,"$LAT" -d lon,"$LON" -v "$VARIABLES" "$INFILEPATH" "$TMPFILE" -OUTFILEPATH="./Base_day.nc" -ncdump "$TMPFILE"| sed -e "s/UNLIMITED/3/" | ncgen -o "$OUTFILEPATH" -rm "$TMPFILE" diff --git a/pyaerocom/scripts/testdata-minimal/create_subsets_ghost.py b/pyaerocom/scripts/testdata-minimal/create_subsets_ghost.py deleted file mode 100644 index 46d3e77a1..000000000 --- a/pyaerocom/scripts/testdata-minimal/create_subsets_ghost.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Create minimal testdataset for GHOST reader - -Created on Fri Feb 26 09:17:09 2021 - -@author: jonasg -""" -import os - -import matplotlib.pyplot as 
plt - -plt.close("all") -import xarray as xr - -import pyaerocom as pya - -path_in = os.path.join(pya.const.OUTPUTDIR, "data/obsdata/GHOST/data") - -path_out = os.path.join(pya.const.OUTPUTDIR, "testdata-minimal/obsdata/GHOST/data") - -assert os.path.exists(path_in) -assert os.path.exists(path_out) - -datasets = ["EEA_AQ_eReporting", "EBAS"] - -freqs = ["hourly", "daily"] - -varis = ["pm10", "sconco3"] -datesfiles = ["201810", "201911", "201912"] - -filename = lambda var, date: f"{var}_{date}.nc" - -files_out = [] -for dsname in datasets: - for freq in freqs: - indir = os.path.join(path_in, dsname, freq) - assert os.path.exists(indir) - outdir = os.path.join(path_out, dsname, freq) - os.makedirs(outdir, exist_ok=True) - assert os.path.exists(outdir) - for var in varis: - if var == "pm10": - dates = datesfiles - numst = 3 - - numts = None if freq == "daily" else 3 - - else: - dates = [datesfiles[0]] - numst = 1 - numts = 3 - for date in dates: - dir_in = os.path.join(indir, var) - dir_out = os.path.join(outdir, var) - os.makedirs(dir_out, exist_ok=True) - assert os.path.exists(dir_in) - fname = filename(var, date) - file_in = os.path.join(dir_in, fname) - file_out = os.path.join(dir_out, fname) - print(file_in) - print(file_out) - assert os.path.exists(file_in) - - ds = xr.open_dataset(file_in) - subset = ds.isel(station=slice(0, numst)) - if numts is not None: - subset = subset.isel(time=slice(0, numts)) - - subset.to_netcdf(file_out) - print("Saved") diff --git a/pyaerocom/scripts/testdata-minimal/README.md b/scripts/testdata-minimal/README.md similarity index 77% rename from pyaerocom/scripts/testdata-minimal/README.md rename to scripts/testdata-minimal/README.md index 5cdc98968..b09deed85 100644 --- a/pyaerocom/scripts/testdata-minimal/README.md +++ b/scripts/testdata-minimal/README.md @@ -1,4 +1,5 @@ # Scripts for test dataset creation of pyaerocom + This directory consists of scripts to create the minimal test dataset needed for automatic testing and 
continuous integration of pyaerocom. The scripts need access to Met Norway's internal file storage and are therefore @@ -8,8 +9,26 @@ they are included in the main pyaerocom gihub repository anyway. The minimal test data created from these scripts will usually go to the subdirectory `~/MyPyaerocom/testdata-minimal` Example model and observation data can be found in sub-directories `modeldata` and `obsdata`, respectively. -At this time only `create_subset_ebas.py` is running with the -latest version of pyaerocom +``` bash +python -m scripts.testdata-minimal --help +``` + +``` man +Usage: python -m scripts.testdata-minimal [OPTIONS] COMMAND [ARGS]... + + Create minimal test datasets for pyaerocom + +Options: + --help Show this message and exit. + +Commands: + Aeronet minimal Aeronet dataset + Colocated collocated data example + EBAS minimal EBAS dataset + EMEP minimal EMEP dataset + GHOST minimal GHOST dataset + TM5 minimal TM5 dataset +``` ## Data usage guidelines @@ -18,31 +37,34 @@ The data is generally NOT intended to be downloaded and used. If you download th general data policy terms and restrictions of each provided dataset apply. These will be listed in the following. ### AERONET data + See: [https://aeronet.gsfc.nasa.gov/new_web/data_usage.html](https://aeronet.gsfc.nasa.gov/new_web/data_usage.html) ### EBAS data + See: [https://ebas.nilu.no/](https://ebas.nilu.no/) Under "Data policy". ### Model data -- TM5 :Courtesy of Twan van Noije (KNMI) +- TM5: Courtesy of Twan van Noije (KNMI) ### Satellite data - MODIS: start with the [MODIS landing page](https://modis.gsfc.nasa.gov/data/) ## Updating testdata for CI + **Note:** The test data has to be updated by hand for CI to pickup the changes! Howto for that: -``` + +``` bash cd ~/MyPyaerocom mkdir -p ~/tmp tar -cvzf ~/tmp/testdata-minimal.tar.gz testdata-minimal ``` + The resulting file `~/tmp/testdata-minimal.tar.gz` then needs to be copied to the right place. 
Please ask your fellow developers in case you do not know how to do that. - - diff --git a/scripts/testdata-minimal/__init__.py b/scripts/testdata-minimal/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/testdata-minimal/__main__.py b/scripts/testdata-minimal/__main__.py new file mode 100644 index 000000000..aa0ec1d67 --- /dev/null +++ b/scripts/testdata-minimal/__main__.py @@ -0,0 +1,13 @@ +import typer + +from . import aeronet, coldata, ebas, emep, ghost, tm5 + +main = typer.Typer(help="Create minimal test datasets for pyaerocom", add_completion=False) +main.command(name="Aeronet")(aeronet.main) +main.command(name="Colocated")(coldata.main) +main.command(name="EBAS")(ebas.main) +main.command(name="EMEP")(emep.main) +main.command(name="GHOST")(ghost.main) +main.command(name="TM5")(tm5.main) + +main() diff --git a/pyaerocom/scripts/testdata-minimal/create_subsets_aeronet.py b/scripts/testdata-minimal/aeronet.py old mode 100755 new mode 100644 similarity index 91% rename from pyaerocom/scripts/testdata-minimal/create_subsets_aeronet.py rename to scripts/testdata-minimal/aeronet.py index e3f9ab38c..214dc8225 --- a/pyaerocom/scripts/testdata-minimal/create_subsets_aeronet.py +++ b/scripts/testdata-minimal/aeronet.py @@ -1,7 +1,5 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- """ -Goal +Minimal Aeronet subset for testing purposes """ import os @@ -10,13 +8,10 @@ from pathlib import Path import numpy as np +import typer import pyaerocom as pya - -OUTBASE = Path(pya.const._TESTDATADIR).joinpath("obsdata") - -if not OUTBASE.exists(): - OUTBASE.mkdir() +from tests.fixtures.data_access import DataForTests MIN_NUM_VALID = 300 @@ -36,7 +31,12 @@ ] revision_files = {} -if __name__ == "__main__": + + +def main( + out_path: Path = typer.Argument(DataForTests("obsdata").path, exists=True, dir_okay=True) +): + """minimal Aeronet dataset""" loaded = {} for name, varlist in NETWORKS.items(): @@ -95,7 +95,7 @@ for name, data in loaded.items(): 
data_id = IDS[name] - outdir = OUTBASE.joinpath(data_id) + outdir = out_path / data_id # make sure to remove old data if outdir.exists(): print("REMOVING EXISTING DATA FOR {}".format(data_id)) diff --git a/scripts/testdata-minimal/coldata.py b/scripts/testdata-minimal/coldata.py new file mode 100644 index 000000000..938121187 --- /dev/null +++ b/scripts/testdata-minimal/coldata.py @@ -0,0 +1,35 @@ +from pathlib import Path + +import typer + +import pyaerocom as pya +from tests.fixtures.data_access import DataForTests +from tests.fixtures.tm5 import CHECK_PATHS + +MOD_PATH = DataForTests(CHECK_PATHS.tm5aod).path +OUT_PATH = DataForTests("coldata").path + + +def main( + mod_path: Path = typer.Argument(MOD_PATH, exists=True, dir_okay=True), + out_path: Path = typer.Argument(OUT_PATH, exists=True, dir_okay=True), +): + """collocated data example""" + + mod = pya.GriddedData(mod_path) + obs = pya.io.ReadAeronetSunV3("AeronetSunV3L2Subset.daily").read("od550aer") + + coldata = pya.colocation.colocate_gridded_ungridded(mod, obs) + coldata.to_netcdf(out_path) + print(coldata.calc_statistics()) + + coldata.plot_coordinates() + + mod = mod.sel(latitude=(0, 3), longitude=(0, 4)) + cgg = pya.colocation.colocate_gridded_gridded(mod, mod) + cgg.data = cgg.data[:, :3] + + cgg.plot_scatter() + cgg.to_netcdf(out_path) + + pya.plot.mapping.plot_nmb_map_colocateddata(cgg) diff --git a/pyaerocom/scripts/testdata-minimal/create_subset_ebas.py b/scripts/testdata-minimal/ebas.py old mode 100755 new mode 100644 similarity index 89% rename from pyaerocom/scripts/testdata-minimal/create_subset_ebas.py rename to scripts/testdata-minimal/ebas.py index 67270ee54..9530b9413 --- a/pyaerocom/scripts/testdata-minimal/create_subset_ebas.py +++ b/scripts/testdata-minimal/ebas.py @@ -1,51 +1,31 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -"""simple script to generate a small enough test data set for the EBAS obs network +""" +Simple script to generate a small enough test data set for the 
EBAS obs network Works only if the user has access to the standard EBAS data path at Met Norway """ import os import shutil +from importlib import resources from pathlib import Path import simplejson +import typer import pyaerocom as pya +from tests.fixtures.data_access import DataForTests -# import pyaerocom.access_testdata as td -from pyaerocom.access_testdata import AccessTestData - -# from getpass import getuser -# -# if getuser() == 'jonasg': -# ebas_local = os.path.join(pya.const.OUTPUTDIR, 'data/obsdata/EBASMultiColumn/data') -# assert os.path.exists(ebas_local) -# else: -# ebas_local=None - +OUTBASE = DataForTests("obsdata/EBASMultiColumn").path +SCRIPT_BASE_DIR = DataForTests("scripts").path -tda = AccessTestData() - -TESTDATADIR = tda.basedir - -OUTBASE = Path(TESTDATADIR).joinpath("testdata-minimal/obsdata/EBASMultiColumn") -SCRIPT_BASE_DIR = Path(TESTDATADIR).joinpath("testdata-minimal/scripts") - -FILES_DEST = OUTBASE.joinpath("data") +FILES_DEST = OUTBASE / "data" UPDATE = True UPDATE_EXISTING = False SEARCH_PROBLEM_FILES = False NAME = "EBASMC" -# if ebas_local is not None: -# FILES_SRC = ebas_local -# else: EBAS_BASE_DIR = "/lustre/storeA/project/aerocom/aerocom1/AEROCOM_OBSDATA/EBASMultiColumn/data/" -assert os.path.exists(EBAS_BASE_DIR) - -JSON_FILE = SCRIPT_BASE_DIR.joinpath("ebas_files.json") +JSON_FILE = SCRIPT_BASE_DIR / "ebas_files.json" # ------------------------------------------------------------ # add some files with known problems @@ -73,9 +53,10 @@ def check_outdated(filedir): files_invalid = [] files_valid = [] - with open(JSON_FILE, "r") as f: + with resources.path(__package__, JSON_FILE.name) as path: + shutil.copy(path, JSON_FILE) - data = simplejson.load(f) + data = simplejson.loads(JSON_FILE.read_text()) for var, stats in data.items(): for stat, files in stats.items(): @@ -155,12 +136,11 @@ def get_files_var_statnum(data, var, statnum): return files -def main(): +def main(ebas_path: Path = typer.Argument(EBAS_BASE_DIR, 
exists=True, dir_okay=True)): + """minimal EBAS dataset""" - # reader = pya.io.ReadUngridded(NAME, data_dir=EBAS_BASE_DIR) - reader = pya.io.ReadUngridded( - NAME, - ) + # reader = pya.io.ReadUngridded(NAME, data_dir=ebas_path) + reader = pya.io.ReadUngridded(NAME) r_lowlev = reader.get_lowlevel_reader(NAME) # r_lowlev._dataset_path = ebas_local @@ -246,7 +226,7 @@ def main(): print("NOTHING WILL BE COPIED TO TEST DATA") else: - src = Path(EBAS_BASE_DIR).joinpath("data") + src = ebas_path / "data" print(f"updating test data @ {r_lowlev.DATASET_PATH}") # copy revision file diff --git a/pyaerocom/scripts/testdata-minimal/ebas_files.json b/scripts/testdata-minimal/ebas_files.json similarity index 100% rename from pyaerocom/scripts/testdata-minimal/ebas_files.json rename to scripts/testdata-minimal/ebas_files.json diff --git a/scripts/testdata-minimal/emep.py b/scripts/testdata-minimal/emep.py new file mode 100644 index 000000000..e4a95c7fa --- /dev/null +++ b/scripts/testdata-minimal/emep.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from pathlib import Path + +import typer +import xarray as xr + +from tests.fixtures.mscw_ctm import EMEP_DATA_PATH + +SRC_DATA_PATH = Path("/lustre/storeB/project/fou/kl/emep/ModelRuns") +SRC_DATA_PATH /= "2019_REPORTING/EMEP01_L20EC_rv4_33.2017" + +VARIABLES = ["SURF_ug_O3", "SURF_ppb_O3", "SURF_ug_PM10_rh50", "SURF_ug_PM25_rh50", "SURF_ug_NO2"] +LAT, LON = slice(50, 52), slice(10, 12) + +PATHS = ( + EMEP_DATA_PATH / "Base_day.nc", + EMEP_DATA_PATH / "Base_month.nc", + EMEP_DATA_PATH / "Base_fullrun.nc", +) + + +def reduce_dims(ds: xr.Dataset) -> xr.Dataset: + """crop domain and remove "unlimited" from time coordinate""" + del ds.encoding["unlimited_dims"] + return ds.isel(lon=LON, lat=LAT) + + +def atomic_write(ds: xr.Dataset, path: Path, **kwargs) -> None: + """write dataset to a netcdf file atomically""" + tmp = path.with_suffix(".tmp") + try: + ds.to_netcdf(tmp, **kwargs) + tmp.rename(path) + finally: + 
tmp.unlink(missing_ok=True) + + +def main(emep_path: Path = typer.Argument(SRC_DATA_PATH, exists=True, dir_okay=True)): + """minimal EMEP dataset""" + for path in PATHS: + ds = xr.open_dataset(emep_path / path.name)[VARIABLES].pipe(reduce_dims) + atomic_write(ds, path) diff --git a/scripts/testdata-minimal/ghost.py b/scripts/testdata-minimal/ghost.py new file mode 100644 index 000000000..2aef202c2 --- /dev/null +++ b/scripts/testdata-minimal/ghost.py @@ -0,0 +1,50 @@ +""" +Create minimal testdataset for GHOST reader +""" +from itertools import product +from pathlib import Path + +import typer +import xarray as xr + +import pyaerocom as pya +from tests.fixtures.data_access import DataForTests + +PATH_IN = Path(pya.const.OUTPUTDIR) / "data/obsdata/GHOST/data" +PATH_OUT = DataForTests("obsdata/GHOST/data").path +DATASETS = ["EBAS"] +FREQS = ["hourly", "daily"] +VARS = ["pm10", "sconco3"] +DATES = ["201810", "201911", "201912"] + + +def main( + path_in: Path = typer.Argument(PATH_IN, exists=True, dir_okay=True), + path_out: Path = typer.Argument(PATH_OUT, exists=True, dir_okay=True), +): + """minimal GHOST dataset""" + for dsname, freq, var in product(DATASETS, FREQS, VARS): + if var == "pm10": + dates = DATES + numst = 3 + numts = None if freq == "daily" else 3 + else: + dates = DATES[0:1] + numst = 1 + numts = 3 + for date in dates: + file_in = path_in / dsname / freq / var / f"{var}_{date}.nc" + assert file_in.exists(), f"missing {file_in}" + + file_out = path_out / file_in.relative_to(path_in) + file_out.parent.mkdir(exist_ok=True, parents=True) + print(file_in) + print(file_out) + + ds = xr.open_dataset(file_in) + ds = ds.isel(station=slice(0, numst)) + if numts is not None: + ds = ds.isel(time=slice(0, numts)) + + ds.to_netcdf(file_out) + print("Saved") diff --git a/scripts/testdata-minimal/tm5.py b/scripts/testdata-minimal/tm5.py new file mode 100644 index 000000000..f4282e594 --- /dev/null +++ b/scripts/testdata-minimal/tm5.py @@ -0,0 +1,32 @@ +from __future__ 
import annotations + +from pathlib import Path + +import typer +import xarray as xr + +from tests.fixtures.tm5 import TM5_DATA_PATH + +from .emep import atomic_write + +SRC_DATA_PATH = Path("/lustre/storeA/project/aerocom/aerocom-users-database") +SRC_DATA_PATH /= "AEROCOM-PHASE-III-2019/TM5-met2010_AP3-CTRL2019/renamed" + +LON, LAT = slice(20, 30), slice(20, 30) + +PATHS = { + TM5_DATA_PATH / "aerocom3_TM5-met2010_AP3-CTRL2019_abs550aer_Column_2010_daily.nc", + TM5_DATA_PATH / "aerocom3_TM5-met2010_AP3-CTRL2019_od550aer_Column_2010_daily.nc", +} + + +def reduce_dims(ds: xr.Dataset) -> xr.Dataset: + """crop domain""" + return ds.isel(lon=LON, lat=LAT) + + +def main(tm5_path: Path = typer.Argument(SRC_DATA_PATH, exists=True, dir_okay=True)): + """minimal TM5 dataset""" + for path in PATHS: + ds = xr.open_dataset(tm5_path / path.name).pipe(reduce_dims) + atomic_write(ds, path)