Merge pull request #38 from metno/37-harp-reader-is-missing-mandatory…

…-parameter-filters add filters argument
metno · May 6, 2024 · 5a476bc · 5a476bc
2 parents 1e9dc62 + 458edb7
commit 5a476bc
Show file tree

Hide file tree

Showing 6 changed files with 273 additions and 65 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = pyaro_readers
-version = 0.0.4dev
+version = 0.0.5
 author = MET Norway
 description = implementations of pyaerocom reading plugings using pyaro as interface
 long_description = file: README.md
@@ -22,7 +22,7 @@ url = https://github.com/metno/pyaro-readers
 python_version = >=3.9
 install_requires =
     pyaro >= 0.0.6
-    geocoder_reverse_natural_earth >= 0.0.1
+    geocoder_reverse_natural_earth >= 0.0.2
     netCDF4
     requests
     tqdm
@@ -31,7 +31,7 @@ install_requires =
 
 package_dir =
     =src
-packages = pyaro_readers.aeronetsunreader, pyaro_readers.aeronetsdareader, pyaro_readers.ascii2netcdf, pyaro_readers.harpreader
+packages = pyaro_readers.aeronetsunreader, pyaro_readers.aeronetsdareader, pyaro_readers.ascii2netcdf, pyaro_readers.harpreader, pyaro_readers
 test_require = tox:tox
 
 [options.package_data]

diff --git a/src/pyaro_readers/__init__.py b/src/pyaro_readers/__init__.py
@@ -0,0 +1,5 @@
+from importlib import metadata
+
+__version__ = metadata.version(__package__)
+
+from .units_helpers import *
diff --git a/src/pyaro_readers/harpreader/harpreader.py b/src/pyaro_readers/harpreader/harpreader.py
@@ -11,10 +11,10 @@
 import os
 import xarray as xr
 import numpy as np
-from collections import namedtuple
-import re
+from pathlib import Path
+from tqdm import tqdm
 import cfunits
-import pyaro
+from pyaro_readers.units_helpers import UALIASES
 
 logger = logging.getLogger(__name__)
 
@@ -28,26 +28,76 @@ class AeronetHARPReader(AutoFilterReaderEngine.AutoFilterReader):
     Reader for netCDF files which follow the HARP convention.
     """
 
-    def __init__(self, file: str):
-        self._filters = []
-        if os.path.isfile(file):
-            self._file = file
+    FILE_MASK = "*.nc"
+    COORD_NAMES = [
+        "latitude",
+        "longitude",
+        "altitude",
+        "datetime_start",
+        "datetime_stop",
+    ]
+
+    def __init__(
+        self,
+        file: [Path, str],
+        filters=[],
+        vars_to_read: list[str] = None,
+    ):
+        self._filters = filters
+        realpath = Path(file).resolve()
+
+        self._data = {}
+        self._files = []
+        self._stations = {}
+        self._vars_to_read = vars_to_read
+        self._set_filters(filters)
+
+        # variable include filter comes like this
+        # {'variables': {'include': ['PM10_density']}}
+        # test for variable filter
+        if "variables" in filters:
+            if "include" in filters["variables"]:
+                vars_to_read = filters["variables"]["include"]
+                self._vars_to_read = vars_to_read
+                logger.info(f"applying variable include filter {vars_to_read}...")
+
+        if os.path.isfile(realpath) or os.path.isdir(realpath):
+            pass
         else:
-            raise HARPReaderException(f"No such file: {file}")
+            raise HARPReaderException(f"No such file or directory: {file}")
 
-        with xr.open_dataset(self._file) as harp:
-            if harp.attrs.get("Conventions", None) != "HARP-1.0":
-                raise ValueError(f"File is not a HARP file.")
-
-        self._variables = self._read_file_variables()
-
-    def _unfiltered_stations(self) -> dict[str, Station]:
-        pass
-
-    def close(self):
-        pass
-
-    def _read_file_variables(self) -> dict[str, str]:
+        if os.path.isdir(file):
+            pattern = os.path.join(file, self.FILE_MASK)
+            self._files = glob.glob(pattern)
+        else:
+            self._files.append(file)
+
+        bar = tqdm(total=len(self._files))
+
+        for f_idx, _file in enumerate(self._files):
+            logger.info(f"Reading {_file}")
+            bar.update(1)
+            self._variables = self._read_file_variables(_file)
+            # initialise all variables if not done yet
+            for _var in self._variables:
+                # skip coordinate names
+                if _var in self.COORD_NAMES:
+                    continue
+                if vars_to_read is not None and _var not in vars_to_read:
+                    logger.info(f"Skipping {_var}")
+                    continue
+                if _var not in self._data:
+                    units = self._variables[_var]
+                    data = NpStructuredData(_var, units)
+                    self._data[_var] = data
+
+                self._get_data_from_single_file(
+                    _file,
+                    _var,
+                )
+        bar.close()
+
+    def _read_file_variables(self, filename) -> dict[str, str]:
         """Returns a mapping of variable name to unit for the dataset.
 
         Returns:
@@ -57,44 +107,26 @@ def _read_file_variables(self) -> dict[str, str]:
 
         """
         variables = {}
-        with xr.open_dataset(self._file, decode_cf=False) as d:
+        with xr.open_dataset(
+            filename,
+            decode_cf=False,
+        ) as d:
             for vname, var in d.data_vars.items():
-                variables[vname] = cfunits.Units(var.attrs["units"])
+                if vname in self._vars_to_read:
+                    # Units in pyaro arte by definition strings, but this way
+                    # we can make sure that cfunits understands them
+                    # otherwise variables[vname] = var.attrs["units"] should work as well
+                    variables[vname] = str(cfunits.Units(var.attrs["units"]))
+                    if variables[vname] in UALIASES:
+                        variables[vname] = UALIASES[variables[vname]]
 
         return variables
 
-    def _unfiltered_data(self, varname: str) -> NpStructuredData:
-        """Returns unfiltered data for a variable.
-
-        Parameters:
-        -----------
-        varname : str
-            The variable name for which to return the data.
-
-        Returns:
-        --------
-        NpStructuredArray
-            The data.
-
-        """
-
-        units = self._variables[varname]
-        data = NpStructuredData(varname, units)
-
-        pattern = ""
-        if os.path.isdir(self._file):
-            pattern = os.path.join(self._file, "*.nc")
-        else:
-            pattern = self._file
-
-        for f in glob.glob(pattern):
-            self._get_data_from_single_file(f, varname, data)
-
-        return data
-
     def _get_data_from_single_file(
-        self, file: str, varname: str, data: NpStructuredData
-    ) -> None:
+        self,
+        file: str,
+        varname: str,
+    ) -> bool:
         """Loads data for a variable from a single file.
 
         Parameters:
@@ -107,20 +139,30 @@ def _get_data_from_single_file(
             Data instance to which the data will be appended to in-place.
 
         """
-        dt = xr.open_dataset(file)
+        dt = xr.load_dataset(file)
+
+        if dt.attrs.get("Conventions", None) != "HARP-1.0":
+            raise ValueError(f"File {file} is not a HARP file.")
 
         values = dt[varname].to_numpy()
+        # take station name from filename since there is no name in the data...
+        stat_name = os.path.basename(file).split("-")[2]
 
         values_length = len(values)
         start_time = np.asarray(dt["datetime_start"])
         stop_time = np.asarray(dt["datetime_stop"])
+        # start and stop time have been the same in the 1st data revision
+        # check that and assume hourly data if it's still the case
+        t_diff = stop_time - start_time
+        if t_diff.sum() == 0:
+            stop_time = stop_time + np.timedelta64(1, "h")
         lat = np.asarray([dt["latitude"]] * values_length)
         long = np.asarray([dt["longitude"]] * values_length)
-        station = np.asarray([np.nan] * values_length)
+        station = np.asarray([stat_name] * values_length)
         altitude = np.asarray([dt["altitude"]] * values_length)
 
         flags = np.asarray([Flag.VALID] * values_length)
-        data.append(
+        self._data[varname].append(
             value=values,
             station=station,
             latitude=lat,
@@ -133,14 +175,35 @@ def _get_data_from_single_file(
             standard_deviation=np.asarray([np.nan] * values_length),
         )
 
+        # fill self._stations
+
+        if not stat_name in self._stations:
+            self._stations[stat_name] = Station(
+                {
+                    "station": stat_name,
+                    "longitude": long[0],
+                    "latitude": lat[0],
+                    "altitude": altitude[0],
+                    "country": "NN",
+                    "url": "",
+                    "long_name": stat_name,
+                }
+            )
+
     def _unfiltered_variables(self) -> list[str]:
         """Returns a list of the variable names.
 
         Returns:
         list[str]
             The list of variable names.
         """
-        return list(self._variables.keys())
+        return self._data.keys()
+
+    def _unfiltered_data(self, varname) -> Data:
+        return self._data[varname]
+
+    def _unfiltered_stations(self) -> dict[str, Station]:
+        return self._stations
 
     def close(self):
         pass

diff --git a/src/pyaro_readers/units_helpers.py b/src/pyaro_readers/units_helpers.py
@@ -0,0 +1,101 @@
+import pandas as pd
+
+
+class UnitConversionError(ValueError):
+    pass
+
+
+#: default frequency for rates variables (e.g. deposition, precip)
+RATES_FREQ_DEFAULT = "d"
+
+# 1. DEFINITION OF ATOM and MOLECULAR MASSES
+
+# Atoms
+M_O = 15.999  # u
+M_S = 32.065  # u
+M_N = 14.0067  # u
+M_H = 1.00784  # u
+
+# Molecules
+M_SO2 = M_S + 2 * M_O
+M_SO4 = M_S + 4 * M_O
+
+M_NO2 = M_N + 2 * M_O
+M_NO3 = M_N + 3 * M_O
+
+M_NH3 = M_N + 3 * M_H
+M_NH4 = M_N + 4 * M_H
+
+# Unit conversion and custom units definitions
+
+# 2.1 Other conversion factors
+HA_TO_SQM = 10000  # hectar to square metre.
+
+# 3. LOOKUP TABLE FOR CONVERSION FACTORS
+
+#: Custom unit conversion factors for certain variables
+#: columns: variable -> from unit -> to_unit -> conversion
+#: factor
+UCONV_MUL_FACS = pd.DataFrame(
+    [
+        # ["dryso4", "mg/m2/d", "mgS m-2 d-1", M_S / M_SO4],
+        # ["drynh4", "mg/m2/d", "mgN m-2 d-1", M_N/ M_NH4],
+        # ["concso4", "ug S/m3", "ug m-3", M_SO4 / M_S],
+        # ["SO4ugSm3", "ug/m3", "ug S m-3", M_S / M_SO4],
+        # ["concso4pm25", "ug S/m3", "ug m-3", M_SO4 / M_S],
+        # ["concso4pm10", "ug S/m3", "ug m-3", M_SO4 / M_S],
+        ["concso2", "ug S/m3", "ug m-3", M_SO2 / M_S],
+        ["concbc", "ug C/m3", "ug m-3", 1.0],
+        ["concoa", "ug C/m3", "ug m-3", 1.0],
+        ["concoc", "ug C/m3", "ug m-3", 1.0],
+        ["conctc", "ug C/m3", "ug m-3", 1.0],
+        # a little hacky for ratpm10pm25...
+        # ["ratpm10pm25", "ug m-3", "1", 1.0],
+        ["concpm25", "ug m-3", "1", 1.0],
+        ["concpm10", "ug m-3", "1", 1.0],
+        ["concno2", "ug N/m3", "ug m-3", M_NO2 / M_N],
+        # ["concno3", "ug N/m3", "ug m-3", M_NO3 / M_N],
+        ["concnh3", "ug N/m3", "ug m-3", M_NH3 / M_N],
+        # ["concnh4", "ug N/m3", "ug m-3", M_NH4 / M_N],
+        ["wetso4", "kg S/ha", "kg m-2", M_SO4 / M_S / HA_TO_SQM],
+        ["concso4pr", "mg S/L", "g m-3", M_SO4 / M_S],
+    ],
+    columns=["var_name", "from", "to", "fac"],
+).set_index(["var_name", "from"])
+
+# may be used to specify alternative names for custom units  defined
+# in UCONV_MUL_FACS
+
+UALIASES = {
+    # mass concentrations
+    # "ug S m-3": "ug S/m3",
+    # "ug C m-3": "ug C/m3",
+    # "ug N m-3": "ug N/m3",
+    "ugC/m3": "ug C m-3",
+    "ug/m3": "ug m-3",
+    # deposition rates (implicit)
+    ## sulphur species
+    "mgS/m2": "mg S m-2",
+    "mgSm-2": "mg S m-2",
+    ## nitrogen species
+    "mgN/m2": "mg N m-2",
+    "mgNm-2": "mg N m-2",
+    # deposition rates (explicit)
+    ## sulphur species
+    "mgS/m2/h": "mg S m-2 h-1",
+    "mg/m2/h": "mg m-2 h-1",
+    "mgS/m**2/h": "mg S m-2 h-1",
+    "mgSm-2h-1": "mg S m-2 h-1",
+    "mgSm**-2h-1": "mg S m-2 h-1",
+    "mgS/m2/d": "mg S m-2 d-1",
+    ## nitrogen species
+    "mgN/m2/h": "mg N m-2 h-1",
+    "mgN/m**2/h": "mg N m-2 h-1",
+    "mgNm-2h-1": "mg N m-2 h-1",
+    "mgNm**-2h-1": "mg N m-2 h-1",
+    "mgN/m2/d": "mg N m-2 d-1",
+    ## others
+    "MM/H": "mm h-1",
+    # others
+    "/m": "m-1",
+}