diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 23af6525..169b2906 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
---
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.5.0
+ rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
@@ -12,12 +12,12 @@ repos:
- id: check-ast
- id: check-added-large-files
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.3.5
+ rev: v0.7.1
hooks:
- id: ruff
args: [--fix]
- repo: https://github.com/psf/black
- rev: 24.3.0
+ rev: 24.10.0
hooks:
- id: black
language_version: python3
@@ -27,18 +27,18 @@ repos:
- id: blackdoc
additional_dependencies: ["black[jupyter]"]
- repo: https://github.com/pre-commit/mirrors-prettier
- rev: "v3.1.0"
+ rev: "v4.0.0-alpha.8"
hooks:
- id: prettier
types_or: [yaml, html, css, scss, javascript, json] # markdown to avoid conflicts with mdformat
- repo: https://github.com/codespell-project/codespell
- rev: v2.2.6
+ rev: v2.3.0
hooks:
- id: codespell
types_or: [python, markdown, rst]
additional_dependencies: [tomli]
- repo: https://github.com/asottile/pyupgrade
- rev: v3.15.2
+ rev: v3.19.0
hooks:
- id: pyupgrade
- repo: https://github.com/MarcoGorelli/madforhooks
@@ -47,7 +47,7 @@ repos:
# - id: conda-env-sorter # conflicts with prettier
- id: check-execution-order
- repo: https://github.com/executablebooks/mdformat
- rev: 0.7.17
+ rev: 0.7.18
hooks:
- id: mdformat
additional_dependencies: [mdformat-gfm, mdformat-black]
@@ -58,7 +58,7 @@ repos:
- id: nbstripout
args: [--keep-output]
- repo: https://github.com/nbQA-dev/nbQA
- rev: 1.8.5
+ rev: 1.8.7
hooks:
- id: nbqa-black
- id: nbqa-ruff
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 259566b2..573a8044 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -5,7 +5,7 @@
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
-identity and expression, level of experience, education, socio-economic status,
+identity and expression, level of experience, education, socioeconomic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
diff --git a/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_1.yml b/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_1.yml
index 58d5240e..18bf3020 100644
--- a/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_1.yml
+++ b/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_1.yml
@@ -36,7 +36,7 @@ firmware_version: ""
sensor_beam_length: ""
sensor_beam_width: ""
sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 30
calibration_sensitivity: ""
calibration_certification_date: ""
calibration_certification_url: ""
diff --git a/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_2.yml b/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_2.yml
index 12e2198d..424ab818 100644
--- a/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_2.yml
+++ b/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_2.yml
@@ -36,7 +36,7 @@ firmware_version: ""
sensor_beam_length: ""
sensor_beam_width: ""
sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 30
calibration_sensitivity: ""
calibration_certification_date: ""
calibration_certification_url: ""
diff --git a/disdrodb/__init__.py b/disdrodb/__init__.py
index 79129c0a..4c5153e4 100644
--- a/disdrodb/__init__.py
+++ b/disdrodb/__init__.py
@@ -1,4 +1,23 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""DISDRODB software."""
+
import contextlib
+import importlib.metadata
import os
from importlib.metadata import PackageNotFoundError, version
@@ -18,6 +37,11 @@
check_archive_metadata_geolocation,
)
+PRODUCT_VERSION = "V0"
+SOFTWARE_VERSION = "V" + importlib.metadata.version("disdrodb")
+CONVENTIONS = "CF-1.10, ACDD-1.3"
+
+
__all__ = [
"define_configs",
"available_stations",
diff --git a/disdrodb/api/checks.py b/disdrodb/api/checks.py
index de5b5296..1c5b14ea 100644
--- a/disdrodb/api/checks.py
+++ b/disdrodb/api/checks.py
@@ -24,11 +24,11 @@
from disdrodb.api.info import infer_disdrodb_tree_path_components
from disdrodb.api.path import (
+ define_data_dir,
define_issue_dir,
define_issue_filepath,
define_metadata_dir,
define_metadata_filepath,
- define_station_dir,
)
from disdrodb.utils.directories import (
ensure_string_path,
@@ -70,10 +70,7 @@ def check_url(url: str) -> bool:
``True`` if url well formatted, ``False`` if not well formatted.
"""
regex = r"^(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)$" # noqa: E501
-
- if re.match(regex, url):
- return True
- return False
+    return re.match(regex, url) is not None
def check_path_is_a_directory(dir_path, path_name=""):
@@ -95,6 +92,7 @@ def check_directories_inside(dir_path):
def check_base_dir(base_dir: str):
"""Raise an error if the path does not end with ``DISDRODB``."""
base_dir = str(base_dir) # convert Pathlib to string
+ base_dir = os.path.normpath(base_dir)
if not base_dir.endswith("DISDRODB"):
raise ValueError(f"The path {base_dir} does not end with DISDRODB. Please check the path.")
return base_dir
@@ -150,7 +148,7 @@ def check_product(product):
"""Check DISDRODB product."""
if not isinstance(product, str):
raise TypeError("`product` must be a string.")
- valid_products = ["RAW", "L0A", "L0B"]
+ valid_products = ["RAW", "L0A", "L0B", "L0C", "L1", "L2E", "L2M", "L2S"]
if product.upper() not in valid_products:
msg = f"Valid `products` are {valid_products}."
logger.error(msg)
@@ -158,45 +156,68 @@ def check_product(product):
return product
-def check_station_dir(product, data_source, campaign_name, station_name, base_dir=None):
- """Check existence of the station data directory. If does not exists, raise an error."""
- station_dir = define_station_dir(
+def has_available_data(
+ data_source,
+ campaign_name,
+ station_name,
+ product,
+ base_dir=None,
+ # Option for L2E
+ sample_interval=None,
+ rolling=None,
+ # Option for L2M
+ model_name=None,
+):
+ """Return ``True`` if data are available for the given product and station."""
+ # Define product directory
+ data_dir = define_data_dir(
product=product,
base_dir=base_dir,
data_source=data_source,
campaign_name=campaign_name,
station_name=station_name,
+ # Option for L2E
+ sample_interval=sample_interval,
+ rolling=rolling,
+ # Option for L2M
+ model_name=model_name,
+ # Directory options
check_exists=False,
)
- if not os.path.exists(station_dir) and os.path.isdir(station_dir):
- msg = f"The station {station_name} data directory does not exist at {station_dir}."
- logger.error(msg)
- raise ValueError(msg)
- return station_dir
-
+    # If the product directory does not exist, return False
+ if not os.path.isdir(data_dir):
+ return False
-def has_available_station_files(product, data_source, campaign_name, station_name, base_dir=None):
- """Return ``True`` if data are available for the given product and station."""
- station_dir = check_station_dir(
- product=product,
- base_dir=base_dir,
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name,
- )
- filepaths = list_files(station_dir, glob_pattern="*", recursive=True)
+ # If no files, return False
+ filepaths = list_files(data_dir, glob_pattern="*", recursive=True)
nfiles = len(filepaths)
return nfiles >= 1
-def check_station_has_data(product, data_source, campaign_name, station_name, base_dir=None):
- """Check the station data directory has data inside. If not, raise an error."""
- if not has_available_station_files(
+def check_data_availability(
+ product,
+ data_source,
+ campaign_name,
+ station_name,
+ base_dir=None,
+ # Option for L2E
+ sample_interval=None,
+ rolling=None,
+ # Option for L2M
+ model_name=None,
+):
+ """Check the station product data directory has files inside. If not, raise an error."""
+ if not has_available_data(
product=product,
base_dir=base_dir,
data_source=data_source,
campaign_name=campaign_name,
station_name=station_name,
+ # Option for L2E
+ sample_interval=sample_interval,
+ rolling=rolling,
+ # Option for L2M
+ model_name=model_name,
):
msg = f"The {product} station data directory of {data_source} {campaign_name} {station_name} is empty !"
logger.error(msg)
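
Illustrative usage of the refactored availability helpers above. This is only a sketch: the data source, campaign and station names are placeholders, and a configured local DISDRODB archive is assumed.

from disdrodb.api.checks import check_data_availability, has_available_data

# Placeholder identifiers; replace with stations present in your archive.
if has_available_data(
    data_source="EXAMPLE_SOURCE",
    campaign_name="EXAMPLE_CAMPAIGN",
    station_name="STATION_1",
    product="L2E",
    sample_interval=60,  # required only for L2E/L2M
    rolling=False,       # required only for L2E/L2M
):
    print("L2E files found.")

# Same check, but raising an error if the product directory is empty or missing.
check_data_availability(
    product="L0B",
    data_source="EXAMPLE_SOURCE",
    campaign_name="EXAMPLE_CAMPAIGN",
    station_name="STATION_1",
)
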
@@ -271,6 +292,7 @@ def check_issue_dir(data_source, campaign_name, base_dir=None):
def check_issue_file(data_source, campaign_name, station_name, base_dir=None):
"""Check existence of a valid issue YAML file. If does not exists, raise an error."""
from disdrodb.issue.checks import check_issue_compliance
+ from disdrodb.issue.writer import create_station_issue
_ = check_issue_dir(
base_dir=base_dir,
@@ -286,9 +308,12 @@ def check_issue_file(data_source, campaign_name, station_name, base_dir=None):
)
# Check existence
if not os.path.exists(issue_filepath):
- msg = f"The issue YAML file of {data_source} {campaign_name} {station_name} does not exist at {issue_filepath}."
- logger.error(msg)
- raise ValueError(msg)
+ create_station_issue(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
# Check validity
check_issue_compliance(
@@ -398,7 +423,7 @@ def check_raw_dir(raw_dir: str, station_name: str) -> None:
check_directories_inside(raw_dir)
# Check there is data in the station directory
- check_station_has_data(
+ check_data_availability(
product="RAW",
base_dir=base_dir,
data_source=data_source,
diff --git a/disdrodb/api/create_directories.py b/disdrodb/api/create_directories.py
index cf31f1d9..af91a95b 100644
--- a/disdrodb/api/create_directories.py
+++ b/disdrodb/api/create_directories.py
@@ -19,7 +19,7 @@
"""Tools to create Raw, L0A and L0B DISDRODB directories."""
# L0A and L0B from raw NC: create_l0_directory_structure(raw_dir, processed_dir)
-# L0B: create_directory_structure(processed_dir)
+# L0B: create_product_directory(processed_dir)
import logging
import os
@@ -27,12 +27,12 @@
from typing import Optional
from disdrodb.api.checks import (
+ check_data_availability,
check_metadata_file,
check_processed_dir,
check_product,
check_raw_dir,
- check_station_has_data,
- has_available_station_files,
+ has_available_data,
)
from disdrodb.api.info import (
infer_campaign_name_from_path,
@@ -41,16 +41,18 @@
)
from disdrodb.api.path import (
define_campaign_dir,
+ define_data_dir,
define_issue_dir,
define_issue_filepath,
+ define_logs_dir,
define_metadata_dir,
define_metadata_filepath,
define_station_dir,
)
from disdrodb.configs import get_base_dir
from disdrodb.utils.directories import (
- check_directory_exists,
copy_file,
+ create_directory,
create_required_directory,
remove_if_exists,
)
@@ -162,52 +164,17 @@ def _copy_station_metadata(
)
-def _check_pre_existing_station_data(
- data_source: str,
- campaign_name: str,
- station_name: str,
- product: str,
- base_dir=None,
- force=False,
-):
- """Check for pre-existing station data.
-
- - If ``force=True``, remove all data inside the station directory.
- - If ``force=False``, raise error.
- """
- # NOTE: ``force=False`` behaviour could be changed to enable updating of missing files.
- # This would require also adding code to check whether a downstream file already exist.
-
- # Check if there are available data
- available_data = has_available_station_files(
- product=product,
- base_dir=base_dir,
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name,
- )
- # Define the station directory path
- station_dir = define_station_dir(
- product=product,
- base_dir=base_dir,
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name,
- )
- # If the station data are already present:
- # - If force=True, remove all data inside the station directory
- # - If force=False, raise error
- if available_data:
- # Check is a directory
- check_directory_exists(station_dir)
- # If force=True, remove all the content
- if force:
- # Remove all station directory content
- shutil.rmtree(station_dir)
- else:
- msg = f"The station directory {station_dir} already exists and force=False."
- logger.error(msg)
- raise ValueError(msg)
+def ensure_empty_data_dir(data_dir, force):
+ """Remove the content of the data_dir directory."""
+ # If force=True, remove all the directory content
+ if force:
+ shutil.rmtree(data_dir)
+ # Recreate the directory
+ create_directory(data_dir)
+ else:
+ msg = f"The product directory {data_dir} already contains files and force=False."
+ logger.error(msg)
+ raise ValueError(msg)
def create_l0_directory_structure(
@@ -236,8 +203,8 @@ def create_l0_directory_structure(
# Retrieve components
base_dir, product_type, data_source, campaign_name = infer_disdrodb_tree_path_components(processed_dir)
- # Check station data are available
- check_station_has_data(
+ # Check RAW station data are available
+ check_data_availability(
product="RAW",
base_dir=base_dir,
data_source=data_source,
@@ -248,7 +215,18 @@ def create_l0_directory_structure(
# Create required directories (if they don't exist)
create_required_directory(processed_dir, dir_name="metadata")
create_required_directory(processed_dir, dir_name="info")
- create_required_directory(processed_dir, dir_name=product)
+
+ # Define and create product directory
+ data_dir = define_data_dir(
+ product=product,
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+
+ # Create required directory (if it doesn't exist)
+ create_directory(data_dir)
# Copy the station metadata
_copy_station_metadata(
@@ -257,40 +235,70 @@ def create_l0_directory_structure(
campaign_name=campaign_name,
station_name=station_name,
)
- # Remove / directory if force=True
- _check_pre_existing_station_data(
+
+ # Check if product files are already available
+ available_data = has_available_data(
product=product,
base_dir=base_dir,
data_source=data_source,
campaign_name=campaign_name,
station_name=station_name,
- force=force,
)
- # Create the / directory
- create_required_directory(os.path.join(processed_dir, product), dir_name=station_name)
+
+ # If product files are already available:
+ # - If force=True, remove all data inside the product directory
+ # - If force=False, raise an error
+ if available_data:
+ ensure_empty_data_dir(data_dir, force=force)
+
+ return data_dir
-def create_directory_structure(processed_dir, product, station_name, force):
- """Create directory structure for L0B and higher DISDRODB products."""
+def create_product_directory(
+ data_source,
+ campaign_name,
+ station_name,
+ product,
+ force,
+ base_dir=None,
+ # Option for L2E
+ sample_interval=None,
+ rolling=None,
+ # Option for L2M
+ model_name=None,
+):
+ """Initialize the directory structure for a DISDRODB product.
+
+    If product files already exist:
+    - If ``force=True``, it removes all existing data inside the product directory.
+    - If ``force=False``, it raises an error.
+    """
+    # NOTE: the ``force=False`` behaviour could be changed to enable updating of missing files.
+    # This would also require adding code to check whether a downstream file already exists.
+
+ from disdrodb.api.io import get_required_product
+
+ # Get DISDRODB base directory
+ base_dir = get_base_dir(base_dir)
+
# Check inputs
check_product(product)
- processed_dir = check_processed_dir(processed_dir=processed_dir)
-
- base_dir, product_type, data_source, campaign_name = infer_disdrodb_tree_path_components(processed_dir)
# Determine required product
- if product == "L0B":
- required_product = "L0A"
- else:
- raise NotImplementedError("product {product} not yet implemented.")
+ required_product = get_required_product(product)
- # Check station is available in the previous product level
- check_station_has_data(
+ # Check station data is available in the previous product level
+ check_data_availability(
product=required_product,
base_dir=base_dir,
data_source=data_source,
campaign_name=campaign_name,
station_name=station_name,
+ # Option for L2E
+ sample_interval=sample_interval,
+ rolling=rolling,
+ # Option for L2M
+ model_name=model_name,
)
# Check metadata file is available
@@ -302,19 +310,84 @@ def create_directory_structure(processed_dir, product, station_name, force):
station_name=station_name,
)
+ # Define product output directory
+ data_dir = define_data_dir(
+ product=product,
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Option for L2E
+ sample_interval=sample_interval,
+ rolling=rolling,
+ # Option for L2M
+ model_name=model_name,
+ )
+
# Create required directory (if it doesn't exist)
- create_required_directory(processed_dir, dir_name=product)
+ create_directory(data_dir)
- # Remove / directory if force=True
- _check_pre_existing_station_data(
+ # Check if product files are already available
+ available_data = has_available_data(
product=product,
base_dir=base_dir,
data_source=data_source,
campaign_name=campaign_name,
station_name=station_name,
- force=force,
+ # Option for L2E
+ sample_interval=sample_interval,
+ rolling=rolling,
+ # Option for L2M
+ model_name=model_name,
)
+ # If product files are already available:
+ # - If force=True, remove all data inside the product directory
+ # - If force=False, raise an error
+ if available_data:
+ ensure_empty_data_dir(data_dir, force=force)
+
+ # Return product directory
+ return data_dir
+
+
+def create_logs_directory(
+ product,
+ data_source,
+ campaign_name,
+ station_name,
+ base_dir=None,
+ # Option for L2E
+ sample_interval=None,
+ rolling=None,
+ # Option for L2M
+ model_name=None,
+):
+ """Initialize the logs directory structure for a DISDRODB product."""
+ # Define logs directory
+ logs_dir = define_logs_dir(
+ product=product,
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Option for L2E
+ sample_interval=sample_interval,
+ rolling=rolling,
+ # Option for L2M
+ model_name=model_name,
+ )
+
+ # Ensure empty log directory
+ if os.path.isdir(logs_dir):
+ shutil.rmtree(logs_dir)
+
+ # Create logs directory
+ os.makedirs(logs_dir, exist_ok=True)
+
+ # Return logs directory
+ return logs_dir
+
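
A sketch of how the two new helpers above might be chained when starting a product run (placeholder identifiers; a configured local DISDRODB archive is assumed):

from disdrodb.api.create_directories import create_logs_directory, create_product_directory

# Prepare (or wipe, since force=True) the L2E product directory ...
data_dir = create_product_directory(
    data_source="EXAMPLE_SOURCE",
    campaign_name="EXAMPLE_CAMPAIGN",
    station_name="STATION_1",
    product="L2E",
    force=True,
    sample_interval=60,
    rolling=False,
)
# ... and an empty logs directory for the same product tree.
logs_dir = create_logs_directory(
    product="L2E",
    data_source="EXAMPLE_SOURCE",
    campaign_name="EXAMPLE_CAMPAIGN",
    station_name="STATION_1",
    sample_interval=60,
    rolling=False,
)
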
#### DISDRODB Station Initialization
diff --git a/disdrodb/api/info.py b/disdrodb/api/info.py
index 9b015811..62763538 100644
--- a/disdrodb/api/info.py
+++ b/disdrodb/api/info.py
@@ -19,19 +19,31 @@
"""Retrieve file information from DISDRODB products file names and filepaths."""
import os
+from collections import defaultdict
from pathlib import Path
import numpy as np
from trollsift import Parser
+from disdrodb.utils.time import acronym_to_seconds
+
####---------------------------------------------------------------------------
########################
#### FNAME PATTERNS ####
########################
-DISDRODB_FNAME_PATTERN = (
+DISDRODB_FNAME_L0_PATTERN = (
"{product:s}.{campaign_name:s}.{station_name:s}.s{start_time:%Y%m%d%H%M%S}.e{end_time:%Y%m%d%H%M%S}"
".{version:s}.{data_format:s}"
)
+DISDRODB_FNAME_L2E_PATTERN = ( # also L0C and L1 --> accumulation_acronym = sample_interval
+ "{product:s}.{accumulation_acronym}.{campaign_name:s}.{station_name:s}.s{start_time:%Y%m%d%H%M%S}.e{end_time:%Y%m%d%H%M%S}"
+ ".{version:s}.{data_format:s}"
+)
+
+DISDRODB_FNAME_L2M_PATTERN = (
+ "{product:s}_{subproduct:s}.{accumulation_acronym}.{campaign_name:s}.{station_name:s}.s{start_time:%Y%m%d%H%M%S}.e{end_time:%Y%m%d%H%M%S}"
+ ".{version:s}.{data_format:s}"
+)
####---------------------------------------------------------------------------.
##########################
@@ -41,9 +53,17 @@
def _parse_filename(filename):
"""Parse the filename with trollsift."""
- # Retrieve information from filename
- p = Parser(DISDRODB_FNAME_PATTERN)
- info_dict = p.parse(filename)
+ if filename.startswith("L0A") or filename.startswith("L0B"):
+ p = Parser(DISDRODB_FNAME_L0_PATTERN)
+ info_dict = p.parse(filename)
+ elif filename.startswith("L2E") or filename.startswith("L1") or filename.startswith("L0C"):
+ p = Parser(DISDRODB_FNAME_L2E_PATTERN)
+ info_dict = p.parse(filename)
+ elif filename.startswith("L2M"):
+ p = Parser(DISDRODB_FNAME_L2M_PATTERN)
+ info_dict = p.parse(filename)
+ else:
+ raise ValueError("Not a DISDRODB product file.")
return info_dict
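
For illustration, the new L2E pattern can be exercised directly with trollsift. The filename below is fabricated, and its accumulation acronym ("1MIN") is only a placeholder for whatever seconds_to_acronym produces:

from trollsift import Parser

from disdrodb.api.info import DISDRODB_FNAME_L2E_PATTERN

info = Parser(DISDRODB_FNAME_L2E_PATTERN).parse(
    "L2E.1MIN.EXAMPLE_CAMPAIGN.STATION_1.s20230101000000.e20230101235959.V0.nc"
)
print(info["campaign_name"], info["start_time"])  # EXAMPLE_CAMPAIGN 2023-01-01 00:00:00
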
@@ -54,6 +74,11 @@ def _get_info_from_filename(filename):
info_dict = _parse_filename(filename)
except ValueError:
raise ValueError(f"{filename} can not be parsed. Report the issue.")
+
+ # Add additional information to info dictionary
+ if "accumulation_acronym" in info_dict:
+ info_dict["sample_interval"] = acronym_to_seconds(info_dict["accumulation_acronym"])
+
# Return info dictionary
return info_dict
@@ -132,7 +157,14 @@ def get_start_end_time_from_filepaths(filepaths):
"""Return the start and end time of the specified files."""
list_start_time = get_key_from_filepaths(filepaths, key="start_time")
list_end_time = get_key_from_filepaths(filepaths, key="end_time")
- return np.array(list_start_time), np.array(list_end_time)
+ return np.array(list_start_time).astype("M8[s]"), np.array(list_end_time).astype("M8[s]")
+
+
+def get_sample_interval_from_filepaths(filepaths):
+ """Return the sample interval of the specified files."""
+ list_accumulation_acronym = get_key_from_filepaths(filepaths, key="accumulation_acronym")
+ list_sample_interval = [acronym_to_seconds(s) for s in list_accumulation_acronym]
+ return list_sample_interval
####--------------------------------------------------------------------------.
@@ -183,7 +215,7 @@ def infer_path_info_dict(path: str) -> dict:
Returns
-------
- list
+ dict
Dictionary with the path element of the DISDRODB archive.
Valid keys: ``"base_dir"``, ``"data_source"``, ``"campaign_name"``
"""
@@ -197,6 +229,24 @@ def infer_path_info_dict(path: str) -> dict:
return path_dict
+def infer_path_info_tuple(path: str) -> tuple:
+ """Return a tuple with the ``base_dir``, ``data_source`` and ``campaign_name`` of the disdrodb_path.
+
+ Parameters
+ ----------
+ path : str
+ ``path`` can be a ``campaign_dir`` (``raw_dir`` or ``processed_dir``), or a DISDRODB file path.
+
+ Returns
+ -------
+ tuple
+        Tuple ``(base_dir, data_source, campaign_name)`` with the path elements
+        of the DISDRODB archive.
+ """
+ path_dict = infer_path_info_dict(path)
+ return path_dict["base_dir"], path_dict["data_source"], path_dict["campaign_name"]
+
+
def infer_disdrodb_tree_path(path: str) -> str:
"""Return the directory tree path from the base_dir directory.
@@ -281,3 +331,136 @@ def infer_data_source_from_path(path: str) -> str:
####--------------------------------------------------------------------------.
+#######################
+#### Group utility ####
+#######################
+
+
+FILE_KEYS = [
+ "product",
+ "subproduct",
+ "campaign_name",
+ "station_name",
+ "start_time",
+ "end_time",
+ "data_format",
+ "accumulation_acronym",
+ "sample_interval",
+]
+
+
+TIME_KEYS = [
+ "year",
+ "month",
+ "month_name",
+ "quarter",
+ "season",
+ "day",
+ "doy",
+ "dow",
+ "hour",
+ "minute",
+ "second",
+]
+
+
+def check_groups(groups):
+ """Check groups validity."""
+ if not isinstance(groups, (str, list)):
+        raise TypeError("'groups' must be a list (or a string if a single group is specified).")
+ if isinstance(groups, str):
+ groups = [groups]
+ groups = np.array(groups)
+ valid_keys = FILE_KEYS + TIME_KEYS
+ invalid_keys = groups[np.isin(groups, valid_keys, invert=True)]
+ if len(invalid_keys) > 0:
+ raise ValueError(f"The following group keys are invalid: {invalid_keys}. Valid values are {valid_keys}.")
+ return groups.tolist()
+
+
+def get_season(time):
+ """Get season from `datetime.datetime` or `datetime.date` object."""
+ month = time.month
+ if month in [12, 1, 2]:
+ return "DJF" # Winter (December, January, February)
+ if month in [3, 4, 5]:
+ return "MAM" # Spring (March, April, May)
+ if month in [6, 7, 8]:
+ return "JJA" # Summer (June, July, August)
+ return "SON" # Autumn (September, October, November)
+
+
+def get_time_component(time, component):
+ """Get time component from `datetime.datetime` object."""
+ func_dict = {
+ "year": lambda time: time.year,
+ "month": lambda time: time.month,
+ "day": lambda time: time.day,
+ "doy": lambda time: time.timetuple().tm_yday, # Day of year
+ "dow": lambda time: time.weekday(), # Day of week (0=Monday, 6=Sunday)
+ "hour": lambda time: time.hour,
+ "minute": lambda time: time.minute,
+ "second": lambda time: time.second,
+ # Additional
+ "month_name": lambda time: time.strftime("%B"), # Full month name
+ "quarter": lambda time: (time.month - 1) // 3 + 1, # Quarter (1-4)
+ "season": lambda time: get_season(time), # Season (DJF, MAM, JJA, SON)
+ }
+ return str(func_dict[component](time))
+
+
+def _get_groups_value(groups, filepath):
+ """Return the value associated to the groups keys.
+
+    If multiple keys are specified, the returned value is a string of format ``<value_1>/<value_2>/...``.
+
+ If a single key is specified and is ``start_time`` or ``end_time``, the function
+ returns a :py:class:`datetime.datetime` object.
+ """
+ single_key = len(groups) == 1
+ info_dict = get_info_from_filepath(filepath)
+ start_time = info_dict["start_time"]
+ list_key_values = []
+ for key in groups:
+ if key in TIME_KEYS:
+ list_key_values.append(get_time_component(start_time, component=key))
+ else:
+ value = info_dict.get(key, f"{key}=None")
+ list_key_values.append(value if single_key else str(value))
+ if single_key:
+ return list_key_values[0]
+ return "/".join(list_key_values)
+
+
+def group_filepaths(filepaths, groups=None):
+ """
+ Group filepaths in a dictionary if groups are specified.
+
+ Parameters
+ ----------
+ filepaths : list
+ List of filepaths.
+ groups: list or str
+ The group keys by which to group the filepaths.
+ Valid group keys are ``product``, ``subproduct``, ``campaign_name``, ``station_name``,
+        ``start_time``, ``end_time``, ``accumulation_acronym``, ``sample_interval``,
+ ``data_format``,
+ ``year``, ``month``, ``day``, ``doy``, ``dow``, ``hour``, ``minute``, ``second``,
+ ``month_name``, ``quarter``, ``season``.
+ The time components are extracted from ``start_time`` !
+        If ``groups`` is ``None``, the input filepaths list is returned.
+ The default is ``None``.
+
+ Returns
+ -------
+ dict or list
+        Either a dictionary of format ``{<group_value>: <list_of_filepaths>}``,
+        or the original input filepaths list (if ``groups=None``).
+
+ """
+ if groups is None:
+ return filepaths
+ groups = check_groups(groups)
+ filepaths_dict = defaultdict(list)
+ _ = [filepaths_dict[_get_groups_value(groups, filepath)].append(filepath) for filepath in filepaths]
+ return dict(filepaths_dict)
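
A small self-contained illustration of group_filepaths. The L0B file names below are fabricated but follow the DISDRODB_FNAME_L0_PATTERN defined earlier:

from disdrodb.api.info import group_filepaths

filepaths = [
    "L0B.EXAMPLE_CAMPAIGN.STATION_1.s20230101000000.e20230101235959.V0.nc",
    "L0B.EXAMPLE_CAMPAIGN.STATION_1.s20230201000000.e20230201235959.V0.nc",
]
# Group by the month of each file's start_time: {"1": [...], "2": [...]}
print(group_filepaths(filepaths, groups="month"))
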
diff --git a/disdrodb/api/io.py b/disdrodb/api/io.py
index 67b38242..8832f310 100644
--- a/disdrodb/api/io.py
+++ b/disdrodb/api/io.py
@@ -19,23 +19,129 @@
"""Routines tot extract information from the DISDRODB infrastructure."""
import os
+import shutil
+from typing import Optional
import numpy as np
from disdrodb.api.checks import check_product
-from disdrodb.api.path import get_disdrodb_path
+from disdrodb.api.path import define_data_dir, define_product_dir, get_disdrodb_path
from disdrodb.configs import get_base_dir
from disdrodb.utils.directories import count_files, list_directories, list_files
+from disdrodb.utils.logger import (
+ log_info,
+)
+
+
+def get_required_product(product):
+ """Determine the required product for input product processing."""
+ # Check input
+ check_product(product)
+ # Determine required product
+ requirement_dict = {
+ "L0A": "RAW",
+ "L0B": "L0A",
+ "L0C": "L0B",
+ "L1": "L0C",
+ "L2E": "L1",
+ "L2M": "L2E",
+ }
+ required_product = requirement_dict[product]
+ return required_product
+
+
+def filter_filepaths(filepaths, debugging_mode):
+ """Filter out filepaths if ``debugging_mode=True``."""
+ if debugging_mode:
+ max_files = min(3, len(filepaths))
+ filepaths = filepaths[0:max_files]
+ return filepaths
+
+
+def get_filepaths(
+ data_source,
+ campaign_name,
+ station_name,
+ product,
+ model_name=None,
+ sample_interval=None,
+ rolling=None,
+ debugging_mode: bool = False,
+ base_dir: Optional[str] = None,
+):
+ """Retrieve DISDRODB product files for a give station.
+
+ Parameters
+ ----------
+ data_source : str
+ The name of the institution (for campaigns spanning multiple countries) or
+ the name of the country (for campaigns or sensor networks within a single country).
+ Must be provided in UPPER CASE.
+ campaign_name : str
+ The name of the campaign. Must be provided in UPPER CASE.
+ station_name : str
+ The name of the station.
+ product : str
+        The name of the DISDRODB product.
+ sample_interval : int, optional
+ The sampling interval in seconds of the product.
+ It must be specified only for product L2E and L2M !
+ rolling : bool, optional
+ Whether the dataset has been resampled by aggregating or rolling.
+ It must be specified only for product L2E and L2M !
+ model_name : str
+ The model name of the statistical distribution for the DSD.
+ It must be specified only for product L2M !
+ debugging_mode : bool, optional
+        If ``True``, it selects a maximum of 3 files for debugging purposes.
+ The default is ``False``.
+ base_dir : str, optional
+ The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
+ If not specified, the path specified in the DISDRODB active configuration will be used.
+
+ Returns
+ -------
+ filepaths : list
+ List of file paths.
+
+ """
+ # Retrieve data directory
+ data_dir = define_data_dir(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product=product,
+ # Option for L2E and L2M
+ sample_interval=sample_interval,
+ rolling=rolling,
+ # Options for L2M
+ model_name=model_name,
+ )
+
+ # Define glob pattern
+ glob_pattern = "*.parquet" if product == "L0A" else "*.nc"
+
+ # Retrieve files
+ filepaths = list_files(data_dir, glob_pattern=glob_pattern, recursive=True)
+
+ # Filter out filepaths if debugging_mode=True
+ filepaths = filter_filepaths(filepaths, debugging_mode=debugging_mode)
+
+ # If no file available, raise error
+ if len(filepaths) == 0:
+ msg = f"No {product} files are available in {data_dir}. Run {product} processing first."
+ raise ValueError(msg)
+
+ # Sort filepaths
+ filepaths = sorted(filepaths)
+
+ return filepaths
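
Usage sketch for the new get_filepaths helper (placeholder identifiers; requires a configured DISDRODB archive with the corresponding products already produced):

from disdrodb.api.io import get_filepaths, get_required_product

# L2E processing consumes L1 files.
assert get_required_product("L2E") == "L1"

# Retrieve at most 3 L2E files (debugging_mode=True) for a hypothetical station.
filepaths = get_filepaths(
    data_source="EXAMPLE_SOURCE",
    campaign_name="EXAMPLE_CAMPAIGN",
    station_name="STATION_1",
    product="L2E",
    sample_interval=60,
    rolling=False,
    debugging_mode=True,
)
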
def _get_list_stations_dirs(product, campaign_dir):
# Get directory where data are stored
- # - Raw: /data/<...>
- # - Processed: /L0A/L0B>
- if product.upper() == "RAW":
- product_dir = os.path.join(campaign_dir, "data")
- else:
- product_dir = os.path.join(campaign_dir, product)
+ product_dir = define_product_dir(campaign_dir=campaign_dir, product=product)
# Check if the data directory exists
# - For a fresh disdrodb-data cloned repo, no "data" directories
if not os.path.exists(product_dir):
@@ -51,6 +157,7 @@ def _get_list_stations_with_data(product, campaign_dir):
# Get stations directory
list_stations_dir = _get_list_stations_dirs(product=product, campaign_dir=campaign_dir)
# Count number of files within directory
+ # - TODO: here just check for one file !
list_nfiles_per_station = [count_files(station_dir, "*", recursive=True) for station_dir in list_stations_dir]
# Keep only stations with at least one file
stations_names = [os.path.basename(path) for n, path in zip(list_nfiles_per_station, list_stations_dir) if n >= 1]
@@ -75,7 +182,6 @@ def _get_campaign_stations(base_dir, product, data_source, campaign_name):
data_source=data_source,
campaign_name=campaign_name,
)
-
# Get list of stations with data and metadata
list_stations_data = _get_list_stations_with_data(product=product, campaign_dir=campaign_dir)
list_stations_metadata = _get_list_stations_with_metadata(campaign_dir)
@@ -278,9 +384,13 @@ def available_stations(
campaign_names=None,
station_names=None,
return_tuple=True,
+ raise_error_if_empty=False,
base_dir=None,
):
- """Return stations for which data are available on disk."""
+ """Return stations for which data and metadata are available on disk.
+
+    If ``raise_error_if_empty=True``, raise an error if no stations are available.
+ """
base_dir = get_base_dir(base_dir)
# Checks arguments
product = check_product(product)
@@ -297,24 +407,42 @@ def available_stations(
if isinstance(station_names, str):
station_names = [station_names]
- # If data_source is None, first retrieve all stations
+ # If data_source is None, retrieve all stations
if data_sources is None:
list_info = _get_stations(base_dir=base_dir, product=product)
- # Otherwise retrieve all stations for the specified data sources
+
+ ###-----------------------------------------------.
+ ### Filter by data_sources
else:
list_info = _get_data_sources_stations(
base_dir=base_dir,
data_sources=data_sources,
product=product,
)
+ # If no stations available, raise an error
+ if raise_error_if_empty and len(list_info) == 0:
+ raise ValueError(f"No stations available given the provided `data_sources` {data_sources}.")
+
+ ###-----------------------------------------------.
+ ### Filter by campaign_names
# If campaign_names is not None, subset by campaign_names
if campaign_names is not None:
list_info = [info for info in list_info if info[1] in campaign_names]
+ # If no stations available, raise an error
+ if raise_error_if_empty and len(list_info) == 0:
+ raise ValueError(f"No stations available given the provided `campaign_names` {campaign_names}.")
+
+ ###-----------------------------------------------.
+ ### Filter by station_names
# If station_names is not None, subset by station_names
if station_names is not None:
list_info = [info for info in list_info if info[2] in station_names]
+ # If no stations available, raise an error
+ if raise_error_if_empty and len(list_info) == 0:
+ raise ValueError(f"No stations available given the provided `station_names` {station_names}.")
+ ###-----------------------------------------------.
# Return list with the tuple (data_source, campaign_name, station_name)
if return_tuple:
return list_info
@@ -322,3 +450,33 @@ def available_stations(
# - Return list with the name of the available stations
list_stations = [info[2] for info in list_info]
return list_stations
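
A sketch of the new raise_error_if_empty option (the campaign name is illustrative):

from disdrodb.api.io import available_stations

# Returns (data_source, campaign_name, station_name) tuples and raises a
# ValueError if the campaign filter leaves no stations, instead of silently
# returning an empty list.
stations = available_stations(
    product="L0B",
    campaign_names=["EXAMPLE_CAMPAIGN"],
    return_tuple=True,
    raise_error_if_empty=True,
)
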
+
+
+####----------------------------------------------------------------------------------
+#### DISDRODB Removal Functions
+
+
+def remove_product(
+ base_dir,
+ product,
+ data_source,
+ campaign_name,
+ station_name,
+ logger=None,
+ verbose=True,
+):
+ """Remove all product files of a specific station."""
+ if product.upper() == "RAW":
+ raise ValueError("Removal of 'RAW' files is not allowed.")
+ data_dir = define_data_dir(
+ base_dir=base_dir,
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+ if logger is not None:
+ log_info(logger=logger, msg="Removal of {product} files started.", verbose=verbose)
+ shutil.rmtree(data_dir)
+ if logger is not None:
+ log_info(logger=logger, msg="Removal of {product} files ended.", verbose=verbose)
diff --git a/disdrodb/api/path.py b/disdrodb/api/path.py
index ab4c6f0d..87955047 100644
--- a/disdrodb/api/path.py
+++ b/disdrodb/api/path.py
@@ -17,15 +17,14 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------.
"""Define paths within the DISDRODB infrastructure."""
-
import os
+from typing import Optional
import pandas as pd
-import xarray as xr
-from disdrodb.api.info import infer_campaign_name_from_path
from disdrodb.configs import get_base_dir
from disdrodb.utils.directories import check_directory_exists
+from disdrodb.utils.time import ensure_sample_interval_in_seconds, seconds_to_acronym
####--------------------------------------------------------------------------.
#### Paths from BASE_DIR
@@ -120,54 +119,6 @@ def define_campaign_dir(
return str(campaign_dir)
-def define_station_dir(
- product,
- data_source,
- campaign_name,
- station_name,
- base_dir=None,
- check_exists=False,
-):
- """Return the station data directory in the DISDRODB infrastructure.
-
- Parameters
- ----------
- product : str
- The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``.
- data_source : str
- The data source.
- campaign_name : str
- The campaign name.
- station_name : str
- The station name.
- base_dir : str, optional
- The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
- If not specified, the path specified in the DISDRODB active configuration will be used.
- check_exists : bool, optional
- Whether to check if the directory exists. By default ``False``.
-
- Returns
- -------
- station_dir : str
- Station data directory path
- """
- base_dir = get_base_dir(base_dir)
- campaign_dir = get_disdrodb_path(
- base_dir=base_dir,
- product=product,
- data_source=data_source,
- campaign_name=campaign_name,
- check_exists=check_exists,
- )
- if product.upper() == "RAW":
- station_dir = os.path.join(campaign_dir, "data", station_name)
- else:
- station_dir = os.path.join(campaign_dir, product, station_name)
- if check_exists:
- check_directory_exists(station_dir)
- return str(station_dir)
-
-
def define_metadata_dir(
product,
data_source,
@@ -250,11 +201,11 @@ def define_issue_dir(
def define_metadata_filepath(
- product,
data_source,
campaign_name,
station_name,
base_dir=None,
+ product="RAW",
check_exists=False,
):
"""Return the station metadata filepath in the DISDRODB infrastructure.
@@ -353,82 +304,537 @@ def define_config_dir(product):
#### Directory/Filepaths L0A and L0B products
-def define_l0a_station_dir(processed_dir: str, station_name: str) -> str:
- """Define L0A directory.
+def check_sample_interval(sample_interval):
+ """Check sample_interval argument validity."""
+ if not isinstance(sample_interval, int):
+ raise ValueError("'sample_interval' must be an integer.")
+
+
+def check_rolling(rolling):
+ """Check rolling argument validity."""
+ if not isinstance(rolling, bool):
+ raise ValueError("'rolling' must be a boolean.")
+
+
+def define_product_dir_tree(
+ product,
+ model_name=None,
+ sample_interval=None,
+ rolling=None,
+):
+ """Return the product directory tree.
Parameters
----------
- processed_dir : str
- Path of the processed directory
+ product : str
+        The DISDRODB product (e.g. ``"RAW"``, ``"L0A"``, ``"L0B"``, ``"L0C"``, ``"L1"``, ``"L2E"``, ``"L2M"``).
+ sample_interval : int, optional
+ The sampling interval in seconds of the product.
+ It must be specified only for product L2E and L2M !
+ rolling : bool, optional
+ Whether the dataset has been resampled by aggregating or rolling.
+ It must be specified only for product L2E and L2M !
+ model_name : str
+ The custom model name of the fitted statistical distribution.
+ It must be specified only for product L2M !
+
+ Returns
+ -------
+    product_dir_tree : str
+        Product directory tree.
+ """
+ if product.upper() == "RAW":
+ return ""
+ if product.upper() in ["L0A", "L0B", "L0C", "L1"]:
+ return product
+ if product == "L2E":
+ check_rolling(rolling)
+ check_sample_interval(sample_interval)
+ sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling)
+ return os.path.join(product, sample_interval_acronym)
+ if product == "L2M":
+ check_rolling(rolling)
+ check_sample_interval(sample_interval)
+ sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling)
+ return os.path.join(product, model_name, sample_interval_acronym)
+ raise ValueError(f"The product {product} is not defined.")
+
+
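
A quick sketch of the directory trees produced by define_product_dir_tree. The model name is a placeholder and the exact accumulation acronym depends on seconds_to_acronym, so it is shown symbolically:

from disdrodb.api.path import define_product_dir_tree

print(define_product_dir_tree("L0B"))
# -> "L0B"
print(define_product_dir_tree("L2E", sample_interval=60, rolling=False))
# -> "L2E/<acronym>", with <acronym> given by seconds_to_acronym(60)
print(define_product_dir_tree("L2M", model_name="MODEL_X", sample_interval=60, rolling=True))
# -> "L2M/MODEL_X/ROLL<acronym>"
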
+def define_station_dir_new(
+ product,
+ data_source,
+ campaign_name,
+ station_name,
+ base_dir=None,
+ check_exists=False,
+): # TODO: IN FUTURE without product --> campaign_dir/station_name/product !
+ """Return the station data directory in the DISDRODB infrastructure.
+
+ Parameters
+ ----------
+ product : str
+ The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``.
+ data_source : str
+ The data source.
+ campaign_name : str
+ The campaign name.
station_name : str
- Name of the station
+ The station name.
+ base_dir : str, optional
+ The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
+ If not specified, the path specified in the DISDRODB active configuration will be used.
+ check_exists : bool, optional
+ Whether to check if the directory exists. By default ``False``.
Returns
-------
- str
- L0A directory path.
+ station_dir : str
+ Station data directory path
"""
- station_dir = os.path.join(processed_dir, "L0A", station_name)
- return station_dir
+ base_dir = get_base_dir(base_dir)
+ campaign_dir = get_disdrodb_path(
+ base_dir=base_dir,
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ check_exists=check_exists,
+ )
+ if product.upper() == "RAW":
+ station_dir = os.path.join(campaign_dir, "data", station_name)
+ else:
+ station_dir = os.path.join(campaign_dir, station_name, "data")
+ if check_exists:
+ check_directory_exists(station_dir)
+ return str(station_dir)
-def define_l0b_station_dir(processed_dir: str, station_name: str) -> str:
- """Define L0B directory.
+def define_data_dir_new(
+ product,
+ data_source,
+ campaign_name,
+ station_name,
+ model_name=None,
+ sample_interval=None,
+ rolling=None,
+ base_dir=None,
+ check_exists=False,
+):
+ """Return the station data directory in the DISDRODB infrastructure.
Parameters
----------
- processed_dir : str
- Path of the processed directory
- station_name : int
- Name of the station
+ product : str
+ The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``.
+ data_source : str
+ The data source.
+ campaign_name : str
+ The campaign name.
+ station_name : str
+ The station name.
+ base_dir : str, optional
+ The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
+ If not specified, the path specified in the DISDRODB active configuration will be used.
+ check_exists : bool, optional
+ Whether to check if the directory exists. By default ``False``.
+
+ Returns
+ -------
+    data_dir : str
+        Station data directory path
+ """
+ station_dir = define_station_dir_new(
+ base_dir=base_dir,
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ check_exists=check_exists,
+ )
+ product_dir_tree = define_product_dir_tree(
+ product=product,
+ model_name=model_name,
+ sample_interval=sample_interval,
+ rolling=rolling,
+ )
+ data_dir = os.path.join(station_dir, product_dir_tree)
+ if check_exists:
+ check_directory_exists(data_dir)
+ return str(data_dir)
+
+
+def define_logs_dir(
+ product,
+ data_source,
+ campaign_name,
+ station_name,
+ model_name=None,
+ sample_interval=None,
+ rolling=None,
+ base_dir=None,
+ check_exists=False,
+):
+ """Return the station log directory in the DISDRODB infrastructure.
+
+ Parameters
+ ----------
+ product : str
+ The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``.
+ data_source : str
+ The data source.
+ campaign_name : str
+ The campaign name.
+ station_name : str
+ The station name.
+ base_dir : str, optional
+ The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
+ If not specified, the path specified in the DISDRODB active configuration will be used.
+ check_exists : bool, optional
+ Whether to check if the directory exists. By default ``False``.
+
+ Returns
+ -------
+    logs_dir : str
+        Station logs directory path
+ """
+ # station_dir = define_station_dir_new(
+ # base_dir=base_dir,
+ # product=product,
+ # data_source=data_source,
+ # campaign_name=campaign_name,
+ # check_exists=check_exists,
+ # )
+ campaign_dir = define_campaign_dir(
+ base_dir=base_dir,
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ check_exists=check_exists,
+ )
+ product_dir_tree = define_product_dir_tree(
+ product=product,
+ model_name=model_name,
+ sample_interval=sample_interval,
+ rolling=rolling,
+ )
+ logs_dir = os.path.join(campaign_dir, "logs", "files", product_dir_tree, station_name)
+ if check_exists:
+ check_directory_exists(logs_dir)
+ return str(logs_dir)
+
+
+def define_data_dir(
+ product,
+ data_source,
+ campaign_name,
+ station_name,
+ model_name=None,
+ sample_interval=None,
+ rolling=None,
+ base_dir=None,
+ check_exists=False,
+):
+ """Return the station data directory in the DISDRODB infrastructure.
+
+ Parameters
+ ----------
+ product : str
+        The DISDRODB product (e.g. ``"RAW"``, ``"L0A"``, ``"L0B"``, ``"L0C"``, ``"L1"``, ``"L2E"``, ``"L2M"``).
+ data_source : str
+ The data source.
+ campaign_name : str
+ The campaign name.
+ station_name : str
+ The station name.
+ base_dir : str, optional
+ The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
+ If not specified, the path specified in the DISDRODB active configuration will be used.
+ check_exists : bool, optional
+ Whether to check if the directory exists. By default ``False``.
+ sample_interval : int, optional
+ The sampling interval in seconds of the product.
+ It must be specified only for product L2E and L2M !
+ rolling : bool, optional
+ Whether the dataset has been resampled by aggregating or rolling.
+ It must be specified only for product L2E and L2M !
+ model_name : str
+ The name of the fitted statistical distribution for the DSD.
+ It must be specified only for product L2M !
+
+ Returns
+ -------
+ data_dir : str
+ Station data directory path
+ """
+ station_dir = define_station_dir(
+ base_dir=base_dir,
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ check_exists=check_exists,
+ )
+ if product.upper() in ["RAW", "L0A", "L0B", "L0C", "L1"]:
+ data_dir = station_dir
+ elif product == "L2E":
+ check_rolling(rolling)
+ check_sample_interval(sample_interval)
+ sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling)
+ data_dir = os.path.join(station_dir, sample_interval_acronym)
+ elif product == "L2M":
+ check_rolling(rolling)
+ check_sample_interval(sample_interval)
+ sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling)
+ data_dir = os.path.join(station_dir, model_name, sample_interval_acronym)
+ else:
+ raise ValueError("TODO") # CHECK Product on top !`
+ if check_exists:
+ check_directory_exists(data_dir)
+ return str(data_dir)
+
+
+def define_product_dir(campaign_dir: str, product: str) -> str:
+ """Define product directory."""
+ # TODO: this currently only works for L0A and L0B. Should be removed !
+ # - Raw: /data/<...>
+ # - Processed: /L0A/L0B>
+ if product.upper() == "RAW":
+ product_dir = os.path.join(campaign_dir, "data")
+ else:
+ product_dir = os.path.join(campaign_dir, product)
+ return product_dir
+
+
+def define_station_dir(
+ product,
+ data_source,
+ campaign_name,
+ station_name,
+ base_dir=None,
+ check_exists=False,
+): # TODO: IN FUTURE without product --> campaign_dir/station_name/product !
+ """Return the station data directory in the DISDRODB infrastructure.
+
+ Parameters
+ ----------
+ product : str
+ The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``.
+ data_source : str
+ The data source.
+ campaign_name : str
+ The campaign name.
+ station_name : str
+ The station name.
+ base_dir : str, optional
+ The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
+ If not specified, the path specified in the DISDRODB active configuration will be used.
+ check_exists : bool, optional
+ Whether to check if the directory exists. By default ``False``.
+
+ Returns
+ -------
+ station_dir : str
+ Station data directory path
+ """
+ base_dir = get_base_dir(base_dir)
+ campaign_dir = get_disdrodb_path(
+ base_dir=base_dir,
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ check_exists=check_exists,
+ )
+ if product.upper() == "RAW":
+ station_dir = os.path.join(campaign_dir, "data", station_name)
+ else:
+ station_dir = os.path.join(campaign_dir, product, station_name)
+ if check_exists:
+ check_directory_exists(station_dir)
+ return str(station_dir)
+
+
+####--------------------------------------------------------------------------.
+#### Acronyms for DISDRODB products
+
+
+def define_accumulation_acronym(seconds, rolling):
+ """Define the accumulation acronnym.
+
+ Prefix the accumulation interval acronym with ROLL if rolling=True.
+ """
+ accumulation_acronym = seconds_to_acronym(seconds)
+ if rolling:
+ accumulation_acronym = f"ROLL{accumulation_acronym}"
+ return accumulation_acronym
+
+
+####--------------------------------------------------------------------------.
+#### Filenames for DISDRODB products
+
+
+def define_filename(
+ product: str,
+ campaign_name: str,
+ station_name: str,
+ # L2E option
+ sample_interval: Optional[int] = None,
+ rolling: Optional[bool] = None,
+ # L2M option
+ model_name: Optional[str] = None,
+ # Filename options
+ obj=None,
+ add_version=True,
+ add_time_period=True,
+ add_extension=True,
+ # Prefix
+ prefix="",
+ suffix="",
+) -> str:
+ """Define DISDRODB products filename.
+
+ Parameters
+ ----------
+    product : str
+        The DISDRODB product name.
+    obj : xarray.Dataset or pandas.DataFrame
+        xarray Dataset or pandas DataFrame.
+        Required if ``add_time_period=True``.
+ campaign_name : str
+ Name of the campaign.
+ station_name : str
+ Name of the station.
+ sample_interval : int, optional
+ The sampling interval in seconds of the product.
+ It must be specified only for product L2E and L2M !
+ rolling : bool, optional
+ Whether the dataset has been resampled by aggregating or rolling.
+ It must be specified only for product L2E and L2M !
+ model_name : str
+ The model name of the fitted statistical distribution for the DSD.
+ It must be specified only for product L2M !
Returns
-------
str
- Path of the L0B directory
+        Product file name.
"""
- station_dir = os.path.join(processed_dir, "L0B", station_name)
- return station_dir
+ from disdrodb import PRODUCT_VERSION
+ from disdrodb.utils.pandas import get_dataframe_start_end_time
+ from disdrodb.utils.xarray import get_dataset_start_end_time
+
+ # -----------------------------------------.
+ # TODO: Define sample_interval_acronym
+ # - ADD sample_interval_acronym also to L0A and L0B
+ # - Add sample_interval_acronym also to L0C and L1
+
+ # -----------------------------------------.
+ # Define product acronym
+ product_acronym = f"{product}"
+ if product in ["L2E", "L2M"]:
+ sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling)
+ product_acronym = f"L2E.{sample_interval_acronym}"
+ if product in ["L2M"]:
+ product_acronym = f"L2M_{model_name}.{sample_interval_acronym}"
+
+ # -----------------------------------------.
+ # Define base filename
+ filename = f"{product_acronym}.{campaign_name}.{station_name}"
+
+ # -----------------------------------------.
+ # Add prefix
+ if prefix != "":
+ filename = f"{prefix}.{filename}"
+
+ # -----------------------------------------.
+ # Add time period information
+ if add_time_period:
+ if product == "L0A":
+ starting_time, ending_time = get_dataframe_start_end_time(obj)
+ else:
+ starting_time, ending_time = get_dataset_start_end_time(obj)
+ starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S")
+ ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S")
+ filename = f"{filename}.s{starting_time}.e{ending_time}"
+
+ # -----------------------------------------.
+ # Add product version
+ if add_version:
+ filename = f"{filename}.{PRODUCT_VERSION}"
+
+ # -----------------------------------------.
+ # Add product extension
+ if add_extension:
+ filename = f"{filename}.parquet" if product == "L0A" else f"{filename}.nc"
+
+ # -----------------------------------------.
+ # Add suffix
+ if suffix != "":
+ filename = f"{filename}.{suffix}"
+ return filename
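
A hedged example of the generic filename builder above. With add_time_period=False no Dataset/DataFrame is needed; the accumulation acronym portion again depends on seconds_to_acronym:

from disdrodb.api.path import define_filename

fname = define_filename(
    product="L2E",
    campaign_name="EXAMPLE_CAMPAIGN",
    station_name="STATION_1",
    sample_interval=60,
    rolling=False,
    add_time_period=False,
)
# -> "L2E.<acronym>.EXAMPLE_CAMPAIGN.STATION_1.V0.nc"
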
-def define_l0a_filename(df, processed_dir, station_name: str) -> str:
+def define_l0a_filename(df, campaign_name: str, station_name: str) -> str:
"""Define L0A file name.
Parameters
----------
df : pandas.DataFrame
- L0A DataFrame
- processed_dir : str
- Path of the processed directory
+ L0A DataFrame.
+ campaign_name : str
+ Name of the campaign.
station_name : str
- Name of the station
+ Name of the station.
Returns
-------
str
L0A file name.
"""
- from disdrodb.l0.standards import PRODUCT_VERSION
+ from disdrodb import PRODUCT_VERSION
from disdrodb.utils.pandas import get_dataframe_start_end_time
starting_time, ending_time = get_dataframe_start_end_time(df)
starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S")
ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S")
- campaign_name = infer_campaign_name_from_path(processed_dir).replace(".", "-")
version = PRODUCT_VERSION
filename = f"L0A.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.parquet"
return filename
-def define_l0b_filename(ds, processed_dir, station_name: str) -> str:
+def define_l0b_filename(ds, campaign_name: str, station_name: str) -> str:
"""Define L0B file name.
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ L0B xarray Dataset.
+ campaign_name : str
+ Name of the campaign.
+ station_name : str
+ Name of the station.
+
+ Returns
+ -------
+ str
+ L0B file name.
+ """
+ from disdrodb import PRODUCT_VERSION
+ from disdrodb.utils.xarray import get_dataset_start_end_time
+
+ starting_time, ending_time = get_dataset_start_end_time(ds)
+ starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S")
+ ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S")
+ version = PRODUCT_VERSION
+ filename = f"L0B.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.nc"
+ return filename
+
+
+def define_l0c_filename(ds, campaign_name: str, station_name: str) -> str:
+ """Define L0C file name.
+
Parameters
----------
ds : xarray.Dataset
L0B xarray Dataset
- processed_dir : str
- Path of the processed directory
+ campaign_name : str
+ Name of the campaign
station_name : str
Name of the station
@@ -437,69 +843,120 @@ def define_l0b_filename(ds, processed_dir, station_name: str) -> str:
str
L0B file name.
"""
- from disdrodb.l0.standards import PRODUCT_VERSION
+ from disdrodb import PRODUCT_VERSION
from disdrodb.utils.xarray import get_dataset_start_end_time
+ # TODO: add sample_interval as argument
+ sample_interval = int(ensure_sample_interval_in_seconds(ds["sample_interval"]).data.item())
+ sample_interval_acronym = define_accumulation_acronym(sample_interval, rolling=False)
starting_time, ending_time = get_dataset_start_end_time(ds)
starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S")
ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S")
- campaign_name = infer_campaign_name_from_path(processed_dir).replace(".", "-")
version = PRODUCT_VERSION
- filename = f"L0B.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.nc"
+ filename = (
+ f"L0C.{sample_interval_acronym}.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.nc"
+ )
return filename
-def define_l0a_filepath(df: pd.DataFrame, processed_dir: str, station_name: str) -> str:
- """Define L0A file path.
+def define_l1_filename(ds, campaign_name, station_name: str) -> str:
+ """Define L1 file name.
Parameters
----------
- df : pandas.DataFrame
- L0A DataFrame.
+ ds : xarray.Dataset
+ L1 xarray Dataset
processed_dir : str
- Path of the processed directory.
+ Path of the processed directory
station_name : str
- Name of the station.
+ Name of the station
Returns
-------
str
- L0A file path.
+ L1 file name.
"""
- filename = define_l0a_filename(df=df, processed_dir=processed_dir, station_name=station_name)
- station_dir = define_l0a_station_dir(processed_dir=processed_dir, station_name=station_name)
- filepath = os.path.join(station_dir, filename)
- return filepath
+ from disdrodb import PRODUCT_VERSION
+ from disdrodb.utils.xarray import get_dataset_start_end_time
+
+ # TODO: add sample_interval as argument
+ sample_interval = int(ensure_sample_interval_in_seconds(ds["sample_interval"]).data.item())
+ sample_interval_acronym = define_accumulation_acronym(sample_interval, rolling=False)
+ starting_time, ending_time = get_dataset_start_end_time(ds)
+ starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S")
+ ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S")
+ version = PRODUCT_VERSION
+ filename = (
+ f"L1.{sample_interval_acronym}.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.nc"
+ )
+ return filename
-def define_l0b_filepath(ds: xr.Dataset, processed_dir: str, station_name: str, l0b_concat=False) -> str:
- """Define L0B file path.
+def define_l2e_filename(ds, campaign_name: str, station_name: str, sample_interval: int, rolling: bool) -> str:
+ """Define L2E file name.
Parameters
----------
ds : xarray.Dataset
- L0B xarray Dataset.
+ L2E xarray Dataset
- processed_dir : str
- Path of the processed directory.
+ campaign_name : str
+ Name of the campaign
station_name : str
- ID of the station
- l0b_concat : bool
- If ``False``, the file is specified inside the station directory.
- If ``True``, the file is specified outside the station directory.
+ Name of the station
+ sample_interval : int
+ Sample interval in seconds
+ rolling : bool
+ Whether the product is computed over rolling windows
Returns
-------
str
- L0B file path.
+ L2E file name.
"""
- station_dir = define_l0b_station_dir(processed_dir, station_name)
- filename = define_l0b_filename(ds, processed_dir, station_name)
- if l0b_concat:
- product_dir = os.path.dirname(station_dir)
- filepath = os.path.join(product_dir, filename)
- else:
- filepath = os.path.join(station_dir, filename)
- return filepath
+ from disdrodb import PRODUCT_VERSION
+ from disdrodb.utils.xarray import get_dataset_start_end_time
+ sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling)
+ starting_time, ending_time = get_dataset_start_end_time(ds)
+ starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S")
+ ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S")
+ version = PRODUCT_VERSION
+ filename = (
+ f"L2E.{sample_interval_acronym}.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.nc"
+ )
+ return filename
-####--------------------------------------------------------------------------.
+
+def define_l2m_filename(
+ ds,
+ campaign_name: str,
+ station_name: str,
+ sample_interval: int,
+ rolling: bool,
+ model_name: str,
+) -> str:
+ """Define L2M file name.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ L2M xarray Dataset
+ campaign_name : str
+ Name of the campaign
+ station_name : str
+ Name of the station
+ sample_interval : int
+ Sample interval in seconds
+ rolling : bool
+ Whether the product is computed over rolling windows
+ model_name : str
+ Name of the model
+
+ Returns
+ -------
+ str
+ L2M file name.
+ """
+ from disdrodb import PRODUCT_VERSION
+ from disdrodb.utils.xarray import get_dataset_start_end_time
+
+ sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling)
+ starting_time, ending_time = get_dataset_start_end_time(ds)
+ starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S")
+ ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S")
+ version = PRODUCT_VERSION
+ filename = (
+ f"L2M_{model_name}.{sample_interval_acronym}.{campaign_name}."
+ + f"{station_name}.s{starting_time}.e{ending_time}.{version}.nc"
+ )
+ return filename
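
Illustrative note (not applied by this patch): all of the renamed helpers above compose the file name from the campaign, station, product time range and DISDRODB product version. A minimal sketch of the resulting L2E pattern, assuming define_accumulation_acronym(60, rolling=False) returns "1MIN" and PRODUCT_VERSION is a string such as "V0":

    campaign_name = "CAMPAIGN_NAME"
    station_name = "station_name_1"
    sample_interval_acronym = "1MIN"  # assumed output of define_accumulation_acronym(60, rolling=False)
    starting_time, ending_time = "20230101000000", "20230101235959"  # "%Y%m%d%H%M%S" formatted bounds
    version = "V0"  # assumed PRODUCT_VERSION
    filename = (
        f"L2E.{sample_interval_acronym}.{campaign_name}.{station_name}"
        f".s{starting_time}.e{ending_time}.{version}.nc"
    )
    # -> L2E.1MIN.CAMPAIGN_NAME.station_name_1.s20230101000000.e20230101235959.V0.nc
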
diff --git a/disdrodb/metadata/scripts/disdrodb_check_metadata_archive.py b/disdrodb/cli/disdrodb_check_metadata_archive.py
similarity index 93%
rename from disdrodb/metadata/scripts/disdrodb_check_metadata_archive.py
rename to disdrodb/cli/disdrodb_check_metadata_archive.py
index d3ad0d06..653d145b 100644
--- a/disdrodb/metadata/scripts/disdrodb_check_metadata_archive.py
+++ b/disdrodb/cli/disdrodb_check_metadata_archive.py
@@ -19,7 +19,7 @@
import click
-from disdrodb.utils.scripts import click_base_dir_option, parse_base_dir
+from disdrodb.utils.cli import click_base_dir_option, parse_base_dir
sys.tracebacklimit = 0 # avoid full traceback error if occur
diff --git a/disdrodb/data_transfer/scripts/disdrodb_download_archive.py b/disdrodb/cli/disdrodb_download_archive.py
similarity index 94%
rename from disdrodb/data_transfer/scripts/disdrodb_download_archive.py
rename to disdrodb/cli/disdrodb_download_archive.py
index 04b8d67f..f8efad84 100644
--- a/disdrodb/data_transfer/scripts/disdrodb_download_archive.py
+++ b/disdrodb/cli/disdrodb_download_archive.py
@@ -22,7 +22,7 @@
import click
from disdrodb.data_transfer.download_data import click_download_archive_options, click_download_options
-from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir
+from disdrodb.utils.cli import click_base_dir_option, parse_arg_to_list, parse_base_dir
sys.tracebacklimit = 0 # avoid full traceback error if occur
diff --git a/disdrodb/data_transfer/scripts/disdrodb_download_station.py b/disdrodb/cli/disdrodb_download_station.py
similarity index 96%
rename from disdrodb/data_transfer/scripts/disdrodb_download_station.py
rename to disdrodb/cli/disdrodb_download_station.py
index c13b7e35..52a4d8a4 100644
--- a/disdrodb/data_transfer/scripts/disdrodb_download_station.py
+++ b/disdrodb/cli/disdrodb_download_station.py
@@ -24,7 +24,7 @@
import click
from disdrodb.data_transfer.download_data import click_download_options
-from disdrodb.utils.scripts import click_base_dir_option, click_station_arguments, parse_base_dir
+from disdrodb.utils.cli import click_base_dir_option, click_station_arguments, parse_base_dir
sys.tracebacklimit = 0 # avoid full traceback error if occur
diff --git a/disdrodb/api/scripts/disdrodb_initialize_station.py b/disdrodb/cli/disdrodb_initialize_station.py
similarity index 96%
rename from disdrodb/api/scripts/disdrodb_initialize_station.py
rename to disdrodb/cli/disdrodb_initialize_station.py
index bb36cb80..17e752fe 100644
--- a/disdrodb/api/scripts/disdrodb_initialize_station.py
+++ b/disdrodb/cli/disdrodb_initialize_station.py
@@ -20,7 +20,7 @@
import click
-from disdrodb.utils.scripts import click_base_dir_option, click_station_arguments, parse_base_dir
+from disdrodb.utils.cli import click_base_dir_option, click_station_arguments, parse_base_dir
sys.tracebacklimit = 0 # avoid full traceback error if occur
diff --git a/disdrodb/l0/scripts/disdrodb_run_l0.py b/disdrodb/cli/disdrodb_run_l0.py
similarity index 86%
rename from disdrodb/l0/scripts/disdrodb_run_l0.py
rename to disdrodb/cli/disdrodb_run_l0.py
index 5d035f9a..b857cc89 100644
--- a/disdrodb/l0/scripts/disdrodb_run_l0.py
+++ b/disdrodb/cli/disdrodb_run_l0.py
@@ -20,20 +20,22 @@
import click
-from disdrodb.l0.routines import (
+from disdrodb.utils.cli import (
+ click_base_dir_option,
click_l0_archive_options,
- click_l0_processing_options,
- click_l0_stations_options,
+ click_processing_options,
+ click_stations_options,
+ parse_arg_to_list,
+ parse_base_dir,
)
-from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir
sys.tracebacklimit = 0 # avoid full traceback error if occur
@click.command()
-@click_l0_stations_options
+@click_stations_options
@click_l0_archive_options
-@click_l0_processing_options
+@click_processing_options
@click_base_dir_option
def disdrodb_run_l0(
# L0 disdrodb stations options
@@ -43,7 +45,7 @@ def disdrodb_run_l0(
# L0 archive options
l0a_processing: bool = True,
l0b_processing: bool = True,
- l0b_concat: bool = True,
+ l0c_processing: bool = True,
remove_l0a: bool = False,
remove_l0b: bool = False,
# Processing options
@@ -83,17 +85,14 @@ def disdrodb_run_l0(
l0b_processing : bool
Whether to launch processing to generate L0B netCDF4 file(s) from L0A data.
The default is True.
- l0b_concat : bool
- Whether to concatenate all raw files into a single L0B netCDF file.
- If l0b_concat=True, all raw files will be saved into a single L0B netCDF file.
- If l0b_concat=False, each raw file will be converted into the corresponding L0B netCDF file.
- The default is False.
+ l0c_processing : bool
+ Whether to launch processing to generate L0C netCDF4 file(s) from L0B data.
+ The default is True.
remove_l0a : bool
Whether to keep the L0A files after having generated the L0B netCDF products.
The default is False.
remove_l0b : bool
- Whether to remove the L0B files after having concatenated all L0B netCDF files.
- It takes places only if l0b_concat = True
+ Whether to remove the L0B files after having produced L0C netCDF files.
The default is False.
force : bool
If True, overwrite existing data into destination directories.
@@ -119,7 +118,7 @@ def disdrodb_run_l0(
Format: <...>/DISDRODB
If not specified, uses path specified in the DISDRODB active configuration.
"""
- from disdrodb.l0.routines import run_disdrodb_l0
+ from disdrodb.routines import run_disdrodb_l0
# Parse data_sources, campaign_names and station arguments
base_dir = parse_base_dir(base_dir)
@@ -136,7 +135,7 @@ def disdrodb_run_l0(
# L0 archive options
l0a_processing=l0a_processing,
l0b_processing=l0b_processing,
- l0b_concat=l0b_concat,
+ l0c_processing=l0c_processing,
remove_l0a=remove_l0a,
remove_l0b=remove_l0b,
# Processing options
diff --git a/disdrodb/l0/scripts/disdrodb_run_l0_station.py b/disdrodb/cli/disdrodb_run_l0_station.py
similarity index 85%
rename from disdrodb/l0/scripts/disdrodb_run_l0_station.py
rename to disdrodb/cli/disdrodb_run_l0_station.py
index a197f9fb..166c4d73 100644
--- a/disdrodb/l0/scripts/disdrodb_run_l0_station.py
+++ b/disdrodb/cli/disdrodb_run_l0_station.py
@@ -20,12 +20,10 @@
import click
-from disdrodb.l0.routines import (
- click_l0_archive_options,
- click_l0_processing_options,
-)
-from disdrodb.utils.scripts import (
+from disdrodb.utils.cli import (
click_base_dir_option,
+ click_l0_archive_options,
+ click_processing_options,
click_station_arguments,
parse_base_dir,
)
@@ -38,7 +36,7 @@
@click.command()
@click_station_arguments
-@click_l0_processing_options
+@click_processing_options
@click_l0_archive_options
@click_base_dir_option
def disdrodb_run_l0_station(
@@ -49,7 +47,7 @@ def disdrodb_run_l0_station(
# L0 archive options
l0a_processing: bool = True,
l0b_processing: bool = True,
- l0b_concat: bool = True,
+ l0c_processing: bool = True,
remove_l0a: bool = False,
remove_l0b: bool = False,
# Processing options
@@ -77,18 +75,15 @@ def disdrodb_run_l0_station(
l0b_processing : bool \n
Whether to launch processing to generate L0B netCDF4 file(s) from L0A data.\n
The default is True.\n
- l0b_concat : bool \n
- Whether to concatenate all raw files into a single L0B netCDF file.\n
- If l0b_concat=True, all raw files will be saved into a single L0B netCDF file.\n
- If l0b_concat=False, each raw file will be converted into the corresponding L0B netCDF file.\n
- The default is False.\n
+ l0c_processing : bool \n
+ Whether to launch processing to generate L0C netCDF4 file(s) from L0B data.\n
+ The default is True.\n
remove_l0a : bool \n
Whether to keep the L0A files after having generated the L0B netCDF products.\n
The default is False.\n
- remove_l0b : bool \n
- Whether to remove the L0B files after having concatenated all L0B netCDF files.\n
- It takes places only if l0b_concat=True\n
- The default is False.\n
+ remove_l0b : bool \n
+ Whether to remove the L0B files after having produced L0C netCDF files.\n
+ The default is False.\n
force : bool \n
If True, overwrite existing data into destination directories.\n
If False, raise an error if there are already data into destination directories.\n
@@ -113,7 +108,7 @@ def disdrodb_run_l0_station(
Format: <...>/DISDRODB \n
If not specified, uses path specified in the DISDRODB active configuration. \n
"""
- from disdrodb.l0.routines import run_disdrodb_l0_station
+ from disdrodb.routines import run_disdrodb_l0_station
base_dir = parse_base_dir(base_dir)
@@ -125,7 +120,7 @@ def disdrodb_run_l0_station(
# L0 archive options
l0a_processing=l0a_processing,
l0b_processing=l0b_processing,
- l0b_concat=l0b_concat,
+ l0c_processing=l0c_processing,
remove_l0a=remove_l0a,
remove_l0b=remove_l0b,
# Processing options
diff --git a/disdrodb/l0/scripts/disdrodb_run_l0a.py b/disdrodb/cli/disdrodb_run_l0a.py
similarity index 93%
rename from disdrodb/l0/scripts/disdrodb_run_l0a.py
rename to disdrodb/cli/disdrodb_run_l0a.py
index 5e8121de..73357c26 100644
--- a/disdrodb/l0/scripts/disdrodb_run_l0a.py
+++ b/disdrodb/cli/disdrodb_run_l0a.py
@@ -21,18 +21,20 @@
import click
-from disdrodb.l0.routines import (
- click_l0_processing_options,
- click_l0_stations_options,
+from disdrodb.utils.cli import (
+ click_base_dir_option,
+ click_processing_options,
+ click_stations_options,
+ parse_arg_to_list,
+ parse_base_dir,
)
-from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir
sys.tracebacklimit = 0 # avoid full traceback error if occur
@click.command()
-@click_l0_stations_options
-@click_l0_processing_options
+@click_stations_options
+@click_processing_options
@click_base_dir_option
def disdrodb_run_l0a(
# L0 disdrodb stations options
@@ -90,7 +92,7 @@ def disdrodb_run_l0a(
Format: <...>/DISDRODB
If not specified, uses path specified in the DISDRODB active configuration.
"""
- from disdrodb.l0.routines import run_disdrodb_l0a
+ from disdrodb.routines import run_disdrodb_l0a
# Parse data_sources, campaign_names and station arguments
base_dir = parse_base_dir(base_dir)
diff --git a/disdrodb/l0/scripts/disdrodb_run_l0a_station.py b/disdrodb/cli/disdrodb_run_l0a_station.py
similarity index 82%
rename from disdrodb/l0/scripts/disdrodb_run_l0a_station.py
rename to disdrodb/cli/disdrodb_run_l0a_station.py
index 4f160a06..dacd0fa6 100644
--- a/disdrodb/l0/scripts/disdrodb_run_l0a_station.py
+++ b/disdrodb/cli/disdrodb_run_l0a_station.py
@@ -20,9 +20,9 @@
import click
-from disdrodb.l0.routines import click_l0_processing_options
-from disdrodb.utils.scripts import (
+from disdrodb.utils.cli import (
click_base_dir_option,
+ click_processing_options,
click_station_arguments,
parse_base_dir,
)
@@ -35,7 +35,7 @@
@click.command()
@click_station_arguments
-@click_l0_processing_options
+@click_processing_options
@click_base_dir_option
def disdrodb_run_l0a_station(
# Station arguments
@@ -85,32 +85,15 @@ def disdrodb_run_l0a_station(
Format: <...>/DISDRODB
If not specified, uses path specified in the DISDRODB active configuration.
"""
- import os
-
- import dask
- from dask.distributed import Client, LocalCluster
-
from disdrodb.l0.l0_processing import run_l0a_station
+ from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster
base_dir = parse_base_dir(base_dir)
# -------------------------------------------------------------------------.
# If parallel=True, set the dask environment
if parallel:
- # Set HDF5_USE_FILE_LOCKING to avoid going stuck with HDF
- os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
- # Retrieve the number of process to run
- available_workers = os.cpu_count() - 2 # if not set, all CPUs
- num_workers = dask.config.get("num_workers", available_workers)
- # Create dask.distributed local cluster
- cluster = LocalCluster(
- n_workers=num_workers,
- threads_per_worker=1,
- processes=True,
- # memory_limit='8GB',
- # silence_logs=False,
- )
- Client(cluster)
+ cluster, client = initialize_dask_cluster()
# -------------------------------------------------------------------------.
run_l0a_station(
@@ -129,4 +112,4 @@ def disdrodb_run_l0a_station(
# -------------------------------------------------------------------------.
# Close the cluster
if parallel:
- cluster.close()
+ close_dask_cluster(cluster, client)
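
Illustrative note (not applied by this patch): the new disdrodb.utils.dask helpers are not shown in this diff. A plausible sketch of what they encapsulate, inferred from the inline code they replace above:

    import os

    import dask
    from dask.distributed import Client, LocalCluster

    def initialize_dask_cluster():
        """Create a local dask cluster with one thread per worker (sketch)."""
        # Avoid HDF locking issues when several processes write netCDF files
        os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
        # Use all CPUs but two, unless dask's num_workers option is set
        available_workers = os.cpu_count() - 2
        num_workers = dask.config.get("num_workers", available_workers)
        cluster = LocalCluster(n_workers=num_workers, threads_per_worker=1, processes=True)
        client = Client(cluster)
        return cluster, client

    def close_dask_cluster(cluster, client):
        """Shut down the dask client and its cluster (sketch)."""
        client.close()
        cluster.close()
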
diff --git a/disdrodb/l0/scripts/disdrodb_run_l0b.py b/disdrodb/cli/disdrodb_run_l0b.py
similarity index 93%
rename from disdrodb/l0/scripts/disdrodb_run_l0b.py
rename to disdrodb/cli/disdrodb_run_l0b.py
index 836cc599..b5706c16 100644
--- a/disdrodb/l0/scripts/disdrodb_run_l0b.py
+++ b/disdrodb/cli/disdrodb_run_l0b.py
@@ -21,19 +21,21 @@
import click
-from disdrodb.l0.routines import (
- click_l0_processing_options,
- click_l0_stations_options,
+from disdrodb.utils.cli import (
+ click_base_dir_option,
+ click_processing_options,
click_remove_l0a_option,
+ click_stations_options,
+ parse_arg_to_list,
+ parse_base_dir,
)
-from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir
sys.tracebacklimit = 0 # avoid full traceback error if occur
@click.command()
-@click_l0_stations_options
-@click_l0_processing_options
+@click_stations_options
+@click_processing_options
@click_remove_l0a_option
@click_base_dir_option
def disdrodb_run_l0b(
@@ -93,7 +95,7 @@ def disdrodb_run_l0b(
Format: <...>/DISDRODB
If not specified, uses path specified in the DISDRODB active configuration.
"""
- from disdrodb.l0.routines import run_disdrodb_l0b
+ from disdrodb.routines import run_disdrodb_l0b
# Parse data_sources, campaign_names and station arguments
base_dir = parse_base_dir(base_dir)
diff --git a/disdrodb/l0/scripts/disdrodb_run_l0b_station.py b/disdrodb/cli/disdrodb_run_l0b_station.py
similarity index 82%
rename from disdrodb/l0/scripts/disdrodb_run_l0b_station.py
rename to disdrodb/cli/disdrodb_run_l0b_station.py
index 49462911..297de3ae 100644
--- a/disdrodb/l0/scripts/disdrodb_run_l0b_station.py
+++ b/disdrodb/cli/disdrodb_run_l0b_station.py
@@ -20,9 +20,10 @@
import click
-from disdrodb.l0.routines import click_l0_processing_options, click_remove_l0a_option
-from disdrodb.utils.scripts import (
+from disdrodb.utils.cli import (
click_base_dir_option,
+ click_processing_options,
+ click_remove_l0a_option,
click_station_arguments,
parse_base_dir,
)
@@ -35,7 +36,7 @@
@click.command()
@click_station_arguments
-@click_l0_processing_options
+@click_processing_options
@click_remove_l0a_option
@click_base_dir_option
def disdrodb_run_l0b_station(
@@ -86,32 +87,15 @@ def disdrodb_run_l0b_station(
Format: <...>/DISDRODB
If not specified, uses path specified in the DISDRODB active configuration.
"""
- import os
-
- import dask
- from dask.distributed import Client, LocalCluster
-
from disdrodb.l0.l0_processing import run_l0b_station
+ from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster
base_dir = parse_base_dir(base_dir)
# -------------------------------------------------------------------------.
# If parallel=True, set the dask environment
if parallel:
- # Set HDF5_USE_FILE_LOCKING to avoid going stuck with HDF
- os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
- # Retrieve the number of process to run
- available_workers = os.cpu_count() - 2 # if not set, all CPUs
- num_workers = dask.config.get("num_workers", available_workers)
- # Create dask.distributed local cluster
- cluster = LocalCluster(
- n_workers=num_workers,
- threads_per_worker=1,
- processes=True,
- # memory_limit='8GB',
- # silence_logs=False,
- )
- Client(cluster)
+ cluster, client = initialize_dask_cluster()
# -------------------------------------------------------------------------.
run_l0b_station(
@@ -131,4 +115,4 @@ def disdrodb_run_l0b_station(
# -------------------------------------------------------------------------.
# Close the cluster
if parallel:
- cluster.close()
+ close_dask_cluster(cluster, client)
diff --git a/disdrodb/cli/disdrodb_run_l0c.py b/disdrodb/cli/disdrodb_run_l0c.py
new file mode 100644
index 00000000..40bf2b22
--- /dev/null
+++ b/disdrodb/cli/disdrodb_run_l0c.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Script to run the DISDRODB L0C processing."""
+import sys
+from typing import Optional
+
+import click
+
+from disdrodb.utils.cli import (
+ click_base_dir_option,
+ click_processing_options,
+ click_remove_l0b_option,
+ click_stations_options,
+ parse_arg_to_list,
+ parse_base_dir,
+)
+
+sys.tracebacklimit = 0 # avoid full traceback error if occur
+
+
+@click.command()
+@click_stations_options
+@click_processing_options
+@click_remove_l0b_option
+@click_base_dir_option
+def disdrodb_run_l0c(
+ # L0 disdrodb stations options
+ data_sources: Optional[str] = None,
+ campaign_names: Optional[str] = None,
+ station_names: Optional[str] = None,
+ # L0C processing options
+ remove_l0b: bool = False,
+ # Processing options
+ force: bool = False,
+ verbose: bool = True,
+ parallel: bool = True,
+ debugging_mode: bool = False,
+ base_dir: Optional[str] = None,
+):
+ """
+ Run the L0C processing of DISDRODB stations.
+
+ This function allows launching the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB stations, it runs the processing of the
+ stations matching the provided data_sources, campaign_names and station_names.
+
+ Parameters
+ ----------
+ data_sources : str
+ Name of data source(s) to process.
+ The name(s) must be UPPER CASE.
+ If campaign_names and station_names are not specified, process all stations.
+ To specify multiple data sources, write i.e.: --data_sources 'GPM EPFL NCAR'
+ campaign_names : str
+ Name of the campaign(s) to process.
+ The name(s) must be UPPER CASE.
+ To specify multiple campaigns, write i.e.: --campaign_names 'IPEX IMPACTS'
+ station_names : str
+ Station names.
+ To specify multiple stations, write i.e.: --station_names 'station1 station2'
+ force : bool
+ If True, overwrite existing data into destination directories.
+ If False, raise an error if there are already data into destination directories.
+ The default is False.
+ verbose : bool
+ Whether to print detailed processing information into terminal.
+ The default is True.
+ parallel : bool
+ If True, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread to avoid issues with the HDF/netCDF library.
+ By default, the number of processes is defined with os.cpu_count().
+ However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l0c
+ If False, the files are processed sequentially in a single process.
+ If False, multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If True, it reduces the amount of data to process.
+ It processes just the first 100 rows of 3 L0A files for each station.
+ The default is False.
+ remove_l0b: bool, optional
+ Whether to remove the processed L0B files. The default is ``False``.
+ base_dir : str
+ Base directory of DISDRODB
+ Format: <...>/DISDRODB
+ If not specified, uses path specified in the DISDRODB active configuration.
+ """
+ from disdrodb.routines import run_disdrodb_l0c
+
+ # Parse data_sources, campaign_names and station arguments
+ base_dir = parse_base_dir(base_dir)
+ data_sources = parse_arg_to_list(data_sources)
+ campaign_names = parse_arg_to_list(campaign_names)
+ station_names = parse_arg_to_list(station_names)
+
+ # Run processing
+ run_disdrodb_l0c(
+ base_dir=base_dir,
+ data_sources=data_sources,
+ campaign_names=campaign_names,
+ station_names=station_names,
+ # L0C processing options
+ remove_l0b=remove_l0b,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
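
Illustrative note (not applied by this patch): the command above only parses its options and forwards them to disdrodb.routines.run_disdrodb_l0c, so the same processing can be launched from Python. A hedged example with hypothetical data source, campaign and station names, assuming base_dir falls back to the active DISDRODB configuration when omitted:

    from disdrodb.routines import run_disdrodb_l0c

    run_disdrodb_l0c(
        data_sources=["EPFL"],  # hypothetical data source
        campaign_names=["IPEX"],  # hypothetical campaign
        station_names=["station1", "station2"],  # hypothetical stations
        # L0C processing options
        remove_l0b=False,
        # Processing options
        force=True,
        verbose=True,
        debugging_mode=True,
        parallel=False,
    )
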
diff --git a/disdrodb/cli/disdrodb_run_l0c_station.py b/disdrodb/cli/disdrodb_run_l0c_station.py
new file mode 100644
index 00000000..0e3d699e
--- /dev/null
+++ b/disdrodb/cli/disdrodb_run_l0c_station.py
@@ -0,0 +1,122 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Script to run the DISDRODB L0C station processing."""
+import sys
+from typing import Optional
+
+import click
+
+from disdrodb.utils.cli import (
+ click_base_dir_option,
+ click_processing_options,
+ click_remove_l0b_option,
+ click_station_arguments,
+ parse_base_dir,
+)
+
+sys.tracebacklimit = 0 # avoid full traceback error if occur
+
+# -------------------------------------------------------------------------.
+# Click Command Line Interface decorator
+
+
+@click.command()
+@click_station_arguments
+@click_processing_options
+@click_remove_l0b_option
+@click_base_dir_option
+def disdrodb_run_l0c_station(
+ # Station arguments
+ data_source: str,
+ campaign_name: str,
+ station_name: str,
+ # L0C processing options
+ remove_l0b: bool = False,
+ # Processing options
+ force: bool = False,
+ verbose: bool = True,
+ parallel: bool = True,
+ debugging_mode: bool = False,
+ base_dir: Optional[str] = None,
+):
+ """Run the L0C processing of a specific DISDRODB station from the terminal.
+
+ Parameters
+ ----------
+ data_source : str
+ Institution name (when campaign data spans more than 1 country),
+ or country (when all campaigns (or sensor networks) are inside a given country).
+ Must be UPPER CASE.
+ campaign_name : str
+ Campaign name. Must be UPPER CASE.
+ station_name : str
+ Station name
+ force : bool
+ If True, overwrite existing data into destination directories.
+ If False, raise an error if there are already data into destination directories.
+ The default is False.
+ verbose : bool
+ Whether to print detailed processing information into terminal.
+ The default is True.
+ parallel : bool
+ If True, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread to avoid issues with the HDF/netCDF library.
+ By default, the number of processes is defined with os.cpu_count().
+ However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l0c_station
+ If False, the files are processed sequentially in a single process.
+ If False, multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If True, it reduces the amount of data to process.
+ It processes just the first 100 rows of 3 L0A files.
+ The default is False.
+ remove_l0b: bool, optional
+ Whether to remove the processed L0B files. The default is ``False``.
+ base_dir : str
+ Base directory of DISDRODB
+ Format: <...>/DISDRODB
+ If not specified, uses path specified in the DISDRODB active configuration.
+ """
+ from disdrodb.l0.l0_processing import run_l0c_station
+ from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster
+
+ base_dir = parse_base_dir(base_dir)
+
+ # -------------------------------------------------------------------------.
+ # If parallel=True, set the dask environment
+ if parallel:
+ cluster, client = initialize_dask_cluster()
+
+ # -------------------------------------------------------------------------.
+ run_l0c_station(
+ # Station arguments
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # L0C processing options
+ remove_l0b=remove_l0b,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ base_dir=base_dir,
+ )
+
+ # -------------------------------------------------------------------------.
+ # Close the cluster
+ if parallel:
+ close_dask_cluster(cluster, client)
diff --git a/disdrodb/l0/scripts/disdrodb_run_l0b_concat.py b/disdrodb/cli/disdrodb_run_l1.py
similarity index 60%
rename from disdrodb/l0/scripts/disdrodb_run_l0b_concat.py
rename to disdrodb/cli/disdrodb_run_l1.py
index 46199e54..9458e5a2 100644
--- a/disdrodb/l0/scripts/disdrodb_run_l0b_concat.py
+++ b/disdrodb/cli/disdrodb_run_l1.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
# -----------------------------------------------------------------------------.
# Copyright (c) 2021-2023 DISDRODB developers
#
@@ -14,38 +15,45 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------.
-"""Script to concatenate the DISDRODB L0B files."""
+"""Script to run the DISDRODB L1B processing."""
import sys
from typing import Optional
import click
-from disdrodb.l0.routines import (
- click_l0_stations_options,
- click_l0b_concat_options,
+from disdrodb.utils.cli import (
+ click_base_dir_option,
+ click_processing_options,
+ click_stations_options,
+ parse_arg_to_list,
+ parse_base_dir,
)
-from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir
sys.tracebacklimit = 0 # avoid full traceback error if occur
@click.command()
-@click_l0_stations_options
-@click_l0b_concat_options
+@click_stations_options
+@click_processing_options
@click_base_dir_option
-def disdrodb_run_l0b_concat(
+def disdrodb_run_l1(
+ # Stations options
data_sources: Optional[str] = None,
campaign_names: Optional[str] = None,
station_names: Optional[str] = None,
- remove_l0b: bool = False,
+ # Processing options
+ force: bool = False,
verbose: bool = True,
+ parallel: bool = True,
+ debugging_mode: bool = False,
base_dir: Optional[str] = None,
):
- """Run the L0B concatenation of available DISDRODB stations.
+ """
+ Run the L1 processing of DISDRODB stations.
- This function allow to launch the processing of many DISDRODB stations with a single command.
- From the list of all available DISDRODB stations, it runs the processing of the
- stations matching the provided data_sources, campaign_names and station_names.
+ This function allows launching the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB stations, it runs the processing
+ of the stations matching the provided data_sources, campaign_names and station_names.
Parameters
----------
@@ -61,18 +69,30 @@ def disdrodb_run_l0b_concat(
station_names : str
Station names.
To specify multiple stations, write i.e.: --station_names 'station1 station2'
- remove_l0b : bool
- If true, remove all source L0B files once L0B concatenation is terminated.
+ force : bool
+ If True, overwrite existing data into destination directories.
+ If False, raise an error if there are already data into destination directories.
The default is False.
verbose : bool
Whether to print detailed processing information into terminal.
The default is False.
+ parallel : bool
+ If True, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread.
+ By default, the number of processes is defined with os.cpu_count().
+ However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l1
+ If False, the files are processed sequentially in a single process.
+ If False, multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If True, it reduces the amount of data to process.
+ It processes just the first 3 raw data files for each station.
+ The default is False.
base_dir : str
Base directory of DISDRODB
Format: <...>/DISDRODB
If not specified, uses path specified in the DISDRODB active configuration.
"""
- from disdrodb.l0.routines import run_disdrodb_l0b_concat
+ from disdrodb.l1.routines import run_disdrodb_l1
# Parse data_sources, campaign_names and station arguments
base_dir = parse_base_dir(base_dir)
@@ -80,12 +100,15 @@ def disdrodb_run_l0b_concat(
campaign_names = parse_arg_to_list(campaign_names)
station_names = parse_arg_to_list(station_names)
- # Run concatenation
- run_disdrodb_l0b_concat(
+ # Run processing
+ run_disdrodb_l1(
base_dir=base_dir,
data_sources=data_sources,
campaign_names=campaign_names,
station_names=station_names,
- remove_l0b=remove_l0b,
+ # Processing options
+ force=force,
verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
)
diff --git a/disdrodb/l0/scripts/disdrodb_run_l0b_concat_station.py b/disdrodb/cli/disdrodb_run_l1_station.py
similarity index 51%
rename from disdrodb/l0/scripts/disdrodb_run_l0b_concat_station.py
rename to disdrodb/cli/disdrodb_run_l1_station.py
index 8da14e10..b91e8c18 100644
--- a/disdrodb/l0/scripts/disdrodb_run_l0b_concat_station.py
+++ b/disdrodb/cli/disdrodb_run_l1_station.py
@@ -14,40 +14,43 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------.
-"""Script to concatenate the DISDRODB L0B station files."""
-##################################################
-## Wrapper to concat L0B files by command lines ##
-##################################################
+"""Script to run the DISDRODB L1 station processing."""
import sys
from typing import Optional
import click
-from disdrodb.l0.routines import click_l0b_concat_options
-from disdrodb.utils.scripts import (
+from disdrodb.utils.cli import (
click_base_dir_option,
+ click_processing_options,
click_station_arguments,
parse_base_dir,
)
sys.tracebacklimit = 0 # avoid full traceback error if occur
+# -------------------------------------------------------------------------.
+# Click Command Line Interface decorator
+
@click.command()
@click_station_arguments
-@click_l0b_concat_options
+@click_processing_options
@click_base_dir_option
-def disdrodb_run_l0b_concat_station(
+def disdrodb_run_l1_station(
# Station arguments
data_source: str,
campaign_name: str,
station_name: str,
- # L0B concat options
- remove_l0b=False,
- verbose=True,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ parallel: bool = True,
+ debugging_mode: bool = False,
base_dir: Optional[str] = None,
):
- """Concatenation all L0B files of a specific DISDRODB station into a single netCDF.
+ """
+ Run the L1 processing of a specific DISDRODB station from the terminal.
Parameters
----------
@@ -59,28 +62,54 @@ def disdrodb_run_l0b_concat_station(
Campaign name. Must be UPPER CASE.
station_name : str
Station name
- remove_l0b : bool
- If true, remove all source L0B files once L0B concatenation is terminated.
+ force : bool
+ If True, overwrite existing data into destination directories.
+ If False, raise an error if there are already data into destination directories.
The default is False.
verbose : bool
Whether to print detailed processing information into terminal.
+ The default is True.
+ parallel : bool
+ If True, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread.
+ By default, the number of processes is defined with os.cpu_count().
+ However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l1_station
+ If False, the files are processed sequentially in a single process.
+ If False, multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If True, it reduces the amount of data to process.
+ It processes just the first 3 raw data files.
The default is False.
base_dir : str
- Base directory of DISDRODB
+ Base directory of DISDRODB.
Format: <...>/DISDRODB
If not specified, uses path specified in the DISDRODB active configuration.
"""
- from disdrodb.l0.l0_processing import run_l0b_concat_station
+ from disdrodb.l1.routines import run_l1_station
+ from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster
base_dir = parse_base_dir(base_dir)
- run_l0b_concat_station(
+ # -------------------------------------------------------------------------.
+ # If parallel=True, set the dask environment
+ if parallel:
+ cluster, client = initialize_dask_cluster()
+
+ # -------------------------------------------------------------------------.
+ run_l1_station(
# Station arguments
data_source=data_source,
campaign_name=campaign_name,
station_name=station_name,
# Processing options
- remove_l0b=remove_l0b,
+ force=force,
verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
base_dir=base_dir,
)
+
+ # -------------------------------------------------------------------------.
+ # Close the cluster
+ if parallel:
+ close_dask_cluster(cluster, client)
diff --git a/disdrodb/cli/disdrodb_run_l2e.py b/disdrodb/cli/disdrodb_run_l2e.py
new file mode 100644
index 00000000..8026d7f7
--- /dev/null
+++ b/disdrodb/cli/disdrodb_run_l2e.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Script to run the DISDRODB L2E processing."""
+import sys
+from typing import Optional
+
+import click
+
+from disdrodb.utils.cli import (
+ click_base_dir_option,
+ click_processing_options,
+ click_stations_options,
+ parse_arg_to_list,
+ parse_base_dir,
+)
+
+sys.tracebacklimit = 0 # avoid full traceback error if occur
+
+
+@click.command()
+@click_stations_options
+@click_processing_options
+@click_base_dir_option
+def disdrodb_run_l2e(
+ # Stations options
+ data_sources: Optional[str] = None,
+ campaign_names: Optional[str] = None,
+ station_names: Optional[str] = None,
+ # Processing options
+ force: bool = False,
+ verbose: bool = True,
+ parallel: bool = True,
+ debugging_mode: bool = False,
+ base_dir: Optional[str] = None,
+):
+ """
+ Run the L2E processing of DISDRODB stations.
+
+ This function allows launching the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB stations, it runs the processing
+ of the stations matching the provided data_sources, campaign_names and station_names.
+
+ Parameters
+ ----------
+ data_sources : str
+ Name of data source(s) to process.
+ The name(s) must be UPPER CASE.
+ If campaign_names and station_names are not specified, process all stations.
+ To specify multiple data sources, write i.e.: --data_sources 'GPM EPFL NCAR'
+ campaign_names : str
+ Name of the campaign(s) to process.
+ The name(s) must be UPPER CASE.
+ To specify multiple campaigns, write i.e.: --campaign_names 'IPEX IMPACTS'
+ station_names : str
+ Station names.
+ To specify multiple stations, write i.e.: --station_names 'station1 station2'
+ force : bool
+ If True, overwrite existing data into destination directories.
+ If False, raise an error if there are already data into destination directories.
+ The default is False.
+ verbose : bool
+ Whether to print detailed processing information into terminal.
+ The default is False.
+ parallel : bool
+ If True, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread.
+ By default, the number of processes is defined with os.cpu_count().
+ However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l2e
+ If False, the files are processed sequentially in a single process.
+ If False, multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If True, it reduces the amount of data to process.
+ It processes just the first 3 raw data files for each station.
+ The default is False.
+ base_dir : str
+ Base directory of DISDRODB
+ Format: <...>/DISDRODB
+ If not specified, uses path specified in the DISDRODB active configuration.
+ """
+ from disdrodb.routines import run_disdrodb_l2e
+
+ # Parse data_sources, campaign_names and station arguments
+ base_dir = parse_base_dir(base_dir)
+ data_sources = parse_arg_to_list(data_sources)
+ campaign_names = parse_arg_to_list(campaign_names)
+ station_names = parse_arg_to_list(station_names)
+
+ # Run processing
+ run_disdrodb_l2e(
+ base_dir=base_dir,
+ data_sources=data_sources,
+ campaign_names=campaign_names,
+ station_names=station_names,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
diff --git a/disdrodb/cli/disdrodb_run_l2e_station.py b/disdrodb/cli/disdrodb_run_l2e_station.py
new file mode 100644
index 00000000..0fb5a81f
--- /dev/null
+++ b/disdrodb/cli/disdrodb_run_l2e_station.py
@@ -0,0 +1,115 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Script to run the DISDRODB L2E station processing."""
+import sys
+from typing import Optional
+
+import click
+
+from disdrodb.utils.cli import (
+ click_base_dir_option,
+ click_processing_options,
+ click_station_arguments,
+ parse_base_dir,
+)
+
+sys.tracebacklimit = 0 # avoid full traceback error if occur
+
+# -------------------------------------------------------------------------.
+# Click Command Line Interface decorator
+
+
+@click.command()
+@click_station_arguments
+@click_processing_options
+@click_base_dir_option
+def disdrodb_run_l2e_station(
+ # Station arguments
+ data_source: str,
+ campaign_name: str,
+ station_name: str,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ parallel: bool = True,
+ debugging_mode: bool = False,
+ base_dir: Optional[str] = None,
+):
+ """
+ Run the L2E processing of a specific DISDRODB station from the terminal.
+
+ Parameters
+ ----------
+ data_source : str
+ Institution name (when campaign data spans more than 1 country),
+ or country (when all campaigns (or sensor networks) are inside a given country).
+ Must be UPPER CASE.
+ campaign_name : str
+ Campaign name. Must be UPPER CASE.
+ station_name : str
+ Station name
+ force : bool
+ If True, overwrite existing data into destination directories.
+ If False, raise an error if there are already data into destination directories.
+ The default is False.
+ verbose : bool
+ Whether to print detailed processing information into terminal.
+ The default is True.
+ parallel : bool
+ If True, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread.
+ By default, the number of processes is defined with os.cpu_count().
+ However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l2e_station
+ If False, the files are processed sequentially in a single process.
+ If False, multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If True, it reduces the amount of data to process.
+ It processes just the first 3 raw data files.
+ The default is False.
+ base_dir : str
+ Base directory of DISDRODB.
+ Format: <...>/DISDRODB
+ If not specified, uses path specified in the DISDRODB active configuration.
+ """
+ from disdrodb.l2.routines import run_l2e_station
+ from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster
+
+ base_dir = parse_base_dir(base_dir)
+
+ # -------------------------------------------------------------------------.
+ # If parallel=True, set the dask environment
+ if parallel:
+ cluster, client = initialize_dask_cluster()
+
+ # -------------------------------------------------------------------------.
+ run_l2e_station(
+ # Station arguments
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ base_dir=base_dir,
+ )
+
+ # -------------------------------------------------------------------------.
+ # Close the cluster
+ if parallel:
+ close_dask_cluster(cluster, client)
diff --git a/disdrodb/cli/disdrodb_run_l2m.py b/disdrodb/cli/disdrodb_run_l2m.py
new file mode 100644
index 00000000..ca00c71a
--- /dev/null
+++ b/disdrodb/cli/disdrodb_run_l2m.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Script to run the DISDRODB L2M processing."""
+import sys
+from typing import Optional
+
+import click
+
+from disdrodb.utils.cli import (
+ click_base_dir_option,
+ click_processing_options,
+ click_stations_options,
+ parse_arg_to_list,
+ parse_base_dir,
+)
+
+sys.tracebacklimit = 0 # avoid full traceback error if occur
+
+
+@click.command()
+@click_stations_options
+@click_processing_options
+@click_base_dir_option
+def disdrodb_run_l2m(
+ # Stations options
+ data_sources: Optional[str] = None,
+ campaign_names: Optional[str] = None,
+ station_names: Optional[str] = None,
+ # Processing options
+ force: bool = False,
+ verbose: bool = True,
+ parallel: bool = True,
+ debugging_mode: bool = False,
+ base_dir: Optional[str] = None,
+):
+ """
+ Run the L2M processing of DISDRODB stations.
+
+ This function allows launching the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB stations, it runs the processing
+ of the stations matching the provided data_sources, campaign_names and station_names.
+
+ Parameters
+ ----------
+ data_sources : str
+ Name of data source(s) to process.
+ The name(s) must be UPPER CASE.
+ If campaign_names and station_names are not specified, process all stations.
+ To specify multiple data sources, write i.e.: --data_sources 'GPM EPFL NCAR'
+ campaign_names : str
+ Name of the campaign(s) to process.
+ The name(s) must be UPPER CASE.
+ To specify multiple campaigns, write i.e.: --campaign_names 'IPEX IMPACTS'
+ station_names : str
+ Station names.
+ To specify multiple stations, write i.e.: --station_names 'station1 station2'
+ force : bool
+ If True, overwrite existing data into destination directories.
+ If False, raise an error if there are already data into destination directories.
+ The default is False.
+ verbose : bool
+ Whether to print detailed processing information into terminal.
+ The default is False.
+ parallel : bool
+ If True, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread.
+ By default, the number of processes is defined with os.cpu_count().
+ However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l2m
+ If False, the files are processed sequentially in a single process.
+ If False, multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If True, it reduces the amount of data to process.
+ It processes just the first 3 raw data files for each station.
+ The default is False.
+ base_dir : str
+ Base directory of DISDRODB
+ Format: <...>/DISDRODB
+ If not specified, uses path specified in the DISDRODB active configuration.
+ """
+ from disdrodb.routines import run_disdrodb_l2m
+
+ # Parse data_sources, campaign_names and station arguments
+ base_dir = parse_base_dir(base_dir)
+ data_sources = parse_arg_to_list(data_sources)
+ campaign_names = parse_arg_to_list(campaign_names)
+ station_names = parse_arg_to_list(station_names)
+
+ # Run processing
+ run_disdrodb_l2m(
+ base_dir=base_dir,
+ data_sources=data_sources,
+ campaign_names=campaign_names,
+ station_names=station_names,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
diff --git a/disdrodb/cli/disdrodb_run_l2m_station.py b/disdrodb/cli/disdrodb_run_l2m_station.py
new file mode 100644
index 00000000..3e1ed86f
--- /dev/null
+++ b/disdrodb/cli/disdrodb_run_l2m_station.py
@@ -0,0 +1,115 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Script to run the DISDRODB L2M station processing."""
+import sys
+from typing import Optional
+
+import click
+
+from disdrodb.utils.cli import (
+ click_base_dir_option,
+ click_processing_options,
+ click_station_arguments,
+ parse_base_dir,
+)
+
+sys.tracebacklimit = 0 # avoid full traceback error if occur
+
+# -------------------------------------------------------------------------.
+# Click Command Line Interface decorator
+
+
+@click.command()
+@click_station_arguments
+@click_processing_options
+@click_base_dir_option
+def disdrodb_run_l2m_station(
+ # Station arguments
+ data_source: str,
+ campaign_name: str,
+ station_name: str,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ parallel: bool = True,
+ debugging_mode: bool = False,
+ base_dir: Optional[str] = None,
+):
+ """
+ Run the L2M processing of a specific DISDRODB station from the terminal.
+
+ Parameters
+ ----------
+ data_source : str
+ Institution name (when campaign data spans more than 1 country),
+ or country (when all campaigns (or sensor networks) are inside a given country).
+ Must be UPPER CASE.
+ campaign_name : str
+ Campaign name. Must be UPPER CASE.
+ station_name : str
+ Station name
+ force : bool
+ If True, overwrite existing data into destination directories.
+ If False, raise an error if there are already data into destination directories.
+ The default is False.
+ verbose : bool
+ Whether to print detailed processing information into terminal.
+ The default is True.
+ parallel : bool
+ If True, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread.
+ By default, the number of processes is defined with os.cpu_count().
+ However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l2m_station
+ If False, the files are processed sequentially in a single process.
+ If False, multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If True, it reduces the amount of data to process.
+ It processes just the first 3 raw data files.
+ The default is False.
+ base_dir : str
+ Base directory of DISDRODB.
+ Format: <...>/DISDRODB
+ If not specified, uses path specified in the DISDRODB active configuration.
+ """
+ from disdrodb.l2.routines import run_l2m_station
+ from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster
+
+ base_dir = parse_base_dir(base_dir)
+
+ # -------------------------------------------------------------------------.
+ # If parallel=True, set the dask environment
+ if parallel:
+ cluster, client = initialize_dask_cluster()
+
+ # -------------------------------------------------------------------------.
+ run_l2m_station(
+ # Station arguments
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ base_dir=base_dir,
+ )
+
+ # -------------------------------------------------------------------------.
+ # Close the cluster
+ if parallel:
+ close_dask_cluster(cluster, client)
diff --git a/disdrodb/data_transfer/scripts/disdrodb_upload_archive.py b/disdrodb/cli/disdrodb_upload_archive.py
similarity index 97%
rename from disdrodb/data_transfer/scripts/disdrodb_upload_archive.py
rename to disdrodb/cli/disdrodb_upload_archive.py
index 0107169d..97bf8d28 100644
--- a/disdrodb/data_transfer/scripts/disdrodb_upload_archive.py
+++ b/disdrodb/cli/disdrodb_upload_archive.py
@@ -24,7 +24,7 @@
import click
from disdrodb.data_transfer.upload_data import click_upload_archive_options, click_upload_options
-from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir
+from disdrodb.utils.cli import click_base_dir_option, parse_arg_to_list, parse_base_dir
sys.tracebacklimit = 0 # avoid full traceback error if occur
diff --git a/disdrodb/data_transfer/scripts/disdrodb_upload_station.py b/disdrodb/cli/disdrodb_upload_station.py
similarity index 96%
rename from disdrodb/data_transfer/scripts/disdrodb_upload_station.py
rename to disdrodb/cli/disdrodb_upload_station.py
index 754a8f7e..188ff5c9 100644
--- a/disdrodb/data_transfer/scripts/disdrodb_upload_station.py
+++ b/disdrodb/cli/disdrodb_upload_station.py
@@ -24,7 +24,7 @@
import click
from disdrodb.data_transfer.upload_data import click_upload_options
-from disdrodb.utils.scripts import click_base_dir_option, click_station_arguments, parse_base_dir
+from disdrodb.utils.cli import click_base_dir_option, click_station_arguments, parse_base_dir
sys.tracebacklimit = 0 # avoid full traceback error if occur
diff --git a/disdrodb/data_transfer/download_data.py b/disdrodb/data_transfer/download_data.py
index a5c03f52..8b6ea512 100644
--- a/disdrodb/data_transfer/download_data.py
+++ b/disdrodb/data_transfer/download_data.py
@@ -207,9 +207,7 @@ def download_station(
def _is_valid_disdrodb_data_url(disdrodb_data_url):
"""Check if it is a valid disdrodb_data_url."""
- if isinstance(disdrodb_data_url, str) and len(disdrodb_data_url) > 10:
- return True
- return False
+ return isinstance(disdrodb_data_url, str) and len(disdrodb_data_url) > 10
def _has_disdrodb_data_url(metadata_filepath):
diff --git a/disdrodb/issue/checks.py b/disdrodb/issue/checks.py
index b28aafc6..8ac4ebb0 100644
--- a/disdrodb/issue/checks.py
+++ b/disdrodb/issue/checks.py
@@ -35,8 +35,7 @@ def _is_numpy_array_string(arr):
arr : numpy array
Numpy array to check.
"""
- dtype = arr.dtype.type
- return dtype in (np.str_, np.unicode_)
+ return np.issubdtype(arr.dtype, np.str_)
def _is_numpy_array_datetime(arr):
@@ -52,7 +51,7 @@ def _is_numpy_array_datetime(arr):
numpy array
Numpy array checked.
"""
- return arr.dtype.type == np.datetime64
+ return np.issubdtype(arr.dtype, np.datetime64)
def _check_timestep_datetime_accuracy(timesteps, unit="s"):
diff --git a/disdrodb/l0/__init__.py b/disdrodb/l0/__init__.py
index cc1b9f11..bbd54d92 100644
--- a/disdrodb/l0/__init__.py
+++ b/disdrodb/l0/__init__.py
@@ -3,7 +3,7 @@
run_l0b_from_nc,
)
from disdrodb.l0.l0_reader import available_readers
-from disdrodb.l0.routines import (
+from disdrodb.routines import (
run_disdrodb_l0,
run_disdrodb_l0_station,
run_disdrodb_l0a,
diff --git a/disdrodb/l0/configs/OTT_Parsivel/l0b_encodings.yml b/disdrodb/l0/configs/OTT_Parsivel/l0b_encodings.yml
index d8b04830..87c77956 100644
--- a/disdrodb/l0/configs/OTT_Parsivel/l0b_encodings.yml
+++ b/disdrodb/l0/configs/OTT_Parsivel/l0b_encodings.yml
@@ -34,7 +34,7 @@ weather_code_synop_4677:
_FillValue: 255
weather_code_metar_4678:
dtype: str
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -42,7 +42,7 @@ weather_code_metar_4678:
chunksizes: 5000
weather_code_nws:
dtype: str
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -103,7 +103,7 @@ sensor_temperature:
_FillValue: 255
sensor_serial_number:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -111,7 +111,7 @@ sensor_serial_number:
chunksizes: 5000
firmware_iop:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -119,7 +119,7 @@ firmware_iop:
chunksizes: 5000
firmware_dsp:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -152,7 +152,7 @@ sensor_status:
_FillValue: 255
start_time:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -160,7 +160,7 @@ start_time:
chunksizes: 5000
sensor_time:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -168,7 +168,7 @@ sensor_time:
chunksizes: 5000
sensor_date:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -176,7 +176,7 @@ sensor_date:
chunksizes: 5000
station_name:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -184,7 +184,7 @@ station_name:
chunksizes: 5000
station_number:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
diff --git a/disdrodb/l0/configs/OTT_Parsivel/raw_data_format.yml b/disdrodb/l0/configs/OTT_Parsivel/raw_data_format.yml
index 192e7e2a..459764af 100644
--- a/disdrodb/l0/configs/OTT_Parsivel/raw_data_format.yml
+++ b/disdrodb/l0/configs/OTT_Parsivel/raw_data_format.yml
@@ -294,7 +294,7 @@ raw_drop_average_velocity:
data_range: null
nan_flags: null
dimension_order:
- - velocity_bin_center
+ - diameter_bin_center
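+    # raw_drop_average_velocity reports one average velocity per diameter class,
+    # hence the diameter dimension (assumed rationale for this fix).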
n_values: 32
field_number: "91"
raw_drop_number:
diff --git a/disdrodb/l0/configs/OTT_Parsivel2/l0b_encodings.yml b/disdrodb/l0/configs/OTT_Parsivel2/l0b_encodings.yml
index 88d4a6ed..fbb8c1e1 100644
--- a/disdrodb/l0/configs/OTT_Parsivel2/l0b_encodings.yml
+++ b/disdrodb/l0/configs/OTT_Parsivel2/l0b_encodings.yml
@@ -15,14 +15,14 @@ rainfall_accumulated_32bit:
contiguous: false
chunksizes: 5000
weather_code_synop_4680:
- dtype: uint32
+ dtype: uint8
zlib: true
complevel: 3
shuffle: true
fletcher32: false
contiguous: false
chunksizes: 5000
- _FillValue: 4294967295
+ _FillValue: 255
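+  # Assumed rationale: the fill value must fit the declared dtype;
+  # 255 is the uint8 maximum, whereas 4294967295 was the uint32 maximum.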
weather_code_synop_4677:
dtype: uint32
zlib: true
@@ -34,7 +34,7 @@ weather_code_synop_4677:
_FillValue: 4294967295
weather_code_metar_4678:
dtype: str
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -42,7 +42,7 @@ weather_code_metar_4678:
chunksizes: 5000
weather_code_nws:
dtype: str
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -103,7 +103,7 @@ sensor_temperature:
_FillValue: 127
sensor_serial_number:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -111,7 +111,7 @@ sensor_serial_number:
chunksizes: 5000
firmware_iop:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -119,7 +119,7 @@ firmware_iop:
chunksizes: 5000
firmware_dsp:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -152,7 +152,7 @@ sensor_status:
_FillValue: 255
start_time:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -160,7 +160,7 @@ start_time:
chunksizes: 5000
sensor_time:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -168,7 +168,7 @@ sensor_time:
chunksizes: 5000
sensor_date:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -176,7 +176,7 @@ sensor_date:
chunksizes: 5000
station_name:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -184,7 +184,7 @@ station_name:
chunksizes: 5000
station_number:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -293,7 +293,7 @@ number_particles_all:
_FillValue: 4294967295
list_particles:
dtype: object
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
diff --git a/disdrodb/l0/configs/OTT_Parsivel2/raw_data_format.yml b/disdrodb/l0/configs/OTT_Parsivel2/raw_data_format.yml
index 5c271483..baf965ca 100644
--- a/disdrodb/l0/configs/OTT_Parsivel2/raw_data_format.yml
+++ b/disdrodb/l0/configs/OTT_Parsivel2/raw_data_format.yml
@@ -364,7 +364,7 @@ raw_drop_average_velocity:
data_range: null
nan_flags: null
dimension_order:
- - velocity_bin_center
+ - diameter_bin_center
n_values: 32
field_number: "91"
raw_drop_number:
diff --git a/disdrodb/l0/configs/RD_80/l0a_encodings.yml b/disdrodb/l0/configs/RD_80/l0a_encodings.yml
index 775d637b..c4399588 100644
--- a/disdrodb/l0/configs/RD_80/l0a_encodings.yml
+++ b/disdrodb/l0/configs/RD_80/l0a_encodings.yml
@@ -1,5 +1,5 @@
sensor_status: "float32" # 'int8'
-interval: "float32" # 'uint8'
+sample_interval: "float32" # 'uint8'
RI: "float32"
RA: "float32"
RAT: "float32"
diff --git a/disdrodb/l0/configs/RD_80/l0b_cf_attrs.yml b/disdrodb/l0/configs/RD_80/l0b_cf_attrs.yml
index 95baceae..14b515c2 100644
--- a/disdrodb/l0/configs/RD_80/l0b_cf_attrs.yml
+++ b/disdrodb/l0/configs/RD_80/l0b_cf_attrs.yml
@@ -2,7 +2,7 @@ sensor_status:
description: Sensor status
long_name: Sensor status
units: ""
-interval:
+sample_interval:
description: Time interval between measurement
long_name: Time interval between measurement
units: s
diff --git a/disdrodb/l0/configs/RD_80/l0b_encodings.yml b/disdrodb/l0/configs/RD_80/l0b_encodings.yml
index bc6a63a0..3ae873cb 100644
--- a/disdrodb/l0/configs/RD_80/l0b_encodings.yml
+++ b/disdrodb/l0/configs/RD_80/l0b_encodings.yml
@@ -7,7 +7,7 @@ sensor_status:
contiguous: false
chunksizes: 5000
_FillValue: 255
-interval:
+sample_interval:
dtype: uint8
zlib: true
complevel: 3
diff --git a/disdrodb/l0/configs/RD_80/raw_data_format.yml b/disdrodb/l0/configs/RD_80/raw_data_format.yml
index 0b1cf856..3f82e120 100644
--- a/disdrodb/l0/configs/RD_80/raw_data_format.yml
+++ b/disdrodb/l0/configs/RD_80/raw_data_format.yml
@@ -25,7 +25,7 @@ sensor_status:
- 0
- 1
field_number: "03"
-interval:
+sample_interval:
n_digits: 4
n_characters: 4
n_decimals: 4
diff --git a/disdrodb/l0/configs/Thies_LPM/l0b_encodings.yml b/disdrodb/l0/configs/Thies_LPM/l0b_encodings.yml
index ce1f50be..5851af0b 100644
--- a/disdrodb/l0/configs/Thies_LPM/l0b_encodings.yml
+++ b/disdrodb/l0/configs/Thies_LPM/l0b_encodings.yml
@@ -18,7 +18,7 @@ device_address:
_FillValue: 255
# sensor_serial_number:
# dtype: uint16
-# zlib: true
+# zlib: false
# complevel: 3
# shuffle: true
# fletcher32: false
@@ -27,7 +27,7 @@ device_address:
# _FillValue: 65535
# software_version:
# dtype: float32
-# zlib: true
+# zlib: false
# complevel: 3
# shuffle: true
# fletcher32: false
@@ -35,7 +35,7 @@ device_address:
# chunksizes: 5000
# sensor_date:
# dtype: object
-# zlib: true
+# zlib: false
# complevel: 3
# shuffle: true
# fletcher32: false
@@ -43,7 +43,7 @@ device_address:
# chunksizes: 5000
# sensor_time:
# dtype: object
-# zlib: true
+# zlib: false
# complevel: 3
# shuffle: true
# fletcher32: false
@@ -69,7 +69,7 @@ weather_code_synop_4680_5min:
_FillValue: 255
weather_code_metar_4678_5min:
dtype: str
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
@@ -103,7 +103,7 @@ weather_code_synop_4680:
_FillValue: 255
weather_code_metar_4678:
dtype: str
- zlib: true
+ zlib: false
complevel: 3
shuffle: true
fletcher32: false
diff --git a/disdrodb/l0/io.py b/disdrodb/l0/io.py
index d3a5f141..5a05e6a1 100644
--- a/disdrodb/l0/io.py
+++ b/disdrodb/l0/io.py
@@ -23,7 +23,7 @@
import pandas as pd
-from disdrodb.api.path import define_l0a_station_dir
+from disdrodb.api.io import filter_filepaths
from disdrodb.utils.directories import list_files
from disdrodb.utils.logger import log_info
@@ -101,14 +101,6 @@ def _get_available_filepaths(raw_dir, station_name, glob_patterns):
return filepaths
-def _filter_filepaths(filepaths, debugging_mode):
- """Filter out filepaths if ``debugging_mode=True``."""
- if debugging_mode:
- max_files = min(3, len(filepaths))
- filepaths = filepaths[0:max_files]
- return filepaths
-
-
def get_raw_filepaths(raw_dir, station_name, glob_patterns, verbose=False, debugging_mode=False):
"""Get the list of files from a directory based on input parameters.
@@ -122,7 +114,7 @@ def get_raw_filepaths(raw_dir, station_name, glob_patterns, verbose=False, debug
Directory of the campaign where to search for files.
        Format <...>/DISDRODB/Raw/<DATA_SOURCE>/<CAMPAIGN_NAME>
station_name : str
- ID of the station
+ Name of the station.
verbose : bool, optional
Whether to verbose the processing.
The default is ``False``.
@@ -141,7 +133,7 @@ def get_raw_filepaths(raw_dir, station_name, glob_patterns, verbose=False, debug
filepaths = _get_available_filepaths(raw_dir=raw_dir, station_name=station_name, glob_patterns=glob_patterns)
# Filter out filepaths if debugging_mode=True
- filepaths = _filter_filepaths(filepaths, debugging_mode)
+ filepaths = filter_filepaths(filepaths, debugging_mode)
# Log number of files to process
n_files = len(filepaths)
@@ -153,40 +145,6 @@ def get_raw_filepaths(raw_dir, station_name, glob_patterns, verbose=False, debug
return filepaths
-def get_l0a_filepaths(processed_dir, station_name, debugging_mode=False):
- """Retrieve L0A files for a give station.
-
- Parameters
- ----------
- processed_dir : str
- Directory of the campaign where to search for the L0A files.
- Format: ``<..>/DISDRODB/Processed//``.
- station_name : str
- ID of the station
- debugging_mode : bool, optional
- If ``True``, it select maximum 3 files for debugging purposes.
- The default is ``False``.
-
- Returns
- -------
- filepaths : list
- List of L0A file paths.
-
- """
- station_dir = define_l0a_station_dir(processed_dir, station_name)
- filepaths = list_files(station_dir, glob_pattern="*.parquet", recursive=True)
-
- # Filter out filepaths if debugging_mode=True
- filepaths = _filter_filepaths(filepaths, debugging_mode=debugging_mode)
-
- # If no file available, raise error
- if len(filepaths) == 0:
- msg = f"No L0A Apache Parquet file is available in {station_dir}. Run L0A processing first."
- raise ValueError(msg)
-
- return filepaths
-
-
####--------------------------------------------------------------------------.
#### DISDRODB L0A product reader
@@ -243,14 +201,20 @@ def read_l0a_dataframe(
if isinstance(filepaths, str):
filepaths = [filepaths]
# ---------------------------------------------------
- # - If debugging_mode=True, it reads only the first 3 filepaths
+ # If debugging_mode=True, it reads only the first 3 filepaths
if debugging_mode:
filepaths = filepaths[0:3] # select first 3 filepaths
- # - Define the list of dataframe
+ # ---------------------------------------------------
+ # Define the list of dataframe
list_df = [_read_l0a(filepath, verbose=verbose, debugging_mode=debugging_mode) for filepath in filepaths]
- # - Concatenate dataframe
+
+ # Concatenate dataframe
df = concatenate_dataframe(list_df, verbose=verbose)
+
+ # Ensure time is in nanoseconds
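+    # ("M8[ns]" is NumPy shorthand for datetime64 with nanosecond resolution,
+    #  the precision that pandas and xarray conventionally use)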
+ df["time"] = df["time"].astype("M8[ns]")
+
# ---------------------------------------------------
# Return dataframe
return df
diff --git a/disdrodb/l0/l0_processing.py b/disdrodb/l0/l0_processing.py
index d126c7af..408d3e15 100644
--- a/disdrodb/l0/l0_processing.py
+++ b/disdrodb/l0/l0_processing.py
@@ -19,130 +19,114 @@
"""Implement DISDRODB L0 processing."""
import datetime
-import functools
import logging
import os
-import shutil
import time
from typing import Optional
import dask
-import dask.bag as db
-import xarray as xr
from disdrodb.api.checks import check_sensor_name
# Directory
from disdrodb.api.create_directories import (
- create_directory_structure,
create_l0_directory_structure,
+ create_logs_directory,
+ create_product_directory,
)
-from disdrodb.api.info import infer_path_info_dict
+from disdrodb.api.info import infer_path_info_tuple
+from disdrodb.api.io import get_filepaths, get_required_product, remove_product
from disdrodb.api.path import (
define_campaign_dir,
- define_l0a_filepath,
- define_l0b_filepath,
- define_l0b_station_dir,
- define_station_dir,
- get_disdrodb_path,
+ define_l0a_filename,
+ define_l0b_filename,
+ define_l0c_filename,
+ define_metadata_filepath,
)
+
+# get_disdrodb_path,
from disdrodb.configs import get_base_dir
from disdrodb.issue import read_station_issue
from disdrodb.l0.io import (
- get_l0a_filepaths,
get_raw_filepaths,
read_l0a_dataframe,
)
from disdrodb.l0.l0_reader import get_station_reader_function
+from disdrodb.l0.l0a_processing import (
+ process_raw_file,
+ write_l0a,
+)
+from disdrodb.l0.l0b_nc_processing import create_l0b_from_raw_nc
+from disdrodb.l0.l0b_processing import (
+ create_l0b_from_l0a,
+ set_l0b_encodings,
+ write_l0b,
+)
+from disdrodb.l0.l0c_processing import (
+ create_daily_file,
+ get_files_per_days,
+ retrieve_possible_measurement_intervals,
+)
from disdrodb.metadata import read_station_metadata
-from disdrodb.utils.directories import list_files
+from disdrodb.utils.decorator import delayed_if_parallel, single_threaded_if_parallel
# Logger
from disdrodb.utils.logger import (
close_logger,
- create_file_logger,
- define_summary_log,
+ create_logger_file,
+ create_product_logs,
log_error,
log_info,
- log_warning,
)
+# log_warning,
+from disdrodb.utils.writer import write_product
+from disdrodb.utils.yaml import read_yaml
+
logger = logging.getLogger(__name__)
# -----------------------------------------------------------------------------.
#### Creation of L0A and L0B Single Station File
-def _delayed_based_on_kwargs(function):
- """Decorator to make the function delayed if its ``parallel`` argument is ``True``."""
-
- @functools.wraps(function)
- def wrapper(*args, **kwargs):
- # Check if it must be a delayed function
- parallel = kwargs.get("parallel")
- # If parallel is True
- if parallel:
- # Enforce verbose to be False
- kwargs["verbose"] = False
- # Define the delayed task
- result = dask.delayed(function)(*args, **kwargs)
- else:
- # Else run the function
- result = function(*args, **kwargs)
- return result
-
- return wrapper
-
-
-@_delayed_based_on_kwargs
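+# Note: the two decorators below are assumed to replace the removed _delayed_based_on_kwargs helper:
+# delayed_if_parallel wraps the call in dask.delayed when parallel=True (enforcing verbose=False),
+# while single_threaded_if_parallel is assumed to force single-threaded execution inside each task.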
+@delayed_if_parallel
+@single_threaded_if_parallel
def _generate_l0a(
filepath,
- processed_dir,
- station_name, # retrievable from filepath
+ data_dir,
+ logs_dir,
+ campaign_name,
+ station_name,
+ # Reader arguments
column_names,
reader_kwargs,
df_sanitizer_fun,
+ # Processing info
+ sensor_name,
+ issue_dict,
+ # Processing options
force,
verbose,
parallel,
- issue_dict=None,
):
"""Generate L0A file from raw file."""
- from disdrodb.l0.l0a_processing import (
- process_raw_file,
- write_l0a,
- )
+ # Define product
+ product = "L0A"
##------------------------------------------------------------------------.
# Create file logger
- if issue_dict is None:
- issue_dict = {}
filename = os.path.basename(filepath)
- logger = create_file_logger(
- processed_dir=processed_dir,
- product="L0A",
- station_name=station_name,
+ logger, logger_filepath = create_logger_file(
+ logs_dir=logs_dir,
filename=filename,
parallel=parallel,
)
- # Define logger filepath
- # - LogCaptureHandler of pytest does not have baseFilename attribute --> So set None
- logger_filepath = logger.handlers[0].baseFilename if not os.environ.get("PYTEST_CURRENT_TEST") else None
-
##------------------------------------------------------------------------.
# Log start processing
- msg = f"L0A processing of {filename} has started."
+ msg = f"{product} processing of {filename} has started."
log_info(logger=logger, msg=msg, verbose=verbose)
- ##------------------------------------------------------------------------.
- # Retrieve metadata
- attrs = read_station_metadata(station_name=station_name, product="L0A", **infer_path_info_dict(processed_dir))
-
- # Retrieve sensor name
- sensor_name = attrs["sensor_name"]
- check_sensor_name(sensor_name)
-
##------------------------------------------------------------------------.
try:
#### - Read raw file into a dataframe and sanitize to L0A format
@@ -158,7 +142,8 @@ def _generate_l0a(
##--------------------------------------------------------------------.
#### - Write to Parquet
- filepath = define_l0a_filepath(df=df, processed_dir=processed_dir, station_name=station_name)
+ filename = define_l0a_filename(df=df, campaign_name=campaign_name, station_name=station_name)
+ filepath = os.path.join(data_dir, filename)
write_l0a(df=df, filepath=filepath, force=force, verbose=verbose)
##--------------------------------------------------------------------.
@@ -166,7 +151,7 @@ def _generate_l0a(
del df
# Log end processing
- msg = f"L0A processing of {filename} has ended."
+ msg = f"{product} processing of {filename} has ended."
log_info(logger=logger, msg=msg, verbose=verbose)
# Otherwise log the error
@@ -182,58 +167,57 @@ def _generate_l0a(
return logger_filepath
+@delayed_if_parallel
+@single_threaded_if_parallel
def _generate_l0b(
filepath,
- processed_dir, # retrievable from filepath
- station_name, # retrievable from filepath
+ data_dir,
+ logs_dir,
+ campaign_name,
+ station_name,
+ # Processing info
+ metadata,
+ # Processing options
force,
verbose,
- debugging_mode,
parallel,
+ debugging_mode,
):
- from disdrodb.l0.l0b_processing import (
- create_l0b_from_l0a,
- write_l0b,
- )
+ # Define product
+ product = "L0B"
# -----------------------------------------------------------------.
# Create file logger
filename = os.path.basename(filepath)
- logger = create_file_logger(
- processed_dir=processed_dir,
- product="L0B",
- station_name=station_name,
+ logger, logger_filepath = create_logger_file(
+ logs_dir=logs_dir,
filename=filename,
parallel=parallel,
)
- # Define logger filepath
- # - LogCaptureHandler of pytest does not have baseFilename attribute --> So set None
- logger_filepath = logger.handlers[0].baseFilename if not os.environ.get("PYTEST_CURRENT_TEST") else None
##------------------------------------------------------------------------.
# Log start processing
- msg = f"L0B processing of {filename} has started."
+ msg = f"{product} processing of {filename} has started."
log_info(logger, msg, verbose=verbose)
##------------------------------------------------------------------------.
- # Retrieve metadata
- attrs = read_station_metadata(station_name=station_name, product="L0A", **infer_path_info_dict(processed_dir))
-
# Retrieve sensor name
- sensor_name = attrs["sensor_name"]
+ sensor_name = metadata["sensor_name"]
check_sensor_name(sensor_name)
##------------------------------------------------------------------------.
try:
# Read L0A Apache Parquet file
df = read_l0a_dataframe(filepath, verbose=verbose, debugging_mode=debugging_mode)
+
# -----------------------------------------------------------------.
# Create xarray Dataset
- ds = create_l0b_from_l0a(df=df, attrs=attrs, verbose=verbose)
+ ds = create_l0b_from_l0a(df=df, attrs=metadata, verbose=verbose)
# -----------------------------------------------------------------.
# Write L0B netCDF4 dataset
- filepath = define_l0b_filepath(ds, processed_dir, station_name)
+ filename = define_l0b_filename(ds=ds, campaign_name=campaign_name, station_name=station_name)
+ filepath = os.path.join(data_dir, filename)
write_l0b(ds, filepath=filepath, force=force)
##--------------------------------------------------------------------.
@@ -241,7 +225,7 @@ def _generate_l0b(
del ds, df
# Log end processing
- msg = f"L0B processing of {filename} has ended."
+ msg = f"{product} processing of {filename} has ended."
log_info(logger, msg, verbose=verbose)
# Otherwise log the error
@@ -259,43 +243,43 @@ def _generate_l0b(
def _generate_l0b_from_nc(
filepath,
- processed_dir,
- station_name, # retrievable from filepath
+ data_dir,
+ logs_dir,
+ campaign_name,
+ station_name,
+ # Processing info
+ metadata,
+ # Reader arguments
dict_names,
ds_sanitizer_fun,
+ # Processing options
force,
verbose,
parallel,
):
- from disdrodb.l0.l0b_nc_processing import create_l0b_from_raw_nc
- from disdrodb.l0.l0b_processing import write_l0b
+ import xarray as xr # Load in each process
+
+ # -----------------------------------------------------------------.
+ # Define product name
+ product = "L0B"
# -----------------------------------------------------------------.
# Create file logger
filename = os.path.basename(filepath)
- logger = create_file_logger(
- processed_dir=processed_dir,
- product="L0B",
- station_name=station_name,
+ logger, logger_filepath = create_logger_file(
+ logs_dir=logs_dir,
filename=filename,
parallel=parallel,
)
- # Define logger filepath
- # - LogCaptureHandler of pytest does not have baseFilename attribute --> So set None
- logger_filepath = logger.handlers[0].baseFilename if not os.environ.get("PYTEST_CURRENT_TEST") else None
-
##------------------------------------------------------------------------.
# Log start processing
- msg = f"L0B processing of {filename} has started."
+ msg = f"{product} processing of {filename} has started."
log_info(logger, msg, verbose=verbose)
##------------------------------------------------------------------------.
- # Retrieve metadata
- attrs = read_station_metadata(station_name=station_name, product="L0A", **infer_path_info_dict(processed_dir))
-
# Retrieve sensor name
- sensor_name = attrs["sensor_name"]
+ sensor_name = metadata["sensor_name"]
check_sensor_name(sensor_name)
##------------------------------------------------------------------------.
@@ -311,11 +295,12 @@ def _generate_l0b_from_nc(
ds_sanitizer_fun=ds_sanitizer_fun,
sensor_name=sensor_name,
verbose=verbose,
- attrs=attrs,
+ attrs=metadata,
)
# -----------------------------------------------------------------.
# Write L0B netCDF4 dataset
- filepath = define_l0b_filepath(ds, processed_dir, station_name)
+ filename = define_l0b_filename(ds=ds, campaign_name=campaign_name, station_name=station_name)
+ filepath = os.path.join(data_dir, filename)
write_l0b(ds, filepath=filepath, force=force)
##--------------------------------------------------------------------.
@@ -339,6 +324,96 @@ def _generate_l0b_from_nc(
return logger_filepath
+@delayed_if_parallel
+@single_threaded_if_parallel
+def _generate_l0c(
+ day,
+ filepaths,
+ data_dir,
+ logs_dir,
+ metadata_filepath,
+ campaign_name,
+ station_name,
+ # Processing options
+ force,
+ verbose,
+    parallel,  # used only to initialize the correct logger
+):
+ # -----------------------------------------------------------------.
+ # Define product name
+ product = "L0C"
+
+ # -----------------------------------------------------------------.
+ # Create file logger
+ logger, logger_filepath = create_logger_file(
+ logs_dir=logs_dir,
+ filename=day,
+ parallel=parallel,
+ )
+
+ ##------------------------------------------------------------------------.
+ # Log start processing
+ msg = f"{product} processing for {day} has started."
+ log_info(logger, msg, verbose=verbose)
+
+ ##------------------------------------------------------------------------.
+ ### Core computation
+ try:
+ # Retrieve measurement_intervals
+        # - TODO: in the future, retrieve the measurement intervals from the dataset itself
+ metadata = read_yaml(metadata_filepath)
+ measurement_intervals = retrieve_possible_measurement_intervals(metadata)
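+        # (hypothetical example: [30, 60] seconds if the station changed its sampling rate over time)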
+
+ # Produce L0C datasets
+ dict_ds = create_daily_file(
+ day=day,
+ filepaths=filepaths,
+ measurement_intervals=measurement_intervals,
+ ensure_variables_equality=True,
+ logger=logger,
+ verbose=verbose,
+ )
+
+ # Write a dataset for each sample interval
+ for ds in dict_ds.values(): # (sample_interval, ds)
+ # Write L0C netCDF4 dataset
+ if ds["time"].size > 1:
+                # Retrieve sensor, campaign and station names from the dataset attributes
+ sensor_name = ds.attrs.get("sensor_name")
+ campaign_name = ds.attrs.get("campaign_name")
+ station_name = ds.attrs.get("station_name")
+
+ # Set encodings
+ ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name)
+
+ # Define filepath
+ filename = define_l0c_filename(ds, campaign_name=campaign_name, station_name=station_name)
+ filepath = os.path.join(data_dir, filename)
+
+ # Write to disk
+ write_product(ds, product=product, filepath=filepath, force=force)
+
+ # Clean environment
+ del ds
+
+ # Log end processing
+ msg = f"{product} processing for {day} has ended."
+ log_info(logger, msg, verbose=verbose)
+
+ ##--------------------------------------------------------------------.
+ # Otherwise log the error
+ except Exception as e:
+ error_type = str(type(e).__name__)
+ msg = f"{error_type}: {e}"
+ log_error(logger, msg, verbose=verbose)
+
+ # Close the file logger
+ close_logger(logger)
+
+ # Return the logger file path
+ return logger_filepath
+
+
####------------------------------------------------------------------------.
#### Creation of L0A and L0B Single Station Files
@@ -414,19 +489,22 @@ def run_l0a(
Default is ``False``.
"""
+ # Define product name
+ product = "L0A"
+
# ------------------------------------------------------------------------.
# Start L0A processing
if verbose:
t_i = time.time()
- msg = f"L0A processing of station {station_name} has started."
+ msg = f"{product} processing of station {station_name} has started."
log_info(logger=logger, msg=msg, verbose=verbose)
# ------------------------------------------------------------------------.
# Create directory structure
- create_l0_directory_structure(
+ data_dir = create_l0_directory_structure(
raw_dir=raw_dir,
processed_dir=processed_dir,
- product="L0A",
+ product=product,
station_name=station_name,
force=force,
)
@@ -443,9 +521,40 @@ def run_l0a(
debugging_mode=debugging_mode,
)
+ # -------------------------------------------------------------------------.
+ # Retrieve DISDRODB path components
+ base_dir, data_source, campaign_name = infer_path_info_tuple(raw_dir)
+
+ # -------------------------------------------------------------------------.
+ # Define logs directory
+ logs_dir = create_logs_directory(
+ product=product,
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+
# -----------------------------------------------------------------.
# Read issue YAML file
- issue_dict = read_station_issue(station_name=station_name, **infer_path_info_dict(raw_dir))
+ issue_dict = read_station_issue(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+
+ ##------------------------------------------------------------------------.
+ # Read metadata
+ metadata = read_station_metadata(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+ # Retrieve sensor name
+ sensor_name = metadata["sensor_name"]
+ check_sensor_name(sensor_name)
# -----------------------------------------------------------------.
# Generate L0A files
@@ -454,12 +563,16 @@ def run_l0a(
list_tasks = [
_generate_l0a(
filepath=filepath,
- processed_dir=processed_dir,
+ data_dir=data_dir,
+ logs_dir=logs_dir,
+ campaign_name=campaign_name,
station_name=station_name,
- # L0A reader argument
+ # Reader argument
column_names=column_names,
reader_kwargs=reader_kwargs,
df_sanitizer_fun=df_sanitizer_fun,
+ # Processing info
+ sensor_name=sensor_name,
issue_dict=issue_dict,
# Processing options
force=force,
@@ -471,149 +584,24 @@ def run_l0a(
list_logs = dask.compute(*list_tasks) if parallel else list_tasks
# -----------------------------------------------------------------.
# Define L0A summary logs
- define_summary_log(list_logs)
+ create_product_logs(
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=base_dir,
+ # Logs list
+ list_logs=list_logs,
+ )
# ---------------------------------------------------------------------.
# End L0A processing
if verbose:
- timedelta_str = str(datetime.timedelta(seconds=time.time() - t_i))
+ timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i)))
msg = f"L0A processing of station {station_name} completed in {timedelta_str}"
log_info(logger=logger, msg=msg, verbose=verbose)
-def run_l0b(
- processed_dir,
- station_name,
- # Processing options
- parallel,
- force,
- verbose,
- debugging_mode,
-):
- """
- Run the L0B processing for a specific DISDRODB station.
-
- Parameters
- ----------
- raw_dir : str
- The directory path where all the raw content of a specific campaign is stored.
- The path must have the following structure: ``<...>/DISDRODB/Raw//``.
- Inside the ``raw_dir`` directory, it is required to adopt the following structure::
-
- - ``/data//``
- - ``/metadata/.yml``
-
- **Important points:**
-
- - For each ````, there must be a corresponding YAML file in the metadata subdirectory.
- - The ``campaign_name`` are expected to be UPPER CASE.
- - The ```` must semantically match between:
- - the ``raw_dir`` and ``processed_dir`` directory paths;
- - with the key ``campaign_name`` within the metadata YAML files.
- processed_dir : str
- The desired directory path for the processed DISDRODB L0A and L0B products.
- The path should have the following structure: ``<...>/DISDRODB/Processed//``.
- For testing purposes, this function exceptionally accepts also a directory path simply ending
- with ```` (e.g., ``/tmp/``).
- station_name : str
- The name of the station.
- force : bool, optional
- If ``True``, overwrite existing data in destination directories.
- If ``False``, raise an error if data already exists in destination directories.
- Default is ``False``.
- verbose : bool, optional
- If ``True``, print detailed processing information to the terminal.
- Default is ``True``.
- parallel : bool, optional
- If ``True``, process the files simultaneously in multiple processes.
- The number of simultaneous processes can be customized using the ``dask.distributed.LocalCluster``.
- Ensure that the ``threads_per_worker`` (number of thread per process) is set to 1 to avoid HDF errors.
- Also, ensure to set the ``HDF5_USE_FILE_LOCKING`` environment variable to ``False``.
- If ``False``, process the files sequentially in a single process.
- Default is ``False``.
- debugging_mode : bool, optional
- If ``True``, reduce the amount of data to process.
- Only the first 3 raw data files will be processed.
- Default is ``False``.
-
- """
- # -----------------------------------------------------------------.
- # Retrieve metadata
- attrs = read_station_metadata(station_name=station_name, product="L0A", **infer_path_info_dict(processed_dir))
-
- # Skip run_l0b processing if the raw data are netCDFs
- if attrs["raw_data_format"] == "netcdf":
- return
-
- # -----------------------------------------------------------------.
- # Start L0B processing
- if verbose:
- t_i = time.time()
- msg = f"L0B processing of station_name {station_name} has started."
- log_info(logger=logger, msg=msg, verbose=verbose)
-
- # -------------------------------------------------------------------------.
- # Create directory structure
- create_directory_structure(
- processed_dir=processed_dir,
- product="L0B",
- station_name=station_name,
- force=force,
- )
-
- ##----------------------------------------------------------------.
- # Get L0A files for the station
- filepaths = get_l0a_filepaths(
- processed_dir=processed_dir,
- station_name=station_name,
- debugging_mode=debugging_mode,
- )
-
- # -----------------------------------------------------------------.
- # Generate L0B files
- # Loop over the L0A files and save the L0B netCDF files.
- # - If parallel=True, it does that in parallel using dask.bag
- # Settings npartitions=len(filepaths) enable to wait prior task on a core
- # finish before starting a new one.
- if not parallel:
- list_logs = [
- _generate_l0b(
- filepath=filepath,
- processed_dir=processed_dir,
- station_name=station_name,
- force=force,
- verbose=verbose,
- debugging_mode=debugging_mode,
- parallel=parallel,
- )
- for filepath in filepaths
- ]
-
- else:
- bag = db.from_sequence(filepaths, npartitions=len(filepaths))
- list_logs = bag.map(
- _generate_l0b,
- processed_dir=processed_dir,
- station_name=station_name,
- force=force,
- verbose=verbose,
- debugging_mode=debugging_mode,
- parallel=parallel,
- ).compute()
-
- # -----------------------------------------------------------------.
- # Define L0B summary logs
- define_summary_log(list_logs)
-
- # -----------------------------------------------------------------.
- # End L0B processing
- if verbose:
- timedelta_str = str(datetime.timedelta(seconds=time.time() - t_i))
- msg = f"L0B processing of station_name {station_name} completed in {timedelta_str}"
- log_info(logger=logger, msg=msg, verbose=verbose)
- return
-
-
def run_l0b_from_nc(
raw_dir,
processed_dir,
@@ -694,8 +682,11 @@ def run_l0b_from_nc(
Default is ``False``.
"""
+ # Define product name
+ product = "L0B"
+
# ------------------------------------------------------------------------.
- # Start L0A processing
+ # Start L0B NC processing
if verbose:
t_i = time.time()
msg = f"L0B processing of station {station_name} has started."
@@ -703,14 +694,36 @@ def run_l0b_from_nc(
# ------------------------------------------------------------------------.
# Create directory structure
- create_l0_directory_structure(
+ data_dir = create_l0_directory_structure(
raw_dir=raw_dir,
processed_dir=processed_dir,
- product="L0B",
+ product=product,
station_name=station_name,
force=force,
)
+ # -------------------------------------------------------------------------.
+ # Retrieve DISDRODB path components
+ base_dir, data_source, campaign_name = infer_path_info_tuple(processed_dir)
+
+ # Define logs directory
+ logs_dir = create_logs_directory(
+ product=product,
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+
+ # -----------------------------------------------------------------.
+ # Retrieve metadata
+ metadata = read_station_metadata(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+
# -------------------------------------------------------------------------.
# List files to process
filepaths = get_raw_filepaths(
@@ -729,28 +742,15 @@ def run_l0b_from_nc(
# - If parallel=True, it does that in parallel using dask.bag
# Settings npartitions=len(filepaths) enable to wait prior task on a core
# finish before starting a new one.
- if not parallel:
- list_logs = [
- _generate_l0b_from_nc(
- filepath=filepath,
- processed_dir=processed_dir,
- station_name=station_name,
- # Reader arguments
- dict_names=dict_names,
- ds_sanitizer_fun=ds_sanitizer_fun,
- # Processing options
- force=force,
- verbose=verbose,
- parallel=parallel,
- )
- for filepath in filepaths
- ]
- else:
- bag = db.from_sequence(filepaths, npartitions=len(filepaths))
- list_logs = bag.map(
- _generate_l0b_from_nc,
- processed_dir=processed_dir,
+ list_tasks = [
+ _generate_l0b_from_nc(
+ filepath=filepath,
+ data_dir=data_dir,
+ logs_dir=logs_dir,
+ campaign_name=campaign_name,
station_name=station_name,
+ # Processing info
+ metadata=metadata,
# Reader arguments
dict_names=dict_names,
ds_sanitizer_fun=ds_sanitizer_fun,
@@ -758,78 +758,70 @@ def run_l0b_from_nc(
force=force,
verbose=verbose,
parallel=parallel,
- ).compute()
+ )
+ for filepath in filepaths
+ ]
+ list_logs = dask.compute(*list_tasks) if parallel else list_tasks
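+    # When parallel=True the tasks above are dask.delayed objects and are computed here;
+    # otherwise list_tasks already holds the log file paths returned by each call.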
+
+ # if not parallel:
+ # list_logs = [
+ # _generate_l0b_from_nc(
+ # filepath=filepath,
+ # data_dir=data_dir,
+ # logs_dir=logs_dir,
+ # campaign_name=campaign_name,
+ # station_name=station_name,
+ # # Processing info
+ # metadata=metadata,
+ # # Reader arguments
+ # dict_names=dict_names,
+ # ds_sanitizer_fun=ds_sanitizer_fun,
+ # # Processing options
+ # force=force,
+ # verbose=verbose,
+ # parallel=parallel,
+ # )
+ # for filepath in filepaths
+ # ]
+ # else:
+ # bag = db.from_sequence(filepaths, npartitions=len(filepaths))
+ # list_logs = bag.map(
+ # _generate_l0b_from_nc,
+ # data_dir=data_dir,
+ # logs_dir=logs_dir,
+ # campaign_name=campaign_name,
+ # station_name=station_name,
+ # # Processing info
+ # metadata=metadata,
+ # # Reader arguments
+ # dict_names=dict_names,
+ # ds_sanitizer_fun=ds_sanitizer_fun,
+ # # Processing options
+ # force=force,
+ # verbose=verbose,
+ # parallel=parallel,
+ # ).compute()
# -----------------------------------------------------------------.
# Define L0B summary logs
- define_summary_log(list_logs)
+ create_product_logs(
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=base_dir,
+ # Logs list
+ list_logs=list_logs,
+ )
# ---------------------------------------------------------------------.
# End L0B processing
if verbose:
- timedelta_str = str(datetime.timedelta(seconds=time.time() - t_i))
+ timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i)))
msg = f"L0B processing of station {station_name} completed in {timedelta_str}"
log_info(logger=logger, msg=msg, verbose=verbose)
-def run_l0b_concat(processed_dir, station_name, verbose=False):
- """Concatenate all L0B netCDF files into a single netCDF file.
-
- The single netCDF file is saved at ``/L0B``.
- """
- from disdrodb.l0.l0b_processing import write_l0b
- from disdrodb.utils.netcdf import xr_concat_datasets
-
- # Create logger
- filename = f"concatenatation_{station_name}"
- logger = create_file_logger(
- processed_dir=processed_dir,
- product="L0B",
- station_name="", # locate outside the station directory
- filename=filename,
- parallel=False,
- )
-
- # -------------------------------------------------------------------------.
- # Retrieve L0B files
- station_dir = define_l0b_station_dir(processed_dir, station_name)
- filepaths = list_files(station_dir, glob_pattern="*.nc", recursive=True)
- filepaths = sorted(filepaths)
-
- # -------------------------------------------------------------------------.
- # Check there are at least two files
- n_files = len(filepaths)
- if n_files == 0:
- msg = f"No L0B file is available for concatenation in {station_dir}."
- log_error(logger=logger, msg=msg, verbose=False)
- raise ValueError(msg)
-
- if n_files == 1:
- msg = f"Only a single file is available for concatenation in {station_dir}."
- log_warning(logger=logger, msg=msg, verbose=verbose)
-
- # -------------------------------------------------------------------------.
- # Concatenate the files
- ds = xr_concat_datasets(filepaths)
-
- # -------------------------------------------------------------------------.
- # Define the filepath of the concatenated L0B netCDF
- single_nc_filepath = define_l0b_filepath(ds, processed_dir, station_name, l0b_concat=True)
- force = True # TODO add as argument
- write_l0b(ds, filepath=single_nc_filepath, force=force)
-
- # -------------------------------------------------------------------------.
- # Close file and delete
- ds.close()
- del ds
-
- # -------------------------------------------------------------------------.
- # Close the file logger
- close_logger(logger)
-
- # Return the dataset
-
-
####--------------------------------------------------------------------------.
#### DISDRODB Station Functions
@@ -880,7 +872,10 @@ def run_l0a_station(
The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
If not specified, the path specified in the DISDRODB active configuration will be used.
"""
+ # Define base directory
base_dir = get_base_dir(base_dir)
+
+ # Retrieve reader
reader = get_station_reader_function(
base_dir=base_dir,
data_source=data_source,
@@ -901,8 +896,8 @@ def run_l0a_station(
campaign_name=campaign_name,
)
# Run L0A processing
- # --> The reader call the run_l0a within the custom defined reader function
- # --> For the special case of raw netCDF data, it calls the run_l0b_from_nc function
+    # --> The reader calls run_l0a when the raw data are text files and
+    #     run_l0b_from_nc when the raw data are netCDF files.
reader(
raw_dir=raw_dir,
processed_dir=processed_dir,
@@ -920,12 +915,13 @@ def run_l0b_station(
data_source,
campaign_name,
station_name,
+ # L0B processing options
+ remove_l0a: bool = False,
# Processing options
force: bool = False,
verbose: bool = True,
parallel: bool = True,
debugging_mode: bool = False,
- remove_l0a: bool = False,
base_dir: Optional[str] = None,
):
"""
@@ -957,58 +953,207 @@ def run_l0b_station(
and multi-threading will be automatically exploited to speed up I/O tasks.
debugging_mode : bool, optional
If ``True``, the amount of data processed will be reduced.
- Only the first 100 rows of 3 L0A files will be processed. By default, ``False``.
+ Only the first 100 rows of 3 L0A files will be processed. The default is ``False``.
+ remove_l0a: bool, optional
+ Whether to remove the processed L0A files. The default is ``False``.
base_dir : str, optional
The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
If not specified, the path specified in the DISDRODB active configuration will be used.
"""
- # Define campaign processed dir
+ # Define product name
+ product = "L0B"
+
+ # Retrieve DISDRODB base directory
base_dir = get_base_dir(base_dir)
- processed_dir = get_disdrodb_path(
+
+ # -----------------------------------------------------------------.
+ # Retrieve metadata
+ metadata = read_station_metadata(
base_dir=base_dir,
- product="L0B",
data_source=data_source,
campaign_name=campaign_name,
- check_exists=False,
+ station_name=station_name,
)
- # Run L0B
- run_l0b(
- processed_dir=processed_dir,
+
+ # Skip run_l0b processing if the raw data are netCDFs
+    # - In that case, the L0B files are produced directly by the L0A routine.
+ if metadata["raw_data_format"] == "netcdf":
+ return
+
+ # -----------------------------------------------------------------.
+ # Start L0B processing
+ if verbose:
+ t_i = time.time()
+        msg = f"{product} processing of station {station_name} has started."
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+ # Define logs directory
+ logs_dir = create_logs_directory(
+ product=product,
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
station_name=station_name,
- # Processing options
+ )
+
+ # -------------------------------------------------------------------------.
+ # Create product directory
+ data_dir = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product=product,
force=force,
- verbose=verbose,
- debugging_mode=debugging_mode,
- parallel=parallel,
)
+ ##----------------------------------------------------------------.
+ # Get L0A files for the station
+ required_product = get_required_product(product)
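+    # (for the L0B product, the required input product is expected to be "L0A")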
+ flag_not_available_data = False
+ try:
+ filepaths = get_filepaths(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product=required_product,
+ debugging_mode=debugging_mode,
+ )
+ except Exception as e:
+ print(str(e)) # Case where no file paths available
+ flag_not_available_data = True
+
+ # -------------------------------------------------------------------------.
+ # If no data available, print error message and return None
+ if flag_not_available_data:
+ msg = (
+            f"{product} processing of {data_source} {campaign_name} {station_name} "
+            + f"has not been launched because of missing {required_product} data."
+ )
+ print(msg)
+ return
+
+ ##----------------------------------------------------------------.
+ # Generate L0B files
+ # Loop over the L0A files and save the L0B netCDF files.
+    # - If parallel=True, it does that in parallel using dask.delayed
+    # BUG: If debugging_mode=True and parallel=True, a subtle bug can currently occur when
+    # two processes working on subsetted L0A files try to create the same L0B file!
+ list_tasks = [
+ _generate_l0b(
+ filepath=filepath,
+ data_dir=data_dir,
+ logs_dir=logs_dir,
+ metadata=metadata,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+ for filepath in filepaths
+ ]
+ list_logs = dask.compute(*list_tasks) if parallel else list_tasks
+ # if not parallel:
+ # list_logs = [
+ # _generate_l0b(
+ # filepath=filepath,
+ # data_dir=data_dir,
+ # logs_dir=logs_dir,
+ # metadata=metadata,
+ # campaign_name=campaign_name,
+ # station_name=station_name,
+ # force=force,
+ # verbose=verbose,
+ # debugging_mode=debugging_mode,
+ # parallel=parallel,
+ # )
+ # for filepath in filepaths
+ # ]
+
+ # else:
+ # bag = db.from_sequence(filepaths, npartitions=len(filepaths))
+ # list_logs = bag.map(
+ # _generate_l0b,
+ # data_dir=data_dir,
+ # logs_dir=logs_dir,
+ # metadata=metadata,
+ # campaign_name=campaign_name,
+ # station_name=station_name,
+ # force=force,
+ # verbose=verbose,
+ # debugging_mode=debugging_mode,
+ # parallel=parallel,
+ # ).compute()
+
+ # -----------------------------------------------------------------.
+ # Define L0B summary logs
+ create_product_logs(
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=base_dir,
+ # Logs list
+ list_logs=list_logs,
+ )
+
+ # -----------------------------------------------------------------.
+ # End L0B processing
+ if verbose:
+ timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i)))
+        msg = f"{product} processing of station {station_name} completed in {timedelta_str}"
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+ # -----------------------------------------------------------------.
+ # Option to remove L0A
if remove_l0a:
- station_dir = define_station_dir(
+ remove_product(
base_dir=base_dir,
product="L0A",
data_source=data_source,
campaign_name=campaign_name,
station_name=station_name,
+ logger=logger,
+ verbose=verbose,
)
- log_info(logger=logger, msg="Removal of single L0A files started.", verbose=verbose)
- shutil.rmtree(station_dir)
- log_info(logger=logger, msg="Removal of single L0A files ended.", verbose=verbose)
-def run_l0b_concat_station(
+def run_l0c_station(
# Station arguments
data_source,
campaign_name,
station_name,
- # L0B concat options
- remove_l0b=False,
- verbose=True,
+ # L0C processing options
+ remove_l0b: bool = False,
+ # Processing options
+ force: bool = False,
+ verbose: bool = True,
+ parallel: bool = True,
+ debugging_mode: bool = False,
base_dir: Optional[str] = None,
):
- """Define the L0B file concatenation of a station.
+ """
+ Run the L0C processing of a specific DISDRODB station when invoked from the terminal.
+
+ The DISDRODB L0A and L0B routines just convert source raw data into netCDF format.
+ The DISDRODB L0C routine ingests L0B files and performs data homogenization.
+ The DISDRODB L0C routine takes care of:
+
+ - removing duplicated timesteps across files,
+ - merging/splitting files into daily files,
+    - regularizing timesteps to remove potential trailing seconds,
+    - ensuring that each L0C file has a unique sample interval.
- This function is intended to be called through the ``disdrodb_run_l0b_concat station``
+    Duplicated timesteps are automatically dropped if their variable values coincide,
+ otherwise an error is raised.
+
+ This function is intended to be called through the ``disdrodb_run_l0c_station``
command-line interface.
Parameters
@@ -1021,42 +1166,151 @@ def run_l0b_concat_station(
The name of the campaign. Must be provided in UPPER CASE.
station_name : str
The name of the station.
+ force : bool, optional
+ If ``True``, existing data in the destination directories will be overwritten.
+ If ``False`` (default), an error will be raised if data already exists in the destination directories.
verbose : bool, optional
If ``True`` (default), detailed processing information will be printed to the terminal.
If ``False``, less information will be displayed.
+ parallel : bool, optional
+ If ``True``, files will be processed in multiple processes simultaneously,
+ with each process using a single thread to avoid issues with the HDF/netCDF library.
+ If ``False`` (default), files will be processed sequentially in a single process,
+ and multi-threading will be automatically exploited to speed up I/O tasks.
+ debugging_mode : bool, optional
+ If ``True``, the amount of data processed will be reduced.
+ Only the first 3 files will be processed. By default, ``False``.
+ remove_l0b: bool, optional
+ Whether to remove the processed L0B files. The default is ``False``.
base_dir : str, optional
The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
If not specified, the path specified in the DISDRODB active configuration will be used.
"""
- # Retrieve processed_dir
+ # Define product
+ product = "L0C"
+
+ # Define base directory
base_dir = get_base_dir(base_dir)
- processed_dir = get_disdrodb_path(
+
+ # Define logs directory
+ logs_dir = create_logs_directory(
+ product=product,
base_dir=base_dir,
- product="L0B",
data_source=data_source,
campaign_name=campaign_name,
- check_exists=True,
+ station_name=station_name,
)
- # Run concatenation
- run_l0b_concat(
- processed_dir=processed_dir,
+ # ------------------------------------------------------------------------.
+ # Start processing
+ if verbose:
+ t_i = time.time()
+ msg = f"{product} processing of station {station_name} has started."
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+ # ------------------------------------------------------------------------.
+ # Create product directory
+ data_dir = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
station_name=station_name,
- verbose=verbose,
+ product=product,
+ force=force,
)
- if remove_l0b:
- station_dir = define_station_dir(
+ # ------------------------------------------------------------------------.
+ # Define metadata filepath
+ metadata_filepath = define_metadata_filepath(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+
+ # -------------------------------------------------------------------------.
+ # List files to process
+ required_product = get_required_product(product)
+ flag_not_available_data = False
+ try:
+ filepaths = get_filepaths(
base_dir=base_dir,
- product="L0B",
data_source=data_source,
campaign_name=campaign_name,
station_name=station_name,
+ product=required_product,
+ # Processing options
+ debugging_mode=debugging_mode,
)
- log_info(logger=logger, msg="Removal of single L0B files started.", verbose=verbose)
- shutil.rmtree(station_dir)
- log_info(logger=logger, msg="Removal of single L0B files ended.", verbose=verbose)
+ except Exception as e:
+ print(str(e)) # Case where no file paths available
+ flag_not_available_data = True
+
+ # -------------------------------------------------------------------------.
+ # If no data available, print error message and return None
+ if flag_not_available_data:
+ msg = (
+            f"{product} processing of {data_source} {campaign_name} {station_name} "
+            + f"has not been launched because of missing {required_product} data."
+ )
+ print(msg)
+ return
+ # -------------------------------------------------------------------------.
+ # Retrieve dictionary with the required files for each day.
+ dict_days_files = get_files_per_days(filepaths)
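+    # Assumed structure: {day: [filepaths of the L0B files covering that day], ...},
+    # e.g. {"2020-01-01": [...], "2020-01-02": [...]} (hypothetical day-key format).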
-####---------------------------------------------------------------------------.
+ # -----------------------------------------------------------------.
+ # Generate L0C files
+    # - Loop over the daily groups of L0B netCDF files and generate L0C files.
+ # - If parallel=True, it does that in parallel using dask.delayed
+ list_tasks = [
+ _generate_l0c(
+ day=day,
+ filepaths=filepaths,
+ data_dir=data_dir,
+ logs_dir=logs_dir,
+ metadata_filepath=metadata_filepath,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ parallel=parallel,
+ )
+ for day, filepaths in dict_days_files.items()
+ ]
+ list_logs = dask.compute(*list_tasks) if parallel else list_tasks
+
+ # -----------------------------------------------------------------.
+ # Define summary logs
+ create_product_logs(
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=base_dir,
+ # Logs list
+ list_logs=list_logs,
+ )
+
+ # ---------------------------------------------------------------------.
+ # End processing
+ if verbose:
+ timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i)))
+ msg = f"{product} processing of station {station_name} completed in {timedelta_str}"
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+ # -----------------------------------------------------------------.
+ # Option to remove L0B
+ if remove_l0b:
+ remove_product(
+ base_dir=base_dir,
+ product="L0B",
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ logger=logger,
+ verbose=verbose,
+ )
diff --git a/disdrodb/l0/l0a_processing.py b/disdrodb/l0/l0a_processing.py
index 9c6540cb..89c35b83 100644
--- a/disdrodb/l0/l0a_processing.py
+++ b/disdrodb/l0/l0a_processing.py
@@ -208,6 +208,8 @@ def remove_duplicated_timesteps(df: pd.DataFrame, verbose: bool = False):
values_duplicates = values[idx_duplicates].astype("M8[s]")
# If there are duplicated timesteps
if len(values_duplicates) > 0:
+ # TODO: raise error if duplicated timesteps have different values !
+
# Drop duplicated timesteps (keeping the first occurrence)
df = df.drop_duplicates(subset="time", keep="first")
# Report the values of duplicated timesteps
@@ -446,7 +448,7 @@ def remove_corrupted_rows(df):
raise ValueError("No remaining rows after data corruption checks.")
# If only one row available, raise also error
if len(df) == 1:
- raise ValueError("Only 1 row remains after data corruption checks. Check the file.")
+        raise ValueError("Only 1 row remains after data corruption checks. Check the raw file and consider deleting it.")
# Return the dataframe
return df
@@ -653,6 +655,9 @@ def process_raw_file(
# - Replace invalid values with np.nan
df = set_nan_invalid_values(df, sensor_name=sensor_name, verbose=verbose)
+ # - Sort by time
+ df = df.sort_values("time")
+
# ------------------------------------------------------.
# - Check column names agrees to DISDRODB standards
check_l0a_column_names(df, sensor_name=sensor_name)
diff --git a/disdrodb/l0/l0b_nc_processing.py b/disdrodb/l0/l0b_nc_processing.py
index 121b8185..734d4d0c 100644
--- a/disdrodb/l0/l0b_nc_processing.py
+++ b/disdrodb/l0/l0b_nc_processing.py
@@ -18,11 +18,9 @@
# -----------------------------------------------------------------------------.
"""Functions to process DISDRODB raw netCDF files into DISDRODB L0B netCDF files."""
-import copy
import logging
import numpy as np
-import xarray as xr
from disdrodb.l0.l0b_processing import finalize_dataset
from disdrodb.l0.standards import (
@@ -115,6 +113,8 @@ def subset_dataset(ds, dict_names, sensor_name):
def add_dataset_missing_variables(ds, missing_vars, sensor_name):
"""Add missing xr.Dataset variables as ``np.nan`` xr.DataArrays."""
+ import xarray as xr
+
from disdrodb.l0.standards import get_variables_dimension
# Get dimension of each variables
@@ -171,8 +171,7 @@ def preprocess_raw_netcdf(ds, dict_names, sensor_name):
ds = add_dataset_missing_variables(ds=ds, missing_vars=missing_vars, sensor_name=sensor_name)
# Update the coordinates for (diameter and velocity)
- coords = get_bin_coords_dict(sensor_name)
- ds = ds.assign_coords(coords)
+ ds = ds.assign_coords(get_bin_coords_dict(sensor_name))
# Return dataset
return ds
@@ -346,19 +345,6 @@ def create_l0b_from_raw_nc(
# Preprocess netcdf
ds = preprocess_raw_netcdf(ds=ds, dict_names=dict_names, sensor_name=sensor_name)
- # Add CRS and geolocation information
- attrs = copy.deepcopy(attrs)
- coords = {}
- geolocation_vars = ["latitude", "longitude", "altitude"]
- for var in geolocation_vars:
- if var not in ds:
- coords[var] = attrs[var]
- _ = attrs.pop(var)
- ds = ds.assign_coords(coords)
-
- # Add global attributes
- ds.attrs = attrs
-
# Apply dataset sanitizer function
ds = ds_sanitizer_fun(ds)
@@ -372,7 +358,7 @@ def create_l0b_from_raw_nc(
ds = set_nan_invalid_values(ds, sensor_name=sensor_name, verbose=verbose)
# Finalize dataset
- ds = finalize_dataset(ds, sensor_name=sensor_name)
+ ds = finalize_dataset(ds, sensor_name=sensor_name, attrs=attrs)
# Return dataset
return ds
diff --git a/disdrodb/l0/l0b_processing.py b/disdrodb/l0/l0b_processing.py
index 7741535d..ab0021a6 100644
--- a/disdrodb/l0/l0b_processing.py
+++ b/disdrodb/l0/l0b_processing.py
@@ -32,23 +32,26 @@
from disdrodb.l0.standards import (
# get_valid_coordinates_names,
get_bin_coords_dict,
- get_coords_attrs_dict,
get_data_range_dict,
get_dims_size_dict,
get_l0b_cf_attrs_dict,
get_l0b_encodings_dict,
get_raw_array_dims_order,
get_raw_array_nvalues,
- get_time_encoding,
+)
+from disdrodb.utils.attrs import (
+ set_coordinate_attributes,
set_disdrodb_attrs,
)
from disdrodb.utils.directories import create_directory, remove_if_exists
+from disdrodb.utils.encoding import set_encodings
from disdrodb.utils.logger import (
# log_warning,
# log_debug,
log_error,
log_info,
)
+from disdrodb.utils.time import ensure_sorted_by_time
logger = logging.getLogger(__name__)
@@ -329,28 +332,13 @@ def _set_variable_attributes(ds: xr.Dataset, sensor_name: str) -> xr.Dataset:
return ds
-def _set_attrs_dict(ds, attrs_dict):
- for var in attrs_dict:
- if var in ds:
- ds[var].attrs.update(attrs_dict[var])
- return ds
-
-
-def _set_coordinate_attributes(ds):
- # Get attributes dictionary
- attrs_dict = get_coords_attrs_dict()
- # Set attributes
- ds = _set_attrs_dict(ds, attrs_dict)
- return ds
-
-
def _set_dataset_attrs(ds, sensor_name):
"""Set variable and coordinates attributes."""
# - Add netCDF variable attributes
# --> Attributes: long_name, units, descriptions, valid_min, valid_max
ds = _set_variable_attributes(ds=ds, sensor_name=sensor_name)
# - Add netCDF coordinate attributes
- ds = _set_coordinate_attributes(ds=ds)
+ ds = set_coordinate_attributes(ds=ds)
# - Set DISDRODB global attributes
ds = set_disdrodb_attrs(ds=ds, product="L0B")
return ds
@@ -384,44 +372,19 @@ def _define_dataset_variables(df, sensor_name, verbose):
raise ValueError("No raw fields available.")
# Define other disdrometer 'auxiliary' variables varying over time dimension
+ # - Includes time
+ # - Includes longitude and latitude for moving sensors
valid_core_fields = [
"raw_drop_concentration",
"raw_drop_average_velocity",
"raw_drop_number",
- "time",
- # longitude and latitude too for moving sensors
]
aux_columns = df.columns[np.isin(df.columns, valid_core_fields, invert=True)]
aux_data_vars = {column: (["time"], df[column].to_numpy()) for column in aux_columns}
data_vars.update(aux_data_vars)
-
- # Add key "time"
- # - Is dropped in _define_coordinates !
- data_vars["time"] = df["time"].to_numpy()
-
return data_vars
-def _define_coordinates(data_vars, attrs, sensor_name):
- """Define DISDRODB L0B netCDF coordinates."""
- # Note: attrs and data_vars are modified in place !
-
- # - Diameter and velocity
- coords = get_bin_coords_dict(sensor_name=sensor_name)
-
- # - Geolocation + Time
- geolocation_vars = ["time", "latitude", "longitude", "altitude"]
- for var in geolocation_vars:
- if var in data_vars:
- coords[var] = data_vars[var]
- _ = data_vars.pop(var)
- _ = attrs.pop(var, None)
- else:
- coords[var] = attrs[var]
- _ = attrs.pop(var)
- return coords
-
-
def create_l0b_from_l0a(
df: pd.DataFrame,
attrs: dict,
@@ -451,25 +414,13 @@ def create_l0b_from_l0a(
# Retrieve sensor name
attrs = attrs.copy()
sensor_name = attrs["sensor_name"]
- # -----------------------------------------------------------.
+
# Define Dataset variables and coordinates
data_vars = _define_dataset_variables(df, sensor_name=sensor_name, verbose=verbose)
- # -----------------------------------------------------------.
- # Define coordinates for xarray Dataset
- # - attrs and data_vars are modified in place !
- coords = _define_coordinates(data_vars, attrs=attrs, sensor_name=sensor_name)
-
- # -----------------------------------------------------------
# Create xarray Dataset
- ds = xr.Dataset(
- data_vars=data_vars,
- coords=coords,
- attrs=attrs,
- )
- ds = finalize_dataset(ds, sensor_name=sensor_name)
-
- # -----------------------------------------------------------
+ ds = xr.Dataset(data_vars=data_vars)
+ ds = finalize_dataset(ds, sensor_name=sensor_name, attrs=attrs)
return ds
@@ -477,8 +428,43 @@ def create_l0b_from_l0a(
#### L0B netCDF4 Writer
-def finalize_dataset(ds, sensor_name):
+def set_geolocation_coordinates(ds, attrs):
+ """Add geolocation coordinates to dataset."""
+ # Assumption
+ # - If a coordinate is present in the L0A data, it overrides the one specified in the attributes
+ # - If a station is fixed, the geolocation coordinates should be discarded in the DISDRODB reader !
+
+ # Assign geolocation coordinates to dataset
+ coords = ["latitude", "longitude", "altitude"]
+ for coord in coords:
+ # If coordinate not present, add it from dictionary
+ if coord not in ds:
+ ds = ds.assign_coords({coord: attrs.pop(coord, np.nan)})
+ # Otherwise, set the variable already present in the raw data as a coordinate
+ else:
+ ds = ds.set_coords(coord)
+ _ = attrs.pop(coord, None)
+
+ # Set -9999 flag value to np.nan
+ for coord in coords:
+ ds[coord] = xr.where(ds[coord] == -9999, np.nan, ds[coord])
+
+ # Set attributes without geolocation coordinates
+ ds.attrs = attrs
+ return ds
+
+
+def finalize_dataset(ds, sensor_name, attrs):
"""Finalize DISDRODB L0B Dataset."""
+ # Ensure sorted by time
+ ds = ensure_sorted_by_time(ds)
+
+ # Set diameter and velocity bin coordinates
+ ds = ds.assign_coords(get_bin_coords_dict(sensor_name=sensor_name))
+
+ # Set geolocation coordinates and attributes
+ ds = set_geolocation_coordinates(ds, attrs=attrs)
+
# Add dataset CRS coordinate
ds = add_dataset_crs_coords(ds)
@@ -496,56 +482,8 @@ def finalize_dataset(ds, sensor_name):
return ds
-def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict:
- """Ensure chunk size to be smaller than the array shape.
-
- Parameters
- ----------
- encoding_dict : dict
- Dictionary containing the encoding to write DISDRODB L0B netCDFs.
- ds : xarray.Dataset
- Input dataset.
-
- Returns
- -------
- dict
- Encoding dictionary.
- """
- for var in ds.data_vars:
- shape = ds[var].shape
- chunks = encoding_dict[var]["chunksizes"]
- if chunks is not None:
- chunks = [shape[i] if chunks[i] > shape[i] else chunks[i] for i in range(len(chunks))]
- encoding_dict[var]["chunksizes"] = chunks
- return encoding_dict
-
-
-def rechunk_dataset(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
- """Coerce the dataset arrays to have the chunk size specified in the encoding dictionary.
-
- Parameters
- ----------
- ds : xarray.Dataset
- Input xarray dataset
- encoding_dict : dict
- Dictionary containing the encoding to write the xarray dataset as a netCDF.
-
- Returns
- -------
- xr.Dataset
- Output xarray dataset
- """
- for var in ds.data_vars:
- chunks = encoding_dict[var].pop("chunksizes")
- dims = list(ds[var].dims)
- chunks_dict = dict(zip(dims, chunks))
- if chunks is not None:
- ds[var] = ds[var].chunk(chunks_dict)
- return ds
-
-
-def set_encodings(ds: xr.Dataset, sensor_name: str) -> xr.Dataset:
- """Apply the encodings to the xarray Dataset.
+def set_l0b_encodings(ds: xr.Dataset, sensor_name: str):
+ """Apply the L0B encodings to the xarray Dataset.
Parameters
----------
@@ -559,24 +497,8 @@ def set_encodings(ds: xr.Dataset, sensor_name: str) -> xr.Dataset:
xr.Dataset
Output xarray dataset.
"""
- # Get encoding dictionary
encoding_dict = get_l0b_encodings_dict(sensor_name)
- encoding_dict = {k: encoding_dict[k] for k in ds.data_vars}
-
- # Ensure chunksize smaller than the array shape
- encoding_dict = sanitize_encodings_dict(encoding_dict, ds)
-
- # Rechunk variables for fast writing !
- # - This pop the chunksize argument from the encoding dict !
- ds = rechunk_dataset(ds, encoding_dict)
-
- # Set time encoding
- ds["time"].encoding.update(get_time_encoding())
-
- # Set the variable encodings
- for var in ds.data_vars:
- ds[var].encoding.update(encoding_dict[var])
-
+ ds = set_encodings(ds=ds, encoding_dict=encoding_dict)
return ds
@@ -608,7 +530,7 @@ def write_l0b(ds: xr.Dataset, filepath: str, force=False) -> None:
sensor_name = ds.attrs.get("sensor_name")
# Set encodings
- ds = set_encodings(ds=ds, sensor_name=sensor_name)
+ ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name)
# Write netcdf
ds.to_netcdf(filepath, engine="netcdf4")
diff --git a/disdrodb/l0/l0c_processing.py b/disdrodb/l0/l0c_processing.py
new file mode 100644
index 00000000..c84253bf
--- /dev/null
+++ b/disdrodb/l0/l0c_processing.py
@@ -0,0 +1,626 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Functions to process DISDRODB L0B files into DISDRODB L0C netCDF files."""
+import logging
+
+import numpy as np
+import pandas as pd
+
+from disdrodb.api.info import get_start_end_time_from_filepaths
+from disdrodb.l1.resampling import add_sample_interval
+from disdrodb.utils.logger import log_warning # , log_info
+from disdrodb.utils.time import (
+ ensure_sorted_by_time,
+ regularize_timesteps,
+)
+
+logger = logging.getLogger(__name__)
+
+
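+# Time tolerance (in seconds) used when assigning files and timesteps to a given day,
+# to account for imprecise sensor time logging around midnight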
+TOLERANCE_SECONDS = 120
+
+
+def get_files_per_days(filepaths):
+ """
+ Organize files by the days they cover based on their start and end times.
+
+ Parameters
+ ----------
+ filepaths : list of str
+ List of file paths to be processed.
+
+ Returns
+ -------
+ dict
+ Dictionary where keys are days (as strings) and values are lists of file paths
+ that cover those days.
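+ Example: {"2021-07-01": [<filepaths covering that day>], ...}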
+
+ Notes
+ -----
+ This function applies a tolerance of ``TOLERANCE_SECONDS`` (120 s) to account for imprecise time logging by the sensors.
+ """
+ # Retrieve file start_time and end_time
+ files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)
+
+ # Add tolerance to account for imprecise time logging by the sensors
+ # - Example: a timestep at 23:59:30 might actually be 00:00 and belong to the next day's file ...
+ files_start_time = files_start_time - np.array(TOLERANCE_SECONDS, dtype="m8[s]")
+ files_end_time = files_end_time + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
+
+ # Retrieve file start day and end day
+ start_day = files_start_time.min().astype("M8[D]")
+ end_day = files_end_time.max().astype("M8[D]") + np.array(1, dtype="m8[D]")
+
+ # Create an array with all days in time period covered by the files
+ list_days = np.asanyarray(pd.date_range(start=start_day, end=end_day, freq="D")).astype("M8[D]")
+
+ # Expand dimension to match each day using broadcasting
+ files_start_time = files_start_time.astype("M8[D]")[:, np.newaxis] # shape (n_files, 1)
+ files_end_time = files_end_time.astype("M8[D]")[:, np.newaxis] # shape (n_files, 1)
+
+ # Create an array of all days
+ # - Expand dimension to match each day using broadcasting
+ days = list_days[np.newaxis, :] # shape (1, n_days)
+
+ # Use broadcasting to create a boolean matrix indicating which files cover which days
+ mask = (files_start_time <= days) & (files_end_time >= days) # shape (n_files, n_days)
+
+ # Build a mapping from days to file indices
+ # For each day (column), find the indices of files (rows) that cover that day
+ dict_days = {}
+ filepaths = np.array(filepaths)
+ for i, day in enumerate(list_days):
+ file_indices = np.where(mask[:, i])[0]
+ if file_indices.size > 0:
+ dict_days[str(day)] = filepaths[file_indices].tolist()
+
+ return dict_days
+
+
+def retrieve_possible_measurement_intervals(metadata):
+ """Retrieve list of possible measurements intervals."""
+ measurement_interval = metadata.get("measurement_interval", [])
+ if isinstance(measurement_interval, (int, float, str)):
+ measurement_interval = [measurement_interval]
+ measurement_intervals = [int(v) for v in measurement_interval]
+ return measurement_intervals
+
+
+def drop_timesteps_with_invalid_sample_interval(ds, measurement_intervals, verbose=True, logger=None):
+ """Drop timesteps with unexpected sample intervals."""
+ # TODO
+ # - correct logged sample_interval for trailing seconds. Example (58,59,61,62) converted to 60 s ?
+ # - Need to know more how Parsivel software computes sample_interval variable ...
+
+ # Retrieve logged sample_interval
+ sample_interval = ds["sample_interval"].compute().data
+ timesteps = ds["time"].compute().data
+ is_valid_sample_interval = np.isin(sample_interval, measurement_intervals)
+ indices_invalid_sample_interval = np.where(~is_valid_sample_interval)[0]
+ if len(indices_invalid_sample_interval) > 0:
+ # Log information for each invalid timestep
+ invalid_timesteps = pd.to_datetime(timesteps[indices_invalid_sample_interval]).strftime("%Y-%m-%d %H:%M:%S")
+ invalid_sample_intervals = sample_interval[indices_invalid_sample_interval]
+ for tt, ss in zip(invalid_timesteps, invalid_sample_intervals):
+ msg = f"Unexpected sampling interval ({ss} s) at {tt}. The measurement has been dropped."
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+ # Remove timesteps with invalid sample intervals
+ indices_valid_sample_interval = np.where(is_valid_sample_interval)[0]
+ ds = ds.isel(time=indices_valid_sample_interval)
+ return ds
+
+
+def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_interval=10, min_block_size=5):
+ """
+ Split a dataset into subsets where each subset has a consistent sampling interval.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ The input dataset with a 'time' dimension.
+ measurement_intervals : list or array-like
+ A list of possible primary sampling intervals (in seconds) that the dataset might have.
+ min_sample_interval : int, optional
+ The minimum expected sampling interval in seconds. Defaults to 10s.
+ min_block_size : int, optional
+ The minimum number of consecutive timesteps with a given sampling interval to be considered.
+ Shorter blocks of data are discarded !
+ Defaults to 5 timesteps.
+
+ Returns
+ -------
+ dict
+ A dictionary where keys are the identified sampling intervals (in seconds),
+ and values are xarray.Datasets containing only data from those intervals.
+ """
+ # Define array of possible measurement intervals
+ measurement_intervals = np.array(measurement_intervals)
+
+ # If a single measurement interval is expected, return a dictionary with the input dataset
+ if len(measurement_intervals) == 1:
+ dict_ds = {measurement_intervals[0]: ds}
+ return dict_ds
+
+ # Check sorted by time and sort if necessary
+ ds = ensure_sorted_by_time(ds)
+
+ # Calculate time differences in seconds
+ deltadt = np.diff(ds["time"].data).astype("timedelta64[s]").astype(int)
+
+ # Round each delta to the nearest multiple of 5 (because the smallest possible sample interval is 10 s)
+ # - This accounts for possible trailing seconds of the logger
+ # Example: for sample_interval = 10, deltat values like 8, 9, 11, 12 become 10 ...
+ # Example: for sample_interval = 10, deltat values like 6, 7 or 13, 14 become respectively 5 and 15 ...
+ # Example: for sample_interval = 30, deltat values like 28, 29, 30, 31, 32 become 30 ...
+ # Example: for sample_interval = 30, deltat values like 26, 27 or 33, 34 become respectively 25 and 35 ...
+ min_half_sample_interval = min_sample_interval / 2
+ deltadt = np.round(deltadt / min_half_sample_interval) * min_half_sample_interval
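+ # e.g. with min_sample_interval = 10: a deltadt of 29 s --> round(29 / 5) * 5 = 30 s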
+
+ # Map each delta to one of the possible_measurement_intervals if exact match, otherwise np.nan
+ mapped_intervals = np.where(np.isin(deltadt, measurement_intervals), deltadt, np.nan)
+ if np.all(np.isnan(mapped_intervals)):
+ raise ValueError("Impossible to identify timesteps with expected sampling intervals.")
+
+ # Infill np.nan values by using neighbor intervals
+ # Forward fill
+ for i in range(1, len(mapped_intervals)):
+ if np.isnan(mapped_intervals[i]):
+ mapped_intervals[i] = mapped_intervals[i - 1]
+
+ # Backward fill (in case the first entries were np.nan)
+ for i in range(len(mapped_intervals) - 2, -1, -1):
+ if np.isnan(mapped_intervals[i]):
+ mapped_intervals[i] = mapped_intervals[i + 1]
+
+ # Now all intervals are assigned to one of the possible measurement_intervals.
+ # Identify boundaries where interval changes
+ change_points = np.where(mapped_intervals[:-1] != mapped_intervals[1:])[0] + 1
+
+ # Split ds into segments according to change_points
+ segments = np.split(np.arange(ds.sizes["time"]), change_points)
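+ # - Each segment is an array of consecutive time indices sharing the same inferred sampling interval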
+
+ # Remove segments with fewer than min_block_size timesteps
+ segments = [seg for seg in segments if len(seg) >= min_block_size]
+ if len(segments) == 0:
+ raise ValueError(
+ f"No blocks of {min_block_size} consecutive timesteps with constant sampling interval are available.",
+ )
+
+ # Define dataset indices for each sampling interval
+ dict_sampling_interval_indices = {}
+ for seg in segments:
+ # Define the assumed sampling interval of such segment
+ start_idx = seg[0]
+ segment_sampling_interval = int(mapped_intervals[start_idx])
+ if segment_sampling_interval not in dict_sampling_interval_indices:
+ dict_sampling_interval_indices[segment_sampling_interval] = [seg]
+ else:
+ dict_sampling_interval_indices[segment_sampling_interval].append(seg)
+ dict_sampling_interval_indices = {
+ k: np.concatenate(list_indices) for k, list_indices in dict_sampling_interval_indices.items()
+ }
+
+ # Define dictionary of datasets
+ dict_ds = {k: ds.isel(time=indices) for k, indices in dict_sampling_interval_indices.items()}
+ return dict_ds
+
+
+def has_same_value_over_time(da):
+ """
+ Check if a DataArray has the same value over all timesteps, considering NaNs as equal.
+
+ Parameters
+ ----------
+ da : xarray.DataArray
+ The DataArray to check. Must have a 'time' dimension.
+
+ Returns
+ -------
+ bool
+ True if the values are the same (or NaN in the same positions) across all timesteps,
+ False otherwise.
+ """
+ # Select the first timestep
+ da_first = da.isel(time=0)
+
+ # Create a boolean array that identifies where values are equal or both NaN
+ equal_or_nan = (da == da_first) | (da.isnull() & da_first.isnull()) # noqa: PD003
+
+ # Check if all values match this condition across all dimensions
+ return bool(equal_or_nan.all().item())
+
+
+def remove_duplicated_timesteps(ds, ensure_variables_equality=True, logger=None, verbose=True):
+ """Removes duplicated timesteps from a xarray dataset."""
+ # Check for duplicated timesteps
+ timesteps, counts = np.unique(ds["time"].data, return_counts=True)
+ duplicated_timesteps = timesteps[counts > 1]
+
+ # If no duplicated timesteps, returns dataset as is
+ if len(duplicated_timesteps) == 0:
+ return ds
+
+ # If there are duplicated timesteps
+ # - First check for variables equality
+ # - Keep first occurrence of duplicated timesteps if values are equals
+ # - Drop duplicated timesteps where values are different
+ different_duplicated_timesteps = []
+ equal_duplicated_timesteps = []
+ for t in duplicated_timesteps:
+ # Select dataset at given duplicated timestep
+ ds_duplicated = ds.sel(time=t)
+ n_t = len(ds_duplicated["time"])
+
+ # Check raw_drop_number equality
+ if not has_same_value_over_time(ds_duplicated["raw_drop_number"]):
+ different_duplicated_timesteps.append(t)
+ msg = (
+ f"Presence of {n_t} duplicated timesteps at {t}."
+ "They have different 'raw_drop_number' values. These timesteps are dropped."
+ )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+
+ # Check other variables equality
+ other_variables_to_check = [v for v in ds.data_vars if v != "raw_drop_number"]
+ variables_with_different_values = [
+ var for var in other_variables_to_check if not has_same_value_over_time(ds_duplicated[var])
+ ]
+ if len(variables_with_different_values) > 0:
+ msg = (
+ f"Presence of {n_t} duplicated timesteps at {t}."
+ f"The duplicated timesteps have different values in variables {variables_with_different_values}. "
+ )
+ if ensure_variables_equality:
+ different_duplicated_timesteps.append(t)
+ msg = msg + "These timesteps are dropped."
+ else:
+ equal_duplicated_timesteps.append(t)
+ msg = msg + (
+ "These timesteps are not dropped because 'raw_drop_number' values are equals."
+ "'ensure_variables_equality' is False."
+ )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+ else:
+ equal_duplicated_timesteps.append(t)
+
+ # Ensure single occurrence of duplicated timesteps
+ equal_duplicated_timesteps = np.unique(equal_duplicated_timesteps)
+ different_duplicated_timesteps = np.unique(different_duplicated_timesteps)
+
+ # - Keep first occurrence of equal_duplicated_timesteps
+ if len(equal_duplicated_timesteps) > 0:
+ indices_to_drop = [np.where(ds["time"] == t)[0][1:] for t in equal_duplicated_timesteps]
+ indices_to_drop = np.concatenate(indices_to_drop)
+ # Keep only indices not in indices_to_drop
+ mask = ~np.isin(np.arange(ds["time"].size), indices_to_drop)
+ ds = ds.isel(time=np.where(mask)[0])
+
+ # - Drop different_duplicated_timesteps
+ if len(different_duplicated_timesteps) > 0:
+ mask = np.isin(ds["time"], different_duplicated_timesteps, invert=True)
+ ds = ds.isel(time=np.where(mask)[0])
+
+ return ds
+
+
+def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None):
+ """Check for the regularity of timesteps."""
+ # Check sorted by time and sort if necessary
+ ds = ensure_sorted_by_time(ds)
+
+ # Calculate number of timesteps
+ n = len(ds["time"].data)
+
+ # Calculate time differences in seconds
+ deltadt = np.diff(ds["time"].data).astype("timedelta64[s]").astype(int)
+
+ # Identify unique time intervals and their occurrences
+ unique_deltadt, counts = np.unique(deltadt, return_counts=True)
+
+ # Determine the most frequent time interval (mode)
+ most_frequent_deltadt_idx = np.argmax(counts)
+ most_frequent_deltadt = unique_deltadt[most_frequent_deltadt_idx]
+
+ # Count fraction occurrence of deltadt
+ fractions = np.round(counts / len(deltadt) * 100, 2)
+
+ # Compute stats about expected deltadt
+ sample_interval_counts = counts[unique_deltadt == sample_interval].item()
+ sample_interval_fraction = fractions[unique_deltadt == sample_interval].item()
+
+ # Compute stats about most frequent deltadt
+ most_frequent_deltadt_counts = counts[unique_deltadt == most_frequent_deltadt].item()
+ most_frequent_deltadt_fraction = fractions[unique_deltadt == most_frequent_deltadt].item()
+
+ # Compute stats about unexpected deltadt
+ unexpected_intervals = unique_deltadt[unique_deltadt != sample_interval]
+ unexpected_intervals_counts = counts[unique_deltadt != sample_interval]
+ unexpected_intervals_fractions = fractions[unique_deltadt != sample_interval]
+ frequent_unexpected_intervals = unexpected_intervals[unexpected_intervals_fractions > 5]
+
+ # Report a warning if the sample_interval deltadt occurs in less than 60 % of cases
+ # -> TODO: maybe only report for stations where the disdrometer does not log data only when rainy
+ if sample_interval_fraction < 60:
+ msg = (
+ f"The expected (sampling) interval between observations occurs only "
+ f"{sample_interval_counts}/{n} times ({sample_interval_fraction} %)."
+ )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+
+ # Report warning if a deltadt occurs more often then the sampling interval
+ if most_frequent_deltadt != sample_interval:
+ msg = (
+ f"The most frequent time interval between observations is {most_frequent_deltadt} s "
+ f"(occurs {most_frequent_deltadt_counts}/{n} times) ({most_frequent_deltadt_fraction}%) "
+ f"although the expected (sampling) interval is {sample_interval} s "
+ f"and occurs {sample_interval_counts}/{n} times ({sample_interval_fraction}%)."
+ )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+
+ # Report with a warning all unexpected deltadt occurring with a frequency larger than 5 %
+ if len(frequent_unexpected_intervals) > 0:
+ msg = "The following unexpected time intervals between observations occur frequently: "
+ for interval in frequent_unexpected_intervals:
+ c = unexpected_intervals_counts[unexpected_intervals == interval].item()
+ f = unexpected_intervals_fractions[unexpected_intervals == interval].item()
+ msg = msg + f"{interval} s ({f}%) ({c}/{n}) | "
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+ return ds
+
+
+def finalize_l0c_dataset(ds, sample_interval, start_day, end_day, verbose=True, logger=None):
+ """Finalize a L0C dataset with unique sampling interval.
+
+ It adds the sampling_interval coordinate and it regularizes
+ the timesteps for trailing seconds.
+ """
+ # Add sample interval as coordinate
+ ds = add_sample_interval(ds, sample_interval=sample_interval)
+
+ # Regularize timesteps (for trailing seconds)
+ ds = regularize_timesteps(
+ ds,
+ sample_interval=sample_interval,
+ robust=False, # if True, raise an error if an error occurs during regularization
+ add_quality_flag=True,
+ verbose=verbose,
+ logger=logger,
+ )
+
+ # Perform checks on timestep regularity
+ ds = check_timesteps_regularity(ds=ds, sample_interval=sample_interval, verbose=verbose, logger=logger)
+
+ # Slice for requested day
+ ds = ds.sel({"time": slice(start_day, end_day)})
+ return ds
+
+
+def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_equality=True, logger=None, verbose=True):
+ """
+ Create a daily file by merging and processing data from multiple filepaths.
+
+ Parameters
+ ----------
+ day : str or numpy.datetime64
+ The day for which the daily file is to be created.
+ Should be in a format that can be converted to numpy.datetime64.
+ filepaths : list of str
+ List of filepaths to the data files to be processed.
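+ measurement_intervals : list of int
+ Expected measurement (sampling) intervals of the station, in seconds.
+ ensure_variables_equality : bool, optional
+ Whether duplicated timesteps must have equal values in all variables (not only 'raw_drop_number')
+ in order to be kept. The default is True.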
+
+ Returns
+ -------
+ dict
+ Dictionary mapping each sampling interval (in seconds) to the processed
+ xarray.Dataset containing data for the specified day.
+
+ Raises
+ ------
+ ValueError
+ If fewer than 3 timesteps remain for the specified day after removing duplicated timesteps.
+
+ Notes
+ -----
+ - The function adds a tolerance for searching timesteps
+ before and after 00:00 to account for imprecise logging times.
+ - It checks that duplicated timesteps have the same raw drop number values.
+ - The function infers the time integration sample interval and
+ regularizes timesteps to handle trailing seconds.
+ - The data is loaded into memory and connections to source files
+ are closed before returning the dataset.
+ """
+ import xarray as xr # Load in each process when function is called !
+
+ # ---------------------------------------------------------------------------------------.
+ # Define start day and end of day
+ start_day = np.array(day).astype("M8[D]")
+ end_day = start_day + np.array(1, dtype="m8[D]") - np.array(1, dtype="m8[s]") # avoid 00:00 of next day !
+
+ # Add tolerance for searching timesteps before and after 00:00 to account for imprecise logging time
+ # - Example: a timestep at 23:59:30 that should be 00:00 goes into the next day ...
+ start_day_tol = start_day - np.array(TOLERANCE_SECONDS, dtype="m8[s]")
+ end_day_tol = end_day + np.array(TOLERANCE_SECONDS, dtype="m8[s]")
+
+ # ---------------------------------------------------------------------------------------.
+ # Open files with data within the provided day and concatenate them
+ # list_ds = [xr.open_dataset(filepath, chunks={}).sel({"time": slice(start_day_tol, end_day_tol)})
+ # for filepath in filepaths]
+ list_ds = [xr.open_dataset(filepath, chunks={}, cache=False).sortby("time") for filepath in filepaths]
+ list_ds = [ds.sel({"time": slice(start_day_tol, end_day_tol)}) for ds in list_ds]
+ if len(list_ds) > 1:
+ # Concatenate dataset
+ # - If some variables are missing in one file, they are filled with NaN. This should not occur anyway.
+ # - The resulting dataset can have duplicated timesteps !
+ ds = xr.concat(list_ds, dim="time", join="outer", compat="no_conflicts", combine_attrs="override").sortby(
+ "time",
+ )
+ else:
+ ds = list_ds[0]
+
+ # Compute data
+ ds = ds.compute()
+
+ # Close connection to source files
+ _ = [ds.close() for ds in list_ds]
+ ds.close()
+ del list_ds
+
+ # ---------------------------------------------------------------------------------------.
+ # If sample interval is a dataset variable, drop timesteps with unexpected measurement intervals !
+ if "sample_interval" in ds:
+ ds = drop_timesteps_with_invalid_sample_interval(
+ ds=ds,
+ measurement_intervals=measurement_intervals,
+ verbose=verbose,
+ logger=logger,
+ )
+
+ # ---------------------------------------------------------------------------------------.
+ # Remove duplicated timesteps
+ ds = remove_duplicated_timesteps(
+ ds,
+ ensure_variables_equality=ensure_variables_equality,
+ logger=logger,
+ verbose=verbose,
+ )
+
+ # Raise error if less than 3 timesteps left
+ n_timesteps = len(ds["time"])
+ if n_timesteps < 3:
+ raise ValueError(f"{n_timesteps} timesteps left after removing duplicated timesteps.")
+
+ # ---------------------------------------------------------------------------------------.
+ # Split dataset by sampling intervals
+ dict_ds = split_dataset_by_sampling_intervals(
+ ds=ds,
+ measurement_intervals=measurement_intervals,
+ min_sample_interval=10,
+ min_block_size=5,
+ )
+
+ # Log a warning if multiple sampling intervals are present within a given day
+ if len(dict_ds) > 1:
+ occurring_sampling_intervals = list(dict_ds)
+ msg = f"The dataset contains multiple sampling intervals: {occurring_sampling_intervals}."
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+
+ # ---------------------------------------------------------------------------------------.
+ # Finalize L0C datasets
+ # - Add sample_interval coordinate
+ # - Regularize timesteps for trailing seconds
+ dict_ds = {
+ sample_interval: finalize_l0c_dataset(
+ ds=ds,
+ sample_interval=sample_interval,
+ start_day=start_day,
+ end_day=end_day,
+ verbose=verbose,
+ logger=logger,
+ )
+ for sample_interval, ds in dict_ds.items()
+ }
+ return dict_ds
+
+
+# ---------------------------------------------------------------------------------------.
+#### DEPRECATED CODE
+
+
+# def copy_l0b_to_l0c_directory(filepath):
+# """Copy L0B file to L0C directory."""
+# import netCDF4
+
+# # Copy file
+# l0c_filepath = filepath.replace("L0B", "L0C")
+# _ = shutil.copy(filepath, l0c_filepath)
+
+# # Edit DISDRODB product attribute
+# with netCDF4.Dataset(l0c_filepath, mode="a") as nc_file:
+# # Modify the global attribute
+# nc_file.setncattr("disdrodb_product", "L0C")
+
+# def find_isel_common_time(da1, da2):
+# """
+# Find the indices of common time steps between two data arrays.
+
+# Parameters
+# ----------
+# da1 : xarray.DataArray
+# The first data array with a time coordinate.
+# da2 : xarray.DataArray
+# The second data array with a time coordinate.
+
+# Returns
+# -------
+# da1_isel : numpy.ndarray
+# Indices of the common time steps in the first data array.
+# da2_isel : numpy.ndarray
+# Indices of the common time steps in the second data array.
+
+# Notes
+# -----
+# This function assumes that both input data arrays have a "time" coordinate.
+# The function finds the intersection of the time steps in both data arrays
+# and returns the indices of these common time steps for each data array.
+# """
+# intersecting_timesteps = np.intersect1d(da1["time"], da2["time"])
+# da1_isel = np.where(np.isin(da1["time"], intersecting_timesteps))[0]
+# da2_isel = np.where(np.isin(da2["time"], intersecting_timesteps))[0]
+# return da1_isel, da2_isel
+
+
+# def check_same_raw_drop_number_values(list_ds, filepaths):
+# """
+# Check if the 'raw_drop_number' values are the same across multiple datasets.
+
+# This function compares the 'raw_drop_number' values of multiple datasets to ensure they are identical
+# at common timesteps.
+
+# If any discrepancies are found, a ValueError is raised indicating which files
+# have differing values.
+
+# Parameters
+# ----------
+# list_ds : list of xarray.Dataset
+# A list of xarray Datasets to be compared.
+# filepaths : list of str
+# A list of file paths corresponding to the datasets in `list_ds`.
+
+# Raises
+# ------
+# ValueError
+# If 'raw_drop_number' values differ at any common timestep between any two datasets.
+# """
+# # Retrieve variable to compare
+# list_drop_number = [ds["raw_drop_number"].compute() for ds in list_ds]
+# # Compare values
+# combos = list(itertools.combinations(range(len(list_drop_number)), 2))
+# for i, j in combos:
+# da1 = list_drop_number[i]
+# da2 = list_drop_number[j]
+# da1_isel, da2_isel = find_isel_common_time(da1=da1, da2=da2)
+# if not np.all(da1.isel(time=da1_isel).data == da2.isel(time=da2_isel).data):
+# file1 = filepaths[i]
+# file2 = filepaths[j]
+# msg = f"Duplicated timesteps have different values between file {file1} and {file2}"
+# raise ValueError(msg)
diff --git a/disdrodb/l0/readers/BRAZIL/CHUVA_LPM.py b/disdrodb/l0/readers/BRAZIL/CHUVA_LPM.py
index 25f473c1..706e870e 100644
--- a/disdrodb/l0/readers/BRAZIL/CHUVA_LPM.py
+++ b/disdrodb/l0/readers/BRAZIL/CHUVA_LPM.py
@@ -161,6 +161,8 @@ def df_sanitizer_fun(df):
df["time"] = df["sensor_date"] + "-" + df["sensor_time"]
df["time"] = pd.to_datetime(df["time"], format="%d.%m.%y-%H:%M:%S", errors="coerce")
+ # TODO: the correct time is not yet available !
+
# Drop row if start_identifier different than 00
df = df[df["start_identifier"].astype(str) == "00"]
diff --git a/disdrodb/l0/readers/BRAZIL/CHUVA_OTT.py b/disdrodb/l0/readers/BRAZIL/CHUVA_OTT.py
index a42f29d2..eabbdfaa 100644
--- a/disdrodb/l0/readers/BRAZIL/CHUVA_OTT.py
+++ b/disdrodb/l0/readers/BRAZIL/CHUVA_OTT.py
@@ -71,18 +71,18 @@ def df_sanitizer_fun(df):
df = df["TO_PARSE"].str.split(":", expand=True, n=1)
df.columns = ["ID", "Value"]
- # Drop rows with no values
+ # Select only rows with values
df = df[df["Value"].astype(bool)]
+ df = df[df["Value"].apply(lambda x: x is not None)]
- # Convert ID to integer
- # - First convert to numeric and if errors arise (corrupted rows), drop rows
- df["ID"] = pd.to_numeric(df["ID"], errors="coerce")
- df = df.dropna(subset="ID")
- df["ID"] = df["ID"].astype(int)
+ # Drop rows with invalid IDs
+ # - Corrupted rows
+ valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0")
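+ # (valid IDs are the zero-padded strings "00", "01", ..., "93")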
+ df = df[df["ID"].astype(str).isin(valid_id_str)]
# Create the dataframe with each row corresponding to a timestep
# - Group rows based on when ID values restart
- groups = df.groupby((df["ID"].diff() <= 0).cumsum())
+ groups = df.groupby((df["ID"].astype(int).diff() <= 0).cumsum())
# - Reshape the dataframe
group_dfs = []
diff --git a/disdrodb/l0/readers/BRAZIL/CHUVA_RD80.py b/disdrodb/l0/readers/BRAZIL/CHUVA_RD80.py
index 55dca220..0c98c0c9 100644
--- a/disdrodb/l0/readers/BRAZIL/CHUVA_RD80.py
+++ b/disdrodb/l0/readers/BRAZIL/CHUVA_RD80.py
@@ -38,7 +38,7 @@ def reader(
"date",
"time",
"sensor_status",
- "interval",
+ "sample_interval",
"n1",
"n2",
"n3",
@@ -99,7 +99,7 @@ def df_sanitizer_fun(df):
import pandas as pd
# - Replace 'status' NaN with 0
- df["sensor_status"] = df["sensor_status"].fillna(0)
+ df["sensor_status"] = df["sensor_status"].astype(float).fillna(value=0).astype(int)
# - Define 'time' datetime column
df["time"] = df["date"].astype(str) + " " + df["time"].astype(str)
diff --git a/disdrodb/l0/readers/BRAZIL/GOAMAZON_OTT.py b/disdrodb/l0/readers/BRAZIL/GOAMAZON_OTT.py
index 3366c62e..0c88ae8e 100644
--- a/disdrodb/l0/readers/BRAZIL/GOAMAZON_OTT.py
+++ b/disdrodb/l0/readers/BRAZIL/GOAMAZON_OTT.py
@@ -71,18 +71,18 @@ def df_sanitizer_fun(df):
df = df["TO_PARSE"].str.split(":", expand=True, n=1)
df.columns = ["ID", "Value"]
- # Drop rows with no values
+ # Select only rows with values
df = df[df["Value"].astype(bool)]
+ df = df[df["Value"].apply(lambda x: x is not None)]
- # Convert ID to integer
- # - First convert to numeric and if errors arise (corrupted rows), drop rows
- df["ID"] = pd.to_numeric(df["ID"], errors="coerce")
- df = df.dropna(subset="ID")
- df["ID"] = df["ID"].astype(int)
+ # Drop rows with invalid IDs
+ # - Corrupted rows
+ valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0")
+ df = df[df["ID"].astype(str).isin(valid_id_str)]
# Create the dataframe with each row corresponding to a timestep
# - Group rows based on when ID values restart
- groups = df.groupby((df["ID"].diff() <= 0).cumsum())
+ groups = df.groupby((df["ID"].astype(int).diff() <= 0).cumsum())
# - Reshape the dataframe
group_dfs = []
diff --git a/disdrodb/l0/readers/BRAZIL/GOAMAZON_RD80.py b/disdrodb/l0/readers/BRAZIL/GOAMAZON_RD80.py
index ce667dbb..0e5b222a 100644
--- a/disdrodb/l0/readers/BRAZIL/GOAMAZON_RD80.py
+++ b/disdrodb/l0/readers/BRAZIL/GOAMAZON_RD80.py
@@ -38,7 +38,7 @@ def reader(
"date",
"time",
"sensor_status",
- "interval",
+ "sample_interval",
"n1",
"n2",
"n3",
@@ -99,7 +99,7 @@ def df_sanitizer_fun(df):
import pandas as pd
# - Replace 'status' NaN with 0
- df["sensor_status"] = df["sensor_status"].fillna(0)
+ df["sensor_status"] = df["sensor_status"].astype(float).fillna(value=0).astype(int)
# - Define 'time' datetime column
df["time"] = df["date"].astype(str) + " " + df["time"].astype(str)
diff --git a/disdrodb/l0/readers/EPFL/UNIL_2022.py b/disdrodb/l0/readers/EPFL/UNIL_2022.py
index e380015a..cc2cec95 100644
--- a/disdrodb/l0/readers/EPFL/UNIL_2022.py
+++ b/disdrodb/l0/readers/EPFL/UNIL_2022.py
@@ -92,10 +92,15 @@ def df_sanitizer_fun(df):
df["time"] = pd.to_datetime(df["time"], format="%d-%m-%Y %H:%M:%S", errors="coerce")
# - Split TO_BE_SPLITTED columns
+
df_splitted = df["TO_BE_SPLITTED"].str.split(",", expand=True, n=1)
df_splitted.columns = ["datalogger_voltage", "rainfall_rate_32bit"]
df["rainfall_rate_32bit"] = df_splitted["rainfall_rate_32bit"]
+ # Remove rows with error in data reading
+ # - When datalogger error: rainfall_rate_32bit: Error in data reading!
+ df = df[df["rainfall_rate_32bit"] != "Error in data reading! 0"]
+
# - Drop columns not agreeing with DISDRODB L0 standards
columns_to_drop = [
"id",
diff --git a/disdrodb/l0/readers/GPM/MC3E.py b/disdrodb/l0/readers/GPM/MC3E.py
index 775d14f4..30156005 100644
--- a/disdrodb/l0/readers/GPM/MC3E.py
+++ b/disdrodb/l0/readers/GPM/MC3E.py
@@ -65,40 +65,104 @@ def reader(
#### - Define dataframe sanitizer function for L0 processing
def df_sanitizer_fun(df):
# - Import pandas
+ import numpy as np
import pandas as pd
- # - Define 'time' datetime
- df_time = pd.to_datetime(df["time"], format="%Y%m%d%H%M%S", errors="coerce")
+ # - Convert 'time' column to datetime
+ df["time"] = pd.to_datetime(df["time"], format="%Y%m%d%H%M%S", errors="coerce")
- # - Split the 'TO_BE_SPLITTED' column
- df = df["TO_BE_SPLITTED"].str.split(",", n=9, expand=True)
+ # Count number of delimiters in the column to be parsed
+ # --> The first rows of some files are corrupted, so use the most frequent count
+ possible_delimiters, counts = np.unique(df["TO_BE_SPLITTED"].str.count(","), return_counts=True)
+ n_delimiters = possible_delimiters[np.argmax(counts)]
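+ # --> 1031, 1033 and 1035 delimiters correspond to different sets of logged variables (see the branches below)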
- # - Assign column names
- column_names = [
- "station_name",
- "sensor_status",
- "sensor_temperature",
- "number_particles",
- "rainfall_rate_32bit",
- "reflectivity_16bit",
- "mor_visibility",
- "weather_code_synop_4680",
- "weather_code_synop_4677",
- "raw_drop_number",
- ]
- df.columns = column_names
-
- # - Add the time column
- df["time"] = df_time
+ if n_delimiters == 1031: # first files
+ # - Select valid rows
+ df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1031]
+ # - Get time column
+ df_time = df["time"]
+ # - Split the 'TO_BE_SPLITTED' column
+ df = df["TO_BE_SPLITTED"].str.split(",", expand=True, n=7)
+ # - Assign column names
+ column_names = [
+ "station_name",
+ "sensor_status",
+ "sensor_temperature",
+ "reflectivity_32bit",
+ "mor_visibility",
+ "weather_code_synop_4680",
+ "weather_code_synop_4677",
+ "raw_drop_number",
+ ]
+ df.columns = column_names
+ # - Add time column
+ df["time"] = df_time
+ # - Remove columns not in other files
+ df = df.drop(columns="reflectivity_32bit")
+ # - Add missing columns and set NaN value
+ missing_columns = [
+ "number_particles",
+ "rainfall_rate_32bit",
+ "reflectivity_16bit",
+ ]
+ for column in missing_columns:
+ df[column] = "NaN"
+ elif n_delimiters == 1033: # (most of the files)
+ # - Select valid rows
+ df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1033]
+ # - Get time column
+ df_time = df["time"]
+ # - Split the column to be parsed
+ df = df["TO_BE_SPLITTED"].str.split(",", expand=True, n=9)
+ # - Assign column names
+ column_names = [
+ "station_name",
+ "sensor_status",
+ "sensor_temperature",
+ "number_particles",
+ "rainfall_rate_32bit",
+ "reflectivity_16bit",
+ "mor_visibility",
+ "weather_code_synop_4680",
+ "weather_code_synop_4677",
+ "raw_drop_number",
+ ]
+ df.columns = column_names
+ # - Add time column
+ df["time"] = df_time
+ elif n_delimiters == 1035: # APU 17 first files
+ # - Select valid rows
+ df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1035]
+ # - Get time column
+ df_time = df["time"]
+ # - Split the column to be parsed
+ df = df["TO_BE_SPLITTED"].str.split(",", expand=True, n=11)
+ # - Assign column names
+ column_names = [
+ "station_name",
+ "sensor_date",
+ "sensor_time",
+ "sensor_status",
+ "sensor_temperature",
+ "number_particles",
+ "rainfall_rate_32bit",
+ "reflectivity_16bit",
+ "mor_visibility",
+ "weather_code_synop_4680",
+ "weather_code_synop_4677",
+ "raw_drop_number",
+ ]
+ df.columns = column_names
+ # - Add time column
+ df["time"] = df_time
+ # - Drop columns not needed
+ df = df.drop(columns=["sensor_time", "sensor_date"])
+ else:
+ # Wrong number of delimiters ... likely a corrupted file
+ raise ValueError("Unexpected number of comma delimiters !")
# - Drop columns not agreeing with DISDRODB L0 standards
df = df.drop(columns=["station_name"])
-
- # - Drop rows with invalid values
- # --> Ensure that weather_code_synop_4677 has length 2
- # --> If a previous column is missing it will have 000
- df = df[df["weather_code_synop_4677"].str.len() == 2]
-
return df
##------------------------------------------------------------------------.
diff --git a/disdrodb/l0/readers/GPM/NSSTC.py b/disdrodb/l0/readers/GPM/NSSTC.py
index 7595ada7..908b1349 100644
--- a/disdrodb/l0/readers/GPM/NSSTC.py
+++ b/disdrodb/l0/readers/GPM/NSSTC.py
@@ -82,7 +82,7 @@ def df_sanitizer_fun(df):
possible_delimiters, counts = np.unique(df["TO_BE_SPLITTED"].str.count(","), return_counts=True)
n_delimiters = possible_delimiters[np.argmax(counts)]
- if n_delimiters == 1027:
+ if n_delimiters == 1027: # APU 2010
# - Select valid rows
df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1027]
# - Get time column
@@ -110,6 +110,37 @@ def df_sanitizer_fun(df):
]
for column in missing_columns:
df[column] = "NaN"
+ elif n_delimiters == 1031: # APU08 (2011)
+ # - Select valid rows
+ df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1031]
+ # - Get time column
+ df_time = df["time"]
+ # - Split the 'TO_BE_SPLITTED' column
+ df = df["TO_BE_SPLITTED"].str.split(",", expand=True, n=7)
+ # - Assign column names
+ column_names = [
+ "station_name",
+ "sensor_status",
+ "sensor_temperature",
+ "reflectivity_32bit",
+ "mor_visibility",
+ "weather_code_synop_4680",
+ "weather_code_synop_4677",
+ "raw_drop_number",
+ ]
+ df.columns = column_names
+ # - Add time column
+ df["time"] = df_time
+ # - Remove columns not in other files
+ df = df.drop(columns="reflectivity_32bit")
+ # - Add missing columns and set NaN value
+ missing_columns = [
+ "number_particles",
+ "rainfall_rate_32bit",
+ "reflectivity_16bit",
+ ]
+ for column in missing_columns:
+ df[column] = "NaN"
elif n_delimiters == 1033:
# - Select valid rows
df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1033]
diff --git a/disdrodb/l0/readers/NCAR/RELAMPAGO_OTT.py b/disdrodb/l0/readers/NCAR/RELAMPAGO_OTT.py
index faefb118..7e8d42cf 100644
--- a/disdrodb/l0/readers/NCAR/RELAMPAGO_OTT.py
+++ b/disdrodb/l0/readers/NCAR/RELAMPAGO_OTT.py
@@ -81,8 +81,14 @@ def df_sanitizer_fun(df):
df = df["TO_PARSE"].str.split(":", expand=True, n=1)
df.columns = ["ID", "Value"]
- # Drop rows with no values
+ # Select only rows with values
df = df[df["Value"].astype(bool)]
+ df = df[df["Value"].apply(lambda x: x is not None)]
+
+ # Drop rows with invalid IDs
+ # - Corrupted rows
+ valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0")
+ df = df[df["ID"].astype(str).isin(valid_id_str)]
# Create the dataframe with each row corresponding to a timestep
# - Group rows based on when ID values restart
diff --git a/disdrodb/l0/readers/NCAR/RELAMPAGO_RD80.py b/disdrodb/l0/readers/NCAR/RELAMPAGO_RD80.py
index 55b0c8da..c33df841 100644
--- a/disdrodb/l0/readers/NCAR/RELAMPAGO_RD80.py
+++ b/disdrodb/l0/readers/NCAR/RELAMPAGO_RD80.py
@@ -38,7 +38,7 @@ def reader(
"date",
"time",
"sensor_status",
- "interval",
+ "sample_interval",
"n1",
"n2",
"n3",
@@ -99,7 +99,7 @@ def df_sanitizer_fun(df):
import pandas as pd
# - Replace 'status' NaN with 0
- df["sensor_status"] = df["sensor_status"].fillna(0)
+ df["sensor_status"] = df["sensor_status"].astype(float).fillna(value=0).astype(int)
# - Replace all ',' with '.' in RI, RA, RAT
df["RI"] = df["RI"].replace({",": "."}, regex=True)
diff --git a/disdrodb/l0/readers/NCAR/SNOWIE_PJ.py b/disdrodb/l0/readers/NCAR/SNOWIE_PJ.py
index 53b4fbc9..dc8607af 100644
--- a/disdrodb/l0/readers/NCAR/SNOWIE_PJ.py
+++ b/disdrodb/l0/readers/NCAR/SNOWIE_PJ.py
@@ -64,14 +64,21 @@ def reader(
#### - Define dataframe sanitizer function for L0 processing
def df_sanitizer_fun(df):
# - Import pandas
+ import numpy as np
import pandas as pd
# Create ID and Value columns
df = df["TO_PARSE"].str.split(":", expand=True, n=1)
df.columns = ["ID", "Value"]
- # Drop rows with no values
+ # Select only rows with values
df = df[df["Value"].astype(bool)]
+ df = df[df["Value"].apply(lambda x: x is not None)]
+
+ # Drop rows with invalid IDs
+ # - Corrupted rows
+ valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0")
+ df = df[df["ID"].astype(str).isin(valid_id_str)]
# Create the dataframe with each row corresponding to a timestep
# - Group rows based on when ID values restart
diff --git a/disdrodb/l0/readers/NCAR/SNOWIE_SB.py b/disdrodb/l0/readers/NCAR/SNOWIE_SB.py
index 223d4ffd..8d8330e4 100644
--- a/disdrodb/l0/readers/NCAR/SNOWIE_SB.py
+++ b/disdrodb/l0/readers/NCAR/SNOWIE_SB.py
@@ -71,14 +71,21 @@ def reader(
#### - Define dataframe sanitizer function for L0 processing
def df_sanitizer_fun(df):
# - Import pandas
+ import numpy as np
import pandas as pd
# Create ID and Value columns
df = df["TO_PARSE"].str.split(":", expand=True, n=1)
df.columns = ["ID", "Value"]
- # Drop rows with no values
+ # Select only rows with values
df = df[df["Value"].astype(bool)]
+ df = df[df["Value"].apply(lambda x: x is not None)]
+
+ # Drop rows with invalid IDs
+ # - Corrupted rows
+ valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0")
+ df = df[df["ID"].astype(str).isin(valid_id_str)]
# Create the dataframe with each row corresponding to a timestep
# - Group rows based on when ID values restart
diff --git a/disdrodb/l0/readers/NCAR/VORTEX2_2010.py b/disdrodb/l0/readers/NCAR/VORTEX2_2010.py
index a3d6752d..3d5fe922 100644
--- a/disdrodb/l0/readers/NCAR/VORTEX2_2010.py
+++ b/disdrodb/l0/readers/NCAR/VORTEX2_2010.py
@@ -82,8 +82,14 @@ def df_sanitizer_fun(df):
df = df["TO_PARSE"].str.split(":", expand=True, n=1)
df.columns = ["ID", "Value"]
- # Drop rows with no values
+ # Select only rows with values
df = df[df["Value"].astype(bool)]
+ df = df[df["Value"].apply(lambda x: x is not None)]
+
+ # Drop rows with invalid IDs
+ # - Corrupted rows
+ valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0")
+ df = df[df["ID"].astype(str).isin(valid_id_str)]
# Create the dataframe with each row corresponding to a timestep
# - Group rows based on when ID values restart
@@ -127,8 +133,8 @@ def df_sanitizer_fun(df):
# "23": "station_number",
"24": "rainfall_amount_absolute_32bit",
"25": "error_code",
- "30": "rainfall_rate_16_bit",
- "31": "rainfall_rate_12_bit",
+ "30": "rainfall_rate_16bit",
+ "31": "rainfall_rate_12bit",
"32": "rainfall_accumulated_16bit",
"90": "raw_drop_concentration",
"91": "raw_drop_average_velocity",
diff --git a/disdrodb/l0/readers/NCAR/VORTEX2_2010_UF.py b/disdrodb/l0/readers/NCAR/VORTEX2_2010_UF.py
index 16f99b25..f0cf2602 100644
--- a/disdrodb/l0/readers/NCAR/VORTEX2_2010_UF.py
+++ b/disdrodb/l0/readers/NCAR/VORTEX2_2010_UF.py
@@ -82,8 +82,18 @@ def df_sanitizer_fun(df):
df = df["TO_PARSE"].str.split(":", expand=True, n=1)
df.columns = ["ID", "Value"]
- # Drop rows with no values
+ # Select only rows with values
df = df[df["Value"].astype(bool)]
+ df = df[df["Value"].apply(lambda x: x is not None)]
+
+ # Drop rows with invalid IDs
+ # - Corrupted rows
+ valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0")
+ df = df[df["ID"].astype(str).isin(valid_id_str)]
+
+ # Raise error if no more rows after removed corrupted ones
+ if len(df) == 0:
+ raise ValueError("No rows left after removing corrupted ones.")
# Create the dataframe with each row corresponding to a timestep
# - Group rows based on when ID values restart
diff --git a/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_P2.py b/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_P2.py
index c1e9e2c0..baef8ab5 100644
--- a/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_P2.py
+++ b/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_P2.py
@@ -97,6 +97,8 @@ def df_sanitizer_fun(df):
# Preprocess the raw spectrum
# - The 'ZERO' indicates no drops detected
# --> "" generates an array of zeros in L0B processing
+ df["raw_drop_number"] = df["raw_drop_number"].astype("string")
+ df["raw_drop_number"] = df["raw_drop_number"].str.strip()
df["raw_drop_number"] = df["raw_drop_number"].replace("ZERO", "")
# Remove and " acronyms from the raw_drop_number field
diff --git a/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_PIPS.py b/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_PIPS.py
index 7814f66f..da0ad731 100644
--- a/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_PIPS.py
+++ b/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_PIPS.py
@@ -144,7 +144,7 @@ def df_sanitizer_fun(df):
df["longitude"] = df_lon
# - Drop columns not agreeing with DISDRODB L0 standards
- df = df.drop(columns=["serial_number", "sensor_time", "serial_number"])
+ df = df.drop(columns=["serial_number", "sensor_time", "sensor_date", "serial_number"])
return df
diff --git a/disdrodb/l0/readers/NETHERLANDS/DELFT.py b/disdrodb/l0/readers/NETHERLANDS/DELFT.py
index 5fa5632a..fcd829cd 100644
--- a/disdrodb/l0/readers/NETHERLANDS/DELFT.py
+++ b/disdrodb/l0/readers/NETHERLANDS/DELFT.py
@@ -156,9 +156,7 @@ def df_sanitizer_fun(df):
"station_name",
"station_number",
"sensor_serial_number",
- "sample_interval",
"sensor_serial_number",
- # "epoch_time",
# "number_particles_all_detected",
]
df = df.drop(columns=columns_to_drop)
diff --git a/disdrodb/l0/routines.py b/disdrodb/l0/routines.py
deleted file mode 100644
index 8b137814..00000000
--- a/disdrodb/l0/routines.py
+++ /dev/null
@@ -1,760 +0,0 @@
-#!/usr/bin/env python3
-
-# -----------------------------------------------------------------------------.
-# Copyright (c) 2021-2023 DISDRODB developers
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see .
-# -----------------------------------------------------------------------------.
-"""Implement DISDRODB wrappers to launch L0 processing in the terminal."""
-
-import datetime
-import logging
-import time
-from typing import Optional
-
-import click
-
-from disdrodb.utils.logger import (
- # log_warning,
- # log_error,
- log_info,
-)
-from disdrodb.utils.scripts import _execute_cmd
-
-logger = logging.getLogger(__name__)
-
-####--------------------------------------------------------------------------.
-#### CLIck
-
-
-def click_l0_stations_options(function: object):
- """Click command line options for DISDRODB archive L0 processing.
-
- Parameters
- ----------
- function : object
- Function.
- """
- function = click.option(
- "--data_sources",
- type=str,
- show_default=True,
- default="",
- help="DISDRODB data sources to process",
- )(function)
- function = click.option(
- "--campaign_names",
- type=str,
- show_default=True,
- default="",
- help="DISDRODB campaign names to process",
- )(function)
- function = click.option(
- "--station_names",
- type=str,
- show_default=True,
- default="",
- help="DISDRODB station names to process",
- )(function)
- return function
-
-
-def click_l0_processing_options(function: object):
- """Click command line default parameters for L0 processing options.
-
- Parameters
- ----------
- function : object
- Function.
- """
- function = click.option(
- "-p",
- "--parallel",
- type=bool,
- show_default=True,
- default=False,
- help="Process files in parallel",
- )(function)
- function = click.option(
- "-d",
- "--debugging_mode",
- type=bool,
- show_default=True,
- default=False,
- help="Switch to debugging mode",
- )(function)
- function = click.option("-v", "--verbose", type=bool, show_default=True, default=True, help="Verbose")(function)
- function = click.option(
- "-f",
- "--force",
- type=bool,
- show_default=True,
- default=False,
- help="Force overwriting",
- )(function)
- return function
-
-
-def click_remove_l0a_option(function: object):
- """Click command line argument for ``remove_l0a``."""
- function = click.option(
- "--remove_l0a",
- type=bool,
- show_default=True,
- default=False,
- help="If true, remove the L0A files once the L0B processing is terminated.",
- )(function)
- return function
-
-
-def click_l0_archive_options(function: object):
- """Click command line arguments for L0 processing archiving of a station.
-
- Parameters
- ----------
- function : object
- Function.
- """
- function = click.option(
- "--l0b_concat",
- type=bool,
- show_default=True,
- default=False,
- help="Produce single L0B netCDF file.",
- )(function)
- function = click.option(
- "--remove_l0b",
- type=bool,
- show_default=True,
- default=False,
- help="If true, remove all source L0B files once L0B concatenation is terminated.",
- )(function)
- function = click.option(
- "--remove_l0a",
- type=bool,
- show_default=True,
- default=False,
- help="If true, remove the L0A files once the L0B processing is terminated.",
- )(function)
- function = click.option(
- "-l0b",
- "--l0b_processing",
- type=bool,
- show_default=True,
- default=True,
- help="Perform L0B processing.",
- )(function)
- function = click.option(
- "-l0a",
- "--l0a_processing",
- type=bool,
- show_default=True,
- default=True,
- help="Perform L0A processing.",
- )(function)
- return function
-
-
-def click_l0b_concat_options(function: object):
- """Click command line default parameters for L0B concatenation.
-
- Parameters
- ----------
- function : object
- Function.
- """
- function = click.option(
- "--remove_l0b",
- type=bool,
- show_default=True,
- default=False,
- help="If true, remove all source L0B files once L0B concatenation is terminated.",
- )(function)
- function = click.option("-v", "--verbose", type=bool, show_default=True, default=False, help="Verbose")(function)
- return function
-
-
-####--------------------------------------------------------------------------.
-#### Run L0A and L0B Station processing
-
-
-def run_disdrodb_l0a_station(
- # Station arguments
- data_source,
- campaign_name,
- station_name,
- # Processing options
- force: bool = False,
- verbose: bool = False,
- debugging_mode: bool = False,
- parallel: bool = True,
- base_dir: Optional[str] = None,
-):
- """Run the L0A processing of a station calling the disdrodb_l0a_station in the terminal."""
- # Define command
- cmd = " ".join(
- [
- "disdrodb_run_l0a_station",
- # Station arguments
- data_source,
- campaign_name,
- station_name,
- # Processing options
- "--force",
- str(force),
- "--verbose",
- str(verbose),
- "--debugging_mode",
- str(debugging_mode),
- "--parallel",
- str(parallel),
- "--base_dir",
- str(base_dir),
- ],
- )
- # Execute command
- _execute_cmd(cmd)
-
-
-def run_disdrodb_l0b_station(
- # Station arguments
- data_source,
- campaign_name,
- station_name,
- # Processing options
- force: bool = False,
- verbose: bool = False,
- debugging_mode: bool = False,
- parallel: bool = True,
- base_dir: Optional[str] = None,
- remove_l0a: bool = False,
-):
- """Run the L0B processing of a station calling disdrodb_run_l0b_station in the terminal."""
- # Define command
- cmd = " ".join(
- [
- "disdrodb_run_l0b_station",
- # Station arguments
- data_source,
- campaign_name,
- station_name,
- # Processing options
- "--force",
- str(force),
- "--verbose",
- str(verbose),
- "--debugging_mode",
- str(debugging_mode),
- "--parallel",
- str(parallel),
- "--remove_l0a",
- str(remove_l0a),
- "--base_dir",
- str(base_dir),
- ],
- )
- # Execute command
- _execute_cmd(cmd)
-
-
-def run_disdrodb_l0b_concat_station(
- data_source,
- campaign_name,
- station_name,
- remove_l0b=False,
- verbose=False,
- base_dir=None,
-):
- """Concatenate the L0B files of a single DISDRODB station.
-
- This function runs the ``disdrodb_run_l0b_concat_station`` script in the terminal.
- """
- cmd = " ".join(
- [
- "disdrodb_run_l0b_concat_station",
- data_source,
- campaign_name,
- station_name,
- "--remove_l0b",
- str(remove_l0b),
- "--verbose",
- str(verbose),
- "--base_dir",
- str(base_dir),
- ],
- )
- _execute_cmd(cmd)
-
-
-####--------------------------------------------------------------------------.
-#### Run L0 Station processing (L0A + L0B)
-
-
-def run_disdrodb_l0_station(
- data_source,
- campaign_name,
- station_name,
- # L0 archive options
- l0a_processing: bool = True,
- l0b_processing: bool = True,
- l0b_concat: bool = False,
- remove_l0a: bool = False,
- remove_l0b: bool = False,
- # Processing options
- force: bool = False,
- verbose: bool = False,
- debugging_mode: bool = False,
- parallel: bool = True,
- base_dir: Optional[str] = None,
-):
- """Run the L0 processing of a specific DISDRODB station from the terminal.
-
- Parameters
- ----------
- data_source : str
- Institution name (when campaign data spans more than 1 country),
- or country (when all campaigns (or sensor networks) are inside a given country).
- Must be UPPER CASE.
- campaign_name : str
- Campaign name. Must be UPPER CASE.
- station_name : str
- Station name
- l0a_processing : bool
- Whether to launch processing to generate L0A Apache Parquet file(s) from raw data.
- The default is ``True``.
- l0b_processing : bool
- Whether to launch processing to generate L0B netCDF4 file(s) from L0A data.
- The default is ``True``.
- l0b_concat : bool
- Whether to concatenate all raw files into a single L0B netCDF file.
- If ``l0b_concat=True``, all raw files will be saved into a single L0B netCDF file.
- If ``l0b_concat=False``, each raw file will be converted into the corresponding L0B netCDF file.
- The default is ``False``.
- remove_l0a : bool
- Whether to remove the L0A files after having generated the L0B netCDF products.
- The default is ``False``.
- remove_l0b : bool
- Whether to remove the L0B files after having concatenated all L0B netCDF files.
- It takes place only if ``l0b_concat=True``.
- The default is ``False``.
- force : bool
- If ``True``, overwrite existing data into destination directories.
- If ``False``, raise an error if there are already data into destination directories.
- The default is ``False``.
- verbose : bool
- Whether to print detailed processing information in the terminal.
- The default is ``False``.
- parallel : bool
- If ``True``, the files are processed simultaneously in multiple processes.
- Each process will use a single thread to avoid issues with the HDF/netCDF library.
- By default, the number of processes is defined with ``os.cpu_count()``.
- If ``False``, the files are processed sequentially in a single process.
- If ``False``, multi-threading is automatically exploited to speed up I/O tasks.
- debugging_mode : bool
- If ``True``, it reduces the amount of data to process.
- For L0A, it processes just the first 3 raw data files for each station.
- For L0B, it processes just the first 100 rows of 3 L0A files for each station.
- The default is ``False``.
- base_dir : str (optional)
- Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
- If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
- """
- # ---------------------------------------------------------------------.
- t_i = time.time()
- msg = f"L0 processing of station {station_name} has started."
- log_info(logger=logger, msg=msg, verbose=verbose)
-
- # ------------------------------------------------------------------.
- # L0A processing
- if l0a_processing:
- run_disdrodb_l0a_station(
- # Station arguments
- base_dir=base_dir,
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name,
- # Processing options
- force=force,
- verbose=verbose,
- debugging_mode=debugging_mode,
- parallel=parallel,
- )
- # ------------------------------------------------------------------.
- # L0B processing
- if l0b_processing:
- run_disdrodb_l0b_station(
- # Station arguments
- base_dir=base_dir,
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name,
- # Processing options
- force=force,
- verbose=verbose,
- debugging_mode=debugging_mode,
- parallel=parallel,
- remove_l0a=remove_l0a,
- )
-
- # ------------------------------------------------------------------------.
- # If l0b_concat=True, concat the netCDF in a single file
- if l0b_concat:
- run_disdrodb_l0b_concat_station(
- base_dir=base_dir,
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name,
- remove_l0b=remove_l0b,
- verbose=verbose,
- )
-
- # -------------------------------------------------------------------------.
- # End of L0 processing for the station
- timedelta_str = str(datetime.timedelta(seconds=time.time() - t_i))
- msg = f"L0 processing of station {station_name} completed in {timedelta_str}"
- log_info(logger, msg, verbose)
-
-
-####---------------------------------------------------------------------------.
-#### Run L0 Archive processing
-
-
-def _check_available_stations(list_info):
- # If no stations available, raise an error
- if len(list_info) == 0:
- msg = "No stations available given the provided `data_sources` and `campaign_names` arguments !"
- raise ValueError(msg)
-
-
-def _filter_list_info(list_info, station_names):
- # Filter by provided stations
- if station_names is not None:
- list_info = [info for info in list_info if info[2] in station_names]
- # If nothing left, raise an error
- if len(list_info) == 0:
- raise ValueError("No stations available given the provided `station_names` argument !")
- return list_info
-
-
-def _get_starting_product(l0a_processing, l0b_processing):
- if l0a_processing:
- product = "RAW"
- elif l0b_processing:
- product = "L0A"
- else:
- raise ValueError("At least l0a_processing or l0b_processing must be `True`.")
- return product
-
-
-def run_disdrodb_l0(
- data_sources=None,
- campaign_names=None,
- station_names=None,
- # L0 archive options
- l0a_processing: bool = True,
- l0b_processing: bool = True,
- l0b_concat: bool = False,
- remove_l0a: bool = False,
- remove_l0b: bool = False,
- # Processing options
- force: bool = False,
- verbose: bool = False,
- debugging_mode: bool = False,
- parallel: bool = True,
- base_dir: Optional[str] = None,
-):
- """Run the L0 processing of DISDRODB stations.
-
- This function allows launching the processing of many DISDRODB stations with a single command.
- From the list of all available DISDRODB stations, it runs the processing of the
- stations matching the provided data_sources, campaign_names and station_names.
-
- Parameters
- ----------
- data_sources : list
- Name of data source(s) to process.
- The name(s) must be UPPER CASE.
- If ``campaign_names`` and ``station_names`` are not specified, process all stations.
- The default is ``None``.
- campaign_names : list
- Name of the campaign(s) to process.
- The name(s) must be UPPER CASE.
- The default is ``None``.
- station_names : list
- Station names to process.
- The default is ``None``.
- l0a_processing : bool
- Whether to launch processing to generate L0A Apache Parquet file(s) from raw data.
- The default is ``True``.
- l0b_processing : bool
- Whether to launch processing to generate L0B netCDF4 file(s) from L0A data.
- The default is ``True``.
- l0b_concat : bool
- Whether to concatenate all raw files into a single L0B netCDF file.
- If ``l0b_concat=True``, all raw files will be saved into a single L0B netCDF file.
- If ``l0b_concat=False``, each raw file will be converted into the corresponding L0B netCDF file.
- The default is ``False``.
- remove_l0a : bool
- Whether to remove the L0A files after having generated the L0B netCDF products.
- The default is ``False``.
- remove_l0b : bool
- Whether to remove the L0B files after having concatenated all L0B netCDF files.
- It takes place only if ``l0b_concat=True``.
- The default is ``False``.
- force : bool
- If ``True``, overwrite existing data into destination directories.
- If ``False``, raise an error if there are already data into destination directories.
- The default is ``False``.
- verbose : bool
- Whether to print detailed processing information in the terminal.
- The default is ``False``.
- parallel : bool
- If ``True``, the files are processed simultaneously in multiple processes.
- Each process will use a single thread to avoid issues with the HDF/netCDF library.
- By default, the number of processes is defined with ``os.cpu_count()``.
- If ``False``, the files are processed sequentially in a single process.
- If ``False``, multi-threading is automatically exploited to speed up I/O tasks.
- debugging_mode : bool
- If ``True``, it reduces the amount of data to process.
- For L0A, it processes just the first 3 raw data files.
- For L0B, it processes just the first 100 rows of 3 L0A files.
- The default is ``False``.
- base_dir : str (optional)
- Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
- If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
- """
- from disdrodb.api.io import available_stations
-
- # Get list of available stations
- product = _get_starting_product(l0a_processing=l0a_processing, l0b_processing=l0b_processing)
- list_info = available_stations(
- base_dir=base_dir,
- product=product,
- data_sources=data_sources,
- campaign_names=campaign_names,
- )
- _check_available_stations(list_info)
- list_info = _filter_list_info(list_info, station_names)
-
- # Print message
- n_stations = len(list_info)
- print(f"L0 processing of {n_stations} stations started.")
-
- # Loop over stations
- for data_source, campaign_name, station_name in list_info:
- print(f"L0 processing of {data_source} {campaign_name} {station_name} station started.")
- # Run processing
- run_disdrodb_l0_station(
- base_dir=base_dir,
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name,
- # L0 archive options
- l0a_processing=l0a_processing,
- l0b_processing=l0b_processing,
- l0b_concat=l0b_concat,
- remove_l0a=remove_l0a,
- remove_l0b=remove_l0b,
- # Process options
- force=force,
- verbose=verbose,
- debugging_mode=debugging_mode,
- parallel=parallel,
- )
- print(f"L0 processing of {data_source} {campaign_name} {station_name} station ended.")
-
-
-def run_disdrodb_l0a(
- data_sources=None,
- campaign_names=None,
- station_names=None,
- # Processing options
- force: bool = False,
- verbose: bool = False,
- debugging_mode: bool = False,
- parallel: bool = True,
- base_dir: Optional[str] = None,
-):
- """Run the L0A processing of DISDRODB stations.
-
- This function allows launching the processing of many DISDRODB stations with a single command.
- From the list of all available DISDRODB stations, it runs the processing of the
- stations matching the provided data_sources, campaign_names and station_names.
-
- Parameters
- ----------
- data_sources : list
- Name of data source(s) to process.
- The name(s) must be UPPER CASE.
- If ``campaign_names`` and ``station_names`` are not specified, process all stations.
- The default is ``None``.
- campaign_names : list
- Name of the campaign(s) to process.
- The name(s) must be UPPER CASE.
- The default is ``None``.
- station_names : list
- Station names to process.
- The default is ``None``.
- force : bool
- If ``True``, overwrite existing data into destination directories.
- If ``False``, raise an error if there are already data into destination directories.
- The default is ``False``.
- verbose : bool
- Whether to print detailed processing information in the terminal.
- The default is ``False``.
- parallel : bool
- If ``True``, the files are processed simultaneously in multiple processes.
- By default, the number of processes is defined with ``os.cpu_count()``.
- If ``False``, the files are processed sequentially in a single process.
- debugging_mode : bool
- If ``True``, it reduces the amount of data to process.
- For L0A, it processes just the first 3 raw data files.
- The default is ``False``.
- base_dir : str (optional)
- Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
- If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
- """
- run_disdrodb_l0(
- base_dir=base_dir,
- data_sources=data_sources,
- campaign_names=campaign_names,
- station_names=station_names,
- # L0 archive options
- l0a_processing=True,
- l0b_processing=False,
- l0b_concat=False,
- remove_l0a=False,
- remove_l0b=False,
- # Processing options
- force=force,
- verbose=verbose,
- debugging_mode=debugging_mode,
- parallel=parallel,
- )
-
-
-def run_disdrodb_l0b(
- data_sources=None,
- campaign_names=None,
- station_names=None,
- # Processing options
- force: bool = False,
- verbose: bool = False,
- debugging_mode: bool = False,
- parallel: bool = True,
- base_dir: Optional[str] = None,
- remove_l0a: bool = False,
-):
- """Run the L0B processing of DISDRODB stations.
-
- This function allows launching the processing of many DISDRODB stations with a single command.
- From the list of all available DISDRODB L0A stations, it runs the processing of the
- stations matching the provided data_sources, campaign_names and station_names.
-
- Parameters
- ----------
- data_sources : list
- Name of data source(s) to process.
- The name(s) must be UPPER CASE.
- If ``campaign_names`` and ``station_names`` are not specified, process all stations.
- The default is ``None``.
- campaign_names : list
- Name of the campaign(s) to process.
- The name(s) must be UPPER CASE.
- The default is ``None``.
- station_names : list
- Station names to process.
- The default is ``None``.
- force : bool
- If ``True``, overwrite existing data into destination directories.
- If ``False``, raise an error if there are already data into destination directories.
- The default is ``False``.
- verbose : bool
- Whether to print detailed processing information in the terminal.
- The default is ``False``.
- parallel : bool
- If ``True``, the files are processed simultaneously in multiple processes.
- By default, the number of processes is defined with ``os.cpu_count()``.
- If ``False``, the files are processed sequentially in a single process.
- debugging_mode : bool
- If ``True``, it reduces the amount of data to process.
- For L0B, it processes just the first 100 rows of 3 L0A files.
- The default is ``False``.
- base_dir : str (optional)
- Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
- If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
- """
- run_disdrodb_l0(
- base_dir=base_dir,
- data_sources=data_sources,
- campaign_names=campaign_names,
- station_names=station_names,
- # L0 archive options
- l0a_processing=False,
- l0b_processing=True,
- l0b_concat=False,
- remove_l0a=remove_l0a,
- remove_l0b=False,
- # Processing options
- force=force,
- verbose=verbose,
- debugging_mode=debugging_mode,
- parallel=parallel,
- )
-
-
-####---------------------------------------------------------------------------.
-def run_disdrodb_l0b_concat(
- data_sources=None,
- campaign_names=None,
- station_names=None,
- remove_l0b=False,
- verbose=False,
- base_dir=None,
-):
- """Concatenate the L0B files of the DISDRODB archive.
-
- This function is called by the ``disdrodb_run_l0b_concat`` script.
- """
- from disdrodb.api.io import available_stations
-
- list_info = available_stations(
- base_dir=base_dir,
- product="L0B",
- data_sources=data_sources,
- campaign_names=campaign_names,
- )
-
- _check_available_stations(list_info)
- list_info = _filter_list_info(list_info, station_names)
-
- # Print message
- n_stations = len(list_info)
- print(f"Concatenation of {n_stations} L0B stations started.")
-
- # Start the loop to launch the concatenation of each station
- for data_source, campaign_name, station_name in list_info:
- print(f"L0B files concatenation of {data_source} {campaign_name} {station_name} station started.")
- run_disdrodb_l0b_concat_station(
- base_dir=base_dir,
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name,
- remove_l0b=remove_l0b,
- verbose=verbose,
- )
- print(f"L0 files concatenation of {data_source} {campaign_name} {station_name} station ended.")
-
-
-####---------------------------------------------------------------------------.
diff --git a/disdrodb/l0/standards.py b/disdrodb/l0/standards.py
index 2c563abb..4d4390cf 100644
--- a/disdrodb/l0/standards.py
+++ b/disdrodb/l0/standards.py
@@ -18,8 +18,6 @@
# -----------------------------------------------------------------------------.
"""Retrieve L0 sensor standards."""
-import datetime
-import importlib
import logging
import numpy as np
@@ -29,11 +27,6 @@
logger = logging.getLogger(__name__)
-PRODUCT_VERSION = "V0"
-SOFTWARE_VERSION = "V" + importlib.metadata.version("disdrodb")
-CONVENTIONS = "CF-1.10, ACDD-1.3"
-EPOCH = "seconds since 1970-01-01 00:00:00"
-
####--------------------------------------------------------------------------.
#### Variables validity dictionary
@@ -252,150 +245,6 @@ def get_l0b_cf_attrs_dict(sensor_name: str) -> dict:
return read_config_file(sensor_name=sensor_name, product="L0A", filename="l0b_cf_attrs.yml")
-####-------------------------------------------------------------------------.
-#### Coordinates attributes
-
-
-def get_coords_attrs_dict():
- """Return dictionary with DISDRODB coordinates attributes."""
- attrs_dict = {}
- # Define diameter attributes
- attrs_dict["diameter_bin_center"] = {
- "name": "diameter_bin_center",
- "standard_name": "diameter_bin_center",
- "long_name": "diameter_bin_center",
- "units": "mm",
- "description": "Bin center drop diameter value",
- }
- attrs_dict["diameter_bin_width"] = {
- "name": "diameter_bin_width",
- "standard_name": "diameter_bin_width",
- "long_name": "diameter_bin_width",
- "units": "mm",
- "description": "Drop diameter bin width",
- }
- attrs_dict["diameter_bin_upper"] = {
- "name": "diameter_bin_upper",
- "standard_name": "diameter_bin_upper",
- "long_name": "diameter_bin_upper",
- "units": "mm",
- "description": "Bin upper bound drop diameter value",
- }
- attrs_dict["velocity_bin_lower"] = {
- "name": "velocity_bin_lower",
- "standard_name": "velocity_bin_lower",
- "long_name": "velocity_bin_lower",
- "units": "mm",
- "description": "Bin lower bound drop diameter value",
- }
- # Define velocity attributes
- attrs_dict["velocity_bin_center"] = {
- "name": "velocity_bin_center",
- "standard_name": "velocity_bin_center",
- "long_name": "velocity_bin_center",
- "units": "m/s",
- "description": "Bin center drop fall velocity value",
- }
- attrs_dict["velocity_bin_width"] = {
- "name": "velocity_bin_width",
- "standard_name": "velocity_bin_width",
- "long_name": "velocity_bin_width",
- "units": "m/s",
- "description": "Drop fall velocity bin width",
- }
- attrs_dict["velocity_bin_upper"] = {
- "name": "velocity_bin_upper",
- "standard_name": "velocity_bin_upper",
- "long_name": "velocity_bin_upper",
- "units": "m/s",
- "description": "Bin upper bound drop fall velocity value",
- }
- attrs_dict["velocity_bin_lower"] = {
- "name": "velocity_bin_lower",
- "standard_name": "velocity_bin_lower",
- "long_name": "velocity_bin_lower",
- "units": "m/s",
- "description": "Bin lower bound drop fall velocity value",
- }
- # Define geolocation attributes
- attrs_dict["latitude"] = {
- "name": "latitude",
- "standard_name": "latitude",
- "long_name": "Latitude",
- "units": "degrees_north",
- }
- attrs_dict["longitude"] = {
- "name": "longitude",
- "standard_name": "longitude",
- "long_name": "Longitude",
- "units": "degrees_east",
- }
- attrs_dict["altitude"] = {
- "name": "altitude",
- "standard_name": "altitude",
- "long_name": "Altitude",
- "units": "m",
- "description": "Elevation above sea level",
- }
- # Define time attributes
- attrs_dict["time"] = {
- "name": "time",
- "standard_name": "time",
- "long_name": "time",
- "description": "UTC Time",
- }
-
- return attrs_dict
-
-
-####-------------------------------------------------------------------------.
-#### DISDRODB attributes
-
-
-def set_disdrodb_attrs(ds, product: str):
- """Add DISDRODB processing information to the netCDF global attributes.
-
- It assumes station metadata have already been added to the dataset.
-
- Parameters
- ----------
- ds : xarray.Dataset
- Dataset
- product: str
- DISDRODB product.
-
- Returns
- -------
- xarray.Dataset
- Dataset.
- """
- # Add dataset conventions
- ds.attrs["Conventions"] = CONVENTIONS
-
- # Add featureType
- platform_type = ds.attrs["platform_type"]
- if platform_type == "fixed":
- ds.attrs["featureType"] = "timeSeries"
- else:
- ds.attrs["featureType"] = "trajectory"
-
- # Add time_coverage_start and time_coverage_end
- ds.attrs["time_coverage_start"] = str(ds["time"].data[0])
- ds.attrs["time_coverage_end"] = str(ds["time"].data[-1])
-
- # DISDRODB attributes
- # - Add DISDRODB processing info
- now = datetime.datetime.utcnow()
- current_time = now.strftime("%Y-%m-%d %H:%M:%S")
- ds.attrs["disdrodb_processing_date"] = current_time
- # - Add DISDRODB product and version
- ds.attrs["disdrodb_product_version"] = PRODUCT_VERSION
- ds.attrs["disdrodb_software_version"] = SOFTWARE_VERSION
- ds.attrs["disdrodb_product"] = product
-
- return ds
-
-
####-------------------------------------------------------------------------.
#### Bin Coordinates Information
@@ -762,20 +611,6 @@ def get_l0b_encodings_dict(sensor_name: str) -> dict:
return encoding_dict
-def get_time_encoding() -> dict:
- """Create time encoding.
-
- Returns
- -------
- dict
- Time encoding.
- """
- encoding = {}
- encoding["units"] = EPOCH
- encoding["calendar"] = "proleptic_gregorian"
- return encoding
-
-
####-------------------------------------------------------------------------.
#### L0B processing tools
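The helpers removed above (coordinate attributes, global attributes, time encoding) follow a standard xarray pattern. A minimal sketch of that pattern, using an illustrative dataset rather than the library's own objects:

import numpy as np
import pandas as pd
import xarray as xr

# Illustrative dataset with a time coordinate and a diameter coordinate.
ds = xr.Dataset(
    {"drop_counts": (("time", "diameter_bin_center"), np.zeros((3, 2)))},
    coords={
        "time": pd.date_range("2024-01-01", periods=3, freq="30s"),
        "diameter_bin_center": [0.31, 0.44],
    },
)
# Attach coordinate attributes, as get_coords_attrs_dict() provided them.
ds["diameter_bin_center"].attrs.update(
    {"long_name": "diameter_bin_center", "units": "mm", "description": "Bin center drop diameter value"}
)
# Encode time as seconds since the Unix epoch, as get_time_encoding() did.
encoding = {"time": {"units": "seconds since 1970-01-01 00:00:00", "calendar": "proleptic_gregorian"}}
ds.to_netcdf("example_l0b.nc", encoding=encoding)
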
diff --git a/disdrodb/l0/template_tools.py b/disdrodb/l0/template_tools.py
index 4cb8de19..9fb51b54 100644
--- a/disdrodb/l0/template_tools.py
+++ b/disdrodb/l0/template_tools.py
@@ -194,7 +194,7 @@ def print_df_summary_stats(
# Define columns of interest
_, columns_of_interest = _get_selected_column_names(df, column_indices)
# Remove columns of dtype object or string
- indices_to_remove = np.where((df.dtypes == type(object)) | (df.dtypes == str))
+ indices_to_remove = np.where((df.dtypes == type(object)) | (df.dtypes == str)) # noqa
indices = np.arange(0, len(df.columns))
indices = indices[np.isin(indices, indices_to_remove, invert=True)]
columns = df.columns[indices]
@@ -325,9 +325,7 @@ def str_has_decimal_digits(string: str) -> bool:
bool
True if string has digits.
"""
- if len(string.split(".")) == 2:
- return True
- return False
+ return len(string.split(".")) == 2
def get_decimal_ndigits(string: str) -> int:
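The simplified return statement above keeps the original behavior; a few illustrative calls:

str_has_decimal_digits("12.5")   # True: splitting on "." yields exactly two parts
str_has_decimal_digits("12")     # False: no decimal part
str_has_decimal_digits("1.2.3")  # False: splitting yields three parts
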
diff --git a/disdrodb/l1/__init__.py b/disdrodb/l1/__init__.py
new file mode 100644
index 00000000..3bba3aaf
--- /dev/null
+++ b/disdrodb/l1/__init__.py
@@ -0,0 +1,17 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""DISDRODB L1 module."""
diff --git a/disdrodb/l1/beard_model.py b/disdrodb/l1/beard_model.py
new file mode 100644
index 00000000..1e25ff38
--- /dev/null
+++ b/disdrodb/l1/beard_model.py
@@ -0,0 +1,716 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""Utilities to estimate the drop fall velocity using the Beard model."""
+
+
+import numpy as np
+import xarray as xr
+
+
+def get_gravitational_acceleration(latitude, altitude=0):
+ """
+ Computes gravitational acceleration at a given altitude and latitude.
+
+ Parameters
+ ----------
+ latitude : float
+ Latitude in degrees.
+ altitude : float
+ Altitude in meters. The default is 0 m (sea level).
+
+ Returns
+ -------
+ float
+ Gravitational acceleration in m/s^2.
+ """
+ g0 = 9.806229 - 0.025889372 * np.cos(2 * np.deg2rad(latitude))
+ return g0 - 2.879513 * altitude / 1e6
+
+
+def get_air_pressure_at_height(
+ altitude,
+ latitude,
+ temperature,
+ sea_level_air_pressure=101_325,
+ lapse_rate=0.0065,
+ gas_constant_dry_air=287.04,
+):
+ """
+ Computes the air pressure at a given height in a standard atmosphere.
+
+ According to the hypsometric formula of Brutsaert 1982; Ulaby et al. 1981
+
+ Parameters
+ ----------
+ altitude : float
+ Altitude in meters.
+ latitude : float
+ Latitude in degrees.
+ temperature : float
+ Temperature at altitude in Kelvin.
+ sea_level_air_pressure : float, optional
+ Standard atmospheric pressure at sea level in Pascals. The default is 101_325 Pascals.
+ lapse_rate : float, optional
+ Standard atmospheric lapse rate in K/m. The default is 0.0065 K/m.
+ gas_constant_dry_air : float, optional
+ Gas constant for dry air in J/(kg*K). The default is 287.04 J/(kg*K).
+
+ Returns
+ -------
+ float
+ Air pressure in Pascals.
+ """
+ g = get_gravitational_acceleration(altitude=altitude, latitude=latitude)
+ return sea_level_air_pressure * np.exp(
+ -g / (lapse_rate * gas_constant_dry_air) * np.log(1 + lapse_rate * altitude / temperature),
+ )
+
+
+def get_air_temperature_at_height(altitude, sea_level_temperature, lapse_rate=0.0065):
+ """
+ Computes the air temperature at a given height in a standard atmosphere.
+
+ Reference: Brutsaert 1982; Ulaby et al. 1981
+
+ Parameters
+ ----------
+ altitude : float
+ Altitude in meters.
+ sea_level_temperature : float
+ Standard temperature at sea level in Kelvin.
+ lapse_rate : float, optional
+ Standard atmospheric lapse rate in K/m. The default is 0.0065 K/m.
+
+ Returns
+ -------
+ float
+ Air temperature in Kelvin.
+ """
+ return sea_level_temperature - lapse_rate * altitude
+
+
+def get_vapor_actual_pressure_at_height(
+ altitude,
+ sea_level_temperature,
+ sea_level_relative_humidity,
+ sea_level_air_pressure=101_325,
+ lapse_rate=0.0065,
+):
+ """
+ Computes the vapor pressure using Yamamoto's exponential relationship.
+
+ Reference: Brutsaert 1982
+
+ Parameters
+ ----------
+ altitude : float
+ Altitude in meters.
+ sea_level_temperature : float
+ Standard temperature at sea level in Kelvin.
+ sea_level_relative_humidity : float
+ Relative humidity at sea level. A value between 0 and 1.
+ sea_level_air_pressure : float, optional
+ Standard atmospheric pressure at sea level in Pascals. The default is 101_325 Pascals.
+ lapse_rate : float, optional
+ Standard atmospheric lapse rate in K/m. The default is 0.0065 K/m.
+
+ Returns
+ -------
+ float
+ Vapor pressure in Pascals.
+ """
+ temperature_at_altitude = get_air_temperature_at_height(
+ altitude=altitude,
+ sea_level_temperature=sea_level_temperature,
+ lapse_rate=lapse_rate,
+ )
+ esat = get_vapor_saturation_pressure(sea_level_temperature)
+ actual_vapor = sea_level_relative_humidity / (1 / esat - (1 - sea_level_relative_humidity) / sea_level_air_pressure)
+ return actual_vapor * np.exp(-(5.8e3 * lapse_rate / (temperature_at_altitude**2) + 5.5e-5) * altitude)
+
+
+def get_vapor_saturation_pressure(temperature):
+ """
+ Computes the saturation vapor pressure over water as a function of temperature.
+
+ Use formulation and coefficients of Wexler (1976, 1977).
+ References: Brutsaert 1982; Pruppacher & Klett 1978; Flatau & al. 1992
+
+ Parameters
+ ----------
+ temperature : float
+ Temperature in Kelvin.
+
+ Returns
+ -------
+ float
+ Saturation vapor pressure in Pascal.
+ """
+ # Polynomial coefficients
+ g = [
+ -0.29912729e4,
+ -0.60170128e4,
+ 0.1887643854e2,
+ -0.28354721e-1,
+ 0.17838301e-4,
+ -0.84150417e-9,
+ 0.44412543e-12,
+ 0.2858487e1,
+ ]
+ # Evaluate the polynomial using Horner's rule
+ esat = g[6]
+ for i in [5, 4, 3, 2]:
+ esat = esat * temperature + g[i]
+ esat = esat + g[7] * np.log(temperature)
+ for i in [1, 0]:
+ esat = esat * temperature + g[i]
+ return np.exp(esat / (temperature**2))
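As a quick, approximate sanity check of the Wexler formulation above (the expected value is a well-known reference figure, not taken from this patch), the saturation vapor pressure near 20 degrees Celsius is on the order of 2.3 kPa:

esat_20c = get_vapor_saturation_pressure(293.15)
print(esat_20c)  # roughly 2.3e3 Pa at 20 degrees C
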
+
+
+def get_vapor_actual_pressure(relative_humidity, temperature):
+ """
+ Computes the actual vapor pressure over water.
+
+ Parameters
+ ----------
+ relative_humidity : float
+ Relative humidity. A value between 0 and 1.
+ temperature : float
+ Temperature in Kelvin.
+
+ Returns
+ -------
+ float
+ Actual vapor pressure in Pascal.
+ """
+ esat = get_vapor_saturation_pressure(temperature)
+ return relative_humidity * esat
+
+
+def get_pure_water_density(temperature):
+ """
+ Computes the density of pure water at standard pressure.
+
+ For temperatures above freezing, it uses the Kell formulation.
+ For temperatures below freezing, it uses the Dorsch & Boyd formulation.
+
+ References: Pruppacher & Klett 1978; Weast & Astle 1980
+
+ Parameters
+ ----------
+ temperature : float
+ Temperature in Kelvin.
+
+ Returns
+ -------
+ float
+ Density of pure water in kg/m^3.
+ """
+ # Convert to Celsius
+ temperature = temperature - 273.15
+
+ # Define mask
+ above_freezing_mask = temperature > 0
+
+ # Compute density above freezing temperature
+ c = [9.9983952e2, 1.6945176e1, -7.9870401e-3, -4.6170461e-5, 1.0556302e-7, -2.8054253e-10, 1.6879850e-2]
+ density = c[0] + sum(c * temperature**i for i, c in enumerate(c[1:6], start=1))
+ density_above_0 = density / (1 + c[6] * temperature)
+
+ # Compute density below freezing temperature
+ c = [999.84, 0.086, -0.0108]
+ density_below_0 = c[0] + sum(c * temperature**i for i, c in enumerate(c[1:], start=1))
+
+ # Define final density
+ density = xr.where(above_freezing_mask, density_above_0, density_below_0)
+ return density
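A similar approximate sanity check for the density formulation above (the reference value is a well-known figure, not from this patch): pure water near 20 degrees Celsius has a density close to 998 kg/m^3:

rho_20c = get_pure_water_density(293.15)
print(rho_20c)  # should be close to 998 kg/m3 at 20 degrees C
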
+
+
+def get_pure_water_compressibility(temperature):
+ """
+ Computes the isothermal compressibility of pure ordinary water.
+
+ Reference: Kell, Weast & Astle 1980
+
+ Parameters
+ ----------
+ temperature : float
+ Temperature in Kelvin.
+
+ Returns
+ -------
+ float
+ Compressibility of water in Pascals.
+ """
+ # Convert to Celsius
+ temperature = temperature - 273.15
+
+ # Compute compressibility
+ c = [5.088496e1, 6.163813e-1, 1.459187e-3, 2.008438e-5, -5.857727e-8, 4.10411e-10, 1.967348e-2]
+ compressibility = c[0] + sum(c * temperature**i for i, c in enumerate(c[1:6], start=1))
+ compressibility = compressibility / (1 + c[6] * temperature) * 1e-11
+ return compressibility
+
+
+def get_pure_water_surface_tension(temperature):
+ """
+ Computes the surface tension of pure ordinary water against air.
+
+ Reference: Pruppacher & Klett 1978
+
+ Parameters
+ ----------
+ temperature : float
+ Temperature in Kelvin.
+
+ Returns
+ -------
+ float
+ Surface tension in N/m.
+ """
+ sigma = 0.0761 - 0.000155 * (temperature - 273.15)
+ return sigma
+
+
+def get_air_dynamic_viscosity(temperature):
+ """
+ Computes the dynamic viscosity of dry air.
+
+ Reference: Beard 1977; Pruppacher & Klett 1978
+
+ Parameters
+ ----------
+ temperature : float
+ Temperature in Kelvin.
+
+ Returns
+ -------
+ float
+ Dynamic viscosity of dry air in kg/(m*s) (aka Pa*s).
+ """
+ # Convert to Celsius
+ temperature = temperature - 273.15
+
+ # Define mask
+ above_freezing_mask = temperature > 0
+
+ # Compute viscosity above freezing temperature
+ viscosity_above_0 = (1.721 + 0.00487 * temperature) / 1e5
+
+ # Compute viscosity below freezing temperature
+ viscosity_below_0 = (1.718 + 0.0049 * temperature - 1.2 * temperature**2 / 1e5) / 1e5
+
+ # Define final viscosity
+ viscosity = xr.where(above_freezing_mask, viscosity_above_0, viscosity_below_0)
+ return viscosity
+
+
+def get_air_density(temperature, air_pressure, vapor_pressure, gas_constant_dry_air=287.04):
+ """
+ Computes the air density according to the equation of state for moist air.
+
+ Reference: Brutsaert 1982
+
+ Parameters
+ ----------
+ temperature : float
+ Temperature in Kelvin.
+ air_pressure : float
+ Air pressure in Pascals.
+ vapor_pressure : float
+ Vapor pressure in Pascals.
+ gas_constant_dry_air : float, optional
+ Gas constant for dry air in J/(kg*K). The default is 287.04 J/(kg*K).
+
+ Returns
+ -------
+ float
+ Air density in kg/m^3.
+ """
+ # # Define constant for water vapor in J/(kg·K)
+ # gas_constant_water_vapor=461.5
+
+ # # Partial pressure of dry air (Pa)
+ # pressure_dry_air = air_pressure - vapor_pressure
+
+ # # Density of dry air (kg/m^3)
+ # density_dry_air = pressure_dry_air / (gas_constant_dry_air * temperature)
+
+ # # Density of water vapor (kg/m^3)
+ # density_water_vapor = vapor_pressure / (gas_constant_water_vapor * temperature)
+
+ # # Total air density (kg/m^3)
+ # air_density = density_dry_air + density_water_vapor
+
+ return air_pressure * (1 - 0.378 * vapor_pressure / air_pressure) / (gas_constant_dry_air * temperature)
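A hedged sanity check of the moist-air equation of state above: for dry air (zero vapor pressure) at 20 degrees Celsius and standard sea-level pressure, the density should come out close to the well-known value of about 1.2 kg/m^3:

rho_air = get_air_density(temperature=293.15, air_pressure=101_325, vapor_pressure=0.0)
print(rho_air)  # should be close to 1.2 kg/m3
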
+
+
+def get_water_density(temperature, air_pressure, sea_level_air_pressure=101_325):
+ """
+ Computes the density of water according to Weast & Astle 1980.
+
+ Parameters
+ ----------
+ temperature : float
+ Temperature in Kelvin.
+ air_pressure : float
+ Air pressure in Pascals.
+ sea_level_air_pressure : float
+ Standard atmospheric pressure at sea level in Pascals.
+ The default is 101_325 Pascal.
+
+ Returns
+ -------
+ float
+ Water density in kg/m^3.
+ """
+ delta_pressure = sea_level_air_pressure - air_pressure
+ water_compressibility = get_pure_water_compressibility(temperature)
+ return get_pure_water_density(temperature) * np.exp(-1 * water_compressibility * delta_pressure)
+
+
+def get_raindrop_reynolds_number(diameter, temperature, air_density, water_density, g):
+ """Compute raindrop Reynolds number.
+
+ It quantifies the relative strength of the convective inertia and linear viscous
+ forces acting on the drop at terminal velocity.
+
+ Estimates Reynolds number for drops with diameter between 19 um and 7 mm.
+ Coefficients are taken from Table 1 of Beard 1976.
+
+ Reference: Beard 1976; Pruppacher & Klett 1978
+
+ Parameters
+ ----------
+ diameter : float
+ Diameter of the raindrop in meters.
+ temperature : float
+ Temperature in Kelvin.
+ air_density : float
+ Density of air in kg/m^3.
+ water_density : float
+ Density of water in kg/m^3.
+ g : float
+ Gravitational acceleration in m/s^2.
+
+ Returns
+ -------
+ float
+ Reynolds number for the raindrop.
+ """
+ # Define mask for small and large particles
+ small_diam_mask = diameter < 1.07e-3 # diameter < 1.07 mm
+
+ # Compute properties
+ pure_water_surface_tension = get_pure_water_surface_tension(temperature) # N/m
+ air_viscosity = get_air_dynamic_viscosity(temperature) # kg/(m*s) (aka Pa*s).
+ delta_density = water_density - air_density
+
+ # Compute Davis number for small droplets
+ davis_number = 4 * air_density * delta_density * g * diameter**3 / (3 * air_viscosity**2)
+
+ # Compute the slip correction (is approx 1 and can be discarded)
+ # l0 = 6.62*1e-8 # m
+ # v0 = 0.01818 # g / m / s
+ # p0 = 101_325_25 # Pa
+ # t0 = 293.15 # K
+ # c_sc = 1 + 2.51*l0*(air_viscosity/v0)*(air_pressure/p0)*((temperature/t0)**3)/diameter
+
+ # Compute modified Bond and physical property numbers for large droplets
+ bond_number = 4 * delta_density * g * diameter**2 / (3 * pure_water_surface_tension)
+ property_number = pure_water_surface_tension**3 * air_density**2 / (air_viscosity**4 * delta_density * g)
+
+ # Compute the Reynolds number for small particles (diameter < 0.00107 m, i.e. ~1 mm)
+ # --> First 9 bins of Parsivel ...
+ b = [-3.18657, 0.992696, -0.00153193, -0.000987059, -0.000578878, 0.0000855176, -0.00000327815]
+ x = np.log(davis_number)
+ y = b[0] + sum(b * x**i for i, b in enumerate(b[1:], start=1))
+ reynolds_number_small = np.exp(y) # TODO: missing C_sc (slip correction factor)?
+
+ # Compute the Reynolds number for large particles (diameter >= 0.00107 m)
+ b = [-5.00015, 5.23778, -2.04914, 0.475294, -0.0542819, 0.00238449]
+ log_property_number = np.log(property_number) / 6
+ x = np.log(bond_number) + log_property_number
+ y = b[0] + sum(b * x**i for i, b in enumerate(b[1:], start=1))
+ reynolds_number_large = np.exp(log_property_number + y)
+
+ # Define final reynolds number
+ reynolds_number = xr.where(small_diam_mask, reynolds_number_small, reynolds_number_large)
+ return reynolds_number
+
+
+def get_fall_velocity_beard_1976(diameter, temperature, air_density, water_density, g):
+ """
+ Computes the terminal fall velocity of a raindrop in still air.
+
+ Reference: Beard 1976; Pruppacher & Klett 1978
+
+ Parameters
+ ----------
+ diameter : float
+ Diameter of the raindrop in meters.
+ temperature : float
+ Temperature in Kelvin.
+ air_density : float
+ Density of air in kg/m^3.
+ water_density : float
+ Density of water in kg/m^3.
+ g : float
+ Gravitational acceleration in m/s^2.
+
+ Returns
+ -------
+ float
+ Terminal fall velocity of the raindrop in m/s.
+ """
+ air_viscosity = get_air_dynamic_viscosity(temperature)
+ reynolds_number = get_raindrop_reynolds_number(
+ diameter=diameter,
+ temperature=temperature,
+ air_density=air_density,
+ water_density=water_density,
+ g=g,
+ )
+ fall_velocity = air_viscosity * reynolds_number / (air_density * diameter)
+ return fall_velocity
+
+
+def get_drag_coefficient(diameter, air_density, water_density, fall_velocity, g=9.81):
+ """
+ Computes the drag coefficient for a raindrop.
+
+ Parameters
+ ----------
+ diameter : float
+ Diameter of the raindrop in meters.
+ air_density : float
+ Density of air in kg/m^3.
+ water_density : float
+ Density of water in kg/m^3.
+ fall_velocity : float
+ Terminal fall velocity of the raindrop in m/s.
+ g : float
+ Gravitational acceleration in m/s^2.
+
+ Returns
+ -------
+ float
+ Drag coefficient of the raindrop.
+ """
+ delta_density = water_density - air_density
+ drag_coefficient = 4 * delta_density * g * diameter / (3 * air_density * fall_velocity**2)
+ return drag_coefficient
+
+
+def retrieve_fall_velocity(
+ diameter,
+ altitude,
+ latitude,
+ temperature,
+ relative_humidity,
+ air_pressure=None,
+ sea_level_air_pressure=101_325,
+ gas_constant_dry_air=287.04,
+ lapse_rate=0.0065,
+):
+ """
+ Computes the terminal fall velocity of liquid raindrops in still air.
+
+ Parameters
+ ----------
+ diameter : float
+ Diameter of the raindrop in meters.
+ altitude : float
+ Altitude in meters.
+ temperature : float
+ Temperature in Kelvin.
+ relative_humidity : float
+ Relative humidity. A value between 0 and 1.
+ latitude : float
+ Latitude in degrees.
+ air_pressure : float
+ Air pressure in Pascals.
+ If None, air_pressure at altitude is inferred assuming
+ a standard atmospheric pressure at sea level.
+ sea_level_air_pressure : float
+ Standard atmospheric pressure at sea level in Pascals.
+ The default is 101_325 Pascal.
+ gas_constant_dry_air : float, optional
+ Gas constant for dry air in J/(kg*K). The default is 287.04 J/(kg*K).
+ lapse_rate : float, optional
+ Standard atmospheric lapse rate in K/m. The default is 0.0065 K/m.
+
+ Returns
+ -------
+ float
+ Terminal fall velocity of the raindrops in m/s.
+ """
+ # Retrieve air pressure at altitude if not specified
+ if air_pressure is None:
+ air_pressure = get_air_pressure_at_height(
+ altitude=altitude,
+ latitude=latitude,
+ temperature=temperature,
+ sea_level_air_pressure=sea_level_air_pressure,
+ lapse_rate=lapse_rate,
+ gas_constant_dry_air=gas_constant_dry_air,
+ )
+
+ # Retrieve vapour pressure (from relative humidity)
+ vapor_pressure = get_vapor_actual_pressure(
+ relative_humidity=relative_humidity,
+ temperature=temperature,
+ )
+
+ # Retrieve air density and water density
+ air_density = get_air_density(
+ temperature=temperature,
+ air_pressure=air_pressure,
+ vapor_pressure=vapor_pressure,
+ gas_constant_dry_air=gas_constant_dry_air,
+ )
+ water_density = get_water_density(
+ temperature=temperature,
+ air_pressure=air_pressure,
+ sea_level_air_pressure=sea_level_air_pressure,
+ )
+
+ # Retrieve accurate gravitational_acceleration
+ g = get_gravitational_acceleration(altitude=altitude, latitude=latitude)
+
+ # Compute fall velocity
+ fall_velocity = get_fall_velocity_beard_1976(
+ diameter=diameter,
+ temperature=temperature,
+ air_density=air_density,
+ water_density=water_density,
+ g=g,
+ )
+
+ # drag_coefficient = get_drag_coefficient(diameter=diameter,
+ # air_density=air_density,
+ # water_density=water_density,
+ # g=g.
+ # fall_velocity=fall_velocity)
+
+ return fall_velocity
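A minimal end-to-end sketch of the retrieval above, with illustrative (placeholder) environmental conditions; the printed velocities should increase with diameter and fall roughly in the 2-7 m/s range for these drop sizes:

import xarray as xr

diameters = xr.DataArray([0.5e-3, 1e-3, 2e-3], dims="diameter_bin_center")  # drop diameters in meters
fall_velocity = retrieve_fall_velocity(
    diameter=diameters,
    altitude=500.0,         # m (placeholder)
    latitude=46.5,          # degrees (placeholder)
    temperature=288.15,     # K (placeholder)
    relative_humidity=0.7,  # between 0 and 1 (placeholder)
)
print(fall_velocity.values)  # terminal fall velocities in m/s, increasing with diameter
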
+
+
+####-----------------------------------------------------------------------------------------
+#### OLD CODE
+
+
+# def get_fall_velocity_beard_1977(diameter):
+# """
+# Compute the fall velocity of raindrops using the Beard (1977) relationship.
+
+# Parameters
+# ----------
+# diameter : array-like
+# Diameter of the raindrops in millimeters.
+# Valid up to 7 mm (0.7 cm).
+
+# Returns
+# -------
+# fall_velocity : array-like
+# Fall velocities in meters per second.
+
+# Notes
+# -----
+# This method uses an exponential function based on the work of Beard (1977),
+# valid at sea level conditions (pressure = 1 atm, temperature = 20°C,
+# air density = 1.194 kg/m³).
+
+# References
+# ----------
+# Beard, K. V. (1977).
+# Terminal velocity adjustment for cloud and precipitation drops aloft.
+# Journal of the Atmospheric Sciences, 34(8), 1293-1298.
+# https://doi.org/10.1175/1520-0469(1977)034<1293:TVAFCA>2.0.CO;2
+
+# """
+# diameter_cm = diameter/1000
+# c = [7.06037, 1.74951, 4.86324, 6.60631, 4.84606, 2.14922, 0.58714, 0.096348, 0.00869209, 0.00033089]
+# log_diameter = np.log(diameter_cm)
+# y = c[0] + sum(c * log_diameter**i for i, c in enumerate(c[1:], start=1))
+# fall_velocity = np.exp(y)
+# return fall_velocity
+
+
+# def get_fall_velocity_beard_1977(diameter, temperature, air_pressure, gas_constant_dry_air=287.04):
+# """
+# Computes the terminal fall velocity of a raindrop in still air.
+
+# This function is based on the Table 4 coefficients of Kenneth V. Beard (1977),
+# "Terminal Velocity and Shape of Cloud and Precipitation Drops Aloft",
+# Journal of the Atmospheric Sciences, Vol. 34, pp. 1293-1298.
+
+# Note: This approximation is valid at sea level with conditions:
+# Pressure = 1 atm, Temperature = 20°C, (saturated) air density = 1.194 kg/m³.
+
+# Parameters
+# ----------
+# diameter : array-like
+# Array of equivolume drop diameters in meters.
+
+# Returns
+# -------
+# fall_velocity : array-like
+# Array of terminal fall velocity in meters per second (m/s).
+# For diameters greater than 7 mm, the function returns NaN.
+
+# """
+# # PROBLEMATIC
+# # Compute sea level velocity
+# c = [7.06037, 1.74951, 4.86324, 6.60631, 4.84606, 2.14922, 0.58714, 0.096348, 0.00869209, 0.00033089]
+# log_diameter = np.log(diameter / 1000 * 10)
+# y = c[0] + sum(c * log_diameter**i for i, c in enumerate(c[1:], start=1))
+# v0 = np.exp(y)
+
+# # Compute fall velocity
+# t_20 = 273.15 + 20
+# eps_s = get_air_dynamic_viscosity(t_20) / get_air_dynamic_viscosity(temperature) - 1
+# eps_c = -1 + (
+# np.sqrt(
+# get_air_density(
+# temperature=t_20,
+# air_pressure=101325,
+# vapor_pressure=0,
+# gas_constant_dry_air=gas_constant_dry_air,
+# )
+# / get_air_density(
+# temperature=temperature,
+# air_pressure=air_pressure,
+# vapor_pressure=0,
+# gas_constant_dry_air=gas_constant_dry_air,
+# ),
+# )
+# )
+# a = 1.104 * eps_s
+# b = (1.058 * eps_c - 1.104 * eps_s) / 5.01
+# x = np.log(diameter) + 5.52
+# f = (a + b * x) + 1
+# fall_velocity = v0 * f
+# # fall_velocity.plot()
+
+# eps = 1.104 * eps_s + (1.058 * eps_c - 1.104 * eps_s) * np.log(diameter / 1e-3) / 5.01
+# # eps = 1.104 * eps_s + (1.058 * eps_c - 1.104 * eps_s) * np.log(diameter / 4e-5) / 5.01
+# fall_velocity = 0.01 * v0 * (1 + eps)
+# return fall_velocity
diff --git a/disdrodb/l1/encoding_attrs.py b/disdrodb/l1/encoding_attrs.py
new file mode 100644
index 00000000..35a4abab
--- /dev/null
+++ b/disdrodb/l1/encoding_attrs.py
@@ -0,0 +1,605 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""Attributes and encoding options for DISDRODB products."""
+
+
+def get_attrs_dict():
+ """Temporary attributes."""
+ attrs_dict = {
+ #### L1
+ "drop_number": {
+ "description": "Counts of drops per diameter and velocity class",
+ "long_name": "Drop counts per diameter and velocity class",
+ "units": "",
+ },
+ "drop_counts": {
+ "description": "Counts of drops per diameter class",
+ "long_name": "Drop counts per diameter class",
+ "units": "",
+ },
+ "Dmin": {
+ "description": "Minimum drop diameter",
+ "long_name": "Minimum drop diameter",
+ "units": "mm",
+ },
+ "Dmax": {
+ "description": "Maximum drop diameter",
+ "long_name": "Maximum drop diameter",
+ "units": "mm",
+ },
+ "fall_velocity": {
+ "description": "Estimated drop fall velocity per diameter class",
+ "long_name": "Estimated drop fall velocity",
+ "units": "m s-1",
+ },
+ "drop_average_velocity": {
+ "description": "Average measured drop fall velocity per diameter class",
+ "long_name": "Measured average drop fall velocity",
+ "units": "m s-1",
+ },
+ "n_drops_selected": {
+ "description": "Total number of selected drops",
+ "long_name": "Total number of selected drops",
+ "units": "",
+ },
+ "n_drops_discarded": {
+ "description": "Total number of discarded drops",
+ "long_name": "Total number of discarded drops",
+ "units": "",
+ },
+ #### L2
+ "drop_number_concentration": {
+ "description": "Number concentration of drops per diameter class per unit volume",
+ "long_name": "Drop number concentration per diameter class",
+ "units": "m-3 mm-1",
+ },
+ "drop_volume": {
+ "standard_name": "",
+ "units": "mm3",
+ "long_name": "Volume of Drops per Diameter Class",
+ },
+ "drop_total_volume": {
+ "standard_name": "",
+ "units": "mm3",
+ "long_name": "Total Volume of Drops",
+ },
+ "drop_relative_volume_ratio": {
+ "standard_name": "",
+ "units": "",
+ "long_name": "Relative Volume Ratio of Drops",
+ },
+ "KEmin": {
+ "standard_name": "",
+ "units": "J",
+ "long_name": "Minimum Drop Kinetic Energy",
+ },
+ "KEmax": {
+ "standard_name": "",
+ "units": "J",
+ "long_name": "Maximum Drop Kinetic Energy",
+ },
+ "E": {
+ "description": "Kinetic energy per unit rainfall depth",
+ "standard_name": "",
+ "units": "J m-2 mm-1",
+ "long_name": "Rainfall Kinetic Energy",
+ },
+ "KE": {
+ "standard_name": "",
+ "units": "J m-2 h-1",
+ "long_name": "Kinetic Energy Density Flux",
+ },
+ "M1": {
+ "standard_name": "",
+ "units": "m-3 mm",
+ "long_name": "First Moment of the Drop Size Distribution",
+ },
+ "M2": {
+ "standard_name": "",
+ "units": "m-3 mm2",
+ "long_name": "Second Moment of the Drop Size Distribution",
+ },
+ "M3": {
+ "standard_name": "",
+ "units": "m-3 mm3",
+ "long_name": "Third Moment of the Drop Size Distribution",
+ },
+ "M4": {
+ "standard_name": "",
+ "units": "m-3 mm4",
+ "long_name": "Fourth Moment of the Drop Size Distribution",
+ },
+ "M5": {
+ "standard_name": "",
+ "units": "m-3 mm5",
+ "long_name": "Fifth Moment of the Drop Size Distribution",
+ },
+ "M6": {
+ "standard_name": "",
+ "units": "m-3 mm6",
+ "long_name": "Sixth Moment of the Drop Size Distribution",
+ },
+ "Nt": {
+ "standard_name": "number_concentration_of_rain_drops_in_air",
+ "units": "m-3",
+ "long_name": "Total Number Concentration",
+ },
+ "R": {
+ "standard_name": "rainfall_rate",
+ "units": "mm h-1",
+ "long_name": "Instantaneous Rainfall Rate",
+ },
+ "P": {
+ "standard_name": "precipitation_amount",
+ "units": "mm",
+ "long_name": "Rain Accumulation",
+ },
+ "Z": {
+ "standard_name": "equivalent_reflectivity_factor",
+ "units": "dBZ",
+ "long_name": "Equivalent Radar Reflectivity Factor",
+ },
+ "W": {
+ "description": "Water Mass of the Drop Size Distribution",
+ "standard_name": "mass_concentration_of_liquid_water_in_air",
+ "units": "g m-3",
+ "long_name": "Liquid Water Content",
+ },
+ "D10": {
+ "standard_name": "",
+ "units": "mm",
+ "long_name": "10th Percentile Drop Diameter",
+ },
+ "D50": {
+ "standard_name": "median_volume_diameter",
+ "units": "mm",
+ "long_name": "Median Volume Drop Diameter",
+ },
+ "D90": {
+ "standard_name": "",
+ "units": "mm",
+ "long_name": "90th Percentile Drop Diameter",
+ },
+ "Dmode": {
+ "standard_name": "",
+ "units": "mm",
+ "long_name": "Mode Diameter of the Drop Size Distribution",
+ },
+ "Dm": {
+ "standard_name": "Dm",
+ "units": "mm",
+ "long_name": "Mean Volume Diameter",
+ },
+ "sigma_m": {
+ "standard_name": "",
+ "units": "mm",
+ "long_name": "Standard Deviation of Mass Spectrum",
+ },
+ "Nw": {
+ "standard_name": "normalized_intercept_parameter",
+ "units": "mm-1 m-3", # TODO
+ "long_name": "Normalized Intercept Parameter of a Normalized Gamma Distribution",
+ },
+ "N0": {
+ "standard_name": "intercept_parameter",
+ "units": "mm-1 m-3", # TODO
+ "long_name": "Intercept Parameter of the Modeled Drop Size Distribution",
+ },
+ "mu": {
+ "standard_name": "shape_parameter",
+ "units": "1", # TODO
+ "long_name": "Shape Parameter of the Modeled Drop Size Distribution",
+ },
+ "Lambda": {
+ "standard_name": "distribution_slope",
+ "units": "1/mm", # TODO
+ "long_name": "Slope Parameter of the Modeled Drop Size Distribution",
+ },
+ "sigma": {
+ "standard_name": "distribution_slope",
+ "units": "1/mm", # TODO
+ "long_name": "Slope Parameter of the Modeled Lognormal Distribution",
+ },
+ # Radar variables
+ "Zh": {
+ "description": "Radar reflectivity factor at horizontal polarization",
+ "long_name": "Horizontal Reflectivity",
+ "units": "dBZ",
+ },
+ "Zdr": {
+ "description": "Differential reflectivity",
+ "long_name": "Differential Reflectivity",
+ "units": "dB",
+ },
+ "rho_hv": {
+ "description": "Correlation coefficient between horizontally and vertically polarized reflectivity",
+ "long_name": "Copolarized Correlation Coefficient",
+ "units": "",
+ },
+ "ldr": {
+ "description": "Linear depolarization ratio",
+ "long_name": "Linear Depolarization Ratio",
+ "units": "dB",
+ },
+ "Kdp": {
+ "description": "Specific differential phase",
+ "long_name": "Specific Differential Phase",
+ "units": "deg/km",
+ },
+ "Ai": {
+ "description": "Specific attenuation",
+ "long_name": "Specific attenuation",
+ "units": "dB/km",
+ },
+ }
+ return attrs_dict
+
+
+def get_encoding_dict():
+ """Temporary encoding dictionary."""
+ encoding_dict = {
+ "M1": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "M2": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "M3": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "M4": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "M5": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "M6": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "Nt": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "R": {
+ "dtype": "uint16",
+ "scale_factor": 0.01,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "P": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "Z": {
+ "dtype": "uint16",
+ "scale_factor": 0.01,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "W": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "Dm": {
+ "dtype": "uint16",
+ "scale_factor": 0.001,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "sigma_m": {
+ "dtype": "uint16",
+ "scale_factor": 0.001,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "Dmode": {
+ "dtype": "uint16",
+ "scale_factor": 0.001,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "Nw": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "D50": {
+ "dtype": "uint16",
+ "scale_factor": 0.001,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "D10": {
+ "dtype": "uint16",
+ "scale_factor": 0.001,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "D90": {
+ "dtype": "uint16",
+ "scale_factor": 0.001,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "drop_number": {
+ "dtype": "uint32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ "_FillValue": 4294967295,
+ },
+ "drop_counts": {
+ "dtype": "uint32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ "_FillValue": 4294967295,
+ },
+ "n_drops_selected": {
+ "dtype": "uint32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ "_FillValue": 4294967295,
+ },
+ "n_drops_discarded": {
+ "dtype": "uint32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ "_FillValue": 4294967295,
+ },
+ "Dmin": {
+ "dtype": "uint16",
+ "scale_factor": 0.001,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "Dmax": {
+ "dtype": "uint16",
+ "scale_factor": 0.001,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "drop_average_velocity": {
+ "dtype": "uint16",
+ "scale_factor": 0.001,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "fall_velocity": {
+ "dtype": "uint16",
+ "scale_factor": 0.001,
+ "_FillValue": 65535,
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "drop_number_concentration": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "drop_volume": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "drop_total_volume": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "drop_relative_volume_ratio": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "KEmin": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "KEmax": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "E": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "KE": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ # Radar variables
+ "Zh": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "Zdr": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "rho_hv": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "ldr": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "Kdp": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ "Ai": {
+ "dtype": "float32",
+ "zlib": True,
+ "complevel": 3,
+ "shuffle": True,
+ "fletcher32": False,
+ "contiguous": False,
+ },
+ }
+ return encoding_dict
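+
+
+ # Illustrative sketch (not part of the original changeset): how an encoding dictionary
+ # like the one returned above is typically applied when writing an xarray Dataset.
+ # The dataset `ds` and the output path are hypothetical; only keys for variables that
+ # actually exist in the dataset should be passed to `to_netcdf`.
+ #
+ # encoding_dict = get_encoding_dict()
+ # encoding = {var: encoding_dict[var] for var in ds.data_vars if var in encoding_dict}
+ # ds.to_netcdf("example_l1.nc", encoding=encoding)
+ #
+ # For variables packed as "uint16" with a "scale_factor", netCDF stores
+ # round(value / scale_factor) and reserves 65535 as the _FillValue sentinel.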
diff --git a/disdrodb/l1/fall_velocity.py b/disdrodb/l1/fall_velocity.py
new file mode 100644
index 00000000..6e7d8dc4
--- /dev/null
+++ b/disdrodb/l1/fall_velocity.py
@@ -0,0 +1,260 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Theoretical models to estimate the drop fall velocity."""
+
+
+import numpy as np
+
+
+def get_fall_velocity_atlas_1973(diameter):
+ """
+ Compute the fall velocity of raindrops using the Atlas et al. (1973) relationship.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+
+ Returns
+ -------
+ fall_velocity : array-like
+ Fall velocities corresponding to the input diameters, in meters per second.
+
+ References
+ ----------
+ Atlas, D., Srivastava, R. C., & Sekhon, R. S. (1973).
+ Doppler radar characteristics of precipitation at vertical incidence.
+ Reviews of Geophysics, 11(1), 1-35.
+ https://doi.org/10.1029/RG011i001p00001
+
+ Atlas, D., & Ulbrich, C. W. (1977).
+ Path- and area-integrated rainfall measurement by microwave attenuation in the 1-3 cm band.
+ Journal of Applied Meteorology, 16(12), 1322-1331.
+ https://doi.org/10.1175/1520-0450(1977)016<1322:PAAIRM>2.0.CO;2
+
+ Gunn, R., & Kinzer, G. D. (1949).
+ The terminal velocity of fall for water droplets in stagnant air.
+ Journal of Meteorology, 6(4), 243-248.
+ https://doi.org/10.1175/1520-0469(1949)006<0243:TTVOFF>2.0.CO;2
+
+ """
+ fall_velocity = 9.65 - 10.3 * np.exp(-0.6 * diameter) # clip to 0 !
+ fall_velocity = np.clip(fall_velocity, 0, None)
+ return fall_velocity
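+
+ # Worked example (illustrative, approximate values): for diameters of 0.5, 1.0 and 2.0 mm,
+ # the Atlas et al. (1973) relation gives roughly 2.0, 4.0 and 6.5 m/s respectively:
+ # get_fall_velocity_atlas_1973(np.array([0.5, 1.0, 2.0]))  # ~array([2.02, 4.00, 6.55])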
+
+
+def get_fall_velocity_brandes_2002(diameter):
+ """
+ Compute the fall velocity of raindrops using the Brandes et al. (2002) relationship.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+
+ Returns
+ -------
+ fall_velocity : array-like
+ Fall velocities in meters per second.
+
+ References
+ ----------
+ Brandes, E. A., Zhang, G., & Vivekanandan, J. (2002).
+ Experiments in rainfall estimation with a polarimetric radar in a subtropical environment.
+ Journal of Applied Meteorology, 41(6), 674-685.
+ https://doi.org/10.1175/1520-0450(2002)041<0674:EIREWA>2.0.CO;2
+
+ """
+ fall_velocity = -0.1021 + 4.932 * diameter - 0.9551 * diameter**2 + 0.07934 * diameter**3 - 0.002362 * diameter**4
+ return fall_velocity
+
+
+def get_fall_velocity_uplinger_1981(diameter):
+ """
+ Compute the fall velocity of raindrops using Uplinger (1981) relationship.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+ Valid for diameters between 0.1 mm and 7 mm.
+
+ Returns
+ -------
+ fall_velocity : array-like
+ Fall velocities in meters per second.
+
+ References
+ ----------
+ Uplinger, C. W. (1981). A new formula for raindrop terminal velocity.
+ In Proceedings of the 20th Conference on Radar Meteorology (pp. 389-391).
+ AMS.
+
+ """
+ # Valid between 0.1 and 7 mm
+ fall_velocity = 4.874 * diameter * np.exp(-0.195 * diameter)
+ return fall_velocity
+
+
+def get_fall_velocity_van_dijk_2002(diameter):
+ """
+ Compute the fall velocity of raindrops using van Dijk et al. (2002) relationship.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+
+ Returns
+ -------
+ fall_velocity : array-like
+ Fall velocities in meters per second.
+
+ References
+ ----------
+ van Dijk, A. I. J. M., Bruijnzeel, L. A., & Rosewell, C. J. (2002).
+ Rainfall intensity-kinetic energy relationships: a critical literature appraisal.
+ Journal of Hydrology, 261(1-4), 1-23.
+ https://doi.org/10.1016/S0022-1694(02)00020-3
+
+ """
+ fall_velocity = -0.254 + 5.03 * diameter - 0.912 * diameter**2 + 0.0561 * diameter**3
+ return fall_velocity
+
+
+def get_fall_velocity_beard_1976(diameter, ds_env):
+ """Calculate the fall velocity of a particle using the Beard (1976) model.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+ ds_env : xr.Dataset
+ A dataset containing the following environmental variables:
+ - 'altitude' : Altitude in meters (m).
+ - 'latitude' : Latitude in degrees.
+ - 'temperature' : Air temperature in kelvins (K).
+ - 'relative_humidity' : Relative humidity as a fraction between 0 and 1.
+ - 'sea_level_air_pressure' : Sea level air pressure in Pascals (Pa).
+ - 'lapse_rate' : Lapse rate in degrees Celsius per meter (°C/m).
+
+ Returns
+ -------
+ fall_velocity : array-like
+ The calculated fall velocities of the raindrops.
+ """
+ from disdrodb.l1.beard_model import retrieve_fall_velocity
+
+ # Input diameter is expected in mm
+ fall_velocity = retrieve_fall_velocity(
+ diameter=diameter / 1000, # diameter expected in m !!!
+ altitude=ds_env["altitude"],
+ latitude=ds_env["latitude"],
+ temperature=ds_env["temperature"],
+ relative_humidity=ds_env["relative_humidity"],
+ # TODO: add air_pressure
+ sea_level_air_pressure=ds_env["sea_level_air_pressure"],
+ lapse_rate=ds_env["lapse_rate"],
+ )
+ return fall_velocity
+
+
+def ensure_valid_coordinates(ds, default_altitude=0, default_latitude=0, default_longitude=0):
+ """Ensure dataset valid coordinates for altitude, latitude, and longitude.
+
+ Invalid values are np.nan and -9999.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ The dataset for which to ensure valid geolocation coordinates.
+ default_altitude : float, optional
+ The default value to use for invalid altitude values. Defaults to 0.
+ default_latitude : float, optional
+ The default value to use for invalid latitude values. Defaults to 0.
+ default_longitude : float, optional
+ The default value to use for invalid longitude values. Defaults to 0.
+
+ Returns
+ -------
+ xarray.Dataset
+ The dataset with invalid coordinates replaced by default values.
+
+ """
+ invalid_altitude = np.logical_or(np.isnan(ds["altitude"]), ds["altitude"] == -9999)
+ ds["altitude"] = ds["altitude"].where(~invalid_altitude, default_altitude)
+
+ invalid_lat = np.logical_or(np.isnan(ds["latitude"]), ds["latitude"] == -9999)
+ ds["latitude"] = ds["latitude"].where(~invalid_lat, default_latitude)
+
+ invalid_lon = np.logical_or(np.isnan(ds["longitude"]), ds["longitude"] == -9999)
+ ds["longitude"] = ds["longitude"].where(~invalid_lon, default_longitude)
+ return ds
+
+
+def get_raindrop_fall_velocity(diameter, method, ds_env=None):
+ """Calculate the fall velocity of raindrops based on their diameter.
+
+ Parameters
+ ----------
+ diameter : array-like
+ The diameter of the raindrops in millimeters.
+ method : str
+ The method to use for calculating the fall velocity. Must be one of the following:
+ 'Atlas1973', 'Beard1976', 'Brandes2002', 'Uplinger1981', 'VanDijk2002'.
+ ds_env : xr.Dataset, optional
+ A dataset containing the following environmental variables:
+ - 'altitude' : Altitude in meters (m).
+ - 'latitude' : Latitude in degrees.
+ - 'temperature' : Air temperature in kelvins (K).
+ - 'relative_humidity' : Relative humidity as a fraction between 0 and 1.
+ - 'sea_level_air_pressure' : Sea level air pressure in Pascals (Pa).
+ - 'lapse_rate' : Lapse rate in degrees Celsius per meter (°C/m).
+ It is required for the 'Beard1976' method.
+
+ Returns
+ -------
+ fall_velocity : array-like
+ The calculated fall velocities of the raindrops.
+
+ Notes
+ -----
+ The 'Beard1976' method requires additional environmental parameters such as altitude and latitude.
+ These parameters are provided through the `ds_env` argument; invalid (NaN or -9999) altitude and latitude values are replaced by defaults.
+ """
+ # Input diameter in mm
+ dict_methods = {
+ "Atlas1973": get_fall_velocity_atlas_1973,
+ "Beard1976": get_fall_velocity_beard_1976,
+ "Brandes2002": get_fall_velocity_brandes_2002,
+ "Uplinger1981": get_fall_velocity_uplinger_1981,
+ "VanDijk2002": get_fall_velocity_van_dijk_2002,
+ }
+ # Check valid method
+ available_methods = list(dict_methods)
+ if method not in dict_methods:
+ raise ValueError(f"{method} is an invalid fall velocity method. Valid methods: {available_methods}.")
+ # Copy diameter
+ diameter = diameter.copy()
+ # Ensure valid altitude and geolocation (if missing, set defaults)
+ # - altitude is required by the Beard model
+ # - latitude is required to compute gravity
+ if ds_env is not None:
+ ds_env = ensure_valid_coordinates(ds_env)
+ # Retrieve fall velocity
+ func = dict_methods[method]
+ fall_velocity = func(diameter, ds_env=ds_env) if method == "Beard1976" else func(diameter)
+ return fall_velocity
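+
+
+ # Hedged usage sketch (not part of the original changeset): dispatching between methods.
+ # The diameter array below is hypothetical; `ds_env` is only needed for 'Beard1976' and is
+ # typically built with disdrodb.l1_env.routines.load_env_dataset.
+ #
+ # import xarray as xr
+ # diameters = xr.DataArray(np.array([0.5, 1.0, 2.0]), dims="diameter_bin_center")
+ # v = get_raindrop_fall_velocity(diameters, method="Atlas1973")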
diff --git a/disdrodb/l1/filters.py b/disdrodb/l1/filters.py
new file mode 100644
index 00000000..b72f0dfd
--- /dev/null
+++ b/disdrodb/l1/filters.py
@@ -0,0 +1,192 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Utilities for filtering the disdrometer raw drop spectra."""
+
+import numpy as np
+import xarray as xr
+
+
+def filter_diameter_bins(ds, minimum_diameter=None, maximum_diameter=None):
+ """
+ Filter the dataset to include only diameter bins within specified bounds.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ The dataset containing diameter bin data.
+ minimum_diameter : float, optional
+ The minimum diameter to include in the filter, in millimeters.
+ Defaults to the minimum value in `ds["diameter_bin_lower"]`.
+ maximum_diameter : float, optional
+ The maximum diameter to include in the filter, in millimeters.
+ Defaults to the maximum value in `ds["diameter_bin_upper"]`.
+
+ Returns
+ -------
+ xarray.Dataset
+ The filtered dataset containing only the specified diameter bins.
+ """
+ # Initialize default arguments
+ if minimum_diameter is None:
+ minimum_diameter = ds["diameter_bin_lower"].min().item()
+ if maximum_diameter is None:
+ maximum_diameter = ds["diameter_bin_upper"].max().item()
+ # Select valid bins
+ valid_indices = np.logical_and(
+ ds["diameter_bin_lower"] >= minimum_diameter,
+ ds["diameter_bin_upper"] <= maximum_diameter,
+ )
+ ds = ds.isel({"diameter_bin_center": valid_indices})
+ # Update history
+ history = ds.attrs.get("history", "")
+ ds.attrs["history"] = (
+ history + f" Selected drops with diameters between {minimum_diameter} and {maximum_diameter} mm \n"
+ )
+ return ds
+
+
+def filter_velocity_bins(ds, minimum_velocity=0, maximum_velocity=12):
+ """
+ Filter the dataset to include only velocity bins within specified bounds.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ The dataset containing velocity bin data.
+ minimum_velocity : float, optional
+ The minimum velocity to include in the filter, in meters per second.
+ Defaults to 0 m/s.
+ maximum_velocity : float, optional
+ The maximum velocity to include in the filter, in meters per second.
+ Defaults to 12 m/s.
+
+ Returns
+ -------
+ xarray.Dataset
+ The filtered dataset containing only the specified velocity bins.
+ """
+ # Initialize default arguments
+ if minimum_velocity is None:
+ minimum_velocity = ds["velocity_bin_lower"].min().item()
+ if maximum_velocity is None:
+ maximum_velocity = ds["velocity_bin_upper"].max().item()
+ # Select valid bins
+ valid_indices = np.logical_and(
+ ds["velocity_bin_lower"] >= minimum_velocity,
+ ds["velocity_bin_upper"] <= maximum_velocity,
+ )
+ ds = ds.isel({"velocity_bin_center": valid_indices})
+ # Update history
+ history = ds.attrs.get("history", "")
+ ds.attrs["history"] = (
+ history + f" Selected drops with fall velocity between {minimum_velocity} and {maximum_velocity} m/s \n"
+ )
+ return ds
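+
+
+ # Hedged usage sketch (not part of the original changeset): restricting the spectrum to
+ # plausible rain diameters and velocities before any further processing. `ds` is a
+ # hypothetical L0C dataset with diameter/velocity bin coordinates; the bounds mirror the
+ # defaults used by the L1 options.
+ #
+ # ds = filter_diameter_bins(ds, minimum_diameter=0.2495, maximum_diameter=8)
+ # ds = filter_velocity_bins(ds, minimum_velocity=0, maximum_velocity=12)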
+
+
+def define_spectrum_mask(
+ drop_number,
+ fall_velocity,
+ above_velocity_fraction=None,
+ above_velocity_tolerance=None,
+ below_velocity_fraction=None,
+ below_velocity_tolerance=None,
+ small_diameter_threshold=1,
+ small_velocity_threshold=2.5,
+ maintain_smallest_drops=False,
+):
+ """Define a mask for the drop spectrum based on fall velocity thresholds.
+
+ Parameters
+ ----------
+ drop_number : xarray.DataArray
+ Array of drop counts per diameter and velocity bins.
+ fall_velocity : array-like
+ The expected terminal fall velocities for drops of given sizes.
+ above_velocity_fraction : float, optional
+ Fraction of terminal fall velocity above which drops are considered too fast.
+ Either specify ``above_velocity_fraction`` or ``above_velocity_tolerance``.
+ above_velocity_tolerance : float, optional
+ Absolute tolerance above which drops terminal fall velocities are considered too fast.
+ Either specify ``above_velocity_fraction`` or ``above_velocity_tolerance``.
+ below_velocity_fraction : float, optional
+ Fraction of terminal fall velocity below which drops are considered too slow.
+ Either specify ``below_velocity_fraction`` or ``below_velocity_tolerance``.
+ below_velocity_tolerance : float, optional
+ Absolute tolerance below which drops terminal fall velocities are considered too slow.
+ Either specify ``below_velocity_fraction`` or ``below_velocity_tolerance``.
+ maintain_smallest_drops : bool, optional
+ If True, ensures that the small drops in the spectrum are retained in the mask.
+ The smallest drops are characterized by ``small_diameter_threshold``
+ and ``small_velocity_threshold`` arguments.
+ Defaults to False.
+ small_diameter_threshold : float, optional
+ The diameter threshold used to identify the smallest drops to retain.
+ Defaults to 1 mm.
+ small_velocity_threshold : float, optional
+ The fall velocity threshold to use for keeping the smallest drops.
+ Defaults to 2.5 m/s.
+
+ Returns
+ -------
+ xarray.DataArray
+ A boolean mask array indicating valid bins according to the specified criteria.
+
+ """
+ # Ensure it creates a 2D mask if the fall_velocity does not vary over time
+ if "time" in drop_number.dims and "time" not in fall_velocity.dims:
+ drop_number = drop_number.isel(time=0)
+
+ # Check arguments
+ if above_velocity_fraction is not None and above_velocity_tolerance is not None:
+ raise ValueError("Either specify 'above_velocity_fraction' or 'above_velocity_tolerance'.")
+ if below_velocity_fraction is not None and below_velocity_tolerance is not None:
+ raise ValueError("Either specify 'below_velocity_fraction' or 'below_velocity_tolerance'.")
+
+ # Define above/below velocity thresholds
+ if above_velocity_fraction is not None:
+ above_fall_velocity = fall_velocity * (1 + above_velocity_fraction)
+ elif above_velocity_tolerance is not None:
+ above_fall_velocity = fall_velocity + above_velocity_tolerance
+ else:
+ above_fall_velocity = np.inf
+ if below_velocity_fraction is not None:
+ below_fall_velocity = fall_velocity * (1 - below_velocity_fraction)
+ elif below_velocity_tolerance is not None:
+ below_fall_velocity = fall_velocity - below_velocity_tolerance
+ else:
+ below_fall_velocity = 0
+
+ # Define velocity 2D array
+ velocity_lower = xr.ones_like(drop_number) * drop_number["velocity_bin_lower"]
+ velocity_upper = xr.ones_like(drop_number) * drop_number["velocity_bin_upper"]
+
+ # Define mask
+ mask = np.logical_and(
+ velocity_lower >= below_fall_velocity,
+ velocity_upper <= above_fall_velocity,
+ )
+
+ # Maintain the smallest drops
+ if maintain_smallest_drops:
+ mask_smallest = np.logical_and(
+ drop_number["diameter_bin_upper"] < small_diameter_threshold,
+ drop_number["velocity_bin_upper"] < small_velocity_threshold,
+ )
+ mask = np.logical_or(mask, mask_smallest)
+
+ return mask
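+
+
+ # Hedged usage sketch (not part of the original changeset): masking drops whose measured
+ # fall velocity deviates by more than ±50% from the expected terminal velocity, while
+ # keeping the smallest drops. `ds` and `fall_velocity` are hypothetical inputs with the
+ # dimensions documented above.
+ #
+ # mask = define_spectrum_mask(
+ #     drop_number=ds["raw_drop_number"],
+ #     fall_velocity=fall_velocity,
+ #     above_velocity_fraction=0.5,
+ #     below_velocity_fraction=0.5,
+ #     maintain_smallest_drops=True,
+ # )
+ # filtered = ds["raw_drop_number"].where(mask)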
diff --git a/disdrodb/l1/processing.py b/disdrodb/l1/processing.py
new file mode 100644
index 00000000..7783ad99
--- /dev/null
+++ b/disdrodb/l1/processing.py
@@ -0,0 +1,194 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Core functions for DISDRODB L1 production."""
+
+
+import xarray as xr
+
+from disdrodb.l1.encoding_attrs import get_attrs_dict, get_encoding_dict
+from disdrodb.l1.fall_velocity import get_raindrop_fall_velocity
+from disdrodb.l1.filters import define_spectrum_mask, filter_diameter_bins, filter_velocity_bins
+from disdrodb.l1.resampling import add_sample_interval
+from disdrodb.l1_env.routines import load_env_dataset
+from disdrodb.l2.empirical_dsd import get_drop_average_velocity, get_min_max_diameter # TODO: maybe move out of L2
+from disdrodb.utils.attrs import set_attrs
+from disdrodb.utils.encoding import set_encodings
+from disdrodb.utils.time import ensure_sample_interval_in_seconds, infer_sample_interval
+
+
+def generate_l1(
+ ds,
+ # Fall velocity option
+ fall_velocity_method="Beard1976",
+ # Diameter-Velocity Filtering Options
+ minimum_diameter=0,
+ maximum_diameter=10,
+ minimum_velocity=0,
+ maximum_velocity=12,
+ above_velocity_fraction=0.5,
+ above_velocity_tolerance=None,
+ below_velocity_fraction=0.5,
+ below_velocity_tolerance=None,
+ small_diameter_threshold=1,
+ small_velocity_threshold=2.5,
+ maintain_smallest_drops=True,
+):
+ """Generate the DISDRODB L1 dataset from the DISDRODB L0C dataset.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ DISDRODB L0C dataset.
+ fall_velocity_method : str, optional
+ Method to compute fall velocity.
+ The default method is ``"Beard1976"``.
+ minimum_diameter : float, optional
+ Minimum diameter for filtering. The default value is 0 mm.
+ maximum_diameter : float, optional
+ Maximum diameter for filtering. The default value is 10 mm.
+ minimum_velocity : float, optional
+ Minimum velocity for filtering. The default value is 0 m/s.
+ maximum_velocity : float, optional
+ Maximum velocity for filtering. The default value is 12 m/s.
+ above_velocity_fraction : float, optional
+ Fraction of drops above velocity threshold. The default value is 0.5.
+ above_velocity_tolerance : float or None, optional
+ Tolerance for above velocity filtering. The default is ``None``.
+ below_velocity_fraction : float, optional
+ Fraction of drops below velocity threshold. The default value is 0.5.
+ below_velocity_tolerance : float or None, optional
+ Tolerance for below velocity filtering. The default is ``None``.
+ small_diameter_threshold : float, optional
+ Threshold for small-diameter drops. The default value is 1 mm.
+ small_velocity_threshold : float, optional
+ Threshold for small-velocity drops. The default value is 2.5 m/s.
+ maintain_smallest_drops : bool, optional
+ Whether to maintain the smallest drops. The default is ``True``.
+
+ Returns
+ -------
+ xarray.Dataset
+ DISDRODB L1 dataset.
+ """
+ # The input is expected to be a DISDRODB L0C dataset
+
+ # Retrieve source attributes
+ attrs = ds.attrs.copy()
+
+ # Determine if the velocity dimension is available
+ has_velocity_dimension = "velocity_bin_center" in ds.dims
+
+ # Initialize L1 dataset
+ ds_l1 = xr.Dataset()
+
+ # Retrieve sample interval
+ # --> sample_interval is a coordinate of L0C products
+ if "sample_interval" in ds:
+ sample_interval = ensure_sample_interval_in_seconds(ds["sample_interval"].data)
+ else:
+ # This line is not called in the DISDRODB processing chain !
+ sample_interval = infer_sample_interval(ds, verbose=False)
+
+ # Re-add sample interval as coordinate (in seconds)
+ ds = add_sample_interval(ds, sample_interval=sample_interval)
+
+ # ---------------------------------------------------------------------------
+ # Retrieve ENV dataset or take defaults
+ # --> Used only for Beard fall velocity currently !
+ ds_env = load_env_dataset(ds)
+
+ # -------------------------------------------------------------------------------------------
+ # Filter dataset by diameter and velocity bins
+ # - Filter diameter bins
+ ds = filter_diameter_bins(ds=ds, minimum_diameter=minimum_diameter, maximum_diameter=maximum_diameter)
+ # - Filter velocity bins
+ if has_velocity_dimension:
+ ds = filter_velocity_bins(ds=ds, minimum_velocity=minimum_velocity, maximum_velocity=maximum_velocity)
+
+ # -------------------------------------------------------------------------------------------
+ # Compute fall velocity
+ fall_velocity = get_raindrop_fall_velocity(
+ diameter=ds["diameter_bin_center"], # in mm
+ method=fall_velocity_method,
+ ds_env=ds_env,
+ )
+
+ # Add fall velocity
+ ds_l1["fall_velocity"] = fall_velocity
+
+ # -------------------------------------------------------------------------------------------
+ # Define filtering mask according to fall velocity
+ if has_velocity_dimension:
+ mask = define_spectrum_mask(
+ drop_number=ds["raw_drop_number"],
+ fall_velocity=fall_velocity,
+ above_velocity_fraction=above_velocity_fraction,
+ above_velocity_tolerance=above_velocity_tolerance,
+ below_velocity_fraction=below_velocity_fraction,
+ below_velocity_tolerance=below_velocity_tolerance,
+ small_diameter_threshold=small_diameter_threshold,
+ small_velocity_threshold=small_velocity_threshold,
+ maintain_smallest_drops=maintain_smallest_drops,
+ )
+
+ # -------------------------------------------------------------------------------------------
+ # Retrieve drop number and drop_counts arrays
+ if has_velocity_dimension:
+ drop_number = ds["raw_drop_number"].where(mask) # 2D (diameter, velocity)
+ drop_counts = drop_number.sum(dim="velocity_bin_center") # 1D (diameter)
+
+ else:
+ drop_number = ds["raw_drop_number"] # 1D (diameter)
+ drop_counts = ds["raw_drop_number"] # 1D (diameter)
+
+ # Add drop number and drop_counts
+ ds_l1["drop_number"] = drop_number
+ ds_l1["drop_counts"] = drop_counts
+
+ # -------------------------------------------------------------------------------------------
+ # Compute and add the drop average velocity for optical disdrometers (e.g. OTT Parsivel or Thies LPM)
+ if has_velocity_dimension:
+ ds_l1["drop_average_velocity"] = get_drop_average_velocity(drop_number)
+
+ # -------------------------------------------------------------------------------------------
+ # Compute minimum and max drop diameter observed
+ min_drop_diameter, max_drop_diameter = get_min_max_diameter(drop_counts)
+
+ # Add drop statistics
+ ds_l1["Dmin"] = min_drop_diameter
+ ds_l1["Dmax"] = max_drop_diameter
+ ds_l1["n_drops_selected"] = drop_counts.sum(dim=["diameter_bin_center"])
+ # Discarded drops: total raw drops minus the drops retained after filtering
+ spectrum_dims = ["diameter_bin_center", "velocity_bin_center"] if has_velocity_dimension else ["diameter_bin_center"]
+ ds_l1["n_drops_discarded"] = ds["raw_drop_number"].sum(dim=spectrum_dims) - ds_l1["n_drops_selected"]
+
+ # -------------------------------------------------------------------------------------------
+ #### Add L0C coordinates that might have been lost
+ if "time_qc" in ds:
+ ds_l1 = ds_l1.assign_coords({"time_qc": ds["time_qc"]})
+
+ #### ----------------------------------------------------------------------------.
+ #### Add encodings and attributes
+ # Add variables attributes
+ attrs_dict = get_attrs_dict()
+ ds_l1 = set_attrs(ds_l1, attrs_dict=attrs_dict)
+
+ # Add variables encoding
+ encoding_dict = get_encoding_dict()
+ ds_l1 = set_encodings(ds_l1, encoding_dict=encoding_dict)
+
+ # Add global attributes
+ ds_l1.attrs = attrs
+ return ds_l1
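+
+
+ # Hedged usage sketch (not part of the original changeset): producing an L1 dataset from a
+ # hypothetical L0C file. In the DISDRODB chain this is done by disdrodb.l1.routines._generate_l1.
+ #
+ # import xarray as xr
+ # with xr.open_dataset("L0C_example.nc") as ds:
+ #     ds_l1 = generate_l1(ds[["raw_drop_number"]].load())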
diff --git a/disdrodb/l1/resampling.py b/disdrodb/l1/resampling.py
new file mode 100644
index 00000000..3cfcabbf
--- /dev/null
+++ b/disdrodb/l1/resampling.py
@@ -0,0 +1,236 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Utilities for temporal resampling."""
+
+
+import pandas as pd
+import xarray as xr
+
+from disdrodb.utils.time import regularize_dataset
+
+DEFAULT_ACCUMULATIONS = ["10s", "30s", "1min", "2min", "5min", "10min", "30min", "1hour"]
+
+
+def add_sample_interval(ds, sample_interval):
+ """Add a sample_interval coordinate to the dataset.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ The input dataset to which the sample_interval coordinate will be added.
+ sample_interval : int or float
+ The dataset sample interval in seconds.
+
+ Returns
+ -------
+ xarray.Dataset
+ The dataset with the added sample interval coordinate.
+
+ Notes
+ -----
+ The function adds a new coordinate named 'sample_interval' to the dataset and
+ updates the 'measurement_interval' attribute.
+ """
+ # Add sample_interval coordinate
+ ds["sample_interval"] = sample_interval
+ ds["sample_interval"].attrs["description"] = "Sample interval"
+ ds["sample_interval"].attrs["long_name"] = "Sample interval"
+ ds["sample_interval"].attrs["units"] = "seconds"
+ ds = ds.set_coords("sample_interval")
+ # Update measurement_interval attribute
+ ds.attrs = ds.attrs.copy()
+ ds.attrs["measurement_interval"] = int(sample_interval)
+ return ds
+
+
+def define_window_size(sample_interval, accumulation_interval):
+ """
+ Calculate the rolling window size based on sampling and accumulation intervals.
+
+ Parameters
+ ----------
+ sample_interval : int
+ The sampling interval in seconds.
+ accumulation_interval : int
+ The desired accumulation interval in seconds.
+
+ Returns
+ -------
+ int
+ The calculated window size as the number of sampling intervals required to cover the accumulation interval.
+
+ Raises
+ ------
+ ValueError
+ If the accumulation interval is not a multiple of the sampling interval.
+
+ Examples
+ --------
+ >>> define_window_size(60, 300)
+ 5
+
+ >>> define_window_size(120, 600)
+ 5
+ """
+ # Check compatibility
+ if accumulation_interval % sample_interval != 0:
+ raise ValueError("The accumulation interval must be a multiple of the sample interval.")
+
+ # Calculate the window size
+ window_size = accumulation_interval // sample_interval
+
+ return window_size
+
+
+def resample_dataset(ds, sample_interval, accumulation_interval, rolling=True):
+ """
+ Resample the dataset to a specified accumulation interval.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ The input dataset to be resampled.
+ sample_interval : int
+ The sample interval of the input dataset.
+ accumulation_interval : int
+ The interval in seconds over which to accumulate the data.
+ rolling : bool, optional
+ If True, apply a forward-looking rolling aggregation instead of block resampling. Default is True.
+ The output timesteps correspond to the start of the period over which
+ the aggregation has been performed.
+
+ Returns
+ -------
+ xarray.Dataset
+ The resampled dataset with updated attributes.
+
+ Notes
+ -----
+ - The function regularizes the dataset (infill possible missing timesteps)
+ before performing the resampling operation.
+ - Variables are categorized into those to be averaged, accumulated, minimized, and maximized.
+ - Custom processing for quality flags and handling of NaNs is defined.
+ - The function updates the dataset attributes and the sample_interval coordinate.
+
+ """
+ # Retrieve attributes
+ attrs = ds.attrs.copy()
+
+ # TODO: here infill NaN with zero if necessary before regularizing !
+
+ # Ensure regular dataset without missing timesteps
+ ds = regularize_dataset(ds, freq=f"{sample_interval}s")
+
+ # Initialize resample dataset
+ ds_resampled = xr.Dataset()
+
+ # Retrieve variables to average/sum
+ var_to_average = ["fall_velocity"]
+ var_to_cumulate = ["raw_drop_number", "drop_number", "drop_counts", "n_drops_selected", "n_drops_discarded"]
+ var_to_min = ["Dmin"]
+ var_to_max = ["Dmax"]
+
+ # Retrieve available variables
+ var_to_average = [var for var in var_to_average if var in ds]
+ var_to_cumulate = [var for var in var_to_cumulate if var in ds]
+ var_to_min = [var for var in var_to_min if var in ds]
+ var_to_max = [var for var in var_to_max if var in ds]
+
+ # TODO Define custom processing
+ # - quality_flag --> take worst
+ # - skipna if less than fraction (to not waste lot of data when aggregating over i.e. hours)
+
+ # Resample the dataset
+ # - Rolling currently does not allow direct rolling forward.
+ # - We currently use center=False, which makes the window look backward (right-aligned).
+ # - We then drop the first 'window_size - 1' timesteps (which are all-NaN) and shift the time coordinate backward.
+ # - https://github.com/pydata/xarray/issues/9773
+ # - https://github.com/pydata/xarray/issues/8958
+ if not rolling:
+ # Resample
+ if len(var_to_average) > 0:
+ ds_resampled.update(
+ ds[var_to_average].resample({"time": pd.Timedelta(seconds=accumulation_interval)}).mean(skipna=False),
+ )
+ if len(var_to_cumulate) > 0:
+ ds_resampled.update(
+ ds[var_to_cumulate].resample({"time": pd.Timedelta(seconds=accumulation_interval)}).sum(skipna=False),
+ )
+ if len(var_to_min) > 0:
+ ds_resampled.update(
+ ds[var_to_min].resample({"time": pd.Timedelta(seconds=accumulation_interval)}).min(skipna=False),
+ )
+ if len(var_to_max) > 0:
+ ds_resampled.update(
+ ds[var_to_max].resample({"time": pd.Timedelta(seconds=accumulation_interval)}).max(skipna=False),
+ )
+
+ else:
+ # Roll and Resample
+ window_size = define_window_size(sample_interval=sample_interval, accumulation_interval=accumulation_interval)
+ if len(var_to_average) > 0:
+ ds_resampled.update(ds[var_to_average].rolling({"time": window_size}, center=False).mean(skipna=False))
+ if len(var_to_cumulate) > 0:
+ ds_resampled.update(ds[var_to_cumulate].rolling({"time": window_size}, center=False).sum(skipna=False))
+
+ if len(var_to_min) > 0:
+ ds_resampled.update(ds[var_to_min].rolling({"time": window_size}, center=False).min(skipna=False))
+ if len(var_to_max) > 0:
+ ds_resampled.update(ds[var_to_max].rolling({"time": window_size}, center=False).max(skipna=False))
+ # Ensure time to correspond to the start time of the integration
+ ds_resampled = ds_resampled.isel(time=slice(window_size - 1, None)).assign_coords(
+ {"time": ds_resampled["time"].data[: -window_size + 1]},
+ )
+
+ # Add attributes
+ ds_resampled.attrs = attrs
+ if rolling:
+ ds_resampled.attrs["rolled"] = "True"
+ else:
+ ds_resampled.attrs["rolled"] = "False"
+
+ # Add accumulation_interval as new sample_interval coordinate
+ ds_resampled = add_sample_interval(ds_resampled, sample_interval=accumulation_interval)
+ return ds_resampled
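+
+
+ # Hedged usage sketch (not part of the original changeset): aggregating a 30 s dataset to
+ # 1 minute. With rolling=False the data are block-resampled; with rolling=True a forward
+ # rolling window of size accumulation_interval / sample_interval (here 2) is used.
+ #
+ # ds_1min = resample_dataset(ds, sample_interval=30, accumulation_interval=60, rolling=False)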
+
+
+def get_possible_accumulations(sample_interval, accumulations=None):
+ """
+ Get a list of valid accumulation intervals based on the sampling time.
+
+ Parameters
+ ----------
+ sample_interval : int
+ The sample interval in seconds.
+ accumulations : list of int or str, optional
+ List of desired accumulation intervals.
+ Integers are interpreted as seconds; strings are parsed with ``pandas.Timedelta``.
+
+ Returns
+ -------
+ list of int
+ Valid accumulation intervals in seconds.
+ """
+ # Select default accumulations
+ if accumulations is None:
+ accumulations = DEFAULT_ACCUMULATIONS
+
+ # Get accumulations in seconds
+ accumulations = [int(pd.Timedelta(acc).total_seconds()) if isinstance(acc, str) else acc for acc in accumulations]
+
+ # Filter candidate accumulations to include only those that are multiples of the sampling time
+ possible_accumulations = [acc for acc in accumulations if acc % sample_interval == 0]
+
+ return possible_accumulations
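+
+
+ # Worked example (illustrative): with a 30 s sample interval and the default accumulations
+ # ["10s", "30s", "1min", "2min", "5min", "10min", "30min", "1hour"], only the multiples of
+ # 30 s are kept:
+ # get_possible_accumulations(30)  # -> [30, 60, 120, 300, 600, 1800, 3600]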
diff --git a/disdrodb/l1/routines.py b/disdrodb/l1/routines.py
new file mode 100644
index 00000000..96477aa5
--- /dev/null
+++ b/disdrodb/l1/routines.py
@@ -0,0 +1,345 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Implement DISDRODB L1 processing."""
+
+import datetime
+import logging
+import os
+import time
+from typing import Optional
+
+import dask
+import xarray as xr
+
+# Directory
+from disdrodb.api.create_directories import (
+ create_logs_directory,
+ create_product_directory,
+)
+from disdrodb.api.io import get_filepaths, get_required_product
+from disdrodb.api.path import (
+ define_l1_filename,
+)
+from disdrodb.configs import get_base_dir
+from disdrodb.l1.processing import generate_l1
+from disdrodb.utils.decorator import delayed_if_parallel, single_threaded_if_parallel
+
+# Logger
+from disdrodb.utils.logger import (
+ close_logger,
+ create_logger_file,
+ create_product_logs,
+ log_error,
+ log_info,
+)
+from disdrodb.utils.writer import write_product
+
+logger = logging.getLogger(__name__)
+
+
+def get_l1_options():
+ """Get L1 options."""
+ # - TODO: from YAML
+ # - TODO: as function of sensor name
+
+ # minimum_diameter
+ # --> OTT_Parsivel: 0.2495
+ # --> RD80: 0.313
+ # --> LPM: 0.125 (we currently discard first bin with this setting)
+
+ # maximum_diameter
+ # LPM: 8 mm
+ # RD80: 5.6 mm
+ # OTT: 26 mm
+
+ l1_options = {
+ # Fall velocity option
+ "fall_velocity_method": "Beard1976",
+ # Diameter-Velocity Filtering Options
+ "minimum_diameter": 0.2495, # OTT Parsivel first two bin no data !
+ "maximum_diameter": 8,
+ "minimum_velocity": 0,
+ "maximum_velocity": 12,
+ "above_velocity_fraction": 0.5,
+ "above_velocity_tolerance": None,
+ "below_velocity_fraction": 0.5,
+ "below_velocity_tolerance": None,
+ "small_diameter_threshold": 1, # 2
+ "small_velocity_threshold": 2.5, # 3
+ "maintain_smallest_drops": True,
+ }
+ return l1_options
+
+
+@delayed_if_parallel
+@single_threaded_if_parallel
+def _generate_l1(
+ filepath,
+ data_dir,
+ logs_dir,
+ campaign_name,
+ station_name,
+ # Processing options
+ force,
+ verbose,
+ parallel, # this is used only to initialize the correct logger !
+):
+ """Generate the L1 product from the DISRODB L0C netCDF file.
+
+ Parameters
+ ----------
+ filepath : str
+ Path to the L0C netCDF file.
+ data_dir : str
+ Directory where the L1 netCDF file will be saved.
+ logs_dir : str
+ Directory where the log file will be saved.
+ campaign_name : str
+ Name of the campaign.
+ station_name : str
+ Name of the station.
+ force : bool
+ If True, overwrite existing files.
+ verbose : bool
+ Whether to print detailed processing information.
+
+ Returns
+ -------
+ str
+ Path to the log file generated during processing.
+
+ Notes
+ -----
+ If an error occurs during processing, it is caught and logged,
+ but no error is raised to interrupt the execution.
+ """
+ # -----------------------------------------------------------------.
+ # Define product name
+ product = "L1"
+
+ # -----------------------------------------------------------------.
+ # Create file logger
+ filename = os.path.basename(filepath)
+ logger, logger_filepath = create_logger_file(
+ logs_dir=logs_dir,
+ filename=filename,
+ parallel=parallel,
+ )
+
+ ##------------------------------------------------------------------------.
+ # Log start processing
+ msg = f"{product} processing of {filename} has started."
+ log_info(logger, msg, verbose=verbose)
+
+ ##------------------------------------------------------------------------.
+ # Retrieve L1 configurations
+ l1_options = get_l1_options()
+
+ ##------------------------------------------------------------------------.
+ ### Core computation
+ try:
+ # Open the raw netCDF
+ with xr.open_dataset(filepath, chunks={}, cache=False) as ds:
+ ds = ds[["raw_drop_number"]].load()
+
+ # Produce L1 dataset
+ ds = generate_l1(ds=ds, **l1_options)
+
+ # Write L1 netCDF4 dataset
+ if ds["time"].size > 1:
+ # Define filepath
+ filename = define_l1_filename(ds, campaign_name=campaign_name, station_name=station_name)
+ filepath = os.path.join(data_dir, filename)
+ # Write to disk
+ write_product(ds, product=product, filepath=filepath, force=force)
+
+ ##--------------------------------------------------------------------.
+ # Clean environment
+ del ds
+
+ # Log end processing
+ msg = f"{product} processing of {filename} has ended."
+ log_info(logger, msg, verbose=verbose)
+
+ ##--------------------------------------------------------------------.
+ # Otherwise log the error
+ except Exception as e:
+ error_type = str(type(e).__name__)
+ msg = f"{error_type}: {e}"
+ log_error(logger, msg, verbose=verbose)
+
+ # Close the file logger
+ close_logger(logger)
+
+ # Return the logger file path
+ return logger_filepath
+
+
+def run_l1_station(
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ force: bool = False,
+ verbose: bool = True,
+ parallel: bool = True,
+ debugging_mode: bool = False,
+ base_dir: Optional[str] = None,
+):
+ """
+ Run the L1 processing of a specific DISDRODB station when invoked from the terminal.
+
+ The L1 routine filters the raw drop spectrum and computes basic statistics.
+ It expects as input L0C files, where each file has a single sample interval.
+
+ This function is intended to be called through the ``disdrodb_run_l1_station``
+ command-line interface.
+
+ Parameters
+ ----------
+ data_source : str
+ The name of the institution (for campaigns spanning multiple countries) or
+ the name of the country (for campaigns or sensor networks within a single country).
+ Must be provided in UPPER CASE.
+ campaign_name : str
+ The name of the campaign. Must be provided in UPPER CASE.
+ station_name : str
+ The name of the station.
+ force : bool, optional
+ If ``True``, existing data in the destination directories will be overwritten.
+ If ``False`` (default), an error will be raised if data already exists in the destination directories.
+ verbose : bool, optional
+ If ``True`` (default), detailed processing information will be printed to the terminal.
+ If ``False``, less information will be displayed.
+ parallel : bool, optional
+ If ``True``, files will be processed in multiple processes simultaneously,
+ with each process using a single thread to avoid issues with the HDF/netCDF library.
+ If ``False`` (default), files will be processed sequentially in a single process,
+ and multi-threading will be automatically exploited to speed up I/O tasks.
+ debugging_mode : bool, optional
+ If ``True``, the amount of data processed will be reduced.
+ Only the first 3 files will be processed. By default, ``False``.
+ base_dir : str, optional
+ The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
+ If not specified, the path specified in the DISDRODB active configuration will be used.
+
+ """
+ # Define product
+ product = "L1"
+
+ # Define base directory
+ base_dir = get_base_dir(base_dir)
+
+ # Define logs directory
+ logs_dir = create_logs_directory(
+ product=product,
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+
+ # ------------------------------------------------------------------------.
+ # Start processing
+ if verbose:
+ t_i = time.time()
+ msg = f"{product} processing of station {station_name} has started."
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+ # ------------------------------------------------------------------------.
+ # Create directory structure
+ data_dir = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product=product,
+ force=force,
+ )
+
+ # -------------------------------------------------------------------------.
+ # List files to process
+ required_product = get_required_product(product)
+ flag_not_available_data = False
+ try:
+ filepaths = get_filepaths(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product=required_product,
+ # Processing options
+ debugging_mode=debugging_mode,
+ )
+ except Exception as e:
+ print(str(e)) # Case where no file paths available
+ flag_not_available_data = True
+
+ # -------------------------------------------------------------------------.
+ # If no data available, print error message and return None
+ if flag_not_available_data:
+ msg = (
+ f"{product} processing of {data_source} {campaign_name} {station_name}"
+ + f"has not been launched because of missing {required_product} data."
+ )
+ print(msg)
+ return
+
+ # -----------------------------------------------------------------.
+ # Generate L1 files
+ # - Loop over the L0 netCDF files and generate L1 files.
+ # - If parallel=True, it does that in parallel using dask.delayed
+ list_tasks = [
+ _generate_l1(
+ filepath=filepath,
+ data_dir=data_dir,
+ logs_dir=logs_dir,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ parallel=parallel,
+ )
+ for filepath in filepaths
+ ]
+ list_logs = dask.compute(*list_tasks) if parallel else list_tasks
+
+ # -----------------------------------------------------------------.
+ # Define L1 summary logs
+ create_product_logs(
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=base_dir,
+ # Logs list
+ list_logs=list_logs,
+ )
+
+ # ---------------------------------------------------------------------.
+ # End L1 processing
+ if verbose:
+ timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i)))
+ msg = f"{product} processing of station {station_name} completed in {timedelta_str}"
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+
+####-------------------------------------------------------------------------------------------------------------------.
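+
+ # Hedged usage sketch (not part of the original changeset): running the L1 processing for a
+ # single station from Python. The data source, campaign and station names are hypothetical;
+ # the same processing is normally triggered via the `disdrodb_run_l1_station` command.
+ #
+ # run_l1_station(
+ #     data_source="DATA_SOURCE",
+ #     campaign_name="CAMPAIGN_NAME",
+ #     station_name="station_name_1",
+ #     parallel=False,
+ # )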
diff --git a/disdrodb/l1_env/__init__.py b/disdrodb/l1_env/__init__.py
new file mode 100644
index 00000000..b6330547
--- /dev/null
+++ b/disdrodb/l1_env/__init__.py
@@ -0,0 +1,17 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Core functions for DISDRODB L1 ENV production."""
diff --git a/disdrodb/l1_env/routines.py b/disdrodb/l1_env/routines.py
new file mode 100644
index 00000000..5acc40f1
--- /dev/null
+++ b/disdrodb/l1_env/routines.py
@@ -0,0 +1,38 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Core functions for DISDRODB ENV production."""
+
+import xarray as xr
+
+
+def get_default_environment_dataset():
+ """Define defaults values for the ENV dataset."""
+ ds_env = xr.Dataset()
+ ds_env["sea_level_air_pressure"] = 101_325
+ ds_env["gas_constant_dry_air"] = 287.04
+ ds_env["lapse_rate"] = 0.0065
+ ds_env["relative_humidity"] = 0.95 # Value between 0 and 1 !
+ ds_env["temperature"] = 20 + 273.15
+ return ds_env
+
+
+def load_env_dataset(ds):
+ """Load the ENV dataset."""
+ # TODO - Retrieve relative_humidity and temperature from L1-ENV
+ ds_env = get_default_environment_dataset()
+ ds_env = ds_env.assign_coords({"altitude": ds["altitude"], "latitude": ds["latitude"]})
+ return ds_env
diff --git a/disdrodb/l2/__init__.py b/disdrodb/l2/__init__.py
new file mode 100644
index 00000000..36d681b8
--- /dev/null
+++ b/disdrodb/l2/__init__.py
@@ -0,0 +1,17 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Module for DISDRODB L2 production."""
diff --git a/disdrodb/l2/empirical_dsd.py b/disdrodb/l2/empirical_dsd.py
new file mode 100644
index 00000000..49d9ef90
--- /dev/null
+++ b/disdrodb/l2/empirical_dsd.py
@@ -0,0 +1,1330 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Functions for computation of DSD parameters."""
+
+import numpy as np
+import xarray as xr
+
+
+def get_effective_sampling_area(sensor_name, diameter):
+ """Compute the effective sampling area of the disdrometer."""
+ if sensor_name in ["OTT_Parsivel", "OTT_Parsivel2"]:
+ # Calculate sampling area for each diameter bin (S_i)
+ L = 180 / 1000 # Length of the Parsivel beam in m (180 mm)
+ B = 30 / 1000 # Width of the Parsivel beam in m (30mm)
+ sampling_area = L * (B - diameter / 1000 / 2)
+ elif sensor_name in "Thies_LPM":
+ # TODO: provided as variable varying with time?
+ L = 228 / 1000 # Length of the Parsivel beam in m (228 mm)
+ B = 20 / 1000 # Width of the Parsivel beam in m (20 mm)
+ sampling_area = L * (B - diameter / 1000 / 2)
+ elif sensor_name in "RD80":
+ sampling_area = 1 # TODO
+ else:
+ raise NotImplementedError
+ return sampling_area
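+
+
+ # Worked example (illustrative): for an OTT Parsivel and a 1 mm drop, the effective
+ # sampling area is 0.180 m * (0.030 m - 0.0005 m) = 0.00531 m².
+ # get_effective_sampling_area("OTT_Parsivel", diameter=1.0)  # -> 0.00531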
+
+
+def _get_spectrum_dims(ds):
+ if "velocity_bin_center" in ds.dims:
+ dims = ["diameter_bin_center", "velocity_bin_center"]
+ else:
+ dims = ["diameter_bin_center"]
+ return dims
+
+
+def get_drop_volume(diameter):
+ """
+ Compute the volume of a droplet assuming it is spherical.
+
+ Parameters
+ ----------
+ diameter : float or array-like
+ The diameter of the droplet(s). Can be a scalar or an array of diameters.
+
+ Returns
+ -------
+ array-like
+ The volume of the droplet(s) calculated in cubic units based on the input diameter(s).
+
+ Notes
+ -----
+ The volume is calculated using the formula for the volume of a sphere:
+ V = (π/6) * d^3, where d is the diameter of the droplet.
+ """
+ return np.pi / 6 * diameter**3 # /6 = 4/3*(0.5**3)
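+
+ # Worked example (illustrative): a 2 mm drop has a volume of pi/6 * 2**3 ≈ 4.19 mm³.
+ # get_drop_volume(2.0)  # -> ~4.19 (in the cube of the input diameter units)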
+
+
+####-------------------------------------------------------------------------------------------------------------------.
+
+
+def get_drop_average_velocity(drop_number):
+ r"""
+ Calculate the drop average velocity \\( v_m(D) \\) per diameter class.
+
+ Parameters
+ ----------
+ drop_number : xarray.DataArray
+ Array of drop counts \\( n(D,v) \\) per diameter (and velocity, if available) bins
+ over the time integration period.
+
+ Returns
+ -------
+ average_velocity : xarray.DataArray
+ Array of drop average velocity \\( v_m(D) \\) in m·s⁻¹.
+ """
+ velocity = xr.ones_like(drop_number) * drop_number["velocity_bin_center"]
+ average_velocity = ((velocity * drop_number).sum(dim="velocity_bin_center")) / drop_number.sum(
+ dim="velocity_bin_center",
+ )
+ # average_velocity = average_velocity.where(average_velocity > 0, 0)
+ return average_velocity
+
+
+def get_drop_number_concentration(drop_number, velocity, diameter_bin_width, sampling_area, sample_interval):
+ r"""
+ Calculate the volumetric drop number concentration \\( N(D) \\) per diameter class.
+
+ Computes the drop number concentration \\( N(D) \\) [m⁻³·mm⁻¹] for each diameter
+ class based on the measured drop counts and sensor parameters. This represents
+ the number of drops per unit volume per unit diameter interval.
+ It is also referred to as the drop size distribution N(D), in number per cubic metre per millimetre [m⁻³ mm⁻¹].
+
+ Parameters
+ ----------
+ velocity : xarray.DataArray
+ Array of drop fall velocities \\( v(D) \\) corresponding to each diameter bin in meters per second (m/s).
+ diameter_bin_width : xarray.DataArray
+ Width of each diameter bin \\( \\Delta D \\) in millimeters (mm).
+ drop_number : xarray.DataArray
+ Array of drop counts \\( n(D,v) \\) per diameter (and velocity, if available)
+ bins over the time integration period.
+ sample_interval : float or xarray.DataArray
+ Time over which the drops are counted \\( \\Delta t \\) in seconds (s).
+ sampling_area : float or xarray.DataArray
+ The effective sampling area \\( A \\) of the sensor in square meters (m²).
+
+ Returns
+ -------
+ drop_number_concentration : xarray.DataArray or ndarray
+ Array of drop number concentrations \\( N(D) \\) in m⁻³·mm⁻¹, representing
+ the number of drops per unit volume per unit diameter interval.
+
+ Notes
+ -----
+ The drop number concentration \\( N(D) \\) is calculated using:
+
+ .. math::
+
+ N(D) = \frac{n(D)}{A_{\text{eff}}(D) \\cdot \\Delta D \\cdot \\Delta t \\cdot v(D)}
+
+ where:
+
+ - \\( n(D,v) \\): Number of drops counted in diameter (and velocity) bins.
+ - \\( A_{\text{eff}}(D) \\): Effective sampling area of the sensor for diameter \\( D \\) in square meters (m²).
+ - \\( \\Delta D \\): Diameter bin width in millimeters (mm).
+ - \\( \\Delta t \\): Time integration period in seconds (s).
+ - \\( v(D) \\): Fall velocity of drops in diameter bin \\( D \\) in meters per second (m/s).
+
+ The effective sampling area \\( A_{\text{eff}}(D) \\) depends on the sensor and may vary with drop diameter.
+ """
+ # Ensure velocity is 2D (diameter, velocity)
+ velocity = xr.ones_like(drop_number) * velocity
+
+ # Compute drop number concentration
+ # - For disdrometer with velocity bins
+ if "velocity_bin_center" in drop_number.dims:
+ drop_number_concentration = (drop_number / velocity).sum(dim=["velocity_bin_center"]) / (
+ sampling_area * diameter_bin_width * sample_interval
+ )
+ # - For impact disdrometers
+ else:
+ drop_number_concentration = drop_number / (sampling_area * diameter_bin_width * sample_interval * velocity)
+ return drop_number_concentration
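+
+
+ # Hedged usage sketch (not part of the original changeset): combining the pieces defined in
+ # this module. `ds` is a hypothetical L1 dataset; `sensor_name` is e.g. "OTT_Parsivel2".
+ #
+ # sampling_area = get_effective_sampling_area(sensor_name, ds["diameter_bin_center"])
+ # nd = get_drop_number_concentration(
+ #     drop_number=ds["drop_number"],
+ #     velocity=ds["fall_velocity"],
+ #     diameter_bin_width=ds["diameter_bin_width"],
+ #     sampling_area=sampling_area,
+ #     sample_interval=ds["sample_interval"],
+ # )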
+
+
+def get_total_number_concentration(drop_number_concentration, diameter_bin_width):
+ r"""
+ Compute the total number concentration \\( N_t \\) from the drop size distribution.
+
+ Calculates the total number concentration \\( N_t \\) [m⁻³] by integrating the
+ drop number concentration over all diameter bins.
+
+ Parameters
+ ----------
+ drop_number_concentration : xarray.DataArray
+ Array of drop number concentrations \\( N(D) \\) in m⁻³·mm⁻¹.
+ diameter_bin_width : xarray.DataArray
+ Width of each diameter bin \\( \\Delta D \\) in millimeters (mm).
+
+ Returns
+ -------
+ total_number_concentration : xarray.DataArray or ndarray
+ Total number concentration \\( N_t \\) in m⁻³, representing the total number
+ of drops per unit volume.
+
+ Notes
+ -----
+ The total number concentration \\( N_t \\) is calculated by integrating over the diameter bins:
+
+ .. math::
+
+ N_t = \\sum_{\text{bins}} N(D) \\cdot \\Delta D
+
+ where:
+
+ - \\( N(D) \\): Drop number concentration in each diameter bin [m⁻³·mm⁻¹].
+ - \\( \\Delta D \\): Diameter bin width in millimeters (mm).
+
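+ Examples
+ --------
+ A minimal sketch with synthetic, illustrative values:
+
+ >>> import xarray as xr
+ >>> n_d = xr.DataArray([100.0, 50.0, 10.0], dims="diameter_bin_center")
+ >>> dd = xr.DataArray([0.2, 0.2, 0.2], dims="diameter_bin_center")
+ >>> nt = get_total_number_concentration(n_d, dd)
+ >>> print(f"Nt: {float(nt):.1f} m⁻³")
+ Nt: 32.0 m⁻³
+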
+ """
+ total_number_concentration = (drop_number_concentration * diameter_bin_width).sum(dim="diameter_bin_center")
+ return total_number_concentration
+
+
+def get_moment(drop_number_concentration, diameter, diameter_bin_width, moment):
+ r"""
+ Calculate the m-th moment of the drop size distribution.
+
+ Computes the m-th moment of the drop size distribution (DSD), denoted as M_m,
+ where D is the drop diameter and m is the order of the moment. This is useful
+ in meteorology and hydrology for characterizing precipitation. For example,
+ weather radar measurements correspond to the sixth moment of the DSD (m = 6).
+
+ Parameters
+ ----------
+ drop_number_concentration : xarray.DataArray
+ The drop number concentration N(D) for each diameter bin,
+ typically in units of number per cubic meter per millimeter (m⁻³ mm⁻¹).
+ diameter : xarray.DataArray
+ The equivalent volume diameters D of the drops in each bin, in meters (m).
+ diameter_bin_width : xarray.DataArray
+ The width dD of each diameter bin, in millimeters (mm).
+ moment : int or float
+ The order m of the moment to compute.
+
+ Returns
+ -------
+ moment_value : xarray.DataArray
+ The computed m-th moment of the drop size distribution, typically in units
+ dependent on the input units, such as mmᵐ m⁻³.
+
+ Notes
+ -----
+ The m-th moment is calculated using the formula:
+
+ .. math::
+
+ M_m = \\sum_{\text{bins}} N(D) \\cdot D^m \\cdot dD
+
+ where:
+
+ - \\( M_m \\) is the m-th moment of the DSD.
+ - \\( N(D) \\) is the drop number concentration for diameter \\( D \\).
+ - \\( D^m \\) is the diameter raised to the power of \\( m \\).
+ - \\( dD \\) is the diameter bin width.
+
+ This computation integrates over the drop size distribution to provide a
+ scalar value representing the statistical moment of order m.
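+
+ Examples
+ --------
+ A usage sketch, assuming ``drop_number_concentration`` [m⁻³·mm⁻¹], ``diameter`` [m] and
+ ``diameter_bin_width`` [mm] are already available as aligned DataArrays:
+
+ >>> m3 = get_moment(drop_number_concentration, diameter, diameter_bin_width, moment=3)
+ >>> m6 = get_moment(drop_number_concentration, diameter, diameter_bin_width, moment=6) # radar reflectivity moment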
+ """
+ return ((diameter * 1000) ** moment * drop_number_concentration * diameter_bin_width).sum(dim="diameter_bin_center")
+
+
+####------------------------------------------------------------------------------------------------------------------
+#### Rain and Reflectivity
+
+
+def get_rain_rate(drop_counts, sampling_area, diameter, sample_interval):
+ r"""
+ Compute the rain rate \\( R \\) [mm/h] based on the drop size distribution and drop velocities.
+
+ This function calculates the rain rate by integrating over the drop size distribution (DSD),
+ considering the volume of water falling per unit time and area. It uses the number of drops
+ counted in each diameter class, the effective sampling area of the sensor, the diameters of the
+ drops, and the time interval over which the drops are counted.
+
+ Parameters
+ ----------
+ drop_counts : xarray.DataArray
+ Array representing the number of drops per diameter class \\( n(D) \\) in each bin.
+ sample_interval : float or xarray.DataArray
+ The time duration over which drops are counted \\( \\Delta t \\) in seconds (s).
+ sampling_area : float or xarray.DataArray
+ The effective sampling area \\( A \\) of the sensor in square meters (m²).
+ diameter : xarray.DataArray
+ Array of drop diameters \\( D \\) in meters (m).
+
+ Returns
+ -------
+ rain_rate : xarray.DataArray
+ The computed rain rate \\( R \\) in millimeters per hour (mm/h), which represents the volume
+ of water falling per unit area per unit time.
+
+ Notes
+ -----
+ The rain rate \\( R \\) is calculated using the following formula:
+
+ .. math::
+
+ R = \frac{\\pi}{6} \times \frac{3600 \times 10^{3}}{\\Delta t} \times
+ \\sum_{\text{bins}} \frac{n(D) \cdot D^3}{A(D)}
+
+ where:
+
+ - \\( n(D) \\) is the number of drops counted in each diameter class.
+ - \\( A(D) \\) is the effective sampling area in square meters (m²).
+ - \\( D \\) is the drop diameter in meters (m).
+ - \\( \\Delta t \\) is the time interval over which drops are counted, in seconds (s).
+
+ The factor \\( 3600 \times 10^{3} \\) converts the water depth flux from meters per second to millimeters per hour.
+ """
+ rain_rate = (
+ np.pi
+ / 6
+ / sample_interval
+ * (drop_counts / sampling_area * diameter**3).sum(dim="diameter_bin_center")
+ * 3600
+ * 1000
+ )
+
+ # Note: some articles and codes (e.g. PyDSD, Raupach 2015) use a 0.6 prefactor instead of 1/6,
+ # because the time and unit conversion factors can be folded into the constant:
+ # - 1/6 * 3600 = 600 = 0.6 * 1e3 = 6 * 1e2
+ # - 1/6 * 3600 * 1000 = 0.6 * 1e6 = 6 * 1e5 --> 6 * 1e-4 (if diameter is in mm)
+ # Equivalent formulations:
+ # rain_rate = np.pi * 0.6 * 1e3 / sample_interval * (
+ # (drop_counts * diameter**3 / sampling_area).sum(dim="diameter_bin_center") * 1000)
+ # rain_rate = np.pi / 6 / sample_interval * (
+ # (drop_counts * diameter**3 / sampling_area).sum(dim="diameter_bin_center") * 1000 * 3600)
+
+ return rain_rate
+
+
+def get_rain_rate_from_dsd(drop_number_concentration, velocity, diameter, diameter_bin_width):
+ r"""
+ Compute the rain rate \\( R \\) [mm/h] based on the drop size distribution and raindrop velocities.
+
+ Calculates the rain rate by integrating over the drop size distribution (DSD),
+ considering the volume of water falling per unit time and area.
+
+ Parameters
+ ----------
+ drop_number_concentration : xarray.DataArray
+ Array of drop number concentrations \\( N(D) \\) in m⁻³·mm⁻¹.
+ velocity : xarray.DataArray
+ Array of drop fall velocities \\( v(D) \\) corresponding to each diameter bin in meters per second (m/s).
+ diameter : xarray.DataArray
+ Array of drop diameters \\( D \\) in meters (m).
+ diameter_bin_width : xarray.DataArray
+ Width of each diameter bin \\( \\Delta D \\) in millimeters (mm).
+
+ Returns
+ -------
+ rain_rate : xarray.DataArray
+ The rain rate \\( R \\) in millimeters per hour (mm/h), representing the volume
+ of water falling per unit area per unit time.
+
+ Notes
+ -----
+ The rain rate \\( R \\) is calculated using:
+
+ .. math::
+
+ R = \frac{\\pi}{6} \times 3600 \times 10^{3} \times
+ \\sum_{\text{bins}} N(D) \\cdot v(D) \\cdot D^3 \\cdot \\Delta D
+
+ where:
+
+ - \\( N(D) \\): Drop number concentration [m⁻³·mm⁻¹].
+ - \\( v(D) \\): Fall velocity of drops in diameter bin \\( D \\) [m/s].
+ - \\( D \\): Drop diameter [m].
+ - \\( \\Delta D \\): Diameter bin width [mm].
+ - The factor \\( \frac{\\pi}{6} \\) converts the diameter cubed to the volume of a sphere.
+ - The factor \\( 3600 \\) converts from per second to per hour.
+ - The factor \\( 10^{3} \\) converts the water depth from meters to millimeters.
+
+ """
+ # The following formula assumes the diameter is expressed in meters (m)
+ rain_rate = (
+ np.pi
+ / 6
+ * (drop_number_concentration * velocity * diameter**3 * diameter_bin_width).sum(dim="diameter_bin_center")
+ * 3600
+ * 1000
+ )
+
+ # Alternative formulation (if diameter is provided in mm):
+ # 3600 * 1000 / 6 = 6e5
+ # 1e-9 converts mm³ to m³
+ # --> 6e5 * 1e-9 = 6 * 1e-4
+ # rain_rate = 6 * np.pi * 1e-4 * (
+ # (drop_number_concentration * velocity * diameter**3 * diameter_bin_width).sum(dim="diameter_bin_center")
+ # )
+ return rain_rate
+
+
+def get_rain_accumulation(rain_rate, sample_interval):
+ """
+ Calculate the total rain accumulation over a specified time period.
+
+ Parameters
+ ----------
+ rain_rate : float or array-like
+ The rain rate in millimeters per hour (mm/h).
+ sample_interval : int
+ The time over which to accumulate rain, specified in seconds.
+
+ Returns
+ -------
+ float or numpy.ndarray
+ The total rain accumulation in millimeters (mm) over the specified time period.
+
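+ Examples
+ --------
+ A short worked example with scalar inputs (xarray inputs behave the same way):
+
+ >>> get_rain_accumulation(rain_rate=12.0, sample_interval=600)
+ 2.0
+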
+ """
+ rain_accumulation = rain_rate / 3600 * sample_interval
+ return rain_accumulation
+
+
+def get_equivalent_reflectivity_factor(drop_number_concentration, diameter, diameter_bin_width):
+ r"""
+ Compute the equivalent reflectivity factor in decibels relative to 1 mm⁶·m⁻³ (dBZ).
+
+ The equivalent reflectivity (in mm⁶·m⁻³) is obtained from the sixth moment of the drop size distribution (DSD).
+ The reflectivity factor is expressed in decibels relative to 1 mm⁶·m⁻³ using the formula:
+
+ .. math::
+
+ Z = 10 \cdot \log_{10}(z)
+
+ where \\( z \\) is the reflectivity in linear units (mm⁶·m⁻³).
+
+ To convert back the reflectivity factor to linear units (mm⁶·m⁻³), use the formula:
+
+ .. math::
+
+ z = 10^{(Z/10)}
+
+ Parameters
+ ----------
+ drop_number_concentration : xarray.DataArray
+ Array of drop number concentrations per diameter class, in number per unit volume per unit diameter (m⁻³·mm⁻¹).
+ diameter : xarray.DataArray
+ Array of droplet diameters in meters (m).
+ diameter_bin_width : xarray.DataArray
+ Array representing the width of each diameter bin in millimeters (mm).
+
+ Returns
+ -------
+ xarray.DataArray
+ The equivalent reflectivity factor in decibels (dBZ).
+
+ Notes
+ -----
+ The function computes the sixth moment of the DSD using the formula:
+
+ .. math::
+
+ z = \\sum N(D) \cdot D^6 \cdot \\Delta D
+
+ where \\( N(D) \\) is the drop number concentration, \\( D \\) is the drop diameter in millimeters, and
+ \\( \\Delta D \\) is the diameter bin width.
+
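+ Examples
+ --------
+ Illustration of the dBZ conversion only (the value of ``z`` is purely illustrative):
+
+ >>> import numpy as np
+ >>> z = 200.0 # linear reflectivity in mm⁶·m⁻³
+ >>> print(f"{10 * np.log10(z):.2f} dBZ")
+ 23.01 dBZ
+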
+ """
+ # Compute reflectivity in mm⁶·m⁻³
+ z = ((diameter * 1000) ** 6 * drop_number_concentration * diameter_bin_width).sum(dim="diameter_bin_center")
+ valid_mask = z > 0
+ z = z.where(valid_mask)
+ # Compute equivalent reflectivity factor in dBZ
+ # - np.log10(0) returns -Inf and np.log10(np.nan) returns NaN
+ # --> We mask again after the log to keep only valid values
+ Z = 10 * np.log10(z)
+ Z = Z.where(valid_mask)
+ return Z
+
+
+####------------------------------------------------------------------------------------------------------------------
+#### Liquid Water Content / Mass Parameters
+
+
+def get_mass_spectrum(drop_number_concentration, diameter, water_density=1000):
+ """
+ Calculate the raindrop mass spectrum m(D) in g·m⁻³·mm⁻¹.
+
+ It represents the mass of liquid water as a function of raindrop diameter.
+
+ Parameters
+ ----------
+ drop_number_concentration : array-like
+ The drop number concentration (number of drops per unit volume per unit diameter) in each diameter bin.
+ diameter : array-like
+ The diameters of the drops for each bin, in meters (m).
+ water_density : float, optional
+ The density of water in kg/m³. The default is 1000 kg/m³.
+
+ Returns
+ -------
+ array-like
+ The calculated raindrop mass spectrum per diameter bin, in grams per cubic meter per millimeter (g·m⁻³·mm⁻¹).
+
+ """
+ # Convert water density from kg/m3 to g/m3
+ water_density = water_density * 1000
+
+ # Calculate the volume constant for the water droplet formula
+ vol_constant = np.pi / 6.0 * water_density
+
+ # Calculate the mass spectrum (lwc per diameter bin)
+ return vol_constant * (diameter**3 * drop_number_concentration) # [g/m3 mm-1]
+
+
+def get_liquid_water_content(drop_number_concentration, diameter, diameter_bin_width, water_density=1000):
+ """
+ Calculate the liquid water content based on drop number concentration and drop diameter.
+
+ Parameters
+ ----------
+ drop_number_concentration : array-like
+ The drop number concentration (number of drops per unit volume per unit diameter) in each diameter bin.
+ diameter : array-like
+ The diameters of the droplets for each bin, in meters (m).
+ diameter_bin_width : array-like
+ The width of each diameter bin, in millimeters (mm).
+ water_density : float, optional
+ The density of water in kg/m³. The default is 1000 kg/m³.
+
+ Returns
+ -------
+ array-like
+ The calculated liquid water content in grams per cubic meter (g/m3).
+
+ """
+ # Convert water density from kg/m3 to g/m3
+ water_density = water_density * 1000
+
+ # Calculate the volume constant for the water droplet formula
+ vol_constant = np.pi / 6.0 * water_density
+
+ # Calculate the liquid water content
+ lwc = vol_constant * (diameter**3 * drop_number_concentration * diameter_bin_width).sum(dim="diameter_bin_center")
+ return lwc
+
+
+def get_mom_liquid_water_content(moment_3, water_density=1000):
+ r"""
+ Calculate the liquid water content (LWC) from the third moment of the DSD.
+
+ LWC represents the mass of liquid water per unit volume of air.
+
+ Parameters
+ ----------
+ moment_3 : float or array-like
+ The third moment of the drop size distribution, \\( M_3 \\), in units of
+ [m⁻³·mm³] (number per cubic meter times diameter cubed).
+ water_density : float, optional
+ The density of water in kilograms per cubic meter (kg/m³).
+ Default is 1000 kg/m³ (approximate density of water at 20°C).
+
+ Returns
+ -------
+ lwc : float or array-like
+ The liquid water content in grams per cubic meter (g/m³).
+
+ Notes
+ -----
+ The liquid water content is calculated using the formula:
+
+ .. math::
+
+ \text{LWC} = \frac{\\pi \rho_w}{6} \\cdot M_3
+
+ where:
+
+ - \\( \text{LWC} \\) is the liquid water content [g/m³].
+ - \\( \rho_w \\) is the density of water [g/mm³].
+ - \\( M_3 \\) is the third moment of the DSD [m⁻³·mm³].
+
+ Examples
+ --------
+ Compute the liquid water content from the third moment:
+
+ >>> moment_3 = 1e6 # Example value in [m⁻³·mm³]
+ >>> lwc = get_mom_liquid_water_content(moment_3)
+ >>> print(f"LWC: {lwc:.4f} g/m³")
+ LWC: 523.5988 g/m³
+ """
+ # Convert water density from kg/m³ to g/mm³
+ water_density = water_density * 1e-6 # [kg/m³] * 1e-6 = [g/mm³]
+ # Calculate LWC [g/m3]
+ lwc = (np.pi * water_density / 6) * moment_3 # [g/mm³] * [m⁻³·mm³] = [g/m³]
+ return lwc
+
+
+####--------------------------------------------------------------------------------------------------------
+#### Diameter parameters
+
+
+def _get_last_xr_valid_idx(da_condition, dim, fill_value=None):
+ """
+ Get the index of the last True value along a specified dimension in an xarray DataArray.
+
+ This function finds the last index along the given dimension where the condition is True.
+ If all values are False or NaN along that dimension, the function returns ``fill_value``.
+
+ Parameters
+ ----------
+ da_condition : xarray.DataArray
+ A boolean DataArray where True indicates valid or desired values.
+ Should have the dimension specified in `dim`.
+ dim : str
+ The name of the dimension along which to find the last True index.
+ fill_value : int or float
+ The fill value when all values are False or NaN along the specified dimension.
+ The default is ``dim_size - 1``.
+
+ Returns
+ -------
+ last_idx : xarray.DataArray
+ An array containing the index of the last True value along the specified dimension.
+ If all values are False or NaN, the corresponding entry in `last_idx` is set to ``fill_value``.
+
+ Notes
+ -----
+ The function works by reversing the DataArray along the specified dimension and using
+ `argmax` to find the first True value in the reversed array. It then calculates the
+ corresponding index in the original array. To handle cases where all values are False
+ or NaN (and `argmax` would return 0), the function checks if there is any True value
+ along the dimension and assigns ``fill_value`` to `last_idx` where appropriate.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> import xarray as xr
+ >>> da = xr.DataArray([[False, False, True], [False, False, False]], dims=["time", "diameter_bin_center"])
+ >>> last_idx = _get_last_xr_valid_idx(da, "diameter_bin_center", fill_value=np.nan)
+ >>> print(last_idx)
+ <xarray.DataArray (time: 2)>
+ array([ 2., nan])
+ Dimensions without coordinates: time
+
+ In this example, for the first time step, the last True index is 2.
+ For the second time step, all values are False, so the function returns the ``fill_value`` (here NaN).
+
+ """
+ # Get the size of the specified dimension
+ dim_size = da_condition.sizes[dim]
+
+ # Define the default fill_value
+ if fill_value is None:
+ fill_value = dim_size - 1
+
+ # Reverse the mask along 'diameter_bin_center'
+ da_condition_reversed = da_condition.isel({dim: slice(None, None, -1)})
+
+ # Check if there is any True value along the dimension for each slice
+ has_true = da_condition.any(dim=dim)
+
+ # Find the first non-zero index in the reversed array
+ last_idx_from_end = da_condition_reversed.argmax(dim=dim)
+
+ # Calculate the last True index in the original array
+ last_idx = xr.where(
+ has_true,
+ dim_size - last_idx_from_end - 1,
+ fill_value,
+ )
+ return last_idx
+
+
+def get_min_max_diameter(drop_counts):
+ """
+ Get the minimum and maximum diameters where drop_counts is non-zero.
+
+ Parameters
+ ----------
+ drop_counts : xarray.DataArray
+ Drop counts with dimensions ("time", "diameter_bin_center") and
+ coordinate "diameter_bin_center".
+
+ Returns
+ -------
+ min_drop_diameter : xarray.DataArray
+ Minimum diameter where drop_counts is non-zero, for each time step.
+ max_drop_diameter : xarray.DataArray
+ Maximum diameter where drop_counts is non-zero, for each time step.
+ """
+ # Create a boolean mask where drop_counts is non-zero
+ non_zero_mask = drop_counts > 0
+
+ # Find the first non-zero index along 'diameter_bin_center' for each time
+ # - argmax returns 0 if all values are False, zero or NaN
+ first_non_zero_idx = non_zero_mask.argmax(dim="diameter_bin_center")
+
+ # Calculate the last non-zero index in the original array
+ last_non_zero_idx = _get_last_xr_valid_idx(da_condition=non_zero_mask, dim="diameter_bin_center")
+
+ # Get the 'diameter_bin_center' coordinate
+ diameters = drop_counts["diameter_bin_center"]
+
+ # Retrieve the diameters corresponding to the first and last non-zero indices
+ min_drop_diameter = diameters.isel(diameter_bin_center=first_non_zero_idx.astype(int))
+ max_drop_diameter = diameters.isel(diameter_bin_center=last_non_zero_idx.astype(int))
+
+ # Identify time steps where all drop_counts are zero
+ is_all_zero_or_nan = ~non_zero_mask.any(dim="diameter_bin_center")
+
+ # Mask with NaN where no drop or all values are NaN
+ min_drop_diameter = min_drop_diameter.where(~is_all_zero_or_nan)
+ max_drop_diameter = max_drop_diameter.where(~is_all_zero_or_nan)
+
+ return min_drop_diameter, max_drop_diameter
+
+
+def get_mode_diameter(drop_number_concentration):
+ """Get raindrop diameter with highest occurrence."""
+ diameter = drop_number_concentration["diameter_bin_center"]
+ # If all NaN, set to 0, otherwise argmax fails on all-NaN data
+ idx_all_nan_mask = np.isnan(drop_number_concentration).all(dim="diameter_bin_center")
+ drop_number_concentration = drop_number_concentration.where(~idx_all_nan_mask, 0)
+ # Find index where all 0
+ # --> argmax will return 0
+ idx_all_zero = (drop_number_concentration == 0).all(dim="diameter_bin_center")
+ # Find the diameter index corresponding to the "mode"
+ idx_observed_mode = drop_number_concentration.argmax(dim="diameter_bin_center")
+ # Find the diameter corresponding to the "mode"
+ diameter_mode = diameter.isel({"diameter_bin_center": idx_observed_mode})
+ diameter_mode = diameter_mode.drop(
+ ["diameter_bin_width", "diameter_bin_lower", "diameter_bin_upper", "diameter_bin_center"],
+ )
+ # Set to np.nan where data where all NaN or all 0
+ idx_mask = np.logical_or(idx_all_nan_mask, idx_all_zero)
+ diameter_mode = diameter_mode.where(~idx_mask)
+ return diameter_mode
+
+
+####-------------------------------------------------------------------------------------------------------------------.
+#### Mass diameters
+
+
+def get_mean_volume_drop_diameter(moment_3, moment_4):
+ r"""
+ Calculate the volume-weighted mean volume diameter \\( D_m \\) from DSD moments.
+
+ The mean volume diameter of a drop size distribution (DSD) is computed using
+ the third and fourth moments.
+
+ The volume-weighted mean volume diameter is also referred to as the mass-weighted mean diameter.
+ It represents the first moment of the mass spectrum.
+
+ Parameters
+ ----------
+ moment_3 : float or array-like
+ The third moment of the drop size distribution, \\( M_3 \\), in units of
+ [m⁻³·mm³].
+ moment_4 : float or array-like
+ The fourth moment of the drop size distribution, \\( M_4 \\), in units of
+ [m⁻³·mm⁴].
+
+ Returns
+ -------
+ D_m : float or array-like
+ The mean volume diameter in millimeters (mm).
+
+ Notes
+ -----
+ The mean volume diameter is calculated using the formula:
+
+ .. math::
+
+ D_m = \frac{M_4}{M_3}
+
+ where:
+
+ - \\( D_m \\) is the mean volume diameter [mm].
+ - \\( M_3 \\) is the third moment of the DSD [m⁻³·mm³].
+ - \\( M_4 \\) is the fourth moment of the DSD [m⁻³·mm⁴].
+
+ Examples
+ --------
+ Compute the mean volume diameter from the third and fourth moments:
+
+ >>> moment_3 = 1e6 # Example value in [m⁻³·mm³]
+ >>> moment_4 = 5e6 # Example value in [m⁻³·mm⁴]
+ >>> D_m = get_mean_volume_drop_diameter(moment_3, moment_4)
+ >>> print(f"Mean Volume Diameter D_m: {D_m:.4f} mm")
+ Mean Volume Diameter D_m: 5.0000 mm
+
+ """
+ D_m = moment_4 / moment_3 # Units: [mm⁴] / [mm³] = [mm]
+ return D_m
+
+
+def get_std_volume_drop_diameter(drop_number_concentration, diameter_bin_width, diameter, mean_volume_diameter):
+ r"""
+ Calculate the standard deviation of the mass-weighted drop diameter (σₘ).
+
+ This parameter is often also referred to as the mass spectrum standard deviation.
+ It quantifies the spread or variability of the DSD.
+
+ Parameters
+ ----------
+ drop_number_concentration : xarray.DataArray
+ The drop number concentration \\( N(D) \\) for each diameter bin, typically in units of
+ number per cubic meter per millimeter (m⁻³·mm⁻¹).
+ diameter : xarray.DataArray
+ The equivalent volume diameters \\( D \\) of the drops in each bin, in meters (m).
+ diameter_bin_width : xarray.DataArray
+ The width \\( \\Delta D \\) of each diameter bin, in millimeters (mm).
+ mean_volume_diameter : xarray.DataArray
+ The mean volume diameter \\( D_m \\), in millimeters (mm). This is typically computed using the
+ third and fourth moments or directly from the DSD.
+
+ Returns
+ -------
+ sigma_m : xarray.DataArray or float
+ The standard deviation of the mass-weighted drop diameter, \\( \\sigma_m \\),
+ in millimeters (mm).
+
+ Notes
+ -----
+ The standard deviation of the mass-weighted drop diameter is calculated using the formula:
+
+ .. math::
+
+ \\sigma_m = \\sqrt{\frac{\\sum [N(D) \\cdot (D - D_m)^2 \\cdot D^3
+ \\cdot \\Delta D]}{\\sum [N(D) \\cdot D^3 \\cdot \\Delta D]}}
+
+ where:
+
+ - \\( N(D) \\) is the drop number concentration for diameter \\( D \\) [m⁻³·mm⁻¹].
+ - \\( D \\) is the drop diameter [mm].
+ - \\( D_m \\) is the mean volume diameter [mm].
+ - \\( \\Delta D \\) is the diameter bin width [mm].
+ - The numerator computes the weighted variance of diameters.
+ - The weighting factor \\( D^3 \\) accounts for mass (since mass ∝ \\( D^3 \\)).
+
+ **Physical Interpretation:**
+
+ - A smaller \\( \\sigma_m \\) indicates that the mass is concentrated around the
+ mean mass-weighted diameter, implying less variability in drop sizes.
+ - A larger \\( \\sigma_m \\) suggests a wider spread of drop sizes contributing
+ to the mass, indicating greater variability.
+
+ References
+ ----------
+ - Smith, P. L., Johnson, R. W., & Kliche, D. V. (2019). On Use of the Standard
+ Deviation of the Mass Distribution as a Parameter in Raindrop Size Distribution
+ Functions. *Journal of Applied Meteorology and Climatology*, 58(4), 787-796.
+ https://doi.org/10.1175/JAMC-D-18-0086.1
+ - Williams, C. R., and Coauthors, 2014: Describing the Shape of Raindrop Size Distributions Using Uncorrelated
+ Raindrop Mass Spectrum Parameters. J. Appl. Meteor. Climatol., 53, 1282-1296, https://doi.org/10.1175/JAMC-D-13-076.1.
+ """
+ const = drop_number_concentration * diameter_bin_width * diameter**3
+ numerator = ((diameter * 1000 - mean_volume_diameter) ** 2 * const).sum(dim="diameter_bin_center")
+ sigma_m = np.sqrt(numerator / const.sum(dim="diameter_bin_center"))
+ return sigma_m
+
+
+def get_median_volume_drop_diameter(drop_number_concentration, diameter, diameter_bin_width, water_density=1000):
+ r"""
+ Compute the median volume drop diameter (D50).
+
+ The median volume drop diameter (D50) is defined as the diameter at which half of the total liquid water content
+ is contributed by drops smaller than D50, and half by drops larger than D50.
+
+ Drops smaller (respectively larger) than D50 contribute half of the
+ total rainwater content in the sampled volume.
+ D50 is sensitive to the concentration of large drops.
+
+ It corresponds to the 50th percentile of the volume distribution.
+
+ Parameters
+ ----------
+ drop_number_concentration : xarray.DataArray
+ The drop number concentration \( N(D) \) for each diameter bin, typically in units of
+ number per cubic meter per millimeter (m⁻³·mm⁻¹).
+ diameter : xarray.DataArray
+ The equivalent volume diameters \( D \) of the drops in each bin, in meters (m).
+ diameter_bin_width : xarray.DataArray
+ The width \( \Delta D \) of each diameter bin, in millimeters (mm).
+ water_density : float, optional
+ The density of water in kg/m³. The default is 1000 kg/m³.
+
+ Returns
+ -------
+ xarray.DataArray
+ Median volume drop diameter (D50) [mm].
+ The drop diameter that divides the volume of water contained in the sample into two equal parts.
+
+ """
+ d50 = get_quantile_volume_drop_diameter(
+ drop_number_concentration=drop_number_concentration,
+ diameter=diameter,
+ diameter_bin_width=diameter_bin_width,
+ fraction=0.5,
+ water_density=water_density,
+ )
+ return d50
+
+
+def get_quantile_volume_drop_diameter(
+ drop_number_concentration,
+ diameter,
+ diameter_bin_width,
+ fraction,
+ water_density=1000,
+):
+ r"""
+ Compute the diameter corresponding to a specified fraction of the cumulative liquid water content (LWC).
+
+ This function calculates the diameter \( D_f \) at which the cumulative LWC reaches
+ a specified fraction \( f \) of the total LWC for each drop size distribution (DSD).
+ When \( f = 0.5 \), it computes the median volume drop diameter.
+
+
+ Parameters
+ ----------
+ drop_number_concentration : xarray.DataArray
+ The drop number concentration \( N(D) \) for each diameter bin, typically in units of
+ number per cubic meter per millimeter (m⁻³·mm⁻¹).
+ diameter : xarray.DataArray
+ The equivalent volume diameters \( D \) of the drops in each bin, in meters (m).
+ diameter_bin_width : xarray.DataArray
+ The width \( \Delta D \) of each diameter bin, in millimeters (mm).
+ fraction : float
+ The fraction \( f \) of the total liquid water content to compute the diameter for.
+ Use 0.5 to compute the median volume diameter (D50),
+ 0.1 for D10, 0.9 for D90, etc. Must be between 0 and 1 (exclusive).
+ water_density : float, optional
+ The density of water in kg/m³. The default is 1000 kg/m³.
+
+ Returns
+ -------
+ D_f : xarray.DataArray
+ The diameter \( D_f \) corresponding to the specified fraction \( f \) of cumulative LWC,
+ in millimeters (mm). For `fraction=0.5`, this is the median volume drop diameter D50.
+
+ Notes
+ -----
+ The calculation involves computing the cumulative sum of the liquid water content
+ contributed by each diameter bin and finding the diameter at which the cumulative
+ sum reaches the specified fraction \( f \) of the total liquid water content.
+
+ Linear interpolation is used between the two diameter bins where the cumulative LWC
+ crosses the target LWC fraction.
+
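+ Examples
+ --------
+ A usage sketch, assuming ``drop_number_concentration`` [m⁻³·mm⁻¹], ``diameter`` [m] and
+ ``diameter_bin_width`` [mm] are aligned DataArrays:
+
+ >>> d10 = get_quantile_volume_drop_diameter(drop_number_concentration, diameter, diameter_bin_width, fraction=0.1)
+ >>> d90 = get_quantile_volume_drop_diameter(drop_number_concentration, diameter, diameter_bin_width, fraction=0.9)
+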
+ """
+ # Check fraction
+ if not (0 < fraction < 1):
+ raise ValueError("Fraction must be between 0 and 1 (exclusive)")
+
+ # Convert water density from kg/m3 to g/m3
+ water_density = water_density * 1000
+
+ # Compute LWC per diameter bin [g/m3]
+ lwc_per_diameter = np.pi / 6.0 * water_density * (diameter**3 * drop_number_concentration * diameter_bin_width)
+
+ # Compute rain rate per diameter [mm/hr]
+ # rain_rate_per_diameter = np.pi / 6 * (
+ # (drop_number_concentration * velocity * diameter**3 * diameter_bin_width) * 3600 * 1000
+ # )
+
+ # Compute the cumulative sum of LWC along the diameter bins
+ cumulative_lwc = lwc_per_diameter.cumsum(dim="diameter_bin_center")
+
+ # ------------------------------------------------------.
+ # Retrieve total lwc and target lwc
+ total_lwc = cumulative_lwc.isel(diameter_bin_center=-1)
+ target_lwc = total_lwc * fraction
+
+ # Retrieve indices where the target fraction of the cumulative volume is reached
+ # --> If all NaN or False, argmax and _get_last_xr_valid_idx(fill_value=0) return 0 !
+ idx_upper = (cumulative_lwc >= target_lwc).argmax(dim="diameter_bin_center")
+ idx_lower = _get_last_xr_valid_idx(
+ da_condition=(cumulative_lwc <= target_lwc),
+ dim="diameter_bin_center",
+ fill_value=0,
+ )
+
+ # Define mask for when the target fraction falls exactly at a diameter bin center
+ # - This also covers the case where all drops are in the first bin.
+ solution_is_bin_center = idx_upper == idx_lower
+
+ # Define diameter increment from lower bin center
+ y1 = cumulative_lwc.isel(diameter_bin_center=idx_lower)
+ y2 = cumulative_lwc.isel(diameter_bin_center=idx_upper)
+ yt = target_lwc
+ d1 = diameter.isel(diameter_bin_center=idx_lower) # m
+ d2 = diameter.isel(diameter_bin_center=idx_upper) # m
+ d_increment = (d2 - d1) * (yt - y1) / (y2 - y1)
+
+ # Define quantile diameter
+ d = xr.where(solution_is_bin_center, d1, d1 + d_increment)
+
+ # Set NaN where total sum is 0 or all NaN
+ mask_invalid = np.logical_or(total_lwc == 0, np.isnan(total_lwc))
+ d = d.where(~mask_invalid)
+
+ # Convert diameter to mm
+ d = d * 1000
+
+ return d
+
+
+####-----------------------------------------------------------------------------------------------------
+#### Normalized Gamma Parameters
+
+
+def get_normalized_intercept_parameter(liquid_water_content, mean_volume_diameter, water_density=1000):
+ r"""
+ Calculate the normalized intercept parameter \\( N_w \\) of the drop size distribution.
+
+ A higher \\( N_w \\) indicates a higher concentration of smaller drops.
+ The \\( N_w \\) is used in models to represent the DSD when assuming a normalized gamma distribution.
+
+ Parameters
+ ----------
+ liquid_water_content : float or array-like
+ Liquid water content \\( LWC \\) in grams per cubic meter (g/m³).
+ mean_volume_diameter : float or array-like
+ Mean volume diameter \\( D_m \\) in millimeters (mm).
+ water_density : float, optional
+ Density of water \\( \rho_w \\) in kilograms per cubic meter (kg/m³).
+ The default is 1000 kg/m³.
+
+ Returns
+ -------
+ Nw : xarray.DataArray or float
+ Normalized intercept parameter \\( N_w \\) in units of m⁻³·mm⁻¹.
+
+ Notes
+ -----
+ The normalized intercept parameter \\( N_w \\) is calculated using the formula:
+
+ .. math::
+
+ N_w = \frac{256}{\\pi \rho_w} \\cdot \frac{W}{D_m^4}
+
+ where:
+
+ - \\( N_w \\) is the normalized intercept parameter.
+ - \\( W \\) is the liquid water content in g/m³.
+ - \\( D_m \\) is the mean volume diameter in mm.
+ - \\( \rho_w \\) is the density of water in kg/m³.
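+
+ Examples
+ --------
+ A short worked example with scalar, illustrative values (LWC = 0.5 g/m³, \\( D_m \\) = 1.5 mm):
+
+ >>> Nw = get_normalized_intercept_parameter(liquid_water_content=0.5, mean_volume_diameter=1.5)
+ >>> print(f"Nw: {Nw:.1f} m⁻³·mm⁻¹")
+ Nw: 8048.1 m⁻³·mm⁻¹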
+ """
+ # Conversion to g/m3
+ water_density = water_density * 1000 # g/m3
+
+ # Compute Nw
+ # --> 1e9 is used to convert from mm-4 to m-3 mm-1
+ # - 256 = 4**4
+ # - lwc = (np.pi * water_density / 6) * moment_3
+ Nw = (256.0 / (np.pi * water_density)) * liquid_water_content / mean_volume_diameter**4 * 1e9
+ return Nw
+
+
+def get_mom_normalized_intercept_parameter(moment_3, moment_4):
+ r"""
+ Calculate the normalized intercept parameter \\( N_w \\) of the drop size distribution.
+
+ Parameters
+ ----------
+ moment_3 : float or array-like
+ The third moment of the drop size distribution, \\( M_3 \\), in units of
+ [m⁻³·mm³] (number per cubic meter times diameter cubed).
+ moment_4 : float or array-like
+ The fourth moment of the drop size distribution, \\( M_4 \\), in units of
+ [m⁻³·mm⁴].
+
+ Returns
+ -------
+ Nw : xarray.DataArray or float
+ Normalized intercept parameter \\( N_w \\) in units of m⁻³·mm⁻¹.
+
+ References
+ ----------
+ Testud, J., S. Oury, R. A. Black, P. Amayenc, and X. Dou, 2001:
+ The Concept of “Normalized” Distribution to Describe Raindrop Spectra:
+ A Tool for Cloud Physics and Cloud Remote Sensing.
+ J. Appl. Meteor. Climatol., 40, 1118-1140,
+ https://doi.org/10.1175/1520-0450(2001)040<1118:TCONDT>2.0.CO;2
+
+ """
+ Nw = 256 / 6 * moment_3**5 / moment_4**4
+ return Nw
+
+
+####--------------------------------------------------------------------------------------------------------
+#### Kinetic Energy Parameters
+
+
+def get_min_max_drop_kinetic_energy(drop_number, diameter, velocity, water_density=1000):
+ r"""
+ Calculate the minimum and maximum kinetic energy of raindrops in a drop size distribution (DSD).
+
+ This function computes the kinetic energy of individual raindrops based on their diameters and
+ fall velocities and returns the minimum and maximum values among these drops for each time step.
+
+ Parameters
+ ----------
+ drop_number : xarray.DataArray
+ The number of drops in each diameter (and velocity, if available) bin(s).
+ diameter : xarray.DataArray
+ The equivalent volume diameters \\( D \\) of the drops in each bin, in meters (m).
+ velocity : xarray.DataArray or float
+ The fall velocities \\( v \\) of the drops in each bin, in meters per second (m/s).
+ water_density : float, optional
+ The density of water \\( \rho_w \\) in kilograms per cubic meter (kg/m³).
+ Default is 1000 kg/m³.
+
+ Returns
+ -------
+ min_drop_kinetic_energy : xarray.DataArray
+ The minimum kinetic energy among the drops present in the DSD, in joules (J).
+ max_drop_kinetic_energy : xarray.DataArray
+ The maximum kinetic energy among the drops present in the DSD, in joules (J).
+
+ Notes
+ -----
+ The kinetic energy \\( KE \\) of an individual drop is calculated using:
+
+ .. math::
+
+ KE = \frac{1}{2} \\cdot m \\cdot v^2
+
+ where:
+
+ - \\( m \\) is the mass of the drop, calculated as:
+
+ .. math::
+
+ m = \frac{\\pi}{6} \\cdot \rho_w \\cdot D^3
+
+ with \\( D \\) being the drop diameter.
+
+ - \\( v \\) is the fall velocity of the drop.
+ """
+ # Ensure velocity is 2D (diameter, velocity)
+ velocity = xr.ones_like(drop_number) * velocity
+
+ # # Compute the mass of each drop: m = (π/6) * rho_w * D^3
+ # mass = (np.pi / 6) * water_density * diameter**3 # Units: kg
+
+ # # Compute kinetic energy: KE = 0.5 * m * v^2
+ # ke = 0.5 * mass * velocity**2 # Units: J
+
+ # Compute kinetic energy
+ ke = 1 / 12 * water_density * np.pi * diameter**3 * velocity**2
+
+ # Select kinetic energies where drops are present
+ ke = ke.where(drop_number > 0)
+
+ # Compute minimum and maximum drop kinetic energy
+ max_drop_kinetic_energy = ke.max(dim=_get_spectrum_dims(ke))
+ min_drop_kinetic_energy = ke.min(dim=_get_spectrum_dims(ke))
+ return min_drop_kinetic_energy, max_drop_kinetic_energy
+
+
+def get_kinetic_energy_density_flux(
+ drop_number,
+ diameter,
+ velocity,
+ sampling_area,
+ sample_interval,
+ water_density=1000,
+):
+ r"""
+ Calculate the kinetic energy flux density (KE) of rainfall over time.
+
+ This function computes the total kinetic energy of raindrops passing through the sensor's sampling area
+ per unit time and area, resulting in the kinetic energy flux density
+ in joules per square meter per hour (J·m⁻²·h⁻¹).
+
+ Typical values range between 0 and 5000 J·m⁻²·h⁻¹.
+ KE is related to the kinetic energy per unit rainfall depth (E) by the rain rate R: KE = E * R.
+
+ Parameters
+ ----------
+ drop_number : xarray.DataArray
+ The number of drops in each diameter (and velocity, if available) bin(s).
+ diameter : xarray.DataArray
+ The equivalent volume diameters \\( D \\) of the drops in each bin, in meters (m).
+ velocity : xarray.DataArray or float
+ The fall velocities \\( v \\) of the drops in each bin, in meters per second (m/s).
+ Values are broadcasted to match the dimensions of `drop_number`.
+ sampling_area : float
+ The effective sampling area \\( A \\) of the sensor in square meters (m²).
+ sample_interval : float
+ The time over which the drops are counted \\( \\Delta t \\) in seconds (s).
+ water_density : float, optional
+ The density of water \\( \rho_w \\) in kilograms per cubic meter (kg/m³).
+ Default is 1000 kg/m³.
+
+ Returns
+ -------
+ kinetic_energy_flux : xarray.DataArray
+ The kinetic energy flux density of rainfall in joules per square meter per hour (J·m⁻²·h⁻¹).
+ Dimensions are reduced to ('time',).
+
+ Notes
+ -----
+ The kinetic energy flux density \\( KE \\) is calculated using:
+
+ .. math::
+
+ KE = \frac{1}{2} \\cdot \frac{\rho_w \\pi}{6} \\cdot \frac{1}{\\Delta t} \\cdot 3600 \\cdot \\sum_{i,j}
+ \\left( \frac{n_{ij} \\cdot D_i^3 \\cdot v_j^2}{A} \right)
+
+ where:
+
+ - \\( n_{ij} \\) is the number of drops in diameter bin \\( i \\) and velocity bin \\( j \\).
+ - \\( D_i \\) is the diameter of bin \\( i \\).
+ - \\( v_j \\) is the velocity of bin \\( j \\).
+ - \\( A \\) is the sampling area.
+ - \\( \\Delta t \\) is the time integration period in seconds.
+ - The factor \\( 3600 \\) converts the rate to per hour.
+
+ """
+ # Ensure velocity is 2D (diameter, velocity)
+ velocity = xr.ones_like(drop_number) * velocity
+
+ # # Compute rain drop kinetic energy [J]
+ # ke = 0.5 * water_density * np.pi / 6 * diameter **3 * velocity**2
+ # # Compute total kinetic energy in [J / m2]
+ # total_kinetic_energy = (ke * drop_number / sampling_area).sum(dim=["diameter_bin_center", "velocity_bin_center"])
+ # # Compute kinetic energy density flux (KE) (J/m2/h)
+ # kinetic_energy_flux = total_kinetic_energy / sample_interval * 3600
+
+ # Compute kinetic energy flux density (KE) (J/m2/h)
+ kinetic_energy_flux = (
+ water_density
+ * np.pi
+ / 12
+ / sample_interval
+ * 3600
+ * ((drop_number * diameter**3 * velocity**2) / sampling_area).sum(
+ dim=_get_spectrum_dims(drop_number),
+ )
+ )
+ return kinetic_energy_flux
+
+
+def get_rainfall_kinetic_energy(drop_number, diameter, velocity, rain_accumulation, sampling_area, water_density=1000):
+ r"""
+ Calculate the kinetic energy per unit rainfall depth (E) in joules per square meter per millimeter (J·m⁻²·mm⁻¹).
+
+ This function computes the kinetic energy of the rainfall per millimeter of rain, providing a measure of the
+ energy associated with each unit of rainfall depth. This parameter is useful for understanding the potential
+ impact of raindrop erosion and the intensity of rainfall events.
+
+ The values typically range between 0 and 40 J·m⁻²·mm⁻¹.
+ E is related to the kinetic energy flux density (KE) by the rain rate R: E = KE / R.
+
+ Parameters
+ ----------
+ drop_number : xarray.DataArray
+ The number of drops in each diameter (and velocity, if available) bin(s).
+ diameter : xarray.DataArray
+ The equivalent volume diameters \\( D \\) of the drops in each bin, in meters (m).
+ velocity : xarray.DataArray or float
+ The fall velocities \\( v \\) of the drops in each bin, in meters per second (m/s).
+ Values are broadcasted to match the dimensions of `drop_number`.
+ rain_accumulation : xarray.DataArray or float
+ The total rainfall accumulation over the time integration period, in millimeters (mm).
+ sampling_area : float
+ The effective sampling area \\( A \\) of the sensor in square meters (m²).
+ water_density : float, optional
+ The density of water \\( \rho_w \\) in kilograms per cubic meter (kg/m³).
+ Default is 1000 kg/m³.
+
+ Returns
+ -------
+ E : xarray.DataArray
+ The kinetic energy per unit rainfall depth in joules per square meter per millimeter (J·m⁻²·mm⁻¹).
+ Dimensions are reduced to ('time',).
+
+ Notes
+ -----
+ The kinetic energy per unit rainfall depth \\( E \\) is calculated using:
+
+ .. math::
+
+ E = \frac{1}{2} \\cdot \frac{\\pi}{6} \\cdot \frac{\rho_w}{R} \\cdot \\sum_{i,j}
+ \\left( \frac{n_{ij} \\cdot D_i^3 \\cdot v_j^2}{A} \right)
+
+ where:
+
+ - \\( n_{ij} \\) is the number of drops in diameter bin \\( i \\) and velocity bin \\( j \\).
+ - \\( D_i \\) is the diameter of bin \\( i \\).
+ - \\( v_j \\) is the velocity of bin \\( j \\).
+ - \\( A \\) is the sampling area.
+ - \\( R \\) is the rainfall accumulation over the integration period (mm).
+ """
+ # Ensure velocity has the same dimensions as drop_number
+ velocity = xr.ones_like(drop_number) * velocity
+ # Compute rainfall kinetic energy per unit rainfall depth
+ E = (
+ 0.5
+ * np.pi
+ / 6
+ * water_density
+ / rain_accumulation
+ * ((drop_number * diameter**3 * velocity**2) / sampling_area).sum(
+ dim=_get_spectrum_dims(drop_number),
+ )
+ )
+ return E
diff --git a/disdrodb/l2/event.py b/disdrodb/l2/event.py
new file mode 100644
index 00000000..41472008
--- /dev/null
+++ b/disdrodb/l2/event.py
@@ -0,0 +1,388 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Functions for event definition."""
+import dask
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+from disdrodb.api.info import get_start_end_time_from_filepaths
+from disdrodb.utils.time import acronym_to_seconds, ensure_sorted_by_time
+
+
+@dask.delayed
+def _delayed_open_dataset(filepath):
+ with dask.config.set(scheduler="synchronous"):
+ ds = xr.open_dataset(filepath, chunks={}, autoclose=True, cache=False)
+ return ds
+
+
+def identify_events(
+ filepaths,
+ parallel=False,
+ min_n_drops=5,
+ neighbor_min_size=2,
+ neighbor_time_interval="5MIN",
+ intra_event_max_time_gap="6H",
+ event_min_duration="5MIN",
+ event_min_size=3,
+):
+ """Return a list of rainy events.
+
+ Rainy timesteps are defined as those where ``n_drops_selected`` > ``min_n_drops``.
+ Isolated rainy timesteps (based on the neighborhood criteria) are removed.
+ Then, consecutive rainy timesteps are grouped into the same event if the time gap between them does not
+ exceed `intra_event_max_time_gap`. Finally, events that do not meet minimum size or duration
+ requirements are filtered out.
+
+ Parameters
+ ----------
+ filepaths: list
+ List of L1C file paths.
+ parallel: bool
+ Whether to load the files in parallel.
+ Set parallel=True only in a multiprocessing environment.
+ The default is False.
+ min_n_drops : int, optional
+ Minimum number of selected drops (``n_drops_selected``) for a timestep to be considered rainy.
+ The default is 5.
+ neighbor_time_interval : str
+ The time interval around a given timestep defining the neighborhood.
+ Only timesteps that fall within this time interval before or after a timestep are considered neighbors.
+ neighbor_min_size : int, optional
+ The minimum number of neighboring timesteps required within `neighbor_time_interval` for a
+ timestep to be considered non-isolated. Isolated timesteps are removed.
+ - If `neighbor_min_size=0`, no timestep is considered isolated and no filtering occurs.
+ - If `neighbor_min_size=1`, the timestep must have at least one neighbor within `neighbor_time_interval`.
+ - If `neighbor_min_size=2`, the timestep must have at least two neighbors within `neighbor_time_interval`.
+ The default is 2.
+ intra_event_max_time_gap: str
+ The maximum time interval between two timesteps for them to be considered part of the same event.
+ This parameter is used to group timesteps into events.
+ event_min_duration : str
+ The minimum duration an event must span. Events shorter than this duration are discarded.
+ The default is "5MIN".
+ event_min_size : int, optional
+ The minimum number of valid timesteps required for an event. The default is 3.
+
+ Returns
+ -------
+ list of dict
+ A list of events, where each event is represented as a dictionary with keys:
+ - "start_time": np.datetime64, start time of the event
+ - "end_time": np.datetime64, end time of the event
+ - "duration": np.timedelta64, duration of the event
+ - "n_timesteps": int, number of valid timesteps in the event
+ """
+ # Open datasets in parallel
+ if parallel:
+ list_ds = dask.compute([_delayed_open_dataset(filepath) for filepath in filepaths])[0]
+ else:
+ list_ds = [xr.open_dataset(filepath, chunks={}, cache=False) for filepath in filepaths]
+ # Filter dataset for requested variables
+ variables = ["time", "n_drops_selected"]
+ list_ds = [ds[variables] for ds in list_ds]
+ # Concat datasets
+ ds = xr.concat(list_ds, dim="time", compat="no_conflicts", combine_attrs="override")
+ # Read in memory the variable needed
+ ds = ds.compute()
+ # Close file on disk
+ _ = [ds.close() for ds in list_ds]
+ del list_ds
+ # Sort dataset by time
+ ds = ensure_sorted_by_time(ds)
+ # Define candidate timesteps to group into events
+ idx_valid = ds["n_drops_selected"].data > min_n_drops
+ timesteps = ds["time"].data[idx_valid]
+ # Define event list
+ event_list = group_timesteps_into_event(
+ timesteps=timesteps,
+ neighbor_min_size=neighbor_min_size,
+ neighbor_time_interval=neighbor_time_interval,
+ intra_event_max_time_gap=intra_event_max_time_gap,
+ event_min_duration=event_min_duration,
+ event_min_size=event_min_size,
+ )
+ return event_list
+
+
+def group_timesteps_into_event(
+ timesteps,
+ intra_event_max_time_gap,
+ event_min_size=0,
+ event_min_duration="0S",
+ neighbor_min_size=0,
+ neighbor_time_interval="0S",
+):
+ """
+ Group candidate timesteps into events based on temporal criteria.
+
+ This function groups valid candidate timesteps into events by considering how they cluster
+ in time. Any isolated timesteps (based on neighborhood criteria) are first removed. Then,
+ consecutive timesteps are grouped into the same event if the time gap between them does not
+ exceed `intra_event_max_time_gap`. Finally, events that do not meet minimum size or duration
+ requirements are filtered out.
+
+ Note that `neighbor_min_size` and `neighbor_time_interval` are very sensitive to the
+ actual sample interval of the data.
+
+ Parameters
+ ----------
+ timesteps: np.ndarray
+ Candidate timesteps to be grouped into events.
+ neighbor_time_interval : str
+ The time interval around a given timestep defining the neighborhood.
+ Only timesteps that fall within this time interval before or after a timestep are considered neighbors.
+ neighbor_min_size : int, optional
+ The minimum number of neighboring timesteps required within `neighbor_time_interval` for a
+ timestep to be considered non-isolated. Isolated timesteps are removed.
+ - If `neighbor_min_size=0`, no timestep is considered isolated and no filtering occurs.
+ - If `neighbor_min_size=1`, the timestep must have at least one neighbor within `neighbor_time_interval`.
+ - If `neighbor_min_size=2`, the timestep must have at least two neighbors within `neighbor_time_interval`.
+ The default is 0.
+ intra_event_max_time_gap: str
+ The maximum time interval between two timesteps for them to be considered part of the same event.
+ This parameter is used to group timesteps into events.
+ event_min_duration : str
+ The minimum duration an event must span. Events shorter than this duration are discarded.
+ The default is "0S".
+ event_min_size : int, optional
+ The minimum number of valid timesteps required for an event. The default is 0.
+
+ Returns
+ -------
+ list of dict
+ A list of events, where each event is represented as a dictionary with keys:
+ - "start_time": np.datetime64, start time of the event
+ - "end_time": np.datetime64, end time of the event
+ - "duration": np.timedelta64, duration of the event
+ - "n_timesteps": int, number of valid timesteps in the event
+ """
+ # Retrieve datetime arguments
+ neighbor_time_interval = pd.Timedelta(acronym_to_seconds(neighbor_time_interval), unit="seconds")
+ intra_event_max_time_gap = pd.Timedelta(acronym_to_seconds(intra_event_max_time_gap), unit="seconds")
+ event_min_duration = pd.Timedelta(acronym_to_seconds(event_min_duration), unit="seconds")
+
+ # Remove isolated timesteps
+ timesteps = remove_isolated_timesteps(
+ timesteps,
+ neighbor_min_size=neighbor_min_size,
+ neighbor_time_interval=neighbor_time_interval,
+ )
+
+ # Group timesteps into events
+ # - Two timesteps separated by less than intra_event_max_time_gap are considered part of the same event
+ events = group_timesteps_into_events(timesteps, intra_event_max_time_gap)
+
+ # Define list of event
+ event_list = [
+ {
+ "start_time": event[0],
+ "end_time": event[-1],
+ "duration": (event[-1] - event[0]).astype("m8[m]"),
+ "n_timesteps": len(event),
+ }
+ for event in events
+ ]
+
+ # Filter event list by duration
+ event_list = [event for event in event_list if event["duration"] >= event_min_duration]
+
+ # Filter event list by size
+ event_list = [event for event in event_list if event["n_timesteps"] >= event_min_size]
+
+ return event_list
+
+
+def remove_isolated_timesteps(timesteps, neighbor_min_size, neighbor_time_interval):
+ """
+ Remove isolated timesteps that do not have enough neighboring timesteps within a specified time gap.
+
+ A timestep is considered isolated (and thus removed) if it does not have at least `neighbor_min_size` other
+ timesteps within the `neighbor_time_interval` before or after it.
+ In other words, for each timestep, we count how many other timesteps fall into the
+ time interval [t - neighbor_time_interval, t + neighbor_time_interval], excluding the timestep itself.
+ If the count of such neighbors is less than `neighbor_min_size`, that timestep is removed.
+
+ Parameters
+ ----------
+ timesteps : array-like of np.datetime64
+ Sorted or unsorted array of valid timesteps.
+ neighbor_time_interval : np.timedelta64
+ The time interval around a given timestep defining the neighborhood.
+ Only timesteps that fall within this time interval before or after a timestep are considered neighbors.
+ neighbor_min_size : int
+ The minimum number of neighboring timesteps required within `neighbor_time_interval` for a
+ timestep to be considered non-isolated.
+ - If `neighbor_min_size=0`, no timestep is considered isolated and no filtering occurs.
+ - If `neighbor_min_size=1`, the timestep must have at least one neighbor within `neighbor_time_interval`.
+ - If `neighbor_min_size=2`, the timestep must have at least two neighbors within `neighbor_time_interval`.
+
+ Returns
+ -------
+ np.ndarray
+ Array of timesteps with isolated entries removed.
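+
+ Examples
+ --------
+ A minimal sketch with synthetic timesteps (times are illustrative):
+
+ >>> import numpy as np
+ >>> timesteps = np.array(["2021-01-01T00:00", "2021-01-01T00:01", "2021-01-01T03:00"], dtype="datetime64[s]")
+ >>> kept = remove_isolated_timesteps(timesteps, neighbor_min_size=1, neighbor_time_interval=np.timedelta64(5, "m"))
+ >>> len(kept) # the 03:00 timestep has no neighbor within 5 minutes and is dropped
+ 2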
+ """
+ # Sort timesteps
+ timesteps = np.array(timesteps)
+ timesteps.sort()
+
+ # Do nothing if neighbor_min_size is 0
+ if neighbor_min_size == 0:
+ return timesteps
+
+ # Compute the start and end of the interval for each timestep
+ t_starts = timesteps - neighbor_time_interval
+ t_ends = timesteps + neighbor_time_interval
+
+ # Use searchsorted to find the positions where these intervals would be inserted
+ # to keep the array sorted. This effectively gives us the bounds of timesteps
+ # within the neighbor interval.
+ left_indices = np.searchsorted(timesteps, t_starts, side="left")
+ right_indices = np.searchsorted(timesteps, t_ends, side="right")
+
+ # The number of neighbors is the difference in indices minus one (to exclude the timestep itself)
+ n_neighbors = right_indices - left_indices - 1
+ valid_mask = n_neighbors >= neighbor_min_size
+
+ non_isolated_timesteps = timesteps[valid_mask]
+
+ # NON VECTORIZED CODE
+ # non_isolated_timesteps = []
+ # n_neighbours_arr = []
+ # for i, t in enumerate(timesteps):
+ # n_neighbours = np.sum(np.logical_and(timesteps >= (t - neighbor_time_interval),
+ # timesteps <= (t + neighbor_time_interval))) - 1
+ # n_neighbours_arr.append(n_neighbours)
+ # if n_neighbours > neighbor_min_size:
+ # non_isolated_timesteps.append(t)
+ # non_isolated_timesteps = np.array(non_isolated_timesteps)
+ return non_isolated_timesteps
+
+
+def group_timesteps_into_events(timesteps, intra_event_max_time_gap):
+ """
+ Group valid timesteps into events based on a maximum allowed dry interval.
+
+ Parameters
+ ----------
+ timesteps : array-like of np.datetime64
+ Sorted array of valid timesteps.
+ intra_event_max_time_gap : np.timedelta64
+ Maximum time interval allowed between consecutive valid timesteps for them
+ to be considered part of the same event.
+
+ Returns
+ -------
+ list of np.ndarray
+ A list of events, where each event is an array of timesteps.
+ """
+ # Deal with case with no timesteps
+ if len(timesteps) == 0:
+ return []
+
+ # Ensure timesteps are sorted
+ timesteps.sort()
+
+ # Compute differences between consecutive timesteps
+ diffs = np.diff(timesteps)
+
+ # Identify the indices where the gap is larger than intra_event_max_time_gap
+ # These indices represent boundaries between events
+ break_indices = np.where(diffs > intra_event_max_time_gap)[0] + 1
+
+ # Split the timesteps at the identified break points
+ events = np.split(timesteps, break_indices)
+
+ # NON VECTORIZED CODE
+ # events = []
+ # current_event = [timesteps[0]]
+ # for i in range(1, len(timesteps)):
+ # current_t = timesteps[i]
+ # previous_t = timesteps[i - 1]
+
+ # if current_t - previous_t <= intra_event_max_time_gap:
+ # current_event.append(current_t)
+ # else:
+ # events.append(current_event)
+ # current_event = [current_t]
+
+ # events.append(current_event)
+ return events
+
+
+####-----------------------------------------------------------------------------------.
+
+
+def get_events_info(list_events, filepaths, accumulation_interval, rolling):
+ """
+ Provide information about the required files for each event.
+
+ For each event in `list_events`, this function identifies the file paths from `filepaths` that
+ overlap with the event period, adjusted by the `accumulation_interval`. The event period is
+ extended backward or forward based on the `rolling` parameter.
+
+ Parameters
+ ----------
+ list_events : list of dict
+ List of events, where each event is a dictionary containing at least 'start_time' and 'end_time'
+ keys with `numpy.datetime64` values.
+ filepaths : list of str
+ List of file paths corresponding to data files.
+ accumulation_interval : numpy.timedelta64 or int
+ Time interval to adjust the event period for accumulation. If an integer is provided, it is
+ assumed to be in seconds.
+ rolling : bool
+ If True, adjust the event period backward by `accumulation_interval` (rolling backward).
+ If False, adjust forward (aggregate forward).
+
+ Returns
+ -------
+ list of dict
+ A list where each element is a dictionary containing:
+ - 'start_time': Adjusted start time of the event (`numpy.datetime64`).
+ - 'end_time': Adjusted end time of the event (`numpy.datetime64`).
+ - 'filepaths': List of file paths overlapping with the adjusted event period.
+
+ """
+ # Ensure accumulation_interval is numpy.timedelta64
+ if not isinstance(accumulation_interval, np.timedelta64):
+ accumulation_interval = np.timedelta64(accumulation_interval, "s")
+
+ # Retrieve file start_time and end_time
+ files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths)
+
+ # Retrieve information for each event
+ event_info = []
+ for event_dict in list_events:
+ # Retrieve event time period
+ event_start_time = event_dict["start_time"]
+ event_end_time = event_dict["end_time"]
+
+ # Add buffer to account for accumulation interval
+ if rolling: # backward
+ event_start_time = event_start_time - np.array(accumulation_interval, dtype="m8[s]")
+ else: # aggregate forward
+ event_end_time = event_end_time + np.array(accumulation_interval, dtype="m8[s]")
+
+ # Derive event filepaths
+ overlaps = (files_start_time <= event_end_time) & (files_end_time >= event_start_time)
+ event_filepaths = np.array(filepaths)[overlaps].tolist()
+
+ # Create dictionary
+ if len(event_filepaths) > 0:
+ event_info.append(
+ {"start_time": event_start_time, "end_time": event_end_time, "filepaths": event_filepaths},
+ )
+
+ return event_info
diff --git a/disdrodb/l2/processing.py b/disdrodb/l2/processing.py
new file mode 100644
index 00000000..03bcb687
--- /dev/null
+++ b/disdrodb/l2/processing.py
@@ -0,0 +1,683 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Implement DISDRODB L2 processing."""
+
+import numpy as np
+import xarray as xr
+
+from disdrodb.l1.encoding_attrs import get_attrs_dict, get_encoding_dict
+from disdrodb.l1.fall_velocity import get_raindrop_fall_velocity
+from disdrodb.l1_env.routines import load_env_dataset
+from disdrodb.l2.empirical_dsd import (
+ get_drop_average_velocity,
+ get_drop_number_concentration,
+ get_drop_volume,
+ get_effective_sampling_area,
+ get_equivalent_reflectivity_factor,
+ get_kinetic_energy_density_flux,
+ get_liquid_water_content,
+ get_mean_volume_drop_diameter,
+ get_median_volume_drop_diameter,
+ get_min_max_drop_kinetic_energy,
+ get_mode_diameter,
+ get_moment,
+ get_normalized_intercept_parameter,
+ get_quantile_volume_drop_diameter,
+ get_rain_accumulation,
+ get_rain_rate,
+ get_rain_rate_from_dsd,
+ get_rainfall_kinetic_energy,
+ get_std_volume_drop_diameter,
+ get_total_number_concentration,
+)
+from disdrodb.psd import create_psd, estimate_model_parameters
+from disdrodb.psd.fitting import compute_gof_stats
+from disdrodb.scattering import get_radar_parameters
+from disdrodb.utils.attrs import set_attrs
+from disdrodb.utils.encoding import set_encodings
+from disdrodb.utils.time import ensure_sample_interval_in_seconds
+
+
+def define_diameter_array(diameter_min=0, diameter_max=10, diameter_spacing=0.05):
+ """
+ Define an array of diameters and their corresponding bin properties.
+
+ Parameters
+ ----------
+ diameter_min : float, optional
+ The minimum diameter value. The default value is 0 mm.
+ diameter_max : float, optional
+ The maximum diameter value. The default value is 10 mm.
+ diameter_spacing : float, optional
+ The spacing between diameter values. The default value is 0.05 mm.
+
+ Returns
+ -------
+ xr.DataArray
+ A DataArray containing the center of each diameter bin, with coordinates for
+ the bin width, lower bound, upper bound, and center.
+
+ """
+ diameters_bounds = np.arange(diameter_min, diameter_max + diameter_spacing / 2, step=diameter_spacing)
+ diameters_bin_lower = diameters_bounds[:-1]
+ diameters_bin_upper = diameters_bounds[1:]
+ diameters_bin_width = diameters_bin_upper - diameters_bin_lower
+ diameters_bin_center = diameters_bin_lower + diameters_bin_width / 2
+ da = xr.DataArray(
+ diameters_bin_center,
+ dims="diameter_bin_center",
+ coords={
+ "diameter_bin_width": ("diameter_bin_center", diameters_bin_width),
+ "diameter_bin_lower": ("diameter_bin_center", diameters_bin_lower),
+ "diameter_bin_upper": ("diameter_bin_center", diameters_bin_upper),
+ "diameter_bin_center": ("diameter_bin_center", diameters_bin_center),
+ },
+ )
+ return da
+
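+# Example with the default arguments (illustrative only): 200 bins of 0.05 mm
+# width between 0 and 10 mm.
+#
+#   da = define_diameter_array()
+#   da.sizes                               # {'diameter_bin_center': 200}
+#   float(da.isel(diameter_bin_center=0))  # 0.025 (center of the first bin)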
+
+def define_velocity_array(ds):
+ """
+ Create the fall velocity DataArray using various methods.
+
+    If 'velocity_bin_center' is a dimension of the dataset, returns a DataArray
+    with a 'velocity_method' dimension stacking the 'measured_velocity',
+    'average_velocity', and 'fall_velocity' estimates.
+    Otherwise, returns the 'fall_velocity' DataArray of the input dataset.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ The input dataset containing velocity variables.
+
+ Returns
+ -------
+ velocity: xarray.DataArray
+ """
+ drop_number = ds["drop_number"]
+ if "velocity_bin_center" in ds.dims:
+ velocity = xr.Dataset(
+ {
+ "measured_velocity": xr.ones_like(drop_number) * ds["velocity_bin_center"],
+ "average_velocity": xr.ones_like(drop_number) * ds["drop_average_velocity"],
+ "fall_velocity": xr.ones_like(drop_number) * ds["fall_velocity"],
+ },
+ ).to_array(dim="velocity_method")
+ else:
+ velocity = ds["fall_velocity"]
+ return velocity
+
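+# For optical disdrometers providing a velocity dimension, the returned DataArray
+# gains a 'velocity_method' dimension so that downstream parameters can be derived
+# for each velocity assumption. Minimal sketch (assuming an L1 dataset `ds_l1`
+# with a 'velocity_bin_center' dimension and the required velocity variables):
+#
+#   velocity = define_velocity_array(ds_l1)
+#   list(velocity["velocity_method"].values)
+#   # ['measured_velocity', 'average_velocity', 'fall_velocity']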
+
+def compute_integral_parameters(
+ drop_number_concentration,
+ velocity,
+ diameter,
+ diameter_bin_width,
+ sample_interval,
+ water_density,
+):
+ """
+ Compute integral parameters of a drop size distribution (DSD).
+
+ Parameters
+ ----------
+ drop_number_concentration : array-like
+ Drop number concentration in each diameter bin [#/m3/mm].
+ velocity : array-like
+ Fall velocity of drops in each diameter bin [m/s].
+ diameter : array-like
+ Diameter of drops in each bin in m.
+ diameter_bin_width : array-like
+ Width of each diameter bin in mm.
+ sample_interval : float
+ Time interval over which the samples are collected in seconds.
+ water_density : float or array-like
+ Density of water [kg/m3].
+
+ Returns
+ -------
+ ds : xarray.Dataset
+ Dataset containing the computed integral parameters:
+ - Nt : Total number concentration [#/m3]
+ - R : Rain rate [mm/h]
+ - P : Rain accumulation [mm]
+ - Z : Reflectivity factor [dBZ]
+ - W : Liquid water content [g/m3]
+ - D10 : Diameter at the 10th quantile of the cumulative LWC distribution [mm]
+ - D50 : Median volume drop diameter [mm]
+ - D90 : Diameter at the 90th quantile of the cumulative LWC distribution [mm]
+ - Dmode : Diameter at which the distribution peaks [mm]
+ - Dm : Mean volume drop diameter [mm]
+ - sigma_m : Standard deviation of the volume drop diameter [mm]
+        - Nw : Normalized intercept parameter [m⁻³·mm⁻¹]
+        - M0 to M6 : Moments of the drop size distribution
+ """
+ # diameter in m!
+
+ # Initialize dataset
+ ds = xr.Dataset()
+
+ # Compute total number concentration (Nt) [#/m3]
+ total_number_concentration = get_total_number_concentration(
+ drop_number_concentration=drop_number_concentration,
+ diameter_bin_width=diameter_bin_width,
+ )
+
+ # Compute rain rate
+ rain_rate = get_rain_rate_from_dsd(
+ drop_number_concentration=drop_number_concentration,
+ velocity=velocity,
+ diameter=diameter,
+ diameter_bin_width=diameter_bin_width,
+ )
+
+ # Compute rain accumulation (P) [mm]
+ rain_accumulation = get_rain_accumulation(rain_rate=rain_rate, sample_interval=sample_interval)
+
+ # Compute moments (m0 to m6)
+ for moment in range(0, 7):
+ ds[f"M{moment}"] = get_moment(
+ drop_number_concentration=drop_number_concentration,
+ diameter=diameter,
+ diameter_bin_width=diameter_bin_width,
+ moment=moment,
+ )
+
+ # Compute Liquid Water Content (LWC) (W) [g/m3]
+ liquid_water_content = get_liquid_water_content(
+ drop_number_concentration=drop_number_concentration,
+ diameter=diameter,
+ diameter_bin_width=diameter_bin_width,
+ water_density=water_density,
+ )
+
+ # lwc_m = get_mom_liquid_water_content(moment_3=ds_l2["M3"],
+ # water_density=water_density)
+
+ # Compute reflectivity in dBZ
+ reflectivity_factor = get_equivalent_reflectivity_factor(
+ drop_number_concentration=drop_number_concentration,
+ diameter=diameter,
+ diameter_bin_width=diameter_bin_width,
+ )
+
+    # Compute the diameter at which the distribution peaks
+ mode_diameter = get_mode_diameter(drop_number_concentration)
+
+ # Compute mean_volume_diameter (Dm) [mm]
+ mean_volume_diameter = get_mean_volume_drop_diameter(moment_3=ds["M3"], moment_4=ds["M4"])
+
+    # Compute σₘ [mm]
+ sigma_m = get_std_volume_drop_diameter(
+ drop_number_concentration=drop_number_concentration,
+ diameter=diameter,
+ diameter_bin_width=diameter_bin_width,
+ mean_volume_diameter=mean_volume_diameter,
+ )
+
+    # Compute normalized_intercept_parameter (Nw) [m⁻³·mm⁻¹]
+ normalized_intercept_parameter = get_normalized_intercept_parameter(
+ liquid_water_content=liquid_water_content,
+ mean_volume_diameter=mean_volume_diameter,
+ water_density=water_density,
+ )
+
+ # Nw = get_mom_normalized_intercept_parameter(moment_3=ds_l2["M3"],
+ # moment_4=ds_l2["M4"])
+
+ # Compute median volume_drop_diameter
+ d50 = get_median_volume_drop_diameter(
+ drop_number_concentration=drop_number_concentration,
+ diameter=diameter,
+ diameter_bin_width=diameter_bin_width,
+ water_density=water_density,
+ )
+
+ # Compute volume_drop_diameter for the 10th and 90th quantile of the cumulative LWC distribution
+ d10 = get_quantile_volume_drop_diameter(
+ drop_number_concentration=drop_number_concentration,
+ diameter=diameter,
+ diameter_bin_width=diameter_bin_width,
+ fraction=0.1,
+ water_density=water_density,
+ )
+
+ d90 = get_quantile_volume_drop_diameter(
+ drop_number_concentration=drop_number_concentration,
+ diameter=diameter,
+ diameter_bin_width=diameter_bin_width,
+ fraction=0.9,
+ water_density=water_density,
+ )
+
+ ds["Nt"] = total_number_concentration
+ ds["R"] = rain_rate
+ ds["P"] = rain_accumulation
+ ds["Z"] = reflectivity_factor
+ ds["W"] = liquid_water_content
+
+ ds["D10"] = d10
+ ds["D50"] = d50
+ ds["D90"] = d90
+ ds["Dmode"] = mode_diameter
+ ds["Dm"] = mean_volume_diameter
+ ds["sigma_m"] = sigma_m
+
+ ds["Nw"] = normalized_intercept_parameter
+
+ return ds
+
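+# Minimal usage sketch of compute_integral_parameters (the input DataArrays are
+# assumed to share a 'diameter_bin_center' dimension; note the mixed units
+# expected by this function: diameter in m, diameter_bin_width in mm):
+#
+#   ds_params = compute_integral_parameters(
+#       drop_number_concentration=drop_number_concentration,  # [#/m3/mm]
+#       velocity=velocity,                                     # [m/s]
+#       diameter=diameter,                                     # [m]
+#       diameter_bin_width=diameter_bin_width,                 # [mm]
+#       sample_interval=60,                                    # [s]
+#       water_density=1000,                                    # [kg/m3]
+#   )
+#   ds_params["R"]   # rain rate [mm/h]
+#   ds_params["Dm"]  # mean volume drop diameter [mm]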
+
+####--------------------------------------------------------------------------
+#### L2 Empirical Parameters
+
+
+def generate_l2_empirical(ds, ds_env=None):
+ """Generate the DISDRODB L2E dataset from the DISDRODB L1 dataset.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ DISDRODB L1 dataset.
+ ds_env : xarray.Dataset, optional
+ Environmental dataset used for fall velocity and water density estimates.
+ If None, a default environment dataset will be loaded.
+
+ Returns
+ -------
+ xarray.Dataset
+        DISDRODB L2E dataset.
+ """
+ # Retrieve attributes
+ attrs = ds.attrs.copy()
+
+ # -------------------------------------------------------
+ #### Preprocessing
+ # Discard all timesteps without measured drops
+    # - This speeds up processing
+ # - Regularization can be done at the end
+ ds = ds.isel(time=ds["n_drops_selected"] > 0)
+
+ # Retrieve ENV dataset or take defaults
+ # --> Used for fall velocity and water density estimates
+ if ds_env is None:
+ ds_env = load_env_dataset(ds)
+
+    # TODO: Derive water density as a function of ENV (temperature, ...)
+    # --> e.g. density_water = 999.7 kg/m3 at T=10°C, 998.2 kg/m3 at T=20°C, 995.7 kg/m3 otherwise
+ water_density = 1000 # kg / m3
+
+ # Determine if the velocity dimension is available
+ has_velocity_dimension = "velocity_bin_center" in ds.dims
+
+ # -------------------------------------------------------
+ # Extract variables from L1
+ sensor_name = ds.attrs["sensor_name"]
+ diameter = ds["diameter_bin_center"] / 1000 # m
+ diameter_bin_width = ds["diameter_bin_width"] # mm
+ drop_number = ds["drop_number"]
+ drop_counts = ds["drop_counts"]
+ sample_interval = ensure_sample_interval_in_seconds(ds["sample_interval"]) # s
+
+ # Compute sampling area [m2]
+ sampling_area = get_effective_sampling_area(sensor_name=sensor_name, diameter=diameter) # m2
+
+ # Select relevant L1 variables to L2 product
+ variables = [
+ "drop_number",
+ "drop_counts",
+ "drop_number_concentration",
+ "sample_interval",
+ "n_drops_selected",
+ "n_drops_discarded",
+ "Dmin",
+ "Dmax",
+ "drop_average_velocity",
+ "fall_velocity",
+ ]
+
+ variables = [var for var in variables if var in ds]
+ ds_l1_subset = ds[variables]
+
+ # -------------------------------------------------------------------------------------------
+    # Compute and add the drop average velocity for optical disdrometers (i.e. OTT Parsivel or Thies LPM)
+    # - Recompute it because, if the input dataset has been aggregated, it must be updated!
+ if has_velocity_dimension:
+ ds["drop_average_velocity"] = get_drop_average_velocity(ds["drop_number"])
+
+ # -------------------------------------------------------------------------------------------
+ # Define velocity array with dimension 'velocity_method'
+ velocity = define_velocity_array(ds)
+
+ # -------------------------------------------------------
+ #### Compute L2 variables
+    # Compute drop number concentration N(D) [#/m3/mm]
+ drop_number_concentration = get_drop_number_concentration(
+ drop_number=drop_number,
+ velocity=velocity,
+ diameter_bin_width=diameter_bin_width,
+ sample_interval=sample_interval,
+ sampling_area=sampling_area,
+ )
+
+ # Compute rain rate (R) [mm/hr]
+ rain_rate = get_rain_rate(
+ drop_counts=drop_counts,
+ sampling_area=sampling_area,
+ diameter=diameter,
+ sample_interval=sample_interval,
+ )
+
+ # Compute rain accumulation (P) [mm]
+ rain_accumulation = get_rain_accumulation(rain_rate=rain_rate, sample_interval=sample_interval)
+
+ # Compute drop volume information (per diameter bin)
+ drop_volume = drop_counts * get_drop_volume(diameter) # (np.pi/6 * diameter**3 * drop_counts)
+ drop_total_volume = drop_volume.sum(dim="diameter_bin_center")
+ drop_relative_volume_ratio = drop_volume / drop_total_volume
+
+ # Compute kinetic energy variables
+ # --> TODO: implement from_dsd (using drop_concentration!)
+ min_drop_kinetic_energy, max_drop_kinetic_energy = get_min_max_drop_kinetic_energy(
+ drop_number=drop_number,
+ diameter=diameter,
+ velocity=velocity,
+ water_density=water_density,
+ )
+
+ kinetic_energy_density_flux = get_kinetic_energy_density_flux(
+ drop_number=drop_number,
+ diameter=diameter,
+ velocity=velocity,
+ sample_interval=sample_interval,
+ sampling_area=sampling_area,
+ water_density=water_density,
+ )
+
+ rainfall_kinetic_energy = get_rainfall_kinetic_energy(
+ drop_number=drop_number,
+ diameter=diameter,
+ velocity=velocity,
+ sampling_area=sampling_area,
+ rain_accumulation=rain_accumulation,
+ water_density=water_density,
+ )
+
+ # ----------------------------------------------------------------------------
+ # Compute integral parameters
+ ds_l2 = compute_integral_parameters(
+ drop_number_concentration=drop_number_concentration,
+ velocity=velocity,
+ diameter=diameter,
+ diameter_bin_width=diameter_bin_width,
+ sample_interval=sample_interval,
+ water_density=water_density,
+ )
+
+ # ----------------------------------------------------------------------------
+ #### Create L2 Dataset
+ # Update with L1 parameters
+ ds_l2.update(ds_l1_subset)
+
+ ds_l2["drop_number"] = drop_number # 2D V x D
+ ds_l2["drop_counts"] = drop_counts # 1D D
+ ds_l2["drop_number_concentration"] = drop_number_concentration
+
+ ds_l2["drop_volume"] = drop_volume
+ ds_l2["drop_total_volume"] = drop_total_volume
+ ds_l2["drop_relative_volume_ratio"] = drop_relative_volume_ratio
+
+ ds_l2["R"] = rain_rate
+ ds_l2["P"] = rain_accumulation
+
+ # TODO: adapt code to compute from drop_number_concentration
+ ds_l2["KEmin"] = min_drop_kinetic_energy
+ ds_l2["KEmax"] = max_drop_kinetic_energy
+ ds_l2["E"] = rainfall_kinetic_energy
+ ds_l2["KE"] = kinetic_energy_density_flux
+
+ # ----------------------------------------------------------------------------
+
+ # ----------------------------------------------------------------------------.
+ # Remove timesteps where rain rate is 0
+ ds_l2 = ds_l2.isel(time=ds_l2["R"] > 0)
+
+ # ----------------------------------------------------------------------------.
+ #### Add encodings and attributes
+ # Add variables attributes
+ attrs_dict = get_attrs_dict()
+ ds_l2 = set_attrs(ds_l2, attrs_dict=attrs_dict)
+
+ # Add variables encoding
+ encoding_dict = get_encoding_dict()
+ ds_l2 = set_encodings(ds_l2, encoding_dict=encoding_dict)
+
+ # Add global attributes
+ ds_l2.attrs = attrs
+
+ return ds_l2
+
+
+####--------------------------------------------------------------------------
+#### L2 Model Parameters
+
+
+def generate_l2_model(
+ ds,
+ ds_env=None,
+ fall_velocity_method="Beard1976",
+ # PSD discretization
+ diameter_min=0,
+ diameter_max=8,
+ diameter_spacing=0.05,
+ # Fitting options
+ psd_model=None,
+ optimization=None,
+ optimization_kwargs=None,
+ # GOF metrics options
+ gof_metrics=True,
+):
+ """
+ Generate the DISDRODB L2M dataset from a DISDRODB L2E dataset.
+
+    This function estimates PSD model parameters and subsequently computes DSD integral parameters.
+    Optionally, radar variables at various bands are simulated using T-matrix simulations.
+    Goodness-of-fit metrics of the PSD fit can also be included in the output dataset.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ DISDRODB L2E dataset.
+ ds_env : xarray.Dataset, optional
+ Environmental dataset used for fall velocity and water density estimates.
+ If None, a default environment dataset will be loaded.
+    fall_velocity_method : str, optional
+        Method used to estimate the raindrop fall velocity over the PSD diameter bins.
+        The default method is "Beard1976".
+    diameter_min : float, optional
+ Minimum PSD diameter. The default value is 0 mm.
+ diameter_max : float, optional
+ Maximum PSD diameter. The default value is 8 mm.
+ diameter_spacing : float, optional
+ PSD diameter spacing. The default value is 0.05 mm.
+    psd_model : str, optional
+        The PSD model to fit. See ``available_psd_models()``.
+        If both ``psd_model`` and ``optimization`` are None, a NormalizedGammaPSD
+        is fitted by Grid Search.
+ optimization : str, optional
+        The fitting optimization procedure. Either "GS" (Grid Search), "ML" (Maximum Likelihood)
+ or "MOM" (Method of Moments).
+ optimization_kwargs : dict, optional
+ Dictionary with arguments to customize the fitting procedure.
+ gof_metrics : bool, optional
+ Whether to add goodness-of-fit metrics to the output dataset. The default is True.
+
+ Returns
+ -------
+ xarray.Dataset
+ DISDRODB L2M dataset.
+ """
+ # ----------------------------------------------------------------------------.
+ #### NOTES
+ # - Final processing: Optionally filter dataset only when PSD has fitted ?
+ # --> but good to have everything to compare across models
+
+ # ----------------------------------------------------------------------------.
+ # Retrieve attributes
+ attrs = ds.attrs.copy()
+
+ # -------------------------------------------------------
+    # Derive water density as a function of ENV (temperature, ...)
+    # TODO --> Add into ds_env !
+    # --> e.g. density_water = 999.7 kg/m3 at T=10°C, 998.2 kg/m3 at T=20°C, 995.7 kg/m3 otherwise
+ water_density = 1000 # kg / m3
+
+ # Retrieve ENV dataset or take defaults
+ # --> Used for fall velocity and water density estimates
+ if ds_env is None:
+ ds_env = load_env_dataset(ds)
+
+ ####------------------------------------------------------.
+ #### Preprocessing
+ # - Filtering criteria for when fitting a PSD
+ # TODO --> try to fit and define reasonable criteria based on R2, max deviation, rain_rate abs/relative error
+
+ ####------------------------------------------------------.
+ #### Define default PSD optimization arguments
+ if psd_model is None and optimization is None:
+ psd_model = "NormalizedGammaPSD"
+ optimization = "GS"
+ optimization_kwargs = {
+ "target": "ND",
+ "transformation": "identity",
+ "error_order": 1, # MAE
+ }
+
+ ####------------------------------------------------------.
+ #### Retrieve PSD parameters
+ ds_psd_params = estimate_model_parameters(
+ ds=ds,
+ psd_model=psd_model,
+ optimization=optimization,
+ optimization_kwargs=optimization_kwargs,
+ )
+ psd_name = ds_psd_params.attrs["disdrodb_psd_model"]
+ psd = create_psd(psd_name, parameters=ds_psd_params)
+
+ ####-------------------------------------------------------
+ #### Compute integral parameters
+ # Define diameter array
+ diameter = define_diameter_array(
+ diameter_min=diameter_min,
+ diameter_max=diameter_max,
+ diameter_spacing=diameter_spacing,
+ )
+ diameter_bin_width = diameter["diameter_bin_width"]
+
+ # Retrieve time of integration
+ sample_interval = ensure_sample_interval_in_seconds(ds["sample_interval"])
+
+ # Retrieve drop number concentration
+ drop_number_concentration = psd(diameter)
+
+ # Retrieve fall velocity for each new diameter bin
+    velocity = get_raindrop_fall_velocity(diameter=diameter, method=fall_velocity_method, ds_env=ds_env)  # diameter in mm
+
+ # Compute integral parameters
+ ds_params = compute_integral_parameters(
+ drop_number_concentration=drop_number_concentration,
+ velocity=velocity,
+ diameter=diameter / 1000, # in meters !
+ diameter_bin_width=diameter_bin_width,
+ sample_interval=sample_interval,
+ water_density=water_density,
+ )
+
+ #### ----------------------------------------------------------------------------
+ #### Create L2 Dataset
+ # Update with PSD parameters
+ ds_params.update(ds_psd_params)
+
+ # Add GOF statistics if asked
+ # TODO: Add metrics variables or GOF DataArray ?
+ if gof_metrics:
+ ds_gof = compute_gof_stats(drop_number_concentration=ds["drop_number_concentration"], psd=psd)
+ ds_params.update(ds_gof)
+
+ #### ----------------------------------------------------------------------------.
+ #### Add encodings and attributes
+ # Add variables attributes
+ attrs_dict = get_attrs_dict()
+ ds_params = set_attrs(ds_params, attrs_dict=attrs_dict)
+
+ # Add variables encoding
+ encoding_dict = get_encoding_dict()
+ ds_params = set_encodings(ds_params, encoding_dict=encoding_dict)
+
+ # Add global attributes
+ ds_params.attrs = attrs
+ ds_params.attrs["disdrodb_psd_model"] = psd_name
+
+ # Return dataset
+ return ds_params
+
+
+####-------------------------------------------------------------------------------------------------------------------.
+#### L2 Radar Parameters
+
+
+def generate_l2_radar(ds, radar_band=None, canting_angle_std=7, diameter_max=8, axis_ratio="Thurai2007", parallel=True):
+ """Simulate polarimetric radar variables from empirical drop number concentration or the estimated PSD.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Dataset containing the drop number concentration variable or the PSD parameters.
+ radar_band : str or list of str, optional
+ Radar band(s) to be used.
+ If ``None`` (the default), all available radar bands are used.
+ canting_angle_std : float or list of float, optional
+ Standard deviation of the canting angle. The default value is 7.
+ diameter_max : float or list of float, optional
+ Maximum diameter. The default value is 8 mm.
+ axis_ratio : str or list of str, optional
+ Method to compute the axis ratio. The default method is ``Thurai2007``.
+ parallel : bool, optional
+ Whether to compute radar variables in parallel.
+ The default value is ``True``.
+
+ Returns
+ -------
+ xarray.Dataset
+ Dataset containing the computed radar parameters.
+ """
+ # Retrieve radar variables from L2E drop number concentration or from estimated L2M PSD model
+ ds_radar = get_radar_parameters(
+ ds=ds,
+ radar_band=radar_band,
+ canting_angle_std=canting_angle_std,
+ diameter_max=diameter_max,
+ axis_ratio=axis_ratio,
+ parallel=parallel,
+ )
+
+ #### ----------------------------------------------------------------------------.
+ #### Add encodings and attributes
+ # Add variables attributes
+ attrs_dict = get_attrs_dict()
+ ds_radar = set_attrs(ds_radar, attrs_dict=attrs_dict)
+
+ # Add variables encoding
+ encoding_dict = get_encoding_dict()
+ ds_radar = set_encodings(ds_radar, encoding_dict=encoding_dict)
+
+ # Return dataset
+ return ds_radar
diff --git a/disdrodb/l2/processing_options.py b/disdrodb/l2/processing_options.py
new file mode 100644
index 00000000..fd4639e1
--- /dev/null
+++ b/disdrodb/l2/processing_options.py
@@ -0,0 +1,109 @@
+# TODO: Write to YAML
+# TODO: radar_simulation_enabled: differentiate between L2E and L2M:
+
+config = {
+ "global_settings": {
+ "time_integration": [
+ "1MIN",
+ "10MIN",
+ "ROLL1MIN",
+ "ROLL10MIN",
+ ], # ["10S", "30S", "1MIN", "5MIN", "10MIN", "15MIN", "30MIN", "1H", "ROLL5MIN", "ROLL10MIN"],
+ # Radar options
+ "radar_simulation_enabled": True,
+ "radar_simulation_options": {
+ "radar_band": ["S", "C", "X", "Ku", "Ka", "W"],
+ "canting_angle_std": 7,
+ "diameter_max": 8,
+ "axis_ratio": "Thurai2007",
+ },
+ # L2E options
+ # "l2e_options": {}
+ # L2M options
+ "l2m_options": {
+ "fall_velocity_method": "Beard1976",
+ "diameter_min": 0,
+ "diameter_max": 8,
+ "diameter_spacing": 0.05,
+ "gof_metrics": True,
+ "models": {
+ # PSD models fitting options
+ "GAMMA_ML": {
+ "psd_model": "GammaPSD",
+ "optimization": "ML",
+ "optimization_kwargs": {
+ "init_method": "M246",
+ "probability_method": "cdf",
+ "likelihood": "multinomial",
+ "truncated_likelihood": True,
+ "optimizer": "Nelder-Mead",
+ },
+ },
+ "NGAMMA_GS_LOG_ND_MAE": {
+ "psd_model": "NormalizedGammaPSD",
+ "optimization": "GS",
+ "optimization_kwargs": {
+ "target": "ND",
+ "transformation": "log",
+ "error_order": 1, # MAE
+ },
+ },
+ # "NGAMMA_GS_ND_MAE": {
+ # "psd_model": "NormalizedGammaPSD",
+ # "optimization": "GS",
+ # "optimization_kwargs": {
+ # "target": "ND",
+ # "transformation": "identity",
+ # "error_order": 1, # MAE
+ # },
+ # },
+ # "NGAMMA_GS_Z": {
+ # "psd_model": "NormalizedGammaPSD",
+ # "optimization": "GS",
+ # "optimization_kwargs": {
+ # "target": "Z",
+ # "transformation": "identity", # unused
+ # "error_order": 1, # unused
+ # },
+ # },
+ },
+ },
+ },
+ "specific_settings": {
+ "10S": {
+ "radar_simulation_enabled": False,
+ },
+ "30S": {
+ "radar_simulation_enabled": False,
+ },
+ "10MIN": {
+ "radar_simulation_enabled": False,
+ },
+ "15MIN": {
+ "radar_simulation_enabled": False,
+ },
+ "30MIN": {
+ "radar_simulation_enabled": False,
+ },
+ "1H": {
+ "radar_simulation_enabled": False,
+ },
+ "ROLL10MIN": {
+ "radar_simulation_enabled": False,
+ },
+ },
+}
+
+
+def get_l2_processing_options():
+ """Retrieve L2 processing options."""
+ # TODO: Implement validation !
+ l2_options_dict = {}
+ for tt in config["global_settings"]["time_integration"]:
+ l2_options_dict[tt] = config["global_settings"].copy()
+ _ = l2_options_dict[tt].pop("time_integration", None)
+ # Add specific settings
+ for tt, product_options in config["specific_settings"].items():
+ if tt in l2_options_dict:
+ l2_options_dict[tt].update(product_options)
+ return l2_options_dict
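+
+
+# Minimal usage sketch (reflects the configuration above; illustrative only):
+#
+#   l2_options = get_l2_processing_options()
+#   sorted(l2_options)                               # ['10MIN', '1MIN', 'ROLL10MIN', 'ROLL1MIN']
+#   l2_options["1MIN"]["radar_simulation_enabled"]   # True (global default)
+#   l2_options["10MIN"]["radar_simulation_enabled"]  # False (overridden by specific_settings)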
diff --git a/disdrodb/l2/routines.py b/disdrodb/l2/routines.py
new file mode 100644
index 00000000..df2663a1
--- /dev/null
+++ b/disdrodb/l2/routines.py
@@ -0,0 +1,843 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Implements routines for DISDRODB L2 processing."""
+
+import datetime
+import logging
+import os
+import time
+from typing import Optional
+
+import dask
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+# Directory
+from disdrodb.api.create_directories import (
+ create_logs_directory,
+ create_product_directory,
+)
+from disdrodb.api.info import group_filepaths
+from disdrodb.api.io import get_filepaths, get_required_product
+from disdrodb.api.path import (
+ define_accumulation_acronym,
+ define_l2e_filename,
+ define_l2m_filename,
+)
+from disdrodb.configs import get_base_dir
+from disdrodb.l1.resampling import (
+ regularize_dataset,
+ resample_dataset,
+)
+from disdrodb.l2.event import get_events_info, identify_events
+from disdrodb.l2.processing import (
+ generate_l2_empirical,
+ generate_l2_model,
+ generate_l2_radar,
+)
+from disdrodb.l2.processing_options import get_l2_processing_options
+from disdrodb.metadata import read_station_metadata
+from disdrodb.utils.decorator import delayed_if_parallel, single_threaded_if_parallel
+
+# Logger
+from disdrodb.utils.logger import (
+ close_logger,
+ create_logger_file,
+ create_product_logs,
+ log_error,
+ log_info,
+)
+from disdrodb.utils.time import ensure_sample_interval_in_seconds, get_resampling_information
+from disdrodb.utils.writer import write_product
+
+logger = logging.getLogger(__name__)
+
+
+####----------------------------------------------------------------------------.
+#### L2E
+
+
+@delayed_if_parallel
+@single_threaded_if_parallel
+def _generate_l2e(
+ start_time,
+ end_time,
+ filepaths,
+ data_dir,
+ logs_dir,
+ campaign_name,
+ station_name,
+ # Sampling options
+ accumulation_interval,
+ rolling,
+ # Radar options
+ radar_simulation_enabled,
+ radar_simulation_options,
+ # Processing options
+ force,
+ verbose,
+ parallel, # this is used by the decorator and to initialize correctly the logger !
+):
+ # -----------------------------------------------------------------.
+ # Define product name
+ product = "L2E"
+
+ # -----------------------------------------------------------------.
+ # Create file logger
+ sample_interval_acronym = define_accumulation_acronym(seconds=accumulation_interval, rolling=rolling)
+ starting_time = pd.to_datetime(start_time).strftime("%Y%m%d%H%M%S")
+ ending_time = pd.to_datetime(end_time).strftime("%Y%m%d%H%M%S")
+ filename = f"L2E.{sample_interval_acronym}.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}"
+ logger, logger_filepath = create_logger_file(
+ logs_dir=logs_dir,
+ filename=filename,
+ parallel=parallel,
+ )
+ ##------------------------------------------------------------------------.
+ # Log start processing
+ msg = f"{product} processing of {filename} has started."
+ log_info(logger, msg, verbose=verbose)
+
+ ##------------------------------------------------------------------------.
+ ### Core computation
+ try:
+ # ------------------------------------------------------------------------.
+ #### Open the dataset over the period of interest
+ # - Open the netCDFs
+ list_ds = [xr.open_dataset(filepath, chunks={}, cache=False, autoclose=True) for filepath in filepaths]
+ # - Concatenate datasets
+ ds = xr.concat(list_ds, dim="time", compat="no_conflicts", combine_attrs="override")
+ ds = ds.sel(time=slice(start_time, end_time)).compute()
+ # - Close file on disk
+ _ = [ds.close() for ds in list_ds]
+
+ ##------------------------------------------------------------------------.
+ #### Resample dataset
+        # Here we set NaN values in drop_number to 0
+ # - We assume that NaN corresponds to 0
+ # - When we regularize, we infill with NaN
+ # - When we aggregate with sum, we don't skip NaN
+ # --> Aggregation with original missing timesteps currently results in NaN !
+ # TODO: Add tolerance on fraction of missing timesteps for large accumulation_intervals
+ ds["drop_number"] = xr.where(np.isnan(ds["drop_number"]), 0, ds["drop_number"])
+
+ # - Regularize dataset
+        # --> Infill missing timesteps with np.nan
+ sample_interval = ensure_sample_interval_in_seconds(ds["sample_interval"]).item()
+ ds = regularize_dataset(ds, freq=f"{sample_interval}s")
+
+ # - Resample dataset
+ ds = resample_dataset(
+ ds=ds,
+ sample_interval=sample_interval,
+ accumulation_interval=accumulation_interval,
+ rolling=rolling,
+ )
+
+ ##------------------------------------------------------------------------.
+ # Remove timesteps with no drops or NaN (from L2E computations)
+ # timestep_zero_drops = ds["time"].data[ds["n_drops_selected"].data == 0]
+ # timestep_nan = ds["time"].data[np.isnan(ds["n_drops_selected"].data)]
+ indices_valid_timesteps = np.where(
+ ~np.logical_or(ds["n_drops_selected"].data == 0, np.isnan(ds["n_drops_selected"].data)),
+ )[0]
+ ds = ds.isel(time=indices_valid_timesteps)
+
+ ##------------------------------------------------------------------------.
+ #### Generate L2E product
+ ds = generate_l2_empirical(ds=ds)
+
+ # Simulate L2M-based radar variables if asked
+ if radar_simulation_enabled:
+ ds_radar = generate_l2_radar(ds, parallel=not parallel, **radar_simulation_options)
+ ds.update(ds_radar)
+ ds.attrs = ds_radar.attrs.copy()
+
+ ##------------------------------------------------------------------------.
+ #### Regularize back dataset
+ # TODO: infill timestep_zero_drops and timestep_nan differently ?
+ # --> R, P, LWC = 0,
+ # --> Z, D, with np.nan?
+
+ ##------------------------------------------------------------------------.
+ # Write netCDF4 dataset
+ if ds["time"].size > 1:
+ filename = define_l2e_filename(
+ ds,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ sample_interval=accumulation_interval,
+ rolling=rolling,
+ )
+ filepath = os.path.join(data_dir, filename)
+ write_product(ds, product=product, filepath=filepath, force=force)
+
+ ##--------------------------------------------------------------------.
+ # Clean environment
+ del ds
+
+ # Log end processing
+ msg = f"{product} processing of {filename} has ended."
+ log_info(logger, msg, verbose=verbose)
+
+ ##--------------------------------------------------------------------.
+ # Otherwise log the error
+ except Exception as e:
+ error_type = str(type(e).__name__)
+ msg = f"{error_type}: {e}"
+ log_error(logger, msg, verbose=verbose)
+
+ # Close the file logger
+ close_logger(logger)
+
+ # Return the logger file path
+ return logger_filepath
+
+
+def is_possible_product(accumulation_interval, sample_interval, rolling):
+ """Assess if production is possible given the requested accumulation interval and source sample_interval."""
+ # Avoid rolling product generation at source sample interval
+ if rolling and accumulation_interval == sample_interval:
+ return False
+ # Avoid product generation if the accumulation_interval is less than the sample interval
+ if accumulation_interval < sample_interval:
+ return False
+    # Avoid product generation if accumulation_interval is not a multiple of sample_interval
+ return accumulation_interval % sample_interval == 0
+
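+# Examples for is_possible_product (assuming a source sample interval of 30 s;
+# illustrative only):
+#
+#   is_possible_product(accumulation_interval=60, sample_interval=30, rolling=False)  # True
+#   is_possible_product(accumulation_interval=30, sample_interval=30, rolling=True)   # False (rolling at source interval)
+#   is_possible_product(accumulation_interval=10, sample_interval=30, rolling=False)  # False (finer than source)
+#   is_possible_product(accumulation_interval=45, sample_interval=30, rolling=False)  # False (not a multiple)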
+
+def flatten_list(nested_list):
+ """Flatten a nested list into a single-level list."""
+ if isinstance(nested_list, list) and len(nested_list) == 0:
+ return nested_list
+ # If list is already flat, return as is to avoid flattening to chars
+ if isinstance(nested_list, list) and not isinstance(nested_list[0], list):
+ return nested_list
+ return [item for sublist in nested_list for item in sublist] if isinstance(nested_list, list) else [nested_list]
+
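+# Examples for flatten_list (illustrative only):
+#
+#   flatten_list([[1, 2], [3]])  # [1, 2, 3]
+#   flatten_list([1, 2, 3])      # [1, 2, 3]  (already flat)
+#   flatten_list([])             # []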
+
+def run_l2e_station(
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ force: bool = False,
+ verbose: bool = True,
+ parallel: bool = True,
+ debugging_mode: bool = False,
+ base_dir: Optional[str] = None,
+):
+ """
+ Generate the L2E product of a specific DISDRODB station when invoked from the terminal.
+
+ This function is intended to be called through the ``disdrodb_run_l2e_station``
+ command-line interface.
+
+    The DISDRODB L2E routine generates an L2E file for each event.
+    Events are defined based on the DISDRODB event settings options.
+    The DISDRODB event settings allow producing L2E files either
+    per custom blocks of time (i.e. day/month/year) or per blocks of rainy events.
+
+ For stations with varying measurement intervals, DISDRODB defines a separate list of 'events'
+ for each measurement interval option. In other words, DISDRODB does not
+ mix files with data acquired at different sample intervals when resampling the data.
+
+    L0C product generation ensures the creation of files with unique sample intervals.
+
+ Parameters
+ ----------
+ data_source : str
+ The name of the institution (for campaigns spanning multiple countries) or
+ the name of the country (for campaigns or sensor networks within a single country).
+ Must be provided in UPPER CASE.
+ campaign_name : str
+ The name of the campaign. Must be provided in UPPER CASE.
+ station_name : str
+ The name of the station.
+ force : bool, optional
+ If ``True``, existing data in the destination directories will be overwritten.
+ If ``False`` (default), an error will be raised if data already exists in the destination directories.
+ verbose : bool, optional
+ If ``True`` (default), detailed processing information will be printed to the terminal.
+ If ``False``, less information will be displayed.
+ parallel : bool, optional
+ If ``True``, files will be processed in multiple processes simultaneously,
+ with each process using a single thread to avoid issues with the HDF/netCDF library.
+ If ``False`` (default), files will be processed sequentially in a single process,
+ and multi-threading will be automatically exploited to speed up I/O tasks.
+ debugging_mode : bool, optional
+ If ``True``, the amount of data processed will be reduced.
+ Only the first 3 files will be processed. By default, ``False``.
+ base_dir : str, optional
+ The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
+ If not specified, the path specified in the DISDRODB active configuration will be used.
+
+ """
+ # Define product
+ product = "L2E"
+
+ # Define base directory
+ base_dir = get_base_dir(base_dir)
+
+ # ------------------------------------------------------------------------.
+ # Start processing
+ if verbose:
+ t_i = time.time()
+ msg = f"{product} processing of station {station_name} has started."
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+ # -------------------------------------------------------------------------.
+ # List L1 files to process
+ required_product = get_required_product(product)
+ flag_not_available_data = False
+ try:
+ filepaths = get_filepaths(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product=required_product,
+ # Processing options
+ debugging_mode=False,
+ )
+ except Exception as e:
+ print(str(e)) # Case where no file paths available
+ flag_not_available_data = True
+
+ # -------------------------------------------------------------------------.
+ # If no data available, print error message and return None
+ if flag_not_available_data:
+ msg = (
+ f"{product} processing of {data_source} {campaign_name} {station_name}"
+            + f" has not been launched because of missing {required_product} data."
+ )
+ print(msg)
+ return
+
+ # -------------------------------------------------------------------------.
+ # Retrieve L2 processing options
+ # - Each dictionary item contains the processing options for a given rolling/accumulation_interval combo
+ l2_processing_options = get_l2_processing_options()
+
+ # ---------------------------------------------------------------------.
+ # Group filepaths by sample intervals
+ # - Typically the sample interval is fixed
+ # - Some stations might change the sample interval along the years
+    # - For each sample interval, separate processing takes place hereafter!
+ dict_filepaths = group_filepaths(filepaths, groups="sample_interval")
+
+ # -------------------------------------------------------------------------.
+ # Define list of event
+ # - [(start_time, end_time)]
+ # TODO: Here pass event option list !
+ # TODO: Implement more general define_events function
+ # - Either rainy events
+ # - Either time blocks (day/month/year)
+ # TODO: Define events identification settings based on accumulation
+ # - This is currently done at the source sample interval !
+ # - Should we allow event definition for each accumulation interval and
+ # move this code inside the loop below
+
+ # sample_interval = list(dict_filepaths)[0]
+ # filepaths = dict_filepaths[sample_interval]
+
+ dict_list_events = {
+ sample_interval: identify_events(filepaths, parallel=parallel)
+ for sample_interval, filepaths in dict_filepaths.items()
+ }
+
+ # ---------------------------------------------------------------------.
+ # Subset for debugging mode
+ if debugging_mode:
+ dict_list_events = {
+ sample_interval: list_events[0 : min(len(list_events), 3)]
+ for sample_interval, list_events in dict_list_events.items()
+ }
+
+ # ---------------------------------------------------------------------.
+ # Loop
+ # rolling = False
+ # accumulation_interval = 60
+ # sample_interval_acronym = "1MIN"
+ # l2_options = l2_processing_options["1MIN"]
+ for sample_interval_acronym, l2_options in l2_processing_options.items():
+
+ # Retrieve accumulation_interval and rolling option
+ accumulation_interval, rolling = get_resampling_information(sample_interval_acronym)
+
+ # Retrieve radar simulation options
+ radar_simulation_enabled = l2_options.get("radar_simulation_enabled", False)
+ radar_simulation_options = l2_options["radar_simulation_options"]
+
+ # ------------------------------------------------------------------.
+ # Group filepaths by events
+ # - This is done separately for each possible source sample interval
+ # - It groups filepaths by start_time and end_time provided by list_events
+ # - Here 'events' can also simply be period of times ('day', 'months', ...)
+ # - When aggregating/resampling/accumulating data, we need to load also
+ # some data before/after the actual event start_time/end_time
+        # - get_events_info adjusts the event times to account for the required "border" data.
+ events_info = [
+ get_events_info(
+ list_events=list_events,
+ filepaths=dict_filepaths[sample_interval],
+ accumulation_interval=accumulation_interval,
+ rolling=rolling,
+ )
+ for sample_interval, list_events in dict_list_events.items()
+ if is_possible_product(
+ accumulation_interval=accumulation_interval,
+ sample_interval=sample_interval,
+ rolling=rolling,
+ )
+ ]
+ events_info = flatten_list(events_info)
+
+ # ------------------------------------------------------------------.
+ # Skip processing if no files available
+ # - When not compatible accumulation_interval with source sample_interval
+ if len(events_info) == 0:
+ continue
+
+ # ------------------------------------------------------------------.
+ # Create product directory
+ data_dir = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product=product,
+ force=force,
+ # Option for L2E
+ sample_interval=accumulation_interval,
+ rolling=rolling,
+ )
+
+ # Define logs directory
+ logs_dir = create_logs_directory(
+ product=product,
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Option for L2E
+ sample_interval=accumulation_interval,
+ rolling=rolling,
+ )
+
+ # ------------------------------------------------------------------.
+ # Generate files
+ # - L2E product generation is optionally parallelized over events
+ # - If parallel=True, it does that in parallel using dask.delayed
+ list_tasks = [
+ _generate_l2e(
+ start_time=event_info["start_time"],
+ end_time=event_info["end_time"],
+ filepaths=event_info["filepaths"],
+ data_dir=data_dir,
+ logs_dir=logs_dir,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Sampling options
+ rolling=rolling,
+ accumulation_interval=accumulation_interval,
+ # Radar options
+ radar_simulation_enabled=radar_simulation_enabled,
+ radar_simulation_options=radar_simulation_options,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ parallel=parallel,
+ )
+ for event_info in events_info
+ ]
+ list_logs = dask.compute(*list_tasks) if parallel else list_tasks
+
+ # -----------------------------------------------------------------.
+ # Define product summary logs
+ create_product_logs(
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=base_dir,
+ # Product options
+ sample_interval=accumulation_interval,
+ rolling=rolling,
+ # Logs list
+ list_logs=list_logs,
+ )
+
+ # ---------------------------------------------------------------------.
+ # End product processing
+ if verbose:
+ timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i)))
+ msg = f"{product} processing of station {station_name} completed in {timedelta_str}"
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+
+####----------------------------------------------------------------------------.
+#### L2M
+
+
+@delayed_if_parallel
+@single_threaded_if_parallel
+def _generate_l2m(
+ filepath,
+ data_dir,
+ logs_dir,
+ campaign_name,
+ station_name,
+ # L2M options
+ sample_interval,
+ rolling,
+ model_name,
+ l2m_options,
+ # Radar options
+ radar_simulation_enabled,
+ radar_simulation_options,
+ # Processing options
+ force,
+ verbose,
+ parallel, # this is used only to initialize the correct logger !
+):
+ # -----------------------------------------------------------------.
+ # Define product name
+ product = "L2M"
+
+ # -----------------------------------------------------------------.
+ # Define model options
+ psd_model = l2m_options["models"][model_name]["psd_model"]
+ optimization = l2m_options["models"][model_name]["optimization"]
+ optimization_kwargs = l2m_options["models"][model_name]["optimization_kwargs"]
+ other_options = {k: v for k, v in l2m_options.items() if k != "models"}
+
+ # -----------------------------------------------------------------.
+ # Create file logger
+ filename = os.path.basename(filepath)
+ logger, logger_filepath = create_logger_file(
+ logs_dir=logs_dir,
+ filename=filename,
+ parallel=parallel,
+ )
+
+ ##------------------------------------------------------------------------.
+ # Log start processing
+ msg = f"{product} processing of {filename} has started."
+ log_info(logger, msg, verbose=verbose)
+
+ ##------------------------------------------------------------------------.
+ ### Core computation
+ try:
+ # Open the raw netCDF
+ with xr.open_dataset(filepath, chunks={}, cache=False) as ds:
+ variables = [
+ "drop_number_concentration",
+ "fall_velocity",
+ "D50",
+ "Nw",
+ "Nt",
+ "M1",
+ "M2",
+ "M3",
+ "M4",
+ "M5",
+ "M6",
+ ]
+ ds = ds[variables].load()
+
+ # Produce L2M dataset
+ ds = generate_l2_model(
+ ds=ds,
+ psd_model=psd_model,
+ optimization=optimization,
+ optimization_kwargs=optimization_kwargs,
+ **other_options,
+ )
+
+ # Simulate L2M-based radar variables if asked
+ if radar_simulation_enabled:
+ ds_radar = generate_l2_radar(ds, parallel=not parallel, **radar_simulation_options)
+ ds.update(ds_radar)
+ ds.attrs = ds_radar.attrs.copy()
+
+ # Write L2M netCDF4 dataset
+ if ds["time"].size > 1:
+ # Define filepath
+ filename = define_l2m_filename(
+ ds,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ sample_interval=sample_interval,
+ rolling=rolling,
+ model_name=model_name,
+ )
+ filepath = os.path.join(data_dir, filename)
+ # Write to disk
+ write_product(ds, product=product, filepath=filepath, force=force)
+
+ ##--------------------------------------------------------------------.
+ # Clean environment
+ del ds
+
+ # Log end processing
+ msg = f"{product} processing of {filename} has ended."
+ log_info(logger, msg, verbose=verbose)
+
+ ##--------------------------------------------------------------------.
+ # Otherwise log the error
+ except Exception as e:
+ error_type = str(type(e).__name__)
+ msg = f"{error_type}: {e}"
+ log_error(logger, msg, verbose=verbose)
+
+ # Close the file logger
+ close_logger(logger)
+
+ # Return the logger file path
+ return logger_filepath
+
+
+def run_l2m_station(
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ force: bool = False,
+ verbose: bool = True,
+ parallel: bool = True,
+ debugging_mode: bool = False,
+ base_dir: Optional[str] = None,
+):
+ """
+ Run the L2M processing of a specific DISDRODB station when invoked from the terminal.
+
+ This function is intended to be called through the ``disdrodb_run_l2m_station``
+ command-line interface.
+
+ Parameters
+ ----------
+ data_source : str
+ The name of the institution (for campaigns spanning multiple countries) or
+ the name of the country (for campaigns or sensor networks within a single country).
+ Must be provided in UPPER CASE.
+ campaign_name : str
+ The name of the campaign. Must be provided in UPPER CASE.
+ station_name : str
+ The name of the station.
+ force : bool, optional
+ If ``True``, existing data in the destination directories will be overwritten.
+ If ``False`` (default), an error will be raised if data already exists in the destination directories.
+ verbose : bool, optional
+ If ``True`` (default), detailed processing information will be printed to the terminal.
+ If ``False``, less information will be displayed.
+ parallel : bool, optional
+ If ``True``, files will be processed in multiple processes simultaneously,
+ with each process using a single thread to avoid issues with the HDF/netCDF library.
+ If ``False`` (default), files will be processed sequentially in a single process,
+ and multi-threading will be automatically exploited to speed up I/O tasks.
+ debugging_mode : bool, optional
+ If ``True``, the amount of data processed will be reduced.
+ Only the first 3 files will be processed. By default, ``False``.
+ base_dir : str, optional
+ The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
+ If not specified, the path specified in the DISDRODB active configuration will be used.
+
+ """
+ # Define product
+ product = "L2M"
+
+ # Define base directory
+ base_dir = get_base_dir(base_dir)
+
+ # ------------------------------------------------------------------------.
+ # Start processing
+ if verbose:
+ t_i = time.time()
+ msg = f"{product} processing of station {station_name} has started."
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+ # -------------------------------------------------------------------------.
+ # Retrieve L2 processing options
+ # - Each dictionary item contains the processing options for a given rolling/accumulation_interval combo
+ l2_processing_options = get_l2_processing_options()
+
+ # ---------------------------------------------------------------------.
+ # Retrieve source sampling interval
+    # - If a station has varying measurement intervals over time, choose the smallest one!
+ metadata = read_station_metadata(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+ sample_interval = metadata["measurement_interval"]
+ if isinstance(sample_interval, list):
+ sample_interval = min(sample_interval)
+
+ # ---------------------------------------------------------------------.
+ # Loop
+ # sample_interval_acronym = "1MIN"
+ # l2_options = l2_processing_options["1MIN"]
+ for sample_interval_acronym, l2_options in l2_processing_options.items():
+
+ # Retrieve accumulation_interval and rolling option
+ accumulation_interval, rolling = get_resampling_information(sample_interval_acronym)
+
+ # Retrieve L2M processing options
+ l2m_options = l2_options["l2m_options"]
+
+ # Retrieve radar simulation options
+ radar_simulation_enabled = l2_options.get("radar_simulation_enabled", False)
+ radar_simulation_options = l2_options["radar_simulation_options"]
+
+ # ------------------------------------------------------------------.
+ # Avoid generation of rolling products for source sample interval !
+ if rolling and accumulation_interval == sample_interval:
+ continue
+
+ # Avoid product generation if the accumulation_interval is less than the sample interval
+ if accumulation_interval < sample_interval:
+ continue
+
+ # -----------------------------------------------------------------.
+ # List files to process
+ required_product = get_required_product(product)
+ flag_not_available_data = False
+ try:
+ filepaths = get_filepaths(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product=required_product,
+ sample_interval=accumulation_interval,
+ rolling=rolling,
+ # Processing options
+ debugging_mode=debugging_mode,
+ )
+ except Exception as e:
+ print(str(e)) # Case where no file paths available
+ flag_not_available_data = True
+
+ # If no data available, try with other L2E accumulation intervals
+ if flag_not_available_data:
+ msg = (
+ f"{product} processing of {data_source} {campaign_name} {station_name}"
+                + f" has not been launched because of missing {required_product} {sample_interval_acronym} data."
+ )
+ print(msg)
+ continue
+
+ # -----------------------------------------------------------------.
+ # Loop over distributions to fit
+ # model_name = "GAMMA_ML"
+ # model_options = l2m_options["models"][model_name]
+ for model_name, model_options in l2m_options["models"].items():
+
+ # Retrieve model options
+ psd_model = model_options["psd_model"]
+ optimization = model_options["optimization"]
+
+ # -----------------------------------------------------------------.
+ msg = f" - Production of L2M_{model_name} for sample interval {accumulation_interval} s has started."
+ log_info(logger=logger, msg=msg, verbose=verbose)
+ msg = f" - Estimating {psd_model} parameters using {optimization}."
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+ # -------------------------------------------------------------.
+ # Create product directory
+ data_dir = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product=product,
+ force=force,
+ # Option for L2E
+ sample_interval=accumulation_interval,
+ rolling=rolling,
+ # Option for L2M
+ model_name=model_name,
+ )
+
+ # Define logs directory
+ logs_dir = create_logs_directory(
+ product=product,
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Option for L2E
+ sample_interval=accumulation_interval,
+ rolling=rolling,
+ # Option for L2M
+ model_name=model_name,
+ )
+
+ # Generate L2M files
+ # - Loop over the L2E netCDF files and generate L2M files.
+ # - If parallel=True, it does that in parallel using dask.delayed
+ list_tasks = [
+ _generate_l2m(
+ filepath=filepath,
+ data_dir=data_dir,
+ logs_dir=logs_dir,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # L2M option
+ sample_interval=accumulation_interval,
+ rolling=rolling,
+ model_name=model_name,
+ l2m_options=l2m_options,
+ # Radar options
+ radar_simulation_enabled=radar_simulation_enabled,
+ radar_simulation_options=radar_simulation_options,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ parallel=parallel,
+ )
+ for filepath in filepaths
+ ]
+ list_logs = dask.compute(*list_tasks) if parallel else list_tasks
+
+ # -----------------------------------------------------------------.
+ # Define L2M summary logs
+ create_product_logs(
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=base_dir,
+ # Product options
+ model_name=model_name,
+                sample_interval=accumulation_interval,
+ rolling=rolling,
+ # Logs list
+ list_logs=list_logs,
+ )
+
+ # ---------------------------------------------------------------------.
+ # End L2M processing
+ if verbose:
+ timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i)))
+ msg = f"{product} processing of station {station_name} completed in {timedelta_str}"
+ log_info(logger=logger, msg=msg, verbose=verbose)
diff --git a/disdrodb/metadata/geolocation.py b/disdrodb/metadata/geolocation.py
new file mode 100644
index 00000000..8ee1cc76
--- /dev/null
+++ b/disdrodb/metadata/geolocation.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Metadata tools to verify/complete geolocation information."""
+import time
+
+import numpy as np
+import requests
+
+
+def infer_altitude(latitude, longitude, dem="aster30m"):
+ """Infer station altitude using a Digital Elevation Model (DEM).
+
+ This function uses the OpenTopoData API to infer the altitude of a given
+ location specified by latitude and longitude.
+ By default, it uses the ASTER DEM at 30m resolution.
+
+ Parameters
+ ----------
+ latitude : float
+ The latitude of the location for which to infer the altitude.
+ longitude : float
+ The longitude of the location for which to infer the altitude.
+ dem : str, optional
+        The DEM to use for altitude inference. Options are "aster30m" (default),
+        "srtm30m", and "mapzen".
+
+ Returns
+ -------
+ elevation : float
+ The inferred altitude of the specified location.
+
+ Raises
+ ------
+ ValueError
+ If the altitude retrieval fails.
+
+ Notes
+ -----
+ - The OpenTopoData API has a limit of 1000 calls per day.
+ - Each request can include up to 100 locations.
+ - The API allows a maximum of 1 call per second.
+
+ References
+ ----------
+ https://www.opentopodata.org/api/
+ """
+
+ url = f"https://api.opentopodata.org/v1/{dem}?locations={latitude},{longitude}"
+ r = requests.get(url)
+
+ data = r.json()
+ if data["status"] == "OK":
+ elevation = data["results"][0]["elevation"]
+ else:
+ raise ValueError("Altitude retrieval failed.")
+ return elevation
+
+
+def infer_altitudes(lats, lons, dem="aster30m"):
+ """
+    Infer altitudes for a list of locations using the OpenTopoData API.
+
+ Parameters
+ ----------
+ lats : list or array-like
+ List or array of latitude coordinates.
+ lons : list or array-like
+ List or array of longitude coordinates.
+ dem : str, optional
+ Digital Elevation Model (DEM) to use for altitude inference.
+ The default DEM is "aster30m".
+
+ Returns
+ -------
+ elevations : numpy.ndarray
+ Array of inferred altitudes corresponding to the input coordinates.
+
+ Raises
+ ------
+ ValueError
+ If the latitude and longitude arrays do not have the same length.
+ If altitude retrieval fails for any block of coordinates.
+
+ Notes
+ -----
+ - The OpenTopoData API has a limit of 1000 calls per day.
+ - Each request can include up to 100 locations.
+ - The API allows a maximum of 1 call per second.
+ - The API requests are made in blocks of up to 100 coordinates,
+ with a 2-second delay between requests.
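+
+    Examples
+    --------
+    A minimal usage sketch (requires network access; the coordinates are illustrative):
+
+    >>> import numpy as np
+    >>> lats = np.array([46.52, 46.26])
+    >>> lons = np.array([6.57, 6.13])
+    >>> elevations = infer_altitudes(lats, lons)  # doctest: +SKIP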
+ """
+ # Check that lats and lons have the same length
+ if len(lats) != len(lons):
+ raise ValueError("Latitude and longitude arrays must have the same length.")
+
+ # Maximum number of locations per API request
+ max_locations = 100
+ elevations = []
+
+ # Total number of coordinates
+ total_coords = len(lats)
+
+ # Loop over the coordinates in blocks of max_locations
+ for i in range(0, total_coords, max_locations):
+
+ # Wait 2 seconds before another API request
+ time.sleep(2)
+
+ # Get the block of coordinates
+ block_lats = lats[i : i + max_locations]
+ block_lons = lons[i : i + max_locations]
+
+ # Create the list_coords string in the format "lat1,lon1|lat2,lon2|..."
+ list_coords = "|".join([f"{lat},{lon}" for lat, lon in zip(block_lats, block_lons)])
+
+ # Define API URL
+ url = f"https://api.opentopodata.org/v1/{dem}?locations={list_coords}&interpolation=nearest"
+
+ # Retrieve info
+ r = requests.get(url)
+ data = r.json()
+
+ # Parse info
+ if data.get("status") == "OK":
+ elevations.extend([result["elevation"] for result in data["results"]])
+ else:
+ raise ValueError(f"Altitude retrieval failed for block starting at index {i}.")
+ elevations = np.array(elevations).astype(float)
+ return elevations
diff --git a/disdrodb/metadata/manipulation.py b/disdrodb/metadata/manipulation.py
index a8f15454..9d9370a3 100644
--- a/disdrodb/metadata/manipulation.py
+++ b/disdrodb/metadata/manipulation.py
@@ -17,6 +17,11 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------.
"""Metadata Manipulation Tools."""
+import shutil
+
+from disdrodb.api.io import available_stations
+from disdrodb.api.path import define_metadata_filepath
+from disdrodb.configs import get_base_dir
def remove_invalid_metadata_keys(metadata):
@@ -46,3 +51,40 @@ def sort_metadata_dictionary(metadata):
list_metadata_keys = get_valid_metadata_keys()
metadata = {k: metadata[k] for k in list_metadata_keys}
return metadata
+
+
+def update_processed_metadata():
+ """Update metadata in the 'DISDRODB/Processed' directory."""
+ base_dir = get_base_dir()
+ # Retrieve list of all processed stations
+ # --> (data_source, campaign_name, station_name)
+ list_info = available_stations(
+ product="L0B",
+ )
+
+ # Retrieve metadata filepaths
+ list_src_dst_path = [
+ (
+ # Source
+ define_metadata_filepath(
+ product="RAW",
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=base_dir,
+ check_exists=False,
+ ),
+ # Destination
+ define_metadata_filepath(
+ product="L0B",
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=base_dir,
+ check_exists=False,
+ ),
+ )
+ for data_source, campaign_name, station_name in list_info
+ ]
+ # Copy file from RAW directory to Processed directory
+ _ = [shutil.copyfile(src_path, dst_path) for (src_path, dst_path) in list_src_dst_path]
diff --git a/disdrodb/psd/__init__.py b/disdrodb/psd/__init__.py
new file mode 100644
index 00000000..f5068957
--- /dev/null
+++ b/disdrodb/psd/__init__.py
@@ -0,0 +1,38 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Implement PSD model and fitting routines."""
+
+
+from disdrodb.psd.fitting import estimate_model_parameters
+from disdrodb.psd.models import (
+ ExponentialPSD,
+ GammaPSD,
+ LognormalPSD,
+ NormalizedGammaPSD,
+ available_psd_models,
+ create_psd,
+)
+
+__all__ = [
+ "available_psd_models",
+ "create_psd",
+ "estimate_model_parameters",
+ "LognormalPSD",
+ "ExponentialPSD",
+ "GammaPSD",
+ "NormalizedGammaPSD",
+]
diff --git a/disdrodb/psd/fitting.py b/disdrodb/psd/fitting.py
new file mode 100644
index 00000000..314bda62
--- /dev/null
+++ b/disdrodb/psd/fitting.py
@@ -0,0 +1,2132 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Routines for PSD fitting."""
+import numpy as np
+import scipy.stats as ss
+import xarray as xr
+from scipy.integrate import quad
+from scipy.optimize import minimize
+from scipy.special import gamma, gammainc, gammaln # Regularized lower incomplete gamma function
+
+from disdrodb.psd.models import ExponentialPSD, GammaPSD, LognormalPSD, NormalizedGammaPSD
+from disdrodb.utils.warnings import suppress_warnings
+
+
+####--------------------------------------------------------------------------------------.
+#### Goodness of fit (GOF)
+def compute_gof_stats(drop_number_concentration, psd):
+ """
+ Compute various goodness-of-fit (GoF) statistics between observed and predicted values.
+
+ Parameters
+ ----------
+    drop_number_concentration : xarray.DataArray
+        Observed drop number concentration with dimensions ('time', 'diameter_bin_center').
+    psd : PSD
+        Instance of a PSD class.
+
+    Returns
+    -------
+    ds : xarray.Dataset
+        Dataset containing the computed GoF statistics.
+ """
+ from disdrodb.l2.empirical_dsd import get_mode_diameter
+
+ # Retrieve diameter bin width
+ diameter = drop_number_concentration["diameter_bin_center"]
+ diameter_bin_width = drop_number_concentration["diameter_bin_width"]
+
+ # Define observed and predicted values and compute errors
+ observed_values = drop_number_concentration
+ fitted_values = psd(diameter) # .transpose(*observed_values.dims)
+ error = observed_values - fitted_values
+
+ # Compute GOF statistics
+ with suppress_warnings():
+ # Compute Pearson correlation
+ pearson_r = xr.corr(observed_values, fitted_values, dim="diameter_bin_center")
+
+ # Compute MSE
+ mse = (error**2).mean(dim="diameter_bin_center")
+
+ # Compute maximum error
+ max_error = error.max(dim="diameter_bin_center")
+ relative_max_error = error.max(dim="diameter_bin_center") / observed_values.max(dim="diameter_bin_center")
+
+ # Compute difference in total number concentration
+ total_number_concentration_obs = (observed_values * diameter_bin_width).sum(dim="diameter_bin_center")
+ total_number_concentration_pred = (fitted_values * diameter_bin_width).sum(dim="diameter_bin_center")
+ total_number_concentration_difference = total_number_concentration_pred - total_number_concentration_obs
+
+ # Compute Kullback-Leibler divergence
+ # - Compute pdf per bin
+ pk_pdf = observed_values / total_number_concentration_obs
+ qk_pdf = fitted_values / total_number_concentration_pred
+
+ # - Compute probabilities per bin
+ pk = pk_pdf * diameter_bin_width
+ pk = pk / pk.sum(dim="diameter_bin_center") # this might not be necessary
+ qk = qk_pdf * diameter_bin_width
+ qk = qk / qk.sum(dim="diameter_bin_center") # this might not be necessary
+
+ # - Compute divergence
+ log_prob_ratio = np.log(pk / qk)
+ log_prob_ratio = log_prob_ratio.where(np.isfinite(log_prob_ratio))
+ kl_divergence = (pk * log_prob_ratio).sum(dim="diameter_bin_center")
+
+ # Other statistics that can be computed also from different diameter discretization
+ # - Compute max deviation at distribution mode
+ max_deviation = observed_values.max(dim="diameter_bin_center") - fitted_values.max(dim="diameter_bin_center")
+ max_relative_deviation = max_deviation / fitted_values.max(dim="diameter_bin_center")
+
+ # - Compute diameter difference of the distribution mode
+ diameter_mode_deviation = get_mode_diameter(observed_values) - get_mode_diameter(fitted_values)
+
+ # Create an xarray.Dataset to hold the computed statistics
+ ds = xr.Dataset(
+ {
+ "r2": pearson_r**2, # Squared Pearson correlation coefficient
+ "mse": mse, # Mean Squared Error
+ "max_error": max_error, # Maximum Absolute Error
+ "relative_max_error": relative_max_error, # Relative Maximum Error
+ "total_number_concentration_difference": total_number_concentration_difference,
+ "kl_divergence": kl_divergence, # Kullback-Leibler divergence
+ "max_deviation": max_deviation, # Deviation at distribution mode
+ "max_relative_deviation": max_relative_deviation, # Relative deviation at mode
+ "diameter_mode_deviation": diameter_mode_deviation, # Difference in mode diameters
+ },
+ )
+ return ds
+
+
+####--------------------------------------------------------------------------------------.
+#### Maximum Likelihood (ML)
+
+
+def get_expected_probabilities(params, cdf_func, pdf_func, bin_edges, probability_method, normalized=False):
+ """
+ Compute the expected probabilities for each bin given the distribution parameters.
+
+ Parameters
+ ----------
+ params : array-like
+ Parameters for the CDF or PDF function.
+ cdf_func : callable
+ Cumulative distribution function (CDF) that takes bin edges and parameters as inputs.
+ pdf_func : callable
+ Probability density function (PDF) that takes a value and parameters as inputs.
+ bin_edges : array-like
+ Edges of the bins for which to compute the probabilities.
+ probability_method : {'cdf', 'pdf'}
+ Method to compute the probabilities. If 'cdf', use the CDF to compute probabilities.
+ If 'pdf', integrate the PDF over each bin range.
+ normalized : bool, optional
+ If True, normalize the probabilities to sum to 1. Default is False.
+
+ Returns
+ -------
+ expected_probabilities : numpy.ndarray
+ Array of expected probabilities for each bin.
+
+ Notes
+ -----
+ - If the 'cdf' method is used, the probabilities are computed as the difference in CDF values at the bin edges.
+ - If the 'pdf' method is used, the probabilities are computed by integrating the PDF over each bin range.
+    - Any zero or negative probabilities are replaced with a very small positive number (1e-10)
+      so that the optimization remains well-defined.
+ - If `normalized` is True, the probabilities are normalized to sum to 1.
+
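+    Examples
+    --------
+    A minimal sketch using a standard normal distribution (purely illustrative of the call signature):
+
+    >>> import numpy as np
+    >>> import scipy.stats as ss
+    >>> bin_edges = np.array([0.0, 0.5, 1.0, 2.0])
+    >>> probs = get_expected_probabilities(
+    ...     params=(0.0, 1.0),
+    ...     cdf_func=lambda x, p: ss.norm.cdf(x, loc=p[0], scale=p[1]),
+    ...     pdf_func=lambda x, p: ss.norm.pdf(x, loc=p[0], scale=p[1]),
+    ...     bin_edges=bin_edges,
+    ...     probability_method="cdf",
+    ... )
+    >>> probs.shape
+    (3,)
+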
+ """
+ if probability_method == "cdf":
+ # Compute the CDF at bin edges
+ cdf_vals = cdf_func(bin_edges, params)
+ # Compute probabilities for each bin
+ expected_probabilities = np.diff(cdf_vals)
+ # Replace any zero or negative probabilities with a very small positive number
+        # --> Otherwise the optimizer cannot make progress
+ expected_probabilities = np.maximum(expected_probabilities, 1e-10)
+ # Or integrate PDF over the bin range
+ else: # probability_method == "pdf":
+ # For each bin, integrate the PDF over the bin range
+ expected_probabilities = np.array(
+ [quad(lambda x: pdf_func(x, params), bin_edges[i], bin_edges[i + 1])[0] for i in range(len(bin_edges) - 1)],
+ )
+ if normalized:
+ # Normalize probabilities to sum to 1
+ total_probability = np.sum(expected_probabilities)
+ expected_probabilities /= total_probability
+ return expected_probabilities
+
+
+def get_adjusted_nt(cdf, params, Nt, bin_edges):
+ """Adjust Nt for the proportion of missing drops. See Johnson's et al., 2013 Eqs. 3 and 4."""
+ # Estimate proportion of missing drops (Johnson's 2011 Eqs. 3)
+ # --> Alternative: p = 1 - np.sum(pdf(diameter, params)* diameter_bin_width) # [-]
+ p = 1 - np.diff(cdf([bin_edges[0], bin_edges[-1]], params)).item() # [-]
+ # Adjusts Nt for the proportion of drops not observed
+ # p = np.clip(p, 0, 1 - 1e-12)
+ if np.isclose(p, 1, atol=1e-12):
+ return np.nan
+ return Nt / (1 - p) # [m-3]
+
+
+def compute_negative_log_likelihood(
+ params,
+ bin_edges,
+ counts,
+ cdf_func,
+ pdf_func,
+ param_constraints=None,
+ probability_method="cdf",
+ likelihood="multinomial",
+ truncated_likelihood=True,
+):
+ """
+ General negative log-likelihood function for fitting distributions to binned data.
+
+ Parameters
+ ----------
+ params : array-like
+ Parameters of the distribution.
+ bin_edges : array-like
+ Edges of the bins (length N+1).
+ counts : array-like
+ Observed counts in each bin (length N).
+ cdf_func : callable
+ Cumulative distribution function of the distribution.
+ pdf_func : callable
+ Probability density function of the distribution.
+ param_constraints : callable, optional
+ Function that checks if parameters are valid.
+ probability_method : str, optional
+ Method to compute expected probabilities, either 'cdf' or 'pdf'. Default is 'cdf'.
+ likelihood : str, optional
+ Type of likelihood to compute, either 'multinomial' or 'poisson'. Default is 'multinomial'.
+    truncated_likelihood : bool, optional
+        Whether to normalize the expected probabilities. Default is True.
+
+ Returns
+ -------
+ nll: float
+ The negative log-likelihood value.
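+
+    Notes
+    -----
+    For the multinomial likelihood, nll = -sum(counts * log(p)), where p are the expected bin probabilities.
+    For the Poisson likelihood, nll = -sum(counts * log(expected_counts) - expected_counts),
+    with expected_counts = p * sum(counts).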
+ """
+ # Check if parameters are valid
+ if param_constraints is not None and not param_constraints(params):
+ return np.inf
+
+    # Compute expected bin probabilities (normalized over the observed range when truncated_likelihood=True)
+ expected_probabilities = get_expected_probabilities(
+ params=params,
+ cdf_func=cdf_func,
+ pdf_func=pdf_func,
+ bin_edges=bin_edges,
+ probability_method=probability_method,
+ normalized=truncated_likelihood,
+ )
+
+ # Ensure expected probabilities are valid
+ if np.any(expected_probabilities <= 0):
+ return np.inf
+
+ # Compute negative log-likelihood
+ if likelihood == "poisson":
+ n_total = np.sum(counts)
+ expected_counts = expected_probabilities * n_total
+ expected_counts = np.maximum(expected_counts, 1e-10) # Avoid zero expected counts
+ nll = -np.sum(counts * np.log(expected_counts) - expected_counts)
+ else: # likelihood == "multinomial":
+ # Compute likelihood
+ nll = -np.sum(counts * np.log(expected_probabilities))
+ return nll
+
+
+def estimate_lognormal_parameters(
+ counts,
+ bin_edges,
+ probability_method="cdf",
+ likelihood="multinomial",
+ truncated_likelihood=True,
+ output_dictionary=True,
+ optimizer="Nelder-Mead",
+):
+ """
+ Estimate the parameters of a lognormal distribution given histogram data.
+
+ Parameters
+ ----------
+ counts : array-like
+ The counts for each bin in the histogram.
+ bin_edges : array-like
+ The edges of the bins.
+ probability_method : str, optional
+ The method to compute probabilities, either ``"cdf"`` or ``"pdf"``. The default is ``"cdf"``.
+ likelihood : str, optional
+ The likelihood function to use, either ``"multinomial"`` or ``"poisson"``.
+ The default is ``"multinomial"``.
+ truncated_likelihood : bool, optional
+ Whether to use truncated likelihood. The default is ``True``.
+ output_dictionary : bool, optional
+ Whether to return the output as a dictionary.
+        If False, returns a numpy array. The default is ``True``.
+ optimizer : str, optional
+ The optimization method to use. Default is ``"Nelder-Mead"``.
+
+ Returns
+ -------
+ dict or numpy.ndarray
+ The estimated parameters of the lognormal distribution.
+ If ``output_dictionary`` is ``True``, returns a dictionary with keys ``Nt``, ``mu``, and ``sigma``.
+        If ``output_dictionary`` is ``False``, returns a numpy array with values [Nt, mu, sigma].
+
+ Notes
+ -----
+ The lognormal distribution is defined as:
+ N(D) = Nt / (sqrt(2 * pi) * sigma * D) * exp(-(ln(D) - mu)**2 / (2 * sigma**2))
+ where Nt is the total number of counts, mu is the mean of the log of the distribution,
+ and sigma is the standard deviation of the log of the distribution.
+
+ References
+ ----------
+ .. [1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html#scipy.stats.lognorm
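+
+    Examples
+    --------
+    A minimal sketch on synthetic binned data (values are illustrative only):
+
+    >>> import numpy as np
+    >>> rng = np.random.default_rng(0)
+    >>> samples = rng.lognormal(mean=0.2, sigma=0.4, size=1000)
+    >>> bin_edges = np.linspace(0.0, 8.0, 33)
+    >>> counts, _ = np.histogram(samples, bins=bin_edges)
+    >>> params = estimate_lognormal_parameters(counts=counts, bin_edges=bin_edges)
+    >>> sorted(params)
+    ['Nt', 'mu', 'sigma']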
+ """
+ # LogNormal
+ # - mu = log(scale)
+ # - loc = 0
+
+ # Initialize bad results
+ null_output = (
+ {"Nt": np.nan, "mu": np.nan, "sigma": np.nan} if output_dictionary else np.array([np.nan, np.nan, np.nan])
+ )
+
+ # Define the CDF and PDF functions for the lognormal distribution
+ def lognorm_cdf(x, params):
+ sigma, scale = params
+ return ss.lognorm.cdf(x, sigma, loc=0, scale=scale)
+
+ def lognorm_pdf(x, params):
+ sigma, scale = params
+ return ss.lognorm.pdf(x, sigma, loc=0, scale=scale)
+
+ # Define valid parameters for the lognormal distribution
+ def param_constraints(params):
+ sigma, scale = params
+ return sigma > 0 and scale > 0
+
+    # Define initial guess for the parameters
+ initial_params = [1.0, 1.0] # sigma, scale
+
+ # Define bounds for sigma and scale
+ bounds = [(1e-6, None), (1e-6, None)]
+
+ # Minimize the negative log-likelihood
+ with suppress_warnings():
+ result = minimize(
+ compute_negative_log_likelihood,
+ initial_params,
+ args=(
+ bin_edges,
+ counts,
+ lognorm_cdf,
+ lognorm_pdf,
+ param_constraints,
+ probability_method,
+ likelihood,
+ truncated_likelihood,
+ ),
+ bounds=bounds,
+ method=optimizer,
+ )
+
+ # Check if the fit had success
+ if not result.success:
+ return null_output
+
+ # Define Nt
+ Nt = np.sum(counts).item()
+
+ # Retrieve parameters
+ params = result.x
+ if truncated_likelihood:
+ Nt = get_adjusted_nt(cdf=lognorm_cdf, params=params, Nt=Nt, bin_edges=bin_edges)
+ sigma, scale = params
+ mu = np.log(scale)
+
+ # Define output
+ output = {"Nt": Nt, "mu": mu, "sigma": sigma} if output_dictionary else np.array([Nt, mu, sigma])
+ return output
+
+
+def estimate_exponential_parameters(
+ counts,
+ bin_edges,
+ probability_method="cdf",
+ likelihood="multinomial",
+ truncated_likelihood=True,
+ output_dictionary=True,
+ optimizer="Nelder-Mead",
+):
+ """
+ Estimate the parameters of an exponential distribution given histogram data.
+
+ Parameters
+ ----------
+ counts : array-like
+ The counts for each bin in the histogram.
+ bin_edges : array-like
+ The edges of the bins.
+ probability_method : str, optional
+ The method to compute probabilities, either ``"cdf"`` or ``"pdf"``. The default is ``"cdf"``.
+ likelihood : str, optional
+ The likelihood function to use, either ``"multinomial"`` or ``"poisson"``.
+ The default is ``"multinomial"``.
+ truncated_likelihood : bool, optional
+ Whether to use truncated likelihood. The default is ``True``.
+ output_dictionary : bool, optional
+ Whether to return the output as a dictionary.
+        If False, returns a numpy array. The default is ``True``.
+ optimizer : str, optional
+ The optimization method to use. Default is ``"Nelder-Mead"``.
+
+ Returns
+ -------
+ dict or numpy.ndarray
+ The estimated parameters of the exponential distribution.
+ If ``output_dictionary`` is ``True``, returns a dictionary with keys ``N0`` and ``Lambda``.
+        If ``output_dictionary`` is ``False``, returns a numpy array with [N0, Lambda].
+
+ Notes
+ -----
+ The exponential distribution is defined as:
+ N(D) = N0 * exp(-Lambda * D) = Nt * Lambda * exp(-Lambda * D)
+ where Lambda = 1 / scale and N0 = Nt * Lambda.
+
+ References
+ ----------
+ .. [1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.expon.html
+ """
+ # Initialize bad results
+ null_output = {"N0": np.nan, "Lambda": np.nan} if output_dictionary else np.array([np.nan, np.nan])
+
+ # Define the CDF and PDF functions for the exponential distribution
+ def exp_cdf(x, params):
+ scale = params[0]
+ return ss.expon.cdf(x, loc=0, scale=scale)
+
+ def exp_pdf(x, params):
+ scale = params[0]
+ return ss.expon.pdf(x, loc=0, scale=scale)
+
+ # Define valid parameters for the exponential distribution
+ def param_constraints(params):
+ scale = params[0]
+ return scale > 0
+
+    # Define initial guess for the scale parameter
+ initial_params = [1.0] # scale
+
+ # Define bounds for scale
+ bounds = [(1e-6, None)]
+
+ # Minimize the negative log-likelihood
+ with suppress_warnings():
+ result = minimize(
+ compute_negative_log_likelihood,
+ initial_params,
+ args=(
+ bin_edges,
+ counts,
+ exp_cdf,
+ exp_pdf,
+ param_constraints,
+ probability_method,
+ likelihood,
+ truncated_likelihood,
+ ),
+ bounds=bounds,
+ method=optimizer,
+ )
+
+ # Check if the fit had success
+ if not result.success:
+ return null_output
+
+ # Define Nt
+ Nt = np.sum(counts).item()
+
+ # Retrieve parameters
+ params = result.x
+ if truncated_likelihood:
+ Nt = get_adjusted_nt(cdf=exp_cdf, params=params, Nt=Nt, bin_edges=bin_edges)
+ scale = params[0]
+ Lambda = 1 / scale
+ N0 = Nt * Lambda
+
+ # Define output
+ output = {"N0": N0, "Lambda": Lambda} if output_dictionary else np.array([N0, Lambda])
+ return output
+
+
+def estimate_gamma_parameters(
+ counts,
+ a,
+ scale,
+ bin_edges,
+ probability_method="cdf",
+ likelihood="multinomial",
+ truncated_likelihood=True,
+ output_dictionary=True,
+ optimizer="Nelder-Mead",
+):
+ """
+ Estimate the parameters of a gamma distribution given histogram data.
+
+ Parameters
+ ----------
+ counts : array-like
+ The counts for each bin in the histogram.
+ a: float
+ The shape parameter of the scipy.stats.gamma distribution.
+ A good default value is 1.
+ scale: float
+ The scale parameter of the scipy.stats.gamma distribution.
+ A good default value is 1.
+ bin_edges : array-like
+ The edges of the bins.
+ probability_method : str, optional
+ The method to compute probabilities, either ``"cdf"`` or ``"pdf"``. The default is ``"cdf"``.
+ likelihood : str, optional
+ The likelihood function to use, either ``"multinomial"`` or ``"poisson"``.
+ The default is ``"multinomial"``.
+ truncated_likelihood : bool, optional
+ Whether to use truncated likelihood. The default is ``True``.
+ output_dictionary : bool, optional
+ Whether to return the output as a dictionary.
+        If False, returns a numpy array. The default is ``True``.
+ optimizer : str, optional
+ The optimization method to use. Default is ``"Nelder-Mead"``.
+
+ Returns
+ -------
+ dict or numpy.ndarray
+ The estimated parameters of the gamma distribution.
+ If ``output_dictionary`` is ``True``, returns a dictionary with keys ``N0``, ``mu`` and ``Lambda``.
+        If ``output_dictionary`` is ``False``, returns a numpy array with [N0, mu, Lambda].
+
+ Notes
+ -----
+ The gamma distribution is defined as:
+ N(D) = N0 * D**mu * exp(-Lambda*D)
+ where Lambda = 1/scale, and mu = a - 1 with ``a`` being the shape parameter of the gamma distribution.
+ N0 is defined as N0 = Nt*Lambda**(mu+1)/gamma(mu+1).
+
+ References
+ ----------
+ .. [1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gamma.html
+
+ """
+ # Initialize bad results
+ null_output = (
+ {"N0": np.nan, "mu": np.nan, "lambda": np.nan} if output_dictionary else np.array([np.nan, np.nan, np.nan])
+ )
+
+ # Define the CDF and PDF functions for the gamma distribution
+ def gamma_cdf(x, params):
+ a, scale = params
+ return ss.gamma.cdf(x, a, loc=0, scale=scale)
+
+ def gamma_pdf(x, params):
+ a, scale = params
+ return ss.gamma.pdf(x, a, loc=0, scale=scale)
+
+ # Define valid parameters for the gamma distribution
+    # mu = -0.99 essentially corresponds to a vertical line
+    def param_constraints(params):
+        a, scale = params
+        return a > 0.1 and scale > 0  # using a > 0 causes numerical troubles
+
+    # Define initial guess for the parameters
+ initial_params = [a, scale] # (mu=a-1, a=mu+1)
+
+ # Define bounds for a and scale
+ bounds = [(1e-6, None), (1e-6, None)]
+
+ # Minimize the negative log-likelihood
+ with suppress_warnings():
+ result = minimize(
+ compute_negative_log_likelihood,
+ initial_params,
+ args=(
+ bin_edges,
+ counts,
+ gamma_cdf,
+ gamma_pdf,
+ param_constraints,
+ probability_method,
+ likelihood,
+ truncated_likelihood,
+ ),
+ method=optimizer,
+ bounds=bounds,
+ )
+
+ # Check if the fit had success
+ if not result.success:
+ return null_output
+
+ # Define Nt
+ Nt = np.sum(counts).item()
+
+ # Retrieve parameters
+ params = result.x
+ if truncated_likelihood:
+ Nt = get_adjusted_nt(cdf=gamma_cdf, params=params, Nt=Nt, bin_edges=bin_edges)
+ a, scale = params
+ mu = a - 1
+ Lambda = 1 / scale
+
+ # Compute N0
+ # - Use logarithmic computations to prevent overflow
+ # - N0 = Nt * Lambda ** (mu + 1) / gamma(mu + 1)
+ with suppress_warnings():
+ log_N0 = np.log(Nt) + (mu + 1) * np.log(Lambda) - gammaln(mu + 1)
+ N0 = np.exp(log_N0)
+ if not np.isfinite(N0):
+ N0 = np.nan
+
+ # Define output
+ output = {"N0": N0, "mu": mu, "Lambda": Lambda} if output_dictionary else np.array([N0, mu, Lambda])
+ return output
+
+
+def _get_initial_gamma_parameters(ds, mom_method=None):
+ if mom_method is None:
+ ds_init = xr.Dataset(
+ {
+ "a": xr.ones_like(ds["M1"]),
+ "scale": xr.ones_like(ds["M1"]),
+ },
+ )
+ else:
+ ds_init = get_mom_parameters(
+ ds=ds,
+ psd_model="GammaPSD",
+ mom_methods=mom_method,
+ )
+ ds_init["a"] = ds_init["mu"] + 1
+ ds_init["scale"] = 1 / ds_init["Lambda"]
+ return ds_init
+
+
+def get_gamma_parameters(
+ ds,
+ init_method=None,
+ probability_method="cdf",
+ likelihood="multinomial",
+ truncated_likelihood=True,
+ optimizer="Nelder-Mead",
+):
+ """
+ Estimate gamma distribution parameters for drop size distribution (DSD) data.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Input dataset containing drop size distribution data. It must include the following variables:
+ - ``drop_number_concentration``: The number concentration of drops.
+        - ``diameter_bin_width``: The width of each diameter bin.
+        - ``diameter_bin_lower``: The lower bounds of the diameter bins.
+        - ``diameter_bin_upper``: The upper bounds of the diameter bins.
+        - ``diameter_bin_center``: The center values of the diameter bins.
+        - The moment variables (M0 ... M6) required to compute the initial parameters
+          with the specified ``init_method``.
+    init_method : str or list, optional
+ The method(s) of moments used to initialize the gamma parameters.
+ If None, the scale parameter is set to 1 and mu to 0 (a=1).
+ probability_method : str, optional
+ Method to compute probabilities. The default is ``cdf``.
+ likelihood : str, optional
+ Likelihood function to use for fitting. The default is ``multinomial``.
+ truncated_likelihood : bool, optional
+ Whether to use truncated likelihood. The default is ``True``.
+ optimizer : str, optional
+ Optimization method to use. The default is ``Nelder-Mead``.
+
+ Returns
+ -------
+ xarray.Dataset
+ Dataset containing the estimated gamma distribution parameters:
+ - ``N0``: Intercept parameter.
+ - ``mu``: Shape parameter.
+ - ``Lambda``: Scale parameter.
+ The dataset will also have an attribute ``disdrodb_psd_model`` set to ``GammaPSD``.
+
+ Notes
+ -----
+    The function uses `xr.apply_ufunc` to fit the gamma distribution parameters
+ in parallel, leveraging Dask for parallel computation.
+
+ """
+ # Define inputs
+ counts = ds["drop_number_concentration"] * ds["diameter_bin_width"]
+ diameter_breaks = np.append(ds["diameter_bin_lower"].data, ds["diameter_bin_upper"].data[-1])
+
+ # Define initial parameters (a, scale)
+ ds_init = _get_initial_gamma_parameters(ds, mom_method=init_method)
+
+ # Define kwargs
+ kwargs = {
+ "output_dictionary": False,
+ "bin_edges": diameter_breaks,
+ "probability_method": probability_method,
+ "likelihood": likelihood,
+ "truncated_likelihood": truncated_likelihood,
+ "optimizer": optimizer,
+ }
+
+ # Fit distribution in parallel
+ da_params = xr.apply_ufunc(
+ estimate_gamma_parameters,
+ counts,
+ ds_init["a"],
+ ds_init["scale"],
+ kwargs=kwargs,
+ input_core_dims=[["diameter_bin_center"], [], []],
+ output_core_dims=[["parameters"]],
+ vectorize=True,
+ dask="parallelized",
+ dask_gufunc_kwargs={"output_sizes": {"parameters": 3}}, # lengths of the new output_core_dims dimensions.
+ output_dtypes=["float64"],
+ )
+
+ # Add parameters coordinates
+ da_params = da_params.assign_coords({"parameters": ["N0", "mu", "Lambda"]})
+
+ # Create parameters dataset
+ ds_params = da_params.to_dataset(dim="parameters")
+
+ # Add DSD model name to the attribute
+ ds_params.attrs["disdrodb_psd_model"] = "GammaPSD"
+ return ds_params
+
+
+def get_lognormal_parameters(
+ ds,
+ probability_method="cdf",
+ likelihood="multinomial",
+ truncated_likelihood=True,
+ optimizer="Nelder-Mead",
+):
+ """
+ Estimate lognormal distribution parameters for drop size distribution (DSD) data.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Input dataset containing drop size distribution data. It must include the following variables:
+ - ``drop_number_concentration``: The number concentration of drops.
+        - ``diameter_bin_width``: The width of each diameter bin.
+ - ``diameter_bin_lower``: The lower bounds of the diameter bins.
+ - ``diameter_bin_upper``: The upper bounds of the diameter bins.
+ - ``diameter_bin_center``: The center values of the diameter bins.
+ probability_method : str, optional
+ Method to compute probabilities. The default is ``cdf``.
+ likelihood : str, optional
+ Likelihood function to use for fitting. The default is ``multinomial``.
+ truncated_likelihood : bool, optional
+ Whether to use truncated likelihood. The default is ``True``.
+ optimizer : str, optional
+ Optimization method to use. The default is ``Nelder-Mead``.
+
+ Returns
+ -------
+ xarray.Dataset
+ Dataset containing the estimated lognormal distribution parameters:
+ - ``Nt``: Total number concentration.
+ - ``mu``: Mean of the lognormal distribution.
+ - ``sigma``: Standard deviation of the lognormal distribution.
+ The resulting dataset will have an attribute ``disdrodb_psd_model`` set to ``LognormalPSD``.
+
+ Notes
+ -----
+ The function uses `xr.apply_ufunc` to fit the lognormal distribution parameters
+ in parallel, leveraging Dask for parallel computation.
+
+ """
+ # Define inputs
+ counts = ds["drop_number_concentration"] * ds["diameter_bin_width"]
+ diameter_breaks = np.append(ds["diameter_bin_lower"].data, ds["diameter_bin_upper"].data[-1])
+
+ # Define kwargs
+ kwargs = {
+ "output_dictionary": False,
+ "bin_edges": diameter_breaks,
+ "probability_method": probability_method,
+ "likelihood": likelihood,
+ "truncated_likelihood": truncated_likelihood,
+ "optimizer": optimizer,
+ }
+
+ # Fit distribution in parallel
+ da_params = xr.apply_ufunc(
+ estimate_lognormal_parameters,
+ counts,
+ kwargs=kwargs,
+ input_core_dims=[["diameter_bin_center"]],
+ output_core_dims=[["parameters"]],
+ vectorize=True,
+ dask="parallelized",
+ dask_gufunc_kwargs={"output_sizes": {"parameters": 3}}, # lengths of the new output_core_dims dimensions.
+ output_dtypes=["float64"],
+ )
+
+ # Add parameters coordinates
+ da_params = da_params.assign_coords({"parameters": ["Nt", "mu", "sigma"]})
+
+ # Create parameters dataset
+ ds_params = da_params.to_dataset(dim="parameters")
+
+ # Add DSD model name to the attribute
+ ds_params.attrs["disdrodb_psd_model"] = "LognormalPSD"
+
+ return ds_params
+
+
+def get_exponential_parameters(
+ ds,
+ probability_method="cdf",
+ likelihood="multinomial",
+ truncated_likelihood=True,
+ optimizer="Nelder-Mead",
+):
+ """
+ Estimate the parameters of an exponential particle size distribution (PSD) from the given dataset.
+
+ Fitting this model is equivalent to fitting a GammaPSD model fixing ``mu`` to 0.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Input dataset containing drop number concentration data and diameter information.
+ It must include the following variables:
+ - ``drop_number_concentration``: The number concentration of drops.
+        - ``diameter_bin_width``: The width of each diameter bin.
+ - ``diameter_bin_lower``: The lower bounds of the diameter bins.
+ - ``diameter_bin_upper``: The upper bounds of the diameter bins.
+ - ``diameter_bin_center``: The center values of the diameter bins.
+ probability_method : str, optional
+ Method to compute probabilities. The default is ``cdf``.
+ likelihood : str, optional
+ Likelihood function to use for fitting. The default is ``multinomial``.
+ truncated_likelihood : bool, optional
+ Whether to use truncated likelihood. The default is ``True``.
+ optimizer : str, optional
+ Optimization method to use. The default is ``Nelder-Mead``.
+
+ Returns
+ -------
+ xarray.Dataset
+        Dataset containing the estimated exponential distribution parameters:
+ - ``N0``: Intercept parameter.
+ - ``Lambda``: Scale parameter.
+ The resulting dataset will have an attribute ``disdrodb_psd_model`` set to ``ExponentialPSD``.
+
+ Notes
+ -----
+ The function uses `xr.apply_ufunc` to fit the exponential distribution parameters
+ in parallel, leveraging Dask for parallel computation.
+
+ """
+ # Define inputs
+ counts = ds["drop_number_concentration"] * ds["diameter_bin_width"]
+ diameter_breaks = np.append(ds["diameter_bin_lower"].data, ds["diameter_bin_upper"].data[-1])
+
+ # Define kwargs
+ kwargs = {
+ "output_dictionary": False,
+ "bin_edges": diameter_breaks,
+ "probability_method": probability_method,
+ "likelihood": likelihood,
+ "truncated_likelihood": truncated_likelihood,
+ "optimizer": optimizer,
+ }
+
+ # Fit distribution in parallel
+ da_params = xr.apply_ufunc(
+ estimate_exponential_parameters,
+ counts,
+ kwargs=kwargs,
+ input_core_dims=[["diameter_bin_center"]],
+ output_core_dims=[["parameters"]],
+ vectorize=True,
+ dask="parallelized",
+ dask_gufunc_kwargs={"output_sizes": {"parameters": 2}}, # lengths of the new output_core_dims dimensions.
+ output_dtypes=["float64"],
+ )
+
+ # Add parameters coordinates
+ da_params = da_params.assign_coords({"parameters": ["N0", "Lambda"]})
+
+ # Create parameters dataset
+ ds_params = da_params.to_dataset(dim="parameters")
+
+ # Add DSD model name to the attribute
+ ds_params.attrs["disdrodb_psd_model"] = "ExponentialPSD"
+ return ds_params
+
+
+####-------------------------------------------------------------------------------------------------------------------.
+
+
+def _estimate_gamma_parameters_johnson(
+ drop_number_concentration,
+ diameter,
+ diameter_breaks,
+ output_dictionary=True,
+ method="Nelder-Mead",
+ mu=0.5,
+ Lambda=3,
+ **kwargs,
+):
+ """Deprecated Maximum likelihood estimation of Gamma model.
+
+    N(D) = N_t * lambda**(mu+1) / gamma(mu+1) * D**mu * exp(-lambda*D)
+
+    Parameters
+    ----------
+    drop_number_concentration : array-like
+        The DSD for which to find parameters [mm-1 m-3].
+    diameter : array-like
+        Class-centre diameters for each DSD bin [mm].
+    diameter_breaks : array-like
+        Diameter bin edges [mm].
+    mu : float, optional
+        Initial value for the shape parameter mu [-].
+    Lambda : float, optional
+        Initial value for the slope parameter lambda [mm-1].
+    **kwargs
+        Extra arguments for the optimization process.
+
+ Returns
+ -------
+    Dictionary with the estimated mu (shape), Lambda (slope) and N0 (intercept) parameters.
+
+ Notes
+ -----
+    The counts in the last bin are not accounted for in the fitting procedure.
+
+ References
+ ----------
+ Johnson, R. W., D. V. Kliche, and P. L. Smith, 2011: Comparison of Estimators for Parameters of Gamma Distributions
+ with Left-Truncated Samples. J. Appl. Meteor. Climatol., 50, 296-310, https://doi.org/10.1175/2010JAMC2478.1
+
+    Johnson, R.W., Kliche, D., & Smith, P.L. (2014).
+    Maximum likelihood estimation of gamma parameters for coarsely binned and truncated raindrop size data.
+    Quarterly Journal of the Royal Meteorological Society, 140, https://doi.org/10.1002/qj.2209
+
+ """
+ # Initialize bad results
+ if output_dictionary:
+ null_output = {"mu": np.nan, "lambda": np.nan, "N0": np.nan}
+ else:
+ null_output = np.array([np.nan, np.nan, np.nan])
+
+ # Initialize parameters
+ # --> Ideally with method of moments estimate
+    # --> See Eq. 8 of Johnson et al. 2013
+ x0 = [mu, Lambda]
+
+ # Compute diameter_bin_width
+ diameter_bin_width = np.diff(diameter_breaks)
+
+ # Convert drop_number_concentration from mm-1 m-3 to m-3.
+ spectra = np.asarray(drop_number_concentration) * diameter_bin_width
+
+ # Define cost function
+ # - Parameter to be optimized on first positions
+ def _cost_function(parameters, spectra, diameter_breaks):
+ # Assume spectra to be in unit [m-3] (drop_number_concentration*diameter_bin_width) !
+ mu, Lambda = parameters
+ # Precompute gamma integrals between various diameter bins
+ # - gamminc(mu+1) already divides the integral by gamma(mu+1) !
+ pgamma_d = gammainc(mu + 1, Lambda * diameter_breaks)
+ # Compute probability with interval
+ delta_pgamma_bins = pgamma_d[1:] - pgamma_d[:-1]
+ # Compute normalization over interval
+ denominator = pgamma_d[-1] - pgamma_d[0]
+ # Compute cost function
+ # a = mu - 1, x = lambda
+ if mu > -1 and Lambda > 0:
+ cost = np.sum(-spectra * np.log(delta_pgamma_bins / denominator))
+ return cost
+ return np.inf
+
+ # Minimize the cost function
+ with suppress_warnings():
+ bounds = [(0, None), (0, None)] # Force mu and lambda to be non-negative
+ res = minimize(
+ _cost_function,
+ x0=x0,
+ args=(spectra, diameter_breaks),
+ method=method,
+ bounds=bounds,
+ **kwargs,
+ )
+
+ # Check if the fit had success
+ if not res.success:
+ return null_output
+
+ # Extract parameters
+ mu = res.x[0] # [-]
+ Lambda = res.x[1] # [mm-1]
+
+ # Estimate tilde_N_T using the total drop concentration
+ tilde_N_T = np.sum(drop_number_concentration * diameter_bin_width) # [m-3]
+
+    # Estimate proportion of missing drops (Johnson et al. 2011, Eq. 3)
+ with suppress_warnings():
+ D = diameter
+ p = 1 - np.sum((Lambda ** (mu + 1)) / gamma(mu + 1) * D**mu * np.exp(-Lambda * D) * diameter_bin_width) # [-]
+
+    # Convert tilde_N_T to N_T using Johnson et al. 2013, Eqs. 3 and 4
+ # - Adjusts for the proportion of drops not observed
+ N_T = tilde_N_T / (1 - p) # [m-3]
+
+ # Compute N0
+ N0 = N_T * (Lambda ** (mu + 1)) / gamma(mu + 1) # [m-3 * mm^(-mu-1)]
+
+ # Compute Dm
+ # Dm = (mu + 4)/ Lambda
+
+ # Compute Nw
+ # Nw = N0* D^mu / f(mu) , with f(mu of the Normalized PSD)
+
+ # Define output
+ output = {"mu": mu, "Lambda": Lambda, "N0": N0} if output_dictionary else np.array([mu, Lambda, N0])
+ return output
+
+
+def get_gamma_parameters_johnson2014(ds, method="Nelder-Mead"):
+ """Deprecated model. See Gamma Model with truncated_likelihood and 'pdf'."""
+ drop_number_concentration = ds["drop_number_concentration"]
+ diameter = ds["diameter_bin_center"]
+ diameter_breaks = np.append(ds["diameter_bin_lower"].data, ds["diameter_bin_upper"].data[-1])
+ # Define kwargs
+ kwargs = {
+ "output_dictionary": False,
+ "diameter_breaks": diameter_breaks,
+ "method": method,
+ }
+ da_params = xr.apply_ufunc(
+ _estimate_gamma_parameters_johnson,
+ drop_number_concentration,
+ diameter,
+ # diameter_bin_width,
+ kwargs=kwargs,
+ input_core_dims=[["diameter_bin_center"], ["diameter_bin_center"]], # ["diameter_bin_center"],
+ output_core_dims=[["parameters"]],
+ vectorize=True,
+ )
+
+ # Add parameters coordinates
+ da_params = da_params.assign_coords({"parameters": ["mu", "Lambda", "N0"]})
+
+ # Convert to skill Dataset
+ ds_params = da_params.to_dataset(dim="parameters")
+ return ds_params
+
+
+####-----------------------------------------------------------------------------------------.
+#### Grid Search (GS)
+
+
+def _compute_rain_rate(ND, D, dD, V):
+ axis = 1 if ND.ndim == 2 else None
+ rain_rate = np.pi / 6 * np.sum(ND * V * (D / 1000) ** 3 * dD, axis=axis) * 3600 * 1000
+ return rain_rate # mm/h
+
+
+def _compute_lwc(ND, D, dD, rho_w=1000):
+ axis = 1 if ND.ndim == 2 else None
+ lwc = np.pi / 6.0 * (rho_w * 1000) * np.sum((D / 1000) ** 3 * ND * dD, axis=axis)
+ return lwc # g/m3
+
+
+def _compute_z(ND, D, dD):
+ axis = 1 if ND.ndim == 2 else None
+ z = np.sum(((D) ** 6 * ND * dD), axis=axis) # mm⁶·m⁻³
+ Z = 10 * np.log10(z)
+ return Z
+
+
+def _compute_cost_function(ND_obs, ND_preds, D, dD, V, target, transformation, error_order):
+ # Assume ND_obs of shape (D bins) and ND_preds of shape (# params, D bins)
+ if target == "ND":
+ if transformation == "identity":
+ errors = np.mean(np.abs(ND_obs[None, :] - ND_preds) ** error_order, axis=1)
+ if transformation == "log":
+ errors = np.mean(np.abs(np.log(ND_obs[None, :] + 1) - np.log(ND_preds + 1)) ** error_order, axis=1)
+ if transformation == "np.sqrt":
+ errors = np.mean(np.abs(np.sqrt(ND_obs[None, :]) - np.sqrt(ND_preds)) ** error_order, axis=1)
+ elif target == "Z":
+ errors = np.abs(_compute_z(ND_obs, D, dD) - _compute_z(ND_preds, D, dD))
+ elif target == "R":
+ errors = np.abs(_compute_rain_rate(ND_obs, D, dD, V) - _compute_rain_rate(ND_preds, D, dD, V))
+ elif target == "LWC":
+ errors = np.abs(_compute_lwc(ND_obs, D, dD) - _compute_lwc(ND_preds, D, dD))
+ else:
+ raise ValueError("Invalid target")
+ return errors
+
+
+def apply_exponential_gs(
+ Nt,
+ ND_obs,
+ V,
+ # Coords
+ D,
+ dD,
+ # Error options
+ target,
+ transformation,
+ error_order,
+):
+ """Apply Grid Search for the ExponentialPSD distribution."""
+ # Define set of mu values
+ lambda_arr = np.arange(0.01, 20, step=0.01)
+
+ # Perform grid search
+ with suppress_warnings():
+ # Compute ND
+ N0_arr = Nt * lambda_arr
+ ND_preds = ExponentialPSD.formula(D=D[None, :], N0=N0_arr[:, None], Lambda=lambda_arr[:, None])
+
+ # Compute errors
+ errors = _compute_cost_function(
+ ND_obs=ND_obs,
+ ND_preds=ND_preds,
+ D=D,
+ dD=dD,
+ V=V,
+ target=target,
+ transformation=transformation,
+ error_order=error_order,
+ )
+
+ # Identify best parameter set
+ best_index = np.argmin(errors)
+ return np.array([N0_arr[best_index].item(), lambda_arr[best_index].item()])
+
+
+def _apply_gamma_gs(mu_values, lambda_values, Nt, ND_obs, D, dD, V, target, transformation, error_order):
+ """Routine for GammaPSD parameters grid search."""
+ # Define combinations of parameters for grid search
+ combo = np.meshgrid(mu_values, lambda_values, indexing="xy")
+ mu_arr = combo[0].ravel()
+ lambda_arr = combo[1].ravel()
+
+ # Perform grid search
+ with suppress_warnings():
+ # Compute ND
+ N0 = np.exp(np.log(Nt) + (mu_arr[:, None] + 1) * np.log(lambda_arr[:, None]) - gammaln(mu_arr[:, None] + 1))
+ ND_preds = GammaPSD.formula(D=D[None, :], N0=N0, Lambda=lambda_arr[:, None], mu=mu_arr[:, None])
+
+ # Compute errors
+ errors = _compute_cost_function(
+ ND_obs=ND_obs,
+ ND_preds=ND_preds,
+ D=D,
+ dD=dD,
+ V=V,
+ target=target,
+ transformation=transformation,
+ error_order=error_order,
+ )
+
+ # Best parameter
+ best_index = np.argmin(errors)
+ return N0[best_index].item(), mu_arr[best_index].item(), lambda_arr[best_index].item()
+
+
+def apply_gamma_gs(
+ Nt,
+ ND_obs,
+ V,
+ # Coords
+ D,
+ dD,
+ # Error options
+ target,
+ transformation,
+ error_order,
+):
+ """Estimate GammaPSD model parameters using Grid Search."""
+ # Define initial set of parameters
+ mu_step = 0.5
+ lambda_step = 0.5
+ mu_values = np.arange(0.01, 20, step=mu_step)
+ lambda_values = np.arange(0, 60, step=lambda_step)
+
+ # First round of GS
+ N0, mu, Lambda = _apply_gamma_gs(
+ mu_values=mu_values,
+ lambda_values=lambda_values,
+ Nt=Nt,
+ ND_obs=ND_obs,
+ D=D,
+ dD=dD,
+ V=V,
+ target=target,
+ transformation=transformation,
+ error_order=error_order,
+ )
+
+ # Second round of GS
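+    # - Refine the search around the first-round optimum (a ±2-step window) with a 20x finer step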
+ mu_values = np.arange(mu - mu_step * 2, mu + mu_step * 2, step=mu_step / 20)
+ lambda_values = np.arange(Lambda - lambda_step * 2, Lambda + lambda_step * 2, step=lambda_step / 20)
+ N0, mu, Lambda = _apply_gamma_gs(
+ mu_values=mu_values,
+ lambda_values=lambda_values,
+ Nt=Nt,
+ ND_obs=ND_obs,
+ D=D,
+ dD=dD,
+ V=V,
+ target=target,
+ transformation=transformation,
+ error_order=error_order,
+ )
+
+ return np.array([N0, mu, Lambda])
+
+
+def _apply_lognormal_gs(mu_values, sigma_values, Nt, ND_obs, D, dD, V, target, transformation, error_order):
+ """Routine for LognormalPSD parameters grid search."""
+ # Define combinations of parameters for grid search
+ combo = np.meshgrid(mu_values, sigma_values, indexing="xy")
+ mu_arr = combo[0].ravel()
+ sigma_arr = combo[1].ravel()
+
+ # Perform grid search
+ with suppress_warnings():
+ # Compute ND
+ ND_preds = LognormalPSD.formula(D=D[None, :], Nt=Nt, mu=mu_arr[:, None], sigma=sigma_arr[:, None])
+
+ # Compute errors
+ errors = _compute_cost_function(
+ ND_obs=ND_obs,
+ ND_preds=ND_preds,
+ D=D,
+ dD=dD,
+ V=V,
+ target=target,
+ transformation=transformation,
+ error_order=error_order,
+ )
+
+ # Best parameter
+ best_index = np.argmin(errors)
+ return Nt, mu_arr[best_index].item(), sigma_arr[best_index].item()
+
+
+def apply_lognormal_gs(
+ Nt,
+ ND_obs,
+ V,
+ # Coords
+ D,
+ dD,
+ # Error options
+ target,
+ transformation,
+ error_order,
+):
+ """Estimate LognormalPSD model parameters using Grid Search."""
+ # Define initial set of parameters
+ mu_step = 0.5
+ sigma_step = 0.5
+ mu_values = np.arange(0.01, 20, step=mu_step) # TODO: define realistic values
+ sigma_values = np.arange(0, 20, step=sigma_step) # TODO: define realistic values
+
+ # First round of GS
+ Nt, mu, sigma = _apply_lognormal_gs(
+ mu_values=mu_values,
+ sigma_values=sigma_values,
+ Nt=Nt,
+ ND_obs=ND_obs,
+ D=D,
+ dD=dD,
+ V=V,
+ target=target,
+ transformation=transformation,
+ error_order=error_order,
+ )
+
+ # Second round of GS
+ mu_values = np.arange(mu - mu_step * 2, mu + mu_step * 2, step=mu_step / 20)
+ sigma_values = np.arange(sigma - sigma_step * 2, sigma + sigma_step * 2, step=sigma_step / 20)
+ Nt, mu, sigma = _apply_lognormal_gs(
+ mu_values=mu_values,
+ sigma_values=sigma_values,
+ Nt=Nt,
+ ND_obs=ND_obs,
+ D=D,
+ dD=dD,
+ V=V,
+ target=target,
+ transformation=transformation,
+ error_order=error_order,
+ )
+
+ return np.array([Nt, mu, sigma])
+
+
+def apply_normalized_gamma_gs(
+ Nw,
+ D50,
+ ND_obs,
+ V,
+ # Coords
+ D,
+ dD,
+ # Error options
+ target,
+ transformation,
+ error_order,
+):
+ """Estimate NormalizedGammaPSD model parameters using Grid Search."""
+ # Define set of mu values
+ mu_arr = np.arange(0.01, 20, step=0.01)
+
+ # Perform grid search
+ with suppress_warnings():
+ # Compute ND
+ ND_preds = NormalizedGammaPSD.formula(D=D[None, :], D50=D50, Nw=Nw, mu=mu_arr[:, None])
+
+ # Compute errors
+ errors = _compute_cost_function(
+ ND_obs=ND_obs,
+ ND_preds=ND_preds,
+ D=D,
+ dD=dD,
+ V=V,
+ target=target,
+ transformation=transformation,
+ error_order=error_order,
+ )
+
+ # Identify best parameter set
+ mu = mu_arr[np.argmin(errors)]
+ return np.array([Nw, mu, D50])
+
+
+def get_exponential_parameters_gs(ds, target="ND", transformation="log", error_order=1):
+ """Estimate the parameters of an Exponential distribution using Grid Search."""
+ # "target": ["ND", "LWC", "Z", "R"]
+ # "transformation": "log", "identity", "sqrt", # only for drop_number_concentration
+ # "error_order": 1, # MAE/MSE ... only for drop_number_concentration
+
+ # Define kwargs
+ kwargs = {
+ "D": ds["diameter_bin_center"].data,
+ "dD": ds["diameter_bin_width"].data,
+ "target": target,
+ "transformation": transformation,
+ "error_order": error_order,
+ }
+
+ # Fit distribution in parallel
+ da_params = xr.apply_ufunc(
+ apply_exponential_gs,
+ # Variables varying over time
+ ds["Nt"],
+ ds["drop_number_concentration"],
+ ds["fall_velocity"],
+ # Other options
+ kwargs=kwargs,
+ # Settings
+ input_core_dims=[[], ["diameter_bin_center"], ["diameter_bin_center"]],
+ output_core_dims=[["parameters"]],
+ vectorize=True,
+ dask="parallelized",
+ dask_gufunc_kwargs={"output_sizes": {"parameters": 2}}, # lengths of the new output_core_dims dimensions.
+ output_dtypes=["float64"],
+ )
+
+ # Add parameters coordinates
+ da_params = da_params.assign_coords({"parameters": ["N0", "Lambda"]})
+
+ # Create parameters dataset
+ ds_params = da_params.to_dataset(dim="parameters")
+
+ # Add DSD model name to the attribute
+ ds_params.attrs["disdrodb_psd_model"] = "ExponentialPSD"
+ return ds_params
+
+
+def get_gamma_parameters_gs(ds, target="ND", transformation="log", error_order=1):
+ """Compute Grid Search to identify mu and Lambda Gamma distribution parameters."""
+ # "target": ["ND", "LWC", "Z", "R"]
+ # "transformation": "log", "identity", "sqrt", # only for drop_number_concentration
+ # "error_order": 1, # MAE/MSE ... only for drop_number_concentration
+
+ # Define kwargs
+ kwargs = {
+ "D": ds["diameter_bin_center"].data,
+ "dD": ds["diameter_bin_width"].data,
+ "target": target,
+ "transformation": transformation,
+ "error_order": error_order,
+ }
+
+ # Fit distribution in parallel
+ da_params = xr.apply_ufunc(
+ apply_gamma_gs,
+ # Variables varying over time
+ ds["Nt"],
+ ds["drop_number_concentration"],
+ ds["fall_velocity"],
+ # Other options
+ kwargs=kwargs,
+ # Settings
+ input_core_dims=[[], ["diameter_bin_center"], ["diameter_bin_center"]],
+ output_core_dims=[["parameters"]],
+ vectorize=True,
+ dask="parallelized",
+ dask_gufunc_kwargs={"output_sizes": {"parameters": 3}}, # lengths of the new output_core_dims dimensions.
+ output_dtypes=["float64"],
+ )
+
+ # Add parameters coordinates
+ da_params = da_params.assign_coords({"parameters": ["N0", "mu", "Lambda"]})
+
+ # Create parameters dataset
+ ds_params = da_params.to_dataset(dim="parameters")
+
+ # Add DSD model name to the attribute
+ ds_params.attrs["disdrodb_psd_model"] = "GammaPSD"
+ return ds_params
+
+
+def get_lognormal_parameters_gs(ds, target="ND", transformation="log", error_order=1):
+ """Compute Grid Search to identify mu and sigma lognormal distribution parameters."""
+ # "target": ["ND", "LWC", "Z", "R"]
+ # "transformation": "log", "identity", "sqrt", # only for drop_number_concentration
+ # "error_order": 1, # MAE/MSE ... only for drop_number_concentration
+
+ # Define kwargs
+ kwargs = {
+ "D": ds["diameter_bin_center"].data,
+ "dD": ds["diameter_bin_width"].data,
+ "target": target,
+ "transformation": transformation,
+ "error_order": error_order,
+ }
+
+ # Fit distribution in parallel
+ da_params = xr.apply_ufunc(
+ apply_lognormal_gs,
+ # Variables varying over time
+ ds["Nt"],
+ ds["drop_number_concentration"],
+ ds["fall_velocity"],
+ # Other options
+ kwargs=kwargs,
+ # Settings
+ input_core_dims=[[], ["diameter_bin_center"], ["diameter_bin_center"]],
+ output_core_dims=[["parameters"]],
+ vectorize=True,
+ dask="parallelized",
+ dask_gufunc_kwargs={"output_sizes": {"parameters": 3}}, # lengths of the new output_core_dims dimensions.
+ output_dtypes=["float64"],
+ )
+
+ # Add parameters coordinates
+ da_params = da_params.assign_coords({"parameters": ["Nt", "mu", "sigma"]})
+
+ # Create parameters dataset
+ ds_params = da_params.to_dataset(dim="parameters")
+
+ # Add DSD model name to the attribute
+ ds_params.attrs["disdrodb_psd_model"] = "LognormalPSD"
+ return ds_params
+
+
+def get_normalized_gamma_parameters_gs(ds, target="ND", transformation="log", error_order=1):
+ r"""Estimate $\mu$ of a Normalized Gamma distribution using Grid Search.
+
+ The D50 and Nw parameters of the Normalized Gamma distribution are derived empirically from the observed DSD.
+ $\mu$ is derived by minimizing the errors between the observed DSD and modelled Normalized Gamma distribution.
+
+    Parameters
+    ----------
+    ds : xarray.Dataset
+        Input dataset containing the observed DSD. It must include the ``Nw``, ``D50``,
+        ``drop_number_concentration`` and ``fall_velocity`` variables, as well as the
+        ``diameter_bin_center`` and ``diameter_bin_width`` coordinates.
+    target : str, optional
+        Quantity on which the fitting error is minimized.
+        One of ``"ND"``, ``"LWC"``, ``"Z"`` or ``"R"``. The default is ``"ND"``.
+    transformation : str, optional
+        Transformation applied to the drop number concentration before computing the error
+        (only used when ``target="ND"``). The default is ``"log"``.
+    error_order : int, optional
+        Order of the error norm (only used when ``target="ND"``).
+        ``error_order=1`` minimizes the mean absolute error (MAE), ``error_order=2`` the mean squared error (MSE).
+        Higher orders typically stretch the gamma distribution higher. The default is 1.
+
+    Returns
+    -------
+    xarray.Dataset
+        Dataset containing the estimated ``Nw``, ``mu`` and ``D50`` parameters,
+        with the attribute ``disdrodb_psd_model`` set to ``NormalizedGammaPSD``.
+
+ """
+ # "target": ["ND", "LWC", "Z", "R"]
+ # "transformation": "log", "identity", "sqrt", # only for drop_number_concentration
+ # "error_order": 1, # MAE/MSE ... only for drop_number_concentration
+
+ # Define kwargs
+ kwargs = {
+ "D": ds["diameter_bin_center"].data,
+ "dD": ds["diameter_bin_width"].data,
+ "target": target,
+ "transformation": transformation,
+ "error_order": error_order,
+ }
+
+ # Fit distribution in parallel
+ da_params = xr.apply_ufunc(
+ apply_normalized_gamma_gs,
+ # Variables varying over time
+ ds["Nw"],
+ ds["D50"],
+ ds["drop_number_concentration"],
+ ds["fall_velocity"],
+ # Other options
+ kwargs=kwargs,
+ # Settings
+ input_core_dims=[[], [], ["diameter_bin_center"], ["diameter_bin_center"]],
+ output_core_dims=[["parameters"]],
+ vectorize=True,
+ dask="parallelized",
+ dask_gufunc_kwargs={"output_sizes": {"parameters": 3}}, # lengths of the new output_core_dims dimensions.
+ output_dtypes=["float64"],
+ )
+
+ # Add parameters coordinates
+ da_params = da_params.assign_coords({"parameters": ["Nw", "mu", "D50"]})
+
+ # Create parameters dataset
+ ds_params = da_params.to_dataset(dim="parameters")
+
+ # Add DSD model name to the attribute
+ ds_params.attrs["disdrodb_psd_model"] = "NormalizedGammaPSD"
+ return ds_params
+
+
+####-----------------------------------------------------------------.
+#### Methods of Moments (MOM)
+# - M246 DEFAULT FOR GAMMA ?
+# - LMOM (Johnson et al., 2014)
+
+
+def get_exponential_parameters_Zhang2008(moment_l, moment_m, l, m): # noqa: E741
+ """Calculate Exponential DSD parameters using the method of moments (MOM).
+
+ The choice of moments is given in the parameters.
+
+ Parameters
+ ----------
+    moment_l : float
+        Moment of order l.
+    moment_m : float
+        Moment of order m.
+    l : float
+        Order of the first moment.
+    m : float
+        Order of the second moment.
+
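+    Returns
+    -------
+    N0 : float
+        Intercept parameter of the exponential distribution.
+    Lambda : float
+        Slope parameter of the exponential distribution.
+
+    Notes
+    -----
+    The parameters are estimated from the two moments as:
+    Lambda = (moment_l * gamma(m + 1) / (moment_m * gamma(l + 1)))**(1 / (m - l))
+    N0 = moment_l * Lambda**(l + 1) / gamma(l + 1)
+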
+ References
+ ----------
+ [1] Zhang, et. al., 2008, Diagnosing the Intercept Parameter for Exponential Raindrop Size
+ Distribution Based on Video Disdrometer Observations: Model Development. J. Appl.
+ Meteor. Climatol.,
+ https://doi.org/10.1175/2008JAMC1876.1
+ """
+ num = moment_l * gamma(m + 1)
+ den = moment_m * gamma(l + 1)
+ Lambda = np.power(num / den, (1 / (m - l)))
+ N0 = moment_l * np.power(Lambda, l + 1) / gamma(l + 1)
+ return N0, Lambda
+
+
+def get_exponential_parameters_M34(moment_3, moment_4):
+ """Compute exponential distribution parameters following Testud 2001.
+
+ References
+ ----------
+ Testud, J., S. Oury, R. A. Black, P. Amayenc, and X. Dou, 2001:
+ The Concept of “Normalized” Distribution to Describe Raindrop Spectra:
+ A Tool for Cloud Physics and Cloud Remote Sensing.
+ J. Appl. Meteor. Climatol., 40, 1118-1140,
+ https://doi.org/10.1175/1520-0450(2001)040<1118:TCONDT>2.0.CO;2
+ """
+ N0 = 256 / gamma(4) * moment_3**5 / moment_4**4
+ Dm = moment_4 / moment_3
+ Lambda = 4 / Dm
+ return N0, Lambda
+
+
+def get_gamma_parameters_M012(M0, M1, M2):
+ """Compute gamma distribution parameters following Cao et al., 2009.
+
+ References
+ ----------
+ Cao, Q., and G. Zhang, 2009:
+ Errors in Estimating Raindrop Size Distribution Parameters Employing Disdrometer and Simulated Raindrop Spectra.
+ J. Appl. Meteor. Climatol., 48, 406-425, https://doi.org/10.1175/2008JAMC2026.1.
+ """
+ # TODO: really bad results. check formula !
+ G = M1**3 / M0 / M2
+ mu = 1 / (1 - G) - 2
+ Lambda = M0 / M1 * (mu + 1)
+ N0 = Lambda ** (mu + 1) * M0 / gamma(mu + 1)
+ return N0, mu, Lambda
+
+
+def get_gamma_parameters_M234(M2, M3, M4):
+ """Compute gamma distribution parameters following Cao et al., 2009.
+
+ References
+ ----------
+ Cao, Q., and G. Zhang, 2009:
+ Errors in Estimating Raindrop Size Distribution Parameters Employing Disdrometer and Simulated Raindrop Spectra.
+ J. Appl. Meteor. Climatol., 48, 406-425, https://doi.org/10.1175/2008JAMC2026.1.
+ """
+ G = M3**2 / M2 / M4
+ mu = 1 / (1 - G) - 4
+ Lambda = M2 / M3 * (mu + 3)
+ N0 = Lambda ** (mu + 3) * M2 / gamma(mu + 3)
+ return N0, mu, Lambda
+
+
+def get_gamma_parameters_M246(M2, M4, M6):
+ """Compute gamma distribution parameters following Ulbrich 1998.
+
+ References
+ ----------
+ Ulbrich, C. W., and D. Atlas, 1998:
+ Rainfall Microphysics and Radar Properties: Analysis Methods for Drop Size Spectra.
+ J. Appl. Meteor. Climatol., 37, 912-923,
+ https://doi.org/10.1175/1520-0450(1998)037<0912:RMARPA>2.0.CO;2
+
+ Cao, Q., and G. Zhang, 2009:
+ Errors in Estimating Raindrop Size Distribution Parameters Employing Disdrometer and Simulated Raindrop Spectra.
+ J. Appl. Meteor. Climatol., 48, 406-425, https://doi.org/10.1175/2008JAMC2026.1.
+
+ Thurai, M., Williams, C.R., Bringi, V.N., 2014:
+ Examining the correlations between drop size distribution parameters using data
+ from two side-by-side 2D-video disdrometers.
+ Atmospheric Research, 144, 95-110, https://doi.org/10.1016/j.atmosres.2014.01.002.
+ """
+ G = M4**2 / M2 / M6
+
+    # mu-G relation (Ulbrich and Atlas 1998, Eq. 2; Thurai et al. 2014, Eq. A4; Cao et al. 2009, Eq. B3):
+    # mu = ((7 - 11*G) - sqrt((7 - 11*G)**2 - 4*(G - 1)*(30*G - 12))) / (2*(G - 1))
+    # The term under the square root expands to G**2 + 14*G + 1.
+    mu = ((7.0 - 11.0 * G) - np.sqrt(G**2 + 14.0 * G + 1.0)) / (2.0 * (G - 1.0))
+
+ Lambda = np.sqrt((4 + mu) * (3 + mu) * M2 / M4)
+ # Cao et al., 2009
+ N0 = M2 * Lambda ** (3 + mu) / gamma(3 + mu)
+ # # Thurai et al., 2014
+ # N0 = M3 * Lambda ** (4 + mu) / gamma(4 + mu)
+ # # Ulbrich et al., 1998
+ # N0 = M6 * Lambda ** (7.0 + mu) / gamma(7 + mu)
+ return N0, mu, Lambda
+
+
+def get_gamma_parameters_M456(M4, M5, M6):
+ """Compute gamma distribution parameters following Cao et al., 2009.
+
+ References
+ ----------
+ Cao, Q., and G. Zhang, 2009:
+ Errors in Estimating Raindrop Size Distribution Parameters Employing Disdrometer and Simulated Raindrop Spectra.
+ J. Appl. Meteor. Climatol., 48, 406-425, https://doi.org/10.1175/2008JAMC2026.1.
+ """
+ G = M5**2 / M4 / M6
+ mu = 1 / (1 - G) - 6
+ Lambda = M4 / M5 * (mu + 5)
+ N0 = Lambda ** (mu + 5) * M4 / gamma(mu + 5)
+ return N0, mu, Lambda
+
+
+def get_gamma_parameters_M346(M3, M4, M6):
+ """Compute gamma distribution parameters following Kozu 1991.
+
+ References
+ ----------
+ Kozu, T., and K. Nakamura, 1991:
+ Rainfall Parameter Estimation from Dual-Radar Measurements
+ Combining Reflectivity Profile and Path-integrated Attenuation.
+ J. Atmos. Oceanic Technol., 8, 259-270, https://doi.org/10.1175/1520-0426(1991)008<0259:RPEFDR>2.0.CO;2
+
+ Tokay, A., and D. A. Short, 1996:
+ Evidence from Tropical Raindrop Spectra of the Origin of Rain from
+ Stratiform versus Convective Clouds.
+ J. Appl. Meteor. Climatol., 35, 355-371,
+ https://doi.org/10.1175/1520-0450(1996)035<0355:EFTRSO>2.0.CO;2
+
+ Cao, Q., and G. Zhang, 2009:
+ Errors in Estimating Raindrop Size Distribution Parameters Employing Disdrometer and Simulated Raindrop Spectra.
+ J. Appl. Meteor. Climatol., 48, 406-425, https://doi.org/10.1175/2008JAMC2026.1.
+ """
+ G = M4**3 / M3**2 / M6
+
+ # Kozu
+ mu = (5.5 * G - 4 + np.sqrt(G * (G * 0.25 + 2))) / (1 - G)
+
+ # Cao et al., 2009 (equivalent)
+ # mu = (11 * G - 8 + np.sqrt(G * (G + 8))) / (2 * (1 - G))
+
+ Lambda = (mu + 4) * M3 / M4
+ N0 = Lambda ** (mu + 4) * M3 / gamma(mu + 4)
+ return N0, mu, Lambda
+
+
+def get_lognormal_parameters_M346(M3, M4, M6):
+ """Compute lognormal distribution parameters following Kozu1991.
+
+ References
+ ----------
+ Kozu, T., and K. Nakamura, 1991:
+ Rainfall Parameter Estimation from Dual-Radar Measurements
+ Combining Reflectivity Profile and Path-integrated Attenuation.
+ J. Atmos. Oceanic Technol., 8, 259-270, https://doi.org/10.1175/1520-0426(1991)008<0259:RPEFDR>2.0.CO;2
+ """
+ L3 = np.log(M3)
+ L4 = np.log(M4)
+ L6 = np.log(M6)
+    Nt = np.exp((24 * L3 - 27 * L4 + 6 * L6) / 3)
+ mu = (-10 * L3 + 13.5 * L4 - 3.5 * L6) / 3
+    # (2*L3 - 3*L4 + L6)/3 is the variance of ln(D); take the square root to return sigma
+    sigma = np.sqrt((2 * L3 - 3 * L4 + L6) / 3)
+ return Nt, mu, sigma
+
+
+def _get_gamma_parameters_mom(ds: xr.Dataset, mom_method: str) -> xr.Dataset:
+ # Get the correct function and list of variables for the requested method
+ func, needed_moments = MOM_METHODS_DICT["GammaPSD"][mom_method]
+
+ # Extract the required arrays from the dataset
+ arrs = [ds[var_name] for var_name in needed_moments]
+
+    # Apply the function. This will produce (N0, mu, Lambda) with the same coords/shapes as the input data
+ N0, mu, Lambda = func(*arrs)
+
+ # Return a new Dataset containing the results
+ ds = xr.Dataset(
+ {
+ "N0": N0,
+ "mu": mu,
+ "Lambda": Lambda,
+ },
+ coords=ds.coords,
+ )
+ return ds
+
+
+def _get_lognormal_parameters_mom(ds: xr.Dataset, mom_method: str) -> xr.Dataset:
+ # Get the correct function and list of variables for the requested method
+ func, needed_moments = MOM_METHODS_DICT["LognormalPSD"][mom_method]
+
+ # Extract the required arrays from the dataset
+ arrs = [ds[var_name] for var_name in needed_moments]
+
+    # Apply the function. This will produce (Nt, mu, sigma) with the same coords/shapes as the input data
+ Nt, mu, sigma = func(*arrs)
+
+ # Return a new Dataset containing the results
+ ds = xr.Dataset(
+ {
+ "Nt": Nt,
+ "mu": mu,
+ "sigma": sigma,
+ },
+ coords=ds.coords,
+ )
+ return ds
+
+
+def _get_exponential_parameters_mom(ds: xr.Dataset, mom_method: str) -> xr.Dataset:
+ # Get the correct function and list of variables for the requested method
+ func, needed_moments = MOM_METHODS_DICT["ExponentialPSD"][mom_method]
+
+ # Extract the required arrays from the dataset
+ arrs = [ds[var_name] for var_name in needed_moments]
+
+    # Apply the function. This will produce (N0, Lambda) with the same coords/shapes as the input data
+ N0, Lambda = func(*arrs)
+
+ # Return a new Dataset containing the results
+ ds = xr.Dataset(
+ {
+ "N0": N0,
+ "Lambda": Lambda,
+ },
+ coords=ds.coords,
+ )
+ return ds
+
+
+####--------------------------------------------------------------------------------------.
+#### Routines dictionary
+
+
+MOM_METHODS_DICT = {
+ "GammaPSD": {
+ # "M012": (get_gamma_parameters_M012, ["M0", "M1", "M2"]),
+ "M234": (get_gamma_parameters_M234, ["M2", "M3", "M4"]),
+ "M246": (get_gamma_parameters_M246, ["M2", "M4", "M6"]),
+ "M456": (get_gamma_parameters_M456, ["M4", "M5", "M6"]),
+ "M346": (get_gamma_parameters_M346, ["M3", "M4", "M6"]),
+ },
+ "LognormalPSD": {
+ "M346": (get_lognormal_parameters_M346, ["M3", "M4", "M6"]),
+ },
+ "ExponentialPSD": {
+ "M234": (get_exponential_parameters_M34, ["M3", "M4"]),
+ },
+}
+
+
+OPTIMIZATION_ROUTINES_DICT = {
+ "MOM": {
+ "GammaPSD": _get_gamma_parameters_mom,
+ "LognormalPSD": _get_lognormal_parameters_mom,
+ "ExponentialPSD": _get_exponential_parameters_mom,
+ },
+ "GS": {
+ "GammaPSD": get_gamma_parameters_gs,
+ "NormalizedGammaPSD": get_normalized_gamma_parameters_gs,
+ "LognormalPSD": get_lognormal_parameters_gs,
+ "ExponentialPSD": get_exponential_parameters_gs,
+ },
+ "ML": {
+ "GammaPSD": get_gamma_parameters,
+ "LognormalPSD": get_lognormal_parameters,
+ "ExponentialPSD": get_exponential_parameters,
+ },
+}
+
+
+def available_mom_methods(psd_model):
+ """Implemented MOM methods for a given PSD model."""
+ return list(MOM_METHODS_DICT[psd_model])
+
+
+def available_optimization(psd_model):
+ """Implemented fitting methods for a given PSD model."""
+ return [opt for opt in list(OPTIMIZATION_ROUTINES_DICT) if psd_model in OPTIMIZATION_ROUTINES_DICT[opt]]
+
+
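+# Example (sketch) of querying the implemented options for the Gamma PSD model:
+# >>> available_mom_methods("GammaPSD")   # ['M234', 'M246', 'M456', 'M346']
+# >>> available_optimization("GammaPSD")  # ['MOM', 'GS', 'ML']
+
+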
+####--------------------------------------------------------------------------------------.
+#### Argument checkers
+
+
+def check_psd_model(psd_model, optimization):
+ """Check valid psd_model argument."""
+ valid_psd_models = list(OPTIMIZATION_ROUTINES_DICT[optimization])
+ if psd_model not in valid_psd_models:
+ msg = (
+ f"{optimization} optimization is not available for 'psd_model' {psd_model}. "
+ f"Accepted PSD models are {valid_psd_models}."
+ )
+ raise ValueError(msg)
+
+
+def check_target(target):
+ """Check valid target argument."""
+ valid_targets = ["ND", "R", "Z", "LWC"]
+ if target not in valid_targets:
+ raise ValueError(f"Invalid 'target' {target}. Valid targets are {valid_targets}.")
+ return target
+
+
+def check_transformation(transformation):
+ """Check valid transformation argument."""
+ valid_transformation = ["identity", "log", "sqrt"]
+ if transformation not in valid_transformation:
+ raise ValueError(
+            f"Invalid 'transformation' {transformation}. Valid transformations are {valid_transformation}.",
+ )
+ return transformation
+
+
+def check_likelihood(likelihood):
+ """Check valid likelihood argument."""
+ valid_likelihood = ["multinomial", "poisson"]
+ if likelihood not in valid_likelihood:
+ raise ValueError(f"Invalid 'likelihood' {likelihood}. Valid values are {valid_likelihood}.")
+ return likelihood
+
+
+def check_truncated_likelihood(truncated_likelihood):
+ """Check valid truncated_likelihood argument."""
+ if not isinstance(truncated_likelihood, bool):
+ raise TypeError(f"Invalid 'truncated_likelihood' argument {truncated_likelihood}. Must be True or False.")
+ return truncated_likelihood
+
+
+def check_probability_method(probability_method):
+ """Check valid probability_method argument."""
+ # Check valid probability_method
+ valid_probability_method = ["cdf", "pdf"]
+ if probability_method not in valid_probability_method:
+ raise ValueError(
+ f"Invalid 'probability_method' {probability_method}. Valid values are {valid_probability_method}.",
+ )
+ return probability_method
+
+
+def check_optimizer(optimizer):
+ """Check valid optimizer argument."""
+ # Check valid probability_method
+ valid_optimizer = ["Nelder-Mead", "Powell", "L-BFGS-B"]
+ if optimizer not in valid_optimizer:
+ raise ValueError(
+ f"Invalid 'optimizer' {optimizer}. Valid values are {valid_optimizer}.",
+ )
+ return optimizer
+
+
+def check_mom_methods(mom_methods, psd_model):
+ """Check valid mom_methods arguments."""
+ if isinstance(mom_methods, str):
+ mom_methods = [mom_methods]
+ valid_mom_methods = available_mom_methods(psd_model)
+ invalid_mom_methods = np.array(mom_methods)[np.isin(mom_methods, valid_mom_methods, invert=True)]
+ if len(invalid_mom_methods) > 0:
+ raise ValueError(
+ f"Unknown mom_methods '{invalid_mom_methods}' for {psd_model}. Choose from {valid_mom_methods}.",
+ )
+ return mom_methods
+
+
+def check_optimization(optimization):
+ """Check valid optimization argument."""
+ valid_optimization = list(OPTIMIZATION_ROUTINES_DICT)
+ if optimization not in valid_optimization:
+ raise ValueError(
+            f"Invalid 'optimization' {optimization}. Valid procedures are {valid_optimization}.",
+ )
+ return optimization
+
+
+def check_optimization_kwargs(optimization_kwargs, optimization, psd_model):
+ """Check valid optimization_kwargs."""
+ dict_arguments = {
+ "ML": {
+ "init_method": None,
+ "probability_method": check_probability_method,
+ "likelihood": check_likelihood,
+ "truncated_likelihood": check_truncated_likelihood,
+ "optimizer": check_optimizer,
+ },
+ "GS": {
+ "target": check_target,
+ "transformation": check_transformation,
+ "error_order": None,
+ },
+ "MOM": {
+ "mom_methods": None,
+ },
+ }
+ optimization = check_optimization(optimization)
+ check_psd_model(psd_model=psd_model, optimization=optimization)
+
+ # Retrieve the expected arguments for the given optimization method
+ expected_arguments = dict_arguments.get(optimization, {})
+
+ # Check for missing arguments in optimization_kwargs
+ missing_args = [arg for arg in expected_arguments if arg not in optimization_kwargs]
+ if missing_args:
+ raise ValueError(f"Missing required arguments for {optimization} optimization: {missing_args}")
+
+ # Validate argument values
+ _ = [check(optimization_kwargs[arg]) for arg, check in expected_arguments.items() if callable(check)]
+
+ # Further special checks
+ if optimization == "MOM":
+ _ = check_mom_methods(mom_methods=optimization_kwargs["mom_methods"], psd_model=psd_model)
+ if optimization == "ML":
+ if optimization_kwargs["init_method"] is not None:
+ _ = check_mom_methods(mom_methods=optimization_kwargs["init_method"], psd_model=psd_model)
+
+
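+# Example (sketch) of valid optimization_kwargs for ML fitting of a GammaPSD:
+# >>> check_optimization_kwargs(
+# ...     {"init_method": "M234", "probability_method": "cdf", "likelihood": "multinomial",
+# ...      "truncated_likelihood": True, "optimizer": "Nelder-Mead"},
+# ...     optimization="ML", psd_model="GammaPSD",
+# ... )
+
+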
+####--------------------------------------------------------------------------------------.
+#### Wrappers for fitting
+
+
+def get_mom_parameters(ds: xr.Dataset, psd_model: str, mom_methods: str) -> xr.Dataset:
+ """
+ Compute PSD model parameters using various method-of-moments (MOM) approaches.
+
+    The method is specified by the ``mom_methods`` acronym, e.g. 'M234', 'M246', 'M346'.
+
+ Parameters
+ ----------
+    ds : xr.Dataset
+        An xarray Dataset with the required moments (e.g. M0...M6) as data variables.
+    psd_model : str
+        The PSD model to fit. See ``available_psd_models()``.
+    mom_methods : str or list
+        The method(s) of moments to use. See ``available_mom_methods(psd_model)``.
+
+ Returns
+ -------
+ xr.Dataset
+ A Dataset containing mu, Lambda, and N0 variables.
+ If multiple mom_methods are specified, the dataset has the dimension mom_method.
+
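+    Examples
+    --------
+    A minimal sketch, assuming ``ds`` already contains the required moment variables::
+
+        ds_params = get_mom_parameters(ds, psd_model="GammaPSD", mom_methods=["M234", "M346"])
+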
+ """
+ # Check inputs
+ check_psd_model(psd_model=psd_model, optimization="MOM")
+ mom_methods = check_mom_methods(mom_methods, psd_model=psd_model)
+
+ # Retrieve function
+ func = OPTIMIZATION_ROUTINES_DICT["MOM"][psd_model]
+
+ # Compute parameters
+ if len(mom_methods) == 1:
+ ds = func(ds=ds, mom_method=mom_methods[0])
+ ds.attrs["mom_method"] = mom_methods[0]
+ return ds
+ list_ds = [func(ds=ds, mom_method=mom_method) for mom_method in mom_methods]
+ ds = xr.concat(list_ds, dim="mom_method")
+ ds = ds.assign_coords({"mom_method": mom_methods})
+ return ds
+
+
+def get_ml_parameters(
+ ds,
+ psd_model,
+ init_method=None,
+ probability_method="cdf",
+ likelihood="multinomial",
+ truncated_likelihood=True,
+ optimizer="Nelder-Mead",
+):
+ """
+ Estimate model parameters for a given distribution using Maximum Likelihood.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Input dataset containing drop number concentration data and diameter information.
+ It must include the following variables:
+        - ``drop_number_concentration``: The number concentration of drops.
+        - ``diameter_bin_width``: The width of each diameter bin.
+ - ``diameter_bin_lower``: The lower bounds of the diameter bins.
+ - ``diameter_bin_upper``: The upper bounds of the diameter bins.
+ - ``diameter_bin_center``: The center values of the diameter bins.
+ psd_model : str
+ The PSD model to fit. See ``available_psd_models()``.
+ init_method: str or list
+ The method(s) of moments used to initialize the PSD model parameters.
+ See ``available_mom_methods(psd_model)``.
+ probability_method : str, optional
+ Method to compute probabilities. The default is ``cdf``.
+ likelihood : str, optional
+ Likelihood function to use for fitting. The default is ``multinomial``.
+ truncated_likelihood : bool, optional
+ Whether to use Truncated Maximum Likelihood (TML). The default is ``True``.
+ optimizer : str, optional
+ Optimization method to use. The default is ``Nelder-Mead``.
+
+ Returns
+ -------
+ xarray.Dataset
+ The dataset containing the estimated parameters.
+
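+    Examples
+    --------
+    A minimal sketch, assuming ``ds`` provides the variables listed above::
+
+        ds_params = get_ml_parameters(ds, psd_model="GammaPSD", init_method="M346")
+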
+ """
+ # -----------------------------------------------------------------------------.
+ # Check arguments
+ check_psd_model(psd_model, optimization="ML")
+ likelihood = check_likelihood(likelihood)
+ probability_method = check_probability_method(probability_method)
+ optimizer = check_optimizer(optimizer)
+
+ # Check valid init_method
+ if init_method is not None:
+ init_method = check_mom_methods(mom_methods=init_method, psd_model=psd_model)
+
+ # Retrieve estimation function
+ func = OPTIMIZATION_ROUTINES_DICT["ML"][psd_model]
+
+ # Retrieve parameters
+ ds_params = func(
+ ds=ds,
+ init_method=init_method,
+ probability_method=probability_method,
+ likelihood=likelihood,
+ truncated_likelihood=truncated_likelihood,
+ optimizer=optimizer,
+ )
+ # Return dataset with parameters
+ return ds_params
+
+
+def get_gs_parameters(ds, psd_model, target="ND", transformation="log", error_order=1):
+    """Estimate PSD model parameters using GS optimization."""
+    # Check valid psd_model
+ check_psd_model(psd_model, optimization="GS")
+
+ # Check valid target
+ target = check_target(target)
+
+ # Check valid transformation
+ transformation = check_transformation(transformation)
+
+ # Retrieve estimation function
+ func = OPTIMIZATION_ROUTINES_DICT["GS"][psd_model]
+
+ # Estimate parameters
+ ds_params = func(ds, target=target, transformation=transformation, error_order=error_order)
+
+ # Return dataset with parameters
+ return ds_params
+
+
+def estimate_model_parameters(
+ ds,
+ psd_model,
+ optimization,
+ optimization_kwargs,
+):
+    """Estimate PSD model parameters with the specified optimization approach."""
+ optimization = check_optimization(optimization)
+ check_optimization_kwargs(optimization_kwargs=optimization_kwargs, optimization=optimization, psd_model=psd_model)
+
+ # Define function
+ dict_func = {
+ "ML": get_ml_parameters,
+ "MOM": get_mom_parameters,
+ "GS": get_gs_parameters,
+ }
+ func = dict_func[optimization]
+
+ # Retrieve parameters
+ ds_params = func(ds, psd_model=psd_model, **optimization_kwargs)
+
+ # Finalize attributes
+ ds_params.attrs["disdrodb_psd_model"] = psd_model
+ ds_params.attrs["disdrodb_psd_optimization"] = optimization
+ if optimization == "GS":
+ ds_params.attrs["disdrodb_psd_optimization_target"] = optimization_kwargs["target"]
+
+ return ds_params
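+
+
+# Usage sketch (assuming ``ds`` is a dataset providing the variables required by the chosen optimization):
+# >>> ds_params = estimate_model_parameters(
+# ...     ds, psd_model="GammaPSD", optimization="MOM", optimization_kwargs={"mom_methods": ["M234"]},
+# ... )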
diff --git a/disdrodb/psd/models.py b/disdrodb/psd/models.py
new file mode 100644
index 00000000..41e1e28b
--- /dev/null
+++ b/disdrodb/psd/models.py
@@ -0,0 +1,729 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Definition of PSD models.
+
+The class implementation is inspired by pytmatrix.psd and pyradsim.psd modules
+and adapted to allow efficient vectorized computations with xarray.
+
+Source code:
+- https://github.com/jleinonen/pytmatrix/blob/master/pytmatrix/psd.py
+- https://github.com/wolfidan/pyradsim/blob/master/pyradsim/psd.py
+
+"""
+
+import numpy as np
+import xarray as xr
+from pytmatrix.psd import PSD
+from scipy.special import gamma as gamma_f
+from scipy.stats import expon, gamma, lognorm
+
+# psd.log_likelihood
+# psd.moment(order)
+# psd.mean
+# psd.variance
+# psd.mode
+
+# TODO
+# - psd.isel(**kwargs)
+# - psd.sel(**kwargs)
+
+# __eq__
+# --> Generalize using self.parameters and deep diff
+
+
+# ------------------------------------------------------------------------------------------------------------.
+
+
+def available_psd_models():
+ """Return a list of available PSD models."""
+ return list(PSD_MODELS_DICT)
+
+
+def check_psd_model(psd_model):
+ """Check validity of a PSD model."""
+ available_models = available_psd_models()
+ if psd_model not in available_models:
+ raise ValueError(f"{psd_model} is an invalid PSD model. Valid models are: {available_models}.")
+ return psd_model
+
+
+def get_psd_model(psd_model):
+ """Retrieve the PSD Class."""
+ return PSD_MODELS_DICT[psd_model]
+
+
+def get_psd_model_formula(psd_model):
+ """Retrieve the PSD formula."""
+ return PSD_MODELS_DICT[psd_model].formula
+
+
+def create_psd(psd_model, parameters): # TODO: check name around
+ """Define a PSD from a dictionary or xr.Dataset of parameters."""
+ psd_class = get_psd_model(psd_model)
+ psd = psd_class.from_parameters(parameters)
+ return psd
+
+
+def get_required_parameters(psd_model):
+ """Retrieve the list of parameters required by a PSD model."""
+ psd_class = get_psd_model(psd_model)
+ return psd_class.required_parameters()
+
+
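+# Usage sketch (the parameter values below are illustrative):
+# >>> psd = create_psd("GammaPSD", {"N0": 8000.0, "mu": 2.0, "Lambda": 3.0})
+# >>> psd(1.0)  # PSD value N(D) at D = 1 mm
+
+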
+def clip_values(D, values, Dmax=np.inf):
+    """Set values to 0 for D == 0 and for diameters larger than Dmax."""
+ # Handle scalar input
+ if np.isscalar(D):
+ if Dmax < D or D == 0.0:
+ return 0.0
+ return values
+
+ # Handle numpy array input
+ if isinstance(values, np.ndarray):
+ mask = (Dmax < D) | (D == 0)
+ values = np.where(mask, 0, values)
+
+ # Handle xarray.DataArray input
+ elif isinstance(values, xr.DataArray):
+ values = xr.where(np.logical_or(Dmax < D, D == 0), 0, values)
+ values = values.where(~np.isnan(values).any(dim="diameter_bin_center"))
+ else:
+ raise TypeError("Input 'D' and 'values' must be a scalar, numpy array or an xarray.DataArray.")
+ return values
+
+
+def is_scalar(value):
+ """Determines if the input value is a scalar."""
+    return isinstance(value, (float, int)) or (isinstance(value, (np.ndarray, xr.DataArray)) and value.size == 1)
+
+
+class XarrayPSD(PSD):
+ """PSD class template allowing vectorized computations with xarray.
+
+ We currently inherit from pytmatrix PSD to allow scattering simulations:
+ --> https://github.com/ltelab/pytmatrix-lte/blob/880170b4ca62a04e8c843619fa1b8713b9e11894/pytmatrix/psd.py#L321
+ """
+
+ def __eq__(self, other):
+ """Check if two objects are equal."""
+ return False
+
+ def has_scalar_parameters(self):
+ """Check if the PSD object contains only a single set of parameters."""
+        return all(is_scalar(value) for value in self.parameters.values())
+
+ def formula(self, D, **parameters):
+ """PSD formula."""
+ pass
+
+ def __call__(self, D):
+ """Compute the PSD."""
+ values = self.formula(D=D, **self.parameters)
+ return clip_values(D=D, values=values, Dmax=self.Dmax)
+
+ def moment(self, order, nbins_diam=1024):
+ """
+ Compute the moments of the Particle Size Distribution (PSD).
+
+ Parameters
+ ----------
+ order : int
+ The order of the moment to compute.
+ nbins_diam : int, optional
+ The number of bins to use for the diameter range (default is 1024).
+
+ Returns
+ -------
+ float
+ The computed moment of the PSD.
+
+ Notes
+ -----
+ The method uses numerical integration (trapezoidal rule) to compute the moment.
+ """
+ dbins = np.linspace(self.Dmin, self.Dmax, nbins_diam)
+ dD = dbins[1] - dbins[0]
+ return np.trapz(dbins**order * self.__call__(dbins), dx=dD)
+
+
+class LognormalPSD(XarrayPSD):
+ """Lognormal drop size distribution (DSD).
+
+ Callable class to provide a lognormal PSD with the given parameters.
+
+ The PSD form is:
+
+    N(D) = Nt/(sqrt(2*pi)*sigma*D) * exp(-(ln(D)-mu)**2 / (2*sigma**2))
+
+    Note: in other formulations, g = sigma and theta = 0.
+
+    Attributes
+    ----------
+    Nt: the total number concentration.
+    mu: the mean of ln(D).
+    sigma: the standard deviation of ln(D).
+
+ """
+
+ def __init__(self, Nt=1.0, mu=0.0, sigma=1.0, Dmin=0, Dmax=None, coverage=0.999):
+ self.Nt = Nt
+ self.mu = mu
+ self.sigma = sigma
+ self.parameters = {"Nt": self.Nt, "mu": self.mu, "sigma": self.sigma}
+ # Define Dmin and Dmax
+ self.Dmin = Dmin
+ if Dmax is not None:
+ self.Dmax = Dmax
+ else:
+ dmax = lognorm.ppf(coverage, s=self.sigma, scale=np.exp(self.mu))
+ if isinstance(self.sigma, xr.DataArray):
+ self.Dmax = xr.DataArray(dmax, dims=self.sigma.dims, coords=self.sigma.coords)
+ else:
+ self.Dmax = dmax
+
+ @staticmethod
+ def required_parameters():
+ """Return the required parameters of the PSD."""
+ return ["Nt", "mu", "sigma"]
+
+ @property
+ def name(self):
+ """Return name of the PSD."""
+ return "LognormalPSD"
+
+ @staticmethod
+ def from_parameters(parameters):
+ """Initialize LognormalPSD from a dictionary or xr.Dataset.
+
+ Args:
+ parameters (dict or xr.Dataset): Parameters to initialize the class.
+
+ Returns
+ -------
+ LognormalPSD: An instance of LognormalPSD initialized with the parameters.
+ """
+ Nt = parameters["Nt"]
+ mu = parameters["mu"]
+ sigma = parameters["sigma"]
+ return LognormalPSD(Nt=Nt, mu=mu, sigma=sigma)
+
+ def parameters_summary(self):
+ """Return a string with the parameter summary."""
+ if self.has_scalar_parameters():
+ summary = "".join(
+ [
+ f"{self.name}\n",
+ f"$Nt = {self.Nt:.2f}$\n",
+ f"$\\sigma = {self.sigma:.2f}$\n" f"$\\mu = {self.mu:.2f}$\n\n",
+ ],
+ )
+ else:
+ summary = "" f"{self.name} with N-d parameters \n"
+ return summary
+
+ @staticmethod
+ def formula(D, Nt, mu, sigma):
+ """Calculates the Lognormal PSD values."""
+        coeff = Nt / (np.sqrt(2.0 * np.pi) * sigma * D)
+        exp_term = np.exp(-((np.log(D) - mu) ** 2) / (2.0 * sigma**2))
+        return coeff * exp_term
+
+ # def __eq__(self, other):
+ # try:
+ # return isinstance(other, ExponentialPSD) and \
+ # (self.N0 == other.N0) and (self.Lambda == other.Lambda) and \
+ # (self.Dmax == other.Dmax)
+ # except AttributeError:
+ # return False
+
+ # params dictionary !
+
+
+class ExponentialPSD(XarrayPSD):
+ """Exponential particle size distribution (PSD).
+
+ Callable class to provide an exponential PSD with the given
+ parameters. The attributes can also be given as arguments to the
+ constructor.
+
+ The PSD form is:
+ N(D) = N0 * exp(-Lambda*D)
+
+ Attributes
+ ----------
+ N0: the intercept parameter.
+ Lambda: the inverse scale parameter
+    Dmax: the maximum diameter to consider (if None, defaults to the diameter
+        below which the ``coverage`` fraction of the distribution lies)
+
+ Args (call):
+ D: the particle diameter.
+
+ Returns (call):
+ The PSD value for the given diameter.
+ Returns 0 for all diameters larger than Dmax.
+ """
+
+ def __init__(self, N0=1.0, Lambda=1.0, Dmin=0, Dmax=None, coverage=0.999):
+ # Define parameters
+ self.N0 = N0
+ self.Lambda = Lambda
+ self.parameters = {"N0": self.N0, "Lambda": self.Lambda}
+
+ # Define Dmin and Dmax
+ self.Dmin = Dmin
+ if Dmax is not None:
+ self.Dmax = Dmax
+ else:
+ dmax = expon.ppf(coverage, scale=1 / self.Lambda)
+ if isinstance(self.Lambda, xr.DataArray):
+ self.Dmax = xr.DataArray(dmax, dims=self.Lambda.dims, coords=self.Lambda.coords)
+ else:
+ self.Dmax = dmax
+
+ @staticmethod
+ def required_parameters():
+ """Return the required parameters of the PSD."""
+ return ["N0", "Lambda"]
+
+ @property
+ def name(self):
+ """Return name of the PSD."""
+ return "ExponentialPSD"
+
+ @staticmethod
+ def from_parameters(parameters):
+ """Initialize ExponentialPSD from a dictionary or xr.Dataset.
+
+ Args:
+ parameters (dict or xr.Dataset): Parameters to initialize the class.
+
+ Returns
+ -------
+ ExponentialPSD: An instance of ExponentialPSD initialized with the parameters.
+ """
+ N0 = parameters["N0"]
+ Lambda = parameters["Lambda"]
+ return ExponentialPSD(N0=N0, Lambda=Lambda)
+
+ def parameters_summary(self):
+ """Return a string with the parameter summary."""
+ if self.has_scalar_parameters():
+ summary = "".join(
+ [
+ f"{self.name}\n",
+ f"$N0 = {self.N0:.2f}$\n",
+ f"$\\lambda = {self.Lambda:.2f}$\n\n",
+ ],
+ )
+ else:
+ summary = "" f"{self.name} with N-d parameters \n"
+ return summary
+
+ @staticmethod
+ def formula(D, N0, Lambda):
+ """Calculates the Exponential PSD values."""
+ return N0 * np.exp(-Lambda * D)
+
+ def __eq__(self, other):
+ """Check if two objects are equal."""
+ try:
+ return (
+ isinstance(other, ExponentialPSD)
+ and (self.N0 == other.N0)
+ and (self.Lambda == other.Lambda)
+ and (self.Dmax == other.Dmax)
+ )
+ except AttributeError:
+ return False
+
+
+class GammaPSD(ExponentialPSD):
+ """Gamma particle size distribution (PSD).
+
+    Callable class to provide a gamma PSD with the given
+ parameters. The attributes can also be given as arguments to the
+ constructor.
+
+ The PSD form is:
+ N(D) = N0 * D**mu * exp(-Lambda*D)
+
+ Attributes
+ ----------
+ N0: the intercept parameter [mm**(-1-mu) m**-3] (scale parameter)
+ Lambda: the inverse scale parameter [mm-1] (slope parameter)
+ mu: the shape parameter [-]
+    Dmax: the maximum diameter to consider (if None, defaults to the diameter
+        below which the ``coverage`` fraction of the distribution lies)
+
+ Args (call):
+ D: the particle diameter.
+
+ Returns (call):
+ The PSD value for the given diameter.
+ Returns 0 for all diameters larger than Dmax.
+
+ References
+ ----------
+ Ulbrich, C. W., 1985: The Effects of Drop Size Distribution Truncation on
+ Rainfall Integral Parameters and Empirical Relations.
+ J. Appl. Meteor. Climatol., 24, 580-590, https://doi.org/10.1175/1520-0450(1985)024<0580:TEODSD>2.0.CO;2
+ """
+
+ def __init__(self, N0=1.0, mu=0.0, Lambda=1.0, Dmin=0, Dmax=None, coverage=0.999):
+ # Define parameters
+ self.N0 = N0
+ self.Lambda = Lambda
+ self.mu = mu
+ self.parameters = {"N0": self.N0, "mu": self.mu, "Lambda": self.Lambda}
+ # Define Dmin and Dmax
+ self.Dmin = Dmin
+ if Dmax is not None:
+ self.Dmax = Dmax
+ else:
+ dmax = gamma.ppf(coverage, a=self.mu + 1.0, scale=1.0 / self.Lambda)
+ if isinstance(self.Lambda, xr.DataArray):
+ self.Dmax = xr.DataArray(dmax, dims=self.Lambda.dims, coords=self.Lambda.coords)
+ else:
+ self.Dmax = dmax
+
+ @staticmethod
+ def required_parameters():
+ """Return the required parameters of the PSD."""
+ return ["N0", "mu", "Lambda"]
+
+ @property
+ def name(self):
+ """Return name of the PSD."""
+ return "GammaPSD"
+
+ @staticmethod
+ def from_parameters(parameters):
+ """Initialize GammaPSD from a dictionary or xr.Dataset.
+
+ Args:
+ parameters (dict or xr.Dataset): Parameters to initialize the class.
+
+ Returns
+ -------
+ GammaPSD: An instance of GammaPSD initialized with the parameters.
+ """
+ N0 = parameters["N0"]
+ Lambda = parameters["Lambda"]
+ mu = parameters["mu"]
+ return GammaPSD(N0=N0, Lambda=Lambda, mu=mu)
+
+ def parameters_summary(self):
+ """Return a string with the parameter summary."""
+ if self.has_scalar_parameters():
+ summary = "".join(
+ [
+ f"{self.name}\n",
+ f"$\\mu = {self.mu:.2f}$\n",
+ f"$N0 = {self.N0:.2f}$\n",
+ f"$\\lambda = {self.Lambda:.2f}$\n\n",
+ ],
+ )
+ else:
+ summary = "" f"{self.name} with N-d parameters \n"
+ return summary
+
+ @staticmethod
+ def formula(D, N0, Lambda, mu):
+ """Calculates the Gamma PSD values."""
+ return N0 * np.exp(mu * np.log(D) - Lambda * D)
+
+ def __eq__(self, other):
+ """Check if two objects are equal."""
+ try:
+ return super().__eq__(other) and self.mu == other.mu
+ except AttributeError:
+ return False
+
+
+class NormalizedGammaPSD(XarrayPSD):
+ """Normalized gamma particle size distribution (PSD).
+
+ Callable class to provide a normalized gamma PSD with the given
+ parameters. The attributes can also be given as arguments to the
+ constructor.
+
+ The PSD form is:
+
+ N(D) = Nw * f(mu) * (D/D50)**mu * exp(-(mu+3.67)*D/D50)
+ f(mu) = 6/(3.67**4) * (mu+3.67)**(mu+4)/Gamma(mu+4)
+
+ An alternative formulation as function of Dm:
+ # Testud (2001), Bringi (2001), Williams et al., 2014, Dolan 2018
+ # --> Normalized with respect to liquid water content (mass) --> Nx=D3/Dm4
+ N(D) = Nw * f1(mu) * (D/Dm)**mu * exp(-(mu+4)*D/Dm) # Nw * f(D; Dm, mu)
+ f1(mu) = 6/(4**4) * (mu+4)**(mu+4)/Gamma(mu+4)
+
+ Note: gamma(4) = 6
+
+ An alternative formulation as function of Dm:
+ # Tokay et al., 2010
+ # Illingworth et al., 2002 (see eq10 to derive full formulation!)
+ # --> Normalized with respect to total concentration --> Nx = #/Dm
+ N(D) = Nt* * f2(mu) * (D/Dm)**mu * exp(-(mu+4)*D/Dm)
+ f2(mu) = (mu+4)**(mu+1)/Gamma(mu+1)
+
+ Attributes
+ ----------
+ D50: the median volume diameter.
+ Nw: the intercept parameter.
+ mu: the shape parameter.
+    Dmax: the maximum diameter to consider (defaults to 3*D50 if None)
+
+ Args (call):
+ D: the particle diameter.
+
+ Returns (call):
+ The PSD value for the given diameter.
+ Returns 0 for all diameters larger than Dmax.
+
+ References
+ ----------
+ Willis, P. T., 1984: Functional Fits to Some Observed Drop Size Distributions and Parameterization of Rain.
+ J. Atmos. Sci., 41, 1648-1661, https://doi.org/10.1175/1520-0469(1984)041<1648:FFTSOD>2.0.CO;2
+
+ Testud, J., S. Oury, R. A. Black, P. Amayenc, and X. Dou, 2001: The Concept of “Normalized” Distribution
+ to Describe Raindrop Spectra: A Tool for Cloud Physics and Cloud Remote Sensing.
+ J. Appl. Meteor. Climatol., 40, 1118-1140, https://doi.org/10.1175/1520-0450(2001)040<1118:TCONDT>2.0.CO;2
+
+ Illingworth, A. J., and T. M. Blackman, 2002:
+ The Need to Represent Raindrop Size Spectra as Normalized Gamma Distributions for
+ the Interpretation of Polarization Radar Observations.
+ J. Appl. Meteor. Climatol., 41, 286-297, https://doi.org/10.1175/1520-0450(2002)041<0286:TNTRRS>2.0.CO;2
+
+ Bringi, V. N., G. Huang, V. Chandrasekar, and E. Gorgucci, 2002:
+ A Methodology for Estimating the Parameters of a Gamma Raindrop Size Distribution Model from
+ Polarimetric Radar Data: Application to a Squall-Line Event from the TRMM/Brazil Campaign.
+ J. Atmos. Oceanic Technol., 19, 633-645, https://doi.org/10.1175/1520-0426(2002)019<0633:AMFETP>2.0.CO;2
+
+ Bringi, V. N., V. Chandrasekar, J. Hubbert, E. Gorgucci, W. L. Randeu, and M. Schoenhuber, 2003:
+ Raindrop Size Distribution in Different Climatic Regimes from Disdrometer and Dual-Polarized Radar Analysis.
+ J. Atmos. Sci., 60, 354-365, https://doi.org/10.1175/1520-0469(2003)060<0354:RSDIDC>2.0.CO;2
+
+ Tokay, A., and P. G. Bashor, 2010: An Experimental Study of Small-Scale Variability of Raindrop Size Distribution.
+ J. Appl. Meteor. Climatol., 49, 2348-2365, https://doi.org/10.1175/2010JAMC2269.1
+
+ """
+
+ def __init__(self, Nw=1.0, D50=1.0, mu=0.0, Dmin=0, Dmax=None):
+ self.D50 = D50
+ self.mu = mu
+ self.Dmin = Dmin
+ self.Dmax = 3.0 * D50 if Dmax is None else Dmax
+ self.Nw = Nw
+ self.parameters = {"Nw": Nw, "D50": D50, "mu": mu}
+
+ @staticmethod
+ def required_parameters():
+ """Return the required parameters of the PSD."""
+ return ["Nw", "D50", "mu"]
+
+ @property
+ def name(self):
+ """Return the PSD name."""
+ return "NormalizedGammaPSD"
+
+ @staticmethod
+ def from_parameters(parameters):
+ """Initialize NormalizedGammaPSD from a dictionary or xr.Dataset.
+
+ Args:
+ parameters (dict or xr.Dataset): Parameters to initialize the class.
+
+ Returns
+ -------
+ NormalizedGammaPSD: An instance of NormalizedGammaPSD initialized with the parameters.
+ """
+ D50 = parameters["D50"]
+ Nw = parameters["Nw"]
+ mu = parameters["mu"]
+ return NormalizedGammaPSD(D50=D50, Nw=Nw, mu=mu)
+
+ @staticmethod
+ def formula(D, Nw, D50, mu):
+ """Calculates the NormalizedGamma PSD values."""
+ d_ratio = D / D50
+ nf = Nw * 6.0 / 3.67**4 * (3.67 + mu) ** (mu + 4) / gamma_f(mu + 4)
+ return nf * np.exp(mu * np.log(d_ratio) - (3.67 + mu) * d_ratio)
+
+ def parameters_summary(self):
+ """Return a string with the parameter summary."""
+ if self.has_scalar_parameters():
+ summary = "".join(
+ [
+ f"{self.name}\n",
+ f"$\\mu = {self.mu:.2f}$\n",
+ f"$Nw = {self.Nw:.2f}$\n",
+ f"$D50 = {self.D50:.2f}$\n",
+ ],
+ )
+ else:
+ summary = "" f"{self.name} with N-d parameters \n"
+ return summary
+
+ def __eq__(self, other):
+ """Check if two objects are equal."""
+ try:
+ return (
+ isinstance(other, NormalizedGammaPSD)
+ and (self.D50 == other.D50)
+ and (self.Nw == other.Nw)
+ and (self.mu == other.mu)
+ and (self.Dmax == other.Dmax)
+ )
+ except AttributeError:
+ return False
+
+
+PSD_MODELS_DICT = {
+ "LognormalPSD": LognormalPSD,
+ "ExponentialPSD": ExponentialPSD,
+ "GammaPSD": GammaPSD,
+ "NormalizedGammaPSD": NormalizedGammaPSD,
+}
+
+
+class BinnedPSD(PSD):
+ """Binned gamma particle size distribution (PSD).
+
+ Callable class to provide a binned PSD with the given bin edges and PSD
+ values.
+
+ Args (constructor):
+ The first argument to the constructor should specify n+1 bin edges,
+ and the second should specify n bin_psd values.
+
+ Args (call):
+ D: the particle diameter.
+
+ Returns (call):
+ The PSD value for the given diameter.
+ Returns 0 for all diameters outside the bins.
+ """
+
+ def __init__(self, bin_edges, bin_psd):
+ if len(bin_edges) != len(bin_psd) + 1:
+ raise ValueError("There must be n+1 bin edges for n bins.")
+
+ self.bin_edges = bin_edges
+ self.bin_psd = bin_psd
+
+ def psd_for_D(self, D):
+ """
+ Calculate the particle size distribution (PSD) for a given diameter D.
+
+ Parameters
+ ----------
+ D : float
+ The diameter for which to calculate the PSD.
+
+ Returns
+ -------
+ float
+ The PSD value corresponding to the given diameter D. Returns 0.0 if D is outside the range of bin edges.
+
+ Notes
+ -----
+ This method uses a binary search algorithm to find the appropriate bin for the given diameter D.
+ """
+ if not (self.bin_edges[0] < D <= self.bin_edges[-1]):
+ return 0.0
+
+ # binary search for the right bin
+ start = 0
+ end = len(self.bin_edges)
+ while end - start > 1:
+ half = (start + end) // 2
+ if self.bin_edges[start] < D <= self.bin_edges[half]:
+ end = half
+ else:
+ start = half
+
+ return self.bin_psd[start]
+
+ def __call__(self, D):
+ """Compute the PSD."""
+ if np.shape(D) == (): # D is a scalar
+ return self.psd_for_D(D)
+ return np.array([self.psd_for_D(d) for d in D])
+
+ def __eq__(self, other):
+ """Check PSD equality."""
+ if other is None:
+ return False
+ return (
+ len(self.bin_edges) == len(other.bin_edges)
+ and (self.bin_edges == other.bin_edges).all()
+ and (self.bin_psd == other.bin_psd).all()
+ )
+
+
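+# Usage sketch (illustrative bin edges and PSD values):
+# >>> psd = BinnedPSD(bin_edges=[0.0, 0.5, 1.0, 2.0], bin_psd=[100.0, 50.0, 10.0])
+# >>> psd(0.75)  # -> 50.0 (the value of the bin containing D = 0.75)
+
+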
+####-----------------------------------------------------------------.
+#### Moments Computation
+
+
+def get_exponential_moment(N0, Lambda, moment):
+ """Compute exponential distribution moments."""
+ return N0 * gamma_f(moment + 1) / Lambda ** (moment + 1)
+
+
+def get_gamma_moment_v1(N0, mu, Lambda, moment):
+ """Compute gamma distribution moments.
+
+ References
+ ----------
+ Kozu, T., and K. Nakamura, 1991:
+ Rainfall Parameter Estimation from Dual-Radar Measurements
+ Combining Reflectivity Profile and Path-integrated Attenuation.
+ J. Atmos. Oceanic Technol., 8, 259-270, https://doi.org/10.1175/1520-0426(1991)008<0259:RPEFDR>2.0.CO;2
+ """
+ # Zhang et al 2001: N0 * gamma_f(mu + moment + 1) * Lambda ** (-(mu + moment + 1))
+ return N0 * gamma_f(mu + moment + 1) / Lambda ** (mu + moment + 1)
+
+
+def get_gamma_moment_v2(Nt, mu, Lambda, moment):
+ """Compute gamma distribution moments.
+
+ References
+ ----------
+ Kozu, T., and K. Nakamura, 1991:
+ Rainfall Parameter Estimation from Dual-Radar Measurements
+ Combining Reflectivity Profile and Path-integrated Attenuation.
+ J. Atmos. Oceanic Technol., 8, 259-270, https://doi.org/10.1175/1520-0426(1991)008<0259:RPEFDR>2.0.CO;2
+ """
+ return Nt * gamma_f(mu + moment + 1) / gamma_f(mu + 1) / Lambda**moment
+
+
+def get_lognormal_moment(Nt, sigma, mu, moment):
+ """Compute lognormal distribution moments.
+
+ References
+ ----------
+ Kozu, T., and K. Nakamura, 1991:
+ Rainfall Parameter Estimation from Dual-Radar Measurements
+ Combining Reflectivity Profile and Path-integrated Attenuation.
+ J. Atmos. Oceanic Technol., 8, 259-270, https://doi.org/10.1175/1520-0426(1991)008<0259:RPEFDR>2.0.CO;2
+ """
+    return Nt * np.exp(moment * mu + 1 / 2 * moment**2 * sigma**2)
diff --git a/disdrodb/routines.py b/disdrodb/routines.py
new file mode 100644
index 00000000..1d70c8ed
--- /dev/null
+++ b/disdrodb/routines.py
@@ -0,0 +1,1058 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""DISDRODB CLI routine wrappers."""
+import datetime
+import time
+from typing import Optional
+
+from disdrodb.api.io import available_stations, get_required_product
+from disdrodb.utils.cli import _execute_cmd
+
+####--------------------------------------------------------------------------.
+#### Run DISDRODB Station Processing
+
+
+def run_disdrodb_l0_station(
+ data_source,
+ campaign_name,
+ station_name,
+ # L0 archive options
+ l0a_processing: bool = True,
+ l0b_processing: bool = True,
+ l0c_processing: bool = True,
+ remove_l0a: bool = False,
+ remove_l0b: bool = False,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+ """Run the L0 processing of a specific DISDRODB station from the terminal.
+
+ Parameters
+ ----------
+ data_source : str
+ Institution name (when campaign data spans more than 1 country),
+ or country (when all campaigns (or sensor networks) are inside a given country).
+ Must be UPPER CASE.
+ campaign_name : str
+ Campaign name. Must be UPPER CASE.
+ station_name : str
+ Station name
+ l0a_processing : bool
+ Whether to launch processing to generate L0A Apache Parquet file(s) from raw data.
+ The default is ``True``.
+ l0b_processing : bool
+ Whether to launch processing to generate L0B netCDF4 file(s) from L0A data.
+ The default is ``True``.
+    l0c_processing : bool
+        Whether to launch processing to generate L0C netCDF4 file(s) from L0B data.
+        The default is ``True``.
+ remove_l0a : bool
+        Whether to remove the L0A files after having generated the L0B netCDF products.
+        The default is ``False``.
+    remove_l0b : bool
+        Whether to remove the L0B files after having produced the L0C netCDF files.
+        The default is ``False``.
+ force : bool
+ If ``True``, overwrite existing data into destination directories.
+ If ``False``, raise an error if there are already data into destination directories.
+ The default is ``False``.
+ verbose : bool
+ Whether to print detailed processing information into terminal.
+        The default is ``False``.
+ parallel : bool
+ If ``True``, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread to avoid issues with the HDF/netCDF library.
+        By default, the number of processes is defined with ``os.cpu_count()``.
+        If ``False``, the files are processed sequentially in a single process.
+        If ``False``, multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If ``True``, it reduces the amount of data to process.
+ For L0A, it processes just the first 3 raw data files for each station.
+ For L0B, it processes just the first 100 rows of 3 L0A files for each station.
+ The default is ``False``.
+ base_dir : str (optional)
+ Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
+ If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
+ """
+ # ---------------------------------------------------------------------.
+ t_i = time.time()
+ print(f"L0 processing of station {station_name} has started.")
+
+ # ------------------------------------------------------------------.
+ # L0A processing
+ if l0a_processing:
+ run_disdrodb_l0a_station(
+ # Station arguments
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+ # ------------------------------------------------------------------.
+ # L0B processing
+ if l0b_processing:
+ run_disdrodb_l0b_station(
+ # Station arguments
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # L0B processing options
+ remove_l0a=remove_l0a,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+
+ # ------------------------------------------------------------------.
+ # L0C processing
+ if l0c_processing:
+ run_disdrodb_l0c_station(
+ # Station arguments
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # L0C processing options
+ remove_l0b=remove_l0b,
+ # Processing options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+
+ # -------------------------------------------------------------------------.
+ # End of L0 processing for all stations
+ timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i)))
+    print(f"L0 processing of station {station_name} completed in {timedelta_str}")
+
+
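+# Usage sketch (data source, campaign and station names are illustrative placeholders):
+# >>> run_disdrodb_l0_station("DATA_SOURCE", "CAMPAIGN_NAME", "station_name_1", parallel=False)
+
+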
+def run_disdrodb_l0a_station(
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+    """Run the L0A processing of a station calling disdrodb_run_l0a_station in the terminal."""
+ # Define command
+ cmd = " ".join(
+ [
+ "disdrodb_run_l0a_station",
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ "--force",
+ str(force),
+ "--verbose",
+ str(verbose),
+ "--debugging_mode",
+ str(debugging_mode),
+ "--parallel",
+ str(parallel),
+ "--base_dir",
+ str(base_dir),
+ ],
+ )
+ # Execute command
+ _execute_cmd(cmd)
+
+
+def run_disdrodb_l0b_station(
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # L0B processing options
+ remove_l0a: bool = False,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+ """Run the L0B processing of a station calling disdrodb_run_l0b_station in the terminal."""
+ # Define command
+ cmd = " ".join(
+ [
+ "disdrodb_run_l0b_station",
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # L0B processing options
+ "--remove_l0a",
+ str(remove_l0a),
+ # Processing options
+ "--force",
+ str(force),
+ "--verbose",
+ str(verbose),
+ "--debugging_mode",
+ str(debugging_mode),
+ "--parallel",
+ str(parallel),
+ "--base_dir",
+ str(base_dir),
+ ],
+ )
+ # Execute command
+ _execute_cmd(cmd)
+
+
+def run_disdrodb_l0c_station(
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # L0C options
+ remove_l0b: bool = False,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+    """Run the L0C processing of a station calling disdrodb_run_l0c_station in the terminal."""
+ # TODO: implement remove_l0b!
+
+ # Define command
+ cmd = " ".join(
+ [
+ "disdrodb_run_l0c_station",
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # L0C processing options
+ "--remove_l0b",
+ str(remove_l0b),
+ # Processing options
+ "--force",
+ str(force),
+ "--verbose",
+ str(verbose),
+ "--debugging_mode",
+ str(debugging_mode),
+ "--parallel",
+ str(parallel),
+ "--base_dir",
+ str(base_dir),
+ ],
+ )
+ # Execute command
+ _execute_cmd(cmd)
+
+
+def run_disdrodb_l1_station(
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+    """Run the L1 processing of a station calling disdrodb_run_l1_station in the terminal."""
+ # Define command
+ cmd = " ".join(
+ [
+ "disdrodb_run_l1_station",
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ "--force",
+ str(force),
+ "--verbose",
+ str(verbose),
+ "--debugging_mode",
+ str(debugging_mode),
+ "--parallel",
+ str(parallel),
+ "--base_dir",
+ str(base_dir),
+ ],
+ )
+ # Execute command
+ _execute_cmd(cmd)
+
+
+def run_disdrodb_l2e_station(
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+    """Run the L2E processing of a station calling disdrodb_run_l2e_station in the terminal."""
+ # Define command
+ cmd = " ".join(
+ [
+ "disdrodb_run_l2e_station",
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ "--force",
+ str(force),
+ "--verbose",
+ str(verbose),
+ "--debugging_mode",
+ str(debugging_mode),
+ "--parallel",
+ str(parallel),
+ "--base_dir",
+ str(base_dir),
+ ],
+ )
+ # Execute command
+ _execute_cmd(cmd)
+
+
+def run_disdrodb_l2m_station(
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+    """Run the L2M processing of a station calling disdrodb_run_l2m_station in the terminal."""
+ # Define command
+ cmd = " ".join(
+ [
+ "disdrodb_run_l2m_station",
+ # Station arguments
+ data_source,
+ campaign_name,
+ station_name,
+ # Processing options
+ "--force",
+ str(force),
+ "--verbose",
+ str(verbose),
+ "--debugging_mode",
+ str(debugging_mode),
+ "--parallel",
+ str(parallel),
+ "--base_dir",
+ str(base_dir),
+ ],
+ )
+ # Execute command
+ _execute_cmd(cmd)
+
+
+####--------------------------------------------------------------------------.
+#### Run DISDRODB Archive Processing
+
+
+def run_disdrodb_l0a(
+ data_sources=None,
+ campaign_names=None,
+ station_names=None,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+ """Run the L0A processing of DISDRODB stations.
+
+    This function allows launching the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB stations, it runs the processing of the
+ stations matching the provided data_sources, campaign_names and station_names.
+
+ Parameters
+ ----------
+ data_sources : list
+ Name of data source(s) to process.
+ The name(s) must be UPPER CASE.
+        If campaign_names and station_names are not specified, process all stations.
+ The default is ``None``.
+ campaign_names : list
+ Name of the campaign(s) to process.
+ The name(s) must be UPPER CASE.
+ The default is ``None``.
+ station_names : list
+ Station names to process.
+ The default is ``None``.
+ force : bool
+ If ``True``, overwrite existing data into destination directories.
+ If ``False``, raise an error if there are already data into destination directories.
+ The default is ``False``.
+ verbose : bool
+ Whether to print detailed processing information into terminal.
+        The default is ``False``.
+ parallel : bool
+ If ``True``, the files are processed simultaneously in multiple processes.
+        By default, the number of processes is defined with ``os.cpu_count()``.
+ If ``False``, the files are processed sequentially in a single process.
+ debugging_mode : bool
+ If ``True``, it reduces the amount of data to process.
+ For L0A, it processes just the first 3 raw data files.
+ The default is ``False``.
+ base_dir : str (optional)
+ Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
+ If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
+ """
+ # Define products
+ product = "L0A"
+ required_product = get_required_product(product)
+
+ # Get list of available stations
+ list_info = available_stations(
+ base_dir=base_dir,
+ product=required_product,
+ data_sources=data_sources,
+ campaign_names=campaign_names,
+ station_names=station_names,
+ raise_error_if_empty=True,
+ )
+
+ # Print message
+ n_stations = len(list_info)
+ print(f"{product} processing of {n_stations} stations started.")
+
+ # Loop over stations
+ for data_source, campaign_name, station_name in list_info:
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.")
+ # Run processing
+ run_disdrodb_l0a_station(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Process options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.")
+
+
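+# Usage sketch (processes every matching station; names are illustrative placeholders):
+# >>> run_disdrodb_l0a(data_sources=["DATA_SOURCE"], campaign_names=["CAMPAIGN_NAME"])
+
+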
+def run_disdrodb_l0b(
+ data_sources=None,
+ campaign_names=None,
+ station_names=None,
+ # L0B processing options
+ remove_l0a: bool = False,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+ """Run the L0B processing of DISDRODB stations.
+
+    This function allows launching the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB L0A stations, it runs the processing of the
+ stations matching the provided data_sources, campaign_names and station_names.
+
+ Parameters
+ ----------
+ data_sources : list
+ Name of data source(s) to process.
+ The name(s) must be UPPER CASE.
+        If campaign_names and station_names are not specified, process all stations.
+ The default is ``None``.
+ campaign_names : list
+ Name of the campaign(s) to process.
+ The name(s) must be UPPER CASE.
+ The default is ``None``.
+ station_names : list
+ Station names to process.
+ The default is ``None``.
+ remove_l0a : bool
+        Whether to remove the L0A files after having generated the L0B netCDF products.
+ The default is ``False``.
+ force : bool
+ If ``True``, overwrite existing data into destination directories.
+ If ``False``, raise an error if there are already data into destination directories.
+ The default is ``False``.
+ verbose : bool
+ Whether to print detailed processing information into terminal.
+        The default is ``False``.
+ parallel : bool
+ If ``True``, the files are processed simultaneously in multiple processes.
+        By default, the number of processes is defined with ``os.cpu_count()``.
+ If ``False``, the files are processed sequentially in a single process.
+ debugging_mode : bool
+ If ``True``, it reduces the amount of data to process.
+ For L0B, it processes just the first 100 rows of 3 L0A files.
+ The default is ``False``.
+ base_dir : str (optional)
+ Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
+ If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
+ """
+ # Define products
+ product = "L0B"
+ required_product = get_required_product(product)
+
+ # Get list of available stations
+ list_info = available_stations(
+ base_dir=base_dir,
+ product=required_product,
+ data_sources=data_sources,
+ campaign_names=campaign_names,
+ station_names=station_names,
+ raise_error_if_empty=True,
+ )
+
+ # Print message
+ n_stations = len(list_info)
+ print(f"{product} processing of {n_stations} stations started.")
+
+ # Loop over stations
+ for data_source, campaign_name, station_name in list_info:
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.")
+ # Run processing
+ run_disdrodb_l0b_station(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # L0B options
+ remove_l0a=remove_l0a,
+ # Process options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.")
+
+
+def run_disdrodb_l0c(
+ data_sources=None,
+ campaign_names=None,
+ station_names=None,
+ # L0C options
+ remove_l0b: bool = False,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+ """Run the L0C processing of DISDRODB stations.
+
+    This function allows launching the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB stations, it runs the processing of the
+ stations matching the provided data_sources, campaign_names and station_names.
+
+ Parameters
+ ----------
+ data_sources : list
+ Name of data source(s) to process.
+ The name(s) must be UPPER CASE.
+        If campaign_names and station_names are not specified, process all stations.
+ The default is ``None``.
+ campaign_names : list
+ Name of the campaign(s) to process.
+ The name(s) must be UPPER CASE.
+ The default is ``None``.
+ station_names : list
+ Station names to process.
+ The default is ``None``.
+ remove_l0b : bool
+ Whether to remove the L0B files after having produced L0C netCDF files.
+        The default is ``False``.
+ force : bool
+ If ``True``, overwrite existing data into destination directories.
+ If ``False``, raise an error if there are already data into destination directories.
+ The default is ``False``.
+ verbose : bool
+ Whether to print detailed processing information into terminal.
+ The default is ``False``.
+ parallel : bool
+ If ``True``, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread to avoid issues with the HDF/netCDF library.
+        By default, the number of processes is defined with ``os.cpu_count()``.
+        If ``False``, the files are processed sequentially in a single process.
+        If ``False``, multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If ``True``, it reduces the amount of data to process.
+        For L0C, it processes just 3 L0B files.
+ The default is ``False``.
+ base_dir : str (optional)
+ Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
+ If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
+ """
+ # Define products
+ product = "L0C"
+ required_product = get_required_product(product)
+
+ # Get list of available stations
+ list_info = available_stations(
+ base_dir=base_dir,
+ product=required_product,
+ data_sources=data_sources,
+ campaign_names=campaign_names,
+ station_names=station_names,
+ raise_error_if_empty=True,
+ )
+
+ # Print message
+ n_stations = len(list_info)
+ print(f"{product} processing of {n_stations} stations started.")
+
+ # Loop over stations
+ for data_source, campaign_name, station_name in list_info:
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.")
+ # Run processing
+ run_disdrodb_l0c_station(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # L0C options
+ remove_l0b=remove_l0b,
+ # Process options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.")
+
+
+def run_disdrodb_l0(
+ data_sources=None,
+ campaign_names=None,
+ station_names=None,
+ # L0 archive options
+ l0a_processing: bool = True,
+ l0b_processing: bool = True,
+ l0c_processing: bool = True,
+ remove_l0a: bool = False,
+ remove_l0b: bool = False,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+ """Run the L0 processing of DISDRODB stations.
+
+ This function launches the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB stations, it processes the
+ stations matching the provided data_sources, campaign_names and station_names.
+
+ Parameters
+ ----------
+ data_sources : list
+ Name of data source(s) to process.
+ The name(s) must be UPPER CASE.
+ If campaign_names and station_names are not specified, all stations are processed.
+ The default is ``None``.
+ campaign_names : list
+ Name of the campaign(s) to process.
+ The name(s) must be UPPER CASE.
+ The default is ``None``.
+ station_names : list
+ Station names to process.
+ The default is ``None``.
+ l0a_processing : bool
+ Whether to launch processing to generate L0A Apache Parquet file(s) from raw data.
+ The default is ``True``.
+ l0b_processing : bool
+ Whether to launch processing to generate L0B netCDF4 file(s) from L0A data.
+ The default is ``True``.
+ l0c_processing : bool
+ Whether to launch processing to generate L0C netCDF4 file(s) from L0B data.
+ The default is ``True``.
+ remove_l0a : bool
+ Whether to remove the L0A files after having generated the L0B netCDF products.
+ The default is ``False``.
+ remove_l0b : bool
+ Whether to remove the L0B files after having produced all L0C netCDF files.
+ The default is ``False``.
+ force : bool
+ If ``True``, overwrite existing data into destination directories.
+ If ``False``, raise an error if there are already data into destination directories.
+ The default is ``False``.
+ verbose : bool
+ Whether to print detailed processing information to the terminal.
+ The default is ``False``.
+ parallel : bool
+ If ``True``, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread to avoid issues with the HDF/netCDF library.
+ By default, the number of processes is defined with ``os.cpu_count()``.
+ If ``False``, the files are processed sequentially in a single process,
+ and multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If ``True``, it reduces the amount of data to process.
+ For L0A, it processes just the first 3 raw data files.
+ For L0B, it processes just the first 100 rows of 3 L0A files.
+ The default is ``False``.
+ base_dir : str (optional)
+ Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
+ If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
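+
+ Examples
+ --------
+ A minimal usage sketch; the data source and campaign name below are illustrative
+ and must match stations available in the local DISDRODB archive:
+
+ >>> run_disdrodb_l0(
+ ... data_sources=["EPFL"],
+ ... campaign_names=["PARSIVEL_2007"],
+ ... parallel=True,
+ ... )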
+ """
+ # Define starting product
+ if l0c_processing:
+ required_product = get_required_product("L0C")
+ if l0b_processing:
+ required_product = get_required_product("L0B")
+ if l0a_processing:
+ required_product = get_required_product("L0A")
+
+ # Get list of available stations
+ list_info = available_stations(
+ base_dir=base_dir,
+ product=required_product,
+ data_sources=data_sources,
+ campaign_names=campaign_names,
+ station_names=station_names,
+ raise_error_if_empty=True,
+ )
+
+ # Print message
+ n_stations = len(list_info)
+ print(f"L0 processing of {n_stations} stations started.")
+
+ # Loop over stations
+ for data_source, campaign_name, station_name in list_info:
+ print(f"L0 processing of {data_source} {campaign_name} {station_name} station started.")
+ # Run processing
+ run_disdrodb_l0_station(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # L0 archive options
+ l0a_processing=l0a_processing,
+ l0b_processing=l0b_processing,
+ l0c_processing=l0c_processing,
+ remove_l0a=remove_l0a,
+ remove_l0b=remove_l0b,
+ # Process options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+ print(f"L0 processing of {data_source} {campaign_name} {station_name} station ended.")
+
+
+def run_disdrodb_l1(
+ data_sources=None,
+ campaign_names=None,
+ station_names=None,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+ """Run the L1 processing of DISDRODB stations.
+
+ This function launches the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB stations, it processes the
+ stations matching the provided data_sources, campaign_names and station_names.
+
+ Parameters
+ ----------
+ data_sources : list
+ Name of data source(s) to process.
+ The name(s) must be UPPER CASE.
+ If campaign_names and station_names are not specified, all stations are processed.
+ The default is ``None``.
+ campaign_names : list
+ Name of the campaign(s) to process.
+ The name(s) must be UPPER CASE.
+ The default is ``None``.
+ station_names : list
+ Station names to process.
+ The default is ``None``.
+ force : bool
+ If ``True``, overwrite existing data into destination directories.
+ If ``False``, raise an error if there are already data into destination directories.
+ The default is ``False``.
+ verbose : bool
+ Whether to print detailed processing information to the terminal.
+ The default is ``False``.
+ parallel : bool
+ If ``True``, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread to avoid issues with the HDF/netCDF library.
+ By default, the number of processes is defined with ``os.cpu_count()``.
+ If ``False``, the files are processed sequentially in a single process,
+ and multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If ``True``, it reduces the amount of data to process.
+ For L1, it processes just 3 L0C files.
+ The default is ``False``.
+ base_dir : str (optional)
+ Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
+ If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
+ """
+ product = "L1"
+ required_product = get_required_product(product)
+
+ # Get list of available stations
+ list_info = available_stations(
+ base_dir=base_dir,
+ product=required_product,
+ data_sources=data_sources,
+ campaign_names=campaign_names,
+ station_names=station_names,
+ raise_error_if_empty=True,
+ )
+
+ # Print message
+ n_stations = len(list_info)
+ print(f"{product} processing of {n_stations} stations started.")
+
+ # Loop over stations
+ for data_source, campaign_name, station_name in list_info:
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.")
+ # Run processing
+ run_disdrodb_l1_station(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Process options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.")
+
+
+def run_disdrodb_l2e(
+ data_sources=None,
+ campaign_names=None,
+ station_names=None,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+ """Run the L2E processing of DISDRODB stations.
+
+ This function launches the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB stations, it processes the
+ stations matching the provided data_sources, campaign_names and station_names.
+
+ Parameters
+ ----------
+ data_sources : list
+ Name of data source(s) to process.
+ The name(s) must be UPPER CASE.
+ If campaign_names and station_names are not specified, all stations are processed.
+ The default is ``None``.
+ campaign_names : list
+ Name of the campaign(s) to process.
+ The name(s) must be UPPER CASE.
+ The default is ``None``.
+ station_names : list
+ Station names to process.
+ The default is ``None``.
+ force : bool
+ If ``True``, overwrite existing data into destination directories.
+ If ``False``, raise an error if there are already data into destination directories.
+ The default is ``False``.
+ verbose : bool
+ Whether to print detailed processing information to the terminal.
+ The default is ``False``.
+ parallel : bool
+ If ``True``, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread to avoid issues with the HDF/netCDF library.
+ By default, the number of processes is defined with ``os.cpu_count()``.
+ If ``False``, the files are processed sequentially in a single process,
+ and multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If ``True``, it reduces the amount of data to process.
+ For L2E, it processes just 3 L1 files.
+ The default is ``False``.
+ base_dir : str (optional)
+ Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
+ If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
+ """
+ product = "L2E"
+ required_product = get_required_product(product)
+
+ # Get list of available stations
+ list_info = available_stations(
+ base_dir=base_dir,
+ product=required_product,
+ data_sources=data_sources,
+ campaign_names=campaign_names,
+ station_names=station_names,
+ raise_error_if_empty=True,
+ )
+
+ # Print message
+ n_stations = len(list_info)
+ print(f"{product} processing of {n_stations} stations started.")
+
+ # Loop over stations
+ for data_source, campaign_name, station_name in list_info:
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.")
+ # Run processing
+ run_disdrodb_l2e_station(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Process options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.")
+
+
+def run_disdrodb_l2m(
+ data_sources=None,
+ campaign_names=None,
+ station_names=None,
+ # Processing options
+ force: bool = False,
+ verbose: bool = False,
+ debugging_mode: bool = False,
+ parallel: bool = True,
+ base_dir: Optional[str] = None,
+):
+ """Run the L2M processing of DISDRODB stations.
+
+ This function launches the processing of many DISDRODB stations with a single command.
+ From the list of all available DISDRODB stations, it processes the
+ stations matching the provided data_sources, campaign_names and station_names.
+
+ Parameters
+ ----------
+ data_sources : list
+ Name of data source(s) to process.
+ The name(s) must be UPPER CASE.
+ If campaign_names and station_names are not specified, all stations are processed.
+ The default is ``None``.
+ campaign_names : list
+ Name of the campaign(s) to process.
+ The name(s) must be UPPER CASE.
+ The default is ``None``.
+ station_names : list
+ Station names to process.
+ The default is ``None``.
+ force : bool
+ If ``True``, overwrite existing data into destination directories.
+ If ``False``, raise an error if there are already data into destination directories.
+ The default is ``False``.
+ verbose : bool
+ Whether to print detailed processing information to the terminal.
+ The default is ``False``.
+ parallel : bool
+ If ``True``, the files are processed simultaneously in multiple processes.
+ Each process will use a single thread to avoid issues with the HDF/netCDF library.
+ By default, the number of processes is defined with ``os.cpu_count()``.
+ If ``False``, the files are processed sequentially in a single process,
+ and multi-threading is automatically exploited to speed up I/O tasks.
+ debugging_mode : bool
+ If ``True``, it reduces the amount of data to process.
+ For L2M, it processes just 3 L2E files.
+ The default is ``False``.
+ base_dir : str (optional)
+ Base directory of DISDRODB. Format: ``<...>/DISDRODB``.
+ If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used.
+ """
+ product = "L2M"
+ required_product = get_required_product(product)
+
+ # Get list of available stations
+ list_info = available_stations(
+ base_dir=base_dir,
+ product=required_product,
+ data_sources=data_sources,
+ campaign_names=campaign_names,
+ station_names=station_names,
+ raise_error_if_empty=True,
+ )
+
+ # Print message
+ n_stations = len(list_info)
+ print(f"{product} processing of {n_stations} stations started.")
+
+ # Loop over stations
+ for data_source, campaign_name, station_name in list_info:
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.")
+ # Run processing
+ run_disdrodb_l2m_station(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Process options
+ force=force,
+ verbose=verbose,
+ debugging_mode=debugging_mode,
+ parallel=parallel,
+ )
+ print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.")
+
+
+####--------------------------------------------------------------------------.
diff --git a/disdrodb/scattering/__init__.py b/disdrodb/scattering/__init__.py
new file mode 100644
index 00000000..e79aa02d
--- /dev/null
+++ b/disdrodb/scattering/__init__.py
@@ -0,0 +1,28 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Implement PSD scattering routines."""
+
+
+from disdrodb.scattering.axis_ratio import available_axis_ratio, get_axis_ratio
+from disdrodb.scattering.routines import available_radar_bands, get_radar_parameters
+
+__all__ = [
+ "available_radar_bands",
+ "available_axis_ratio",
+ "get_axis_ratio",
+ "get_radar_parameters",
+]
diff --git a/disdrodb/scattering/axis_ratio.py b/disdrodb/scattering/axis_ratio.py
new file mode 100644
index 00000000..9542c2a0
--- /dev/null
+++ b/disdrodb/scattering/axis_ratio.py
@@ -0,0 +1,345 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Implement drop axis ratio theoretical models."""
+
+import numpy as np
+import xarray as xr
+
+
+def available_axis_ratio():
+ """Return a list of the available drop axis ratio methods."""
+ return list(AXIS_RATIO_METHODS)
+
+
+def get_axis_ratio_method(method):
+ """Return the specified drop axis ratio method."""
+ method = check_axis_ratio(method)
+ return AXIS_RATIO_METHODS[method]
+
+
+def check_axis_ratio(method):
+ """Check validity of the specified drop axis ratio method."""
+ available_methods = available_axis_ratio()
+ if method not in available_methods:
+ raise ValueError(f"{method} is an invalid axis-ratio method. Valid methods: {available_methods}.")
+ return method
+
+
+def get_axis_ratio(diameter, method):
+ """
+ Compute the axis ratio of raindrops using the specified method.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Raindrops diameter in mm.
+ method : str
+ The method to use for calculating the axis ratio. Available methods are:
+ 'Thurai2005', 'Thurai2007', 'Battaglia2010', 'Brandes2002',
+ 'Pruppacher1970', 'Beard1987', 'Andsager1999'.
+
+ Returns
+ -------
+ axis_ratio : array-like
+ Calculated axis ratios corresponding to the input diameters.
+
+ Raises
+ ------
+ ValueError
+ If the specified method is not one of the available methods.
+
+ Notes
+ -----
+ This function serves as a wrapper to various axis ratio models for raindrops.
+ It selects and applies the appropriate model based on the `method` parameter.
+
+ Examples
+ --------
+ >>> diameter = np.array([0.5, 1.0, 2.0, 3.0])
+ >>> axis_ratio = get_axis_ratio(diameter, method="Brandes2002")
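+
+ The implemented method names can be listed with ``available_axis_ratio``:
+
+ >>> available_axis_ratio()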
+
+ """
+ # Retrieve axis ratio function
+ func = get_axis_ratio_method(method)
+
+ # Retrieve axis ratio
+ axis_ratio = func(diameter)
+
+ # Clip values between 0 and 1
+ axis_ratio = np.clip(axis_ratio, 0, 1)
+ return axis_ratio
+
+
+def get_axis_ratio_andsager_1999(diameter):
+ """
+ Compute the axis ratio of raindrops using the Andsager et al. (1999) method.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+
+ Returns
+ -------
+ axis_ratio : array-like
+ Calculated axis ratios corresponding to the input diameters.
+
+ Notes
+ -----
+ This function calculates the axis ratio of raindrops based on the method described
+ in Andsager et al. (1999). For diameters between 1.1 mm and 4.4 mm, it uses the
+ average axis-ratio relationship given by Kubesh and Beard (1993):
+
+ axis_ratio = 1.012 - 0.144 * D - 1.03 * D^2
+
+ where D is the drop diameter in centimeters.
+
+ For diameters outside this range (0.1 mm to 1.1 mm and 4.4 mm to 7.0 mm),
+ it uses the equilibrium shape equation from Beard and Chuang (1987).
+
+ References
+ ----------
+ Andsager, K., Beard, K. V., & Laird, N. F. (1999).
+ Laboratory measurements of axis ratios for large raindrops.
+ Journal of the Atmospheric Sciences, 56(15), 2673-2683.
+
+ Kubesh, R. J., & Beard, K. V. (1993).
+ Laboratory measurements of spontaneous oscillations for moderate-size raindrops.
+ Journal of the Atmospheric Sciences, 50(7), 1089-1098.
+
+ Beard, K. V., & Chuang, C. (1987).
+ A new model for the equilibrium shape of raindrops.
+ Journal of the Atmospheric Sciences, 44(11), 1509-1524.
+
+ """
+ # Convert diameter to centimeters
+ diameter_cm = diameter * 0.1
+
+ # Axis ratio for diameters outside 1.1 mm to 4.4 mm using equilibrium model
+ axis_ratio_equilibrium = get_axis_ratio_beard_1987(diameter)
+
+ # Axis ratio for diameters between 1.1 mm and 4.4 mm using Kubesh & Beard (1993) model
+ axis_ratio_kubesh = 1.012 - 0.144 * diameter_cm - 1.03 * diameter_cm**2
+
+ # Combine models based on diameter ranges
+ axis_ratio = xr.where(
+ (diameter >= 1.1) & (diameter < 4.4), # diameter thresholds in mm
+ axis_ratio_kubesh,
+ axis_ratio_equilibrium,
+ )
+
+ return axis_ratio
+
+
+def get_axis_ratio_battaglia_2010(diameter):
+ """
+ Compute the axis ratio of raindrops using the Battaglia et al. (2010) method.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+
+ Returns
+ -------
+ axis_ratio : array-like
+ Calculated axis ratios corresponding to the input diameters.
+
+ Notes
+ -----
+ - For diameters less than or equal to 1 mm, the axis ratio is constant at 1.0.
+ - For diameters greater than or equal to 5 mm, the axis ratio is constant at 0.7.
+ - Between 1 mm and 5 mm, the axis ratio varies linearly.
+
+ The axis ratio is calculated using the equation:
+
+ axis_ratio = 1.075 - 0.075 * D
+
+ where **D** is the diameter in millimeters.
+
+ References
+ ----------
+ Battaglia, A., Rustemeier, E., Tokay, A., Blahak, U., & Simmer, C. (2010).
+ PARSIVEL Snow Observations: A Critical Assessment.
+ Journal of Atmospheric and Oceanic Technology, 27(2), 333-344.
+ https://doi.org/10.1175/2009JTECHA1332.1
+
+ """
+ axis_ratio = 1.075 - 0.075 * diameter
+ axis_ratio = xr.where(diameter <= 1, 1.0, axis_ratio)
+ axis_ratio = xr.where(diameter >= 5, 0.7, axis_ratio)
+ return axis_ratio
+
+
+def get_axis_ratio_beard_1987(diameter):
+ """
+ Compute the axis ratio of raindrops using the Beard and Chuang (1987) method.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+
+ Returns
+ -------
+ axis_ratio : array-like
+ Calculated axis ratios corresponding to the input diameters.
+
+ Notes
+ -----
+ The formula is a polynomial fit to the numerical model of Beard and Chuang (1987), with
+ drop diameters between 1 and 7 mm.
+
+ References
+ ----------
+ Beard, K. V., & Chuang, C. (1987).
+ A new model for the equilibrium shape of raindrops.
+ Journal of the Atmospheric Sciences, 44(11), 1509-1524.
+ https://doi.org/10.1175/1520-0469(1987)044<1509:ANMFTE>2.0.CO;2
+ """
+ return 1.0048 + 5.7e-04 * diameter - 2.628e-02 * diameter**2 + 3.682e-03 * diameter**3 - 1.677e-04 * diameter**4
+
+
+def get_axis_ratio_brandes_2002(diameter):
+ """
+ Compute the axis ratio of raindrops using the Brandes et al. (2002) method.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+
+ Returns
+ -------
+ axis_ratio : array-like
+ Calculated axis ratios corresponding to the input diameters.
+
+ References
+ ----------
+ Brandes, E. A., Zhang, G., & Vivekanandan, J. (2002).
+ Experiments in rainfall estimation with a polarimetric radar in a subtropical environment.
+ Journal of Applied Meteorology, 41(6), 674-685.
+ https://doi.org/10.1175/1520-0450(2002)041<0674:EIREWA>2.0.CO;2
+
+ Brandes, et al. 2005: On the Influence of Assumed Drop Size Distribution Form
+ on Radar-Retrieved Thunderstorm Microphysics. J. Appl. Meteor. Climatol., 45, 259-268.
+ """
+ # Valid for drop diameters between 0.1 to 8.1 mm
+ axis_ratio = 0.9951 + 0.0251 * diameter - 0.03644 * diameter**2 + 0.005303 * diameter**3 - 0.0002492 * diameter**4
+ return axis_ratio
+
+
+def get_axis_ratio_pruppacher_1970(diameter):
+ """
+ Compute the axis ratio of raindrops using the Pruppacher and Pitter (1971) method.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+
+ Returns
+ -------
+ axis_ratio : array-like
+ Calculated axis ratios corresponding to the input diameters.
+
+ Notes
+ -----
+ This formula is a linear fit to wind tunnel data of Pruppacher and Pitter (1971) with
+ drop diameters between 1 and 9 mm.
+
+ References
+ ----------
+ Pruppacher, H. R., & Pitter, R. L. (1971).
+ A Semi-Empirical Determination of the Shape of Cloud and Precipitation Drops.
+ Journal of the Atmospheric Sciences, 28(1), 86-94.
+ https://doi.org/10.1175/1520-0469(1971)028<0086:ASEDOT>2.0.CO;2
+ """
+ axis_ratio = 1.03 - 0.062 * diameter
+ return axis_ratio
+
+
+def get_axis_ratio_thurai_2005(diameter):
+ """
+ Compute the axis ratio of raindrops using the Thurai et al. (2005) method.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+
+ Returns
+ -------
+ axis_ratio : array-like
+ Calculated axis ratios corresponding to the input diameters.
+
+ References
+ ----------
+ Thurai, M., and V. N. Bringi, 2005: Drop Axis Ratios from a 2D Video Disdrometer.
+ J. Atmos. Oceanic Technol., 22, 966-978, https://doi.org/10.1175/JTECH1767.1
+
+ """
+ # Valid between 1 and 5 mm
+ axis_ratio = 0.9707 + 4.26e-2 * diameter - 4.29e-2 * diameter**2 + 6.5e-3 * diameter**3 - 3e-4 * diameter**4
+ return axis_ratio
+
+
+def get_axis_ratio_thurai_2007(diameter):
+ """
+ Compute the axis ratio of raindrops using the Thurai et al. (2007) method.
+
+ Parameters
+ ----------
+ diameter : array-like
+ Diameter of the raindrops in millimeters.
+
+ Returns
+ -------
+ axis_ratio : array-like
+ Calculated axis ratios corresponding to the input diameters.
+
+ References
+ ----------
+ Thurai, M., G. J. Huang, V. N. Bringi, W. L. Randeu, and M. Schönhuber, 2007:
+ Drop Shapes, Model Comparisons, and Calculations of Polarimetric Radar Parameters in Rain.
+ J. Atmos. Oceanic Technol., 24, 1019-1032, https://doi.org/10.1175/JTECH2051.1
+
+ """
+ # Assume spherical drop when diameter < 0.7 mm
+ axis_ratio_below_0_7 = 1
+ # Beard and Kubesh (1991) for drops diameter between 0.7 mm and 1.5 mm
+ axis_ratio_below_1_5 = (
+ 1.173 - 0.5165 * diameter + 0.4698 * diameter**2 - 0.1317 * diameter**3 - 8.5e-3 * diameter**4
+ )
+ # Formula fitted on measurements of Thurai et al., 2005 for drop diameter above 1.5 mm
+ # --> This is very similar to Pruppacher1970 !
+ axis_ratio_above_1_5 = (
+ 1.065 - 6.25e-2 * diameter - 3.99e-3 * diameter**2 + 7.66e-4 * diameter**3 - 4.095e-5 * diameter**4
+ )
+ # Combine axis ratio
+ axis_ratio_below_1_5 = xr.where(diameter > 0.7, axis_ratio_below_1_5, axis_ratio_below_0_7)
+ axis_ratio = xr.where(diameter > 1.5, axis_ratio_above_1_5, axis_ratio_below_1_5)
+ return axis_ratio
+
+
+AXIS_RATIO_METHODS = {
+ "Thurai2005": get_axis_ratio_thurai_2005,
+ "Thurai2007": get_axis_ratio_thurai_2007,
+ "Battaglia2010": get_axis_ratio_battaglia_2010,
+ "Brandes2002": get_axis_ratio_brandes_2002,
+ "Pruppacher1970": get_axis_ratio_pruppacher_1970,
+ "Beard1987": get_axis_ratio_beard_1987,
+ "Andsager1999": get_axis_ratio_andsager_1999,
+}
diff --git a/disdrodb/scattering/routines.py b/disdrodb/scattering/routines.py
new file mode 100644
index 00000000..acb9b571
--- /dev/null
+++ b/disdrodb/scattering/routines.py
@@ -0,0 +1,450 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Implement PSD scattering routines."""
+
+import itertools
+
+import dask
+import numpy as np
+import xarray as xr
+from pytmatrix import orientation, radar, refractive, tmatrix_aux
+from pytmatrix.psd import BinnedPSD, PSDIntegrator
+from pytmatrix.tmatrix import Scatterer
+
+from disdrodb.psd.models import create_psd, get_required_parameters
+from disdrodb.scattering.axis_ratio import check_axis_ratio, get_axis_ratio_method
+from disdrodb.utils.warnings import suppress_warnings
+
+# Wavelengths for which the refractive index is defined in pytmatrix (in mm)
+wavelength_dict = {
+ "S": tmatrix_aux.wl_S,
+ "C": tmatrix_aux.wl_C,
+ "X": tmatrix_aux.wl_X,
+ "Ku": tmatrix_aux.wl_Ku,
+ "Ka": tmatrix_aux.wl_Ka,
+ "W": tmatrix_aux.wl_W,
+}
+
+
+def available_radar_bands():
+ """Return a list of the available radar bands."""
+ return list(wavelength_dict)
+
+
+def check_radar_band(radar_band):
+ """Check the validity of the specified radar band."""
+ available_bands = available_radar_bands()
+ if radar_band not in available_bands:
+ raise ValueError(f"{radar_band} is an invalid radar band. Valid radar bands: {available_bands}.")
+ return radar_band
+
+
+def get_radar_wavelength(radar_band):
+ """Get the wavelength of a radar band."""
+ wavelength = wavelength_dict[radar_band]
+ return wavelength
+
+
+def initialize_scatterer(wavelength, canting_angle_std=7, D_max=8, axis_ratio="Thurai2007"):
+ """Initialize T-matrix scatterer object for a given wavelength."""
+ # Retrieve custom axis ratio function
+ axis_ratio_func = get_axis_ratio_method(axis_ratio)
+
+ # Retrieve water complex refractive index
+ # - Here we currently assume 10 °C
+ # - m_w_0C and m_w_20C are also available
+ # TODO: should this be another dimension? Or use scatterer.psd_integrator.m_func?
+ water_refractive_index = refractive.m_w_10C[wavelength]
+
+ # ---------------------------------------------------------------.
+ # Initialize Scatterer class
+ scatterer = Scatterer(wavelength=wavelength, m=water_refractive_index)
+ # - Define particle orientation PDF for orientational averaging
+ # --> The standard deviation of the angle with respect to vertical orientation (the canting angle).
+ scatterer.or_pdf = orientation.gaussian_pdf(std=canting_angle_std)
+ # - Define orientation methods
+ # --> Alternatives: orient_averaged_fixed, orient_single
+ scatterer.orient = orientation.orient_averaged_fixed
+
+ # ---------------------------------------------------------------.
+ # Initialize PSDIntegrator
+ scatterer.psd_integrator = PSDIntegrator()
+ # - Define axis_ratio_func
+ # --> The Scatterer class expects horizontal to vertical
+ scatterer.psd_integrator.axis_ratio_func = lambda D: 1.0 / axis_ratio_func(D)
+ # - Define function to compute refractive index (as a function of D)
+ # scatterer.psd_integrator.m_func = None # Use constant value of scatterer.m
+ # - Define number of points over which to integrate
+ scatterer.psd_integrator.num_points = 1024
+ # - Define maximum drop diameter
+ scatterer.psd_integrator.D_max = D_max
+ # - Define geometries
+ scatterer.psd_integrator.geometries = (tmatrix_aux.geom_horiz_back, tmatrix_aux.geom_horiz_forw)
+ # ---------------------------------------------------------------.
+ # Initialize scattering table
+ scatterer.psd_integrator.init_scatter_table(scatterer)
+ return scatterer
+
+
+def compute_radar_variables(scatterer):
+ """Compute radar variables for a given scatter object with a specified PSD.
+
+ To speed up computations, this function should input a scatterer object with
+ a preinitialized scattering table.
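+
+ A minimal usage sketch, assuming ``bin_edges`` and ``drop_number_concentration``
+ (illustrative names) are already defined as 1D arrays:
+
+ >>> scatterer = initialize_scatterer(wavelength=tmatrix_aux.wl_C)
+ >>> scatterer.psd = BinnedPSD(bin_edges, drop_number_concentration)
+ >>> radar_vars = compute_radar_variables(scatterer)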
+ """
+ # Compute radar parameters
+ radar_vars = {}
+ scatterer.set_geometry(tmatrix_aux.geom_horiz_back)
+ radar_vars["Zh"] = 10 * np.log10(radar.refl(scatterer, h_pol=True)) # dBZ
+ radar_vars["Zdr"] = 10 * np.log10(radar.Zdr(scatterer)) # dB
+ radar_vars["rho_hv"] = radar.rho_hv(scatterer)
+ radar_vars["ldr"] = radar.ldr(scatterer)
+ scatterer.set_geometry(tmatrix_aux.geom_horiz_forw)
+ radar_vars["Kdp"] = radar.Kdp(scatterer)
+ radar_vars["Ai"] = radar.Ai(scatterer)
+ return radar_vars
+
+
+def _estimate_empirical_radar_parameters(
+ drop_number_concentration,
+ bin_edges,
+ scatterer,
+ output_dictionary,
+):
+ # Initialize bad results
+ if output_dictionary:
+ null_output = {"Zh": np.nan, "Zdr": np.nan, "rho_hv": np.nan, "ldr": np.nan, "Kdp": np.nan, "Ai": np.nan}
+ else:
+ null_output = np.array([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
+
+ # Assign PSD model to the scatterer object
+ scatterer.psd = BinnedPSD(bin_edges, drop_number_concentration)
+
+ # Get radar variables
+ with suppress_warnings():
+ try:
+ radar_vars = compute_radar_variables(scatterer)
+ output = radar_vars if output_dictionary else np.array(list(radar_vars.values()))
+ except Exception:
+ output = null_output
+ return output
+
+
+def _estimate_model_radar_parameters(
+ parameters,
+ psd_model,
+ psd_parameters_names,
+ scatterer,
+ output_dictionary,
+):
+ # Initialize bad results
+ if output_dictionary:
+ null_output = {"Zh": np.nan, "Zdr": np.nan, "rho_hv": np.nan, "ldr": np.nan, "Kdp": np.nan, "Ai": np.nan}
+ else:
+ null_output = np.array([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
+
+ # Assign PSD model to the scatterer object
+ parameters = dict(zip(psd_parameters_names, parameters))
+ scatterer.psd = create_psd(psd_model, parameters)
+
+ # Get radar variables
+ with suppress_warnings():
+ try:
+ radar_vars = compute_radar_variables(scatterer)
+ output = radar_vars if output_dictionary else np.array(list(radar_vars.values()))
+ except Exception:
+ output = null_output
+ return output
+
+
+def get_psd_parameters(ds):
+ """Return a xr.Dataset with the PSD parameters."""
+ psd_model = ds.attrs["disdrodb_psd_model"]
+ required_parameters = get_required_parameters(psd_model)
+ missing_parameters = [param for param in required_parameters if param not in ds]
+ if len(missing_parameters) > 0:
+ raise ValueError(f"The {psd_model} parameters {missing_parameters} are not present in the dataset.")
+ return ds[required_parameters]
+
+
+def get_model_radar_parameters(
+ ds,
+ radar_band,
+ canting_angle_std=7,
+ diameter_max=8,
+ axis_ratio="Thurai2007",
+):
+ """Compute radar parameters from a PSD model.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Dataset containing the parameters of the PSD model.
+ The dataset attribute disdrodb_psd_model specifies the PSD model to use.
+ radar_band : str
+ Radar band to be used.
+ canting_angle_std : float, optional
+ Standard deviation of the canting angle. The default value is 7.
+ diameter_max : float, optional
+ Maximum diameter. The default value is 8 mm.
+ axis_ratio : str, optional
+ Method to compute the axis ratio. The default method is ``Thurai2007``.
+
+ Returns
+ -------
+ xarray.Dataset
+ Dataset containing the computed radar parameters.
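+
+ Examples
+ --------
+ A minimal sketch, assuming ``ds`` is a DISDRODB L2M dataset whose
+ ``disdrodb_psd_model`` attribute names the fitted PSD model:
+
+ >>> ds_radar = get_model_radar_parameters(ds, radar_band="C")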
+ """
+ # Retrieve psd model and parameters.
+ psd_model = ds.attrs["disdrodb_psd_model"]
+ required_parameters = get_required_parameters(psd_model)
+ ds_parameters = get_psd_parameters(ds)
+
+ # Check argument validity
+ axis_ratio = check_axis_ratio(axis_ratio)
+ radar_band = check_radar_band(radar_band)
+
+ # Retrieve wavelengths in mm
+ wavelength = get_radar_wavelength(radar_band)
+
+ # Create DataArray with PSD parameters
+ da_parameters = ds_parameters.to_array(dim="psd_parameters").compute()
+
+ # Initialize scattering table
+ scatterer = initialize_scatterer(
+ wavelength=wavelength,
+ canting_angle_std=canting_angle_std,
+ D_max=diameter_max,
+ axis_ratio=axis_ratio,
+ )
+
+ # Define kwargs
+ kwargs = {
+ "output_dictionary": False,
+ "psd_model": psd_model,
+ "psd_parameters_names": required_parameters,
+ "scatterer": scatterer,
+ }
+
+ # Loop over each PSD (not in parallel --> dask="forbidden")
+ # - Initializing the scatterer costs much more than looping over the timesteps!
+ da_radar = xr.apply_ufunc(
+ _estimate_model_radar_parameters,
+ da_parameters,
+ kwargs=kwargs,
+ input_core_dims=[["psd_parameters"]],
+ output_core_dims=[["radar_variables"]],
+ vectorize=True,
+ dask="forbidden",
+ dask_gufunc_kwargs={"output_sizes": {"radar_variables": 6}}, # lengths of the new output_core_dims dimensions.
+ output_dtypes=["float64"],
+ )
+
+ # Add parameters coordinates
+ da_radar = da_radar.assign_coords({"radar_variables": ["Zh", "Zdr", "rho_hv", "ldr", "Kdp", "Ai"]})
+
+ # Create parameters dataset
+ ds_radar = da_radar.to_dataset(dim="radar_variables")
+
+ # Expand dimensions for later merging
+ dims_dict = {
+ "radar_band": [radar_band],
+ "axis_ratio": [axis_ratio],
+ "canting_angle_std": [canting_angle_std],
+ "diameter_max": [diameter_max],
+ }
+ ds_radar = ds_radar.expand_dims(dim=dims_dict)
+ return ds_radar
+
+
+def get_empirical_radar_parameters(
+ ds,
+ radar_band=None,
+ canting_angle_std=7,
+ diameter_max=8,
+ axis_ratio="Thurai2007",
+):
+ """Compute radar parameters from empirical drop number concentration.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Dataset containing the drop number concentration variable.
+ radar_band : str
+ Radar band to be used.
+ canting_angle_std : float, optional
+ Standard deviation of the canting angle. The default value is 7.
+ diameter_max : float, optional
+ Maximum diameter. The default value is 8 mm.
+ axis_ratio : str, optional
+ Method to compute the axis ratio. The default method is ``Thurai2007``.
+
+ Returns
+ -------
+ xarray.Dataset
+ Dataset containing the computed radar parameters.
+ """
+ # Define inputs
+ da_drop_number_concentration = ds["drop_number_concentration"].compute()
+
+ # Define bin edges
+ bin_edges = np.append(ds["diameter_bin_lower"].compute().data, ds["diameter_bin_upper"].compute().data[-1])
+
+ # Check argument validity
+ axis_ratio = check_axis_ratio(axis_ratio)
+ radar_band = check_radar_band(radar_band)
+
+ # Retrieve wavelengths in mm
+ wavelength = get_radar_wavelength(radar_band)
+
+ # Initialize scattering table
+ scatterer = initialize_scatterer(
+ wavelength=wavelength,
+ canting_angle_std=canting_angle_std,
+ D_max=diameter_max,
+ axis_ratio=axis_ratio,
+ )
+
+ # Define kwargs
+ kwargs = {
+ "output_dictionary": False,
+ "bin_edges": bin_edges,
+ "scatterer": scatterer,
+ }
+
+ # Loop over each PSD (not in parallel --> dask="forbidden")
+ # - Initializing the scatterer costs much more than looping over the timesteps!
+ da_radar = xr.apply_ufunc(
+ _estimate_empirical_radar_parameters,
+ da_drop_number_concentration,
+ kwargs=kwargs,
+ input_core_dims=[["diameter_bin_center"]],
+ output_core_dims=[["radar_variables"]],
+ vectorize=True,
+ dask="forbidden",
+ dask_gufunc_kwargs={"output_sizes": {"radar_variables": 6}}, # lengths of the new output_core_dims dimensions.
+ output_dtypes=["float64"],
+ )
+
+ # Add parameters coordinates
+ da_radar = da_radar.assign_coords({"radar_variables": ["Zh", "Zdr", "rho_hv", "ldr", "Kdp", "Ai"]})
+
+ # Create parameters dataset
+ ds_radar = da_radar.to_dataset(dim="radar_variables")
+
+ # Expand dimensions for later merging
+ dims_dict = {
+ "radar_band": [radar_band],
+ "axis_ratio": [axis_ratio],
+ "canting_angle_std": [canting_angle_std],
+ "diameter_max": [diameter_max],
+ }
+ ds_radar = ds_radar.expand_dims(dim=dims_dict)
+ return ds_radar
+
+
+def get_radar_parameters(
+ ds,
+ radar_band=None,
+ canting_angle_std=7,
+ diameter_max=8,
+ axis_ratio="Thurai2007",
+ parallel=True,
+):
+ """Compute radar parameters from empirical drop number concentration or PSD model.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Dataset containing the drop number concentration variable.
+ radar_band : str or list of str, optional
+ Radar band(s) to be used.
+ If ``None`` (the default), all available radar bands are used.
+ canting_angle_std : float or list of float, optional
+ Standard deviation of the canting angle. The default value is 7.
+ diameter_max : float or list of float, optional
+ Maximum diameter. The default value is 8 mm.
+ axis_ratio : str or list of str, optional
+ Method to compute the axis ratio. The default method is ``Thurai2007``.
+ parallel : bool, optional
+ Whether to compute radar variables in parallel.
+ The default value is ``True``.
+
+ Returns
+ -------
+ xarray.Dataset
+ Dataset containing the computed radar parameters.
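+
+ Examples
+ --------
+ A minimal sketch, assuming ``ds`` is a DISDRODB L2E or L2M dataset loaded in memory:
+
+ >>> ds_radar = get_radar_parameters(ds, radar_band=["C", "X"], canting_angle_std=7)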
+ """
+ # Decide whether to simulate radar parameters based on empirical PSD or model PSD
+ if "disdrodb_psd_model" not in ds.attrs and "drop_number_concentration" not in ds:
+ raise ValueError("The input dataset is not a DISDRODB L2E or L2M product.")
+ # Model-based simulation
+ if "disdrodb_psd_model" in ds.attrs:
+ func = get_model_radar_parameters
+ ds_subset = get_psd_parameters(ds).compute()
+ # Empirical PSD simulation
+ else:
+ func = get_empirical_radar_parameters
+ ds_subset = ds[["drop_number_concentration"]].compute()
+
+ # Initialize radar band if not provided
+ if radar_band is None:
+ radar_band = available_radar_bands()
+
+ # Ensure parameters are list
+ diameter_max = np.atleast_1d(diameter_max)
+ canting_angle_std = np.atleast_1d(canting_angle_std)
+ axis_ratio = np.atleast_1d(axis_ratio)
+ radar_band = np.atleast_1d(radar_band)
+
+ # Check parameters validity
+ axis_ratio = [check_axis_ratio(method) for method in axis_ratio]
+ radar_band = [check_radar_band(band) for band in radar_band]
+
+ # Retrieve combination of parameters
+ list_params = [
+ {
+ "radar_band": rb.item(),
+ "canting_angle_std": cas.item(),
+ "axis_ratio": ar.item(),
+ "diameter_max": d_max.item(),
+ }
+ for rb, cas, ar, d_max in itertools.product(radar_band, canting_angle_std, axis_ratio, diameter_max)
+ ]
+
+ # Compute radar variables for each configuration in parallel
+ # - The function expects the data into memory (no dask arrays !)
+ if parallel:
+ list_ds = [dask.delayed(func)(ds_subset, **params) for params in list_params]
+ list_ds = dask.compute(*list_ds)
+ else:
+ list_ds = [func(ds_subset, **params) for params in list_params]
+
+ # Merge into a single dataset
+ ds_radar = xr.merge(list_ds)
+
+ # Copy global attributes from input dataset
+ ds_radar.attrs = ds.attrs.copy()
+
+ # Remove single dimensions (add info to attributes)
+ parameters = ["radar_band", "canting_angle_std", "axis_ratio", "diameter_max"]
+ for param in parameters:
+ if ds_radar.sizes[param] == 1:
+ ds_radar.attrs[f"disdrodb_scattering_{param}"] = ds_radar[param].item()
+ ds_radar = ds_radar.squeeze()
+ return ds_radar
diff --git a/disdrodb/tests/conftest.py b/disdrodb/tests/conftest.py
index 97e28bed..0263538c 100644
--- a/disdrodb/tests/conftest.py
+++ b/disdrodb/tests/conftest.py
@@ -151,7 +151,7 @@ def create_fake_raw_data_file(
return str(filepath)
-@pytest.fixture()
+@pytest.fixture
def create_test_config_files(request): # noqa PT004
"""Create the specified config files into a temporary "test" directory.
diff --git a/disdrodb/tests/data/check_readers/DISDRODB/Raw/EPFL/PARSIVEL_2007/metadata/10.yml b/disdrodb/tests/data/check_readers/DISDRODB/Raw/EPFL/PARSIVEL_2007/metadata/10.yml
index 8e63baff..3e334611 100644
--- a/disdrodb/tests/data/check_readers/DISDRODB/Raw/EPFL/PARSIVEL_2007/metadata/10.yml
+++ b/disdrodb/tests/data/check_readers/DISDRODB/Raw/EPFL/PARSIVEL_2007/metadata/10.yml
@@ -37,7 +37,7 @@ firmware_version: ""
sensor_beam_length: ""
sensor_beam_width: ""
sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 10
calibration_sensitivity: ""
calibration_certification_date: ""
calibration_certification_url: ""
diff --git a/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/ground_truth/CAIRNGORM/L0B.DIVEN.CAIRNGORM.s20170210000000.e20170210000400.V0.nc b/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/ground_truth/CAIRNGORM/L0B.DIVEN.CAIRNGORM.s20170210000000.e20170210000400.V0.nc
index fe41f9b0..1d407e18 100644
Binary files a/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/ground_truth/CAIRNGORM/L0B.DIVEN.CAIRNGORM.s20170210000000.e20170210000400.V0.nc and b/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/ground_truth/CAIRNGORM/L0B.DIVEN.CAIRNGORM.s20170210000000.e20170210000400.V0.nc differ
diff --git a/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/metadata/CAIRNGORM.yml b/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/metadata/CAIRNGORM.yml
index 54be96d8..492c0c4a 100755
--- a/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/metadata/CAIRNGORM.yml
+++ b/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/metadata/CAIRNGORM.yml
@@ -38,7 +38,7 @@ firmware_version: ""
sensor_beam_length: ""
sensor_beam_width: ""
sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 60
calibration_sensitivity: ""
calibration_certification_date: ""
calibration_certification_url: ""
diff --git a/disdrodb/tests/data/test_dir_structure/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml b/disdrodb/tests/data/test_dir_structure/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml
index 4d04c7be..0d55359d 100644
--- a/disdrodb/tests/data/test_dir_structure/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml
+++ b/disdrodb/tests/data/test_dir_structure/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml
@@ -28,7 +28,7 @@ firmware_dsp: ""
firmware_version: ""
sensor_beam_width: ""
sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 30
contributors: ""
authors: ""
institution: ""
diff --git a/disdrodb/tests/data/test_dir_structure/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml b/disdrodb/tests/data/test_dir_structure/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml
index 838e17f9..f9741785 100644
--- a/disdrodb/tests/data/test_dir_structure/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml
+++ b/disdrodb/tests/data/test_dir_structure/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml
@@ -37,7 +37,7 @@ firmware_version: ""
sensor_beam_length: ""
sensor_beam_width: ""
sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 30
calibration_sensitivity: ""
calibration_certification_date: ""
calibration_certification_url: ""
diff --git a/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATIONID.yml b/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATIONID.yml
index 19bd7cf0..5ce6e661 100644
--- a/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATIONID.yml
+++ b/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATIONID.yml
@@ -28,7 +28,7 @@ firmware_dsp: ""
firmware_version: ""
sensor_beam_width: ""
sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 30
contributors: ""
authors: ""
institution: ""
diff --git a/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml b/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml
index 5dfd8800..953b97c7 100644
--- a/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml
+++ b/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml
@@ -36,7 +36,7 @@ firmware_version: ""
sensor_beam_length: ""
sensor_beam_width: ""
sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 30
calibration_sensitivity: ""
calibration_certification_date: ""
calibration_certification_url: ""
diff --git a/disdrodb/tests/pytest_files/test_folders_files_creation/metadata/123.yml b/disdrodb/tests/pytest_files/test_folders_files_creation/metadata/123.yml
index a5e873db..cfd7a2cf 100644
--- a/disdrodb/tests/pytest_files/test_folders_files_creation/metadata/123.yml
+++ b/disdrodb/tests/pytest_files/test_folders_files_creation/metadata/123.yml
@@ -36,7 +36,7 @@ firmware_version: ""
sensor_beam_length: ""
sensor_beam_width: ""
sensor_nominal_width: ""
-measurement_interval: ""
+measurement_interval: 30
calibration_sensitivity: ""
calibration_certification_date: ""
calibration_certification_url: ""
diff --git a/disdrodb/tests/test_api/test_api_create_directories.py b/disdrodb/tests/test_api/test_api_create_directories.py
index b7dd8544..0a747433 100644
--- a/disdrodb/tests/test_api/test_api_create_directories.py
+++ b/disdrodb/tests/test_api/test_api_create_directories.py
@@ -26,21 +26,22 @@
_check_campaign_name_consistency,
_check_data_source_consistency,
_copy_station_metadata,
- create_directory_structure,
create_initial_station_structure,
create_issue_directory,
create_l0_directory_structure,
create_metadata_directory,
+ create_product_directory,
create_test_archive,
)
from disdrodb.api.path import (
define_campaign_dir,
+ define_data_dir,
define_issue_filepath,
define_metadata_dir,
define_metadata_filepath,
define_station_dir,
)
-from disdrodb.api.scripts.disdrodb_initialize_station import disdrodb_initialize_station
+from disdrodb.cli.disdrodb_initialize_station import disdrodb_initialize_station
from disdrodb.tests.conftest import (
create_fake_issue_file,
create_fake_metadata_directory,
@@ -139,7 +140,7 @@ def test_create_l0_directory_structure(tmp_path, mocker, product):
)
# Execute create_l0_directory_structure
- create_l0_directory_structure(
+ data_dir = create_l0_directory_structure(
product=product,
force=False,
raw_dir=raw_dir,
@@ -148,6 +149,8 @@ def test_create_l0_directory_structure(tmp_path, mocker, product):
)
# Test product, metadata and station directories have been created
+ assert os.path.exists(data_dir)
+ assert os.path.isdir(data_dir)
assert os.path.exists(dst_station_dir)
assert os.path.isdir(dst_station_dir)
assert os.path.exists(dst_metadata_dir)
@@ -177,7 +180,7 @@ def test_create_l0_directory_structure(tmp_path, mocker, product):
assert os.path.exists(product_filepath)
# Test delete file if already data in L0A (if force=True)
- create_l0_directory_structure(
+ data_dir = create_l0_directory_structure(
product=product,
force=True,
raw_dir=raw_dir,
@@ -185,6 +188,8 @@ def test_create_l0_directory_structure(tmp_path, mocker, product):
station_name=station_name,
)
assert not os.path.exists(product_filepath)
+ assert os.path.exists(data_dir)
+ assert os.path.isdir(data_dir)
assert os.path.exists(dst_station_dir)
assert os.path.isdir(dst_station_dir)
assert os.path.exists(dst_metadata_dir)
@@ -193,7 +198,7 @@ def test_create_l0_directory_structure(tmp_path, mocker, product):
assert os.path.isfile(dst_metadata_filepath)
-def test_create_directory_structure(tmp_path, mocker):
+def test_create_product_directory(tmp_path):
start_product = "L0A"
dst_product = "L0B"
# Define station info
@@ -205,20 +210,15 @@ def test_create_directory_structure(tmp_path, mocker):
metadata_dict["sensor_name"] = "OTT_Parsivel"
metadata_dict["reader"] = "GPM/IFLOODS"
- processed_dir = define_campaign_dir(
- base_dir=base_dir,
- product=start_product,
- data_source=data_source,
- campaign_name=campaign_name,
- )
-
# Test raise error without data
with pytest.raises(ValueError):
- create_directory_structure(
+ _ = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
product=dst_product,
force=False,
- processed_dir=processed_dir,
- station_name=station_name,
)
# Add fake file
@@ -232,11 +232,13 @@ def test_create_directory_structure(tmp_path, mocker):
# Test raise error without metadata file
with pytest.raises(ValueError):
- create_directory_structure(
+ _ = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
product=dst_product,
force=False,
- processed_dir=processed_dir,
- station_name=station_name,
)
# Add metadata
@@ -249,18 +251,27 @@ def test_create_directory_structure(tmp_path, mocker):
metadata_dict=metadata_dict,
)
- # Execute create_directory_structure
- create_directory_structure(
- processed_dir=processed_dir,
- product=dst_product,
+ # Execute create_product_directory
+ data_dir = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
station_name=station_name,
+ product=dst_product,
force=False,
)
- # Test product directory has been created
- dst_station_dir = os.path.join(processed_dir, dst_product)
- assert os.path.exists(dst_station_dir)
- assert os.path.isdir(dst_station_dir)
+ # Test product data directory has been created
+ expected_data_dir = define_data_dir(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product=dst_product,
+ )
+ assert expected_data_dir == data_dir
+ assert os.path.exists(data_dir)
+ assert os.path.isdir(data_dir)
# Test raise error if already data in dst_product (if force=False)
dst_product_file_filepath = create_fake_raw_data_file(
@@ -272,32 +283,39 @@ def test_create_directory_structure(tmp_path, mocker):
)
with pytest.raises(ValueError):
- create_directory_structure(
+ _ = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
product=dst_product,
force=False,
- processed_dir=processed_dir,
- station_name=station_name,
)
assert os.path.exists(dst_product_file_filepath)
# Test delete file if already data in L0A (if force=True)
- create_directory_structure(
+ data_dir = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
product=dst_product,
force=True,
- processed_dir=processed_dir,
- station_name=station_name,
)
+ assert expected_data_dir == data_dir
assert not os.path.exists(dst_product_file_filepath)
- assert os.path.exists(dst_station_dir)
- assert os.path.isdir(dst_station_dir)
+ assert os.path.exists(data_dir)
+ assert os.path.isdir(data_dir)
# Test raise error if bad station_name
with pytest.raises(ValueError):
- create_directory_structure(
+ _ = create_product_directory(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name="INEXISTENT_STATION",
product=dst_product,
force=False,
- processed_dir=processed_dir,
- station_name="INEXISTENT_STATION",
)
diff --git a/disdrodb/tests/test_api/test_api_info.py b/disdrodb/tests/test_api/test_api_info.py
index a9284e1b..bd031350 100644
--- a/disdrodb/tests/test_api/test_api_info.py
+++ b/disdrodb/tests/test_api/test_api_info.py
@@ -68,7 +68,7 @@
# valid_filepath = VALID_FNAME
-@pytest.fixture()
+@pytest.fixture
def valid_filepath(tmp_path):
# Create a valid filepath for testing
filepath = tmp_path / VALID_FNAME
@@ -76,7 +76,7 @@ def valid_filepath(tmp_path):
return str(filepath)
-@pytest.fixture()
+@pytest.fixture
def invalid_filepath(tmp_path):
# Create an invalid filepath for testing
filepath = tmp_path / INVALID_FNAME
@@ -275,5 +275,5 @@ def test_get_end_time_from_filepaths(valid_filepath):
def test_get_start_end_time_from_filepaths(valid_filepath):
start_time, end_time = get_start_end_time_from_filepaths(valid_filepath)
- assert np.array_equal(start_time, np.array([START_TIME]))
- assert np.array_equal(end_time, np.array([END_TIME]))
+ assert np.array_equal(start_time, np.array([START_TIME]).astype("M8[s]"))
+ assert np.array_equal(end_time, np.array([END_TIME]).astype("M8[s]"))
diff --git a/disdrodb/tests/test_api/test_api_path.py b/disdrodb/tests/test_api/test_api_path.py
index b2146451..dd08141a 100644
--- a/disdrodb/tests/test_api/test_api_path.py
+++ b/disdrodb/tests/test_api/test_api_path.py
@@ -18,7 +18,6 @@
# -----------------------------------------------------------------------------.
"""Test DISDRODB path."""
import datetime
-import os
import numpy as np
import pandas as pd
@@ -26,97 +25,59 @@
import xarray as xr
from disdrodb.api.path import (
- define_campaign_dir,
- define_l0a_filepath,
- define_l0a_station_dir,
- define_l0b_filepath,
- define_l0b_station_dir,
+ # define_campaign_dir,
+ define_l0a_filename,
+ define_l0b_filename,
+ define_l0c_filename,
)
-PROCESSED_FOLDER_WINDOWS = "\\DISDRODB\\Processed"
-PROCESSED_FOLDER_LINUX = "/DISDRODB/Processed"
+# PROCESSED_FOLDER_WINDOWS = "\\DISDRODB\\Processed"
+# PROCESSED_FOLDER_LINUX = "/DISDRODB/Processed"
-@pytest.mark.parametrize("processed_folder", [PROCESSED_FOLDER_WINDOWS, PROCESSED_FOLDER_LINUX])
-def test_define_l0a_station_dir(processed_folder):
- res = (
- define_l0a_station_dir(processed_folder, "STATION_NAME")
- .replace(processed_folder, "")
- .replace("\\", "")
- .replace("/", "")
- )
- assert res == "L0ASTATION_NAME"
-
+# @pytest.mark.parametrize("processed_folder", [PROCESSED_FOLDER_WINDOWS, PROCESSED_FOLDER_LINUX])
+# def test_define_l0a_station_dir(processed_folder):
+# res = (
+# define_l0a_station_dir(processed_folder, "STATION_NAME")
+# .replace(processed_folder, "")
+# .replace("\\", "")
+# .replace("/", "")
+# )
+# assert res == "L0ASTATION_NAME"
-@pytest.mark.parametrize("processed_folder", [PROCESSED_FOLDER_WINDOWS, PROCESSED_FOLDER_LINUX])
-def test_define_l0b_station_dir(processed_folder):
- res = (
- define_l0b_station_dir(processed_folder, "STATION_NAME")
- .replace(processed_folder, "")
- .replace("\\", "")
- .replace("/", "")
- )
- assert res == "L0BSTATION_NAME"
-
-def test_define_l0a_filepath(tmp_path):
- from disdrodb.l0.standards import PRODUCT_VERSION
+def test_define_l0a_filename():
+ from disdrodb import PRODUCT_VERSION
# Set variables
product = "L0A"
- base_dir = tmp_path / "DISDRODB"
- data_source = "DATA_SOURCE"
campaign_name = "CAMPAIGN_NAME"
station_name = "STATION_NAME"
start_date = datetime.datetime(2019, 3, 26, 0, 0, 0)
end_date = datetime.datetime(2021, 2, 8, 0, 0, 0)
- start_date_str = start_date.strftime("%Y%m%d%H%M%S")
- end_date_str = end_date.strftime("%Y%m%d%H%M%S")
-
- # Set paths
- processed_dir = define_campaign_dir(
- base_dir=base_dir,
- product=product,
- data_source=data_source,
- campaign_name=campaign_name,
- )
# Create dataframe
df = pd.DataFrame({"time": pd.date_range(start=start_date, end=end_date)})
- # Test the function
- res = define_l0a_filepath(df, processed_dir, station_name)
-
# Define expected results
- expected_name = (
- f"{product}.{campaign_name.upper()}.{station_name}.s{start_date_str}.e{end_date_str}.{PRODUCT_VERSION}.parquet"
- )
- expected_path = os.path.join(processed_dir, product, station_name, expected_name)
- assert res == expected_path
+ expected_name = f"{product}.CAMPAIGN_NAME.STATION_NAME.s20190326000000.e20210208000000.{PRODUCT_VERSION}.parquet"
+ # Test the function
+ res = define_l0a_filename(df, campaign_name, station_name)
+ assert res == expected_name
-def test_define_l0b_filepath(tmp_path):
- from disdrodb.l0.standards import PRODUCT_VERSION
- # Set variables
+@pytest.mark.parametrize("product", ["L0B", "L0C"])
+def test_define_l0b_filename(product):
+ from disdrodb import PRODUCT_VERSION
- product = "L0B"
- base_dir = tmp_path / "DISDRODB"
- data_source = "DATA_SOURCE"
+ # Set variables
campaign_name = "CAMPAIGN_NAME"
station_name = "STATION_NAME"
+ sample_interval = 10
+ sample_interval_str = "10S"
start_date = datetime.datetime(2019, 3, 26, 0, 0, 0)
end_date = datetime.datetime(2021, 2, 8, 0, 0, 0)
- start_date_str = start_date.strftime("%Y%m%d%H%M%S")
- end_date_str = end_date.strftime("%Y%m%d%H%M%S")
-
- # Set paths
- processed_dir = define_campaign_dir(
- base_dir=base_dir,
- product=product,
- data_source=data_source,
- campaign_name=campaign_name,
- )
# Create xarray object
timesteps = pd.date_range(start=start_date, end=end_date)
@@ -124,15 +85,17 @@ def test_define_l0b_filepath(tmp_path):
ds = xr.DataArray(
data=data,
dims=["time"],
- coords={"time": pd.date_range(start=start_date, end=end_date)},
+ coords={"time": pd.date_range(start=start_date, end=end_date), "sample_interval": sample_interval},
)
- # Test the function
- res = define_l0b_filepath(ds, processed_dir, station_name)
-
# Define expected results
- expected_name = (
- f"{product}.{campaign_name.upper()}.{station_name}.s{start_date_str}.e{end_date_str}.{PRODUCT_VERSION}.nc"
- )
- expected_path = os.path.join(processed_dir, product, station_name, expected_name)
- assert res == expected_path
+ # TODO: MODIFY !
+ if product == "L0B":
+ expected_name = f"{product}.CAMPAIGN_NAME.STATION_NAME.s20190326000000.e20210208000000.{PRODUCT_VERSION}.nc"
+ else:
+ expected_name = f"{product}.{sample_interval_str}.CAMPAIGN_NAME.STATION_NAME.s20190326000000.e20210208000000.{PRODUCT_VERSION}.nc"
+
+ # Test the function
+ define_filename_func = define_l0b_filename if product == "L0B" else define_l0c_filename
+ res = define_filename_func(ds, campaign_name, station_name)
+ assert res == expected_name
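For context, the filename helpers now return bare filenames; the destination directory is resolved separately (e.g. via define_data_dir). A minimal sketch of the L0A case, not part of this patch, using the positional call shown in the test; campaign and station values are placeholders:

    import pandas as pd
    from disdrodb.api.path import define_l0a_filename

    df = pd.DataFrame({"time": pd.date_range("2019-03-26", "2021-02-08")})
    filename = define_l0a_filename(df, "CAMPAIGN_NAME", "STATION_NAME")
    # e.g. "L0A.CAMPAIGN_NAME.STATION_NAME.s20190326000000.e20210208000000.<version>.parquet"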
diff --git a/disdrodb/tests/test_data_transfer/test_data_transfer_scripts.py b/disdrodb/tests/test_data_transfer/test_data_transfer_scripts.py
index 5b74d534..0526dcb9 100644
--- a/disdrodb/tests/test_data_transfer/test_data_transfer_scripts.py
+++ b/disdrodb/tests/test_data_transfer/test_data_transfer_scripts.py
@@ -20,10 +20,10 @@
from click.testing import CliRunner
-from disdrodb.data_transfer.scripts.disdrodb_download_archive import disdrodb_download_archive
-from disdrodb.data_transfer.scripts.disdrodb_download_station import disdrodb_download_station
-from disdrodb.data_transfer.scripts.disdrodb_upload_archive import disdrodb_upload_archive
-from disdrodb.data_transfer.scripts.disdrodb_upload_station import disdrodb_upload_station
+from disdrodb.cli.disdrodb_download_archive import disdrodb_download_archive
+from disdrodb.cli.disdrodb_download_station import disdrodb_download_station
+from disdrodb.cli.disdrodb_upload_archive import disdrodb_upload_archive
+from disdrodb.cli.disdrodb_upload_station import disdrodb_upload_station
from disdrodb.tests.conftest import create_fake_metadata_file
TEST_ZIP_FPATH = (
diff --git a/disdrodb/tests/test_issue/test_issue_checks.py b/disdrodb/tests/test_issue/test_issue_checks.py
index c9beb94f..2bc3496d 100644
--- a/disdrodb/tests/test_issue/test_issue_checks.py
+++ b/disdrodb/tests/test_issue/test_issue_checks.py
@@ -40,10 +40,6 @@ def test__is_numpy_array_string():
arr = np.array(["foo", "bar"], dtype=np.str_)
assert _is_numpy_array_string(arr)
- # Test unicode array
- arr = np.array(["foo", "bar"], dtype=np.unicode_)
- assert _is_numpy_array_string(arr)
-
# Test nonstring array
arr = np.array([1, 2, 3])
assert not _is_numpy_array_string(arr)
diff --git a/disdrodb/tests/test_l0/test_check_readers.py b/disdrodb/tests/test_l0/test_check_readers.py
index 5472e45d..2d79cb77 100644
--- a/disdrodb/tests/test_l0/test_check_readers.py
+++ b/disdrodb/tests/test_l0/test_check_readers.py
@@ -92,7 +92,7 @@ def _check_station_reader_results(
campaign_name=campaign_name,
station_name=station_name,
force=True,
- verbose=False,
+ verbose=True,
debugging_mode=False,
parallel=False,
)
@@ -164,6 +164,8 @@ def test_check_all_readers(tmp_path) -> None:
base_dir=test_base_dir,
)
+ # data_source, campaign_name, station_name = list_stations_info[0]
+ # data_source, campaign_name, station_name = list_stations_info[1]
for data_source, campaign_name, station_name in list_stations_info:
_check_station_reader_results(
base_dir=test_base_dir,
diff --git a/disdrodb/tests/test_l0/test_cmd_processing.py b/disdrodb/tests/test_l0/test_cmd_processing.py
index 759edd27..7fd220cf 100644
--- a/disdrodb/tests/test_l0/test_cmd_processing.py
+++ b/disdrodb/tests/test_l0/test_cmd_processing.py
@@ -25,137 +25,430 @@
from click.testing import CliRunner
from disdrodb import __root_path__
-from disdrodb.api.path import define_station_dir
-from disdrodb.l0.scripts.disdrodb_run_l0 import disdrodb_run_l0
-from disdrodb.l0.scripts.disdrodb_run_l0_station import disdrodb_run_l0_station
-from disdrodb.l0.scripts.disdrodb_run_l0a import disdrodb_run_l0a
-from disdrodb.l0.scripts.disdrodb_run_l0a_station import disdrodb_run_l0a_station
-from disdrodb.l0.scripts.disdrodb_run_l0b import disdrodb_run_l0b
-from disdrodb.l0.scripts.disdrodb_run_l0b_station import disdrodb_run_l0b_station
+from disdrodb.api.path import define_data_dir
+from disdrodb.cli.disdrodb_run_l0 import disdrodb_run_l0
+from disdrodb.cli.disdrodb_run_l0_station import disdrodb_run_l0_station
+from disdrodb.cli.disdrodb_run_l0a import disdrodb_run_l0a
+from disdrodb.cli.disdrodb_run_l0a_station import disdrodb_run_l0a_station
+from disdrodb.cli.disdrodb_run_l0b import disdrodb_run_l0b
+from disdrodb.cli.disdrodb_run_l0b_station import disdrodb_run_l0b_station
+from disdrodb.routines import (
+ run_disdrodb_l0_station,
+ run_disdrodb_l0a,
+ run_disdrodb_l0a_station,
+ run_disdrodb_l0b,
+ run_disdrodb_l0b_station,
+)
from disdrodb.utils.directories import count_files
BASE_DIR = os.path.join(__root_path__, "disdrodb", "tests", "data", "check_readers", "DISDRODB")
DATA_SOURCE = "EPFL"
CAMPAIGN_NAME = "PARSIVEL_2007"
STATION_NAME = "10"
+DEBUGGING_MODE = True
+VERBOSE = False
+FORCE = False
+# test_base_dir = "/tmp/new/DISDRODB"
+# shutil.copytree(BASE_DIR, test_base_dir)
+# parallel = False
+
+@pytest.mark.parametrize("cli", [True, False])
@pytest.mark.parametrize("parallel", [True, False])
-def test_disdrodb_run_l0a_station(tmp_path, parallel):
+def test_disdrodb_run_l0a_station(tmp_path, parallel, cli):
"""Test the disdrodb_run_l0a_station command."""
test_base_dir = tmp_path / "DISDRODB"
shutil.copytree(BASE_DIR, test_base_dir)
- runner = CliRunner()
- runner.invoke(
- disdrodb_run_l0a_station,
- [DATA_SOURCE, CAMPAIGN_NAME, STATION_NAME, "--base_dir", str(test_base_dir), "--parallel", parallel],
- )
-
- station_dir = define_station_dir(
+ # Produce data
+ if cli:
+ runner = CliRunner()
+ runner.invoke(
+ disdrodb_run_l0a_station,
+ [
+ DATA_SOURCE,
+ CAMPAIGN_NAME,
+ STATION_NAME,
+ "--base_dir",
+ test_base_dir,
+ "--parallel",
+ parallel,
+ "--debugging_mode",
+ DEBUGGING_MODE,
+ "--verbose",
+ VERBOSE,
+ "--force",
+ FORCE,
+ ],
+ )
+ else:
+ run_disdrodb_l0a_station(
+ # Station arguments
+ data_source=DATA_SOURCE,
+ campaign_name=CAMPAIGN_NAME,
+ station_name=STATION_NAME,
+ # Processing options
+ parallel=parallel,
+ force=FORCE,
+ verbose=VERBOSE,
+ debugging_mode=DEBUGGING_MODE,
+ base_dir=test_base_dir,
+ )
+
+ # Check files are produced
+ data_dir = define_data_dir(
base_dir=test_base_dir,
product="L0A",
data_source=DATA_SOURCE,
campaign_name=CAMPAIGN_NAME,
station_name=STATION_NAME,
)
- assert count_files(station_dir, glob_pattern="*.parquet", recursive=True) > 0
+ assert count_files(data_dir, glob_pattern="*.parquet", recursive=True) > 0
+@pytest.mark.parametrize("cli", [True, False])
@pytest.mark.parametrize("parallel", [True, False])
-def test_disdrodb_run_l0b_station(tmp_path, parallel):
+def test_disdrodb_run_l0b_station(tmp_path, parallel, cli):
"""Test the disdrodb_run_l0b_station command."""
test_base_dir = tmp_path / "DISDRODB"
shutil.copytree(BASE_DIR, test_base_dir)
- runner = CliRunner()
- runner.invoke(
- disdrodb_run_l0a_station,
- [DATA_SOURCE, CAMPAIGN_NAME, STATION_NAME, "--base_dir", test_base_dir, "--parallel", parallel],
+ # Produce data
+ if cli:
+ runner = CliRunner()
+ runner.invoke(
+ disdrodb_run_l0a_station,
+ [
+ DATA_SOURCE,
+ CAMPAIGN_NAME,
+ STATION_NAME,
+ "--base_dir",
+ test_base_dir,
+ "--parallel",
+ parallel,
+ "--debugging_mode",
+ DEBUGGING_MODE,
+ "--verbose",
+ VERBOSE,
+ "--force",
+ FORCE,
+ ],
+ )
+ runner.invoke(
+ disdrodb_run_l0b_station,
+ [
+ DATA_SOURCE,
+ CAMPAIGN_NAME,
+ STATION_NAME,
+ "--base_dir",
+ test_base_dir,
+ "--parallel",
+ parallel,
+ "--debugging_mode",
+ DEBUGGING_MODE,
+ "--force",
+ FORCE,
+ ],
+ )
+ else:
+ run_disdrodb_l0a_station(
+ # Station arguments
+ data_source=DATA_SOURCE,
+ campaign_name=CAMPAIGN_NAME,
+ station_name=STATION_NAME,
+ # Processing options
+ parallel=parallel,
+ force=FORCE,
+ verbose=VERBOSE,
+ debugging_mode=DEBUGGING_MODE,
+ base_dir=test_base_dir,
+ )
+
+ run_disdrodb_l0b_station(
+ # Station arguments
+ data_source=DATA_SOURCE,
+ campaign_name=CAMPAIGN_NAME,
+ station_name=STATION_NAME,
+ # Processing options
+ parallel=parallel,
+ force=FORCE,
+ verbose=VERBOSE,
+ debugging_mode=DEBUGGING_MODE,
+ base_dir=test_base_dir,
+ )
+
+ # Check files are produced
+ data_dir = define_data_dir(
+ base_dir=test_base_dir,
+ product="L0B",
+ data_source=DATA_SOURCE,
+ campaign_name=CAMPAIGN_NAME,
+ station_name=STATION_NAME,
)
+ assert count_files(data_dir, glob_pattern="*.nc", recursive=True) > 0
- runner.invoke(
- disdrodb_run_l0b_station,
- [DATA_SOURCE, CAMPAIGN_NAME, STATION_NAME, "--base_dir", test_base_dir, "--parallel", parallel],
- )
- station_dir = define_station_dir(
+@pytest.mark.parametrize("cli", [True, False])
+@pytest.mark.parametrize("parallel", [True, False])
+@pytest.mark.parametrize("verbose", [True, False])
+def test_disdrodb_run_l0_nc_station(tmp_path, verbose, parallel, cli):
+ """Test the disdrodb_run_l0_station process correctly raw netCDF files."""
+ BASE_DIR = os.path.join(__root_path__, "disdrodb", "tests", "data", "check_readers", "DISDRODB")
+ DATA_SOURCE = "UK"
+ CAMPAIGN_NAME = "DIVEN"
+ STATION_NAME = "CAIRNGORM"
+
+ test_base_dir = tmp_path / "DISDRODB"
+ shutil.copytree(BASE_DIR, test_base_dir)
+
+ # Produce data
+ if cli:
+ runner = CliRunner()
+ runner.invoke(
+ disdrodb_run_l0_station,
+ [
+ DATA_SOURCE,
+ CAMPAIGN_NAME,
+ STATION_NAME,
+ "--base_dir",
+ test_base_dir,
+ "--verbose",
+ verbose,
+ "--parallel",
+ parallel,
+ ],
+ )
+ else:
+ run_disdrodb_l0_station(
+ # Station arguments
+ data_source=DATA_SOURCE,
+ campaign_name=CAMPAIGN_NAME,
+ station_name=STATION_NAME,
+ # Processing options
+ parallel=parallel,
+ force=FORCE,
+ verbose=VERBOSE,
+ debugging_mode=DEBUGGING_MODE,
+ base_dir=test_base_dir,
+ )
+
+ # Check files are produced
+ data_dir = define_data_dir(
base_dir=test_base_dir,
product="L0B",
data_source=DATA_SOURCE,
campaign_name=CAMPAIGN_NAME,
station_name=STATION_NAME,
)
- assert count_files(station_dir, glob_pattern="*.nc", recursive=True) > 0
+ assert count_files(data_dir, glob_pattern="*.nc", recursive=True) > 0
+@pytest.mark.parametrize("cli", [True, False])
@pytest.mark.parametrize("verbose", [True, False])
-def test_disdrodb_run_l0_station(tmp_path, verbose):
+def test_disdrodb_run_l0_station(tmp_path, verbose, cli):
"""Test the disdrodb_run_l0_station command."""
test_base_dir = tmp_path / "DISDRODB"
shutil.copytree(BASE_DIR, test_base_dir)
- runner = CliRunner()
- runner.invoke(
- disdrodb_run_l0_station,
- [DATA_SOURCE, CAMPAIGN_NAME, STATION_NAME, "--base_dir", test_base_dir, "--verbose", verbose],
- )
-
- station_dir = define_station_dir(
+ # Produce data
+ if cli:
+ runner = CliRunner()
+ runner.invoke(
+ disdrodb_run_l0_station,
+ [
+ DATA_SOURCE,
+ CAMPAIGN_NAME,
+ STATION_NAME,
+ "--base_dir",
+ test_base_dir,
+ "--verbose",
+ verbose,
+ "--parallel",
+ False,
+ "--debugging_mode",
+ DEBUGGING_MODE,
+ "--force",
+ FORCE,
+ ],
+ )
+ else:
+ run_disdrodb_l0_station(
+ # Station arguments
+ data_source=DATA_SOURCE,
+ campaign_name=CAMPAIGN_NAME,
+ station_name=STATION_NAME,
+ # Processing options
+ parallel=False,
+ force=FORCE,
+ verbose=verbose,
+ debugging_mode=DEBUGGING_MODE,
+ base_dir=test_base_dir,
+ )
+
+ # Check files are produced
+ data_dir = define_data_dir(
base_dir=test_base_dir,
product="L0B",
data_source=DATA_SOURCE,
campaign_name=CAMPAIGN_NAME,
station_name=STATION_NAME,
)
- assert count_files(station_dir, glob_pattern="*.nc", recursive=True) > 0
+ assert count_files(data_dir, glob_pattern="*.nc", recursive=True) > 0
-def test_disdrodb_run_l0a(tmp_path):
+@pytest.mark.parametrize("cli", [True, False])
+def test_disdrodb_run_l0a(tmp_path, cli):
"""Test the disdrodb_run_l0a command."""
test_base_dir = tmp_path / "DISDRODB"
shutil.copytree(BASE_DIR, test_base_dir)
- runner = CliRunner()
- runner.invoke(disdrodb_run_l0a, ["--base_dir", test_base_dir])
- station_dir = define_station_dir(
+ # Produce data
+ if cli:
+ runner = CliRunner()
+ runner.invoke(
+ disdrodb_run_l0a,
+ [
+ "--base_dir",
+ test_base_dir,
+ "--data_sources",
+ DATA_SOURCE,
+ "--campaign_names",
+ CAMPAIGN_NAME,
+ "--station_names",
+ STATION_NAME,
+ "--verbose",
+ VERBOSE,
+ "--parallel",
+ False,
+ "--debugging_mode",
+ DEBUGGING_MODE,
+ "--force",
+ FORCE,
+ ],
+ )
+ else:
+ run_disdrodb_l0a(
+ # Station arguments
+ data_sources=DATA_SOURCE,
+ campaign_names=CAMPAIGN_NAME,
+ station_names=STATION_NAME,
+ # Processing options
+ parallel=False,
+ force=FORCE,
+ verbose=VERBOSE,
+ debugging_mode=DEBUGGING_MODE,
+ base_dir=test_base_dir,
+ )
+
+ # Check files are produced
+ data_dir = define_data_dir(
base_dir=test_base_dir,
product="L0A",
data_source=DATA_SOURCE,
campaign_name=CAMPAIGN_NAME,
station_name=STATION_NAME,
)
- assert count_files(station_dir, glob_pattern="*.parquet", recursive=True) > 0
+ assert count_files(data_dir, glob_pattern="*.parquet", recursive=True) > 0
-def test_disdrodb_run_l0b(tmp_path):
+@pytest.mark.parametrize("cli", [True, False])
+def test_disdrodb_run_l0b(tmp_path, cli):
"""Test the disdrodb_run_l0b command."""
test_base_dir = tmp_path / "DISDRODB"
shutil.copytree(BASE_DIR, test_base_dir)
- runner = CliRunner()
- runner.invoke(disdrodb_run_l0a, ["--base_dir", test_base_dir])
-
- runner.invoke(disdrodb_run_l0b, ["--base_dir", test_base_dir])
-
- station_dir = define_station_dir(
+ # Produce data
+ if cli:
+ runner = CliRunner()
+ runner.invoke(
+ disdrodb_run_l0a,
+ [
+ "--base_dir",
+ test_base_dir,
+ "--data_sources",
+ DATA_SOURCE,
+ "--campaign_names",
+ CAMPAIGN_NAME,
+ "--station_names",
+ STATION_NAME,
+ "--verbose",
+ VERBOSE,
+ "--parallel",
+ False,
+ "--debugging_mode",
+ DEBUGGING_MODE,
+ "--force",
+ FORCE,
+ ],
+ )
+
+ runner.invoke(
+ disdrodb_run_l0b,
+ [
+ "--base_dir",
+ test_base_dir,
+ "--data_sources",
+ DATA_SOURCE,
+ "--campaign_names",
+ CAMPAIGN_NAME,
+ "--station_names",
+ STATION_NAME,
+ "--verbose",
+ VERBOSE,
+ "--parallel",
+ False,
+ "--debugging_mode",
+ DEBUGGING_MODE,
+ "--force",
+ FORCE,
+ ],
+ )
+ else:
+ run_disdrodb_l0a(
+ # Station arguments
+ data_sources=DATA_SOURCE,
+ campaign_names=CAMPAIGN_NAME,
+ station_names=STATION_NAME,
+ # Processing options
+ parallel=False,
+ force=FORCE,
+ verbose=VERBOSE,
+ debugging_mode=DEBUGGING_MODE,
+ base_dir=test_base_dir,
+ )
+ run_disdrodb_l0b(
+ # Station arguments
+ data_sources=DATA_SOURCE,
+ campaign_names=CAMPAIGN_NAME,
+ station_names=STATION_NAME,
+ # Processing options
+ parallel=False,
+ force=FORCE,
+ verbose=VERBOSE,
+ debugging_mode=DEBUGGING_MODE,
+ base_dir=test_base_dir,
+ )
+
+ # Check files are produced
+ data_dir = define_data_dir(
base_dir=test_base_dir,
product="L0B",
data_source=DATA_SOURCE,
campaign_name=CAMPAIGN_NAME,
station_name=STATION_NAME,
)
- assert count_files(station_dir, glob_pattern="*.nc", recursive=True) > 0
+ assert count_files(data_dir, glob_pattern="*.nc", recursive=True) > 0
@pytest.mark.parametrize("remove_l0a", [True, False])
@pytest.mark.parametrize("remove_l0b", [True, False])
-@pytest.mark.parametrize("l0b_concat", [True, False])
-def test_disdrodb_run_l0(tmp_path, remove_l0a, remove_l0b, l0b_concat):
+def test_disdrodb_run_l0(tmp_path, remove_l0a, remove_l0b):
"""Test the disdrodb_run_l0b command."""
test_base_dir = tmp_path / "DISDRODB"
shutil.copytree(BASE_DIR, test_base_dir)
+ # Produce data
runner = CliRunner()
runner.invoke(
disdrodb_run_l0,
@@ -168,75 +461,39 @@ def test_disdrodb_run_l0(tmp_path, remove_l0a, remove_l0b, l0b_concat):
remove_l0a,
"--remove_l0b",
remove_l0b,
- "--l0b_concat",
- l0b_concat,
],
)
- l0a_station_dir = define_station_dir(
+ # Check files are produced
+ l0a_data_dir = define_data_dir(
base_dir=test_base_dir,
product="L0A",
data_source=DATA_SOURCE,
campaign_name=CAMPAIGN_NAME,
station_name=STATION_NAME,
)
- l0b_station_dir = define_station_dir(
+ l0b_data_dir = define_data_dir(
base_dir=test_base_dir,
product="L0B",
data_source=DATA_SOURCE,
campaign_name=CAMPAIGN_NAME,
station_name=STATION_NAME,
)
- if remove_l0a:
- assert count_files(l0a_station_dir, glob_pattern="*.parquet", recursive=True) == 0
-
- if not remove_l0a:
- assert count_files(l0a_station_dir, glob_pattern="*.parquet", recursive=True) > 0
-
- if l0b_concat:
- if remove_l0b:
- assert count_files(l0b_station_dir, glob_pattern="*.nc", recursive=True) == 0
- else:
- assert count_files(l0b_station_dir, glob_pattern="*.nc", recursive=True) > 0
-
- # If not L0B concat, do not remove L0B also if remove_l0b is specified !
- if not l0b_concat and remove_l0b:
- assert count_files(l0b_station_dir, glob_pattern="*.nc", recursive=True) > 0
-
-
-@pytest.mark.parametrize("parallel", [True, False])
-@pytest.mark.parametrize("verbose", [True, False])
-def test_disdrodb_run_l0_nc_station(tmp_path, verbose, parallel):
- """Test the disdrodb_run_l0_station process correctly raw netCDF files."""
- BASE_DIR = os.path.join(__root_path__, "disdrodb", "tests", "data", "check_readers", "DISDRODB")
- DATA_SOURCE = "UK"
- CAMPAIGN_NAME = "DIVEN"
- STATION_NAME = "CAIRNGORM"
-
- test_base_dir = tmp_path / "DISDRODB"
- shutil.copytree(BASE_DIR, test_base_dir)
-
- runner = CliRunner()
- runner.invoke(
- disdrodb_run_l0_station,
- [
- DATA_SOURCE,
- CAMPAIGN_NAME,
- STATION_NAME,
- "--base_dir",
- test_base_dir,
- "--verbose",
- verbose,
- "--parallel",
- parallel,
- ],
- )
-
- station_dir = define_station_dir(
+ l0c_data_dir = define_data_dir(
base_dir=test_base_dir,
- product="L0B",
+ product="L0C",
data_source=DATA_SOURCE,
campaign_name=CAMPAIGN_NAME,
station_name=STATION_NAME,
)
- assert count_files(station_dir, glob_pattern="*.nc", recursive=True) > 0
+ if remove_l0a:
+ assert count_files(l0a_data_dir, glob_pattern="*.parquet", recursive=True) == 0
+ else:
+ assert count_files(l0a_data_dir, glob_pattern="*.parquet", recursive=True) > 0
+
+ if remove_l0b:
+ assert count_files(l0b_data_dir, glob_pattern="*.nc", recursive=True) == 0
+ else:
+ assert count_files(l0b_data_dir, glob_pattern="*.nc", recursive=True) > 0
+
+ assert count_files(l0c_data_dir, glob_pattern="*.nc", recursive=True) > 0
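As exercised above, every L0 test is now parametrized over cli, running each processing level both through the Click entry point and through the equivalent disdrodb.routines function. A sketch of the non-CLI path, not part of this patch, using the keyword arguments shown in these tests; base_dir is a placeholder:

    from disdrodb.routines import run_disdrodb_l0a_station

    run_disdrodb_l0a_station(
        data_source="EPFL",
        campaign_name="PARSIVEL_2007",
        station_name="10",
        parallel=False,
        force=False,
        verbose=False,
        debugging_mode=True,   # process only a small subset of the raw files
        base_dir="/path/to/DISDRODB",  # hypothetical archive root
    )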
diff --git a/disdrodb/tests/test_l0/test_io.py b/disdrodb/tests/test_l0/test_io.py
index f598e4ac..6905a9a1 100644
--- a/disdrodb/tests/test_l0/test_io.py
+++ b/disdrodb/tests/test_l0/test_io.py
@@ -23,11 +23,11 @@
import pandas as pd
import pytest
+from disdrodb.api.io import get_filepaths
from disdrodb.api.path import define_campaign_dir
from disdrodb.l0.io import (
_check_glob_pattern,
_read_l0a,
- get_l0a_filepaths,
get_raw_filepaths,
read_l0a_dataframe,
)
@@ -102,18 +102,14 @@ def test_get_l0a_filepaths(tmp_path):
campaign_name = "CAMPAIGN_NAME"
station_name = "STATION_NAME"
- processed_dir = define_campaign_dir(
- base_dir=base_dir,
- product="L0A",
- data_source=data_source,
- campaign_name=campaign_name,
- )
-
# Test that the function raises an error if no files presenet
with pytest.raises(ValueError):
- get_l0a_filepaths(
- processed_dir=processed_dir,
+ _ = get_filepaths(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
station_name=station_name,
+ product="L0A",
)
# Add fake data files
@@ -128,15 +124,24 @@ def test_get_l0a_filepaths(tmp_path):
)
# Test that the function returns the correct number of files in debugging mode
- filepaths = get_l0a_filepaths(
- processed_dir=processed_dir,
+ filepaths = get_filepaths(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
station_name=station_name,
+ product="L0A",
debugging_mode=True,
)
assert len(filepaths) == 2 # max(2, 3)
# Test that the function returns the correct number of files in normal mode
- filepaths = get_l0a_filepaths(processed_dir=processed_dir, station_name=station_name)
+ filepaths = get_filepaths(
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ product="L0A",
+ )
assert len(filepaths) == 2
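For orientation, get_l0a_filepaths(processed_dir=...) is superseded by the product-agnostic disdrodb.api.io.get_filepaths. A sketch of the call, not part of this patch, with placeholder archive values; per the test above, debugging_mode limits the number of files returned:

    from disdrodb.api.io import get_filepaths

    filepaths = get_filepaths(
        base_dir="/path/to/DISDRODB",   # hypothetical archive root
        data_source="DATA_SOURCE",
        campaign_name="CAMPAIGN_NAME",
        station_name="STATION_NAME",
        product="L0A",
        debugging_mode=True,  # limits the number of returned files
    )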
diff --git a/disdrodb/tests/test_l0/test_l0a_processing.py b/disdrodb/tests/test_l0/test_l0a_processing.py
index b45decb1..c87059ad 100644
--- a/disdrodb/tests/test_l0/test_l0a_processing.py
+++ b/disdrodb/tests/test_l0/test_l0a_processing.py
@@ -151,7 +151,8 @@ def test_remove_corrupted_rows():
remove_corrupted_rows(pd.DataFrame())
# Test case 3: Check if the function raises ValueError when only one row remains
- with pytest.raises(ValueError, match=r"Only 1 row remains after data corruption checks. Check the file."):
+ msg = r"Only 1 row remains after data corruption checks. Check the raw file and maybe delete it."
+ with pytest.raises(ValueError, match=msg):
remove_corrupted_rows(pd.DataFrame({"raw_drop_number": ["1"]}))
@@ -569,7 +570,7 @@ def test_write_l0a(tmp_path):
# create dummy dataframe
data = [{"a": "1", "b": "2", "c": "3"}, {"a": "2", "b": "2", "c": "3"}]
df = pd.DataFrame(data).set_index("a")
- df["time"] = pd.Timestamp.now()
+ df["time"] = pd.Timestamp.now().to_numpy().astype("M8[ns]") # open by default as [ns]. Now() returns as [us]
# Write parquet file
filepath = os.path.join(tmp_path, "fake_data_sample.parquet")
diff --git a/disdrodb/tests/test_l0/test_l0b_concat.py b/disdrodb/tests/test_l0/test_l0b_concat.py
deleted file mode 100644
index 2e4e34b6..00000000
--- a/disdrodb/tests/test_l0/test_l0b_concat.py
+++ /dev/null
@@ -1,362 +0,0 @@
-#!/usr/bin/env python3
-
-# -----------------------------------------------------------------------------.
-# Copyright (c) 2021-2023 DISDRODB developers
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <https://www.gnu.org/licenses/>.
-# -----------------------------------------------------------------------------.
-"""Test DISDRODB L0B netCDF concatenation routines."""
-
-import os
-
-import numpy as np
-import pandas as pd
-import pytest
-import xarray as xr
-
-from disdrodb.api.path import define_campaign_dir
-from disdrodb.l0.l0_processing import run_l0b_concat, run_l0b_concat_station
-from disdrodb.l0.routines import run_disdrodb_l0b_concat
-from disdrodb.tests.conftest import create_fake_metadata_file, create_fake_station_dir
-from disdrodb.utils.directories import count_files, list_files
-from disdrodb.utils.netcdf import xr_concat_datasets
-
-
-def create_dummy_l0b_file(filepath: str, time):
- # Define the size of the dimensions
- n_lat = 10
- n_lon = 10
-
- # Assign lat/lon coordinates
- lat_data = np.linspace(-90, 90, n_lat, dtype=np.float32)
- lon_data = np.linspace(-180, 180, n_lon, dtype=np.float32)
-
- # Define variable dictionary
- data = np.random.rand(len(time), len(lat_data), len(lon_data)).astype(np.float32)
- data_vars = {
- "rainfall_rate_32bit": (("time", "lat", "lon"), data),
- }
- # Create the coordinate dictionary
- coords_dict = {
- "lat": ("lat", lat_data),
- "lon": ("lon", lon_data),
- "time": ("time", time),
- }
- # Create a dataset with dimensions lat, lon, and time
- ds = xr.Dataset(data_vars, coords=coords_dict)
- # Set global attribute
- ds.attrs["sensor_name"] = "OTT_Parsivel"
-
- # Set variable attributes
- ds["lat"].attrs["long_name"] = "latitude"
- ds["lat"].attrs["units"] = "degrees_north"
- ds["lon"].attrs["long_name"] = "longitude"
- ds["lon"].attrs["units"] = "degrees_east"
- ds["time"].attrs["long_name"] = "time"
- # ds["time"].attrs["units"] = "days since 2023-01-01"
-
- # Write the dataset to a new NetCDF file
- ds.to_netcdf(filepath)
- ds.close()
- return filepath
-
-
-def test_xr_concat_datasets(tmp_path):
- # Write L0B files
- filepath1 = os.path.join(tmp_path, "test_1.nc")
- filepath2 = os.path.join(tmp_path, "test_2.nc")
-
- time_data_1 = np.array(pd.date_range(start="2023-01-01", periods=3, freq="D"))
- time_data_2 = np.array(pd.date_range(start="2023-01-04", periods=3, freq="D"))
-
- _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1)
- _ = create_dummy_l0b_file(filepath=filepath2, time=time_data_2)
-
- # Check with file in correct orders
- filepaths = [filepath1, filepath2]
- ds = xr_concat_datasets(filepaths)
- time_values = ds["time"].to_numpy()
- assert len(time_values) == 6
- np.testing.assert_allclose(time_values.astype(float), np.concatenate((time_data_1, time_data_2)).astype(float))
-
- # Check with file in reverse orders
- filepaths = [filepath2, filepath1]
- ds = xr_concat_datasets(filepaths)
- time_values = ds["time"].to_numpy()
- assert len(time_values) == 6
- np.testing.assert_allclose(time_values.astype(float), np.concatenate((time_data_1, time_data_2)).astype(float))
-
-
-def test_xr_concat_completely_overlapped_datasets(tmp_path):
- # Write L0B files
- filepath1 = os.path.join(tmp_path, "test_1.nc")
- filepath2 = os.path.join(tmp_path, "test_2.nc")
- filepath3 = os.path.join(tmp_path, "test_3.nc")
-
- time_data_1 = np.array(pd.date_range(start="2023-01-01", periods=6, freq="D"))
- time_data_2 = np.array(pd.date_range(start="2023-01-04", periods=3, freq="D"))
-
- _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1)
- _ = create_dummy_l0b_file(filepath=filepath2, time=time_data_2)
- _ = create_dummy_l0b_file(filepath=filepath3, time=time_data_2[::-1])
-
- # Check with file in correct orders
- filepaths = [filepath1, filepath2]
- ds = xr_concat_datasets(filepaths)
- time_values = ds["time"].to_numpy()
- assert len(time_values) == 6
- np.testing.assert_allclose(time_values.astype(float), time_data_1.astype(float))
-
- # Check with file in reverse orders
- filepaths = [filepath2, filepath1]
- ds = xr_concat_datasets(filepaths)
- time_values = ds["time"].to_numpy()
- assert len(time_values) == 6
- np.testing.assert_allclose(time_values.astype(float), time_data_1.astype(float))
-
- # Check if completely overlapped but reversed order
- filepaths = [filepath2, filepath3]
- ds = xr_concat_datasets(filepaths)
- time_values = ds["time"].to_numpy()
- assert len(time_values) == 3
- np.testing.assert_allclose(time_values.astype(float), time_data_2.astype(float))
-
-
-def test_xr_concat_completely_partial_overlapped_datasets(tmp_path):
- # Write L0B files
- filepath1 = os.path.join(tmp_path, "test_1.nc")
- filepath2 = os.path.join(tmp_path, "test_2.nc")
-
- time_data_1 = np.array(pd.date_range(start="2023-01-01", periods=4, freq="D"))
- time_data_2 = np.array(pd.date_range(start="2023-01-04", periods=3, freq="D"))
-
- unique_time_data = np.sort(np.unique(np.concatenate((time_data_1, time_data_2))))
-
- _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1)
- _ = create_dummy_l0b_file(filepath=filepath2, time=time_data_2)
-
- # Check with file in correct orders
- filepaths = [filepath1, filepath2]
- ds = xr_concat_datasets(filepaths)
- time_values = ds["time"].to_numpy()
- assert len(time_values) == 6
- np.testing.assert_allclose(time_values.astype(float), unique_time_data.astype(float))
-
- # Check with file in reverse orders
- filepaths = [filepath2, filepath1]
- ds = xr_concat_datasets(filepaths)
- time_values = ds["time"].to_numpy()
- assert len(time_values) == 6
- np.testing.assert_allclose(time_values.astype(float), unique_time_data.astype(float))
-
-
-def test_run_l0b_concat(tmp_path):
- # Define station info
- base_dir = tmp_path / "DISDRODB"
- data_source = "DATA_SOURCE"
- campaign_name = "CAMPAIGN_NAME"
- station_name = "test_station"
-
- processed_dir = define_campaign_dir(
- base_dir=base_dir,
- product="L0B",
- data_source=data_source,
- campaign_name=campaign_name,
- )
- # Define fake L0B directory structure
- station_dir = create_fake_station_dir(
- base_dir=base_dir,
- product="L0B",
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name,
- )
-
- # Add dummy L0B files
- filepath1 = os.path.join(station_dir, "test_1.nc")
- filepath2 = os.path.join(station_dir, "test_2.nc")
-
- time_data_1 = np.array([0.0, 1.0, 2.0], dtype=np.float64)
- time_data_2 = np.array([3.0, 4.0, 5.0], dtype=np.float64)
-
- _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1)
- _ = create_dummy_l0b_file(filepath=filepath2, time=time_data_2)
-
- # Monkey patch the write_l0b function
- def mock_write_l0b(ds: xr.Dataset, filepath: str, force=False) -> None:
- ds.to_netcdf(filepath, engine="netcdf4")
-
- from disdrodb.l0 import l0b_processing
-
- l0b_processing.write_l0b = mock_write_l0b
-
- # Run concatenation command
- run_l0b_concat(processed_dir=processed_dir, station_name=station_name, verbose=False)
-
- # Assert only 1 file is created
- list_concatenated_files = list_files(os.path.join(processed_dir, "L0B"), glob_pattern="*.nc", recursive=False)
- assert len(list_concatenated_files) == 1
-
- # Read concatenated netCDF file
- ds = xr.open_dataset(list_concatenated_files[0])
- assert len(ds["time"].to_numpy()) == 6
-
-
-def test_run_l0b_concat_station(tmp_path):
- # Define stations info
- base_dir = tmp_path / "DISDRODB"
- data_source = "DATA_SOURCE"
- campaign_name = "CAMPAIGN_NAME"
- station_name1 = "test_station_1"
-
- # Define fake directory structure for the two L0B stations
- # # Define fake L0B directory structure
- station1_dir = create_fake_station_dir(
- base_dir=base_dir,
- product="L0B",
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name1,
- )
- _ = create_fake_metadata_file(
- base_dir=base_dir,
- product="L0B",
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name1,
- )
-
- # Add dummy L0B files for two stations
- filepath1 = os.path.join(station1_dir, f"{station_name1}_file.nc")
- time_data_1 = np.array([0.0, 1.0, 2.0], dtype=np.float64)
-
- _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1)
-
- # Run concatenation command
- run_l0b_concat_station(
- base_dir=str(base_dir),
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name1,
- remove_l0b=True,
- verbose=False,
- )
-
- # Assert files where removed
- assert not os.path.exists(filepath1)
-
- # Assert the presence of 2 concatenated netcdf files (one for each station)
- processed_dir = define_campaign_dir(
- base_dir=base_dir,
- product="L0B",
- data_source=data_source,
- campaign_name=campaign_name,
- )
-
- assert count_files(os.path.join(processed_dir, "L0B"), glob_pattern="*.nc", recursive=False) == 1
-
- # Check that if L0B files are removed, raise error if no stations available
- with pytest.raises(ValueError):
- run_l0b_concat_station(
- base_dir=str(base_dir),
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name1,
- remove_l0b=True,
- verbose=False,
- )
-
-
-def test_run_disdrodb_l0b_concat(tmp_path):
- # Define stations info
- base_dir = tmp_path / "DISDRODB"
- data_source = "DATA_SOURCE"
- campaign_name = "CAMPAIGN_NAME"
- station_name1 = "test_station_1"
- station_name2 = "test_station_2"
-
- # Define fake directory structure for the two L0B stations
- # # Define fake L0B directory structure
- station1_dir = create_fake_station_dir(
- base_dir=base_dir,
- product="L0B",
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name1,
- )
- station2_dir = create_fake_station_dir(
- base_dir=base_dir,
- product="L0B",
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name2,
- )
- _ = create_fake_metadata_file(
- base_dir=base_dir,
- product="L0B",
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name1,
- )
- _ = create_fake_metadata_file(
- base_dir=base_dir,
- product="L0B",
- data_source=data_source,
- campaign_name=campaign_name,
- station_name=station_name2,
- )
- # Add dummy L0B files for two stations
- filepath1 = os.path.join(station1_dir, f"{station_name1}_file.nc")
- filepath2 = os.path.join(station2_dir, f"{station_name2}_file.nc")
-
- time_data_1 = np.array([0.0, 1.0, 2.0], dtype=np.float64)
- time_data_2 = np.array([3.0, 4.0, 5.0], dtype=np.float64)
-
- _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1)
- _ = create_dummy_l0b_file(filepath=filepath2, time=time_data_2)
-
- # Run concatenation command
- run_disdrodb_l0b_concat(
- base_dir=str(base_dir),
- data_sources=data_source,
- campaign_names=campaign_name,
- station_names=[station_name1, station_name2],
- remove_l0b=True,
- verbose=False,
- )
-
- # Assert files where removed
- assert not os.path.exists(filepath1)
- assert not os.path.exists(filepath2)
-
- # Assert the presence of 2 concatenated netcdf files (one for each station)
- processed_dir = define_campaign_dir(
- base_dir=base_dir,
- product="L0B",
- data_source=data_source,
- campaign_name=campaign_name,
- )
-
- assert count_files(os.path.join(processed_dir, "L0B"), glob_pattern="*.nc", recursive=False) == 2
-
- # Check that if L0B files are removed, raise error if no stations available
- with pytest.raises(ValueError):
- run_disdrodb_l0b_concat(
- base_dir=str(base_dir),
- data_sources=data_source,
- campaign_names=campaign_name,
- station_names=[station_name1, station_name2],
- remove_l0b=True,
- verbose=False,
- )
diff --git a/disdrodb/tests/test_l0/test_l0b_processing.py b/disdrodb/tests/test_l0/test_l0b_processing.py
index 89ad123a..f01413b4 100644
--- a/disdrodb/tests/test_l0/test_l0b_processing.py
+++ b/disdrodb/tests/test_l0/test_l0b_processing.py
@@ -26,8 +26,6 @@
from disdrodb.l0 import l0b_processing
from disdrodb.l0.l0b_processing import (
- _set_attrs_dict,
- _set_coordinate_attributes,
_set_variable_attributes,
add_dataset_crs_coords,
create_l0b_from_l0a,
@@ -168,43 +166,6 @@ def test_add_dataset_crs_coords():
assert ds_out["crs"].to_numpy() == "WGS84"
-def test_set_attrs_dict():
- ds = xr.Dataset({"var1": xr.DataArray([1, 2, 3], dims="time")})
- attrs_dict = {"var1": {"attr1": "value1"}}
- ds = _set_attrs_dict(ds, attrs_dict)
- assert ds["var1"].attrs["attr1"] == "value1"
-
- attrs_dict = {"var2": {"attr1": "value1"}}
- ds = _set_attrs_dict(ds, attrs_dict)
- assert "var2" not in ds
-
- attrs_dict = {"var1": {"attr1": "value1"}, "var2": {"attr2": "value2"}}
- ds = _set_attrs_dict(ds, attrs_dict)
- assert ds["var1"].attrs["attr1"] == "value1"
- assert "var2" not in ds
-
-
-def test__set_coordinate_attributes():
- # Create example dataset
- ds = xr.Dataset(
- {
- "var1": xr.DataArray([1, 2, 3], dims="time"),
- "lat": xr.DataArray([0, 1, 2], dims="time"),
- "lon": xr.DataArray([0, 1, 2], dims="time"),
- },
- )
- ds.lat.attrs["units"] = "degrees_north"
- ds.lon.attrs["units"] = "degrees_east"
-
- # Call the function and check the output
- ds_out = _set_coordinate_attributes(ds)
- assert "units" in ds_out["lat"].attrs
- assert ds_out["lat"].attrs["units"] == "degrees_north"
- assert "units" in ds_out["lon"].attrs
- assert ds_out["lon"].attrs["units"] == "degrees_east"
- assert "units" not in ds_out["var1"].attrs
-
-
def test__set_variable_attributes(mocker):
# Create a sample dataset
data = np.random.rand(10, 10)
@@ -472,79 +433,3 @@ def test__convert_object_variables_to_string():
# Check that variable 'b' is of type 'float'
assert ds["b"].dtype == "float"
-
-
-@pytest.fixture()
-def encoding_dict_1():
- # create a test encoding dictionary
- return {
- "var1": {"dtype": "float32", "chunksizes": (10, 10, 10)},
- "var2": {"dtype": "int16", "chunksizes": (5, 5, 5)},
- "var3": {"dtype": "float64", "chunksizes": (100, 100, 100)},
- }
-
-
-@pytest.fixture()
-def encoding_dict_2():
- # create a test encoding dictionary
- return {
- "var1": {"dtype": "float32", "chunksizes": (100, 100, 100)},
- "var2": {"dtype": "int16", "chunksizes": (100, 100, 100)},
- "var3": {"dtype": "float64", "chunksizes": (100, 100, 100)},
- }
-
-
-@pytest.fixture()
-def ds():
- # create a test xr.Dataset
- data = {
- "var1": (["time", "x", "y"], np.random.random((10, 20, 30))),
- "var2": (["time", "x", "y"], np.random.randint(0, 10, size=(10, 20, 30))),
- "var3": (["time", "x", "y"], np.random.random((10, 20, 30))),
- }
- coords = {"time": np.arange(10), "x": np.arange(20), "y": np.arange(30)}
- return xr.Dataset(data, coords)
-
-
-def test_sanitize_encodings_dict(encoding_dict_1, encoding_dict_2, ds):
- result = l0b_processing.sanitize_encodings_dict(encoding_dict_1, ds)
-
- assert isinstance(result, dict)
-
- # Test that the dictionary contains the same keys as the input dictionary
- assert set(result.keys()) == set(encoding_dict_1.keys())
-
- # Test that the chunk sizes in the returned dictionary are smaller than or equal to the corresponding array shapes
- # in the dataset
- for var in result:
- assert tuple(result[var]["chunksizes"]) <= ds[var].shape
-
- result = l0b_processing.sanitize_encodings_dict(encoding_dict_2, ds)
-
- assert isinstance(result, dict)
-
- # Test that the dictionary contains the same keys as the input dictionary
- assert set(result.keys()) == set(encoding_dict_2.keys())
-
- # Test that the chunk sizes in the returned dictionary are smaller than or equal to the corresponding array shapes
- # in the dataset
- for var in result:
- assert tuple(result[var]["chunksizes"]) <= ds[var].shape
-
-
-def test_rechunk_dataset():
- # Create a sample xarray dataset
- data = {
- "a": (["x", "y"], [[1, 2, 3], [4, 5, 6]]),
- "b": (["x", "y"], [[7, 8, 9], [10, 11, 12]]),
- }
- coords = {"x": [0, 1], "y": [0, 1, 2]}
- ds = xr.Dataset(data, coords=coords)
-
- # Define the encoding dictionary
- encoding_dict = {"a": {"chunksizes": (1, 2)}, "b": {"chunksizes": (2, 1)}}
-
- # Test the rechunk_dataset function
- ds_rechunked = l0b_processing.rechunk_dataset(ds, encoding_dict)
- assert ds_rechunked["a"].chunks == ((1, 1), (2, 1))
- assert ds_rechunked["b"].chunks == ((2,), (1, 1, 1))
diff --git a/disdrodb/tests/test_l0/test_standards.py b/disdrodb/tests/test_l0/test_standards.py
index 10b8436a..ab54864d 100644
--- a/disdrodb/tests/test_l0/test_standards.py
+++ b/disdrodb/tests/test_l0/test_standards.py
@@ -33,7 +33,6 @@
get_l0a_encodings_dict,
get_n_velocity_bins,
get_nan_flags_dict,
- get_time_encoding,
get_valid_coordinates_names,
get_valid_dimension_names,
get_valid_names,
@@ -105,10 +104,6 @@ def test_get_valid_names(sensor_name):
assert isinstance(get_valid_names(sensor_name), list)
-def test_get_time_encoding():
- assert isinstance(get_time_encoding(), dict)
-
-
def test_get_n_velocity_bins():
# Impact disdrometer
sensor_name = "RD_80"
diff --git a/disdrodb/tests/test_utils/test_utils_attrs.py b/disdrodb/tests/test_utils/test_utils_attrs.py
new file mode 100644
index 00000000..b592c96a
--- /dev/null
+++ b/disdrodb/tests/test_utils/test_utils_attrs.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Test DISDRODB netCDF4 attributes utilities."""
+import xarray as xr
+
+from disdrodb.utils.attrs import set_attrs, set_coordinate_attributes
+
+
+def test_set_attrs():
+ ds = xr.Dataset({"var1": xr.DataArray([1, 2, 3], dims="time")})
+ attrs_dict = {"var1": {"attr1": "value1"}}
+ ds = set_attrs(ds, attrs_dict)
+ assert ds["var1"].attrs["attr1"] == "value1"
+
+ attrs_dict = {"var2": {"attr1": "value1"}}
+ ds = set_attrs(ds, attrs_dict)
+ assert "var2" not in ds
+
+ attrs_dict = {"var1": {"attr1": "value1"}, "var2": {"attr2": "value2"}}
+ ds = set_attrs(ds, attrs_dict)
+ assert ds["var1"].attrs["attr1"] == "value1"
+ assert "var2" not in ds
+
+
+def test_set_coordinate_attributes():
+ # Create example dataset
+ ds = xr.Dataset(
+ {
+ "var1": xr.DataArray([1, 2, 3], dims="time"),
+ "lat": xr.DataArray([0, 1, 2], dims="time"),
+ "lon": xr.DataArray([0, 1, 2], dims="time"),
+ },
+ )
+ ds.lat.attrs["units"] = "degrees_north"
+ ds.lon.attrs["units"] = "degrees_east"
+
+ # Call the function and check the output
+ ds_out = set_coordinate_attributes(ds)
+ assert "units" in ds_out["lat"].attrs
+ assert ds_out["lat"].attrs["units"] == "degrees_north"
+ assert "units" in ds_out["lon"].attrs
+ assert ds_out["lon"].attrs["units"] == "degrees_east"
+ assert "units" not in ds_out["var1"].attrs
diff --git a/disdrodb/tests/test_utils/test_utils_encoding.py b/disdrodb/tests/test_utils/test_utils_encoding.py
new file mode 100644
index 00000000..0af75882
--- /dev/null
+++ b/disdrodb/tests/test_utils/test_utils_encoding.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Test DISDRODB netCDF4 encoding utilities."""
+import numpy as np
+import pytest
+import xarray as xr
+
+from disdrodb.utils.encoding import get_time_encoding, rechunk_dataset, sanitize_encodings_dict
+
+
+def test_rechunk_dataset():
+ # Create a sample xarray dataset
+ data = {
+ "a": (["x", "y"], [[1, 2, 3], [4, 5, 6]]),
+ "b": (["x", "y"], [[7, 8, 9], [10, 11, 12]]),
+ }
+ coords = {"x": [0, 1], "y": [0, 1, 2]}
+ ds = xr.Dataset(data, coords=coords)
+
+ # Define the encoding dictionary
+ encoding_dict = {"a": {"chunksizes": (1, 2)}, "b": {"chunksizes": (2, 1)}}
+
+ # Test the rechunk_dataset function
+ ds_rechunked = rechunk_dataset(ds, encoding_dict)
+ assert ds_rechunked["a"].chunks == ((1, 1), (2, 1))
+ assert ds_rechunked["b"].chunks == ((2,), (1, 1, 1))
+
+
+@pytest.fixture
+def encoding_dict_1():
+ # create a test encoding dictionary
+ return {
+ "var1": {"dtype": "float32", "chunksizes": (10, 10, 10)},
+ "var2": {"dtype": "int16", "chunksizes": (5, 5, 5)},
+ "var3": {"dtype": "float64", "chunksizes": (100, 100, 100)},
+ }
+
+
+@pytest.fixture
+def encoding_dict_2():
+ # create a test encoding dictionary
+ return {
+ "var1": {"dtype": "float32", "chunksizes": (100, 100, 100)},
+ "var2": {"dtype": "int16", "chunksizes": (100, 100, 100)},
+ "var3": {"dtype": "float64", "chunksizes": (100, 100, 100)},
+ }
+
+
+@pytest.fixture
+def ds():
+ # create a test xr.Dataset
+ data = {
+ "var1": (["time", "x", "y"], np.random.random((10, 20, 30))),
+ "var2": (["time", "x", "y"], np.random.randint(0, 10, size=(10, 20, 30))),
+ "var3": (["time", "x", "y"], np.random.random((10, 20, 30))),
+ }
+ coords = {"time": np.arange(10), "x": np.arange(20), "y": np.arange(30)}
+ return xr.Dataset(data, coords)
+
+
+def test_sanitize_encodings_dict(encoding_dict_1, encoding_dict_2, ds):
+ result = sanitize_encodings_dict(encoding_dict_1, ds)
+
+ assert isinstance(result, dict)
+
+ # Test that the dictionary contains the same keys as the input dictionary
+ assert set(result.keys()) == set(encoding_dict_1.keys())
+
+ # Test that the chunk sizes in the returned dictionary are smaller than or equal to the corresponding array shapes
+ # in the dataset
+ for var in result:
+ assert tuple(result[var]["chunksizes"]) <= ds[var].shape
+
+ result = sanitize_encodings_dict(encoding_dict_2, ds)
+
+ assert isinstance(result, dict)
+
+ # Test that the dictionary contains the same keys as the input dictionary
+ assert set(result.keys()) == set(encoding_dict_2.keys())
+
+ # Test that the chunk sizes in the returned dictionary are smaller than or equal to the corresponding array shapes
+ # in the dataset
+ for var in result:
+ assert tuple(result[var]["chunksizes"]) <= ds[var].shape
+
+
+def test_get_time_encoding():
+ assert isinstance(get_time_encoding(), dict)
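For context, sanitize_encodings_dict adjusts the requested chunksizes so they do not exceed the shape of the corresponding variable, which is what the assertions above verify. A minimal sketch, not part of this patch, with placeholder variable names:

    import numpy as np
    import xarray as xr
    from disdrodb.utils.encoding import sanitize_encodings_dict

    ds = xr.Dataset({"var1": (("time", "x"), np.zeros((10, 20)))})
    encodings = {"var1": {"dtype": "float32", "chunksizes": (100, 100)}}
    encodings = sanitize_encodings_dict(encodings, ds)
    # chunksizes no longer exceed the variable shape, e.g. clipped to (10, 20)
    assert tuple(encodings["var1"]["chunksizes"]) <= ds["var1"].shape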
diff --git a/disdrodb/tests/test_utils/test_utils_logger.py b/disdrodb/tests/test_utils/test_utils_logger.py
index d9438935..f72c86a1 100644
--- a/disdrodb/tests/test_utils/test_utils_logger.py
+++ b/disdrodb/tests/test_utils/test_utils_logger.py
@@ -22,10 +22,11 @@
import pytest
+from disdrodb.api.path import define_campaign_dir, define_logs_dir
from disdrodb.utils.logger import (
close_logger,
- create_file_logger,
- define_summary_log,
+ create_logger_file,
+ create_product_logs,
log_debug,
log_error,
log_info,
@@ -40,20 +41,42 @@ def create_dummy_log_file(filepath, contents):
return filepath
-def test_define_summary_log(tmp_path):
+def test_create_product_logs(tmp_path):
+ test_base_dir = tmp_path / "DISDRODB"
+ data_source = "DATA_SOURCE"
+ campaign_name = "CAMPAIGN_NAME"
station_name = "STATION_NAME"
- logs_dir = tmp_path / "PRODUCT" / "logs"
- logs_dir.mkdir(parents=True)
-
- logs_station_dir = logs_dir / station_name
- logs_station_dir.mkdir(parents=True, exist_ok=True)
-
- log1_fpath = logs_station_dir / "log1.log"
- log2_fpath = logs_station_dir / "log2.log"
+ product = "L0A"
+
+ # Define directory where logs files are saved
+ logs_dir = define_logs_dir(
+ product=product,
+ base_dir=test_base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+ os.makedirs(logs_dir, exist_ok=True)
+
+ # Define paths of logs files
+ log1_fpath = os.path.join(logs_dir, "log1.log")
+ log2_fpath = os.path.join(logs_dir, "log2.log")
+
+ # Define /summary and /problem directory
+ campaign_dir = define_campaign_dir(
+ base_dir=test_base_dir,
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ )
+ logs_summary_dir = os.path.join(campaign_dir, "logs", "summary")
+ logs_problem_dir = os.path.join(campaign_dir, "logs", "problems")
- summary_log_path = logs_dir / f"logs_summary_{station_name}.log"
- problem_log_path = logs_dir / f"logs_problem_{station_name}.log"
+ # Define summary and problem filepath
+ summary_log_path = os.path.join(logs_summary_dir, f"SUMMARY.{product}.{campaign_name}.{station_name}.log")
+ problem_log_path = os.path.join(logs_problem_dir, f"PROBLEMS.{product}.{campaign_name}.{station_name}.log")
+ ####-------------------------------------.
# Create dummy log files
log_contents1 = (
"INFO: DUMMY MESSAGE \nProcess has started \nWARNING: Potential issue detected \nNOTHING TO SUMMARIZE \n"
@@ -65,15 +88,25 @@ def test_define_summary_log(tmp_path):
# Call the function with the list of log files
list_logs = [str(log_file1), str(log_file2)]
- define_summary_log(list_logs)
+ create_product_logs(
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=test_base_dir,
+ # Logs list
+ list_logs=list_logs,
+ )
# Check summary log file
with open(str(summary_log_path)) as f:
summary_contents = f.read()
- assert "WARNING: Potential issue detected" in summary_contents
- assert "ERROR: Critical failure occurred" in summary_contents
+
assert "Process has started" in summary_contents
assert "Process has ended" in summary_contents
+ assert "WARNING: Potential issue detected" in summary_contents
+ assert "ERROR: Critical failure occurred" in summary_contents
+
assert "INFO: DUMMY MESSAGE" not in summary_contents
assert "NOTHING TO SUMMARIZE" not in summary_contents
@@ -91,32 +124,63 @@ def test_define_summary_log(tmp_path):
def test_define_summary_log_when_no_problems(tmp_path):
"""Test that not problem log file is created if no errors occurs."""
+ test_base_dir = tmp_path / "DISDRODB"
+ data_source = "DATA_SOURCE"
+ campaign_name = "CAMPAIGN_NAME"
station_name = "STATION_NAME"
- logs_dir = tmp_path / "PRODUCT" / "logs"
- logs_dir.mkdir(parents=True)
-
- logs_station_dir = logs_dir / station_name
- logs_station_dir.mkdir(parents=True, exist_ok=True)
-
- log1_fpath = logs_station_dir / "log1.log"
- log2_fpath = logs_station_dir / "log2.log"
+ product = "L0A"
+
+ # Define directory where logs files are saved
+ logs_dir = define_logs_dir(
+ product=product,
+ base_dir=test_base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ )
+ os.makedirs(logs_dir, exist_ok=True)
+
+ # Define paths of logs files
+ log1_fpath = os.path.join(logs_dir, "log1.log")
+ log2_fpath = os.path.join(logs_dir, "log2.log")
+
+ # Define /summary and /problem directory
+ campaign_dir = define_campaign_dir(
+ base_dir=test_base_dir,
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ )
+ logs_summary_dir = os.path.join(campaign_dir, "logs", "summary")
+ logs_problem_dir = os.path.join(campaign_dir, "logs", "problems")
- summary_log_path = logs_dir / f"logs_summary_{station_name}.log"
- problem_log_path = logs_dir / f"logs_problem_{station_name}.log"
+ # Define summary and problem filepath
+ summary_log_path = os.path.join(logs_summary_dir, f"SUMMARY.{product}.{campaign_name}.{station_name}.log")
+ problem_log_path = os.path.join(logs_problem_dir, f"PROBLEMS.{product}.{campaign_name}.{station_name}.log")
+ ####-------------------------------------.
# Check that if no problems, the problems log is not created
log_contents1 = "INFO: DUMMY MESSAGE \nProcess has started \n Process has ended \n"
log_contents2 = "INFO: DUMMY MESSAGE \nProcess has started \n Process has ended \n"
log_file1 = create_dummy_log_file(log1_fpath, log_contents1)
log_file2 = create_dummy_log_file(log2_fpath, log_contents2)
- list_logs = [str(log_file1), str(log_file2)]
- define_summary_log(list_logs)
+ list_logs = [str(log_file1), str(log_file2)] # noqa
+
+ # Create the summary/problem logs, letting create_product_logs search the logs directory itself
+ create_product_logs(
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ base_dir=test_base_dir,
+ list_logs=None, # search for logs based on inputs
+ )
assert os.path.exists(summary_log_path)
assert not os.path.exists(problem_log_path)
-@pytest.fixture()
+@pytest.fixture
def test_logger():
logger = logging.getLogger("test_logger")
logger.setLevel(logging.DEBUG) # Capture all log levels
@@ -155,7 +219,7 @@ def test_log_error(caplog, test_logger, capfd):
assert " - Error message" in out
-@pytest.fixture()
+@pytest.fixture
def log_environment(tmp_path):
processed_dir = tmp_path / "processed"
os.makedirs(processed_dir, exist_ok=True)
@@ -165,9 +229,10 @@ def log_environment(tmp_path):
return processed_dir, product, station_name, filename
-def test_create_file_logger_paralle_false(log_environment):
+def test_create_logger_file_parallel_false(log_environment):
processed_dir, product, station_name, filename = log_environment
- logger = create_file_logger(str(processed_dir), product, station_name, filename, parallel=False)
+ logs_dir = os.path.join(str(processed_dir), "logs", product, station_name)
+ logger, logger_filepath = create_logger_file(logs_dir, filename, parallel=False)
assert isinstance(logger, logging.Logger)
@@ -193,6 +258,7 @@ def test_create_file_logger_paralle_false(log_environment):
def test_close_logger(log_environment):
processed_dir, product, station_name, filename = log_environment
- logger = create_file_logger(str(processed_dir), product, station_name, filename, parallel=False)
+ logs_dir = os.path.join(str(processed_dir), "logs", product, station_name)
+ logger, logger_filepath = create_logger_file(logs_dir, filename, parallel=False)
close_logger(logger)
assert not logger.handlers
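To summarize the reorganized logging helpers tested above: create_logger_file takes the logs directory directly and returns both the logger and its file path, while create_product_logs replaces define_summary_log and writes SUMMARY.* and PROBLEMS.* files under the campaign logs/ directory. A sketch of the flow, not part of this patch, assuming only the signatures used in these tests; paths are placeholders:

    from disdrodb.utils.logger import close_logger, create_logger_file, create_product_logs

    logs_dir = "/path/to/DISDRODB/.../logs/L0A/STATION_NAME"  # hypothetical logs directory
    logger, logger_filepath = create_logger_file(logs_dir, "station_file", parallel=False)
    logger.info("Process has started")
    # ... per-file processing ...
    logger.info("Process has ended")
    close_logger(logger)

    create_product_logs(
        product="L0A",
        data_source="DATA_SOURCE",
        campaign_name="CAMPAIGN_NAME",
        station_name="STATION_NAME",
        base_dir="/path/to/DISDRODB",  # hypothetical archive root
        list_logs=None,  # None -> log files are searched in the product logs directory
    )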
diff --git a/disdrodb/tests/test_utils/test_utils_scripts.py b/disdrodb/tests/test_utils/test_utils_scripts.py
index 240b4a59..27e41c7f 100644
--- a/disdrodb/tests/test_utils/test_utils_scripts.py
+++ b/disdrodb/tests/test_utils/test_utils_scripts.py
@@ -16,9 +16,9 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------.
-"""Test DISDRODB scripts utility."""
+"""Test DISDRODB command-line interface scripts utilities."""
-from disdrodb.utils.scripts import parse_arg_to_list, parse_base_dir
+from disdrodb.utils.cli import parse_arg_to_list, parse_base_dir
def test_parse_arg_to_list_empty_string():
diff --git a/disdrodb/utils/__init__.py b/disdrodb/utils/__init__.py
index e69de29b..9fe0f797 100644
--- a/disdrodb/utils/__init__.py
+++ b/disdrodb/utils/__init__.py
@@ -0,0 +1,17 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""DISDRODB Utils Module."""
diff --git a/disdrodb/utils/attrs.py b/disdrodb/utils/attrs.py
new file mode 100644
index 00000000..c52ade13
--- /dev/null
+++ b/disdrodb/utils/attrs.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""DISDRODB netCDF4 attributes utilities."""
+import datetime
+
+from disdrodb import CONVENTIONS, PRODUCT_VERSION, SOFTWARE_VERSION
+
+####---------------------------------------------------------------------.
+#### Variable attributes
+
+
+def set_attrs(ds, attrs_dict):
+ """Set attributes to the variables of the xr.Dataset."""
+ for var in attrs_dict:
+ if var in ds:
+ ds[var].attrs.update(attrs_dict[var])
+ return ds
+
+
+####---------------------------------------------------------------------.
+#### Coordinates attributes
+
+
+def get_coords_attrs_dict():
+ """Return dictionary with DISDRODB coordinates attributes."""
+ attrs_dict = {}
+ # Define diameter attributes
+ attrs_dict["diameter_bin_center"] = {
+ "name": "diameter_bin_center",
+ "standard_name": "diameter_bin_center",
+ "long_name": "diameter_bin_center",
+ "units": "mm",
+ "description": "Bin center drop diameter value",
+ }
+ attrs_dict["diameter_bin_width"] = {
+ "name": "diameter_bin_width",
+ "standard_name": "diameter_bin_width",
+ "long_name": "diameter_bin_width",
+ "units": "mm",
+ "description": "Drop diameter bin width",
+ }
+ attrs_dict["diameter_bin_upper"] = {
+ "name": "diameter_bin_upper",
+ "standard_name": "diameter_bin_upper",
+ "long_name": "diameter_bin_upper",
+ "units": "mm",
+ "description": "Bin upper bound drop diameter value",
+ }
+ attrs_dict["diameter_bin_lower"] = {
+ "name": "diameter_bin_lower",
+ "standard_name": "diameter_bin_lower",
+ "long_name": "diameter_bin_lower",
+ "units": "mm",
+ "description": "Bin lower bound drop diameter value",
+ }
+ # Define velocity attributes
+ attrs_dict["velocity_bin_center"] = {
+ "name": "velocity_bin_center",
+ "standard_name": "velocity_bin_center",
+ "long_name": "velocity_bin_center",
+ "units": "m/s",
+ "description": "Bin center drop fall velocity value",
+ }
+ attrs_dict["velocity_bin_width"] = {
+ "name": "velocity_bin_width",
+ "standard_name": "velocity_bin_width",
+ "long_name": "velocity_bin_width",
+ "units": "m/s",
+ "description": "Drop fall velocity bin width",
+ }
+ attrs_dict["velocity_bin_upper"] = {
+ "name": "velocity_bin_upper",
+ "standard_name": "velocity_bin_upper",
+ "long_name": "velocity_bin_upper",
+ "units": "m/s",
+ "description": "Bin upper bound drop fall velocity value",
+ }
+ attrs_dict["velocity_bin_lower"] = {
+ "name": "velocity_bin_lower",
+ "standard_name": "velocity_bin_lower",
+ "long_name": "velocity_bin_lower",
+ "units": "m/s",
+ "description": "Bin lower bound drop fall velocity value",
+ }
+ # Define geolocation attributes
+ attrs_dict["latitude"] = {
+ "name": "latitude",
+ "standard_name": "latitude",
+ "long_name": "Latitude",
+ "units": "degrees_north",
+ }
+ attrs_dict["longitude"] = {
+ "name": "longitude",
+ "standard_name": "longitude",
+ "long_name": "Longitude",
+ "units": "degrees_east",
+ }
+ attrs_dict["altitude"] = {
+ "name": "altitude",
+ "standard_name": "altitude",
+ "long_name": "Altitude",
+ "units": "m",
+ "description": "Elevation above sea level",
+ }
+ # Define time attributes
+ attrs_dict["time"] = {
+ "name": "time",
+ "standard_name": "time",
+ "long_name": "time",
+ "description": "UTC Time",
+ }
+
+ return attrs_dict
+
+
+def set_coordinate_attributes(ds):
+ """Set coordinates attributes."""
+ # Get attributes dictionary
+ attrs_dict = get_coords_attrs_dict()
+ # Set attributes
+ ds = set_attrs(ds, attrs_dict)
+ return ds
+
+
+####-------------------------------------------------------------------------.
+#### DISDRODB Global Attributes
+
+
+def set_disdrodb_attrs(ds, product: str):
+ """Add DISDRODB processing information to the netCDF global attributes.
+
+ It assumes station metadata have already been added to the dataset.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Dataset
+ product: str
+ DISDRODB product.
+
+ Returns
+ -------
+ xarray dataset
+ Dataset.
+ """
+ # Add dataset conventions
+ ds.attrs["Conventions"] = CONVENTIONS
+
+ # Add featureType
+ if "platform_type" in ds.attrs:
+ platform_type = ds.attrs["platform_type"]
+ if platform_type == "fixed":
+ ds.attrs["featureType"] = "timeSeries"
+ else:
+ ds.attrs["featureType"] = "trajectory"
+
+ # Update DISDRODB attributes
+ ds = update_disdrodb_attrs(ds=ds, product=product)
+ return ds
+
+
+def update_disdrodb_attrs(ds, product: str):
+ """Add DISDRODB processing information to the netCDF global attributes.
+
+ It assumes station metadata have already been added to the dataset.
+
+ Parameters
+ ----------
+ ds : xarray dataset.
+ Dataset
+ product: str
+ DISDRODB product.
+
+ Returns
+ -------
+ xarray dataset
+ Dataset.
+ """
+ # Add time_coverage_start and time_coverage_end
+ ds.attrs["time_coverage_start"] = str(ds["time"].data[0])
+ ds.attrs["time_coverage_end"] = str(ds["time"].data[-1])
+
+ # DISDRODB attributes
+ # - Add DISDRODB processing info
+ now = datetime.datetime.utcnow()
+ current_time = now.strftime("%Y-%m-%d %H:%M:%S")
+ ds.attrs["disdrodb_processing_date"] = current_time
+ # - Add DISDRODB product and version
+ ds.attrs["disdrodb_product_version"] = PRODUCT_VERSION
+ ds.attrs["disdrodb_software_version"] = SOFTWARE_VERSION
+ ds.attrs["disdrodb_product"] = product
+ return ds
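A minimal usage sketch of the coordinate-attribute helpers above (the toy dataset and its values are assumptions for illustration only):

    import numpy as np
    import xarray as xr

    from disdrodb.utils.attrs import set_coordinate_attributes

    # Toy dataset standing in for a DISDRODB product
    ds = xr.Dataset(
        data_vars={"raw_drop_number": (("time",), np.array([10.0, 12.0]))},
        coords={
            "time": np.array(["2021-01-01T00:00:00", "2021-01-01T00:00:30"], dtype="datetime64[ns]"),
            "latitude": 46.2,
            "longitude": 6.1,
        },
    )

    # Attach the standard DISDRODB coordinate attributes
    ds = set_coordinate_attributes(ds)
    print(ds["latitude"].attrs["units"])    # degrees_north
    print(ds["time"].attrs["description"])  # UTC Time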
diff --git a/disdrodb/utils/cli.py b/disdrodb/utils/cli.py
new file mode 100644
index 00000000..bbe62715
--- /dev/null
+++ b/disdrodb/utils/cli.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""DISDRODB command-line-interface scripts utilities."""
+
+import click
+
+
+def _execute_cmd(cmd, raise_error=False):
+ """Execute command in the terminal, streaming output in python console."""
+ from subprocess import PIPE, CalledProcessError, Popen
+
+ with Popen(cmd, shell=True, stdout=PIPE, bufsize=1, universal_newlines=True) as p:
+ for line in p.stdout:
+ print(line, end="")
+
+ # Raise error if command didn't run successfully
+ if p.returncode != 0 and raise_error:
+ raise CalledProcessError(p.returncode, p.args)
+
+
+def _parse_empty_string_and_none(args):
+ """Utility to parse argument passed from the command line.
+
+ If ``args = ''``, returns None.
+ If ``args = 'None'`` returns None.
+ Otherwise return ``args``.
+ """
+ # If '', set to None
+ args = None if args == "" else args
+ # If the string 'None', set to None
+ if isinstance(args, str) and args == "None":
+ args = None
+ return args
+
+
+def parse_arg_to_list(args):
+ """Utility to pass list to command line scripts.
+
+ If ``args = ''`` returns ``None``.
+ If ``args = 'None'`` returns ``None``.
+ If ``args = 'variable'`` returns ``[variable]``.
+ If ``args = 'variable1 variable2'`` returns ``[variable1, variable2]``.
+ """
+ # If '' or 'None' --> Set to None
+ args = _parse_empty_string_and_none(args)
+ # - If multiple arguments, split by space
+ if isinstance(args, str):
+ # - Split by space
+ list_args = args.split(" ")
+ # - Remove '' (deal with multi space)
+ args = [args for args in list_args if len(args) > 0]
+ return args
+
+
+def parse_base_dir(base_dir):
+ """Utility to parse base_dir provided by command line.
+
+ If ``base_dir = 'None'`` returns ``None``.
+ If ``base_dir = ''`` returns ``None``.
+ """
+ # If '', set to 'None'
+ return _parse_empty_string_and_none(base_dir)
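A quick sketch of the behaviour documented above (values follow the docstrings):

    from disdrodb.utils.cli import parse_arg_to_list, parse_base_dir

    assert parse_arg_to_list("") is None
    assert parse_arg_to_list("None") is None
    assert parse_arg_to_list("EPFL") == ["EPFL"]
    assert parse_arg_to_list("EPFL  NASA") == ["EPFL", "NASA"]  # extra spaces are dropped
    assert parse_base_dir("") is None
    assert parse_base_dir("/data/DISDRODB") == "/data/DISDRODB"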
+
+
+def click_station_arguments(function: object):
+ """Click command line arguments for DISDRODB station processing.
+
+ Parameters
+ ----------
+ function : object
+ Function.
+ """
+ function = click.argument("station_name", metavar="")(function)
+ function = click.argument("campaign_name", metavar="")(function)
+ function = click.argument("data_source", metavar="")(function)
+ return function
+
+
+def click_base_dir_option(function: object):
+ """Click command line argument for DISDRODB ``base_dir``.
+
+ Parameters
+ ----------
+ function : object
+ Function.
+ """
+ function = click.option(
+ "--base_dir",
+ type=str,
+ show_default=True,
+ default=None,
+ help="DISDRODB base directory",
+ )(function)
+ return function
+
+
+def click_stations_options(function: object):
+ """Click command line options for DISDRODB archive L0 processing.
+
+ Parameters
+ ----------
+ function : object
+ Function.
+ """
+ function = click.option(
+ "--data_sources",
+ type=str,
+ show_default=True,
+ default="",
+ help="DISDRODB data sources to process",
+ )(function)
+ function = click.option(
+ "--campaign_names",
+ type=str,
+ show_default=True,
+ default="",
+ help="DISDRODB campaign names to process",
+ )(function)
+ function = click.option(
+ "--station_names",
+ type=str,
+ show_default=True,
+ default="",
+ help="DISDRODB station names to process",
+ )(function)
+ return function
+
+
+def click_processing_options(function: object):
+ """Click command line default parameters for L0 processing options.
+
+ Parameters
+ ----------
+ function : object
+ Function.
+ """
+ function = click.option(
+ "-p",
+ "--parallel",
+ type=bool,
+ show_default=True,
+ default=False,
+ help="Process files in parallel",
+ )(function)
+ function = click.option(
+ "-d",
+ "--debugging_mode",
+ type=bool,
+ show_default=True,
+ default=False,
+ help="Switch to debugging mode",
+ )(function)
+ function = click.option("-v", "--verbose", type=bool, show_default=True, default=True, help="Verbose")(function)
+ function = click.option(
+ "-f",
+ "--force",
+ type=bool,
+ show_default=True,
+ default=False,
+ help="Force overwriting",
+ )(function)
+ return function
+
+
+def click_remove_l0a_option(function: object):
+ """Click command line argument for ``remove_l0a``."""
+ function = click.option(
+ "--remove_l0a",
+ type=bool,
+ show_default=True,
+ default=False,
+ help="If true, remove the L0A files once the L0B processing is terminated.",
+ )(function)
+ return function
+
+
+def click_remove_l0b_option(function: object):
+ """Click command line argument for ``remove_l0b``."""
+ function = click.option(
+ "--remove_l0b",
+ type=bool,
+ show_default=True,
+ default=False,
+ help="If true, remove the L0B files once the L0C processing is terminated.",
+ )(function)
+ return function
+
+
+def click_l0_archive_options(function: object):
+ """Click command line arguments for L0 processing archiving of a station.
+
+ Parameters
+ ----------
+ function : object
+ Function.
+ """
+ function = click.option(
+ "--remove_l0b",
+ type=bool,
+ show_default=True,
+ default=False,
+ help="If true, remove all source L0B files once L0B concatenation is terminated.",
+ )(function)
+ function = click.option(
+ "--remove_l0a",
+ type=bool,
+ show_default=True,
+ default=False,
+ help="If true, remove the L0A files once the L0B processing is terminated.",
+ )(function)
+ function = click.option(
+ "-l0c",
+ "--l0c_processing",
+ type=bool,
+ show_default=True,
+ default=True,
+ help="Perform L0C processing.",
+ )(function)
+ function = click.option(
+ "-l0b",
+ "--l0b_processing",
+ type=bool,
+ show_default=True,
+ default=True,
+ help="Perform L0B processing.",
+ )(function)
+ function = click.option(
+ "-l0a",
+ "--l0a_processing",
+ type=bool,
+ show_default=True,
+ default=True,
+ help="Perform L0A processing.",
+ )(function)
+ return function
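As a sketch, these decorators are meant to be stacked onto a click command; the hypothetical ``run_station`` command below is illustrative only and not a real DISDRODB entry point:

    import click

    from disdrodb.utils.cli import (
        click_base_dir_option,
        click_processing_options,
        click_station_arguments,
        parse_base_dir,
    )


    @click.command()
    @click_station_arguments
    @click_base_dir_option
    @click_processing_options
    def run_station(data_source, campaign_name, station_name, base_dir, parallel, debugging_mode, verbose, force):
        """Hypothetical command wiring the shared DISDRODB CLI options."""
        base_dir = parse_base_dir(base_dir)
        click.echo(f"Processing {data_source}/{campaign_name}/{station_name} (parallel={parallel})")


    if __name__ == "__main__":
        run_station()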
diff --git a/disdrodb/utils/dask.py b/disdrodb/utils/dask.py
new file mode 100644
index 00000000..ee3c5aae
--- /dev/null
+++ b/disdrodb/utils/dask.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""Utilities for Dask Distributed computations."""
+import logging
+import os
+
+
+def initialize_dask_cluster():
+ """Initialize Dask Cluster."""
+ import dask
+ from dask.distributed import Client, LocalCluster
+
+ # Set HDF5_USE_FILE_LOCKING to avoid getting stuck with HDF file locking
+ os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
+ # Retrieve the number of processes to run
+ available_workers = os.cpu_count() - 2 # if not set, all CPUs
+ num_workers = dask.config.get("num_workers", available_workers)
+ # Silence dask warnings
+ dask.config.set({"logging.distributed": "error"})
+ # dask.config.set({"distributed.admin.system-monitor.gil.enabled": False})
+ # Create dask.distributed local cluster
+ cluster = LocalCluster(
+ n_workers=num_workers,
+ threads_per_worker=1,
+ processes=True,
+ # memory_limit='8GB',
+ # silence_logs=False,
+ )
+ client = Client(cluster)
+ return cluster, client
+
+
+def close_dask_cluster(cluster, client):
+ """Close Dask Cluster."""
+ logger = logging.getLogger()
+ # Backup current log level
+ original_level = logger.level
+ logger.setLevel(logging.CRITICAL + 1) # Set level to suppress all logs
+ # Close cluster
+ # - Avoid log 'distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.'
+ try:
+ cluster.close()
+ client.close()
+ finally:
+ # Restore the original log level
+ logger.setLevel(original_level)
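A sketch of the intended cluster lifecycle (assuming ``dask.distributed`` is installed; the squared-integer workload is a stand-in):

    import dask

    from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster


    def square(x):
        """Stand-in workload."""
        return x**2


    if __name__ == "__main__":
        cluster, client = initialize_dask_cluster()
        try:
            tasks = [dask.delayed(square)(i) for i in range(10)]
            results = dask.compute(*tasks)
            print(sum(results))  # 285
        finally:
            close_dask_cluster(cluster, client)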
diff --git a/disdrodb/utils/decorator.py b/disdrodb/utils/decorator.py
new file mode 100644
index 00000000..64bd76e1
--- /dev/null
+++ b/disdrodb/utils/decorator.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""DISDRODB decorators."""
+import functools
+
+import dask
+
+
+def delayed_if_parallel(function):
+ """Decorator to make the function delayed if its ``parallel`` argument is ``True``."""
+
+ @functools.wraps(function)
+ def wrapper(*args, **kwargs):
+ # Check if it must be a delayed function
+ parallel = kwargs.get("parallel")
+ # If parallel is True
+ if parallel:
+ # Enforce verbose to be False
+ kwargs["verbose"] = False
+ # Define the delayed task
+ result = dask.delayed(function)(*args, **kwargs)
+ else:
+ # Else run the function
+ result = function(*args, **kwargs)
+ return result
+
+ return wrapper
+
+
+def single_threaded_if_parallel(function):
+ """Decorator to make a function run with the single-threaded (synchronous) scheduler if its ``parallel`` argument is ``True``."""
+
+ @functools.wraps(function)
+ def wrapper(*args, **kwargs):
+ # Check if it must be a delayed function
+ parallel = kwargs.get("parallel")
+ # If parallel is True
+ if parallel:
+ # Call function with single thread
+ # with dask.config.set(scheduler='single-threaded'):
+ with dask.config.set(scheduler="synchronous"):
+ result = function(*args, **kwargs)
+ else:
+ # Else run the function as usual
+ result = function(*args, **kwargs)
+ return result
+
+ return wrapper
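A sketch of how ``delayed_if_parallel`` is meant to wrap a per-file task (the ``process_file`` body is a placeholder, not DISDRODB code):

    import dask

    from disdrodb.utils.decorator import delayed_if_parallel


    @delayed_if_parallel
    def process_file(filepath, parallel=False, verbose=True):
        """Placeholder per-file task."""
        return len(filepath)


    # parallel=False: the function runs eagerly and returns its value
    print(process_file("0.txt", parallel=False))  # 5

    # parallel=True: a dask.delayed object is returned; the batch is computed at the end
    tasks = [process_file(f"{i}.txt", parallel=True) for i in range(3)]
    print(dask.compute(*tasks))  # (5, 5, 5)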
diff --git a/disdrodb/utils/directories.py b/disdrodb/utils/directories.py
index 8eba18b6..2db94043 100644
--- a/disdrodb/utils/directories.py
+++ b/disdrodb/utils/directories.py
@@ -90,21 +90,18 @@ def create_directory(path: str, exist_ok=True) -> None:
os.makedirs(path, exist_ok=exist_ok)
logger.debug(f"Created directory {path}.")
except Exception as e:
+ dir_path = os.path.dirname(path)
dir_name = os.path.basename(path)
- msg = f"Can not create directory {dir_name} inside . Error: {e}"
+ msg = f"Can not create directory {dir_name} inside {dir_path}. Error: {e}"
logger.exception(msg)
raise FileNotFoundError(msg)
-def create_required_directory(dir_path, dir_name):
+def create_required_directory(dir_path, dir_name, exist_ok=True):
"""Create directory ``dir_name`` inside the ``dir_path`` directory."""
- try:
- new_dir = os.path.join(dir_path, dir_name)
- os.makedirs(new_dir, exist_ok=True)
- except Exception as e:
- msg = f"Can not create directory {dir_name} at {new_dir}. Error: {e}"
- logger.exception(msg)
- raise FileNotFoundError(msg)
+ dir_path = ensure_string_path(dir_path, msg="'path' must be a string", accepth_pathlib=True)
+ new_dir_path = os.path.join(dir_path, dir_name)
+ create_directory(path=new_dir_path, exist_ok=exist_ok)
def is_empty_directory(path):
@@ -119,9 +116,7 @@ def is_empty_directory(path):
return False
paths = os.listdir(path)
- if len(paths) == 0:
- return True
- return False
+ return len(paths) == 0
def _remove_file_or_directories(path):
diff --git a/disdrodb/utils/encoding.py b/disdrodb/utils/encoding.py
new file mode 100644
index 00000000..7b052a56
--- /dev/null
+++ b/disdrodb/utils/encoding.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""DISDRODB netCDF4 encoding utilities."""
+import xarray as xr
+
+EPOCH = "seconds since 1970-01-01 00:00:00"
+
+
+def set_encodings(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
+ """Apply the encodings to the xarray Dataset.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Input xarray dataset.
+ encoding_dict : dict
+ Dictionary with encoding specifications.
+
+ Returns
+ -------
+ xr.Dataset
+ Output xarray dataset.
+ """
+ # Subset encoding dictionary
+ # - Here below encoding_dict contains only keys (variables) within the dataset
+ encoding_dict = {var: encoding_dict[var] for var in ds.data_vars if var in encoding_dict}
+
+ # Ensure chunksize smaller than the array shape
+ encoding_dict = sanitize_encodings_dict(encoding_dict, ds)
+
+ # Rechunk variables for fast writing !
+ # - This pops the chunksizes argument from the encoding dict !
+ ds = rechunk_dataset(ds, encoding_dict)
+
+ # Set time encoding
+ ds["time"].encoding.update(get_time_encoding())
+
+ # Set the variable encodings
+ for var, encoding in encoding_dict.items():
+ ds[var].encoding.update(encoding)
+
+ # Ensure no deprecated "missing_value" attribute
+ # - When source dataset is netcdf (i.e. ARM)
+ for var in list(ds.variables):
+ _ = ds[var].encoding.pop("missing_value", None)
+
+ return ds
+
+
+def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict:
+ """Ensure chunk size to be smaller than the array shape.
+
+ Parameters
+ ----------
+ encoding_dict : dict
+ Dictionary containing the variable encodings.
+ ds : xarray.Dataset
+ Input dataset.
+
+ Returns
+ -------
+ dict
+ Encoding dictionary.
+ """
+ for var in ds.data_vars:
+ if var in encoding_dict:
+ shape = ds[var].shape
+ chunks = encoding_dict[var].get("chunksizes", None)
+ if chunks is not None:
+ chunks = [shape[i] if chunks[i] > shape[i] else chunks[i] for i in range(len(chunks))]
+ encoding_dict[var]["chunksizes"] = chunks
+ return encoding_dict
+
+
+def rechunk_dataset(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset:
+ """Coerce the dataset arrays to have the chunk size specified in the encoding dictionary.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Input xarray dataset
+ encoding_dict : dict
+ Dictionary containing the encoding to write the xarray dataset as a netCDF.
+
+ Returns
+ -------
+ xr.Dataset
+ Output xarray dataset
+ """
+ for var in ds.data_vars:
+ if var in encoding_dict:
+ chunks = encoding_dict[var].pop("chunksizes", None)
+ if chunks is not None:
+ dims = list(ds[var].dims)
+ chunks_dict = dict(zip(dims, chunks))
+ ds[var] = ds[var].chunk(chunks_dict)
+ return ds
+
+
+def get_time_encoding() -> dict:
+ """Create time encoding.
+
+ Returns
+ -------
+ dict
+ Time encoding.
+ """
+ encoding = {}
+ encoding["units"] = EPOCH
+ encoding["calendar"] = "proleptic_gregorian"
+ return encoding
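A sketch of how an encoding dictionary flows through these helpers before writing (the variable name, compression settings, and chunk sizes are illustrative assumptions):

    import numpy as np
    import xarray as xr

    from disdrodb.utils.encoding import set_encodings

    # Toy dataset with a time dimension
    ds = xr.Dataset(
        {"raw_drop_number": (("time",), np.arange(10, dtype="float32"))},
        coords={"time": np.arange(10).astype("datetime64[s]")},
    )

    # Illustrative encoding with an oversized chunk size
    # (sanitize_encodings_dict clips it to the array shape)
    encoding_dict = {"raw_drop_number": {"zlib": True, "complevel": 1, "chunksizes": [100]}}

    ds = set_encodings(ds, encoding_dict)
    print(ds["raw_drop_number"].chunks)            # ((10,),) -> chunk clipped to the array length
    print(ds["raw_drop_number"].encoding["zlib"])  # True
    print(ds["time"].encoding["units"])            # seconds since 1970-01-01 00:00:00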
diff --git a/disdrodb/utils/logger.py b/disdrodb/utils/logger.py
index 2fae3d30..42f55080 100644
--- a/disdrodb/utils/logger.py
+++ b/disdrodb/utils/logger.py
@@ -24,10 +24,9 @@
from asyncio.log import logger
-def create_file_logger(processed_dir, product, station_name, filename, parallel):
- """Create file logger."""
+def create_logger_file(logs_dir, filename, parallel):
+ """Create logger file."""
# Create logs directory
- logs_dir = os.path.join(processed_dir, "logs", product, station_name)
os.makedirs(logs_dir, exist_ok=True)
# Define logger filepath
@@ -44,10 +43,14 @@ def create_file_logger(processed_dir, product, station_name, filename, parallel)
handler.setFormatter(logging.Formatter(format_type))
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
- return logger
+
+ # Define logger filepath
+ # - LogCaptureHandler of pytest does not have baseFilename attribute --> So set None
+ logger_filepath = logger.handlers[0].baseFilename if not os.environ.get("PYTEST_CURRENT_TEST") else None
+ return logger, logger_filepath
-def close_logger(logger: logger) -> None:
+def close_logger(logger) -> None:
"""Close the logger.
Parameters
@@ -80,7 +83,8 @@ def log_debug(logger: logger, msg: str, verbose: bool = False) -> None:
"""
if verbose:
print(" - " + msg)
- logger.debug(msg)
+ if logger is not None:
+ logger.debug(msg)
def log_info(logger: logger, msg: str, verbose: bool = False) -> None:
@@ -98,7 +102,8 @@ def log_info(logger: logger, msg: str, verbose: bool = False) -> None:
"""
if verbose:
print(" - " + msg)
- logger.info(msg)
+ if logger is not None:
+ logger.info(msg)
def log_warning(logger: logger, msg: str, verbose: bool = False) -> None:
@@ -116,7 +121,8 @@ def log_warning(logger: logger, msg: str, verbose: bool = False) -> None:
"""
if verbose:
print(" - " + msg)
- logger.warning(msg)
+ if logger is not None:
+ logger.warning(msg)
def log_error(logger: logger, msg: str, verbose: bool = False) -> None:
@@ -134,15 +140,12 @@ def log_error(logger: logger, msg: str, verbose: bool = False) -> None:
"""
if verbose:
print(" - " + msg)
- logger.error(msg)
+ if logger is not None:
+ logger.error(msg)
-def _get_logs_dir(list_logs):
- list_logs = sorted(list_logs)
- station_logs_dir = os.path.dirname(list_logs[0])
- station_name = station_logs_dir.split(os.path.sep)[-1]
- logs_dir = os.path.dirname(station_logs_dir)
- return station_name, logs_dir
+####---------------------------------------------------------------------------.
+#### SUMMARY LOGS
def _define_station_summary_log_file(list_logs, summary_filepath):
@@ -163,16 +166,27 @@ def _define_station_summary_log_file(list_logs, summary_filepath):
def _define_station_problem_log_file(list_logs, problem_filepath):
# - Copy the log of files with warnings and error
list_keywords = ["ERROR"] # "WARNING"
+ list_patterns = ["ValueError: Less than 5 timesteps available for day"]
re_keyword = re.compile("|".join(list_keywords))
+ # Compile patterns to ignore, escaping any special regex characters
+ re_patterns = re.compile("|".join(map(re.escape, list_patterns))) if list_patterns else None
+ # Initialize problem log file
any_problem = False
+ n_files = len(list_logs)
+ n_files_with_problems = 0
with open(problem_filepath, "w") as output_file:
+ # Loop over log files and collect problems
for log_filepath in list_logs:
log_with_problem = False
# Check if an error is reported
with open(log_filepath) as input_file:
for line in input_file:
if re_keyword.search(line):
+ # If the line matches an ignore pattern, skip it
+ if re_patterns and re_patterns.search(line):
+ continue
log_with_problem = True
+ n_files_with_problems += 1
any_problem = True
break
# If it is reported, copy the log file in the logs_problem file
@@ -180,34 +194,154 @@ def _define_station_problem_log_file(list_logs, problem_filepath):
with open(log_filepath) as input_file:
output_file.write(input_file.read())
+ # Add number of files with problems
+ msg = f"SUMMARY: {n_files_with_problems} of {n_files} files had problems."
+ output_file.write(msg)
+
# If no problems occurred, remove the logs_problem_.log file
if not any_problem:
os.remove(problem_filepath)
-def define_summary_log(list_logs):
- """Define a station summary and a problems log file from the list of input logs.
-
- The summary log select only logged lines with ``root``, ``WARNING`` and ``ERROR`` keywords.
- The problems log file select only logged lines with the ``ERROR`` keyword.
- The two log files are saved in the parent directory of the input ``list_logs``.
-
- The function assume that the files logs are located at:
+def create_product_logs(
+ product,
+ data_source,
+ campaign_name,
+ station_name,
+ base_dir=None,
+ # Product options
+ sample_interval=None,
+ rolling=None,
+ model_name=None,
+ # Logs list
+ list_logs=None, # If none, list it !
+):
+ """Create station summary and station problems log files.
+
+ The summary log selects only logged lines with ``root``, ``WARNING``, and ``ERROR`` keywords.
+ The problems log file selects only logged lines with the ``ERROR`` keyword.
+
+ The logs directory structure is as follows:
+ /logs
+ - /files// (same structure as data ... a log for each processed file)
+ - /summary
+ --> SUMMARY....log
+ - /problems
+ --> PROBLEMS....log
- ``/DISDRODB/Processed///logs///.log``
+ Parameters
+ ----------
+ product : str
+ The DISDRODB product.
+ data_source : str
+ The data source name.
+ campaign_name : str
+ The campaign name.
+ station_name : str
+ The station name.
+ base_dir : str, optional
+ The base directory path. Default is None.
+ sample_interval : str, optional
+ The sample interval for L2E option. Default is None.
+ rolling : str, optional
+ The rolling option for L2E. Default is None.
+ model_name : str, optional
+ The model name for L2M. Default is None.
+ list_logs : list, optional
+ List of log file paths. If None, the function will list the log files.
+
+ Returns
+ -------
+ None
"""
+ from disdrodb.api.path import define_campaign_dir, define_filename, define_logs_dir
+ from disdrodb.utils.directories import list_files
+
+ # --------------------------------------------------------.
+ # Search for logs file
+ if list_logs is None:
+ # Define product logs directory within /files/....
+ logs_dir = define_logs_dir(
+ product=product,
+ base_dir=base_dir,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # Option for L2E
+ sample_interval=sample_interval,
+ rolling=rolling,
+ # Option for L2M
+ model_name=model_name,
+ )
+ list_logs = list_files(logs_dir, glob_pattern="*", recursive=True)
+
+ # --------------------------------------------------------.
# LogCaptureHandler of pytest does not have baseFilename attribute, so it returns None
if list_logs[0] is None:
return
- station_name, logs_dir = _get_logs_dir(list_logs)
-
+ # --------------------------------------------------------.
+ # Define /summary and /problem directory
+ campaign_dir = define_campaign_dir(
+ base_dir=base_dir,
+ product=product,
+ data_source=data_source,
+ campaign_name=campaign_name,
+ )
+ logs_summary_dir = os.path.join(campaign_dir, "logs", "summary")
+ logs_problem_dir = os.path.join(campaign_dir, "logs", "problems")
+
+ os.makedirs(logs_summary_dir, exist_ok=True)
+ os.makedirs(logs_problem_dir, exist_ok=True)
+
+ # --------------------------------------------------------.
# Define station summary log file name
- summary_filepath = os.path.join(logs_dir, f"logs_summary_{station_name}.log")
+ summary_filename = define_filename(
+ product=product,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # L2E option
+ sample_interval=sample_interval,
+ rolling=rolling,
+ # L2M option
+ model_name=model_name,
+ # Filename options
+ add_version=False,
+ add_time_period=False,
+ add_extension=False,
+ prefix="SUMMARY",
+ suffix="log",
+ )
+ summary_filepath = os.path.join(logs_summary_dir, summary_filename)
+
# Define station problem logs file name
- problem_filepath = os.path.join(logs_dir, f"logs_problem_{station_name}.log")
- # Create station summary log file
+ problem_filename = define_filename(
+ product=product,
+ campaign_name=campaign_name,
+ station_name=station_name,
+ # L2E option
+ sample_interval=sample_interval,
+ rolling=rolling,
+ # L2M option
+ model_name=model_name,
+ # Filename options
+ add_version=False,
+ add_time_period=False,
+ add_extension=False,
+ prefix="PROBLEMS",
+ suffix="log",
+ )
+ problem_filepath = os.path.join(logs_problem_dir, problem_filename)
+
+ # --------------------------------------------------------.
+ # Create summary log file
_define_station_summary_log_file(list_logs, summary_filepath)
- # Create station ptoblems log file (if no problems, no file)
+
+ # Create problem log file (if no problems, no file created)
_define_station_problem_log_file(list_logs, problem_filepath)
+
+ # --------------------------------------------------------.
+ # Remove /problem directory if empty !
+ if len(os.listdir(logs_problem_dir)) == 0:
+ os.rmdir(logs_problem_dir)
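A sketch of the per-file logging flow after the refactoring (the logs directory below is an assumption; in the processing chain it would be built via ``disdrodb.api.path.define_logs_dir`` and the per-file logs later aggregated by ``create_product_logs``):

    from disdrodb.utils.logger import close_logger, create_logger_file, log_info

    # Illustrative logs directory (not a real archive path)
    logs_dir = "/tmp/DISDRODB/logs/files/L0A/STATION_NAME"

    logger, logger_filepath = create_logger_file(logs_dir, filename="file_001", parallel=False)
    log_info(logger, "Processing started", verbose=True)
    log_info(logger, "Processing ended", verbose=True)
    close_logger(logger)
    print(logger_filepath)  # path of the created log file (None when running under pytest)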
diff --git a/disdrodb/utils/scripts.py b/disdrodb/utils/scripts.py
deleted file mode 100644
index 86d35924..00000000
--- a/disdrodb/utils/scripts.py
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/bin/env python3
-
-# -----------------------------------------------------------------------------.
-# Copyright (c) 2021-2023 DISDRODB developers
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see .
-# -----------------------------------------------------------------------------.
-"""DISDRODB scripts utility."""
-
-import click
-
-
-def _execute_cmd(cmd, raise_error=False):
- """Execute command in the terminal, streaming output in python console."""
- from subprocess import PIPE, CalledProcessError, Popen
-
- with Popen(cmd, shell=True, stdout=PIPE, bufsize=1, universal_newlines=True) as p:
- for line in p.stdout:
- print(line, end="")
-
- # Raise error if command didn't run successfully
- if p.returncode != 0 and raise_error:
- raise CalledProcessError(p.returncode, p.args)
-
-
-def _parse_empty_string_and_none(args):
- """Utility to parse argument passed from the command line.
-
- If ``args = ''``, returns None.
- If ``args = 'None'`` returns None.
- Otherwise return ``args``.
- """
- # If '', set to 'None'
- args = None if args == "" else args
- # - If multiple arguments, split by space
- if isinstance(args, str) and args == "None":
- args = None
- return args
-
-
-def parse_arg_to_list(args):
- """Utility to pass list to command line scripts.
-
- If ``args = ''`` returns ``None``.
- If ``args = 'None'`` returns ``None``.
- If ``args = 'variable'`` returns ``[variable]``.
- If ``args = 'variable1 variable2'`` returns ``[variable1, variable2]``.
- """
- # If '' or 'None' --> Set to None
- args = _parse_empty_string_and_none(args)
- # - If multiple arguments, split by space
- if isinstance(args, str):
- # - Split by space
- list_args = args.split(" ")
- # - Remove '' (deal with multi space)
- args = [args for args in list_args if len(args) > 0]
- return args
-
-
-def parse_base_dir(base_dir):
- """Utility to parse base_dir provided by command line.
-
- If ``base_dir = 'None'`` returns ``None``.
- If ``base_dir = ''`` returns ``None``.
- """
- # If '', set to 'None'
- return _parse_empty_string_and_none(base_dir)
-
-
-def click_station_arguments(function: object):
- """Click command line arguments for DISDRODB station processing.
-
- Parameters
- ----------
- function : object
- Function.
- """
- function = click.argument("station_name", metavar="")(function)
- function = click.argument("campaign_name", metavar="")(function)
- function = click.argument("data_source", metavar="")(function)
- return function
-
-
-def click_base_dir_option(function: object):
- """Click command line argument for DISDRODB ``base_dir``.
-
- Parameters
- ----------
- function : object
- Function.
- """
- function = click.option(
- "--base_dir",
- type=str,
- show_default=True,
- default=None,
- help="DISDRODB base directory",
- )(function)
- return function
diff --git a/disdrodb/utils/time.py b/disdrodb/utils/time.py
new file mode 100644
index 00000000..2da1aa1b
--- /dev/null
+++ b/disdrodb/utils/time.py
@@ -0,0 +1,545 @@
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+ """This module contains utilities related to the processing of temporal datasets."""
+import logging
+import re
+from typing import Optional
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+from xarray.core import dtypes
+
+from disdrodb.utils.logger import log_info, log_warning
+
+logger = logging.getLogger(__name__)
+
+####------------------------------------------------------------------------------------.
+#### Sampling Interval Acronyms
+
+
+def seconds_to_acronym(seconds):
+ """
+ Convert a duration in seconds to a readable string format (e.g., "1H30MIN", "1D2H").
+
+ Parameters
+ ----------
+ seconds : int
+ The time duration in seconds.
+
+ Returns
+ -------
+ str
+ The duration as a string in a format like "30S", "1MIN30S", "1H30MIN", or "1D2H".
+ """
+ timedelta = pd.Timedelta(seconds=seconds)
+ components = timedelta.components
+
+ parts = []
+ if components.days > 0:
+ parts.append(f"{components.days}D")
+ if components.hours > 0:
+ parts.append(f"{components.hours}H")
+ if components.minutes > 0:
+ parts.append(f"{components.minutes}MIN")
+ if components.seconds > 0:
+ parts.append(f"{components.seconds}S")
+ acronym = "".join(parts)
+ return acronym
+
+
+def get_resampling_information(sample_interval_acronym):
+ """
+ Extract resampling information from the sample interval acronym.
+
+ Parameters
+ ----------
+ sample_interval_acronym: str
+ A string representing the sample interval: e.g., "1H30MIN", "ROLL1H30MIN".
+
+ Returns
+ -------
+ sample_interval_seconds, rolling: tuple
+ Sample_interval in seconds and whether rolling is enabled.
+ """
+ rolling = sample_interval_acronym.startswith("ROLL")
+ if rolling:
+ sample_interval_acronym = sample_interval_acronym[4:] # Remove "ROLL"
+
+ # Allowed pattern: one or more 'digits + unit' groups,
+ # where unit is exactly one of D, H, MIN, or S.
+ # Examples: 1H, 30MIN, 2D, 45S, and any concatenation like 1H30MIN.
+ pattern = r"^(\d+(?:D|H|MIN|S))+$"
+
+ # Check if the entire string matches the pattern
+ if not re.match(pattern, sample_interval_acronym):
+ raise ValueError(
+ f"Invalid sample interval acronym '{sample_interval_acronym}'. "
+ "Must be composed of one or more 'digits + unit' groups, where unit is D, H, MIN, or S.",
+ )
+
+ # Regular expression to match duration components and extract all (value, unit) pairs
+ pattern = r"(\d+)(D|H|MIN|S)"
+ matches = re.findall(pattern, sample_interval_acronym)
+
+ # Conversion factors for each unit
+ unit_to_seconds = {
+ "D": 86400, # Seconds in a day
+ "H": 3600, # Seconds in an hour
+ "MIN": 60, # Seconds in a minute
+ "S": 1, # Seconds in a second
+ }
+
+ # Parse matches and calculate total seconds
+ sample_interval = 0
+ for value, unit in matches:
+ value = int(value)
+ if unit in unit_to_seconds:
+ sample_interval += value * unit_to_seconds[unit]
+ return sample_interval, rolling
+
+
+def acronym_to_seconds(acronym):
+ """
+ Extract the interval in seconds from the duration acronym.
+
+ Parameters
+ ----------
+ acronym: str
+ A string representing a duration: e.g., "1H30MIN", "ROLL1H30MIN".
+
+ Returns
+ -------
+ seconds
+ Duration in seconds.
+ """
+ seconds, _ = get_resampling_information(acronym)
+ return seconds
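A sketch of the round trip between seconds and duration acronyms implied by the helpers above:

    from disdrodb.utils.time import acronym_to_seconds, get_resampling_information, seconds_to_acronym

    print(seconds_to_acronym(30))     # 30S
    print(seconds_to_acronym(5400))   # 1H30MIN
    print(seconds_to_acronym(93600))  # 1D2H

    print(get_resampling_information("10MIN"))        # (600, False)
    print(get_resampling_information("ROLL1H30MIN"))  # (5400, True)
    print(acronym_to_seconds("1D2H"))                 # 93600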
+
+
+####------------------------------------------------------------------------------------.
+#### Xarray utilities
+
+
+def get_dataset_start_end_time(ds: xr.Dataset, time_dim="time"):
+ """Retrieve the dataset starting and ending time.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Input dataset
+ time_dim: str
+ Name of the time dimension.
+ The default is "time".
+
+ Returns
+ -------
+ tuple
+ (``starting_time``, ``ending_time``)
+
+ """
+ starting_time = ds[time_dim].to_numpy()[0]
+ ending_time = ds[time_dim].to_numpy()[-1]
+ return (starting_time, ending_time)
+
+
+def _define_fill_value(ds, fill_value):
+ fill_value = {}
+ for var in ds.data_vars:
+ if np.issubdtype(ds[var].dtype, np.floating):
+ fill_value[var] = dtypes.NA
+ elif np.issubdtype(ds[var].dtype, np.integer):
+ if "_FillValue" in ds[var].attrs:
+ fill_value[var] = ds[var].attrs["_FillValue"]
+ else:
+ fill_value[var] = np.iinfo(ds[var].dtype).max
+ return fill_value
+
+
+def _check_time_sorted(ds, time_dim):
+ time_diff = np.diff(ds[time_dim].data.astype(int))
+ if np.any(time_diff == 0):
+ raise ValueError(f"In the {time_dim} dimension there are duplicated timesteps !")
+ if not np.all(time_diff > 0):
+ print(f"The {time_dim} dimension was not sorted. Sorting it now !")
+ ds = ds.sortby(time_dim)
+ return ds
+
+
+def regularize_dataset(
+ ds: xr.Dataset,
+ freq: str,
+ time_dim: str = "time",
+ method: Optional[str] = None,
+ fill_value=None,
+):
+ """Regularize a dataset across time dimension with uniform resolution.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ xarray Dataset.
+ time_dim : str, optional
+ The time dimension in the xarray.Dataset. The default is ``"time"``.
+ freq : str
+ The ``freq`` string to pass to `pd.date_range()` to define the new time coordinates.
+ Examples: ``freq="2min"``.
+ method : str, optional
+ Method to use for filling missing timesteps.
+ If ``None``, fill with ``fill_value``. The default is ``None``.
+ For other possible methods, see ``xarray.Dataset.reindex()``.
+ fill_value : (float, dict), optional
+ Fill value to fill missing timesteps.
+ If not specified, float variables are filled with ``dtypes.NA``, while
+ integer variables are filled with the maximum allowed integer value or,
+ in case of undecoded variables, with the ``_FillValue`` DataArray attribute.
+
+ Returns
+ -------
+ ds_reindexed : xarray.Dataset
+ Regularized dataset.
+
+ """
+ ds = _check_time_sorted(ds, time_dim=time_dim)
+ start_time, end_time = get_dataset_start_end_time(ds, time_dim=time_dim)
+ new_time_index = pd.date_range(
+ start=pd.to_datetime(start_time),
+ end=pd.to_datetime(end_time),
+ freq=freq,
+ )
+
+ # Define fill_value dictionary
+ if fill_value is None:
+ fill_value = _define_fill_value(ds, fill_value)
+
+ # Regularize dataset and fill with NA values
+ ds = ds.reindex(
+ {time_dim: new_time_index},
+ method=method, # do not fill gaps
+ # tolerance=tolerance, # mismatch in seconds
+ fill_value=fill_value,
+ )
+ return ds
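A sketch of regularizing a toy 1-minute dataset with one missing timestep (values are illustrative):

    import numpy as np
    import xarray as xr

    from disdrodb.utils.time import regularize_dataset

    # 1-minute data with the 00:02 timestep missing
    times = np.array(["2021-01-01T00:00", "2021-01-01T00:01", "2021-01-01T00:03"], dtype="datetime64[ns]")
    ds = xr.Dataset({"n_drops": (("time",), np.array([4.0, 2.0, 7.0]))}, coords={"time": times})

    ds_reg = regularize_dataset(ds, freq="1min")
    print(ds_reg["time"].size)       # 4 timesteps: 00:00 ... 00:03
    print(ds_reg["n_drops"].values)  # [ 4.  2. nan  7.]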
+
+
+def ensure_sorted_by_time(ds):
+ """Ensure a dataset is sorted by time."""
+ # Check sorted by time and sort if necessary
+ is_sorted = np.all(ds["time"].data[:-1] <= ds["time"].data[1:])
+ if not is_sorted:
+ ds = ds.sortby("time")
+ return ds
+
+
+####------------------------------------------
+#### Sampling interval utilities
+
+
+def ensure_sample_interval_in_seconds(sample_interval):
+ """
+ Ensure the sample interval is in seconds.
+
+ Parameters
+ ----------
+ sample_interval : int, numpy.ndarray, xarray.DataArray, or numpy.timedelta64
+ The sample interval to be converted to seconds.
+ It can be:
+ - An integer representing the interval in seconds.
+ - A numpy array or xarray DataArray of integers representing intervals in seconds.
+ - A numpy.timedelta64 object representing the interval.
+ - A numpy array or xarray DataArray of numpy.timedelta64 objects representing intervals.
+
+ Returns
+ -------
+ int, numpy.ndarray, or xarray.DataArray
+ The sample interval converted to seconds. The return type matches the input type:
+ - If the input is an integer, the output is an integer.
+ - If the input is a numpy array, the output is a numpy array of integers.
+ - If the input is an xarray DataArray, the output is an xarray DataArray of integers.
+
+ """
+ if (
+ isinstance(sample_interval, int)
+ or isinstance(sample_interval, (np.ndarray, xr.DataArray))
+ and np.issubdtype(sample_interval.dtype, int)
+ ):
+ return sample_interval
+ if isinstance(sample_interval, np.timedelta64):
+ return sample_interval / np.timedelta64(1, "s")
+ if isinstance(sample_interval, np.ndarray) and np.issubdtype(sample_interval.dtype, np.timedelta64):
+ return sample_interval.astype("timedelta64[s]").astype(int)
+ if isinstance(sample_interval, xr.DataArray) and np.issubdtype(sample_interval.dtype, np.timedelta64):
+ sample_interval = sample_interval.copy()
+ sample_interval_int = sample_interval.data.astype("timedelta64[s]").astype(int)
+ sample_interval.data = sample_interval_int
+ return sample_interval
+ raise TypeError(
+ "sample_interval must be an int, numpy.timedelta64, or numpy array of timedelta64.",
+ )
+
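A sketch of the accepted input types and the corresponding outputs:

    import numpy as np

    from disdrodb.utils.time import ensure_sample_interval_in_seconds

    print(ensure_sample_interval_in_seconds(30))                       # 30
    print(ensure_sample_interval_in_seconds(np.timedelta64(30, "s")))  # 30.0
    print(ensure_sample_interval_in_seconds(np.array([30, 60], dtype="timedelta64[s]")))  # [30 60]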
+
+def infer_sample_interval(ds, robust=False, verbose=False, logger=None):
+ """Infer the sample interval of a dataset.
+
+ NOTE: This function is not used in the DISDRODB processing chain.
+ """
+ # Check sorted by time and sort if necessary
+ ds = ensure_sorted_by_time(ds)
+
+ # Calculate number of timesteps
+ n_timesteps = len(ds["time"].data)
+
+ # Calculate time differences in seconds
+ deltadt = np.diff(ds["time"].data).astype("timedelta64[s]").astype(int)
+
+ # Round each delta to the nearest multiple of 5 (because the smallest possible sample interval is 10 s)
+ # Example: for sample_interval = 10, deltat values like 8, 9, 11, 12 become 10 ...
+ # Example: for sample_interval = 10, deltat values like 6, 7 or 13, 14 become respectively 5 and 15 ...
+ # Example: for sample_interval = 30, deltat values like 28,29,30,31,32 deltat become 30 ...
+ # Example: for sample_interval = 30, deltat values like 26, 27 or 33, 34 become respectively 25 and 35 ...
+ # --> Need other rounding after having identified the most frequent sample interval to coerce such values to 30
+ min_sample_interval = 10
+ min_half_sample_interval = min_sample_interval / 2
+ deltadt = np.round(deltadt / min_half_sample_interval) * min_half_sample_interval
+
+ # Identify unique time intervals and their occurrences
+ unique_deltas, counts = np.unique(deltadt, return_counts=True)
+
+ # Determine the most frequent time interval (mode)
+ most_frequent_delta_idx = np.argmax(counts)
+ sample_interval = unique_deltas[most_frequent_delta_idx]
+
+ # Reround deltadt once knowing the sample interval
+ # - If sample interval is 10: all values between 6 and 14 are rounded to 10, below 6 to 0, above 14 to 20
+ # - If sample interval is 30: all values between 16 and 44 are rounded to 30, below 16 to 0, above 44 to 60
+ deltadt = np.round(deltadt / sample_interval) * sample_interval
+
+ # Identify unique time intervals and their occurrences
+ unique_deltas, counts = np.unique(deltadt, return_counts=True)
+ fractions = np.round(counts / len(deltadt) * 100, 2)
+
+ # Identify the minimum delta (except 0)
+ min_delta = unique_deltas[unique_deltas != 0].min()
+
+ # Determine the most frequent time interval (mode)
+ most_frequent_delta_idx = np.argmax(counts)
+ sample_interval = unique_deltas[most_frequent_delta_idx]
+ sample_interval_fraction = fractions[most_frequent_delta_idx]
+
+ # Inform about irregular sampling
+ unexpected_intervals = unique_deltas[unique_deltas != sample_interval]
+ unexpected_intervals_counts = counts[unique_deltas != sample_interval]
+ unexpected_intervals_fractions = fractions[unique_deltas != sample_interval]
+ if verbose and len(unexpected_intervals) > 0:
+ msg = "Irregular timesteps detected."
+ log_info(logger=logger, msg=msg, verbose=verbose)
+ for interval, count, fraction in zip(
+ unexpected_intervals,
+ unexpected_intervals_counts,
+ unexpected_intervals_fractions,
+ ):
+ msg = f" Interval: {interval} seconds, Occurrence: {count}, Frequency: {fraction} %"
+ log_info(logger=logger, msg=msg, verbose=verbose)
+
+ # Perform checks
+ # - Raise error if negative or zero time intervals are presents
+ # - If robust = False, still return the estimated sample_interval
+ if robust and np.any(deltadt == 0):
+ raise ValueError("Likely presence of duplicated timesteps.")
+
+ ####-------------------------------------------------------------------------.
+ #### Informative messages
+ # - Log a warning if estimated sample interval has frequency less than 60 %
+ sample_interval_fraction_threshold = 60
+ msg = (
+ f"The most frequent sampling interval ({sample_interval} s) "
+ + f"has a frequency lower than {sample_interval_fraction_threshold}%: {sample_interval_fraction} %. "
+ + f"Total number of timesteps: {n_timesteps}."
+ )
+ if sample_interval_fraction < sample_interval_fraction_threshold:
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+
+ # - Log a warning if an unexpected interval has frequency larger than 20 percent
+ frequent_unexpected_intervals = unexpected_intervals[unexpected_intervals_fractions > 20]
+ if len(frequent_unexpected_intervals) != 0:
+ frequent_unexpected_intervals_str = ", ".join(
+ f"{interval} seconds" for interval in frequent_unexpected_intervals
+ )
+ msg = (
+ "The following unexpected intervals have a frequency "
+ + f"greater than 20%: {frequent_unexpected_intervals_str}. "
+ + f"Total number of timesteps: {n_timesteps}."
+ )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+
+ # - Raise error if the most frequent interval is not the expected one !
+ if sample_interval != min_delta:
+ raise ValueError(
+ f"The most frequent sampling interval ({sample_interval} seconds) "
+ f"is not the smallest interval ({min_delta} seconds). "
+ "Inconsistent sampling intervals in the dataset !",
+ )
+
+ return int(sample_interval)
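A sketch of the inference on a toy series with trailing-second jitter around a nominal 30 s interval:

    import numpy as np
    import xarray as xr

    from disdrodb.utils.time import infer_sample_interval

    times = np.array(
        ["2021-01-01T00:00:00", "2021-01-01T00:00:29", "2021-01-01T00:01:00", "2021-01-01T00:01:31"],
        dtype="datetime64[ns]",
    )
    ds = xr.Dataset(coords={"time": times})
    print(infer_sample_interval(ds))  # 30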
+
+
+####---------------------------------------------------------------------------------
+#### Timesteps regularization
+
+
+def get_problematic_timestep_indices(timesteps, sample_interval):
+ """Identify timesteps with missing previous or following timesteps."""
+ previous_time = timesteps - pd.Timedelta(seconds=sample_interval)
+ next_time = timesteps + pd.Timedelta(seconds=sample_interval)
+ idx_previous_missing = np.where(~np.isin(previous_time, timesteps))[0][1:]
+ idx_next_missing = np.where(~np.isin(next_time, timesteps))[0][:-1]
+ idx_isolated_missing = np.intersect1d(idx_previous_missing, idx_next_missing)
+ idx_previous_missing = idx_previous_missing[np.isin(idx_previous_missing, idx_isolated_missing, invert=True)]
+ idx_next_missing = idx_next_missing[np.isin(idx_next_missing, idx_isolated_missing, invert=True)]
+ return idx_previous_missing, idx_next_missing, idx_isolated_missing
+
+
+def regularize_timesteps(ds, sample_interval, robust=False, add_quality_flag=True, logger=None, verbose=True):
+ """Ensure timesteps match with the sample_interval."""
+ # Check sorted by time and sort if necessary
+ ds = ensure_sorted_by_time(ds)
+
+ # Convert time to pandas.DatetimeIndex for easier manipulation
+ times = pd.to_datetime(ds["time"].values)
+
+ # Determine the start and end times
+ start_time = times[0].floor(f"{sample_interval}s")
+ end_time = times[-1].ceil(f"{sample_interval}s")
+
+ # Create the expected time grid
+ expected_times = pd.date_range(start=start_time, end=end_time, freq=f"{sample_interval}s")
+
+ # Convert to numpy arrays
+ times = times.to_numpy(dtype="M8[s]")
+ expected_times = expected_times.to_numpy(dtype="M8[s]")
+
+ # Map original times to the nearest expected times
+ # Calculate the difference between original times and expected times
+ time_deltas = np.abs(times - expected_times[:, None]).astype(int)
+
+ # Find the index of the closest expected time for each original time
+ nearest_indices = np.argmin(time_deltas, axis=0)
+ adjusted_times = expected_times[nearest_indices]
+
+ # Check for duplicates in adjusted times
+ unique_times, counts = np.unique(adjusted_times, return_counts=True)
+ duplicates = unique_times[counts > 1]
+
+ # Initialize time quality flag
+ # - 0 when ok or just rounded to closest 00
+ # - 1 if previous timestep is missing
+ # - 2 if next timestep is missing
+ # - 3 if previous and next timestep is missing
+ # - 4 if solved duplicated timesteps
+ # - 5 if needed to drop duplicated timesteps and select the last
+ flag_previous_missing = 1
+ flag_next_missing = 2
+ flag_isolated_timestep = 3
+ flag_solved_duplicated_timestep = 4
+ flag_dropped_duplicated_timestep = 5
+ qc_flag = np.zeros(adjusted_times.shape)
+
+ # Initialize the list of duplicated timestep indices to drop
+ # - We drop the first occurrence because it is likely the shortest interval
+ idx_to_drop = []
+
+ # Attempt to resolve for duplicates
+ if duplicates.size > 0:
+ # Handle duplicates
+ for dup_time in duplicates:
+ # Indices of duplicates
+ dup_indices = np.where(adjusted_times == dup_time)[0]
+ n_duplicates = len(dup_indices)
+ # Define previous and following timestep
+ prev_time = dup_time - pd.Timedelta(seconds=sample_interval)
+ next_time = dup_time + pd.Timedelta(seconds=sample_interval)
+ # Try to find missing slots before and after
+ # - If more than 3 duplicates, impossible to solve !
+ count_solved = 0
+ # If the previous timestep slot is free, move the first duplicate there
+ if n_duplicates == 2:
+ if prev_time not in adjusted_times:
+ adjusted_times[dup_indices[0]] = prev_time
+ qc_flag[dup_indices[0]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ elif next_time not in adjusted_times:
+ adjusted_times[dup_indices[-1]] = next_time
+ qc_flag[dup_indices[-1]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ else:
+ pass
+ elif n_duplicates == 3:
+ if prev_time not in adjusted_times:
+ adjusted_times[dup_indices[0]] = prev_time
+ qc_flag[dup_indices[0]] = flag_dropped_duplicated_timestep
+ count_solved += 1
+ if next_time not in adjusted_times:
+ adjusted_times[dup_indices[-1]] = next_time
+ qc_flag[dup_indices[-1]] = flag_solved_duplicated_timestep
+ count_solved += 1
+ if count_solved != n_duplicates - 1:
+ idx_to_drop = np.append(idx_to_drop, dup_indices[0:-1])
+ qc_flag[dup_indices[-1]] = flag_dropped_duplicated_timestep
+ msg = (
+ f"Cannot resolve {n_duplicates} duplicated timesteps "
+ f"(after trailing seconds correction) around {dup_time}."
+ )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+ if robust:
+ raise ValueError(msg)
+
+ # Update the time coordinate (Convert to ns for xarray compatibility)
+ ds = ds.assign_coords({"time": adjusted_times.astype("datetime64[ns]")})
+
+ # Update quality flag values for next and previous timestep is missing
+ if add_quality_flag:
+ idx_previous_missing, idx_next_missing, idx_isolated_missing = get_problematic_timestep_indices(
+ adjusted_times,
+ sample_interval,
+ )
+ qc_flag[idx_previous_missing] = np.maximum(qc_flag[idx_previous_missing], flag_previous_missing)
+ qc_flag[idx_next_missing] = np.maximum(qc_flag[idx_next_missing], flag_next_missing)
+ qc_flag[idx_isolated_missing] = np.maximum(qc_flag[idx_isolated_missing], flag_isolated_timestep)
+
+ # If the first timestep is at 00:00 and currently flagged as previous missing (1), reset to 0
+ # first_time = pd.to_datetime(adjusted_times[0]).time()
+ # first_expected_time = pd.Timestamp("00:00:00").time()
+ # if first_time == first_expected_time and qc_flag[0] == flag_previous_missing:
+ # qc_flag[0] = 0
+
+ # # If the last timestep is flagged and currently flagged as next missing (2), reset it to 0
+ # last_time = pd.to_datetime(adjusted_times[-1]).time()
+ # last_time_expected = (pd.Timestamp("00:00:00") - pd.Timedelta(30, unit="seconds")).time()
+ # # Check if adding one interval would go beyond the end_time
+ # if last_time == last_time_expected and qc_flag[-1] == flag_next_missing:
+ # qc_flag[-1] = 0
+
+ # Assign time quality flag coordinate
+ ds["time_qc"] = xr.DataArray(qc_flag, dims="time")
+ ds = ds.set_coords("time_qc")
+
+ # Drop duplicated timesteps
+ if len(idx_to_drop) > 0:
+ idx_to_drop = idx_to_drop.astype(int)
+ idx_valid_timesteps = np.arange(0, ds["time"].size)
+ idx_valid_timesteps = np.delete(idx_valid_timesteps, idx_to_drop)
+ ds = ds.isel(time=idx_valid_timesteps)
+ # Return dataset
+ return ds
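A sketch of the timestep correction on a toy 30 s series: the 00:00:31 timestep is snapped to 00:00:30 and the quality flag marks the missing neighbours (all values here are illustrative):

    import numpy as np
    import xarray as xr

    from disdrodb.utils.time import regularize_timesteps

    # 30 s data with a trailing-second error (00:00:31) and a missing 00:01:00 timestep
    times = np.array(
        ["2021-01-01T00:00:00", "2021-01-01T00:00:31", "2021-01-01T00:01:30"],
        dtype="datetime64[ns]",
    )
    ds = xr.Dataset({"n_drops": (("time",), np.array([4.0, 2.0, 7.0]))}, coords={"time": times})

    ds_reg = regularize_timesteps(ds, sample_interval=30)
    print(ds_reg["time"].values)     # 00:00:00, 00:00:30, 00:01:30
    print(ds_reg["time_qc"].values)  # [0. 2. 1.] -> next neighbour missing / previous neighbour missing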
diff --git a/disdrodb/utils/warnings.py b/disdrodb/utils/warnings.py
new file mode 100644
index 00000000..e9e1546f
--- /dev/null
+++ b/disdrodb/utils/warnings.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""Warning utilities."""
+import warnings
+from contextlib import contextmanager
+
+
+@contextmanager
+def suppress_warnings():
+ """Context manager suppressing RuntimeWarnings and UserWarnings."""
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore", RuntimeWarning)
+ warnings.simplefilter("ignore", UserWarning)
+ yield
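+
+
+# Usage sketch (illustrative), e.g. to silence the expected RuntimeWarning
+# raised by an all-NaN reduction:
+#
+#     with suppress_warnings():
+#         mean_value = np.nanmean(values)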
diff --git a/disdrodb/utils/writer.py b/disdrodb/utils/writer.py
new file mode 100644
index 00000000..81f3e839
--- /dev/null
+++ b/disdrodb/utils/writer.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+# -----------------------------------------------------------------------------.
+"""DISDRODB product writers."""
+
+import os
+
+import xarray as xr
+
+from disdrodb.utils.attrs import set_disdrodb_attrs
+from disdrodb.utils.directories import create_directory, remove_if_exists
+
+
+def write_product(ds: xr.Dataset, filepath: str, product: str, force: bool = False) -> None:
+ """Save the xarray dataset into a NetCDF file.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ Input xarray dataset.
+ filepath : str
+ Output file path.
+ product : str
+ DISDRODB product name.
+ force : bool, optional
+ Whether to overwrite existing data.
+ If ``True``, overwrite the file if it already exists at the destination path.
+ If ``False``, raise an error if the file already exists. This is the default.
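+
+ Examples
+ --------
+ A minimal usage sketch (the output path below is purely illustrative):
+
+ >>> write_product(ds, filepath="/path/to/DISDRODB/product.nc", product="L0C", force=True)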
+ """
+ # Create the station directory if it does not exist
+ create_directory(os.path.dirname(filepath))
+
+ # Check if the file already exists
+ # - If force=True --> Remove it
+ # - If force=False --> Raise error
+ remove_if_exists(filepath, force=force)
+
+ # Update attributes
+ ds = set_disdrodb_attrs(ds, product=product)
+
+ # Write netcdf
+ ds.to_netcdf(filepath, engine="netcdf4")
diff --git a/docs/source/l0_processing.rst b/docs/source/l0_processing.rst
index f8604ad5..91df610a 100644
--- a/docs/source/l0_processing.rst
+++ b/docs/source/l0_processing.rst
@@ -55,7 +55,7 @@ Example :
# L0 processing settings
l0a_processing = True
l0b_processing = True
- l0b_concat = True
+ l0c_processing = True
remove_l0a = False
remove_l0b = False
@@ -74,7 +74,7 @@ Example :
# L0 processing settings
l0a_processing=l0a_processing,
l0b_processing=l0b_processing,
- l0b_concat=l0b_concat,
+ l0c_processing=l0c_processing,
remove_l0a=remove_l0a,
remove_l0b=remove_l0b,
# L0 processing options
@@ -151,7 +151,7 @@ Example :
# L0 processing settings
l0a_processing = True
l0b_processing = True
- l0b_concat = False
+ l0c_processing = True
remove_l0a = False
remove_l0b = False
# L0 processing options
@@ -168,7 +168,7 @@ Example :
# L0 processing settings
l0a_processing=l0a_processing,
l0b_processing=l0b_processing,
- l0b_concat=l0b_concat,
+ l0c_processing=l0c_processing,
remove_l0a=remove_l0a,
remove_l0b=remove_l0b,
# L0 processing options
diff --git a/docs/source/metadata_csv/Sensor_Info.csv b/docs/source/metadata_csv/Sensor_Info.csv
index 5a08ba5f..bf59ae17 100644
--- a/docs/source/metadata_csv/Sensor_Info.csv
+++ b/docs/source/metadata_csv/Sensor_Info.csv
@@ -9,7 +9,7 @@ firmware_version,Firmware version
sensor_beam_length,Length of the laser beam's measurement area in mm
sensor_beam_width,Width of the laser beam's measurement area in mm
sensor_nominal_width,Expected width of the sensor beam under typical operating conditions
-measurement_interval,Number of seconds over which measurements are taken
+measurement_interval,Number of seconds over which measurements are taken.
calibration_sensitivity,Sensor sensitivity
calibration_certification_date,Sensor calibration date(s)
calibration_certification_url,Sensor calibration certification url
diff --git a/docs/source/software_structure.rst b/docs/source/software_structure.rst
index 0a90a87d..c3afd058 100644
--- a/docs/source/software_structure.rst
+++ b/docs/source/software_structure.rst
@@ -15,7 +15,6 @@ The current software structure is described below:
| ├── 📜 io.py
| ├── 📜 path.py
| ├── 📁 metadata
-| ├── 📁 scripts
| ├── 📜 disdrodb_check_metadata_archive.py
| ├── 📜 checks.py
| ├── 📜 info.py
@@ -53,8 +52,6 @@ The current software structure is described below:
| ├── 📜 disdrodb_run_l0a_station.py
| ├── 📜 disdrodb_run_l0b.py
| ├── 📜 disdrodb_run_l0b_station.py
-| ├── 📜 disdrodb_run_l0b_concat.py
-| ├── 📜 disdrodb_run_l0b_concat_station.py
| ├── 📜 check_configs.py
| ├── 📜 check_standards.py
| ├── 📜 io.py
diff --git a/docs/source/tutorials/.gitkeep b/docs/source/tutorials/.gitkeep
index 139597f9..e69de29b 100644
--- a/docs/source/tutorials/.gitkeep
+++ b/docs/source/tutorials/.gitkeep
@@ -1,2 +0,0 @@
-
-
diff --git a/pyproject.toml b/pyproject.toml
index 8678144a..02a6604d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,26 +70,36 @@ include = ["disdrodb*"]
[project.scripts]
# Initialization
-disdrodb_initialize_station="disdrodb.api.scripts.disdrodb_initialize_station:disdrodb_initialize_station"
+disdrodb_initialize_station="disdrodb.cli.disdrodb_initialize_station:disdrodb_initialize_station"
# Metadata archive
-disdrodb_check_metadata_archive="disdrodb.metadata.scripts.disdrodb_check_metadata_archive:disdrodb_check_metadata_archive"
+disdrodb_check_metadata_archive="disdrodb.cli.disdrodb_check_metadata_archive:disdrodb_check_metadata_archive"
# Data transfer
-disdrodb_download_archive="disdrodb.data_transfer.scripts.disdrodb_download_archive:disdrodb_download_archive"
-disdrodb_download_station="disdrodb.data_transfer.scripts.disdrodb_download_station:disdrodb_download_station"
-disdrodb_upload_archive="disdrodb.data_transfer.scripts.disdrodb_upload_archive:disdrodb_upload_archive"
-disdrodb_upload_station="disdrodb.data_transfer.scripts.disdrodb_upload_station:disdrodb_upload_station"
+disdrodb_download_archive="disdrodb.cli.disdrodb_download_archive:disdrodb_download_archive"
+disdrodb_download_station="disdrodb.cli.disdrodb_download_station:disdrodb_download_station"
+disdrodb_upload_archive="disdrodb.cli.disdrodb_upload_archive:disdrodb_upload_archive"
+disdrodb_upload_station="disdrodb.cli.disdrodb_upload_station:disdrodb_upload_station"
# L0A
-disdrodb_run_l0a_station="disdrodb.l0.scripts.disdrodb_run_l0a_station:disdrodb_run_l0a_station"
-disdrodb_run_l0a="disdrodb.l0.scripts.disdrodb_run_l0a:disdrodb_run_l0a"
+disdrodb_run_l0a_station="disdrodb.cli.disdrodb_run_l0a_station:disdrodb_run_l0a_station"
+disdrodb_run_l0a="disdrodb.cli.disdrodb_run_l0a:disdrodb_run_l0a"
# L0B
-disdrodb_run_l0b_station="disdrodb.l0.scripts.disdrodb_run_l0b_station:disdrodb_run_l0b_station"
-disdrodb_run_l0_station="disdrodb.l0.scripts.disdrodb_run_l0_station:disdrodb_run_l0_station"
-# L0B concatenation
-disdrodb_run_l0b_concat_station="disdrodb.l0.scripts.disdrodb_run_l0b_concat_station:disdrodb_run_l0b_concat_station"
-disdrodb_run_l0b_concat="disdrodb.l0.scripts.disdrodb_run_l0b_concat:disdrodb_run_l0b_concat"
+disdrodb_run_l0b_station="disdrodb.cli.disdrodb_run_l0b_station:disdrodb_run_l0b_station"
+disdrodb_run_l0b="disdrodb.cli.disdrodb_run_l0b:disdrodb_run_l0b"
+# L0C
+disdrodb_run_l0c_station="disdrodb.cli.disdrodb_run_l0c_station:disdrodb_run_l0c_station"
+disdrodb_run_l0c="disdrodb.cli.disdrodb_run_l0c:disdrodb_run_l0c"
# L0
-disdrodb_run_l0b="disdrodb.l0.scripts.disdrodb_run_l0b:disdrodb_run_l0b"
-disdrodb_run_l0="disdrodb.l0.scripts.disdrodb_run_l0:disdrodb_run_l0"
+disdrodb_run_l0_station="disdrodb.cli.disdrodb_run_l0_station:disdrodb_run_l0_station"
+disdrodb_run_l0="disdrodb.cli.disdrodb_run_l0:disdrodb_run_l0"
+# L1
+disdrodb_run_l1_station="disdrodb.cli.disdrodb_run_l1_station:disdrodb_run_l1_station"
+disdrodb_run_l1="disdrodb.cli.disdrodb_run_l1_station:disdrodb_run_l1_station"
+# L2E
+disdrodb_run_l2e_station="disdrodb.cli.disdrodb_run_l2e_station:disdrodb_run_l2e_station"
+disdrodb_run_l2e="disdrodb.cli.disdrodb_run_l2e_station:disdrodb_run_l2e_station"
+# L2M
+disdrodb_run_l2m_station="disdrodb.cli.disdrodb_run_l2m_station:disdrodb_run_l2m_station"
+disdrodb_run_l2m="disdrodb.cli.disdrodb_run_l2m_station:disdrodb_run_l2m_station"
+
[tool.pytest.ini_options]
diff --git a/tutorials/reader_preparation.ipynb b/tutorials/reader_preparation.ipynb
index b47cf41d..15764625 100644
--- a/tutorials/reader_preparation.ipynb
+++ b/tutorials/reader_preparation.ipynb
@@ -123,6 +123,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "from IPython.display import display\n",
"\n",
"from disdrodb.api.checks import check_sensor_name\n",
"\n",
@@ -262,7 +263,8 @@
"source": [
"**3. Initialization**\n",
"\n",
- "We initiate some checks, and get some variable. *Nothing must be changed here.*"
+ "We initiate some checks, and get some variable. *Nothing must be changed here.*\n",
+ "The `data_dir` is the directory path where the processed data will be stored."
]
},
{
@@ -273,7 +275,7 @@
"outputs": [],
"source": [
"# Create directory structure\n",
- "create_l0_directory_structure(\n",
+ "data_dir = create_l0_directory_structure(\n",
" raw_dir=raw_dir,\n",
" processed_dir=processed_dir,\n",
" station_name=station_name,\n",
@@ -819,7 +821,7 @@
"df_raw = read_raw_file(filepath, column_names=None, reader_kwargs=reader_kwargs)\n",
"# Print the dataframe\n",
"print(f\"Dataframe for the file {os.path.basename(filepath)} :\")\n",
- "display(df_raw) # noqa F821"
+ "display(df_raw)"
]
},
{
@@ -2432,7 +2434,7 @@
" verbose=verbose,\n",
" df_sanitizer_fun=df_sanitizer_fun,\n",
")\n",
- "display(df) # noqa F821"
+ "display(df)"
]
},
{
@@ -2529,7 +2531,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# ds = set_encodings(ds, sensor_name)\n",
+ "# ds = set_l0b_encodings(ds, sensor_name)\n",
"# ds.to_netcdf(\"/path/where/to/save/the/file.nc\")"
]
},