diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 23af6525..169b2906 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ --- repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -12,12 +12,12 @@ repos: - id: check-ast - id: check-added-large-files - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.5 + rev: v0.7.1 hooks: - id: ruff args: [--fix] - repo: https://github.com/psf/black - rev: 24.3.0 + rev: 24.10.0 hooks: - id: black language_version: python3 @@ -27,18 +27,18 @@ repos: - id: blackdoc additional_dependencies: ["black[jupyter]"] - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v3.1.0" + rev: "v4.0.0-alpha.8" hooks: - id: prettier types_or: [yaml, html, css, scss, javascript, json] # markdown to avoid conflicts with mdformat - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell types_or: [python, markdown, rst] additional_dependencies: [tomli] - repo: https://github.com/asottile/pyupgrade - rev: v3.15.2 + rev: v3.19.0 hooks: - id: pyupgrade - repo: https://github.com/MarcoGorelli/madforhooks @@ -47,7 +47,7 @@ repos: # - id: conda-env-sorter # conflicts with prettier - id: check-execution-order - repo: https://github.com/executablebooks/mdformat - rev: 0.7.17 + rev: 0.7.18 hooks: - id: mdformat additional_dependencies: [mdformat-gfm, mdformat-black] @@ -58,7 +58,7 @@ repos: - id: nbstripout args: [--keep-output] - repo: https://github.com/nbQA-dev/nbQA - rev: 1.8.5 + rev: 1.8.7 hooks: - id: nbqa-black - id: nbqa-ruff diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 259566b2..573a8044 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -5,7 +5,7 @@ We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender -identity and expression, level of experience, education, socio-economic status, +identity and expression, level of experience, education, socioeconomic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. 
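Note on the hook revision bumps above: after updating the pinned revs, the full hook suite can be re-run locally to confirm the new versions still pass on the repository. A minimal sketch, assuming the pre-commit CLI is installed in the environment (it is not invoked anywhere in this diff):

# Minimal sketch, assuming the `pre-commit` CLI is available on PATH.
import subprocess

# Re-run every configured hook (ruff, black, codespell, ...) against the whole
# repository to verify the bumped revisions behave as expected.
subprocess.run(["pre-commit", "run", "--all-files"], check=True)
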
diff --git a/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_1.yml b/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_1.yml index 58d5240e..18bf3020 100644 --- a/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_1.yml +++ b/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_1.yml @@ -36,7 +36,7 @@ firmware_version: "" sensor_beam_length: "" sensor_beam_width: "" sensor_nominal_width: "" -measurement_interval: "" +measurement_interval: 30 calibration_sensitivity: "" calibration_certification_date: "" calibration_certification_url: "" diff --git a/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_2.yml b/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_2.yml index 12e2198d..424ab818 100644 --- a/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_2.yml +++ b/data/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/station_name_2.yml @@ -36,7 +36,7 @@ firmware_version: "" sensor_beam_length: "" sensor_beam_width: "" sensor_nominal_width: "" -measurement_interval: "" +measurement_interval: 30 calibration_sensitivity: "" calibration_certification_date: "" calibration_certification_url: "" diff --git a/disdrodb/__init__.py b/disdrodb/__init__.py index 79129c0a..4c5153e4 100644 --- a/disdrodb/__init__.py +++ b/disdrodb/__init__.py @@ -1,4 +1,23 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""DISDRODB software.""" + import contextlib +import importlib import os from importlib.metadata import PackageNotFoundError, version @@ -18,6 +37,11 @@ check_archive_metadata_geolocation, ) +PRODUCT_VERSION = "V0" +SOFTWARE_VERSION = "V" + importlib.metadata.version("disdrodb") +CONVENTIONS = "CF-1.10, ACDD-1.3" + + __all__ = [ "define_configs", "available_stations", diff --git a/disdrodb/api/checks.py b/disdrodb/api/checks.py index de5b5296..1c5b14ea 100644 --- a/disdrodb/api/checks.py +++ b/disdrodb/api/checks.py @@ -24,11 +24,11 @@ from disdrodb.api.info import infer_disdrodb_tree_path_components from disdrodb.api.path import ( + define_data_dir, define_issue_dir, define_issue_filepath, define_metadata_dir, define_metadata_filepath, - define_station_dir, ) from disdrodb.utils.directories import ( ensure_string_path, @@ -70,10 +70,7 @@ def check_url(url: str) -> bool: ``True`` if url well formatted, ``False`` if not well formatted. 
""" regex = r"^(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)$" # noqa: E501 - - if re.match(regex, url): - return True - return False + return re.match(regex, url) def check_path_is_a_directory(dir_path, path_name=""): @@ -95,6 +92,7 @@ def check_directories_inside(dir_path): def check_base_dir(base_dir: str): """Raise an error if the path does not end with ``DISDRODB``.""" base_dir = str(base_dir) # convert Pathlib to string + base_dir = os.path.normpath(base_dir) if not base_dir.endswith("DISDRODB"): raise ValueError(f"The path {base_dir} does not end with DISDRODB. Please check the path.") return base_dir @@ -150,7 +148,7 @@ def check_product(product): """Check DISDRODB product.""" if not isinstance(product, str): raise TypeError("`product` must be a string.") - valid_products = ["RAW", "L0A", "L0B"] + valid_products = ["RAW", "L0A", "L0B", "L0C", "L1", "L2E", "L2M", "L2S"] if product.upper() not in valid_products: msg = f"Valid `products` are {valid_products}." logger.error(msg) @@ -158,45 +156,68 @@ def check_product(product): return product -def check_station_dir(product, data_source, campaign_name, station_name, base_dir=None): - """Check existence of the station data directory. If does not exists, raise an error.""" - station_dir = define_station_dir( +def has_available_data( + data_source, + campaign_name, + station_name, + product, + base_dir=None, + # Option for L2E + sample_interval=None, + rolling=None, + # Option for L2M + model_name=None, +): + """Return ``True`` if data are available for the given product and station.""" + # Define product directory + data_dir = define_data_dir( product=product, base_dir=base_dir, data_source=data_source, campaign_name=campaign_name, station_name=station_name, + # Option for L2E + sample_interval=sample_interval, + rolling=rolling, + # Option for L2M + model_name=model_name, + # Directory options check_exists=False, ) - if not os.path.exists(station_dir) and os.path.isdir(station_dir): - msg = f"The station {station_name} data directory does not exist at {station_dir}." - logger.error(msg) - raise ValueError(msg) - return station_dir - + # If the product directory does not exists, return False + if not os.path.isdir(data_dir): + return False -def has_available_station_files(product, data_source, campaign_name, station_name, base_dir=None): - """Return ``True`` if data are available for the given product and station.""" - station_dir = check_station_dir( - product=product, - base_dir=base_dir, - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name, - ) - filepaths = list_files(station_dir, glob_pattern="*", recursive=True) + # If no files, return False + filepaths = list_files(data_dir, glob_pattern="*", recursive=True) nfiles = len(filepaths) return nfiles >= 1 -def check_station_has_data(product, data_source, campaign_name, station_name, base_dir=None): - """Check the station data directory has data inside. If not, raise an error.""" - if not has_available_station_files( +def check_data_availability( + product, + data_source, + campaign_name, + station_name, + base_dir=None, + # Option for L2E + sample_interval=None, + rolling=None, + # Option for L2M + model_name=None, +): + """Check the station product data directory has files inside. 
If not, raise an error.""" + if not has_available_data( product=product, base_dir=base_dir, data_source=data_source, campaign_name=campaign_name, station_name=station_name, + # Option for L2E + sample_interval=sample_interval, + rolling=rolling, + # Option for L2M + model_name=model_name, ): msg = f"The {product} station data directory of {data_source} {campaign_name} {station_name} is empty !" logger.error(msg) @@ -271,6 +292,7 @@ def check_issue_dir(data_source, campaign_name, base_dir=None): def check_issue_file(data_source, campaign_name, station_name, base_dir=None): """Check existence of a valid issue YAML file. If does not exists, raise an error.""" from disdrodb.issue.checks import check_issue_compliance + from disdrodb.issue.writer import create_station_issue _ = check_issue_dir( base_dir=base_dir, @@ -286,9 +308,12 @@ def check_issue_file(data_source, campaign_name, station_name, base_dir=None): ) # Check existence if not os.path.exists(issue_filepath): - msg = f"The issue YAML file of {data_source} {campaign_name} {station_name} does not exist at {issue_filepath}." - logger.error(msg) - raise ValueError(msg) + create_station_issue( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) # Check validity check_issue_compliance( @@ -398,7 +423,7 @@ def check_raw_dir(raw_dir: str, station_name: str) -> None: check_directories_inside(raw_dir) # Check there is data in the station directory - check_station_has_data( + check_data_availability( product="RAW", base_dir=base_dir, data_source=data_source, diff --git a/disdrodb/api/create_directories.py b/disdrodb/api/create_directories.py index cf31f1d9..af91a95b 100644 --- a/disdrodb/api/create_directories.py +++ b/disdrodb/api/create_directories.py @@ -19,7 +19,7 @@ """Tools to create Raw, L0A and L0B DISDRODB directories.""" # L0A and L0B from raw NC: create_l0_directory_structure(raw_dir, processed_dir) -# L0B: create_directory_structure(processed_dir) +# L0B: create_product_directory(processed_dir) import logging import os @@ -27,12 +27,12 @@ from typing import Optional from disdrodb.api.checks import ( + check_data_availability, check_metadata_file, check_processed_dir, check_product, check_raw_dir, - check_station_has_data, - has_available_station_files, + has_available_data, ) from disdrodb.api.info import ( infer_campaign_name_from_path, @@ -41,16 +41,18 @@ ) from disdrodb.api.path import ( define_campaign_dir, + define_data_dir, define_issue_dir, define_issue_filepath, + define_logs_dir, define_metadata_dir, define_metadata_filepath, define_station_dir, ) from disdrodb.configs import get_base_dir from disdrodb.utils.directories import ( - check_directory_exists, copy_file, + create_directory, create_required_directory, remove_if_exists, ) @@ -162,52 +164,17 @@ def _copy_station_metadata( ) -def _check_pre_existing_station_data( - data_source: str, - campaign_name: str, - station_name: str, - product: str, - base_dir=None, - force=False, -): - """Check for pre-existing station data. - - - If ``force=True``, remove all data inside the station directory. - - If ``force=False``, raise error. - """ - # NOTE: ``force=False`` behaviour could be changed to enable updating of missing files. - # This would require also adding code to check whether a downstream file already exist. 
- - # Check if there are available data - available_data = has_available_station_files( - product=product, - base_dir=base_dir, - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name, - ) - # Define the station directory path - station_dir = define_station_dir( - product=product, - base_dir=base_dir, - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name, - ) - # If the station data are already present: - # - If force=True, remove all data inside the station directory - # - If force=False, raise error - if available_data: - # Check is a directory - check_directory_exists(station_dir) - # If force=True, remove all the content - if force: - # Remove all station directory content - shutil.rmtree(station_dir) - else: - msg = f"The station directory {station_dir} already exists and force=False." - logger.error(msg) - raise ValueError(msg) +def ensure_empty_data_dir(data_dir, force): + """Remove the content of the data_dir directory.""" + # If force=True, remove all the directory content + if force: + shutil.rmtree(data_dir) + # Recreate the directory + create_directory(data_dir) + else: + msg = f"The product directory {data_dir} already contains files and force=False." + logger.error(msg) + raise ValueError(msg) def create_l0_directory_structure( @@ -236,8 +203,8 @@ def create_l0_directory_structure( # Retrieve components base_dir, product_type, data_source, campaign_name = infer_disdrodb_tree_path_components(processed_dir) - # Check station data are available - check_station_has_data( + # Check RAW station data are available + check_data_availability( product="RAW", base_dir=base_dir, data_source=data_source, @@ -248,7 +215,18 @@ def create_l0_directory_structure( # Create required directories (if they don't exist) create_required_directory(processed_dir, dir_name="metadata") create_required_directory(processed_dir, dir_name="info") - create_required_directory(processed_dir, dir_name=product) + + # Define and create product directory + data_dir = define_data_dir( + product=product, + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + + # Create required directory (if it doesn't exist) + create_directory(data_dir) # Copy the station metadata _copy_station_metadata( @@ -257,40 +235,70 @@ def create_l0_directory_structure( campaign_name=campaign_name, station_name=station_name, ) - # Remove / directory if force=True - _check_pre_existing_station_data( + + # Check if product files are already available + available_data = has_available_data( product=product, base_dir=base_dir, data_source=data_source, campaign_name=campaign_name, station_name=station_name, - force=force, ) - # Create the / directory - create_required_directory(os.path.join(processed_dir, product), dir_name=station_name) + + # If product files are already available: + # - If force=True, remove all data inside the product directory + # - If force=False, raise an error + if available_data: + ensure_empty_data_dir(data_dir, force=force) + + return data_dir -def create_directory_structure(processed_dir, product, station_name, force): - """Create directory structure for L0B and higher DISDRODB products.""" +def create_product_directory( + data_source, + campaign_name, + station_name, + product, + force, + base_dir=None, + # Option for L2E + sample_interval=None, + rolling=None, + # Option for L2M + model_name=None, +): + """Initialize the directory structure for a DISDRODB product. 
+ + If product files already exists: + - If ``force=True``, it remove all existing data inside the product directory. + - If ``force=False``, it raise an error. + """ + # NOTE: ``force=False`` behaviour could be changed to enable updating of missing files. + # This would require also adding code to check whether a downstream file already exist. + + from disdrodb.api.io import get_required_product + + # Get DISDRODB base directory + base_dir = get_base_dir(base_dir) + # Check inputs check_product(product) - processed_dir = check_processed_dir(processed_dir=processed_dir) - - base_dir, product_type, data_source, campaign_name = infer_disdrodb_tree_path_components(processed_dir) # Determine required product - if product == "L0B": - required_product = "L0A" - else: - raise NotImplementedError("product {product} not yet implemented.") + required_product = get_required_product(product) - # Check station is available in the previous product level - check_station_has_data( + # Check station data is available in the previous product level + check_data_availability( product=required_product, base_dir=base_dir, data_source=data_source, campaign_name=campaign_name, station_name=station_name, + # Option for L2E + sample_interval=sample_interval, + rolling=rolling, + # Option for L2M + model_name=model_name, ) # Check metadata file is available @@ -302,19 +310,84 @@ def create_directory_structure(processed_dir, product, station_name, force): station_name=station_name, ) + # Define product output directory + data_dir = define_data_dir( + product=product, + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Option for L2E + sample_interval=sample_interval, + rolling=rolling, + # Option for L2M + model_name=model_name, + ) + # Create required directory (if it doesn't exist) - create_required_directory(processed_dir, dir_name=product) + create_directory(data_dir) - # Remove / directory if force=True - _check_pre_existing_station_data( + # Check if product files are already available + available_data = has_available_data( product=product, base_dir=base_dir, data_source=data_source, campaign_name=campaign_name, station_name=station_name, - force=force, + # Option for L2E + sample_interval=sample_interval, + rolling=rolling, + # Option for L2M + model_name=model_name, ) + # If product files are already available: + # - If force=True, remove all data inside the product directory + # - If force=False, raise an error + if available_data: + ensure_empty_data_dir(data_dir, force=force) + + # Return product directory + return data_dir + + +def create_logs_directory( + product, + data_source, + campaign_name, + station_name, + base_dir=None, + # Option for L2E + sample_interval=None, + rolling=None, + # Option for L2M + model_name=None, +): + """Initialize the logs directory structure for a DISDRODB product.""" + # Define logs directory + logs_dir = define_logs_dir( + product=product, + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Option for L2E + sample_interval=sample_interval, + rolling=rolling, + # Option for L2M + model_name=model_name, + ) + + # Ensure empty log directory + if os.path.isdir(logs_dir): + shutil.rmtree(logs_dir) + + # Create logs directory + os.makedirs(logs_dir, exist_ok=True) + + # Return logs directory + return logs_dir + #### DISDRODB Station Initialization diff --git a/disdrodb/api/info.py b/disdrodb/api/info.py index 9b015811..62763538 100644 --- a/disdrodb/api/info.py 
+++ b/disdrodb/api/info.py @@ -19,19 +19,31 @@ """Retrieve file information from DISDRODB products file names and filepaths.""" import os +from collections import defaultdict from pathlib import Path import numpy as np from trollsift import Parser +from disdrodb.utils.time import acronym_to_seconds + ####--------------------------------------------------------------------------- ######################## #### FNAME PATTERNS #### ######################## -DISDRODB_FNAME_PATTERN = ( +DISDRODB_FNAME_L0_PATTERN = ( "{product:s}.{campaign_name:s}.{station_name:s}.s{start_time:%Y%m%d%H%M%S}.e{end_time:%Y%m%d%H%M%S}" ".{version:s}.{data_format:s}" ) +DISDRODB_FNAME_L2E_PATTERN = ( # also L0C and L1 --> accumulation_acronym = sample_interval + "{product:s}.{accumulation_acronym}.{campaign_name:s}.{station_name:s}.s{start_time:%Y%m%d%H%M%S}.e{end_time:%Y%m%d%H%M%S}" + ".{version:s}.{data_format:s}" +) + +DISDRODB_FNAME_L2M_PATTERN = ( + "{product:s}_{subproduct:s}.{accumulation_acronym}.{campaign_name:s}.{station_name:s}.s{start_time:%Y%m%d%H%M%S}.e{end_time:%Y%m%d%H%M%S}" + ".{version:s}.{data_format:s}" +) ####---------------------------------------------------------------------------. ########################## @@ -41,9 +53,17 @@ def _parse_filename(filename): """Parse the filename with trollsift.""" - # Retrieve information from filename - p = Parser(DISDRODB_FNAME_PATTERN) - info_dict = p.parse(filename) + if filename.startswith("L0A") or filename.startswith("L0B"): + p = Parser(DISDRODB_FNAME_L0_PATTERN) + info_dict = p.parse(filename) + elif filename.startswith("L2E") or filename.startswith("L1") or filename.startswith("L0C"): + p = Parser(DISDRODB_FNAME_L2E_PATTERN) + info_dict = p.parse(filename) + elif filename.startswith("L2M"): + p = Parser(DISDRODB_FNAME_L2M_PATTERN) + info_dict = p.parse(filename) + else: + raise ValueError("Not a DISDRODB product file.") return info_dict @@ -54,6 +74,11 @@ def _get_info_from_filename(filename): info_dict = _parse_filename(filename) except ValueError: raise ValueError(f"{filename} can not be parsed. Report the issue.") + + # Add additional information to info dictionary + if "accumulation_acronym" in info_dict: + info_dict["sample_interval"] = acronym_to_seconds(info_dict["accumulation_acronym"]) + # Return info dictionary return info_dict @@ -132,7 +157,14 @@ def get_start_end_time_from_filepaths(filepaths): """Return the start and end time of the specified files.""" list_start_time = get_key_from_filepaths(filepaths, key="start_time") list_end_time = get_key_from_filepaths(filepaths, key="end_time") - return np.array(list_start_time), np.array(list_end_time) + return np.array(list_start_time).astype("M8[s]"), np.array(list_end_time).astype("M8[s]") + + +def get_sample_interval_from_filepaths(filepaths): + """Return the sample interval of the specified files.""" + list_accumulation_acronym = get_key_from_filepaths(filepaths, key="accumulation_acronym") + list_sample_interval = [acronym_to_seconds(s) for s in list_accumulation_acronym] + return list_sample_interval ####--------------------------------------------------------------------------. @@ -183,7 +215,7 @@ def infer_path_info_dict(path: str) -> dict: Returns ------- - list + dict Dictionary with the path element of the DISDRODB archive. 
Valid keys: ``"base_dir"``, ``"data_source"``, ``"campaign_name"`` """ @@ -197,6 +229,24 @@ def infer_path_info_dict(path: str) -> dict: return path_dict +def infer_path_info_tuple(path: str) -> tuple: + """Return a tuple with the ``base_dir``, ``data_source`` and ``campaign_name`` of the disdrodb_path. + + Parameters + ---------- + path : str + ``path`` can be a ``campaign_dir`` (``raw_dir`` or ``processed_dir``), or a DISDRODB file path. + + Returns + ------- + tuple + Dictionary with the path element of the DISDRODB archive. + Valid keys: ``"base_dir"``, ``"data_source"``, ``"campaign_name"`` + """ + path_dict = infer_path_info_dict(path) + return path_dict["base_dir"], path_dict["data_source"], path_dict["campaign_name"] + + def infer_disdrodb_tree_path(path: str) -> str: """Return the directory tree path from the base_dir directory. @@ -281,3 +331,136 @@ def infer_data_source_from_path(path: str) -> str: ####--------------------------------------------------------------------------. +####################### +#### Group utility #### +####################### + + +FILE_KEYS = [ + "product", + "subproduct", + "campaign_name", + "station_name", + "start_time", + "end_time", + "data_format", + "accumulation_acronym", + "sample_interval", +] + + +TIME_KEYS = [ + "year", + "month", + "month_name", + "quarter", + "season", + "day", + "doy", + "dow", + "hour", + "minute", + "second", +] + + +def check_groups(groups): + """Check groups validity.""" + if not isinstance(groups, (str, list)): + raise TypeError("'groups' must be a list (or a string if a single group is specified.") + if isinstance(groups, str): + groups = [groups] + groups = np.array(groups) + valid_keys = FILE_KEYS + TIME_KEYS + invalid_keys = groups[np.isin(groups, valid_keys, invert=True)] + if len(invalid_keys) > 0: + raise ValueError(f"The following group keys are invalid: {invalid_keys}. Valid values are {valid_keys}.") + return groups.tolist() + + +def get_season(time): + """Get season from `datetime.datetime` or `datetime.date` object.""" + month = time.month + if month in [12, 1, 2]: + return "DJF" # Winter (December, January, February) + if month in [3, 4, 5]: + return "MAM" # Spring (March, April, May) + if month in [6, 7, 8]: + return "JJA" # Summer (June, July, August) + return "SON" # Autumn (September, October, November) + + +def get_time_component(time, component): + """Get time component from `datetime.datetime` object.""" + func_dict = { + "year": lambda time: time.year, + "month": lambda time: time.month, + "day": lambda time: time.day, + "doy": lambda time: time.timetuple().tm_yday, # Day of year + "dow": lambda time: time.weekday(), # Day of week (0=Monday, 6=Sunday) + "hour": lambda time: time.hour, + "minute": lambda time: time.minute, + "second": lambda time: time.second, + # Additional + "month_name": lambda time: time.strftime("%B"), # Full month name + "quarter": lambda time: (time.month - 1) // 3 + 1, # Quarter (1-4) + "season": lambda time: get_season(time), # Season (DJF, MAM, JJA, SON) + } + return str(func_dict[component](time)) + + +def _get_groups_value(groups, filepath): + """Return the value associated to the groups keys. + + If multiple keys are specified, the value returned is a string of format: ``//...`` + + If a single key is specified and is ``start_time`` or ``end_time``, the function + returns a :py:class:`datetime.datetime` object. 
+ """ + single_key = len(groups) == 1 + info_dict = get_info_from_filepath(filepath) + start_time = info_dict["start_time"] + list_key_values = [] + for key in groups: + if key in TIME_KEYS: + list_key_values.append(get_time_component(start_time, component=key)) + else: + value = info_dict.get(key, f"{key}=None") + list_key_values.append(value if single_key else str(value)) + if single_key: + return list_key_values[0] + return "/".join(list_key_values) + + +def group_filepaths(filepaths, groups=None): + """ + Group filepaths in a dictionary if groups are specified. + + Parameters + ---------- + filepaths : list + List of filepaths. + groups: list or str + The group keys by which to group the filepaths. + Valid group keys are ``product``, ``subproduct``, ``campaign_name``, ``station_name``, + ``start_time``, ``end_time``,``accumulation_acronym``,``sample_interval``, + ``data_format``, + ``year``, ``month``, ``day``, ``doy``, ``dow``, ``hour``, ``minute``, ``second``, + ``month_name``, ``quarter``, ``season``. + The time components are extracted from ``start_time`` ! + If groups is ``None`` returns the input filepaths list. + The default is ``None``. + + Returns + ------- + dict or list + Either a dictionary of format ``{: }``. + or the original input filepaths (if ``groups=None``) + + """ + if groups is None: + return filepaths + groups = check_groups(groups) + filepaths_dict = defaultdict(list) + _ = [filepaths_dict[_get_groups_value(groups, filepath)].append(filepath) for filepath in filepaths] + return dict(filepaths_dict) diff --git a/disdrodb/api/io.py b/disdrodb/api/io.py index 67b38242..8832f310 100644 --- a/disdrodb/api/io.py +++ b/disdrodb/api/io.py @@ -19,23 +19,129 @@ """Routines tot extract information from the DISDRODB infrastructure.""" import os +import shutil +from typing import Optional import numpy as np from disdrodb.api.checks import check_product -from disdrodb.api.path import get_disdrodb_path +from disdrodb.api.path import define_data_dir, define_product_dir, get_disdrodb_path from disdrodb.configs import get_base_dir from disdrodb.utils.directories import count_files, list_directories, list_files +from disdrodb.utils.logger import ( + log_info, +) + + +def get_required_product(product): + """Determine the required product for input product processing.""" + # Check input + check_product(product) + # Determine required product + requirement_dict = { + "L0A": "RAW", + "L0B": "L0A", + "L0C": "L0B", + "L1": "L0C", + "L2E": "L1", + "L2M": "L2E", + } + required_product = requirement_dict[product] + return required_product + + +def filter_filepaths(filepaths, debugging_mode): + """Filter out filepaths if ``debugging_mode=True``.""" + if debugging_mode: + max_files = min(3, len(filepaths)) + filepaths = filepaths[0:max_files] + return filepaths + + +def get_filepaths( + data_source, + campaign_name, + station_name, + product, + model_name=None, + sample_interval=None, + rolling=None, + debugging_mode: bool = False, + base_dir: Optional[str] = None, +): + """Retrieve DISDRODB product files for a give station. + + Parameters + ---------- + data_source : str + The name of the institution (for campaigns spanning multiple countries) or + the name of the country (for campaigns or sensor networks within a single country). + Must be provided in UPPER CASE. + campaign_name : str + The name of the campaign. Must be provided in UPPER CASE. + station_name : str + The name of the station. + product : str + The name DISDRODB product. 
+ sample_interval : int, optional + The sampling interval in seconds of the product. + It must be specified only for product L2E and L2M ! + rolling : bool, optional + Whether the dataset has been resampled by aggregating or rolling. + It must be specified only for product L2E and L2M ! + model_name : str + The model name of the statistical distribution for the DSD. + It must be specified only for product L2M ! + debugging_mode : bool, optional + If ``True``, it select maximum 3 files for debugging purposes. + The default is ``False``. + base_dir : str, optional + The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. + If not specified, the path specified in the DISDRODB active configuration will be used. + + Returns + ------- + filepaths : list + List of file paths. + + """ + # Retrieve data directory + data_dir = define_data_dir( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + product=product, + # Option for L2E and L2M + sample_interval=sample_interval, + rolling=rolling, + # Options for L2M + model_name=model_name, + ) + + # Define glob pattern + glob_pattern = "*.parquet" if product == "L0A" else "*.nc" + + # Retrieve files + filepaths = list_files(data_dir, glob_pattern=glob_pattern, recursive=True) + + # Filter out filepaths if debugging_mode=True + filepaths = filter_filepaths(filepaths, debugging_mode=debugging_mode) + + # If no file available, raise error + if len(filepaths) == 0: + msg = f"No {product} files are available in {data_dir}. Run {product} processing first." + raise ValueError(msg) + + # Sort filepaths + filepaths = sorted(filepaths) + + return filepaths def _get_list_stations_dirs(product, campaign_dir): # Get directory where data are stored - # - Raw: /data/<...> - # - Processed: /L0A/L0B> - if product.upper() == "RAW": - product_dir = os.path.join(campaign_dir, "data") - else: - product_dir = os.path.join(campaign_dir, product) + product_dir = define_product_dir(campaign_dir=campaign_dir, product=product) # Check if the data directory exists # - For a fresh disdrodb-data cloned repo, no "data" directories if not os.path.exists(product_dir): @@ -51,6 +157,7 @@ def _get_list_stations_with_data(product, campaign_dir): # Get stations directory list_stations_dir = _get_list_stations_dirs(product=product, campaign_dir=campaign_dir) # Count number of files within directory + # - TODO: here just check for one file ! list_nfiles_per_station = [count_files(station_dir, "*", recursive=True) for station_dir in list_stations_dir] # Keep only stations with at least one file stations_names = [os.path.basename(path) for n, path in zip(list_nfiles_per_station, list_stations_dir) if n >= 1] @@ -75,7 +182,6 @@ def _get_campaign_stations(base_dir, product, data_source, campaign_name): data_source=data_source, campaign_name=campaign_name, ) - # Get list of stations with data and metadata list_stations_data = _get_list_stations_with_data(product=product, campaign_dir=campaign_dir) list_stations_metadata = _get_list_stations_with_metadata(campaign_dir) @@ -278,9 +384,13 @@ def available_stations( campaign_names=None, station_names=None, return_tuple=True, + raise_error_if_empty=False, base_dir=None, ): - """Return stations for which data are available on disk.""" + """Return stations for which data and metadata are available on disk. + + Raise an error if no stations are available. 
+ """ base_dir = get_base_dir(base_dir) # Checks arguments product = check_product(product) @@ -297,24 +407,42 @@ def available_stations( if isinstance(station_names, str): station_names = [station_names] - # If data_source is None, first retrieve all stations + # If data_source is None, retrieve all stations if data_sources is None: list_info = _get_stations(base_dir=base_dir, product=product) - # Otherwise retrieve all stations for the specified data sources + + ###-----------------------------------------------. + ### Filter by data_sources else: list_info = _get_data_sources_stations( base_dir=base_dir, data_sources=data_sources, product=product, ) + # If no stations available, raise an error + if raise_error_if_empty and len(list_info) == 0: + raise ValueError(f"No stations available given the provided `data_sources` {data_sources}.") + + ###-----------------------------------------------. + ### Filter by campaign_names # If campaign_names is not None, subset by campaign_names if campaign_names is not None: list_info = [info for info in list_info if info[1] in campaign_names] + # If no stations available, raise an error + if raise_error_if_empty and len(list_info) == 0: + raise ValueError(f"No stations available given the provided `campaign_names` {campaign_names}.") + + ###-----------------------------------------------. + ### Filter by station_names # If station_names is not None, subset by station_names if station_names is not None: list_info = [info for info in list_info if info[2] in station_names] + # If no stations available, raise an error + if raise_error_if_empty and len(list_info) == 0: + raise ValueError(f"No stations available given the provided `station_names` {station_names}.") + ###-----------------------------------------------. # Return list with the tuple (data_source, campaign_name, station_name) if return_tuple: return list_info @@ -322,3 +450,33 @@ def available_stations( # - Return list with the name of the available stations list_stations = [info[2] for info in list_info] return list_stations + + +####---------------------------------------------------------------------------------- +#### DISDRODB Removal Functions + + +def remove_product( + base_dir, + product, + data_source, + campaign_name, + station_name, + logger=None, + verbose=True, +): + """Remove all product files of a specific station.""" + if product.upper() == "RAW": + raise ValueError("Removal of 'RAW' files is not allowed.") + data_dir = define_data_dir( + base_dir=base_dir, + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + if logger is not None: + log_info(logger=logger, msg="Removal of {product} files started.", verbose=verbose) + shutil.rmtree(data_dir) + if logger is not None: + log_info(logger=logger, msg="Removal of {product} files ended.", verbose=verbose) diff --git a/disdrodb/api/path.py b/disdrodb/api/path.py index ab4c6f0d..87955047 100644 --- a/disdrodb/api/path.py +++ b/disdrodb/api/path.py @@ -17,15 +17,14 @@ # along with this program. If not, see . # -----------------------------------------------------------------------------. 
"""Define paths within the DISDRODB infrastructure.""" - import os +from typing import Optional import pandas as pd -import xarray as xr -from disdrodb.api.info import infer_campaign_name_from_path from disdrodb.configs import get_base_dir from disdrodb.utils.directories import check_directory_exists +from disdrodb.utils.time import ensure_sample_interval_in_seconds, seconds_to_acronym ####--------------------------------------------------------------------------. #### Paths from BASE_DIR @@ -120,54 +119,6 @@ def define_campaign_dir( return str(campaign_dir) -def define_station_dir( - product, - data_source, - campaign_name, - station_name, - base_dir=None, - check_exists=False, -): - """Return the station data directory in the DISDRODB infrastructure. - - Parameters - ---------- - product : str - The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``. - data_source : str - The data source. - campaign_name : str - The campaign name. - station_name : str - The station name. - base_dir : str, optional - The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. - If not specified, the path specified in the DISDRODB active configuration will be used. - check_exists : bool, optional - Whether to check if the directory exists. By default ``False``. - - Returns - ------- - station_dir : str - Station data directory path - """ - base_dir = get_base_dir(base_dir) - campaign_dir = get_disdrodb_path( - base_dir=base_dir, - product=product, - data_source=data_source, - campaign_name=campaign_name, - check_exists=check_exists, - ) - if product.upper() == "RAW": - station_dir = os.path.join(campaign_dir, "data", station_name) - else: - station_dir = os.path.join(campaign_dir, product, station_name) - if check_exists: - check_directory_exists(station_dir) - return str(station_dir) - - def define_metadata_dir( product, data_source, @@ -250,11 +201,11 @@ def define_issue_dir( def define_metadata_filepath( - product, data_source, campaign_name, station_name, base_dir=None, + product="RAW", check_exists=False, ): """Return the station metadata filepath in the DISDRODB infrastructure. @@ -353,82 +304,537 @@ def define_config_dir(product): #### Directory/Filepaths L0A and L0B products -def define_l0a_station_dir(processed_dir: str, station_name: str) -> str: - """Define L0A directory. +def check_sample_interval(sample_interval): + """Check sample_interval argument validity.""" + if not isinstance(sample_interval, int): + raise ValueError("'sample_interval' must be an integer.") + + +def check_rolling(rolling): + """Check rolling argument validity.""" + if not isinstance(rolling, bool): + raise ValueError("'rolling' must be a boolean.") + + +def define_product_dir_tree( + product, + model_name=None, + sample_interval=None, + rolling=None, +): + """Return the product directory tree. Parameters ---------- - processed_dir : str - Path of the processed directory + product : str + The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``. + sample_interval : int, optional + The sampling interval in seconds of the product. + It must be specified only for product L2E and L2M ! + rolling : bool, optional + Whether the dataset has been resampled by aggregating or rolling. + It must be specified only for product L2E and L2M ! + model_name : str + The custom model name of the fitted statistical distribution. + It must be specified only for product L2M ! 
+ + Returns + ------- + data_dir : str + Station data directory path + """ + if product.upper() == "RAW": + return "" + if product.upper() in ["L0A", "L0B", "L0C", "L1"]: + return product + if product == "L2E": + check_rolling(rolling) + check_sample_interval(sample_interval) + sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling) + return os.path.join(product, sample_interval_acronym) + if product == "L2M": + check_rolling(rolling) + check_sample_interval(sample_interval) + sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling) + return os.path.join(product, model_name, sample_interval_acronym) + raise ValueError(f"The product {product} is not defined.") + + +def define_station_dir_new( + product, + data_source, + campaign_name, + station_name, + base_dir=None, + check_exists=False, +): # TODO: IN FUTURE without product --> campaign_dir/station_name/product ! + """Return the station data directory in the DISDRODB infrastructure. + + Parameters + ---------- + product : str + The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``. + data_source : str + The data source. + campaign_name : str + The campaign name. station_name : str - Name of the station + The station name. + base_dir : str, optional + The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. + If not specified, the path specified in the DISDRODB active configuration will be used. + check_exists : bool, optional + Whether to check if the directory exists. By default ``False``. Returns ------- - str - L0A directory path. + station_dir : str + Station data directory path """ - station_dir = os.path.join(processed_dir, "L0A", station_name) - return station_dir + base_dir = get_base_dir(base_dir) + campaign_dir = get_disdrodb_path( + base_dir=base_dir, + product=product, + data_source=data_source, + campaign_name=campaign_name, + check_exists=check_exists, + ) + if product.upper() == "RAW": + station_dir = os.path.join(campaign_dir, "data", station_name) + else: + station_dir = os.path.join(campaign_dir, station_name, "data") + if check_exists: + check_directory_exists(station_dir) + return str(station_dir) -def define_l0b_station_dir(processed_dir: str, station_name: str) -> str: - """Define L0B directory. +def define_data_dir_new( + product, + data_source, + campaign_name, + station_name, + model_name=None, + sample_interval=None, + rolling=None, + base_dir=None, + check_exists=False, +): + """Return the station data directory in the DISDRODB infrastructure. Parameters ---------- - processed_dir : str - Path of the processed directory - station_name : int - Name of the station + product : str + The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``. + data_source : str + The data source. + campaign_name : str + The campaign name. + station_name : str + The station name. + base_dir : str, optional + The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. + If not specified, the path specified in the DISDRODB active configuration will be used. + check_exists : bool, optional + Whether to check if the directory exists. By default ``False``. 
+ + Returns + ------- + station_dir : str + Station data directory path + """ + station_dir = define_station_dir_new( + base_dir=base_dir, + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + check_exists=check_exists, + ) + product_dir_tree = define_product_dir_tree( + product=product, + model_name=model_name, + sample_interval=sample_interval, + rolling=rolling, + ) + data_dir = os.path.join(station_dir, product_dir_tree) + if check_exists: + check_directory_exists(data_dir) + return str(data_dir) + + +def define_logs_dir( + product, + data_source, + campaign_name, + station_name, + model_name=None, + sample_interval=None, + rolling=None, + base_dir=None, + check_exists=False, +): + """Return the station log directory in the DISDRODB infrastructure. + + Parameters + ---------- + product : str + The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``. + data_source : str + The data source. + campaign_name : str + The campaign name. + station_name : str + The station name. + base_dir : str, optional + The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. + If not specified, the path specified in the DISDRODB active configuration will be used. + check_exists : bool, optional + Whether to check if the directory exists. By default ``False``. + + Returns + ------- + station_dir : str + Station data directory path + """ + # station_dir = define_station_dir_new( + # base_dir=base_dir, + # product=product, + # data_source=data_source, + # campaign_name=campaign_name, + # check_exists=check_exists, + # ) + campaign_dir = define_campaign_dir( + base_dir=base_dir, + product=product, + data_source=data_source, + campaign_name=campaign_name, + check_exists=check_exists, + ) + product_dir_tree = define_product_dir_tree( + product=product, + model_name=model_name, + sample_interval=sample_interval, + rolling=rolling, + ) + logs_dir = os.path.join(campaign_dir, "logs", "files", product_dir_tree, station_name) + if check_exists: + check_directory_exists(logs_dir) + return str(logs_dir) + + +def define_data_dir( + product, + data_source, + campaign_name, + station_name, + model_name=None, + sample_interval=None, + rolling=None, + base_dir=None, + check_exists=False, +): + """Return the station data directory in the DISDRODB infrastructure. + + Parameters + ---------- + product : str + The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``. + data_source : str + The data source. + campaign_name : str + The campaign name. + station_name : str + The station name. + base_dir : str, optional + The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. + If not specified, the path specified in the DISDRODB active configuration will be used. + check_exists : bool, optional + Whether to check if the directory exists. By default ``False``. + sample_interval : int, optional + The sampling interval in seconds of the product. + It must be specified only for product L2E and L2M ! + rolling : bool, optional + Whether the dataset has been resampled by aggregating or rolling. + It must be specified only for product L2E and L2M ! + model_name : str + The name of the fitted statistical distribution for the DSD. + It must be specified only for product L2M ! 
+ + Returns + ------- + data_dir : str + Station data directory path + """ + station_dir = define_station_dir( + base_dir=base_dir, + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + check_exists=check_exists, + ) + if product.upper() in ["RAW", "L0A", "L0B", "L0C", "L1"]: + data_dir = station_dir + elif product == "L2E": + check_rolling(rolling) + check_sample_interval(sample_interval) + sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling) + data_dir = os.path.join(station_dir, sample_interval_acronym) + elif product == "L2M": + check_rolling(rolling) + check_sample_interval(sample_interval) + sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling) + data_dir = os.path.join(station_dir, model_name, sample_interval_acronym) + else: + raise ValueError("TODO") # CHECK Product on top !` + if check_exists: + check_directory_exists(data_dir) + return str(data_dir) + + +def define_product_dir(campaign_dir: str, product: str) -> str: + """Define product directory.""" + # TODO: this currently only works for L0A and L0B. Should be removed ! + # - Raw: /data/<...> + # - Processed: /L0A/L0B> + if product.upper() == "RAW": + product_dir = os.path.join(campaign_dir, "data") + else: + product_dir = os.path.join(campaign_dir, product) + return product_dir + + +def define_station_dir( + product, + data_source, + campaign_name, + station_name, + base_dir=None, + check_exists=False, +): # TODO: IN FUTURE without product --> campaign_dir/station_name/product ! + """Return the station data directory in the DISDRODB infrastructure. + + Parameters + ---------- + product : str + The DISDRODB product. It can be ``"RAW"``, ``"L0A"``, or ``"L0B"``. + data_source : str + The data source. + campaign_name : str + The campaign name. + station_name : str + The station name. + base_dir : str, optional + The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. + If not specified, the path specified in the DISDRODB active configuration will be used. + check_exists : bool, optional + Whether to check if the directory exists. By default ``False``. + + Returns + ------- + station_dir : str + Station data directory path + """ + base_dir = get_base_dir(base_dir) + campaign_dir = get_disdrodb_path( + base_dir=base_dir, + product=product, + data_source=data_source, + campaign_name=campaign_name, + check_exists=check_exists, + ) + if product.upper() == "RAW": + station_dir = os.path.join(campaign_dir, "data", station_name) + else: + station_dir = os.path.join(campaign_dir, product, station_name) + if check_exists: + check_directory_exists(station_dir) + return str(station_dir) + + +####--------------------------------------------------------------------------. +#### Filenames for DISDRODB products + + +def define_accumulation_acronym(seconds, rolling): + """Define the accumulation acronnym. + + Prefix the accumulation interval acronym with ROLL if rolling=True. + """ + accumulation_acronym = seconds_to_acronym(seconds) + if rolling: + accumulation_acronym = f"ROLL{accumulation_acronym}" + return accumulation_acronym + + +####--------------------------------------------------------------------------. 
+#### Filenames for DISDRODB products + + +def define_filename( + product: str, + campaign_name: str, + station_name: str, + # L2E option + sample_interval: Optional[int] = None, + rolling: Optional[bool] = None, + # L2M option + model_name: Optional[str] = None, + # Filename options + obj=None, + add_version=True, + add_time_period=True, + add_extension=True, + # Prefix + prefix="", + suffix="", +) -> str: + """Define DISDRODB products filename. + + Parameters + ---------- + obj : xarray.Dataset or pandas.DataFrame + xarray Dataset or pandas DataFrame. + Required if add_time_period = True. + campaign_name : str + Name of the campaign. + station_name : str + Name of the station. + sample_interval : int, optional + The sampling interval in seconds of the product. + It must be specified only for product L2E and L2M ! + rolling : bool, optional + Whether the dataset has been resampled by aggregating or rolling. + It must be specified only for product L2E and L2M ! + model_name : str + The model name of the fitted statistical distribution for the DSD. + It must be specified only for product L2M ! Returns ------- str - Path of the L0B directory + L0B file name. """ - station_dir = os.path.join(processed_dir, "L0B", station_name) - return station_dir + from disdrodb import PRODUCT_VERSION + from disdrodb.utils.pandas import get_dataframe_start_end_time + from disdrodb.utils.xarray import get_dataset_start_end_time + + # -----------------------------------------. + # TODO: Define sample_interval_acronym + # - ADD sample_interval_acronym also to L0A and L0B + # - Add sample_interval_acronym also to L0C and L1 + + # -----------------------------------------. + # Define product acronym + product_acronym = f"{product}" + if product in ["L2E", "L2M"]: + sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling) + product_acronym = f"L2E.{sample_interval_acronym}" + if product in ["L2M"]: + product_acronym = f"L2M_{model_name}.{sample_interval_acronym}" + + # -----------------------------------------. + # Define base filename + filename = f"{product_acronym}.{campaign_name}.{station_name}" + + # -----------------------------------------. + # Add prefix + if prefix != "": + filename = f"{prefix}.{filename}" + + # -----------------------------------------. + # Add time period information + if add_time_period: + if product == "L0A": + starting_time, ending_time = get_dataframe_start_end_time(obj) + else: + starting_time, ending_time = get_dataset_start_end_time(obj) + starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S") + ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S") + filename = f"{filename}.s{starting_time}.e{ending_time}" + + # -----------------------------------------. + # Add product version + if add_version: + filename = f"{filename}.{PRODUCT_VERSION}" + + # -----------------------------------------. + # Add product extension + if add_extension: + filename = f"{filename}.parquet" if product == "L0A" else f"{filename}.nc" + + # -----------------------------------------. + # Add suffix + if suffix != "": + filename = f"{filename}.{suffix}" + return filename -def define_l0a_filename(df, processed_dir, station_name: str) -> str: +def define_l0a_filename(df, campaign_name: str, station_name: str) -> str: """Define L0A file name. Parameters ---------- df : pandas.DataFrame - L0A DataFrame - processed_dir : str - Path of the processed directory + L0A DataFrame. + campaign_name : str + Name of the campaign. 
station_name : str - Name of the station + Name of the station. Returns ------- str L0A file name. """ - from disdrodb.l0.standards import PRODUCT_VERSION + from disdrodb import PRODUCT_VERSION from disdrodb.utils.pandas import get_dataframe_start_end_time starting_time, ending_time = get_dataframe_start_end_time(df) starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S") ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S") - campaign_name = infer_campaign_name_from_path(processed_dir).replace(".", "-") version = PRODUCT_VERSION filename = f"L0A.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.parquet" return filename -def define_l0b_filename(ds, processed_dir, station_name: str) -> str: +def define_l0b_filename(ds, campaign_name: str, station_name: str) -> str: """Define L0B file name. + Parameters + ---------- + ds : xarray.Dataset + L0B xarray Dataset. + campaign_name : str + Name of the campaign. + station_name : str + Name of the station. + + Returns + ------- + str + L0B file name. + """ + from disdrodb import PRODUCT_VERSION + from disdrodb.utils.xarray import get_dataset_start_end_time + + starting_time, ending_time = get_dataset_start_end_time(ds) + starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S") + ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S") + version = PRODUCT_VERSION + filename = f"L0B.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.nc" + return filename + + +def define_l0c_filename(ds, campaign_name: str, station_name: str) -> str: + """Define L0C file name. + Parameters ---------- ds : xarray.Dataset L0B xarray Dataset - processed_dir : str - Path of the processed directory + campaign_name : str + Name of the campaign station_name : str Name of the station @@ -437,69 +843,120 @@ def define_l0b_filename(ds, processed_dir, station_name: str) -> str: str L0B file name. """ - from disdrodb.l0.standards import PRODUCT_VERSION + from disdrodb import PRODUCT_VERSION from disdrodb.utils.xarray import get_dataset_start_end_time + # TODO: add sample_interval as argument + sample_interval = int(ensure_sample_interval_in_seconds(ds["sample_interval"]).data.item()) + sample_interval_acronym = define_accumulation_acronym(sample_interval, rolling=False) starting_time, ending_time = get_dataset_start_end_time(ds) starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S") ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S") - campaign_name = infer_campaign_name_from_path(processed_dir).replace(".", "-") version = PRODUCT_VERSION - filename = f"L0B.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.nc" + filename = ( + f"L0C.{sample_interval_acronym}.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.nc" + ) return filename -def define_l0a_filepath(df: pd.DataFrame, processed_dir: str, station_name: str) -> str: - """Define L0A file path. +def define_l1_filename(ds, campaign_name, station_name: str) -> str: + """Define L1 file name. Parameters ---------- - df : pandas.DataFrame - L0A DataFrame. + ds : xarray.Dataset + L1 xarray Dataset processed_dir : str - Path of the processed directory. + Path of the processed directory station_name : str - Name of the station. + Name of the station Returns ------- str - L0A file path. + L1 file name. 
""" - filename = define_l0a_filename(df=df, processed_dir=processed_dir, station_name=station_name) - station_dir = define_l0a_station_dir(processed_dir=processed_dir, station_name=station_name) - filepath = os.path.join(station_dir, filename) - return filepath + from disdrodb import PRODUCT_VERSION + from disdrodb.utils.xarray import get_dataset_start_end_time + + # TODO: add sample_interval as argument + sample_interval = int(ensure_sample_interval_in_seconds(ds["sample_interval"]).data.item()) + sample_interval_acronym = define_accumulation_acronym(sample_interval, rolling=False) + starting_time, ending_time = get_dataset_start_end_time(ds) + starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S") + ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S") + version = PRODUCT_VERSION + filename = ( + f"L1.{sample_interval_acronym}.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.nc" + ) + return filename -def define_l0b_filepath(ds: xr.Dataset, processed_dir: str, station_name: str, l0b_concat=False) -> str: - """Define L0B file path. +def define_l2e_filename(ds, campaign_name: str, station_name: str, sample_interval: int, rolling: bool) -> str: + """Define L2E file name. Parameters ---------- ds : xarray.Dataset - L0B xarray Dataset. + L1 xarray Dataset processed_dir : str - Path of the processed directory. + Path of the processed directory station_name : str - ID of the station - l0b_concat : bool - If ``False``, the file is specified inside the station directory. - If ``True``, the file is specified outside the station directory. + Name of the station Returns ------- str - L0B file path. + L0B file name. """ - station_dir = define_l0b_station_dir(processed_dir, station_name) - filename = define_l0b_filename(ds, processed_dir, station_name) - if l0b_concat: - product_dir = os.path.dirname(station_dir) - filepath = os.path.join(product_dir, filename) - else: - filepath = os.path.join(station_dir, filename) - return filepath + from disdrodb import PRODUCT_VERSION + from disdrodb.utils.xarray import get_dataset_start_end_time + sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling) + starting_time, ending_time = get_dataset_start_end_time(ds) + starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S") + ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S") + version = PRODUCT_VERSION + filename = ( + f"L2E.{sample_interval_acronym}.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}.{version}.nc" + ) + return filename -####--------------------------------------------------------------------------. + +def define_l2m_filename( + ds, + campaign_name: str, + station_name: str, + sample_interval: int, + rolling: bool, + model_name: str, +) -> str: + """Define L2M file name. + + Parameters + ---------- + ds : xarray.Dataset + L1 xarray Dataset + processed_dir : str + Path of the processed directory + station_name : str + Name of the station + + Returns + ------- + str + L0B file name. 
+ """ + from disdrodb import PRODUCT_VERSION + from disdrodb.utils.xarray import get_dataset_start_end_time + + sample_interval_acronym = define_accumulation_acronym(seconds=sample_interval, rolling=rolling) + starting_time, ending_time = get_dataset_start_end_time(ds) + starting_time = pd.to_datetime(starting_time).strftime("%Y%m%d%H%M%S") + ending_time = pd.to_datetime(ending_time).strftime("%Y%m%d%H%M%S") + version = PRODUCT_VERSION + filename = ( + f"L2M_{model_name}.{sample_interval_acronym}.{campaign_name}." + + f"{station_name}.s{starting_time}.e{ending_time}.{version}.nc" + ) + return filename diff --git a/disdrodb/metadata/scripts/disdrodb_check_metadata_archive.py b/disdrodb/cli/disdrodb_check_metadata_archive.py similarity index 93% rename from disdrodb/metadata/scripts/disdrodb_check_metadata_archive.py rename to disdrodb/cli/disdrodb_check_metadata_archive.py index d3ad0d06..653d145b 100644 --- a/disdrodb/metadata/scripts/disdrodb_check_metadata_archive.py +++ b/disdrodb/cli/disdrodb_check_metadata_archive.py @@ -19,7 +19,7 @@ import click -from disdrodb.utils.scripts import click_base_dir_option, parse_base_dir +from disdrodb.utils.cli import click_base_dir_option, parse_base_dir sys.tracebacklimit = 0 # avoid full traceback error if occur diff --git a/disdrodb/data_transfer/scripts/disdrodb_download_archive.py b/disdrodb/cli/disdrodb_download_archive.py similarity index 94% rename from disdrodb/data_transfer/scripts/disdrodb_download_archive.py rename to disdrodb/cli/disdrodb_download_archive.py index 04b8d67f..f8efad84 100644 --- a/disdrodb/data_transfer/scripts/disdrodb_download_archive.py +++ b/disdrodb/cli/disdrodb_download_archive.py @@ -22,7 +22,7 @@ import click from disdrodb.data_transfer.download_data import click_download_archive_options, click_download_options -from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir +from disdrodb.utils.cli import click_base_dir_option, parse_arg_to_list, parse_base_dir sys.tracebacklimit = 0 # avoid full traceback error if occur diff --git a/disdrodb/data_transfer/scripts/disdrodb_download_station.py b/disdrodb/cli/disdrodb_download_station.py similarity index 96% rename from disdrodb/data_transfer/scripts/disdrodb_download_station.py rename to disdrodb/cli/disdrodb_download_station.py index c13b7e35..52a4d8a4 100644 --- a/disdrodb/data_transfer/scripts/disdrodb_download_station.py +++ b/disdrodb/cli/disdrodb_download_station.py @@ -24,7 +24,7 @@ import click from disdrodb.data_transfer.download_data import click_download_options -from disdrodb.utils.scripts import click_base_dir_option, click_station_arguments, parse_base_dir +from disdrodb.utils.cli import click_base_dir_option, click_station_arguments, parse_base_dir sys.tracebacklimit = 0 # avoid full traceback error if occur diff --git a/disdrodb/api/scripts/disdrodb_initialize_station.py b/disdrodb/cli/disdrodb_initialize_station.py similarity index 96% rename from disdrodb/api/scripts/disdrodb_initialize_station.py rename to disdrodb/cli/disdrodb_initialize_station.py index bb36cb80..17e752fe 100644 --- a/disdrodb/api/scripts/disdrodb_initialize_station.py +++ b/disdrodb/cli/disdrodb_initialize_station.py @@ -20,7 +20,7 @@ import click -from disdrodb.utils.scripts import click_base_dir_option, click_station_arguments, parse_base_dir +from disdrodb.utils.cli import click_base_dir_option, click_station_arguments, parse_base_dir sys.tracebacklimit = 0 # avoid full traceback error if occur diff --git a/disdrodb/l0/scripts/disdrodb_run_l0.py 
b/disdrodb/cli/disdrodb_run_l0.py similarity index 86% rename from disdrodb/l0/scripts/disdrodb_run_l0.py rename to disdrodb/cli/disdrodb_run_l0.py index 5d035f9a..b857cc89 100644 --- a/disdrodb/l0/scripts/disdrodb_run_l0.py +++ b/disdrodb/cli/disdrodb_run_l0.py @@ -20,20 +20,22 @@ import click -from disdrodb.l0.routines import ( +from disdrodb.utils.cli import ( + click_base_dir_option, click_l0_archive_options, - click_l0_processing_options, - click_l0_stations_options, + click_processing_options, + click_stations_options, + parse_arg_to_list, + parse_base_dir, ) -from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir sys.tracebacklimit = 0 # avoid full traceback error if occur @click.command() -@click_l0_stations_options +@click_stations_options @click_l0_archive_options -@click_l0_processing_options +@click_processing_options @click_base_dir_option def disdrodb_run_l0( # L0 disdrodb stations options @@ -43,7 +45,7 @@ def disdrodb_run_l0( # L0 archive options l0a_processing: bool = True, l0b_processing: bool = True, - l0b_concat: bool = True, + l0c_processing: bool = True, remove_l0a: bool = False, remove_l0b: bool = False, # Processing options @@ -83,17 +85,14 @@ def disdrodb_run_l0( l0b_processing : bool Whether to launch processing to generate L0B netCDF4 file(s) from L0A data. The default is True. - l0b_concat : bool - Whether to concatenate all raw files into a single L0B netCDF file. - If l0b_concat=True, all raw files will be saved into a single L0B netCDF file. - If l0b_concat=False, each raw file will be converted into the corresponding L0B netCDF file. - The default is False. + l0c_processing : bool + Whether to launch processing to generate L0C netCDF4 file(s) from L0C data. + The default is True. remove_l0a : bool Whether to keep the L0A files after having generated the L0B netCDF products. The default is False. remove_l0b : bool - Whether to remove the L0B files after having concatenated all L0B netCDF files. - It takes places only if l0b_concat = True + Whether to remove the L0B files after having produced L0C netCDF files. The default is False. force : bool If True, overwrite existing data into destination directories. @@ -119,7 +118,7 @@ def disdrodb_run_l0( Format: <...>/DISDRODB If not specified, uses path specified in the DISDRODB active configuration. 
""" - from disdrodb.l0.routines import run_disdrodb_l0 + from disdrodb.routines import run_disdrodb_l0 # Parse data_sources, campaign_names and station arguments base_dir = parse_base_dir(base_dir) @@ -136,7 +135,7 @@ def disdrodb_run_l0( # L0 archive options l0a_processing=l0a_processing, l0b_processing=l0b_processing, - l0b_concat=l0b_concat, + l0c_processing=l0c_processing, remove_l0a=remove_l0a, remove_l0b=remove_l0b, # Processing options diff --git a/disdrodb/l0/scripts/disdrodb_run_l0_station.py b/disdrodb/cli/disdrodb_run_l0_station.py similarity index 85% rename from disdrodb/l0/scripts/disdrodb_run_l0_station.py rename to disdrodb/cli/disdrodb_run_l0_station.py index a197f9fb..166c4d73 100644 --- a/disdrodb/l0/scripts/disdrodb_run_l0_station.py +++ b/disdrodb/cli/disdrodb_run_l0_station.py @@ -20,12 +20,10 @@ import click -from disdrodb.l0.routines import ( - click_l0_archive_options, - click_l0_processing_options, -) -from disdrodb.utils.scripts import ( +from disdrodb.utils.cli import ( click_base_dir_option, + click_l0_archive_options, + click_processing_options, click_station_arguments, parse_base_dir, ) @@ -38,7 +36,7 @@ @click.command() @click_station_arguments -@click_l0_processing_options +@click_processing_options @click_l0_archive_options @click_base_dir_option def disdrodb_run_l0_station( @@ -49,7 +47,7 @@ def disdrodb_run_l0_station( # L0 archive options l0a_processing: bool = True, l0b_processing: bool = True, - l0b_concat: bool = True, + l0c_processing: bool = True, remove_l0a: bool = False, remove_l0b: bool = False, # Processing options @@ -77,18 +75,15 @@ def disdrodb_run_l0_station( l0b_processing : bool \n Whether to launch processing to generate L0B netCDF4 file(s) from L0A data.\n The default is True.\n - l0b_concat : bool \n - Whether to concatenate all raw files into a single L0B netCDF file.\n - If l0b_concat=True, all raw files will be saved into a single L0B netCDF file.\n - If l0b_concat=False, each raw file will be converted into the corresponding L0B netCDF file.\n - The default is False.\n + l0c_processing : bool + Whether to launch processing to generate L0C netCDF4 file(s) from L0C data. + The default is True. remove_l0a : bool \n Whether to keep the L0A files after having generated the L0B netCDF products.\n The default is False.\n - remove_l0b : bool \n - Whether to remove the L0B files after having concatenated all L0B netCDF files.\n - It takes places only if l0b_concat=True\n - The default is False.\n + remove_l0b : bool + Whether to remove the L0B files after having produced L0C netCDF files. + The default is False. force : bool \n If True, overwrite existing data into destination directories.\n If False, raise an error if there are already data into destination directories.\n @@ -113,7 +108,7 @@ def disdrodb_run_l0_station( Format: <...>/DISDRODB \n If not specified, uses path specified in the DISDRODB active configuration. 
\n """ - from disdrodb.l0.routines import run_disdrodb_l0_station + from disdrodb.routines import run_disdrodb_l0_station base_dir = parse_base_dir(base_dir) @@ -125,7 +120,7 @@ def disdrodb_run_l0_station( # L0 archive options l0a_processing=l0a_processing, l0b_processing=l0b_processing, - l0b_concat=l0b_concat, + l0c_processing=l0c_processing, remove_l0a=remove_l0a, remove_l0b=remove_l0b, # Processing options diff --git a/disdrodb/l0/scripts/disdrodb_run_l0a.py b/disdrodb/cli/disdrodb_run_l0a.py similarity index 93% rename from disdrodb/l0/scripts/disdrodb_run_l0a.py rename to disdrodb/cli/disdrodb_run_l0a.py index 5e8121de..73357c26 100644 --- a/disdrodb/l0/scripts/disdrodb_run_l0a.py +++ b/disdrodb/cli/disdrodb_run_l0a.py @@ -21,18 +21,20 @@ import click -from disdrodb.l0.routines import ( - click_l0_processing_options, - click_l0_stations_options, +from disdrodb.utils.cli import ( + click_base_dir_option, + click_processing_options, + click_stations_options, + parse_arg_to_list, + parse_base_dir, ) -from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir sys.tracebacklimit = 0 # avoid full traceback error if occur @click.command() -@click_l0_stations_options -@click_l0_processing_options +@click_stations_options +@click_processing_options @click_base_dir_option def disdrodb_run_l0a( # L0 disdrodb stations options @@ -90,7 +92,7 @@ def disdrodb_run_l0a( Format: <...>/DISDRODB If not specified, uses path specified in the DISDRODB active configuration. """ - from disdrodb.l0.routines import run_disdrodb_l0a + from disdrodb.routines import run_disdrodb_l0a # Parse data_sources, campaign_names and station arguments base_dir = parse_base_dir(base_dir) diff --git a/disdrodb/l0/scripts/disdrodb_run_l0a_station.py b/disdrodb/cli/disdrodb_run_l0a_station.py similarity index 82% rename from disdrodb/l0/scripts/disdrodb_run_l0a_station.py rename to disdrodb/cli/disdrodb_run_l0a_station.py index 4f160a06..dacd0fa6 100644 --- a/disdrodb/l0/scripts/disdrodb_run_l0a_station.py +++ b/disdrodb/cli/disdrodb_run_l0a_station.py @@ -20,9 +20,9 @@ import click -from disdrodb.l0.routines import click_l0_processing_options -from disdrodb.utils.scripts import ( +from disdrodb.utils.cli import ( click_base_dir_option, + click_processing_options, click_station_arguments, parse_base_dir, ) @@ -35,7 +35,7 @@ @click.command() @click_station_arguments -@click_l0_processing_options +@click_processing_options @click_base_dir_option def disdrodb_run_l0a_station( # Station arguments @@ -85,32 +85,15 @@ def disdrodb_run_l0a_station( Format: <...>/DISDRODB If not specified, uses path specified in the DISDRODB active configuration. """ - import os - - import dask - from dask.distributed import Client, LocalCluster - from disdrodb.l0.l0_processing import run_l0a_station + from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster base_dir = parse_base_dir(base_dir) # -------------------------------------------------------------------------. 
# If parallel=True, set the dask environment if parallel: - # Set HDF5_USE_FILE_LOCKING to avoid going stuck with HDF - os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" - # Retrieve the number of process to run - available_workers = os.cpu_count() - 2 # if not set, all CPUs - num_workers = dask.config.get("num_workers", available_workers) - # Create dask.distributed local cluster - cluster = LocalCluster( - n_workers=num_workers, - threads_per_worker=1, - processes=True, - # memory_limit='8GB', - # silence_logs=False, - ) - Client(cluster) + cluster, client = initialize_dask_cluster() # -------------------------------------------------------------------------. run_l0a_station( @@ -129,4 +112,4 @@ def disdrodb_run_l0a_station( # -------------------------------------------------------------------------. # Close the cluster if parallel: - cluster.close() + close_dask_cluster(cluster, client) diff --git a/disdrodb/l0/scripts/disdrodb_run_l0b.py b/disdrodb/cli/disdrodb_run_l0b.py similarity index 93% rename from disdrodb/l0/scripts/disdrodb_run_l0b.py rename to disdrodb/cli/disdrodb_run_l0b.py index 836cc599..b5706c16 100644 --- a/disdrodb/l0/scripts/disdrodb_run_l0b.py +++ b/disdrodb/cli/disdrodb_run_l0b.py @@ -21,19 +21,21 @@ import click -from disdrodb.l0.routines import ( - click_l0_processing_options, - click_l0_stations_options, +from disdrodb.utils.cli import ( + click_base_dir_option, + click_processing_options, click_remove_l0a_option, + click_stations_options, + parse_arg_to_list, + parse_base_dir, ) -from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir sys.tracebacklimit = 0 # avoid full traceback error if occur @click.command() -@click_l0_stations_options -@click_l0_processing_options +@click_stations_options +@click_processing_options @click_remove_l0a_option @click_base_dir_option def disdrodb_run_l0b( @@ -93,7 +95,7 @@ def disdrodb_run_l0b( Format: <...>/DISDRODB If not specified, uses path specified in the DISDRODB active configuration. """ - from disdrodb.l0.routines import run_disdrodb_l0b + from disdrodb.routines import run_disdrodb_l0b # Parse data_sources, campaign_names and station arguments base_dir = parse_base_dir(base_dir) diff --git a/disdrodb/l0/scripts/disdrodb_run_l0b_station.py b/disdrodb/cli/disdrodb_run_l0b_station.py similarity index 82% rename from disdrodb/l0/scripts/disdrodb_run_l0b_station.py rename to disdrodb/cli/disdrodb_run_l0b_station.py index 49462911..297de3ae 100644 --- a/disdrodb/l0/scripts/disdrodb_run_l0b_station.py +++ b/disdrodb/cli/disdrodb_run_l0b_station.py @@ -20,9 +20,10 @@ import click -from disdrodb.l0.routines import click_l0_processing_options, click_remove_l0a_option -from disdrodb.utils.scripts import ( +from disdrodb.utils.cli import ( click_base_dir_option, + click_processing_options, + click_remove_l0a_option, click_station_arguments, parse_base_dir, ) @@ -35,7 +36,7 @@ @click.command() @click_station_arguments -@click_l0_processing_options +@click_processing_options @click_remove_l0a_option @click_base_dir_option def disdrodb_run_l0b_station( @@ -86,32 +87,15 @@ def disdrodb_run_l0b_station( Format: <...>/DISDRODB If not specified, uses path specified in the DISDRODB active configuration. 
""" - import os - - import dask - from dask.distributed import Client, LocalCluster - from disdrodb.l0.l0_processing import run_l0b_station + from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster base_dir = parse_base_dir(base_dir) # -------------------------------------------------------------------------. # If parallel=True, set the dask environment if parallel: - # Set HDF5_USE_FILE_LOCKING to avoid going stuck with HDF - os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" - # Retrieve the number of process to run - available_workers = os.cpu_count() - 2 # if not set, all CPUs - num_workers = dask.config.get("num_workers", available_workers) - # Create dask.distributed local cluster - cluster = LocalCluster( - n_workers=num_workers, - threads_per_worker=1, - processes=True, - # memory_limit='8GB', - # silence_logs=False, - ) - Client(cluster) + cluster, client = initialize_dask_cluster() # -------------------------------------------------------------------------. run_l0b_station( @@ -131,4 +115,4 @@ def disdrodb_run_l0b_station( # -------------------------------------------------------------------------. # Close the cluster if parallel: - cluster.close() + close_dask_cluster(cluster, client) diff --git a/disdrodb/cli/disdrodb_run_l0c.py b/disdrodb/cli/disdrodb_run_l0c.py new file mode 100644 index 00000000..40bf2b22 --- /dev/null +++ b/disdrodb/cli/disdrodb_run_l0c.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Script to run the DISDRODB L0C processing.""" +import sys +from typing import Optional + +import click + +from disdrodb.utils.cli import ( + click_base_dir_option, + click_processing_options, + click_remove_l0b_option, + click_stations_options, + parse_arg_to_list, + parse_base_dir, +) + +sys.tracebacklimit = 0 # avoid full traceback error if occur + + +@click.command() +@click_stations_options +@click_processing_options +@click_remove_l0b_option +@click_base_dir_option +def disdrodb_run_l0c( + # L0 disdrodb stations options + data_sources: Optional[str] = None, + campaign_names: Optional[str] = None, + station_names: Optional[str] = None, + # L0C processing options + remove_l0b: bool = False, + # Processing options + force: bool = False, + verbose: bool = True, + parallel: bool = True, + debugging_mode: bool = False, + base_dir: Optional[str] = None, +): + """ + Run the L0C processing of DISDRODB stations. + + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB stations, it runs the processing of the + stations matching the provided data_sources, campaign_names and station_names. + + Parameters + ---------- + data_sources : str + Name of data source(s) to process. 
+ The name(s) must be UPPER CASE. + If campaign_names and station are not specified, process all stations. + To specify multiple data sources, write i.e.: --data_sources 'GPM EPFL NCAR' + campaign_names : str + Name of the campaign(s) to process. + The name(s) must be UPPER CASE. + To specify multiple campaigns, write i.e.: --campaign_names 'IPEX IMPACTS' + station_names : str + Station names. + To specify multiple stations, write i.e.: --station_names 'station1 station2' + force : bool + If True, overwrite existing data into destination directories. + If False, raise an error if there are already data into destination directories. + The default is False. + verbose : bool + Whether to print detailed processing information into terminal. + The default is True. + parallel : bool + If True, the files are processed simultaneously in multiple processes. + Each process will use a single thread to avoid issues with the HDF/netCDF library. + By default, the number of process is defined with os.cpu_count(). + However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l0b + If False, the files are processed sequentially in a single process. + If False, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If True, it reduces the amount of data to process. + It processes just the first 100 rows of 3 L0A files for each station. + The default is False. + remove_l0b: bool, optional + Whether to remove the processed L0B files. The default is ``False``. + base_dir : str + Base directory of DISDRODB + Format: <...>/DISDRODB + If not specified, uses path specified in the DISDRODB active configuration. + """ + from disdrodb.routines import run_disdrodb_l0c + + # Parse data_sources, campaign_names and station arguments + base_dir = parse_base_dir(base_dir) + data_sources = parse_arg_to_list(data_sources) + campaign_names = parse_arg_to_list(campaign_names) + station_names = parse_arg_to_list(station_names) + + # Run processing + run_disdrodb_l0c( + base_dir=base_dir, + data_sources=data_sources, + campaign_names=campaign_names, + station_names=station_names, + # L0C processing options + remove_l0b=remove_l0b, + # Processing options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) diff --git a/disdrodb/cli/disdrodb_run_l0c_station.py b/disdrodb/cli/disdrodb_run_l0c_station.py new file mode 100644 index 00000000..0e3d699e --- /dev/null +++ b/disdrodb/cli/disdrodb_run_l0c_station.py @@ -0,0 +1,122 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. 
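Note: the archive-level commands above (disdrodb_run_l0, disdrodb_run_l0a, disdrodb_run_l0b, disdrodb_run_l0c, and the L1/L2 ones below) all pass their data_sources, campaign_names and station_names options through parse_arg_to_list from disdrodb.utils.cli, a module not shown in this diff. A minimal sketch of the assumed behavior, inferred only from the docstring examples such as --data_sources 'GPM EPFL NCAR':

from typing import Optional


def parse_arg_to_list(arg: Optional[str]) -> Optional[list]:
    """Split a space-separated CLI option into a list of names (assumed behavior, not the actual helper)."""
    if arg is None or arg == "":
        return None  # no filter requested: process all matching stations
    return arg.split()  # "GPM EPFL NCAR" -> ["GPM", "EPFL", "NCAR"]

The real helper may normalize the input differently; the sketch only illustrates why the docstrings ask for a single quoted, space-separated string.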
+"""Script to run the DISDRODB L0C station processing.""" +import sys +from typing import Optional + +import click + +from disdrodb.utils.cli import ( + click_base_dir_option, + click_processing_options, + click_remove_l0b_option, + click_station_arguments, + parse_base_dir, +) + +sys.tracebacklimit = 0 # avoid full traceback error if occur + +# -------------------------------------------------------------------------. +# Click Command Line Interface decorator + + +@click.command() +@click_station_arguments +@click_processing_options +@click_remove_l0b_option +@click_base_dir_option +def disdrodb_run_l0c_station( + # Station arguments + data_source: str, + campaign_name: str, + station_name: str, + # L0C processing options + remove_l0b: bool = False, + # Processing options + force: bool = False, + verbose: bool = True, + parallel: bool = True, + debugging_mode: bool = False, + base_dir: Optional[str] = None, +): + """Run the L0C processing of a specific DISDRODB station from the terminal. + + Parameters + ---------- + data_source : str + Institution name (when campaign data spans more than 1 country), + or country (when all campaigns (or sensor networks) are inside a given country). + Must be UPPER CASE. + campaign_name : str + Campaign name. Must be UPPER CASE. + station_name : str + Station name + force : bool + If True, overwrite existing data into destination directories. + If False, raise an error if there are already data into destination directories. + The default is False. + verbose : bool + Whether to print detailed processing information into terminal. + The default is True. + parallel : bool + If True, the files are processed simultaneously in multiple processes. + Each process will use a single thread to avoid issues with the HDF/netCDF library. + By default, the number of process is defined with os.cpu_count(). + However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l0b_station + If False, the files are processed sequentially in a single process. + If False, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If True, it reduces the amount of data to process. + It processes just the first 100 rows of 3 L0A files. + The default is False. + remove_l0b: bool, optional + Whether to remove the processed L0B files. The default is ``False``. + base_dir : str + Base directory of DISDRODB + Format: <...>/DISDRODB + If not specified, uses path specified in the DISDRODB active configuration. + """ + from disdrodb.l0.l0_processing import run_l0c_station + from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster + + base_dir = parse_base_dir(base_dir) + + # -------------------------------------------------------------------------. + # If parallel=True, set the dask environment + if parallel: + cluster, client = initialize_dask_cluster() + + # -------------------------------------------------------------------------. + run_l0c_station( + # Station arguments + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # L0C processing options + remove_l0b=remove_l0b, + # Processing options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + base_dir=base_dir, + ) + + # -------------------------------------------------------------------------. 
+ # Close the cluster + if parallel: + close_dask_cluster(cluster, client) diff --git a/disdrodb/l0/scripts/disdrodb_run_l0b_concat.py b/disdrodb/cli/disdrodb_run_l1.py similarity index 60% rename from disdrodb/l0/scripts/disdrodb_run_l0b_concat.py rename to disdrodb/cli/disdrodb_run_l1.py index 46199e54..9458e5a2 100644 --- a/disdrodb/l0/scripts/disdrodb_run_l0b_concat.py +++ b/disdrodb/cli/disdrodb_run_l1.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # -----------------------------------------------------------------------------. # Copyright (c) 2021-2023 DISDRODB developers # @@ -14,38 +15,45 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . # -----------------------------------------------------------------------------. -"""Script to concatenate the DISDRODB L0B files.""" +"""Script to run the DISDRODB L1B processing.""" import sys from typing import Optional import click -from disdrodb.l0.routines import ( - click_l0_stations_options, - click_l0b_concat_options, +from disdrodb.utils.cli import ( + click_base_dir_option, + click_processing_options, + click_stations_options, + parse_arg_to_list, + parse_base_dir, ) -from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir sys.tracebacklimit = 0 # avoid full traceback error if occur @click.command() -@click_l0_stations_options -@click_l0b_concat_options +@click_stations_options +@click_processing_options @click_base_dir_option -def disdrodb_run_l0b_concat( +def disdrodb_run_l1( + # Stations options data_sources: Optional[str] = None, campaign_names: Optional[str] = None, station_names: Optional[str] = None, - remove_l0b: bool = False, + # Processing options + force: bool = False, verbose: bool = True, + parallel: bool = True, + debugging_mode: bool = False, base_dir: Optional[str] = None, ): - """Run the L0B concatenation of available DISDRODB stations. + """ + Run the L1 processing of DISDRODB stations. - This function allow to launch the processing of many DISDRODB stations with a single command. - From the list of all available DISDRODB stations, it runs the processing of the - stations matching the provided data_sources, campaign_names and station_names. + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB stations, it runs the processing + of the stations matching the provided data_sources, campaign_names and station_names. Parameters ---------- @@ -61,18 +69,30 @@ def disdrodb_run_l0b_concat( station_names : str Station names. To specify multiple stations, write i.e.: --station_names 'station1 station2' - remove_l0b : bool - If true, remove all source L0B files once L0B concatenation is terminated. + force : bool + If True, overwrite existing data into destination directories. + If False, raise an error if there are already data into destination directories. The default is False. verbose : bool Whether to print detailed processing information into terminal. The default is False. + parallel : bool + If True, the files are processed simultaneously in multiple processes. + Each process will use a single thread. + By default, the number of process is defined with os.cpu_count(). + However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l0a + If False, the files are processed sequentially in a single process. + If False, multi-threading is automatically exploited to speed up I/0 tasks. 
+ debugging_mode : bool + If True, it reduces the amount of data to process. + It processes just the first 3 raw data files for each station. + The default is False. base_dir : str Base directory of DISDRODB Format: <...>/DISDRODB If not specified, uses path specified in the DISDRODB active configuration. """ - from disdrodb.l0.routines import run_disdrodb_l0b_concat + from disdrodb.l1.routines import run_disdrodb_l1 # Parse data_sources, campaign_names and station arguments base_dir = parse_base_dir(base_dir) @@ -80,12 +100,15 @@ def disdrodb_run_l0b_concat( campaign_names = parse_arg_to_list(campaign_names) station_names = parse_arg_to_list(station_names) - # Run concatenation - run_disdrodb_l0b_concat( + # Run processing + run_disdrodb_l1( base_dir=base_dir, data_sources=data_sources, campaign_names=campaign_names, station_names=station_names, - remove_l0b=remove_l0b, + # Processing options + force=force, verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, ) diff --git a/disdrodb/l0/scripts/disdrodb_run_l0b_concat_station.py b/disdrodb/cli/disdrodb_run_l1_station.py similarity index 51% rename from disdrodb/l0/scripts/disdrodb_run_l0b_concat_station.py rename to disdrodb/cli/disdrodb_run_l1_station.py index 8da14e10..b91e8c18 100644 --- a/disdrodb/l0/scripts/disdrodb_run_l0b_concat_station.py +++ b/disdrodb/cli/disdrodb_run_l1_station.py @@ -14,40 +14,43 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . # -----------------------------------------------------------------------------. -"""Script to concatenate the DISDRODB L0B station files.""" -################################################## -## Wrapper to concat L0B files by command lines ## -################################################## +"""Script to run the DISDRODB L1 station processing.""" import sys from typing import Optional import click -from disdrodb.l0.routines import click_l0b_concat_options -from disdrodb.utils.scripts import ( +from disdrodb.utils.cli import ( click_base_dir_option, + click_processing_options, click_station_arguments, parse_base_dir, ) sys.tracebacklimit = 0 # avoid full traceback error if occur +# -------------------------------------------------------------------------. +# Click Command Line Interface decorator + @click.command() @click_station_arguments -@click_l0b_concat_options +@click_processing_options @click_base_dir_option -def disdrodb_run_l0b_concat_station( +def disdrodb_run_l1_station( # Station arguments data_source: str, campaign_name: str, station_name: str, - # L0B concat options - remove_l0b=False, - verbose=True, + # Processing options + force: bool = False, + verbose: bool = False, + parallel: bool = True, + debugging_mode: bool = False, base_dir: Optional[str] = None, ): - """Concatenation all L0B files of a specific DISDRODB station into a single netCDF. + """ + Run the L1 processing of a specific DISDRODB station from the terminal. Parameters ---------- @@ -59,28 +62,54 @@ def disdrodb_run_l0b_concat_station( Campaign name. Must be UPPER CASE. station_name : str Station name - remove_l0b : bool - If true, remove all source L0B files once L0B concatenation is terminated. + force : bool + If True, overwrite existing data into destination directories. + If False, raise an error if there are already data into destination directories. The default is False. verbose : bool Whether to print detailed processing information into terminal. + The default is True. 
+ parallel : bool + If True, the files are processed simultaneously in multiple processes. + Each process will use a single thread. + By default, the number of process is defined with os.cpu_count(). + However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l0a_station + If False, the files are processed sequentially in a single process. + If False, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If True, it reduces the amount of data to process. + It processes just the first 3 raw data files. The default is False. base_dir : str - Base directory of DISDRODB + Base directory of DISDRODB. Format: <...>/DISDRODB If not specified, uses path specified in the DISDRODB active configuration. """ - from disdrodb.l0.l0_processing import run_l0b_concat_station + from disdrodb.l1.routines import run_l1_station + from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster base_dir = parse_base_dir(base_dir) - run_l0b_concat_station( + # -------------------------------------------------------------------------. + # If parallel=True, set the dask environment + if parallel: + cluster, client = initialize_dask_cluster() + + # -------------------------------------------------------------------------. + run_l1_station( # Station arguments data_source=data_source, campaign_name=campaign_name, station_name=station_name, # Processing options - remove_l0b=remove_l0b, + force=force, verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, base_dir=base_dir, ) + + # -------------------------------------------------------------------------. + # Close the cluster + if parallel: + close_dask_cluster(cluster, client) diff --git a/disdrodb/cli/disdrodb_run_l2e.py b/disdrodb/cli/disdrodb_run_l2e.py new file mode 100644 index 00000000..8026d7f7 --- /dev/null +++ b/disdrodb/cli/disdrodb_run_l2e.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. 
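Note: every station-level command in this patch now delegates its dask setup to initialize_dask_cluster and close_dask_cluster from disdrodb.utils.dask, a module not included in this diff. A minimal sketch of what these helpers presumably do, reconstructed from the inline code removed in the disdrodb_run_l0a_station and disdrodb_run_l0b_station hunks above (the shutdown logic is an assumption):

import os

import dask
from dask.distributed import Client, LocalCluster


def initialize_dask_cluster():
    """Create a local dask cluster and client (sketch based on the removed inline code)."""
    # Avoid HDF5 file-locking deadlocks when many processes write netCDF files
    os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
    # Honor DASK_NUM_WORKERS if set, otherwise leave two CPUs free
    available_workers = os.cpu_count() - 2
    num_workers = dask.config.get("num_workers", available_workers)
    cluster = LocalCluster(n_workers=num_workers, threads_per_worker=1, processes=True)
    client = Client(cluster)
    return cluster, client


def close_dask_cluster(cluster, client):
    """Shut down the dask client and its local cluster."""
    client.close()
    cluster.close()

Centralizing this removes the copy-pasted cluster setup from each CLI script and gives a single place to tune worker counts or memory limits.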
+"""Script to run the DISDRODB L2E processing.""" +import sys +from typing import Optional + +import click + +from disdrodb.utils.cli import ( + click_base_dir_option, + click_processing_options, + click_stations_options, + parse_arg_to_list, + parse_base_dir, +) + +sys.tracebacklimit = 0 # avoid full traceback error if occur + + +@click.command() +@click_stations_options +@click_processing_options +@click_base_dir_option +def disdrodb_run_l2e( + # Stations options + data_sources: Optional[str] = None, + campaign_names: Optional[str] = None, + station_names: Optional[str] = None, + # Processing options + force: bool = False, + verbose: bool = True, + parallel: bool = True, + debugging_mode: bool = False, + base_dir: Optional[str] = None, +): + """ + Run the L2E processing of DISDRODB stations. + + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB stations, it runs the processing + of the stations matching the provided data_sources, campaign_names and station_names. + + Parameters + ---------- + data_sources : str + Name of data source(s) to process. + The name(s) must be UPPER CASE. + If campaign_names and station are not specified, process all stations. + To specify multiple data sources, write i.e.: --data_sources 'GPM EPFL NCAR' + campaign_names : str + Name of the campaign(s) to process. + The name(s) must be UPPER CASE. + To specify multiple campaigns, write i.e.: --campaign_names 'IPEX IMPACTS' + station_names : str + Station names. + To specify multiple stations, write i.e.: --station_names 'station1 station2' + force : bool + If True, overwrite existing data into destination directories. + If False, raise an error if there are already data into destination directories. + The default is False. + verbose : bool + Whether to print detailed processing information into terminal. + The default is False. + parallel : bool + If True, the files are processed simultaneously in multiple processes. + Each process will use a single thread. + By default, the number of process is defined with os.cpu_count(). + However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l0a + If False, the files are processed sequentially in a single process. + If False, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If True, it reduces the amount of data to process. + It processes just the first 3 raw data files for each station. + The default is False. + base_dir : str + Base directory of DISDRODB + Format: <...>/DISDRODB + If not specified, uses path specified in the DISDRODB active configuration. + """ + from disdrodb.routines import run_disdrodb_l2e + + # Parse data_sources, campaign_names and station arguments + base_dir = parse_base_dir(base_dir) + data_sources = parse_arg_to_list(data_sources) + campaign_names = parse_arg_to_list(campaign_names) + station_names = parse_arg_to_list(station_names) + + # Run processing + run_disdrodb_l2e( + base_dir=base_dir, + data_sources=data_sources, + campaign_names=campaign_names, + station_names=station_names, + # Processing options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) diff --git a/disdrodb/cli/disdrodb_run_l2e_station.py b/disdrodb/cli/disdrodb_run_l2e_station.py new file mode 100644 index 00000000..0fb5a81f --- /dev/null +++ b/disdrodb/cli/disdrodb_run_l2e_station.py @@ -0,0 +1,115 @@ +# -----------------------------------------------------------------------------. 
+# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Script to run the DISDRODB L2E station processing.""" +import sys +from typing import Optional + +import click + +from disdrodb.utils.cli import ( + click_base_dir_option, + click_processing_options, + click_station_arguments, + parse_base_dir, +) + +sys.tracebacklimit = 0 # avoid full traceback error if occur + +# -------------------------------------------------------------------------. +# Click Command Line Interface decorator + + +@click.command() +@click_station_arguments +@click_processing_options +@click_base_dir_option +def disdrodb_run_l2e_station( + # Station arguments + data_source: str, + campaign_name: str, + station_name: str, + # Processing options + force: bool = False, + verbose: bool = False, + parallel: bool = True, + debugging_mode: bool = False, + base_dir: Optional[str] = None, +): + """ + Run the L2E processing of a specific DISDRODB station from the terminal. + + Parameters + ---------- + data_source : str + Institution name (when campaign data spans more than 1 country), + or country (when all campaigns (or sensor networks) are inside a given country). + Must be UPPER CASE. + campaign_name : str + Campaign name. Must be UPPER CASE. + station_name : str + Station name + force : bool + If True, overwrite existing data into destination directories. + If False, raise an error if there are already data into destination directories. + The default is False. + verbose : bool + Whether to print detailed processing information into terminal. + The default is True. + parallel : bool + If True, the files are processed simultaneously in multiple processes. + Each process will use a single thread. + By default, the number of process is defined with os.cpu_count(). + However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l0a_station + If False, the files are processed sequentially in a single process. + If False, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If True, it reduces the amount of data to process. + It processes just the first 3 raw data files. + The default is False. + base_dir : str + Base directory of DISDRODB. + Format: <...>/DISDRODB + If not specified, uses path specified in the DISDRODB active configuration. + """ + from disdrodb.l2.routines import run_l2e_station + from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster + + base_dir = parse_base_dir(base_dir) + + # -------------------------------------------------------------------------. + # If parallel=True, set the dask environment + if parallel: + cluster, client = initialize_dask_cluster() + + # -------------------------------------------------------------------------. 
+ run_l2e_station( + # Station arguments + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Processing options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + base_dir=base_dir, + ) + + # -------------------------------------------------------------------------. + # Close the cluster + if parallel: + close_dask_cluster(cluster, client) diff --git a/disdrodb/cli/disdrodb_run_l2m.py b/disdrodb/cli/disdrodb_run_l2m.py new file mode 100644 index 00000000..ca00c71a --- /dev/null +++ b/disdrodb/cli/disdrodb_run_l2m.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Script to run the DISDRODB L2M processing.""" +import sys +from typing import Optional + +import click + +from disdrodb.utils.cli import ( + click_base_dir_option, + click_processing_options, + click_stations_options, + parse_arg_to_list, + parse_base_dir, +) + +sys.tracebacklimit = 0 # avoid full traceback error if occur + + +@click.command() +@click_stations_options +@click_processing_options +@click_base_dir_option +def disdrodb_run_l2m( + # Stations options + data_sources: Optional[str] = None, + campaign_names: Optional[str] = None, + station_names: Optional[str] = None, + # Processing options + force: bool = False, + verbose: bool = True, + parallel: bool = True, + debugging_mode: bool = False, + base_dir: Optional[str] = None, +): + """ + Run the L2M processing of DISDRODB stations. + + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB stations, it runs the processing + of the stations matching the provided data_sources, campaign_names and station_names. + + Parameters + ---------- + data_sources : str + Name of data source(s) to process. + The name(s) must be UPPER CASE. + If campaign_names and station are not specified, process all stations. + To specify multiple data sources, write i.e.: --data_sources 'GPM EPFL NCAR' + campaign_names : str + Name of the campaign(s) to process. + The name(s) must be UPPER CASE. + To specify multiple campaigns, write i.e.: --campaign_names 'IPEX IMPACTS' + station_names : str + Station names. + To specify multiple stations, write i.e.: --station_names 'station1 station2' + force : bool + If True, overwrite existing data into destination directories. + If False, raise an error if there are already data into destination directories. + The default is False. + verbose : bool + Whether to print detailed processing information into terminal. + The default is False. + parallel : bool + If True, the files are processed simultaneously in multiple processes. + Each process will use a single thread. 
+ By default, the number of process is defined with os.cpu_count(). + However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l0a + If False, the files are processed sequentially in a single process. + If False, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If True, it reduces the amount of data to process. + It processes just the first 3 raw data files for each station. + The default is False. + base_dir : str + Base directory of DISDRODB + Format: <...>/DISDRODB + If not specified, uses path specified in the DISDRODB active configuration. + """ + from disdrodb.routines import run_disdrodb_l2m + + # Parse data_sources, campaign_names and station arguments + base_dir = parse_base_dir(base_dir) + data_sources = parse_arg_to_list(data_sources) + campaign_names = parse_arg_to_list(campaign_names) + station_names = parse_arg_to_list(station_names) + + # Run processing + run_disdrodb_l2m( + base_dir=base_dir, + data_sources=data_sources, + campaign_names=campaign_names, + station_names=station_names, + # Processing options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) diff --git a/disdrodb/cli/disdrodb_run_l2m_station.py b/disdrodb/cli/disdrodb_run_l2m_station.py new file mode 100644 index 00000000..3e1ed86f --- /dev/null +++ b/disdrodb/cli/disdrodb_run_l2m_station.py @@ -0,0 +1,115 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Script to run the DISDRODB L2M station processing.""" +import sys +from typing import Optional + +import click + +from disdrodb.utils.cli import ( + click_base_dir_option, + click_processing_options, + click_station_arguments, + parse_base_dir, +) + +sys.tracebacklimit = 0 # avoid full traceback error if occur + +# -------------------------------------------------------------------------. +# Click Command Line Interface decorator + + +@click.command() +@click_station_arguments +@click_processing_options +@click_base_dir_option +def disdrodb_run_l2m_station( + # Station arguments + data_source: str, + campaign_name: str, + station_name: str, + # Processing options + force: bool = False, + verbose: bool = False, + parallel: bool = True, + debugging_mode: bool = False, + base_dir: Optional[str] = None, +): + """ + Run the L2M processing of a specific DISDRODB station from the terminal. + + Parameters + ---------- + data_source : str + Institution name (when campaign data spans more than 1 country), + or country (when all campaigns (or sensor networks) are inside a given country). + Must be UPPER CASE. + campaign_name : str + Campaign name. Must be UPPER CASE. + station_name : str + Station name + force : bool + If True, overwrite existing data into destination directories. 
+ If False, raise an error if there are already data into destination directories. + The default is False. + verbose : bool + Whether to print detailed processing information into terminal. + The default is True. + parallel : bool + If True, the files are processed simultaneously in multiple processes. + Each process will use a single thread. + By default, the number of process is defined with os.cpu_count(). + However, you can customize it by typing: DASK_NUM_WORKERS=4 disdrodb_run_l0a_station + If False, the files are processed sequentially in a single process. + If False, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If True, it reduces the amount of data to process. + It processes just the first 3 raw data files. + The default is False. + base_dir : str + Base directory of DISDRODB. + Format: <...>/DISDRODB + If not specified, uses path specified in the DISDRODB active configuration. + """ + from disdrodb.l2.routines import run_l2m_station + from disdrodb.utils.dask import close_dask_cluster, initialize_dask_cluster + + base_dir = parse_base_dir(base_dir) + + # -------------------------------------------------------------------------. + # If parallel=True, set the dask environment + if parallel: + cluster, client = initialize_dask_cluster() + + # -------------------------------------------------------------------------. + run_l2m_station( + # Station arguments + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Processing options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + base_dir=base_dir, + ) + + # -------------------------------------------------------------------------. + # Close the cluster + if parallel: + close_dask_cluster(cluster, client) diff --git a/disdrodb/data_transfer/scripts/disdrodb_upload_archive.py b/disdrodb/cli/disdrodb_upload_archive.py similarity index 97% rename from disdrodb/data_transfer/scripts/disdrodb_upload_archive.py rename to disdrodb/cli/disdrodb_upload_archive.py index 0107169d..97bf8d28 100644 --- a/disdrodb/data_transfer/scripts/disdrodb_upload_archive.py +++ b/disdrodb/cli/disdrodb_upload_archive.py @@ -24,7 +24,7 @@ import click from disdrodb.data_transfer.upload_data import click_upload_archive_options, click_upload_options -from disdrodb.utils.scripts import click_base_dir_option, parse_arg_to_list, parse_base_dir +from disdrodb.utils.cli import click_base_dir_option, parse_arg_to_list, parse_base_dir sys.tracebacklimit = 0 # avoid full traceback error if occur diff --git a/disdrodb/data_transfer/scripts/disdrodb_upload_station.py b/disdrodb/cli/disdrodb_upload_station.py similarity index 96% rename from disdrodb/data_transfer/scripts/disdrodb_upload_station.py rename to disdrodb/cli/disdrodb_upload_station.py index 754a8f7e..188ff5c9 100644 --- a/disdrodb/data_transfer/scripts/disdrodb_upload_station.py +++ b/disdrodb/cli/disdrodb_upload_station.py @@ -24,7 +24,7 @@ import click from disdrodb.data_transfer.upload_data import click_upload_options -from disdrodb.utils.scripts import click_base_dir_option, click_station_arguments, parse_base_dir +from disdrodb.utils.cli import click_base_dir_option, click_station_arguments, parse_base_dir sys.tracebacklimit = 0 # avoid full traceback error if occur diff --git a/disdrodb/data_transfer/download_data.py b/disdrodb/data_transfer/download_data.py index a5c03f52..8b6ea512 100644 --- a/disdrodb/data_transfer/download_data.py +++ b/disdrodb/data_transfer/download_data.py @@ 
-207,9 +207,7 @@ def download_station( def _is_valid_disdrodb_data_url(disdrodb_data_url): """Check if it is a valid disdrodb_data_url.""" - if isinstance(disdrodb_data_url, str) and len(disdrodb_data_url) > 10: - return True - return False + return isinstance(disdrodb_data_url, str) and len(disdrodb_data_url) > 10 def _has_disdrodb_data_url(metadata_filepath): diff --git a/disdrodb/issue/checks.py b/disdrodb/issue/checks.py index b28aafc6..8ac4ebb0 100644 --- a/disdrodb/issue/checks.py +++ b/disdrodb/issue/checks.py @@ -35,8 +35,7 @@ def _is_numpy_array_string(arr): arr : numpy array Numpy array to check. """ - dtype = arr.dtype.type - return dtype in (np.str_, np.unicode_) + return np.issubdtype(arr.dtype, np.str_) def _is_numpy_array_datetime(arr): @@ -52,7 +51,7 @@ def _is_numpy_array_datetime(arr): numpy array Numpy array checked. """ - return arr.dtype.type == np.datetime64 + return np.issubdtype(arr.dtype, np.datetime64) def _check_timestep_datetime_accuracy(timesteps, unit="s"): diff --git a/disdrodb/l0/__init__.py b/disdrodb/l0/__init__.py index cc1b9f11..bbd54d92 100644 --- a/disdrodb/l0/__init__.py +++ b/disdrodb/l0/__init__.py @@ -3,7 +3,7 @@ run_l0b_from_nc, ) from disdrodb.l0.l0_reader import available_readers -from disdrodb.l0.routines import ( +from disdrodb.routines import ( run_disdrodb_l0, run_disdrodb_l0_station, run_disdrodb_l0a, diff --git a/disdrodb/l0/configs/OTT_Parsivel/l0b_encodings.yml b/disdrodb/l0/configs/OTT_Parsivel/l0b_encodings.yml index d8b04830..87c77956 100644 --- a/disdrodb/l0/configs/OTT_Parsivel/l0b_encodings.yml +++ b/disdrodb/l0/configs/OTT_Parsivel/l0b_encodings.yml @@ -34,7 +34,7 @@ weather_code_synop_4677: _FillValue: 255 weather_code_metar_4678: dtype: str - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -42,7 +42,7 @@ weather_code_metar_4678: chunksizes: 5000 weather_code_nws: dtype: str - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -103,7 +103,7 @@ sensor_temperature: _FillValue: 255 sensor_serial_number: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -111,7 +111,7 @@ sensor_serial_number: chunksizes: 5000 firmware_iop: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -119,7 +119,7 @@ firmware_iop: chunksizes: 5000 firmware_dsp: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -152,7 +152,7 @@ sensor_status: _FillValue: 255 start_time: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -160,7 +160,7 @@ start_time: chunksizes: 5000 sensor_time: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -168,7 +168,7 @@ sensor_time: chunksizes: 5000 sensor_date: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -176,7 +176,7 @@ sensor_date: chunksizes: 5000 station_name: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -184,7 +184,7 @@ station_name: chunksizes: 5000 station_number: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false diff --git a/disdrodb/l0/configs/OTT_Parsivel/raw_data_format.yml b/disdrodb/l0/configs/OTT_Parsivel/raw_data_format.yml index 192e7e2a..459764af 100644 --- a/disdrodb/l0/configs/OTT_Parsivel/raw_data_format.yml +++ b/disdrodb/l0/configs/OTT_Parsivel/raw_data_format.yml @@ -294,7 +294,7 @@ raw_drop_average_velocity: data_range: null nan_flags: null dimension_order: 
- - velocity_bin_center + - diameter_bin_center n_values: 32 field_number: "91" raw_drop_number: diff --git a/disdrodb/l0/configs/OTT_Parsivel2/l0b_encodings.yml b/disdrodb/l0/configs/OTT_Parsivel2/l0b_encodings.yml index 88d4a6ed..fbb8c1e1 100644 --- a/disdrodb/l0/configs/OTT_Parsivel2/l0b_encodings.yml +++ b/disdrodb/l0/configs/OTT_Parsivel2/l0b_encodings.yml @@ -15,14 +15,14 @@ rainfall_accumulated_32bit: contiguous: false chunksizes: 5000 weather_code_synop_4680: - dtype: uint32 + dtype: uint8 zlib: true complevel: 3 shuffle: true fletcher32: false contiguous: false chunksizes: 5000 - _FillValue: 4294967295 + _FillValue: 255 weather_code_synop_4677: dtype: uint32 zlib: true @@ -34,7 +34,7 @@ weather_code_synop_4677: _FillValue: 4294967295 weather_code_metar_4678: dtype: str - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -42,7 +42,7 @@ weather_code_metar_4678: chunksizes: 5000 weather_code_nws: dtype: str - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -103,7 +103,7 @@ sensor_temperature: _FillValue: 127 sensor_serial_number: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -111,7 +111,7 @@ sensor_serial_number: chunksizes: 5000 firmware_iop: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -119,7 +119,7 @@ firmware_iop: chunksizes: 5000 firmware_dsp: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -152,7 +152,7 @@ sensor_status: _FillValue: 255 start_time: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -160,7 +160,7 @@ start_time: chunksizes: 5000 sensor_time: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -168,7 +168,7 @@ sensor_time: chunksizes: 5000 sensor_date: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -176,7 +176,7 @@ sensor_date: chunksizes: 5000 station_name: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -184,7 +184,7 @@ station_name: chunksizes: 5000 station_number: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -293,7 +293,7 @@ number_particles_all: _FillValue: 4294967295 list_particles: dtype: object - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false diff --git a/disdrodb/l0/configs/OTT_Parsivel2/raw_data_format.yml b/disdrodb/l0/configs/OTT_Parsivel2/raw_data_format.yml index 5c271483..baf965ca 100644 --- a/disdrodb/l0/configs/OTT_Parsivel2/raw_data_format.yml +++ b/disdrodb/l0/configs/OTT_Parsivel2/raw_data_format.yml @@ -364,7 +364,7 @@ raw_drop_average_velocity: data_range: null nan_flags: null dimension_order: - - velocity_bin_center + - diameter_bin_center n_values: 32 field_number: "91" raw_drop_number: diff --git a/disdrodb/l0/configs/RD_80/l0a_encodings.yml b/disdrodb/l0/configs/RD_80/l0a_encodings.yml index 775d637b..c4399588 100644 --- a/disdrodb/l0/configs/RD_80/l0a_encodings.yml +++ b/disdrodb/l0/configs/RD_80/l0a_encodings.yml @@ -1,5 +1,5 @@ sensor_status: "float32" # 'int8' -interval: "float32" # 'uint8' +sample_interval: "float32" # 'uint8' RI: "float32" RA: "float32" RAT: "float32" diff --git a/disdrodb/l0/configs/RD_80/l0b_cf_attrs.yml b/disdrodb/l0/configs/RD_80/l0b_cf_attrs.yml index 95baceae..14b515c2 100644 --- a/disdrodb/l0/configs/RD_80/l0b_cf_attrs.yml +++ b/disdrodb/l0/configs/RD_80/l0b_cf_attrs.yml @@ -2,7 +2,7 @@ sensor_status: description: 
Sensor status long_name: Sensor status units: "" -interval: +sample_interval: description: Time interval between measurement long_name: Time interval between measurement units: s diff --git a/disdrodb/l0/configs/RD_80/l0b_encodings.yml b/disdrodb/l0/configs/RD_80/l0b_encodings.yml index bc6a63a0..3ae873cb 100644 --- a/disdrodb/l0/configs/RD_80/l0b_encodings.yml +++ b/disdrodb/l0/configs/RD_80/l0b_encodings.yml @@ -7,7 +7,7 @@ sensor_status: contiguous: false chunksizes: 5000 _FillValue: 255 -interval: +sample_interval: dtype: uint8 zlib: true complevel: 3 diff --git a/disdrodb/l0/configs/RD_80/raw_data_format.yml b/disdrodb/l0/configs/RD_80/raw_data_format.yml index 0b1cf856..3f82e120 100644 --- a/disdrodb/l0/configs/RD_80/raw_data_format.yml +++ b/disdrodb/l0/configs/RD_80/raw_data_format.yml @@ -25,7 +25,7 @@ sensor_status: - 0 - 1 field_number: "03" -interval: +sample_interval: n_digits: 4 n_characters: 4 n_decimals: 4 diff --git a/disdrodb/l0/configs/Thies_LPM/l0b_encodings.yml b/disdrodb/l0/configs/Thies_LPM/l0b_encodings.yml index ce1f50be..5851af0b 100644 --- a/disdrodb/l0/configs/Thies_LPM/l0b_encodings.yml +++ b/disdrodb/l0/configs/Thies_LPM/l0b_encodings.yml @@ -18,7 +18,7 @@ device_address: _FillValue: 255 # sensor_serial_number: # dtype: uint16 -# zlib: true +# zlib: false # complevel: 3 # shuffle: true # fletcher32: false @@ -27,7 +27,7 @@ device_address: # _FillValue: 65535 # software_version: # dtype: float32 -# zlib: true +# zlib: false # complevel: 3 # shuffle: true # fletcher32: false @@ -35,7 +35,7 @@ device_address: # chunksizes: 5000 # sensor_date: # dtype: object -# zlib: true +# zlib: false # complevel: 3 # shuffle: true # fletcher32: false @@ -43,7 +43,7 @@ device_address: # chunksizes: 5000 # sensor_time: # dtype: object -# zlib: true +# zlib: false # complevel: 3 # shuffle: true # fletcher32: false @@ -69,7 +69,7 @@ weather_code_synop_4680_5min: _FillValue: 255 weather_code_metar_4678_5min: dtype: str - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false @@ -103,7 +103,7 @@ weather_code_synop_4680: _FillValue: 255 weather_code_metar_4678: dtype: str - zlib: true + zlib: false complevel: 3 shuffle: true fletcher32: false diff --git a/disdrodb/l0/io.py b/disdrodb/l0/io.py index d3a5f141..5a05e6a1 100644 --- a/disdrodb/l0/io.py +++ b/disdrodb/l0/io.py @@ -23,7 +23,7 @@ import pandas as pd -from disdrodb.api.path import define_l0a_station_dir +from disdrodb.api.io import filter_filepaths from disdrodb.utils.directories import list_files from disdrodb.utils.logger import log_info @@ -101,14 +101,6 @@ def _get_available_filepaths(raw_dir, station_name, glob_patterns): return filepaths -def _filter_filepaths(filepaths, debugging_mode): - """Filter out filepaths if ``debugging_mode=True``.""" - if debugging_mode: - max_files = min(3, len(filepaths)) - filepaths = filepaths[0:max_files] - return filepaths - - def get_raw_filepaths(raw_dir, station_name, glob_patterns, verbose=False, debugging_mode=False): """Get the list of files from a directory based on input parameters. @@ -122,7 +114,7 @@ def get_raw_filepaths(raw_dir, station_name, glob_patterns, verbose=False, debug Directory of the campaign where to search for files. Format <..>/DISDRODB/Raw// station_name : str - ID of the station + Name of the station. verbose : bool, optional Whether to verbose the processing. The default is ``False``. 
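The `_filter_filepaths` helper removed above is superseded by `filter_filepaths` imported from `disdrodb.api.io`, whose new body is not included in this diff. Judging from the deleted helper and the unchanged call site, the relocated function presumably behaves like this minimal sketch:

```python
def filter_filepaths(filepaths, debugging_mode):
    """Keep at most 3 file paths when debugging_mode=True (sketch of the relocated helper)."""
    if debugging_mode:
        max_files = min(3, len(filepaths))
        filepaths = filepaths[0:max_files]
    return filepaths
```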
@@ -141,7 +133,7 @@ def get_raw_filepaths(raw_dir, station_name, glob_patterns, verbose=False, debug filepaths = _get_available_filepaths(raw_dir=raw_dir, station_name=station_name, glob_patterns=glob_patterns) # Filter out filepaths if debugging_mode=True - filepaths = _filter_filepaths(filepaths, debugging_mode) + filepaths = filter_filepaths(filepaths, debugging_mode) # Log number of files to process n_files = len(filepaths) @@ -153,40 +145,6 @@ def get_raw_filepaths(raw_dir, station_name, glob_patterns, verbose=False, debug return filepaths -def get_l0a_filepaths(processed_dir, station_name, debugging_mode=False): - """Retrieve L0A files for a give station. - - Parameters - ---------- - processed_dir : str - Directory of the campaign where to search for the L0A files. - Format: ``<..>/DISDRODB/Processed//``. - station_name : str - ID of the station - debugging_mode : bool, optional - If ``True``, it select maximum 3 files for debugging purposes. - The default is ``False``. - - Returns - ------- - filepaths : list - List of L0A file paths. - - """ - station_dir = define_l0a_station_dir(processed_dir, station_name) - filepaths = list_files(station_dir, glob_pattern="*.parquet", recursive=True) - - # Filter out filepaths if debugging_mode=True - filepaths = _filter_filepaths(filepaths, debugging_mode=debugging_mode) - - # If no file available, raise error - if len(filepaths) == 0: - msg = f"No L0A Apache Parquet file is available in {station_dir}. Run L0A processing first." - raise ValueError(msg) - - return filepaths - - ####--------------------------------------------------------------------------. #### DISDRODB L0A product reader @@ -243,14 +201,20 @@ def read_l0a_dataframe( if isinstance(filepaths, str): filepaths = [filepaths] # --------------------------------------------------- - # - If debugging_mode=True, it reads only the first 3 filepaths + # If debugging_mode=True, it reads only the first 3 filepaths if debugging_mode: filepaths = filepaths[0:3] # select first 3 filepaths - # - Define the list of dataframe + # --------------------------------------------------- + # Define the list of dataframe list_df = [_read_l0a(filepath, verbose=verbose, debugging_mode=debugging_mode) for filepath in filepaths] - # - Concatenate dataframe + + # Concatenate dataframe df = concatenate_dataframe(list_df, verbose=verbose) + + # Ensure time is in nanoseconds + df["time"] = df["time"].astype("M8[ns]") + # --------------------------------------------------- # Return dataframe return df diff --git a/disdrodb/l0/l0_processing.py b/disdrodb/l0/l0_processing.py index d126c7af..408d3e15 100644 --- a/disdrodb/l0/l0_processing.py +++ b/disdrodb/l0/l0_processing.py @@ -19,130 +19,114 @@ """Implement DISDRODB L0 processing.""" import datetime -import functools import logging import os -import shutil import time from typing import Optional import dask -import dask.bag as db -import xarray as xr from disdrodb.api.checks import check_sensor_name # Directory from disdrodb.api.create_directories import ( - create_directory_structure, create_l0_directory_structure, + create_logs_directory, + create_product_directory, ) -from disdrodb.api.info import infer_path_info_dict +from disdrodb.api.info import infer_path_info_tuple +from disdrodb.api.io import get_filepaths, get_required_product, remove_product from disdrodb.api.path import ( define_campaign_dir, - define_l0a_filepath, - define_l0b_filepath, - define_l0b_station_dir, - define_station_dir, - get_disdrodb_path, + define_l0a_filename, + define_l0b_filename, 
+ define_l0c_filename, + define_metadata_filepath, ) + +# get_disdrodb_path, from disdrodb.configs import get_base_dir from disdrodb.issue import read_station_issue from disdrodb.l0.io import ( - get_l0a_filepaths, get_raw_filepaths, read_l0a_dataframe, ) from disdrodb.l0.l0_reader import get_station_reader_function +from disdrodb.l0.l0a_processing import ( + process_raw_file, + write_l0a, +) +from disdrodb.l0.l0b_nc_processing import create_l0b_from_raw_nc +from disdrodb.l0.l0b_processing import ( + create_l0b_from_l0a, + set_l0b_encodings, + write_l0b, +) +from disdrodb.l0.l0c_processing import ( + create_daily_file, + get_files_per_days, + retrieve_possible_measurement_intervals, +) from disdrodb.metadata import read_station_metadata -from disdrodb.utils.directories import list_files +from disdrodb.utils.decorator import delayed_if_parallel, single_threaded_if_parallel # Logger from disdrodb.utils.logger import ( close_logger, - create_file_logger, - define_summary_log, + create_logger_file, + create_product_logs, log_error, log_info, - log_warning, ) +# log_warning, +from disdrodb.utils.writer import write_product +from disdrodb.utils.yaml import read_yaml + logger = logging.getLogger(__name__) # -----------------------------------------------------------------------------. #### Creation of L0A and L0B Single Station File -def _delayed_based_on_kwargs(function): - """Decorator to make the function delayed if its ``parallel`` argument is ``True``.""" - - @functools.wraps(function) - def wrapper(*args, **kwargs): - # Check if it must be a delayed function - parallel = kwargs.get("parallel") - # If parallel is True - if parallel: - # Enforce verbose to be False - kwargs["verbose"] = False - # Define the delayed task - result = dask.delayed(function)(*args, **kwargs) - else: - # Else run the function - result = function(*args, **kwargs) - return result - - return wrapper - - -@_delayed_based_on_kwargs +@delayed_if_parallel +@single_threaded_if_parallel def _generate_l0a( filepath, - processed_dir, - station_name, # retrievable from filepath + data_dir, + logs_dir, + campaign_name, + station_name, + # Reader arguments column_names, reader_kwargs, df_sanitizer_fun, + # Processing info + sensor_name, + issue_dict, + # Processing options force, verbose, parallel, - issue_dict=None, ): """Generate L0A file from raw file.""" - from disdrodb.l0.l0a_processing import ( - process_raw_file, - write_l0a, - ) + # Define product + product = "L0A" ##------------------------------------------------------------------------. # Create file logger - if issue_dict is None: - issue_dict = {} filename = os.path.basename(filepath) - logger = create_file_logger( - processed_dir=processed_dir, - product="L0A", - station_name=station_name, + logger, logger_filepath = create_logger_file( + logs_dir=logs_dir, filename=filename, parallel=parallel, ) - # Define logger filepath - # - LogCaptureHandler of pytest does not have baseFilename attribute --> So set None - logger_filepath = logger.handlers[0].baseFilename if not os.environ.get("PYTEST_CURRENT_TEST") else None - ##------------------------------------------------------------------------. # Log start processing - msg = f"L0A processing of {filename} has started." + msg = f"{product} processing of {filename} has started." log_info(logger=logger, msg=msg, verbose=verbose) - ##------------------------------------------------------------------------. 
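The `_delayed_based_on_kwargs` decorator deleted above is replaced by `delayed_if_parallel` and `single_threaded_if_parallel` from `disdrodb.utils.decorator`; their implementations are not part of this diff. A minimal sketch of what `delayed_if_parallel` presumably does, modeled on the deleted decorator:

```python
import functools

import dask


def delayed_if_parallel(function):
    """Wrap the call in dask.delayed when parallel=True, otherwise run it eagerly (sketch)."""

    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        if kwargs.get("parallel"):
            kwargs["verbose"] = False  # silence per-file logs when running inside dask workers
            return dask.delayed(function)(*args, **kwargs)
        return function(*args, **kwargs)

    return wrapper
```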
- # Retrieve metadata - attrs = read_station_metadata(station_name=station_name, product="L0A", **infer_path_info_dict(processed_dir)) - - # Retrieve sensor name - sensor_name = attrs["sensor_name"] - check_sensor_name(sensor_name) - ##------------------------------------------------------------------------. try: #### - Read raw file into a dataframe and sanitize to L0A format @@ -158,7 +142,8 @@ def _generate_l0a( ##--------------------------------------------------------------------. #### - Write to Parquet - filepath = define_l0a_filepath(df=df, processed_dir=processed_dir, station_name=station_name) + filename = define_l0a_filename(df=df, campaign_name=campaign_name, station_name=station_name) + filepath = os.path.join(data_dir, filename) write_l0a(df=df, filepath=filepath, force=force, verbose=verbose) ##--------------------------------------------------------------------. @@ -166,7 +151,7 @@ def _generate_l0a( del df # Log end processing - msg = f"L0A processing of {filename} has ended." + msg = f"{product} processing of {filename} has ended." log_info(logger=logger, msg=msg, verbose=verbose) # Otherwise log the error @@ -182,58 +167,57 @@ def _generate_l0a( return logger_filepath +@delayed_if_parallel +@single_threaded_if_parallel def _generate_l0b( filepath, - processed_dir, # retrievable from filepath - station_name, # retrievable from filepath + data_dir, + logs_dir, + campaign_name, + station_name, + # Processing info + metadata, + # Processing options force, verbose, - debugging_mode, parallel, + debugging_mode, ): - from disdrodb.l0.l0b_processing import ( - create_l0b_from_l0a, - write_l0b, - ) + # Define product + product = "L0B" # -----------------------------------------------------------------. # Create file logger filename = os.path.basename(filepath) - logger = create_file_logger( - processed_dir=processed_dir, - product="L0B", - station_name=station_name, + logger, logger_filepath = create_logger_file( + logs_dir=logs_dir, filename=filename, parallel=parallel, ) - # Define logger filepath - # - LogCaptureHandler of pytest does not have baseFilename attribute --> So set None - logger_filepath = logger.handlers[0].baseFilename if not os.environ.get("PYTEST_CURRENT_TEST") else None ##------------------------------------------------------------------------. # Log start processing - msg = f"L0B processing of {filename} has started." + msg = f"{product} processing of {filename} has started." log_info(logger, msg, verbose=verbose) ##------------------------------------------------------------------------. - # Retrieve metadata - attrs = read_station_metadata(station_name=station_name, product="L0A", **infer_path_info_dict(processed_dir)) - # Retrieve sensor name - sensor_name = attrs["sensor_name"] + sensor_name = metadata["sensor_name"] check_sensor_name(sensor_name) ##------------------------------------------------------------------------. try: # Read L0A Apache Parquet file df = read_l0a_dataframe(filepath, verbose=verbose, debugging_mode=debugging_mode) + # -----------------------------------------------------------------. # Create xarray Dataset - ds = create_l0b_from_l0a(df=df, attrs=attrs, verbose=verbose) + ds = create_l0b_from_l0a(df=df, attrs=metadata, verbose=verbose) # -----------------------------------------------------------------. 
# Write L0B netCDF4 dataset - filepath = define_l0b_filepath(ds, processed_dir, station_name) + filename = define_l0b_filename(ds=ds, campaign_name=campaign_name, station_name=station_name) + filepath = os.path.join(data_dir, filename) write_l0b(ds, filepath=filepath, force=force) ##--------------------------------------------------------------------. @@ -241,7 +225,7 @@ def _generate_l0b( del ds, df # Log end processing - msg = f"L0B processing of {filename} has ended." + msg = f"{product} processing of {filename} has ended." log_info(logger, msg, verbose=verbose) # Otherwise log the error @@ -259,43 +243,43 @@ def _generate_l0b( def _generate_l0b_from_nc( filepath, - processed_dir, - station_name, # retrievable from filepath + data_dir, + logs_dir, + campaign_name, + station_name, + # Processing info + metadata, + # Reader arguments dict_names, ds_sanitizer_fun, + # Processing options force, verbose, parallel, ): - from disdrodb.l0.l0b_nc_processing import create_l0b_from_raw_nc - from disdrodb.l0.l0b_processing import write_l0b + import xarray as xr # Load in each process + + # -----------------------------------------------------------------. + # Define product name + product = "L0B" # -----------------------------------------------------------------. # Create file logger filename = os.path.basename(filepath) - logger = create_file_logger( - processed_dir=processed_dir, - product="L0B", - station_name=station_name, + logger, logger_filepath = create_logger_file( + logs_dir=logs_dir, filename=filename, parallel=parallel, ) - # Define logger filepath - # - LogCaptureHandler of pytest does not have baseFilename attribute --> So set None - logger_filepath = logger.handlers[0].baseFilename if not os.environ.get("PYTEST_CURRENT_TEST") else None - ##------------------------------------------------------------------------. # Log start processing - msg = f"L0B processing of {filename} has started." + msg = f"{product} processing of {filename} has started." log_info(logger, msg, verbose=verbose) ##------------------------------------------------------------------------. - # Retrieve metadata - attrs = read_station_metadata(station_name=station_name, product="L0A", **infer_path_info_dict(processed_dir)) - # Retrieve sensor name - sensor_name = attrs["sensor_name"] + sensor_name = metadata["sensor_name"] check_sensor_name(sensor_name) ##------------------------------------------------------------------------. @@ -311,11 +295,12 @@ def _generate_l0b_from_nc( ds_sanitizer_fun=ds_sanitizer_fun, sensor_name=sensor_name, verbose=verbose, - attrs=attrs, + attrs=metadata, ) # -----------------------------------------------------------------. # Write L0B netCDF4 dataset - filepath = define_l0b_filepath(ds, processed_dir, station_name) + filename = define_l0b_filename(ds=ds, campaign_name=campaign_name, station_name=station_name) + filepath = os.path.join(data_dir, filename) write_l0b(ds, filepath=filepath, force=force) ##--------------------------------------------------------------------. @@ -339,6 +324,96 @@ def _generate_l0b_from_nc( return logger_filepath +@delayed_if_parallel +@single_threaded_if_parallel +def _generate_l0c( + day, + filepaths, + data_dir, + logs_dir, + metadata_filepath, + campaign_name, + station_name, + # Processing options + force, + verbose, + parallel, # this is used only to initialize the correct logger ! +): + # -----------------------------------------------------------------. 
+ # Define product name + product = "L0C" + + # -----------------------------------------------------------------. + # Create file logger + logger, logger_filepath = create_logger_file( + logs_dir=logs_dir, + filename=day, + parallel=parallel, + ) + + ##------------------------------------------------------------------------. + # Log start processing + msg = f"{product} processing for {day} has started." + log_info(logger, msg, verbose=verbose) + + ##------------------------------------------------------------------------. + ### Core computation + try: + # Retrieve measurement_intervals + # - TODO: in future available from dataset + metadata = read_yaml(metadata_filepath) + measurement_intervals = retrieve_possible_measurement_intervals(metadata) + + # Produce L0C datasets + dict_ds = create_daily_file( + day=day, + filepaths=filepaths, + measurement_intervals=measurement_intervals, + ensure_variables_equality=True, + logger=logger, + verbose=verbose, + ) + + # Write a dataset for each sample interval + for ds in dict_ds.values(): # (sample_interval, ds) + # Write L0C netCDF4 dataset + if ds["time"].size > 1: + # Get sensor name from dataset + sensor_name = ds.attrs.get("sensor_name") + campaign_name = ds.attrs.get("campaign_name") + station_name = ds.attrs.get("station_name") + + # Set encodings + ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name) + + # Define filepath + filename = define_l0c_filename(ds, campaign_name=campaign_name, station_name=station_name) + filepath = os.path.join(data_dir, filename) + + # Write to disk + write_product(ds, product=product, filepath=filepath, force=force) + + # Clean environment + del ds + + # Log end processing + msg = f"{product} processing for {day} has ended." + log_info(logger, msg, verbose=verbose) + + ##--------------------------------------------------------------------. + # Otherwise log the error + except Exception as e: + error_type = str(type(e).__name__) + msg = f"{error_type}: {e}" + log_error(logger, msg, verbose=verbose) + + # Close the file logger + close_logger(logger) + + # Return the logger file path + return logger_filepath + + ####------------------------------------------------------------------------. #### Creation of L0A and L0B Single Station Files @@ -414,19 +489,22 @@ def run_l0a( Default is ``False``. """ + # Define product name + product = "L0A" + # ------------------------------------------------------------------------. # Start L0A processing if verbose: t_i = time.time() - msg = f"L0A processing of station {station_name} has started." + msg = f"{product} processing of station {station_name} has started." log_info(logger=logger, msg=msg, verbose=verbose) # ------------------------------------------------------------------------. # Create directory structure - create_l0_directory_structure( + data_dir = create_l0_directory_structure( raw_dir=raw_dir, processed_dir=processed_dir, - product="L0A", + product=product, station_name=station_name, force=force, ) @@ -443,9 +521,40 @@ def run_l0a( debugging_mode=debugging_mode, ) + # -------------------------------------------------------------------------. + # Retrieve DISDRODB path components + base_dir, data_source, campaign_name = infer_path_info_tuple(raw_dir) + + # -------------------------------------------------------------------------. 
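`infer_path_info_tuple` (imported from `disdrodb.api.info`) is not defined in this diff. Given the archive layout used by the package (`<base_dir>/Raw/<DATA_SOURCE>/<CAMPAIGN_NAME>`), it presumably decomposes a campaign directory as in the hypothetical example below; the path and placeholder names are illustrative only:

```python
raw_dir = "/home/user/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME"  # hypothetical path

# Assumed decomposition, matching the unpacking order used in run_l0a above
base_dir = "/home/user/DISDRODB"
data_source = "DATA_SOURCE"
campaign_name = "CAMPAIGN_NAME"
```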
+ # Define logs directory + logs_dir = create_logs_directory( + product=product, + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + # -----------------------------------------------------------------. # Read issue YAML file - issue_dict = read_station_issue(station_name=station_name, **infer_path_info_dict(raw_dir)) + issue_dict = read_station_issue( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + + ##------------------------------------------------------------------------. + # Read metadata + metadata = read_station_metadata( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + # Retrieve sensor name + sensor_name = metadata["sensor_name"] + check_sensor_name(sensor_name) # -----------------------------------------------------------------. # Generate L0A files @@ -454,12 +563,16 @@ def run_l0a( list_tasks = [ _generate_l0a( filepath=filepath, - processed_dir=processed_dir, + data_dir=data_dir, + logs_dir=logs_dir, + campaign_name=campaign_name, station_name=station_name, - # L0A reader argument + # Reader argument column_names=column_names, reader_kwargs=reader_kwargs, df_sanitizer_fun=df_sanitizer_fun, + # Processing info + sensor_name=sensor_name, issue_dict=issue_dict, # Processing options force=force, @@ -471,149 +584,24 @@ def run_l0a( list_logs = dask.compute(*list_tasks) if parallel else list_tasks # -----------------------------------------------------------------. # Define L0A summary logs - define_summary_log(list_logs) + create_product_logs( + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=base_dir, + # Logs list + list_logs=list_logs, + ) # ---------------------------------------------------------------------. # End L0A processing if verbose: - timedelta_str = str(datetime.timedelta(seconds=time.time() - t_i)) + timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i))) msg = f"L0A processing of station {station_name} completed in {timedelta_str}" log_info(logger=logger, msg=msg, verbose=verbose) -def run_l0b( - processed_dir, - station_name, - # Processing options - parallel, - force, - verbose, - debugging_mode, -): - """ - Run the L0B processing for a specific DISDRODB station. - - Parameters - ---------- - raw_dir : str - The directory path where all the raw content of a specific campaign is stored. - The path must have the following structure: ``<...>/DISDRODB/Raw//``. - Inside the ``raw_dir`` directory, it is required to adopt the following structure:: - - - ``/data//`` - - ``/metadata/.yml`` - - **Important points:** - - - For each ````, there must be a corresponding YAML file in the metadata subdirectory. - - The ``campaign_name`` are expected to be UPPER CASE. - - The ```` must semantically match between: - - the ``raw_dir`` and ``processed_dir`` directory paths; - - with the key ``campaign_name`` within the metadata YAML files. - processed_dir : str - The desired directory path for the processed DISDRODB L0A and L0B products. - The path should have the following structure: ``<...>/DISDRODB/Processed//``. - For testing purposes, this function exceptionally accepts also a directory path simply ending - with ```` (e.g., ``/tmp/``). - station_name : str - The name of the station. - force : bool, optional - If ``True``, overwrite existing data in destination directories. 
- If ``False``, raise an error if data already exists in destination directories. - Default is ``False``. - verbose : bool, optional - If ``True``, print detailed processing information to the terminal. - Default is ``True``. - parallel : bool, optional - If ``True``, process the files simultaneously in multiple processes. - The number of simultaneous processes can be customized using the ``dask.distributed.LocalCluster``. - Ensure that the ``threads_per_worker`` (number of thread per process) is set to 1 to avoid HDF errors. - Also, ensure to set the ``HDF5_USE_FILE_LOCKING`` environment variable to ``False``. - If ``False``, process the files sequentially in a single process. - Default is ``False``. - debugging_mode : bool, optional - If ``True``, reduce the amount of data to process. - Only the first 3 raw data files will be processed. - Default is ``False``. - - """ - # -----------------------------------------------------------------. - # Retrieve metadata - attrs = read_station_metadata(station_name=station_name, product="L0A", **infer_path_info_dict(processed_dir)) - - # Skip run_l0b processing if the raw data are netCDFs - if attrs["raw_data_format"] == "netcdf": - return - - # -----------------------------------------------------------------. - # Start L0B processing - if verbose: - t_i = time.time() - msg = f"L0B processing of station_name {station_name} has started." - log_info(logger=logger, msg=msg, verbose=verbose) - - # -------------------------------------------------------------------------. - # Create directory structure - create_directory_structure( - processed_dir=processed_dir, - product="L0B", - station_name=station_name, - force=force, - ) - - ##----------------------------------------------------------------. - # Get L0A files for the station - filepaths = get_l0a_filepaths( - processed_dir=processed_dir, - station_name=station_name, - debugging_mode=debugging_mode, - ) - - # -----------------------------------------------------------------. - # Generate L0B files - # Loop over the L0A files and save the L0B netCDF files. - # - If parallel=True, it does that in parallel using dask.bag - # Settings npartitions=len(filepaths) enable to wait prior task on a core - # finish before starting a new one. - if not parallel: - list_logs = [ - _generate_l0b( - filepath=filepath, - processed_dir=processed_dir, - station_name=station_name, - force=force, - verbose=verbose, - debugging_mode=debugging_mode, - parallel=parallel, - ) - for filepath in filepaths - ] - - else: - bag = db.from_sequence(filepaths, npartitions=len(filepaths)) - list_logs = bag.map( - _generate_l0b, - processed_dir=processed_dir, - station_name=station_name, - force=force, - verbose=verbose, - debugging_mode=debugging_mode, - parallel=parallel, - ).compute() - - # -----------------------------------------------------------------. - # Define L0B summary logs - define_summary_log(list_logs) - - # -----------------------------------------------------------------. - # End L0B processing - if verbose: - timedelta_str = str(datetime.timedelta(seconds=time.time() - t_i)) - msg = f"L0B processing of station_name {station_name} completed in {timedelta_str}" - log_info(logger=logger, msg=msg, verbose=verbose) - return - - def run_l0b_from_nc( raw_dir, processed_dir, @@ -694,8 +682,11 @@ def run_l0b_from_nc( Default is ``False``. """ + # Define product name + product = "L0B" + # ------------------------------------------------------------------------. 
- # Start L0A processing + # Start L0B NC processing if verbose: t_i = time.time() msg = f"L0B processing of station {station_name} has started." @@ -703,14 +694,36 @@ def run_l0b_from_nc( # ------------------------------------------------------------------------. # Create directory structure - create_l0_directory_structure( + data_dir = create_l0_directory_structure( raw_dir=raw_dir, processed_dir=processed_dir, - product="L0B", + product=product, station_name=station_name, force=force, ) + # -------------------------------------------------------------------------. + # Retrieve DISDRODB path components + base_dir, data_source, campaign_name = infer_path_info_tuple(processed_dir) + + # Define logs directory + logs_dir = create_logs_directory( + product=product, + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + + # -----------------------------------------------------------------. + # Retrieve metadata + metadata = read_station_metadata( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + # -------------------------------------------------------------------------. # List files to process filepaths = get_raw_filepaths( @@ -729,28 +742,15 @@ def run_l0b_from_nc( # - If parallel=True, it does that in parallel using dask.bag # Settings npartitions=len(filepaths) enable to wait prior task on a core # finish before starting a new one. - if not parallel: - list_logs = [ - _generate_l0b_from_nc( - filepath=filepath, - processed_dir=processed_dir, - station_name=station_name, - # Reader arguments - dict_names=dict_names, - ds_sanitizer_fun=ds_sanitizer_fun, - # Processing options - force=force, - verbose=verbose, - parallel=parallel, - ) - for filepath in filepaths - ] - else: - bag = db.from_sequence(filepaths, npartitions=len(filepaths)) - list_logs = bag.map( - _generate_l0b_from_nc, - processed_dir=processed_dir, + list_tasks = [ + _generate_l0b_from_nc( + filepath=filepath, + data_dir=data_dir, + logs_dir=logs_dir, + campaign_name=campaign_name, station_name=station_name, + # Processing info + metadata=metadata, # Reader arguments dict_names=dict_names, ds_sanitizer_fun=ds_sanitizer_fun, @@ -758,78 +758,70 @@ def run_l0b_from_nc( force=force, verbose=verbose, parallel=parallel, - ).compute() + ) + for filepath in filepaths + ] + list_logs = dask.compute(*list_tasks) if parallel else list_tasks + + # if not parallel: + # list_logs = [ + # _generate_l0b_from_nc( + # filepath=filepath, + # data_dir=data_dir, + # logs_dir=logs_dir, + # campaign_name=campaign_name, + # station_name=station_name, + # # Processing info + # metadata=metadata, + # # Reader arguments + # dict_names=dict_names, + # ds_sanitizer_fun=ds_sanitizer_fun, + # # Processing options + # force=force, + # verbose=verbose, + # parallel=parallel, + # ) + # for filepath in filepaths + # ] + # else: + # bag = db.from_sequence(filepaths, npartitions=len(filepaths)) + # list_logs = bag.map( + # _generate_l0b_from_nc, + # data_dir=data_dir, + # logs_dir=logs_dir, + # campaign_name=campaign_name, + # station_name=station_name, + # # Processing info + # metadata=metadata, + # # Reader arguments + # dict_names=dict_names, + # ds_sanitizer_fun=ds_sanitizer_fun, + # # Processing options + # force=force, + # verbose=verbose, + # parallel=parallel, + # ).compute() # -----------------------------------------------------------------. 
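Throughout this refactor the explicit `dask.bag` branches (kept above only as commented-out code) are replaced by a single list comprehension whose elements are either plain results (when `parallel=False`) or `dask.delayed` objects produced by the `delayed_if_parallel` decorator (when `parallel=True`). A tiny stand-alone illustration of the dispatch idiom, with `len` standing in for the per-file task:

```python
import dask

# Stand-in for [_generate_l0b_from_nc(...) for filepath in filepaths] with parallel=True:
# each element is a delayed task rather than a computed result.
list_tasks = [dask.delayed(len)(word) for word in ["raw", "file"]]

# dask.compute resolves all delayed tasks in one call; with parallel=False the list
# would already contain the results and is used as-is.
list_logs = dask.compute(*list_tasks)
print(list_logs)  # (3, 4)
```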
# Define L0B summary logs - define_summary_log(list_logs) + create_product_logs( + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=base_dir, + # Logs list + list_logs=list_logs, + ) # ---------------------------------------------------------------------. # End L0B processing if verbose: - timedelta_str = str(datetime.timedelta(seconds=time.time() - t_i)) + timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i))) msg = f"L0B processing of station {station_name} completed in {timedelta_str}" log_info(logger=logger, msg=msg, verbose=verbose) -def run_l0b_concat(processed_dir, station_name, verbose=False): - """Concatenate all L0B netCDF files into a single netCDF file. - - The single netCDF file is saved at ``/L0B``. - """ - from disdrodb.l0.l0b_processing import write_l0b - from disdrodb.utils.netcdf import xr_concat_datasets - - # Create logger - filename = f"concatenatation_{station_name}" - logger = create_file_logger( - processed_dir=processed_dir, - product="L0B", - station_name="", # locate outside the station directory - filename=filename, - parallel=False, - ) - - # -------------------------------------------------------------------------. - # Retrieve L0B files - station_dir = define_l0b_station_dir(processed_dir, station_name) - filepaths = list_files(station_dir, glob_pattern="*.nc", recursive=True) - filepaths = sorted(filepaths) - - # -------------------------------------------------------------------------. - # Check there are at least two files - n_files = len(filepaths) - if n_files == 0: - msg = f"No L0B file is available for concatenation in {station_dir}." - log_error(logger=logger, msg=msg, verbose=False) - raise ValueError(msg) - - if n_files == 1: - msg = f"Only a single file is available for concatenation in {station_dir}." - log_warning(logger=logger, msg=msg, verbose=verbose) - - # -------------------------------------------------------------------------. - # Concatenate the files - ds = xr_concat_datasets(filepaths) - - # -------------------------------------------------------------------------. - # Define the filepath of the concatenated L0B netCDF - single_nc_filepath = define_l0b_filepath(ds, processed_dir, station_name, l0b_concat=True) - force = True # TODO add as argument - write_l0b(ds, filepath=single_nc_filepath, force=force) - - # -------------------------------------------------------------------------. - # Close file and delete - ds.close() - del ds - - # -------------------------------------------------------------------------. - # Close the file logger - close_logger(logger) - - # Return the dataset - - ####--------------------------------------------------------------------------. #### DISDRODB Station Functions @@ -880,7 +872,10 @@ def run_l0a_station( The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. If not specified, the path specified in the DISDRODB active configuration will be used. """ + # Define base directory base_dir = get_base_dir(base_dir) + + # Retrieve reader reader = get_station_reader_function( base_dir=base_dir, data_source=data_source, @@ -901,8 +896,8 @@ def run_l0a_station( campaign_name=campaign_name, ) # Run L0A processing - # --> The reader call the run_l0a within the custom defined reader function - # --> For the special case of raw netCDF data, it calls the run_l0b_from_nc function + # --> The reader calls the run_l0a or the run_l0b_from_nc if the raw data are + # text files or netCDF files respectively. 
reader( raw_dir=raw_dir, processed_dir=processed_dir, @@ -920,12 +915,13 @@ def run_l0b_station( data_source, campaign_name, station_name, + # L0B processing options + remove_l0a: bool = False, # Processing options force: bool = False, verbose: bool = True, parallel: bool = True, debugging_mode: bool = False, - remove_l0a: bool = False, base_dir: Optional[str] = None, ): """ @@ -957,58 +953,207 @@ def run_l0b_station( and multi-threading will be automatically exploited to speed up I/O tasks. debugging_mode : bool, optional If ``True``, the amount of data processed will be reduced. - Only the first 100 rows of 3 L0A files will be processed. By default, ``False``. + Only the first 100 rows of 3 L0A files will be processed. The default is ``False``. + remove_l0a: bool, optional + Whether to remove the processed L0A files. The default is ``False``. base_dir : str, optional The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. If not specified, the path specified in the DISDRODB active configuration will be used. """ - # Define campaign processed dir + # Define product name + product = "L0B" + + # Retrieve DISDRODB base directory base_dir = get_base_dir(base_dir) - processed_dir = get_disdrodb_path( + + # -----------------------------------------------------------------. + # Retrieve metadata + metadata = read_station_metadata( base_dir=base_dir, - product="L0B", data_source=data_source, campaign_name=campaign_name, - check_exists=False, + station_name=station_name, ) - # Run L0B - run_l0b( - processed_dir=processed_dir, + + # Skip run_l0b processing if the raw data are netCDFs + # - L0B produced when running L0A ... + if metadata["raw_data_format"] == "netcdf": + return + + # -----------------------------------------------------------------. + # Start L0B processing + if verbose: + t_i = time.time() + msg = f"{product} processing of station_name {station_name} has started." + log_info(logger=logger, msg=msg, verbose=verbose) + + # Define logs directory + logs_dir = create_logs_directory( + product=product, + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, station_name=station_name, - # Processing options + ) + + # -------------------------------------------------------------------------. + # Create product directory + data_dir = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + product=product, force=force, - verbose=verbose, - debugging_mode=debugging_mode, - parallel=parallel, ) + ##----------------------------------------------------------------. + # Get L0A files for the station + required_product = get_required_product(product) + flag_not_available_data = False + try: + filepaths = get_filepaths( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + product=required_product, + debugging_mode=debugging_mode, + ) + except Exception as e: + print(str(e)) # Case where no file paths available + flag_not_available_data = True + + # -------------------------------------------------------------------------. + # If no data available, print error message and return None + if flag_not_available_data: + msg = ( + f"{product} processing of {data_source} {campaign_name} {station_name}" + + f"has not been launched because of missing {required_product} data." + ) + print(msg) + return + + ##----------------------------------------------------------------. 
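`get_required_product` and `get_filepaths` are imported from `disdrodb.api.io` and their bodies are not shown in this diff. From the call sites (L0B processing consumes L0A files, L0C processing consumes L0B files), `get_required_product` presumably encodes the product chain along these lines; this is an assumption, not the actual implementation:

```python
def get_required_product(product):
    """Return the product required as input to generate ``product`` (assumed L0A -> L0B -> L0C chain)."""
    chain = {"L0B": "L0A", "L0C": "L0B"}
    return chain[product]
```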
+ # Generate L0B files + # Loop over the L0A files and save the L0B netCDF files. + # - If parallel=True, it does that in parallel using dask.bag + # Settings npartitions=len(filepaths) enable to wait prior task on a core + # finish before starting a new one. + # BUG: If debugging_mode=True and parallel=True a subtle bug can currently occur when + # two processes with a subsetted L0A files want to create the same L0B files ! + list_tasks = [ + _generate_l0b( + filepath=filepath, + data_dir=data_dir, + logs_dir=logs_dir, + metadata=metadata, + campaign_name=campaign_name, + station_name=station_name, + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + for filepath in filepaths + ] + list_logs = dask.compute(*list_tasks) if parallel else list_tasks + # if not parallel: + # list_logs = [ + # _generate_l0b( + # filepath=filepath, + # data_dir=data_dir, + # logs_dir=logs_dir, + # metadata=metadata, + # campaign_name=campaign_name, + # station_name=station_name, + # force=force, + # verbose=verbose, + # debugging_mode=debugging_mode, + # parallel=parallel, + # ) + # for filepath in filepaths + # ] + + # else: + # bag = db.from_sequence(filepaths, npartitions=len(filepaths)) + # list_logs = bag.map( + # _generate_l0b, + # data_dir=data_dir, + # logs_dir=logs_dir, + # metadata=metadata, + # campaign_name=campaign_name, + # station_name=station_name, + # force=force, + # verbose=verbose, + # debugging_mode=debugging_mode, + # parallel=parallel, + # ).compute() + + # -----------------------------------------------------------------. + # Define L0B summary logs + create_product_logs( + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=base_dir, + # Logs list + list_logs=list_logs, + ) + + # -----------------------------------------------------------------. + # End L0B processing + if verbose: + timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i))) + msg = f"{product} processing of station_name {station_name} completed in {timedelta_str}" + log_info(logger=logger, msg=msg, verbose=verbose) + + # -----------------------------------------------------------------. + # Option to remove L0A if remove_l0a: - station_dir = define_station_dir( + remove_product( base_dir=base_dir, product="L0A", data_source=data_source, campaign_name=campaign_name, station_name=station_name, + logger=logger, + verbose=verbose, ) - log_info(logger=logger, msg="Removal of single L0A files started.", verbose=verbose) - shutil.rmtree(station_dir) - log_info(logger=logger, msg="Removal of single L0A files ended.", verbose=verbose) -def run_l0b_concat_station( +def run_l0c_station( # Station arguments data_source, campaign_name, station_name, - # L0B concat options - remove_l0b=False, - verbose=True, + # L0C processing options + remove_l0b: bool = False, + # Processing options + force: bool = False, + verbose: bool = True, + parallel: bool = True, + debugging_mode: bool = False, base_dir: Optional[str] = None, ): - """Define the L0B file concatenation of a station. + """ + Run the L0C processing of a specific DISDRODB station when invoked from the terminal. + + The DISDRODB L0A and L0B routines just convert source raw data into netCDF format. + The DISDRODB L0C routine ingests L0B files and performs data homogenization. 
+ The DISDRODB L0C routine takes care of: + + - removing duplicated timesteps across files, + - merging/splitting files into daily files, + - regularizing timesteps for potentially trailing seconds, + - ensuring L0C files with unique sample intervals. - This function is intended to be called through the ``disdrodb_run_l0b_concat station`` + Duplicated timesteps are automatically dropped if their variable values coincides, + otherwise an error is raised. + + This function is intended to be called through the ``disdrodb_run_l0c_station`` command-line interface. Parameters @@ -1021,42 +1166,151 @@ def run_l0b_concat_station( The name of the campaign. Must be provided in UPPER CASE. station_name : str The name of the station. + force : bool, optional + If ``True``, existing data in the destination directories will be overwritten. + If ``False`` (default), an error will be raised if data already exists in the destination directories. verbose : bool, optional If ``True`` (default), detailed processing information will be printed to the terminal. If ``False``, less information will be displayed. + parallel : bool, optional + If ``True``, files will be processed in multiple processes simultaneously, + with each process using a single thread to avoid issues with the HDF/netCDF library. + If ``False`` (default), files will be processed sequentially in a single process, + and multi-threading will be automatically exploited to speed up I/O tasks. + debugging_mode : bool, optional + If ``True``, the amount of data processed will be reduced. + Only the first 3 files will be processed. By default, ``False``. + remove_l0b: bool, optional + Whether to remove the processed L0B files. The default is ``False``. base_dir : str, optional The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. If not specified, the path specified in the DISDRODB active configuration will be used. """ - # Retrieve processed_dir + # Define product + product = "L0C" + + # Define base directory base_dir = get_base_dir(base_dir) - processed_dir = get_disdrodb_path( + + # Define logs directory + logs_dir = create_logs_directory( + product=product, base_dir=base_dir, - product="L0B", data_source=data_source, campaign_name=campaign_name, - check_exists=True, + station_name=station_name, ) - # Run concatenation - run_l0b_concat( - processed_dir=processed_dir, + # ------------------------------------------------------------------------. + # Start processing + if verbose: + t_i = time.time() + msg = f"{product} processing of station {station_name} has started." + log_info(logger=logger, msg=msg, verbose=verbose) + + # ------------------------------------------------------------------------. + # Create product directory + data_dir = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, station_name=station_name, - verbose=verbose, + product=product, + force=force, ) - if remove_l0b: - station_dir = define_station_dir( + # ------------------------------------------------------------------------. + # Define metadata filepath + metadata_filepath = define_metadata_filepath( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + + # -------------------------------------------------------------------------. 
+ # List files to process + required_product = get_required_product(product) + flag_not_available_data = False + try: + filepaths = get_filepaths( base_dir=base_dir, - product="L0B", data_source=data_source, campaign_name=campaign_name, station_name=station_name, + product=required_product, + # Processing options + debugging_mode=debugging_mode, ) - log_info(logger=logger, msg="Removal of single L0B files started.", verbose=verbose) - shutil.rmtree(station_dir) - log_info(logger=logger, msg="Removal of single L0B files ended.", verbose=verbose) + except Exception as e: + print(str(e)) # Case where no file paths available + flag_not_available_data = True + + # -------------------------------------------------------------------------. + # If no data available, print error message and return None + if flag_not_available_data: + msg = ( + f"{product} processing of {data_source} {campaign_name} {station_name}" + + f"has not been launched because of missing {required_product} data." + ) + print(msg) + return + # -------------------------------------------------------------------------. + # Retrieve dictionary with the required files for each day. + dict_days_files = get_files_per_days(filepaths) -####---------------------------------------------------------------------------. + # -----------------------------------------------------------------. + # Generate L0C files + # - Loop over the L0 netCDF files and generate L1 files. + # - If parallel=True, it does that in parallel using dask.delayed + list_tasks = [ + _generate_l0c( + day=day, + filepaths=filepaths, + data_dir=data_dir, + logs_dir=logs_dir, + metadata_filepath=metadata_filepath, + campaign_name=campaign_name, + station_name=station_name, + # Processing options + force=force, + verbose=verbose, + parallel=parallel, + ) + for day, filepaths in dict_days_files.items() + ] + list_logs = dask.compute(*list_tasks) if parallel else list_tasks + + # -----------------------------------------------------------------. + # Define summary logs + create_product_logs( + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=base_dir, + # Logs list + list_logs=list_logs, + ) + + # ---------------------------------------------------------------------. + # End processing + if verbose: + timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i))) + msg = f"{product} processing of station {station_name} completed in {timedelta_str}" + log_info(logger=logger, msg=msg, verbose=verbose) + + # -----------------------------------------------------------------. + # Option to remove L0B + if remove_l0b: + remove_product( + base_dir=base_dir, + product="L0B", + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + logger=logger, + verbose=verbose, + ) diff --git a/disdrodb/l0/l0a_processing.py b/disdrodb/l0/l0a_processing.py index 9c6540cb..89c35b83 100644 --- a/disdrodb/l0/l0a_processing.py +++ b/disdrodb/l0/l0a_processing.py @@ -208,6 +208,8 @@ def remove_duplicated_timesteps(df: pd.DataFrame, verbose: bool = False): values_duplicates = values[idx_duplicates].astype("M8[s]") # If there are duplicated timesteps if len(values_duplicates) > 0: + # TODO: raise error if duplicated timesteps have different values ! 
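The TODO above notes that duplicated timesteps with different values should raise an error rather than being silently dropped. A self-contained sketch of how such a check could look (an idea only, not code from the repository; the column name is illustrative):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "time": pd.to_datetime(["2021-01-01 00:00:00"] * 2 + ["2021-01-01 00:01:00"]),
        "rainfall_rate": [1.2, 3.4, 0.0],  # the duplicated timestep carries conflicting values
    }
)

# Mark every row belonging to a duplicated timestep (keep=False flags all occurrences)
duplicated = df[df.duplicated(subset="time", keep=False)]

# Retain only the timesteps whose duplicated rows differ in at least one column
conflicting = duplicated.groupby("time").filter(lambda group: group.nunique().gt(1).any())

if len(conflicting) > 0:
    n = conflicting["time"].nunique()
    raise ValueError(f"{n} duplicated timestep(s) have conflicting values.")
```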
+ # Drop duplicated timesteps (keeping the first occurrence) df = df.drop_duplicates(subset="time", keep="first") # Report the values of duplicated timesteps @@ -446,7 +448,7 @@ def remove_corrupted_rows(df): raise ValueError("No remaining rows after data corruption checks.") # If only one row available, raise also error if len(df) == 1: - raise ValueError("Only 1 row remains after data corruption checks. Check the file.") + raise ValueError("Only 1 row remains after data corruption checks. Check the raw file and maybe delete it.") # Return the dataframe return df @@ -653,6 +655,9 @@ def process_raw_file( # - Replace invalid values with np.nan df = set_nan_invalid_values(df, sensor_name=sensor_name, verbose=verbose) + # - Sort by time + df = df.sort_values("time") + # ------------------------------------------------------. # - Check column names agrees to DISDRODB standards check_l0a_column_names(df, sensor_name=sensor_name) diff --git a/disdrodb/l0/l0b_nc_processing.py b/disdrodb/l0/l0b_nc_processing.py index 121b8185..734d4d0c 100644 --- a/disdrodb/l0/l0b_nc_processing.py +++ b/disdrodb/l0/l0b_nc_processing.py @@ -18,11 +18,9 @@ # -----------------------------------------------------------------------------. """Functions to process DISDRODB raw netCDF files into DISDRODB L0B netCDF files.""" -import copy import logging import numpy as np -import xarray as xr from disdrodb.l0.l0b_processing import finalize_dataset from disdrodb.l0.standards import ( @@ -115,6 +113,8 @@ def subset_dataset(ds, dict_names, sensor_name): def add_dataset_missing_variables(ds, missing_vars, sensor_name): """Add missing xr.Dataset variables as ``np.nan`` xr.DataArrays.""" + import xarray as xr + from disdrodb.l0.standards import get_variables_dimension # Get dimension of each variables @@ -171,8 +171,7 @@ def preprocess_raw_netcdf(ds, dict_names, sensor_name): ds = add_dataset_missing_variables(ds=ds, missing_vars=missing_vars, sensor_name=sensor_name) # Update the coordinates for (diameter and velocity) - coords = get_bin_coords_dict(sensor_name) - ds = ds.assign_coords(coords) + ds = ds.assign_coords(get_bin_coords_dict(sensor_name)) # Return dataset return ds @@ -346,19 +345,6 @@ def create_l0b_from_raw_nc( # Preprocess netcdf ds = preprocess_raw_netcdf(ds=ds, dict_names=dict_names, sensor_name=sensor_name) - # Add CRS and geolocation information - attrs = copy.deepcopy(attrs) - coords = {} - geolocation_vars = ["latitude", "longitude", "altitude"] - for var in geolocation_vars: - if var not in ds: - coords[var] = attrs[var] - _ = attrs.pop(var) - ds = ds.assign_coords(coords) - - # Add global attributes - ds.attrs = attrs - # Apply dataset sanitizer function ds = ds_sanitizer_fun(ds) @@ -372,7 +358,7 @@ def create_l0b_from_raw_nc( ds = set_nan_invalid_values(ds, sensor_name=sensor_name, verbose=verbose) # Finalize dataset - ds = finalize_dataset(ds, sensor_name=sensor_name) + ds = finalize_dataset(ds, sensor_name=sensor_name, attrs=attrs) # Return dataset return ds diff --git a/disdrodb/l0/l0b_processing.py b/disdrodb/l0/l0b_processing.py index 7741535d..ab0021a6 100644 --- a/disdrodb/l0/l0b_processing.py +++ b/disdrodb/l0/l0b_processing.py @@ -32,23 +32,26 @@ from disdrodb.l0.standards import ( # get_valid_coordinates_names, get_bin_coords_dict, - get_coords_attrs_dict, get_data_range_dict, get_dims_size_dict, get_l0b_cf_attrs_dict, get_l0b_encodings_dict, get_raw_array_dims_order, get_raw_array_nvalues, - get_time_encoding, +) +from disdrodb.utils.attrs import ( + set_coordinate_attributes, 
set_disdrodb_attrs, ) from disdrodb.utils.directories import create_directory, remove_if_exists +from disdrodb.utils.encoding import set_encodings from disdrodb.utils.logger import ( # log_warning, # log_debug, log_error, log_info, ) +from disdrodb.utils.time import ensure_sorted_by_time logger = logging.getLogger(__name__) @@ -329,28 +332,13 @@ def _set_variable_attributes(ds: xr.Dataset, sensor_name: str) -> xr.Dataset: return ds -def _set_attrs_dict(ds, attrs_dict): - for var in attrs_dict: - if var in ds: - ds[var].attrs.update(attrs_dict[var]) - return ds - - -def _set_coordinate_attributes(ds): - # Get attributes dictionary - attrs_dict = get_coords_attrs_dict() - # Set attributes - ds = _set_attrs_dict(ds, attrs_dict) - return ds - - def _set_dataset_attrs(ds, sensor_name): """Set variable and coordinates attributes.""" # - Add netCDF variable attributes # --> Attributes: long_name, units, descriptions, valid_min, valid_max ds = _set_variable_attributes(ds=ds, sensor_name=sensor_name) # - Add netCDF coordinate attributes - ds = _set_coordinate_attributes(ds=ds) + ds = set_coordinate_attributes(ds=ds) # - Set DISDRODB global attributes ds = set_disdrodb_attrs(ds=ds, product="L0B") return ds @@ -384,44 +372,19 @@ def _define_dataset_variables(df, sensor_name, verbose): raise ValueError("No raw fields available.") # Define other disdrometer 'auxiliary' variables varying over time dimension + # - Includes time + # - Includes longitude and latitude for moving sensors valid_core_fields = [ "raw_drop_concentration", "raw_drop_average_velocity", "raw_drop_number", - "time", - # longitude and latitude too for moving sensors ] aux_columns = df.columns[np.isin(df.columns, valid_core_fields, invert=True)] aux_data_vars = {column: (["time"], df[column].to_numpy()) for column in aux_columns} data_vars.update(aux_data_vars) - - # Add key "time" - # - Is dropped in _define_coordinates ! - data_vars["time"] = df["time"].to_numpy() - return data_vars -def _define_coordinates(data_vars, attrs, sensor_name): - """Define DISDRODB L0B netCDF coordinates.""" - # Note: attrs and data_vars are modified in place ! - - # - Diameter and velocity - coords = get_bin_coords_dict(sensor_name=sensor_name) - - # - Geolocation + Time - geolocation_vars = ["time", "latitude", "longitude", "altitude"] - for var in geolocation_vars: - if var in data_vars: - coords[var] = data_vars[var] - _ = data_vars.pop(var) - _ = attrs.pop(var, None) - else: - coords[var] = attrs[var] - _ = attrs.pop(var) - return coords - - def create_l0b_from_l0a( df: pd.DataFrame, attrs: dict, @@ -451,25 +414,13 @@ def create_l0b_from_l0a( # Retrieve sensor name attrs = attrs.copy() sensor_name = attrs["sensor_name"] - # -----------------------------------------------------------. + # Define Dataset variables and coordinates data_vars = _define_dataset_variables(df, sensor_name=sensor_name, verbose=verbose) - # -----------------------------------------------------------. - # Define coordinates for xarray Dataset - # - attrs and data_vars are modified in place ! 
- coords = _define_coordinates(data_vars, attrs=attrs, sensor_name=sensor_name) - - # ----------------------------------------------------------- # Create xarray Dataset - ds = xr.Dataset( - data_vars=data_vars, - coords=coords, - attrs=attrs, - ) - ds = finalize_dataset(ds, sensor_name=sensor_name) - - # ----------------------------------------------------------- + ds = xr.Dataset(data_vars=data_vars) + ds = finalize_dataset(ds, sensor_name=sensor_name, attrs=attrs) return ds @@ -477,8 +428,43 @@ def create_l0b_from_l0a( #### L0B netCDF4 Writer -def finalize_dataset(ds, sensor_name): +def set_geolocation_coordinates(ds, attrs): + """Add geolocation coordinates to dataset.""" + # Assumption + # - If coordinate is present in L0A, overrides the one specified in the attributes + # - If a station is fixed, discard the coordinates in the DISDRODB reader ! + + # Assign geolocation coordinates to dataset + coords = ["latitude", "longitude", "altitude"] + for coord in coords: + # If coordinate not present, add it from dictionary + if coord not in ds: + ds = ds.assign_coords({coord: attrs.pop(coord, np.nan)}) + # Else if set coordinates the variable in the dataset (present in the raw data) + else: + ds = ds.set_coords(coord) + _ = attrs.pop(coord, None) + + # Set -9999 flag value to np.nan + for coord in coords: + ds[coord] = xr.where(ds[coord] == -9999, np.nan, ds[coord]) + + # Set attributes without geolocation coordinates + ds.attrs = attrs + return ds + + +def finalize_dataset(ds, sensor_name, attrs): """Finalize DISDRODB L0B Dataset.""" + # Ensure sorted by time + ds = ensure_sorted_by_time(ds) + + # Set diameter and velocity bin coordinates + ds = ds.assign_coords(get_bin_coords_dict(sensor_name=sensor_name)) + + # Set geolocation coordinates and attributes + ds = set_geolocation_coordinates(ds, attrs=attrs) + # Add dataset CRS coordinate ds = add_dataset_crs_coords(ds) @@ -496,56 +482,8 @@ def finalize_dataset(ds, sensor_name): return ds -def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict: - """Ensure chunk size to be smaller than the array shape. - - Parameters - ---------- - encoding_dict : dict - Dictionary containing the encoding to write DISDRODB L0B netCDFs. - ds : xarray.Dataset - Input dataset. - - Returns - ------- - dict - Encoding dictionary. - """ - for var in ds.data_vars: - shape = ds[var].shape - chunks = encoding_dict[var]["chunksizes"] - if chunks is not None: - chunks = [shape[i] if chunks[i] > shape[i] else chunks[i] for i in range(len(chunks))] - encoding_dict[var]["chunksizes"] = chunks - return encoding_dict - - -def rechunk_dataset(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset: - """Coerce the dataset arrays to have the chunk size specified in the encoding dictionary. - - Parameters - ---------- - ds : xarray.Dataset - Input xarray dataset - encoding_dict : dict - Dictionary containing the encoding to write the xarray dataset as a netCDF. - - Returns - ------- - xr.Dataset - Output xarray dataset - """ - for var in ds.data_vars: - chunks = encoding_dict[var].pop("chunksizes") - dims = list(ds[var].dims) - chunks_dict = dict(zip(dims, chunks)) - if chunks is not None: - ds[var] = ds[var].chunk(chunks_dict) - return ds - - -def set_encodings(ds: xr.Dataset, sensor_name: str) -> xr.Dataset: - """Apply the encodings to the xarray Dataset. +def set_l0b_encodings(ds: xr.Dataset, sensor_name: str): + """Apply the L0B encodings to the xarray Dataset. 
Parameters ---------- @@ -559,24 +497,8 @@ def set_encodings(ds: xr.Dataset, sensor_name: str) -> xr.Dataset: xr.Dataset Output xarray dataset. """ - # Get encoding dictionary encoding_dict = get_l0b_encodings_dict(sensor_name) - encoding_dict = {k: encoding_dict[k] for k in ds.data_vars} - - # Ensure chunksize smaller than the array shape - encoding_dict = sanitize_encodings_dict(encoding_dict, ds) - - # Rechunk variables for fast writing ! - # - This pop the chunksize argument from the encoding dict ! - ds = rechunk_dataset(ds, encoding_dict) - - # Set time encoding - ds["time"].encoding.update(get_time_encoding()) - - # Set the variable encodings - for var in ds.data_vars: - ds[var].encoding.update(encoding_dict[var]) - + ds = set_encodings(ds=ds, encoding_dict=encoding_dict) return ds @@ -608,7 +530,7 @@ def write_l0b(ds: xr.Dataset, filepath: str, force=False) -> None: sensor_name = ds.attrs.get("sensor_name") # Set encodings - ds = set_encodings(ds=ds, sensor_name=sensor_name) + ds = set_l0b_encodings(ds=ds, sensor_name=sensor_name) # Write netcdf ds.to_netcdf(filepath, engine="netcdf4") diff --git a/disdrodb/l0/l0c_processing.py b/disdrodb/l0/l0c_processing.py new file mode 100644 index 00000000..c84253bf --- /dev/null +++ b/disdrodb/l0/l0c_processing.py @@ -0,0 +1,626 @@ +#!/usr/bin/env python3 + +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Functions to process DISDRODB L0B files into DISDRODB L0C netCDF files.""" +import logging + +import numpy as np +import pandas as pd + +from disdrodb.api.info import get_start_end_time_from_filepaths +from disdrodb.l1.resampling import add_sample_interval +from disdrodb.utils.logger import log_warning # , log_info +from disdrodb.utils.time import ( + ensure_sorted_by_time, + regularize_timesteps, +) + +logger = logging.getLogger(__name__) + + +TOLERANCE_SECONDS = 120 + + +def get_files_per_days(filepaths): + """ + Organize files by the days they cover based on their start and end times. + + Parameters + ---------- + filepaths : list of str + List of file paths to be processed. + + Returns + ------- + dict + Dictionary where keys are days (as strings) and values are lists of file paths + that cover those days. + + Notes + ----- + This function adds a tolerance of 60 seconds to account for imprecise time logging by the sensors. + """ + # Retrieve file start_time and end_time + files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths) + + # Add tolerance to account for imprecise time logging by the sensors + # - Example: timestep 23:59:30 might be 00.00 and goes into the next day file ... 
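+ # - The tolerance (TOLERANCE_SECONDS = 120 s) is subtracted from the start times and added
+ #   to the end times, so a file ending at 23:59:30 is also assigned to the following day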
+ files_start_time = files_start_time - np.array(TOLERANCE_SECONDS, dtype="m8[s]") + files_end_time = files_end_time + np.array(TOLERANCE_SECONDS, dtype="m8[s]") + + # Retrieve file start day and end day + start_day = files_start_time.min().astype("M8[D]") + end_day = files_end_time.max().astype("M8[D]") + np.array(1, dtype="m8[D]") + + # Create an array with all days in time period covered by the files + list_days = np.asanyarray(pd.date_range(start=start_day, end=end_day, freq="D")).astype("M8[D]") + + # Expand dimension to match each day using broadcasting + files_start_time = files_start_time.astype("M8[D]")[:, np.newaxis] # shape (n_files, 1) + files_end_time = files_end_time.astype("M8[D]")[:, np.newaxis] # shape (n_files, 1) + + # Create an array of all days + # - Expand dimension to match each day using broadcasting + days = list_days[np.newaxis, :] # shape (1, n_days) + + # Use broadcasting to create a boolean matrix indicating which files cover which days + mask = (files_start_time <= days) & (files_end_time >= days) # shape (n_files, n_days) + + # Build a mapping from days to file indices + # For each day (column), find the indices of files (rows) that cover that day + dict_days = {} + filepaths = np.array(filepaths) + for i, day in enumerate(list_days): + file_indices = np.where(mask[:, i])[0] + if file_indices.size > 0: + dict_days[str(day)] = filepaths[file_indices].tolist() + + return dict_days + + +def retrieve_possible_measurement_intervals(metadata): + """Retrieve list of possible measurements intervals.""" + measurement_interval = metadata.get("measurement_interval", []) + if isinstance(measurement_interval, (int, float, str)): + measurement_interval = [measurement_interval] + measurement_intervals = [int(v) for v in measurement_interval] + return measurement_intervals + + +def drop_timesteps_with_invalid_sample_interval(ds, measurement_intervals, verbose=True, logger=None): + """Drop timesteps with unexpected sample intervals.""" + # TODO + # - correct logged sample_interval for trailing seconds. Example (58,59,61,62) converted to 60 s ? + # - Need to know more how Parsivel software computes sample_interval variable ... + + # Retrieve logged sample_interval + sample_interval = ds["sample_interval"].compute().data + timesteps = ds["time"].compute().data + is_valid_sample_interval = np.isin(sample_interval.data, measurement_intervals) + indices_invalid_sample_interval = np.where(~is_valid_sample_interval)[0] + if len(indices_invalid_sample_interval) > 0: + # Log information for each invalid timestep + invalid_timesteps = pd.to_datetime(timesteps[indices_invalid_sample_interval]).strftime("%Y-%m-%d %H:%M:%S") + invalid_sample_intervals = sample_interval[indices_invalid_sample_interval] + for tt, ss in zip(invalid_timesteps, invalid_sample_intervals): + msg = f"Unexpected sampling interval ({ss} s) at {tt}. The measurement has been dropped." + log_warning(logger=logger, msg=msg, verbose=verbose) + # Remove timesteps with invalid sample intervals + indices_valid_sample_interval = np.where(is_valid_sample_interval)[0] + ds = ds.isel(time=indices_valid_sample_interval) + return ds + + +def split_dataset_by_sampling_intervals(ds, measurement_intervals, min_sample_interval=10, min_block_size=5): + """ + Split a dataset into subsets where each subset has a consistent sampling interval. + + Parameters + ---------- + ds : xarray.Dataset + The input dataset with a 'time' dimension. 
+ measurement_intervals : list or array-like + A list of possible primary sampling intervals (in seconds) that the dataset might have. + min_sample_interval : int, optional + The minimum expected sampling interval in seconds. Defaults to 10s. + min_block_size : float, optional + The minimum number of timesteps with a given sampling interval to be considered. + Otherwise such portion of data is discarded ! + Defaults to 5 timesteps. + + Returns + ------- + dict + A dictionary where keys are the identified sampling intervals (in seconds), + and values are xarray.Datasets containing only data from those intervals. + """ + # Define array of possible measurement intervals + measurement_intervals = np.array(measurement_intervals) + + # If a single measurement interval expected, return dictionary with input dataset + if len(measurement_intervals) == 1: + dict_ds = {measurement_intervals[0]: ds} + return dict_ds + + # Check sorted by time and sort if necessary + ds = ensure_sorted_by_time(ds) + + # Calculate time differences in seconds + deltadt = np.diff(ds["time"].data).astype("timedelta64[s]").astype(int) + + # Round each delta to the nearest multiple of 5 (because the smallest possible sample interval is 10 s) + # - This account for possible trailing seconds of the logger + # Example: for sample_interval = 10, deltat values like 8, 9, 11, 12 become 10 ... + # Example: for sample_interval = 10, deltat values like 6, 7 or 13, 14 become respectively 5 and 15 ... + # Example: for sample_interval = 30, deltat values like 28,29,30,31,32 deltat become 30 ... + # Example: for sample_interval = 30, deltat values like 26, 27 or 33, 34 become respectively 25 and 35 ... + min_half_sample_interval = min_sample_interval / 2 + deltadt = np.round(deltadt / min_half_sample_interval) * min_half_sample_interval + + # Map each delta to one of the possible_measurement_intervals if exact match, otherwise np.nan + mapped_intervals = np.where(np.isin(deltadt, measurement_intervals), deltadt, np.nan) + if np.all(np.isnan(mapped_intervals)): + raise ValueError("Impossible to identify timesteps with expected sampling intervals.") + + # Infill np.nan values by using neighbor intervals + # Forward fill + for i in range(1, len(mapped_intervals)): + if np.isnan(mapped_intervals[i]): + mapped_intervals[i] = mapped_intervals[i - 1] + + # Backward fill (in case the first entries were np.nan) + for i in range(len(mapped_intervals) - 2, -1, -1): + if np.isnan(mapped_intervals[i]): + mapped_intervals[i] = mapped_intervals[i + 1] + + # Now all intervals are assigned to one of the possible measurement_intervals. 
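+ # - Example: with measurement_intervals = [30, 60], mapped deltas [30, 30, nan, 60, 60]
+ #   become [30, 30, 30, 60, 60] after the forward/backward fill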
+ # Identify boundaries where interval changes
+ change_points = np.where(mapped_intervals[:-1] != mapped_intervals[1:])[0] + 1
+
+ # Split ds into segments according to change_points
+ segments = np.split(np.arange(ds.sizes["time"]), change_points)
+
+ # Remove segments with fewer than min_block_size timesteps
+ segments = [seg for seg in segments if len(seg) >= min_block_size]
+ if len(segments) == 0:
+ raise ValueError(
+ f"No blocks of {min_block_size} consecutive timesteps with constant sampling interval are available.",
+ )
+
+ # Define dataset indices for each sampling interval
+ dict_sampling_interval_indices = {}
+ for seg in segments:
+ # Define the assumed sampling interval of the segment
+ start_idx = seg[0]
+ segment_sampling_interval = int(mapped_intervals[start_idx])
+ if segment_sampling_interval not in dict_sampling_interval_indices:
+ dict_sampling_interval_indices[segment_sampling_interval] = [seg]
+ else:
+ dict_sampling_interval_indices[segment_sampling_interval].append(seg)
+ dict_sampling_interval_indices = {
+ k: np.concatenate(list_indices) for k, list_indices in dict_sampling_interval_indices.items()
+ }
+
+ # Define dictionary of datasets
+ dict_ds = {k: ds.isel(time=indices) for k, indices in dict_sampling_interval_indices.items()}
+ return dict_ds
+
+
+def has_same_value_over_time(da):
+ """
+ Check if a DataArray has the same value over all timesteps, considering NaNs as equal.
+
+ Parameters
+ ----------
+ da : xarray.DataArray
+ The DataArray to check. Must have a 'time' dimension.
+
+ Returns
+ -------
+ bool
+ True if the values are the same (or NaN in the same positions) across all timesteps,
+ False otherwise.
+ """
+ # Select the first timestep
+ da_first = da.isel(time=0)
+
+ # Create a boolean array that identifies where values are equal or both NaN
+ equal_or_nan = (da == da_first) | (da.isnull() & da_first.isnull()) # noqa: PD003
+
+ # Check if all values match this condition across all dimensions
+ return bool(equal_or_nan.all().item())
+
+
+def remove_duplicated_timesteps(ds, ensure_variables_equality=True, logger=None, verbose=True):
+ """Remove duplicated timesteps from an xarray dataset."""
+ # Check for duplicated timesteps
+ timesteps, counts = np.unique(ds["time"].data, return_counts=True)
+ duplicated_timesteps = timesteps[counts > 1]
+
+ # If no duplicated timesteps, return the dataset as is
+ if len(duplicated_timesteps) == 0:
+ return ds
+
+ # If there are duplicated timesteps
+ # - First check for variables equality
+ # - Keep first occurrence of duplicated timesteps if values are equal
+ # - Drop duplicated timesteps where values are different
+ different_duplicated_timesteps = []
+ equal_duplicated_timesteps = []
+ for t in duplicated_timesteps:
+ # Select dataset at given duplicated timestep
+ ds_duplicated = ds.sel(time=t)
+ n_t = len(ds_duplicated["time"])
+
+ # Check raw_drop_number equality
+ if not has_same_value_over_time(ds_duplicated["raw_drop_number"]):
+ different_duplicated_timesteps.append(t)
+ msg = (
+ f"Presence of {n_t} duplicated timesteps at {t}. "
+ "They have different 'raw_drop_number' values. These timesteps are dropped."
+ ) + log_warning(logger=logger, msg=msg, verbose=verbose) + + # Check other variables equality + other_variables_to_check = [v for v in ds.data_vars if v != "raw_drop_number"] + variables_with_different_values = [ + var for var in other_variables_to_check if not has_same_value_over_time(ds_duplicated[var]) + ] + if len(variables_with_different_values) > 0: + msg = ( + f"Presence of {n_t} duplicated timesteps at {t}." + f"The duplicated timesteps have different values in variables {variables_with_different_values}. " + ) + if ensure_variables_equality: + different_duplicated_timesteps.append(t) + msg = msg + "These timesteps are dropped." + else: + equal_duplicated_timesteps.append(t) + msg = msg + ( + "These timesteps are not dropped because 'raw_drop_number' values are equals." + "'ensure_variables_equality' is False." + ) + log_warning(logger=logger, msg=msg, verbose=verbose) + else: + equal_duplicated_timesteps.append(t) + + # Ensure single occurrence of duplicated timesteps + equal_duplicated_timesteps = np.unique(equal_duplicated_timesteps) + different_duplicated_timesteps = np.unique(different_duplicated_timesteps) + + # - Keep first occurrence of equal_duplicated_timesteps + if len(equal_duplicated_timesteps) > 0: + indices_to_drop = [np.where(ds["time"] == t)[0][1:] for t in equal_duplicated_timesteps] + indices_to_drop = np.concatenate(indices_to_drop) + # Keep only indices not in indices_to_drop + mask = ~np.isin(np.arange(ds["time"].size), indices_to_drop) + ds = ds.isel(time=np.where(mask)[0]) + + # - Drop different_duplicated_timesteps + if len(different_duplicated_timesteps) > 0: + mask = np.isin(ds["time"], different_duplicated_timesteps, invert=True) + ds = ds.isel(time=np.where(mask)[0]) + + return ds + + +def check_timesteps_regularity(ds, sample_interval, verbose=False, logger=None): + """Check for the regularity of timesteps.""" + # Check sorted by time and sort if necessary + ds = ensure_sorted_by_time(ds) + + # Calculate number of timesteps + n = len(ds["time"].data) + + # Calculate time differences in seconds + deltadt = np.diff(ds["time"].data).astype("timedelta64[s]").astype(int) + + # Identify unique time intervals and their occurrences + unique_deltadt, counts = np.unique(deltadt, return_counts=True) + + # Determine the most frequent time interval (mode) + most_frequent_deltadt_idx = np.argmax(counts) + most_frequent_deltadt = unique_deltadt[most_frequent_deltadt_idx] + + # Count fraction occurrence of deltadt + fractions = np.round(counts / len(deltadt) * 100, 2) + + # Compute stats about expected deltadt + sample_interval_counts = counts[unique_deltadt == sample_interval].item() + sample_interval_fraction = fractions[unique_deltadt == sample_interval].item() + + # Compute stats about most frequent deltadt + most_frequent_deltadt_counts = counts[unique_deltadt == most_frequent_deltadt].item() + most_frequent_deltadt_fraction = fractions[unique_deltadt == most_frequent_deltadt].item() + + # Compute stats about unexpected deltadt + unexpected_intervals = unique_deltadt[unique_deltadt != sample_interval] + unexpected_intervals_counts = counts[unique_deltadt != sample_interval] + unexpected_intervals_fractions = fractions[unique_deltadt != sample_interval] + frequent_unexpected_intervals = unexpected_intervals[unexpected_intervals_fractions > 5] + + # Report warning if the samplin_interval deltadt occurs less often than 60 % of times + # -> TODO: maybe only report in stations where the disdro does not log only data when rainy + if sample_interval_fraction < 60: + 
msg = (
+ f"The expected (sampling) interval between observations occurs only "
+ f"{sample_interval_counts}/{n} times ({sample_interval_fraction} %)."
+ )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+
+ # Report warning if a deltadt occurs more often than the sampling interval
+ if most_frequent_deltadt != sample_interval:
+ msg = (
+ f"The most frequent time interval between observations is {most_frequent_deltadt} s "
+ f"(occurs {most_frequent_deltadt_counts}/{n} times) ({most_frequent_deltadt_fraction}%) "
+ f"although the expected (sampling) interval is {sample_interval} s "
+ f"and occurs {sample_interval_counts}/{n} times ({sample_interval_fraction}%)."
+ )
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+
+ # Report with a warning all unexpected deltadt with frequency larger than 5 %
+ if len(frequent_unexpected_intervals) > 0:
+ msg = "The following unexpected time intervals between observations occur frequently: "
+ for interval in frequent_unexpected_intervals:
+ c = unexpected_intervals_counts[unexpected_intervals == interval].item()
+ f = unexpected_intervals_fractions[unexpected_intervals == interval].item()
+ msg = msg + f"{interval} s ({f}%) ({c}/{n}) "
+ log_warning(logger=logger, msg=msg, verbose=verbose)
+ return ds
+
+
+def finalize_l0c_dataset(ds, sample_interval, start_day, end_day, verbose=True, logger=None):
+ """Finalize an L0C dataset with a unique sampling interval.
+
+ It adds the sample_interval coordinate and regularizes
+ the timesteps for trailing seconds.
+ """
+ # Add sample interval as coordinate
+ ds = add_sample_interval(ds, sample_interval=sample_interval)
+
+ # Regularize timesteps (for trailing seconds)
+ ds = regularize_timesteps(
+ ds,
+ sample_interval=sample_interval,
+ robust=False, # if True, raise an error if an error occurs during regularization
+ add_quality_flag=True,
+ verbose=verbose,
+ logger=logger,
+ )
+
+ # Perform checks on timestep regularity
+ ds = check_timesteps_regularity(ds=ds, sample_interval=sample_interval, verbose=verbose, logger=logger)
+
+ # Slice for requested day
+ ds = ds.sel({"time": slice(start_day, end_day)})
+ return ds
+
+
+def create_daily_file(day, filepaths, measurement_intervals, ensure_variables_equality=True, logger=None, verbose=True):
+ """
+ Create a daily file by merging and processing data from multiple filepaths.
+
+ Parameters
+ ----------
+ day : str or numpy.datetime64
+ The day for which the daily file is to be created.
+ Should be in a format that can be converted to numpy.datetime64.
+ filepaths : list of str
+ List of filepaths to the data files to be processed.
+ measurement_intervals : list
+ List of expected measurement intervals (in seconds) for the station.
+
+ Returns
+ -------
+ dict
+ Dictionary mapping each sampling interval (in seconds) to the processed
+ xarray.Dataset for the specified day.
+
+ Raises
+ ------
+ ValueError
+ If fewer than 3 timesteps are available for the specified day.
+
+ Notes
+ -----
+ - The function adds a tolerance for searching timesteps
+ before and after 00:00 to account for imprecise logging times.
+ - It checks that duplicated timesteps have the same raw drop number values.
+ - The function infers the time integration sample interval and
+ regularizes timesteps to handle trailing seconds.
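+ - A separate dataset is returned for each sampling interval detected within the day.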
+ - The data is loaded into memory and connections to source files + are closed before returning the dataset. + """ + import xarray as xr # Load in each process when function is called ! + + # ---------------------------------------------------------------------------------------. + # Define start day and end of day + start_day = np.array(day).astype("M8[D]") + end_day = start_day + np.array(1, dtype="m8[D]") - np.array(1, dtype="m8[s]") # avoid 00:00 of next day ! + + # Add tolerance for searching timesteps before and after 00:00 to account for imprecise logging time + # - Example: timestep 23:59:30 that should be 00.00 goes into the next day ... + start_day_tol = start_day - np.array(TOLERANCE_SECONDS, dtype="m8[s]") + end_day_tol = end_day + np.array(TOLERANCE_SECONDS, dtype="m8[s]") + + # ---------------------------------------------------------------------------------------. + # Open files with data within the provided day and concatenate them + # list_ds = [xr.open_dataset(filepath, chunks={}).sel({"time": slice(start_day_tol, end_day_tol)}) + # for filepath in filepaths] + list_ds = [xr.open_dataset(filepath, chunks={}, cache=False).sortby("time") for filepath in filepaths] + list_ds = [ds.sel({"time": slice(start_day_tol, end_day_tol)}) for ds in list_ds] + if len(list_ds) > 1: + # Concatenate dataset + # - If some variable are missing in one file, it is filled with NaN. This should not occur anyway. + # - The resulting dataset can have duplicated timesteps ! + ds = xr.concat(list_ds, dim="time", join="outer", compat="no_conflicts", combine_attrs="override").sortby( + "time", + ) + else: + ds = list_ds[0] + + # Compute data + ds = ds.compute() + + # Close connection to source files + _ = [ds.close() for ds in list_ds] + ds.close() + del list_ds + + # ---------------------------------------------------------------------------------------. + # If sample interval is a dataset variable, drop timesteps with unexpected measurement intervals ! + if "sample_interval" in ds: + ds = drop_timesteps_with_invalid_sample_interval( + ds=ds, + measurement_intervals=measurement_intervals, + verbose=verbose, + logger=logger, + ) + + # ---------------------------------------------------------------------------------------. + # Remove duplicated timesteps + ds = remove_duplicated_timesteps( + ds, + ensure_variables_equality=ensure_variables_equality, + logger=logger, + verbose=verbose, + ) + + # Raise error if less than 3 timesteps left + n_timesteps = len(ds["time"]) + if n_timesteps < 3: + raise ValueError(f"{n_timesteps} timesteps left after removing duplicated timesteps.") + + # ---------------------------------------------------------------------------------------. + # Split dataset by sampling intervals + dict_ds = split_dataset_by_sampling_intervals( + ds=ds, + measurement_intervals=measurement_intervals, + min_sample_interval=10, + min_block_size=5, + ) + + # Log a warning if two sampling intervals are present within a given day + if len(dict_ds) > 1: + occuring_sampling_intervals = list(dict_ds) + msg = f"The dataset contains both sampling intervals {occuring_sampling_intervals}." + log_warning(logger=logger, msg=msg, verbose=verbose) + + # ---------------------------------------------------------------------------------------. 
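+ # At this point, dict_ds maps each detected sampling interval (in seconds) to the
+ # dataset subset recorded with that interval (typically a single entry per day)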
+ # Finalize L0C datasets + # - Add sample_interval coordinate + # - Regularize timesteps for trailing seconds + dict_ds = { + sample_interval: finalize_l0c_dataset( + ds=ds, + sample_interval=sample_interval, + start_day=start_day, + end_day=end_day, + verbose=verbose, + logger=logger, + ) + for sample_interval, ds in dict_ds.items() + } + return dict_ds + + +# ---------------------------------------------------------------------------------------. +#### DEPRECATED CODE + + +# def copy_l0b_to_l0c_directory(filepath): +# """Copy L0B file to L0C directory.""" +# import netCDF4 + +# # Copy file +# l0c_filepath = filepath.replace("L0B", "L0C") +# _ = shutil.copy(filepath, l0c_filepath) + +# # Edit DISDRODB product attribute +# with netCDF4.Dataset(l0c_filepath, mode="a") as nc_file: +# # Modify the global attribute +# nc_file.setncattr("disdrodb_product", "L0C") + +# def find_isel_common_time(da1, da2): +# """ +# Find the indices of common time steps between two data arrays. + +# Parameters +# ---------- +# da1 : xarray.DataArray +# The first data array with a time coordinate. +# da2 : xarray.DataArray +# The second data array with a time coordinate. + +# Returns +# ------- +# da1_isel : numpy.ndarray +# Indices of the common time steps in the first data array. +# da2_isel : numpy.ndarray +# Indices of the common time steps in the second data array. + +# Notes +# ----- +# This function assumes that both input data arrays have a "time" coordinate. +# The function finds the intersection of the time steps in both data arrays +# and returns the indices of these common time steps for each data array. +# """ +# intersecting_timesteps = np.intersect1d(da1["time"], da2["time"]) +# da1_isel = np.where(np.isin(da1["time"], intersecting_timesteps))[0] +# da2_isel = np.where(np.isin(da2["time"], intersecting_timesteps))[0] +# return da1_isel, da2_isel + + +# def check_same_raw_drop_number_values(list_ds, filepaths): +# """ +# Check if the 'raw_drop_number' values are the same across multiple datasets. + +# This function compares the 'raw_drop_number' values of multiple datasets to ensure they are identical +# at common timesteps. + +# If any discrepancies are found, a ValueError is raised indicating which files +# have differing values. + +# Parameters +# ---------- +# list_ds : list of xarray.Dataset +# A list of xarray Datasets to be compared. +# filepaths : list of str +# A list of file paths corresponding to the datasets in `list_ds`. + +# Raises +# ------ +# ValueError +# If 'raw_drop_number' values differ at any common timestep between any two datasets. 
+# """ +# # Retrieve variable to compare +# list_drop_number = [ds["raw_drop_number"].compute() for ds in list_ds] +# # Compare values +# combos = list(itertools.combinations(range(len(list_drop_number)), 2)) +# for i, j in combos: +# da1 = list_drop_number[i] +# da2 = list_drop_number[j] +# da1_isel, da2_isel = find_isel_common_time(da1=da1, da2=da2) +# if not np.all(da1.isel(time=da1_isel).data == da2.isel(time=da2_isel).data): +# file1 = filepaths[i] +# file2 = filepaths[i] +# msg = f"Duplicated timesteps have different values between file {file1} and {file2}" +# raise ValueError(msg) diff --git a/disdrodb/l0/readers/BRAZIL/CHUVA_LPM.py b/disdrodb/l0/readers/BRAZIL/CHUVA_LPM.py index 25f473c1..706e870e 100644 --- a/disdrodb/l0/readers/BRAZIL/CHUVA_LPM.py +++ b/disdrodb/l0/readers/BRAZIL/CHUVA_LPM.py @@ -161,6 +161,8 @@ def df_sanitizer_fun(df): df["time"] = df["sensor_date"] + "-" + df["sensor_time"] df["time"] = pd.to_datetime(df["time"], format="%d.%m.%y-%H:%M:%S", errors="coerce") + # TODO: correct time is unavailable yet ! + # Drop row if start_identifier different than 00 df = df[df["start_identifier"].astype(str) == "00"] diff --git a/disdrodb/l0/readers/BRAZIL/CHUVA_OTT.py b/disdrodb/l0/readers/BRAZIL/CHUVA_OTT.py index a42f29d2..eabbdfaa 100644 --- a/disdrodb/l0/readers/BRAZIL/CHUVA_OTT.py +++ b/disdrodb/l0/readers/BRAZIL/CHUVA_OTT.py @@ -71,18 +71,18 @@ def df_sanitizer_fun(df): df = df["TO_PARSE"].str.split(":", expand=True, n=1) df.columns = ["ID", "Value"] - # Drop rows with no values + # Select only rows with values df = df[df["Value"].astype(bool)] + df = df[df["Value"].apply(lambda x: x is not None)] - # Convert ID to integer - # - First convert to numeric and if errors arise (corrupted rows), drop rows - df["ID"] = pd.to_numeric(df["ID"], errors="coerce") - df = df.dropna(subset="ID") - df["ID"] = df["ID"].astype(int) + # Drop rows with invalid IDs + # - Corrupted rows + valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0") + df = df[df["ID"].astype(str).isin(valid_id_str)] # Create the dataframe with each row corresponding to a timestep # - Group rows based on when ID values restart - groups = df.groupby((df["ID"].diff() <= 0).cumsum()) + groups = df.groupby((df["ID"].astype(int).diff() <= 0).cumsum()) # - Reshape the dataframe group_dfs = [] diff --git a/disdrodb/l0/readers/BRAZIL/CHUVA_RD80.py b/disdrodb/l0/readers/BRAZIL/CHUVA_RD80.py index 55dca220..0c98c0c9 100644 --- a/disdrodb/l0/readers/BRAZIL/CHUVA_RD80.py +++ b/disdrodb/l0/readers/BRAZIL/CHUVA_RD80.py @@ -38,7 +38,7 @@ def reader( "date", "time", "sensor_status", - "interval", + "sample_interval", "n1", "n2", "n3", @@ -99,7 +99,7 @@ def df_sanitizer_fun(df): import pandas as pd # - Replace 'status' NaN with 0 - df["sensor_status"] = df["sensor_status"].fillna(0) + df["sensor_status"] = df["sensor_status"].astype(float).fillna(value=0).astype(int) # - Define 'time' datetime column df["time"] = df["date"].astype(str) + " " + df["time"].astype(str) diff --git a/disdrodb/l0/readers/BRAZIL/GOAMAZON_OTT.py b/disdrodb/l0/readers/BRAZIL/GOAMAZON_OTT.py index 3366c62e..0c88ae8e 100644 --- a/disdrodb/l0/readers/BRAZIL/GOAMAZON_OTT.py +++ b/disdrodb/l0/readers/BRAZIL/GOAMAZON_OTT.py @@ -71,18 +71,18 @@ def df_sanitizer_fun(df): df = df["TO_PARSE"].str.split(":", expand=True, n=1) df.columns = ["ID", "Value"] - # Drop rows with no values + # Select only rows with values df = df[df["Value"].astype(bool)] + df = df[df["Value"].apply(lambda x: x is not None)] - # Convert ID to integer - # - First 
convert to numeric and if errors arise (corrupted rows), drop rows - df["ID"] = pd.to_numeric(df["ID"], errors="coerce") - df = df.dropna(subset="ID") - df["ID"] = df["ID"].astype(int) + # Drop rows with invalid IDs + # - Corrupted rows + valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0") + df = df[df["ID"].astype(str).isin(valid_id_str)] # Create the dataframe with each row corresponding to a timestep # - Group rows based on when ID values restart - groups = df.groupby((df["ID"].diff() <= 0).cumsum()) + groups = df.groupby((df["ID"].astype(int).diff() <= 0).cumsum()) # - Reshape the dataframe group_dfs = [] diff --git a/disdrodb/l0/readers/BRAZIL/GOAMAZON_RD80.py b/disdrodb/l0/readers/BRAZIL/GOAMAZON_RD80.py index ce667dbb..0e5b222a 100644 --- a/disdrodb/l0/readers/BRAZIL/GOAMAZON_RD80.py +++ b/disdrodb/l0/readers/BRAZIL/GOAMAZON_RD80.py @@ -38,7 +38,7 @@ def reader( "date", "time", "sensor_status", - "interval", + "sample_interval", "n1", "n2", "n3", @@ -99,7 +99,7 @@ def df_sanitizer_fun(df): import pandas as pd # - Replace 'status' NaN with 0 - df["sensor_status"] = df["sensor_status"].fillna(0) + df["sensor_status"] = df["sensor_status"].astype(float).fillna(value=0).astype(int) # - Define 'time' datetime column df["time"] = df["date"].astype(str) + " " + df["time"].astype(str) diff --git a/disdrodb/l0/readers/EPFL/UNIL_2022.py b/disdrodb/l0/readers/EPFL/UNIL_2022.py index e380015a..cc2cec95 100644 --- a/disdrodb/l0/readers/EPFL/UNIL_2022.py +++ b/disdrodb/l0/readers/EPFL/UNIL_2022.py @@ -92,10 +92,15 @@ def df_sanitizer_fun(df): df["time"] = pd.to_datetime(df["time"], format="%d-%m-%Y %H:%M:%S", errors="coerce") # - Split TO_BE_SPLITTED columns + df_splitted = df["TO_BE_SPLITTED"].str.split(",", expand=True, n=1) df_splitted.columns = ["datalogger_voltage", "rainfall_rate_32bit"] df["rainfall_rate_32bit"] = df_splitted["rainfall_rate_32bit"] + # Remove rows with error in data reading + # - When datalogger error: rainfall_rate_32bit: Error in data reading! + df = df[df["rainfall_rate_32bit"] != "Error in data reading! 
0"] + # - Drop columns not agreeing with DISDRODB L0 standards columns_to_drop = [ "id", diff --git a/disdrodb/l0/readers/GPM/MC3E.py b/disdrodb/l0/readers/GPM/MC3E.py index 775d14f4..30156005 100644 --- a/disdrodb/l0/readers/GPM/MC3E.py +++ b/disdrodb/l0/readers/GPM/MC3E.py @@ -65,40 +65,104 @@ def reader( #### - Define dataframe sanitizer function for L0 processing def df_sanitizer_fun(df): # - Import pandas + import numpy as np import pandas as pd - # - Define 'time' datetime - df_time = pd.to_datetime(df["time"], format="%Y%m%d%H%M%S", errors="coerce") + # - Convert 'time' column to datetime + df["time"] = pd.to_datetime(df["time"], format="%Y%m%d%H%M%S", errors="coerce") - # - Split the 'TO_BE_SPLITTED' column - df = df["TO_BE_SPLITTED"].str.split(",", n=9, expand=True) + # Count number of delimiters in the column to be parsed + # --> Some first rows are corrupted, so count the most frequent occurrence + possible_delimiters, counts = np.unique(df["TO_BE_SPLITTED"].str.count(","), return_counts=True) + n_delimiters = possible_delimiters[np.argmax(counts)] - # - Assign column names - column_names = [ - "station_name", - "sensor_status", - "sensor_temperature", - "number_particles", - "rainfall_rate_32bit", - "reflectivity_16bit", - "mor_visibility", - "weather_code_synop_4680", - "weather_code_synop_4677", - "raw_drop_number", - ] - df.columns = column_names - - # - Add the time column - df["time"] = df_time + if n_delimiters == 1031: # first files + # - Select valid rows + df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1031] + # - Get time column + df_time = df["time"] + # - Split the 'TO_BE_SPLITTED' column + df = df["TO_BE_SPLITTED"].str.split(",", expand=True, n=7) + # - Assign column names + column_names = [ + "station_name", + "sensor_status", + "sensor_temperature", + "reflectivity_32bit", + "mor_visibility", + "weather_code_synop_4680", + "weather_code_synop_4677", + "raw_drop_number", + ] + df.columns = column_names + # - Add time column + df["time"] = df_time + # - Remove columns not in other files + df = df.drop(columns="reflectivity_32bit") + # - Add missing columns and set NaN value + missing_columns = [ + "number_particles", + "rainfall_rate_32bit", + "reflectivity_16bit", + ] + for column in missing_columns: + df[column] = "NaN" + elif n_delimiters == 1033: # (most of the files) + # - Select valid rows + df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1033] + # - Get time column + df_time = df["time"] + # - Split the column be parsed + df = df["TO_BE_SPLITTED"].str.split(",", expand=True, n=9) + # - Assign column names + column_names = [ + "station_name", + "sensor_status", + "sensor_temperature", + "number_particles", + "rainfall_rate_32bit", + "reflectivity_16bit", + "mor_visibility", + "weather_code_synop_4680", + "weather_code_synop_4677", + "raw_drop_number", + ] + df.columns = column_names + # - Add time column + df["time"] = df_time + elif n_delimiters == 1035: # APU 17 first files + # - Select valid rows + df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1035] + # - Get time column + df_time = df["time"] + # - Split the column be parsed + df = df["TO_BE_SPLITTED"].str.split(",", expand=True, n=11) + # - Assign column names + column_names = [ + "station_name", + "sensor_date", + "sensor_time", + "sensor_status", + "sensor_temperature", + "number_particles", + "rainfall_rate_32bit", + "reflectivity_16bit", + "mor_visibility", + "weather_code_synop_4680", + "weather_code_synop_4677", + "raw_drop_number", + ] + df.columns = column_names + # - Add time column 
+ df["time"] = df_time + # - Drop columns not needed + df = df.drop(columns=["sensor_time", "sensor_date"]) + else: + # Wrong number of delimiters ... likely a corrupted file + raise ValueError("Unexpected number of comma delimiters !") # - Drop columns not agreeing with DISDRODB L0 standards df = df.drop(columns=["station_name"]) - - # - Drop rows with invalid values - # --> Ensure that weather_code_synop_4677 has length 2 - # --> If a previous column is missing it will have 000 - df = df[df["weather_code_synop_4677"].str.len() == 2] - return df ##------------------------------------------------------------------------. diff --git a/disdrodb/l0/readers/GPM/NSSTC.py b/disdrodb/l0/readers/GPM/NSSTC.py index 7595ada7..908b1349 100644 --- a/disdrodb/l0/readers/GPM/NSSTC.py +++ b/disdrodb/l0/readers/GPM/NSSTC.py @@ -82,7 +82,7 @@ def df_sanitizer_fun(df): possible_delimiters, counts = np.unique(df["TO_BE_SPLITTED"].str.count(","), return_counts=True) n_delimiters = possible_delimiters[np.argmax(counts)] - if n_delimiters == 1027: + if n_delimiters == 1027: # APU 2010 # - Select valid rows df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1027] # - Get time column @@ -110,6 +110,37 @@ def df_sanitizer_fun(df): ] for column in missing_columns: df[column] = "NaN" + elif n_delimiters == 1031: # APU08 (2011) + # - Select valid rows + df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1031] + # - Get time column + df_time = df["time"] + # - Split the 'TO_BE_SPLITTED' column + df = df["TO_BE_SPLITTED"].str.split(",", expand=True, n=7) + # - Assign column names + column_names = [ + "station_name", + "sensor_status", + "sensor_temperature", + "reflectivity_32bit", + "mor_visibility", + "weather_code_synop_4680", + "weather_code_synop_4677", + "raw_drop_number", + ] + df.columns = column_names + # - Add time column + df["time"] = df_time + # - Remove columns not in other files + df = df.drop(columns="reflectivity_32bit") + # - Add missing columns and set NaN value + missing_columns = [ + "number_particles", + "rainfall_rate_32bit", + "reflectivity_16bit", + ] + for column in missing_columns: + df[column] = "NaN" elif n_delimiters == 1033: # - Select valid rows df = df.loc[df["TO_BE_SPLITTED"].str.count(",") == 1033] diff --git a/disdrodb/l0/readers/NCAR/RELAMPAGO_OTT.py b/disdrodb/l0/readers/NCAR/RELAMPAGO_OTT.py index faefb118..7e8d42cf 100644 --- a/disdrodb/l0/readers/NCAR/RELAMPAGO_OTT.py +++ b/disdrodb/l0/readers/NCAR/RELAMPAGO_OTT.py @@ -81,8 +81,14 @@ def df_sanitizer_fun(df): df = df["TO_PARSE"].str.split(":", expand=True, n=1) df.columns = ["ID", "Value"] - # Drop rows with no values + # Select only rows with values df = df[df["Value"].astype(bool)] + df = df[df["Value"].apply(lambda x: x is not None)] + + # Drop rows with invalid IDs + # - Corrupted rows + valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0") + df = df[df["ID"].astype(str).isin(valid_id_str)] # Create the dataframe with each row corresponding to a timestep # - Group rows based on when ID values restart diff --git a/disdrodb/l0/readers/NCAR/RELAMPAGO_RD80.py b/disdrodb/l0/readers/NCAR/RELAMPAGO_RD80.py index 55b0c8da..c33df841 100644 --- a/disdrodb/l0/readers/NCAR/RELAMPAGO_RD80.py +++ b/disdrodb/l0/readers/NCAR/RELAMPAGO_RD80.py @@ -38,7 +38,7 @@ def reader( "date", "time", "sensor_status", - "interval", + "sample_interval", "n1", "n2", "n3", @@ -99,7 +99,7 @@ def df_sanitizer_fun(df): import pandas as pd # - Replace 'status' NaN with 0 - df["sensor_status"] = df["sensor_status"].fillna(0) + 
df["sensor_status"] = df["sensor_status"].astype(float).fillna(value=0).astype(int) # - Replace all ',' with '.' in RI, RA, RAT df["RI"] = df["RI"].replace({",": "."}, regex=True) diff --git a/disdrodb/l0/readers/NCAR/SNOWIE_PJ.py b/disdrodb/l0/readers/NCAR/SNOWIE_PJ.py index 53b4fbc9..dc8607af 100644 --- a/disdrodb/l0/readers/NCAR/SNOWIE_PJ.py +++ b/disdrodb/l0/readers/NCAR/SNOWIE_PJ.py @@ -64,14 +64,21 @@ def reader( #### - Define dataframe sanitizer function for L0 processing def df_sanitizer_fun(df): # - Import pandas + import numpy as np import pandas as pd # Create ID and Value columns df = df["TO_PARSE"].str.split(":", expand=True, n=1) df.columns = ["ID", "Value"] - # Drop rows with no values + # Select only rows with values df = df[df["Value"].astype(bool)] + df = df[df["Value"].apply(lambda x: x is not None)] + + # Drop rows with invalid IDs + # - Corrupted rows + valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0") + df = df[df["ID"].astype(str).isin(valid_id_str)] # Create the dataframe with each row corresponding to a timestep # - Group rows based on when ID values restart diff --git a/disdrodb/l0/readers/NCAR/SNOWIE_SB.py b/disdrodb/l0/readers/NCAR/SNOWIE_SB.py index 223d4ffd..8d8330e4 100644 --- a/disdrodb/l0/readers/NCAR/SNOWIE_SB.py +++ b/disdrodb/l0/readers/NCAR/SNOWIE_SB.py @@ -71,14 +71,21 @@ def reader( #### - Define dataframe sanitizer function for L0 processing def df_sanitizer_fun(df): # - Import pandas + import numpy as np import pandas as pd # Create ID and Value columns df = df["TO_PARSE"].str.split(":", expand=True, n=1) df.columns = ["ID", "Value"] - # Drop rows with no values + # Select only rows with values df = df[df["Value"].astype(bool)] + df = df[df["Value"].apply(lambda x: x is not None)] + + # Drop rows with invalid IDs + # - Corrupted rows + valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0") + df = df[df["ID"].astype(str).isin(valid_id_str)] # Create the dataframe with each row corresponding to a timestep # - Group rows based on when ID values restart diff --git a/disdrodb/l0/readers/NCAR/VORTEX2_2010.py b/disdrodb/l0/readers/NCAR/VORTEX2_2010.py index a3d6752d..3d5fe922 100644 --- a/disdrodb/l0/readers/NCAR/VORTEX2_2010.py +++ b/disdrodb/l0/readers/NCAR/VORTEX2_2010.py @@ -82,8 +82,14 @@ def df_sanitizer_fun(df): df = df["TO_PARSE"].str.split(":", expand=True, n=1) df.columns = ["ID", "Value"] - # Drop rows with no values + # Select only rows with values df = df[df["Value"].astype(bool)] + df = df[df["Value"].apply(lambda x: x is not None)] + + # Drop rows with invalid IDs + # - Corrupted rows + valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0") + df = df[df["ID"].astype(str).isin(valid_id_str)] # Create the dataframe with each row corresponding to a timestep # - Group rows based on when ID values restart @@ -127,8 +133,8 @@ def df_sanitizer_fun(df): # "23": "station_number", "24": "rainfall_amount_absolute_32bit", "25": "error_code", - "30": "rainfall_rate_16_bit", - "31": "rainfall_rate_12_bit", + "30": "rainfall_rate_16bit", + "31": "rainfall_rate_12bit", "32": "rainfall_accumulated_16bit", "90": "raw_drop_concentration", "91": "raw_drop_average_velocity", diff --git a/disdrodb/l0/readers/NCAR/VORTEX2_2010_UF.py b/disdrodb/l0/readers/NCAR/VORTEX2_2010_UF.py index 16f99b25..f0cf2602 100644 --- a/disdrodb/l0/readers/NCAR/VORTEX2_2010_UF.py +++ b/disdrodb/l0/readers/NCAR/VORTEX2_2010_UF.py @@ -82,8 +82,18 @@ def df_sanitizer_fun(df): df = 
df["TO_PARSE"].str.split(":", expand=True, n=1) df.columns = ["ID", "Value"] - # Drop rows with no values + # Select only rows with values df = df[df["Value"].astype(bool)] + df = df[df["Value"].apply(lambda x: x is not None)] + + # Drop rows with invalid IDs + # - Corrupted rows + valid_id_str = np.char.rjust(np.arange(0, 94).astype(str), width=2, fillchar="0") + df = df[df["ID"].astype(str).isin(valid_id_str)] + + # Raise error if no more rows after removed corrupted ones + if len(df) == 0: + raise ValueError("No rows left after removing corrupted ones.") # Create the dataframe with each row corresponding to a timestep # - Group rows based on when ID values restart diff --git a/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_P2.py b/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_P2.py index c1e9e2c0..baef8ab5 100644 --- a/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_P2.py +++ b/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_P2.py @@ -97,6 +97,8 @@ def df_sanitizer_fun(df): # Preprocess the raw spectrum # - The 'ZERO' indicates no drops detected # --> "" generates an array of zeros in L0B processing + df["raw_drop_number"] = df["raw_drop_number"].astype("string") + df["raw_drop_number"] = df["raw_drop_number"].str.strip() df["raw_drop_number"] = df["raw_drop_number"].replace("ZERO", "") # Remove and " acronyms from the raw_drop_number field diff --git a/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_PIPS.py b/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_PIPS.py index 7814f66f..da0ad731 100644 --- a/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_PIPS.py +++ b/disdrodb/l0/readers/NCAR/VORTEX_SE_2016_PIPS.py @@ -144,7 +144,7 @@ def df_sanitizer_fun(df): df["longitude"] = df_lon # - Drop columns not agreeing with DISDRODB L0 standards - df = df.drop(columns=["serial_number", "sensor_time", "serial_number"]) + df = df.drop(columns=["serial_number", "sensor_time", "sensor_date", "serial_number"]) return df diff --git a/disdrodb/l0/readers/NETHERLANDS/DELFT.py b/disdrodb/l0/readers/NETHERLANDS/DELFT.py index 5fa5632a..fcd829cd 100644 --- a/disdrodb/l0/readers/NETHERLANDS/DELFT.py +++ b/disdrodb/l0/readers/NETHERLANDS/DELFT.py @@ -156,9 +156,7 @@ def df_sanitizer_fun(df): "station_name", "station_number", "sensor_serial_number", - "sample_interval", "sensor_serial_number", - # "epoch_time", # "number_particles_all_detected", ] df = df.drop(columns=columns_to_drop) diff --git a/disdrodb/l0/routines.py b/disdrodb/l0/routines.py deleted file mode 100644 index 8b137814..00000000 --- a/disdrodb/l0/routines.py +++ /dev/null @@ -1,760 +0,0 @@ -#!/usr/bin/env python3 - -# -----------------------------------------------------------------------------. -# Copyright (c) 2021-2023 DISDRODB developers -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -----------------------------------------------------------------------------. 
-"""Implement DISDRODB wrappers to launch L0 processing in the terminal.""" - -import datetime -import logging -import time -from typing import Optional - -import click - -from disdrodb.utils.logger import ( - # log_warning, - # log_error, - log_info, -) -from disdrodb.utils.scripts import _execute_cmd - -logger = logging.getLogger(__name__) - -####--------------------------------------------------------------------------. -#### CLIck - - -def click_l0_stations_options(function: object): - """Click command line options for DISDRODB archive L0 processing. - - Parameters - ---------- - function : object - Function. - """ - function = click.option( - "--data_sources", - type=str, - show_default=True, - default="", - help="DISDRODB data sources to process", - )(function) - function = click.option( - "--campaign_names", - type=str, - show_default=True, - default="", - help="DISDRODB campaign names to process", - )(function) - function = click.option( - "--station_names", - type=str, - show_default=True, - default="", - help="DISDRODB station names to process", - )(function) - return function - - -def click_l0_processing_options(function: object): - """Click command line default parameters for L0 processing options. - - Parameters - ---------- - function : object - Function. - """ - function = click.option( - "-p", - "--parallel", - type=bool, - show_default=True, - default=False, - help="Process files in parallel", - )(function) - function = click.option( - "-d", - "--debugging_mode", - type=bool, - show_default=True, - default=False, - help="Switch to debugging mode", - )(function) - function = click.option("-v", "--verbose", type=bool, show_default=True, default=True, help="Verbose")(function) - function = click.option( - "-f", - "--force", - type=bool, - show_default=True, - default=False, - help="Force overwriting", - )(function) - return function - - -def click_remove_l0a_option(function: object): - """Click command line argument for ``remove_l0a``.""" - function = click.option( - "--remove_l0a", - type=bool, - show_default=True, - default=False, - help="If true, remove the L0A files once the L0B processing is terminated.", - )(function) - return function - - -def click_l0_archive_options(function: object): - """Click command line arguments for L0 processing archiving of a station. - - Parameters - ---------- - function : object - Function. - """ - function = click.option( - "--l0b_concat", - type=bool, - show_default=True, - default=False, - help="Produce single L0B netCDF file.", - )(function) - function = click.option( - "--remove_l0b", - type=bool, - show_default=True, - default=False, - help="If true, remove all source L0B files once L0B concatenation is terminated.", - )(function) - function = click.option( - "--remove_l0a", - type=bool, - show_default=True, - default=False, - help="If true, remove the L0A files once the L0B processing is terminated.", - )(function) - function = click.option( - "-l0b", - "--l0b_processing", - type=bool, - show_default=True, - default=True, - help="Perform L0B processing.", - )(function) - function = click.option( - "-l0a", - "--l0a_processing", - type=bool, - show_default=True, - default=True, - help="Perform L0A processing.", - )(function) - return function - - -def click_l0b_concat_options(function: object): - """Click command line default parameters for L0B concatenation. - - Parameters - ---------- - function : object - Function. 
- """ - function = click.option( - "--remove_l0b", - type=bool, - show_default=True, - default=False, - help="If true, remove all source L0B files once L0B concatenation is terminated.", - )(function) - function = click.option("-v", "--verbose", type=bool, show_default=True, default=False, help="Verbose")(function) - return function - - -####--------------------------------------------------------------------------. -#### Run L0A and L0B Station processing - - -def run_disdrodb_l0a_station( - # Station arguments - data_source, - campaign_name, - station_name, - # Processing options - force: bool = False, - verbose: bool = False, - debugging_mode: bool = False, - parallel: bool = True, - base_dir: Optional[str] = None, -): - """Run the L0A processing of a station calling the disdrodb_l0a_station in the terminal.""" - # Define command - cmd = " ".join( - [ - "disdrodb_run_l0a_station", - # Station arguments - data_source, - campaign_name, - station_name, - # Processing options - "--force", - str(force), - "--verbose", - str(verbose), - "--debugging_mode", - str(debugging_mode), - "--parallel", - str(parallel), - "--base_dir", - str(base_dir), - ], - ) - # Execute command - _execute_cmd(cmd) - - -def run_disdrodb_l0b_station( - # Station arguments - data_source, - campaign_name, - station_name, - # Processing options - force: bool = False, - verbose: bool = False, - debugging_mode: bool = False, - parallel: bool = True, - base_dir: Optional[str] = None, - remove_l0a: bool = False, -): - """Run the L0B processing of a station calling disdrodb_run_l0b_station in the terminal.""" - # Define command - cmd = " ".join( - [ - "disdrodb_run_l0b_station", - # Station arguments - data_source, - campaign_name, - station_name, - # Processing options - "--force", - str(force), - "--verbose", - str(verbose), - "--debugging_mode", - str(debugging_mode), - "--parallel", - str(parallel), - "--remove_l0a", - str(remove_l0a), - "--base_dir", - str(base_dir), - ], - ) - # Execute command - _execute_cmd(cmd) - - -def run_disdrodb_l0b_concat_station( - data_source, - campaign_name, - station_name, - remove_l0b=False, - verbose=False, - base_dir=None, -): - """Concatenate the L0B files of a single DISDRODB station. - - This function runs the ``disdrodb_run_l0b_concat_station`` script in the terminal. - """ - cmd = " ".join( - [ - "disdrodb_run_l0b_concat_station", - data_source, - campaign_name, - station_name, - "--remove_l0b", - str(remove_l0b), - "--verbose", - str(verbose), - "--base_dir", - str(base_dir), - ], - ) - _execute_cmd(cmd) - - -####--------------------------------------------------------------------------. -#### Run L0 Station processing (L0A + L0B) - - -def run_disdrodb_l0_station( - data_source, - campaign_name, - station_name, - # L0 archive options - l0a_processing: bool = True, - l0b_processing: bool = True, - l0b_concat: bool = False, - remove_l0a: bool = False, - remove_l0b: bool = False, - # Processing options - force: bool = False, - verbose: bool = False, - debugging_mode: bool = False, - parallel: bool = True, - base_dir: Optional[str] = None, -): - """Run the L0 processing of a specific DISDRODB station from the terminal. - - Parameters - ---------- - data_source : str - Institution name (when campaign data spans more than 1 country), - or country (when all campaigns (or sensor networks) are inside a given country). - Must be UPPER CASE. - campaign_name : str - Campaign name. Must be UPPER CASE. 
- station_name : str - Station name - l0a_processing : bool - Whether to launch processing to generate L0A Apache Parquet file(s) from raw data. - The default is ``True``. - l0b_processing : bool - Whether to launch processing to generate L0B netCDF4 file(s) from L0A data. - The default is ``True``. - l0b_concat : bool - Whether to concatenate all raw files into a single L0B netCDF file. - If ``l0b_concat=True``, all raw files will be saved into a single L0B netCDF file. - If ``l0b_concat=False``, each raw file will be converted into the corresponding L0B netCDF file. - The default is ``False``. - remove_l0a : bool - Whether to keep the L0A files after having generated the L0B netCDF products. - The default is ``False``. - remove_l0b : bool - Whether to remove the L0B files after having concatenated all L0B netCDF files. - It takes places only if ``l0b_concat=True``. - The default is ``False``. - force : bool - If ``True``, overwrite existing data into destination directories. - If ``False``, raise an error if there are already data into destination directories. - The default is ``False``. - verbose : bool - Whether to print detailed processing information into terminal. - The default is ``True``. - parallel : bool - If ``True``, the files are processed simultaneously in multiple processes. - Each process will use a single thread to avoid issues with the HDF/netCDF library. - By default, the number of process is defined with ``os.cpu_count()``. - If ``False``, the files are processed sequentially in a single process. - If ``False``, multi-threading is automatically exploited to speed up I/0 tasks. - debugging_mode : bool - If ``True``, it reduces the amount of data to process. - For L0A, it processes just the first 3 raw data files for each station. - For L0B, it processes just the first 100 rows of 3 L0A files for each station. - The default is ``False``. - base_dir : str (optional) - Base directory of DISDRODB. Format: ``<...>/DISDRODB``. - If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. - """ - # ---------------------------------------------------------------------. - t_i = time.time() - msg = f"L0 processing of station {station_name} has started." - log_info(logger=logger, msg=msg, verbose=verbose) - - # ------------------------------------------------------------------. - # L0A processing - if l0a_processing: - run_disdrodb_l0a_station( - # Station arguments - base_dir=base_dir, - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name, - # Processing options - force=force, - verbose=verbose, - debugging_mode=debugging_mode, - parallel=parallel, - ) - # ------------------------------------------------------------------. - # L0B processing - if l0b_processing: - run_disdrodb_l0b_station( - # Station arguments - base_dir=base_dir, - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name, - # Processing options - force=force, - verbose=verbose, - debugging_mode=debugging_mode, - parallel=parallel, - remove_l0a=remove_l0a, - ) - - # ------------------------------------------------------------------------. - # If l0b_concat=True, concat the netCDF in a single file - if l0b_concat: - run_disdrodb_l0b_concat_station( - base_dir=base_dir, - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name, - remove_l0b=remove_l0b, - verbose=verbose, - ) - - # -------------------------------------------------------------------------. 
- # End of L0 processing for all stations - timedelta_str = str(datetime.timedelta(seconds=time.time() - t_i)) - msg = f"L0 processing of stations {station_name} completed in {timedelta_str}" - log_info(logger, msg, verbose) - - -####---------------------------------------------------------------------------. -#### Run L0 Archive processing - - -def _check_available_stations(list_info): - # If no stations available, raise an error - if len(list_info) == 0: - msg = "No stations available given the provided `data_sources` and `campaign_names` arguments !" - raise ValueError(msg) - - -def _filter_list_info(list_info, station_names): - # Filter by provided stations - if station_names is not None: - list_info = [info for info in list_info if info[2] in station_names] - # If nothing left, raise an error - if len(list_info) == 0: - raise ValueError("No stations available given the provided `station_names` argument !") - return list_info - - -def _get_starting_product(l0a_processing, l0b_processing): - if l0a_processing: - product = "RAW" - elif l0b_processing: - product = "L0A" - else: - raise ValueError("At least l0a_processing or l0b_processing must be `True`.") - return product - - -def run_disdrodb_l0( - data_sources=None, - campaign_names=None, - station_names=None, - # L0 archive options - l0a_processing: bool = True, - l0b_processing: bool = True, - l0b_concat: bool = False, - remove_l0a: bool = False, - remove_l0b: bool = False, - # Processing options - force: bool = False, - verbose: bool = False, - debugging_mode: bool = False, - parallel: bool = True, - base_dir: Optional[str] = None, -): - """Run the L0 processing of DISDRODB stations. - - This function allows to launch the processing of many DISDRODB stations with a single command. - From the list of all available DISDRODB stations, it runs the processing of the - stations matching the provided data_sources, campaign_names and station_names. - - Parameters - ---------- - data_sources : list - Name of data source(s) to process. - The name(s) must be UPPER CASE. - If campaign_names and station are not specified, process all stations. - The default is ``None``. - campaign_names : list - Name of the campaign(s) to process. - The name(s) must be UPPER CASE. - The default is ``None``. - station_names : list - Station names to process. - The default is ``None``. - l0a_processing : bool - Whether to launch processing to generate L0A Apache Parquet file(s) from raw data. - The default is ``True``. - l0b_processing : bool - Whether to launch processing to generate L0B netCDF4 file(s) from L0A data. - The default is ``True``. - l0b_concat : bool - Whether to concatenate all raw files into a single L0B netCDF file. - If ``l0b_concat=True``, all raw files will be saved into a single L0B netCDF file. - If ``l0b_concat=False``, each raw file will be converted into the corresponding L0B netCDF file. - The default is ``False``. - remove_l0a : bool - Whether to keep the L0A files after having generated the L0B netCDF products. - The default is ``False``. - remove_l0b : bool - Whether to remove the L0B files after having concatenated all L0B netCDF files. - It takes places only if ``l0b_concat = True``. - The default is ``False``. - force : bool - If ``True``, overwrite existing data into destination directories. - If ``False``, raise an error if there are already data into destination directories. - The default is ``False``. - verbose : bool - Whether to print detailed processing information into terminal. - The default is ``False``. 
- parallel : bool - If ``True``, the files are processed simultaneously in multiple processes. - Each process will use a single thread to avoid issues with the HDF/netCDF library. - By default, the number of process is defined with ``os.cpu_count()``. - If ``False``, the files are processed sequentially in a single process. - If ``False``, multi-threading is automatically exploited to speed up I/0 tasks. - debugging_mode : bool - If ``True``, it reduces the amount of data to process. - For L0A, it processes just the first 3 raw data files. - For L0B, it processes just the first 100 rows of 3 L0A files. - The default is ``False``. - base_dir : str (optional) - Base directory of DISDRODB. Format: ``<...>/DISDRODB``. - If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. - """ - from disdrodb.api.io import available_stations - - # Get list of available stations - product = _get_starting_product(l0a_processing=l0a_processing, l0b_processing=l0b_processing) - list_info = available_stations( - base_dir=base_dir, - product=product, - data_sources=data_sources, - campaign_names=campaign_names, - ) - _check_available_stations(list_info) - list_info = _filter_list_info(list_info, station_names) - - # Print message - n_stations = len(list_info) - print(f"L0 processing of {n_stations} stations started.") - - # Loop over stations - for data_source, campaign_name, station_name in list_info: - print(f"L0 processing of {data_source} {campaign_name} {station_name} station started.") - # Run processing - run_disdrodb_l0_station( - base_dir=base_dir, - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name, - # L0 archive options - l0a_processing=l0a_processing, - l0b_processing=l0b_processing, - l0b_concat=l0b_concat, - remove_l0a=remove_l0a, - remove_l0b=remove_l0b, - # Process options - force=force, - verbose=verbose, - debugging_mode=debugging_mode, - parallel=parallel, - ) - print(f"L0 processing of {data_source} {campaign_name} {station_name} station ended.") - - -def run_disdrodb_l0a( - data_sources=None, - campaign_names=None, - station_names=None, - # Processing options - force: bool = False, - verbose: bool = False, - debugging_mode: bool = False, - parallel: bool = True, - base_dir: Optional[str] = None, -): - """Run the L0A processing of DISDRODB stations. - - This function allows to launch the processing of many DISDRODB stations with a single command. - From the list of all available DISDRODB stations, it runs the processing of the - stations matching the provided data_sources, campaign_names and station_names. - - Parameters - ---------- - data_sources : list - Name of data source(s) to process. - The name(s) must be UPPER CASE. - If campaign_names and station are not specified, process all stations. - The default is ``None``. - campaign_names : list - Name of the campaign(s) to process. - The name(s) must be UPPER CASE. - The default is ``None``. - station_names : list - Station names to process. - The default is ``None``. - force : bool - If ``True``, overwrite existing data into destination directories. - If ``False``, raise an error if there are already data into destination directories. - The default is ``False``. - verbose : bool - Whether to print detailed processing information into terminal. - The default is ``True``. - parallel : bool - If ``True``, the files are processed simultaneously in multiple processes. - By default, the number of process is defined with ``os.cpu_count()``. 
- If ``False``, the files are processed sequentially in a single process. - debugging_mode : bool - If ``True``, it reduces the amount of data to process. - For L0A, it processes just the first 3 raw data files. - The default is ``False``. - base_dir : str (optional) - Base directory of DISDRODB. Format: ``<...>/DISDRODB``. - If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. - """ - run_disdrodb_l0( - base_dir=base_dir, - data_sources=data_sources, - campaign_names=campaign_names, - station_names=station_names, - # L0 archive options - l0a_processing=True, - l0b_processing=False, - l0b_concat=False, - remove_l0a=False, - remove_l0b=False, - # Processing options - force=force, - verbose=verbose, - debugging_mode=debugging_mode, - parallel=parallel, - ) - - -def run_disdrodb_l0b( - data_sources=None, - campaign_names=None, - station_names=None, - # Processing options - force: bool = False, - verbose: bool = False, - debugging_mode: bool = False, - parallel: bool = True, - base_dir: Optional[str] = None, - remove_l0a: bool = False, -): - """Run the L0B processing of DISDRODB stations. - - This function allows to launch the processing of many DISDRODB stations with a single command. - From the list of all available DISDRODB L0A stations, it runs the processing of the - stations matching the provided data_sources, campaign_names and station_names. - - Parameters - ---------- - data_sources : list - Name of data source(s) to process. - The name(s) must be UPPER CASE. - If campaign_names and station are not specified, process all stations. - The default is ``None``. - campaign_names : list - Name of the campaign(s) to process. - The name(s) must be UPPER CASE. - The default is ``None``. - station_names : list - Station names to process. - The default is ``None``. - force : bool - If ``True``, overwrite existing data into destination directories. - If ``False``, raise an error if there are already data into destination directories. - The default is ``False``. - verbose : bool - Whether to print detailed processing information into terminal. - The default is ``True``. - parallel : bool - If ``True``, the files are processed simultaneously in multiple processes. - By default, the number of process is defined with ``os.cpu_count()``. - If ``False``, the files are processed sequentially in a single process. - debugging_mode : bool - If ``True``, it reduces the amount of data to process. - For L0B, it processes just the first 100 rows of 3 L0A files. - The default is ``False``. - base_dir : str (optional) - Base directory of DISDRODB. Format: ``<...>/DISDRODB``. - If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. - """ - run_disdrodb_l0( - base_dir=base_dir, - data_sources=data_sources, - campaign_names=campaign_names, - station_names=station_names, - # L0 archive options - l0a_processing=False, - l0b_processing=True, - l0b_concat=False, - remove_l0a=remove_l0a, - remove_l0b=False, - # Processing options - force=force, - verbose=verbose, - debugging_mode=debugging_mode, - parallel=parallel, - ) - - -####---------------------------------------------------------------------------. -def run_disdrodb_l0b_concat( - data_sources=None, - campaign_names=None, - station_names=None, - remove_l0b=False, - verbose=False, - base_dir=None, -): - """Concatenate the L0B files of the DISDRODB archive. - - This function is called by the ``disdrodb_run_l0b_concat`` script. 
- """ - from disdrodb.api.io import available_stations - - list_info = available_stations( - base_dir=base_dir, - product="L0B", - data_sources=data_sources, - campaign_names=campaign_names, - ) - - _check_available_stations(list_info) - list_info = _filter_list_info(list_info, station_names) - - # Print message - n_stations = len(list_info) - print(f"Concatenation of {n_stations} L0B stations started.") - - # Start the loop to launch the concatenation of each station - for data_source, campaign_name, station_name in list_info: - print(f"L0B files concatenation of {data_source} {campaign_name} {station_name} station started.") - run_disdrodb_l0b_concat_station( - base_dir=base_dir, - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name, - remove_l0b=remove_l0b, - verbose=verbose, - ) - print(f"L0 files concatenation of {data_source} {campaign_name} {station_name} station ended.") - - -####---------------------------------------------------------------------------. diff --git a/disdrodb/l0/standards.py b/disdrodb/l0/standards.py index 2c563abb..4d4390cf 100644 --- a/disdrodb/l0/standards.py +++ b/disdrodb/l0/standards.py @@ -18,8 +18,6 @@ # -----------------------------------------------------------------------------. """Retrieve L0 sensor standards.""" -import datetime -import importlib import logging import numpy as np @@ -29,11 +27,6 @@ logger = logging.getLogger(__name__) -PRODUCT_VERSION = "V0" -SOFTWARE_VERSION = "V" + importlib.metadata.version("disdrodb") -CONVENTIONS = "CF-1.10, ACDD-1.3" -EPOCH = "seconds since 1970-01-01 00:00:00" - ####--------------------------------------------------------------------------. #### Variables validity dictionary @@ -252,150 +245,6 @@ def get_l0b_cf_attrs_dict(sensor_name: str) -> dict: return read_config_file(sensor_name=sensor_name, product="L0A", filename="l0b_cf_attrs.yml") -####-------------------------------------------------------------------------. 
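# Editorial example (not part of the patch): retrieving the CF attributes that
# disdrodb/l0/standards.py defines for the L0B variables of a sensor, via the helper shown in the
# hunk context above. The sensor name below is an assumption; valid names depend on the
# configuration files shipped with DISDRODB.
from disdrodb.l0.standards import get_l0b_cf_attrs_dict

cf_attrs = get_l0b_cf_attrs_dict(sensor_name="OTT_Parsivel")  # assumed sensor name
# Expected to map L0B variable names to their CF attribute dictionaries.
print(list(cf_attrs)[:5])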
-#### Coordinates attributes - - -def get_coords_attrs_dict(): - """Return dictionary with DISDRODB coordinates attributes.""" - attrs_dict = {} - # Define diameter attributes - attrs_dict["diameter_bin_center"] = { - "name": "diameter_bin_center", - "standard_name": "diameter_bin_center", - "long_name": "diameter_bin_center", - "units": "mm", - "description": "Bin center drop diameter value", - } - attrs_dict["diameter_bin_width"] = { - "name": "diameter_bin_width", - "standard_name": "diameter_bin_width", - "long_name": "diameter_bin_width", - "units": "mm", - "description": "Drop diameter bin width", - } - attrs_dict["diameter_bin_upper"] = { - "name": "diameter_bin_upper", - "standard_name": "diameter_bin_upper", - "long_name": "diameter_bin_upper", - "units": "mm", - "description": "Bin upper bound drop diameter value", - } - attrs_dict["velocity_bin_lower"] = { - "name": "velocity_bin_lower", - "standard_name": "velocity_bin_lower", - "long_name": "velocity_bin_lower", - "units": "mm", - "description": "Bin lower bound drop diameter value", - } - # Define velocity attributes - attrs_dict["velocity_bin_center"] = { - "name": "velocity_bin_center", - "standard_name": "velocity_bin_center", - "long_name": "velocity_bin_center", - "units": "m/s", - "description": "Bin center drop fall velocity value", - } - attrs_dict["velocity_bin_width"] = { - "name": "velocity_bin_width", - "standard_name": "velocity_bin_width", - "long_name": "velocity_bin_width", - "units": "m/s", - "description": "Drop fall velocity bin width", - } - attrs_dict["velocity_bin_upper"] = { - "name": "velocity_bin_upper", - "standard_name": "velocity_bin_upper", - "long_name": "velocity_bin_upper", - "units": "m/s", - "description": "Bin upper bound drop fall velocity value", - } - attrs_dict["velocity_bin_lower"] = { - "name": "velocity_bin_lower", - "standard_name": "velocity_bin_lower", - "long_name": "velocity_bin_lower", - "units": "m/s", - "description": "Bin lower bound drop fall velocity value", - } - # Define geolocation attributes - attrs_dict["latitude"] = { - "name": "latitude", - "standard_name": "latitude", - "long_name": "Latitude", - "units": "degrees_north", - } - attrs_dict["longitude"] = { - "name": "longitude", - "standard_name": "longitude", - "long_name": "Longitude", - "units": "degrees_east", - } - attrs_dict["altitude"] = { - "name": "altitude", - "standard_name": "altitude", - "long_name": "Altitude", - "units": "m", - "description": "Elevation above sea level", - } - # Define time attributes - attrs_dict["time"] = { - "name": "time", - "standard_name": "time", - "long_name": "time", - "description": "UTC Time", - } - - return attrs_dict - - -####-------------------------------------------------------------------------. -#### DISDRODB attributes - - -def set_disdrodb_attrs(ds, product: str): - """Add DISDRODB processing information to the netCDF global attributes. - - It assumes stations metadata are already added the dataset. - - Parameters - ---------- - ds : xarray.Dataset - Dataset - product: str - DISDRODB product. - - Returns - ------- - xarray dataset - Dataset. 
- """ - # Add dataset conventions - ds.attrs["Conventions"] = CONVENTIONS - - # Add featureType - platform_type = ds.attrs["platform_type"] - if platform_type == "fixed": - ds.attrs["featureType"] = "timeSeries" - else: - ds.attrs["featureType"] = "trajectory" - - # Add time_coverage_start and time_coverage_end - ds.attrs["time_coverage_start"] = str(ds["time"].data[0]) - ds.attrs["time_coverage_end"] = str(ds["time"].data[-1]) - - # DISDRODDB attributes - # - Add DISDRODB processing info - now = datetime.datetime.utcnow() - current_time = now.strftime("%Y-%m-%d %H:%M:%S") - ds.attrs["disdrodb_processing_date"] = current_time - # - Add DISDRODB product and version - ds.attrs["disdrodb_product_version"] = PRODUCT_VERSION - ds.attrs["disdrodb_software_version"] = SOFTWARE_VERSION - ds.attrs["disdrodb_product"] = product - - return ds - - ####-------------------------------------------------------------------------. #### Bin Coordinates Information @@ -762,20 +611,6 @@ def get_l0b_encodings_dict(sensor_name: str) -> dict: return encoding_dict -def get_time_encoding() -> dict: - """Create time encoding. - - Returns - ------- - dict - Time encoding. - """ - encoding = {} - encoding["units"] = EPOCH - encoding["calendar"] = "proleptic_gregorian" - return encoding - - ####-------------------------------------------------------------------------. #### L0B processing tools diff --git a/disdrodb/l0/template_tools.py b/disdrodb/l0/template_tools.py index 4cb8de19..9fb51b54 100644 --- a/disdrodb/l0/template_tools.py +++ b/disdrodb/l0/template_tools.py @@ -194,7 +194,7 @@ def print_df_summary_stats( # Define columns of interest _, columns_of_interest = _get_selected_column_names(df, column_indices) # Remove columns of dtype object or string - indices_to_remove = np.where((df.dtypes == type(object)) | (df.dtypes == str)) + indices_to_remove = np.where((df.dtypes == type(object)) | (df.dtypes == str)) # noqa indices = np.arange(0, len(df.columns)) indices = indices[np.isin(indices, indices_to_remove, invert=True)] columns = df.columns[indices] @@ -325,9 +325,7 @@ def str_has_decimal_digits(string: str) -> bool: bool True if string has digits. """ - if len(string.split(".")) == 2: - return True - return False + return len(string.split(".")) == 2 def get_decimal_ndigits(string: str) -> int: diff --git a/disdrodb/l1/__init__.py b/disdrodb/l1/__init__.py new file mode 100644 index 00000000..3bba3aaf --- /dev/null +++ b/disdrodb/l1/__init__.py @@ -0,0 +1,17 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. 
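# Editorial example (not part of the patch): behaviour of the simplified str_has_decimal_digits
# helper from disdrodb/l0/template_tools.py shown in the hunk above. It only checks that the
# string splits into exactly two parts around ".", so values with several dots are rejected too.
from disdrodb.l0.template_tools import str_has_decimal_digits

assert str_has_decimal_digits("2.5")        # one "." -> True
assert not str_has_decimal_digits("2")      # no "." -> False
assert not str_has_decimal_digits("1.2.3")  # more than one "." -> False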
+"""DISDRODB L1 module.""" diff --git a/disdrodb/l1/beard_model.py b/disdrodb/l1/beard_model.py new file mode 100644 index 00000000..1e25ff38 --- /dev/null +++ b/disdrodb/l1/beard_model.py @@ -0,0 +1,716 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Utilities to estimate the drop fall velocity using the Beard model.""" + + +import numpy as np +import xarray as xr + + +def get_gravitational_acceleration(latitude, altitude=0): + """ + Computes gravitational acceleration at a given altitude and latitude. + + Parameters + ---------- + altitude : float + Altitude in meters. The default is 0 m (sea level). + latitude : float + Latitude in degrees. + + Returns + ------- + float + Gravitational acceleration in m/s^2. + """ + g0 = 9.806229 - 0.025889372 * np.cos(2 * np.deg2rad(latitude)) + return g0 - 2.879513 * altitude / 1e6 + + +def get_air_pressure_at_height( + altitude, + latitude, + temperature, + sea_level_air_pressure=101_325, + lapse_rate=0.0065, + gas_constant_dry_air=287.04, +): + """ + Computes the air pressure at a given height in a standard atmosphere. + + According to the hypsometric formula of Brutsaert 1982; Ulaby et al. 1981 + + Parameters + ---------- + altitude : float + Altitude in meters. + latitude : float + Latitude in degrees. + temperature : float + Temperature at altitude in Kelvin. + sea_level_air_pressure : float, optional + Standard atmospheric pressure at sea level in Pascals. The default is 101_325 Pascals. + lapse_rate : float, optional + Standard atmospheric lapse rate in K/m. The default is 0.0065 K/m. + gas_constant_dry_air : float, optional + Gas constant for dry air in J/(kg*K). The default is 287.04 J/(kg*K). + + Returns + ------- + float + Air pressure in Pascals. + """ + g = get_gravitational_acceleration(altitude=altitude, latitude=latitude) + return sea_level_air_pressure * np.exp( + -g / (lapse_rate * gas_constant_dry_air) * np.log(1 + lapse_rate * altitude / temperature), + ) + + +def get_air_temperature_at_height(altitude, sea_level_temperature, lapse_rate=0.0065): + """ + Computes the air temperature at a given height in a standard atmosphere. + + Reference: Brutsaert 1982; Ulaby et al. 1981 + + Parameters + ---------- + altitude : float + Altitude in meters. + sea_level_temperature : float + Standard temperature at sea level in Kelvin. + lapse_rate : float, optional + Standard atmospheric lapse rate in K/m. The default is 0.0065 K/m. + + Returns + ------- + float + Air temperature in Kelvin. 
+ """ + return sea_level_temperature - lapse_rate * altitude + + +def get_vapor_actual_pressure_at_height( + altitude, + sea_level_temperature, + sea_level_relative_humidity, + sea_level_air_pressure=101_325, + lapse_rate=0.0065, +): + """ + Computes the vapor pressure using Yamamoto's exponential relationship. + + Reference: Brutsaert 1982 + + Parameters + ---------- + altitude : float + Altitude in meters. + sea_level_temperature : float + Standard temperature at sea level in Kelvin. + sea_level_relative_humidity : float + Relative humidity at sea level. A value between 0 and 1. + sea_level_air_pressure : float, optional + Standard atmospheric pressure at sea level in Pascals. The default is 101_325 Pascals. + lapse_rate : float, optional + Standard atmospheric lapse rate in K/m. The default is 0.0065 K/m. + + Returns + ------- + float + Vapor pressure in Pascals. + """ + temperature_at_altitude = get_air_temperature_at_height( + altitude=altitude, + sea_level_temperature=sea_level_temperature, + lapse_rate=lapse_rate, + ) + esat = get_vapor_saturation_pressure(sea_level_temperature) + actual_vapor = sea_level_relative_humidity / (1 / esat - (1 - sea_level_relative_humidity) / sea_level_air_pressure) + return actual_vapor * np.exp(-(5.8e3 * lapse_rate / (temperature_at_altitude**2) + 5.5e-5) * altitude) + + +def get_vapor_saturation_pressure(temperature): + """ + Computes the saturation vapor pressure over water as a function of temperature. + + Use formulation and coefficients of Wexler (1976, 1977). + References: Brutsaert 1982; Pruppacher & Klett 1978; Flatau & al. 1992 + + Parameters + ---------- + temperature : float + Temperature in Kelvin. + + Returns + ------- + float + Saturation vapor pressure in Pascal. + """ + # Polynomial coefficients + g = [ + -0.29912729e4, + -0.60170128e4, + 0.1887643854e2, + -0.28354721e-1, + 0.17838301e-4, + -0.84150417e-9, + 0.44412543e-12, + 0.2858487e1, + ] + # Perform polynomial accumulation using Horner rule + esat = g[6] + for i in [5, 4, 3, 2]: + esat = esat * temperature + g[i] + esat = esat + g[7] * np.log(temperature) + for i in [1, 0]: + esat = esat * temperature + g[i] + return np.exp(esat / (temperature**2)) + + +def get_vapor_actual_pressure(relative_humidity, temperature): + """ + Computes the actual vapor pressure over water. + + Parameters + ---------- + relative_humidity : float + Relative humidity. A value between 0 and 1. + temperature : float + Temperature in Kelvin. + + Returns + ------- + float + Actual vapor pressure in Pascal. + """ + esat = get_vapor_saturation_pressure(temperature) + return relative_humidity * esat + + +def get_pure_water_density(temperature): + """ + Computes the density of pure water at standard pressure. + + For temperatures above freezing uses Kell formulation. + For temperatures below freezing use Dorsch & Boyd formulation. + + References: Pruppacher & Klett 1978; Weast & Astle 1980 + + Parameters + ---------- + temperature : float + Temperature in Kelvin. + + Returns + ------- + float + Density of pure water in kg/m^3. 
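# Editorial example (not part of the patch): saturation and actual vapour pressure at 20 degC and
# 70% relative humidity from the helpers above. The ~2.3 kPa saturation value is only a rough
# literature magnitude used as a sanity check.
from disdrodb.l1.beard_model import get_vapor_actual_pressure, get_vapor_saturation_pressure

temperature = 293.15  # K (20 degC)
esat = get_vapor_saturation_pressure(temperature)                              # ~2.3e3 Pa
e = get_vapor_actual_pressure(relative_humidity=0.7, temperature=temperature)  # ~1.6e3 Pa
print(f"esat = {esat:.0f} Pa, e = {e:.0f} Pa")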
+ """ + # Convert to Celsius + temperature = temperature - 273.15 + + # Define mask + above_freezing_mask = temperature > 0 + + # Compute density above freezing temperature + c = [9.9983952e2, 1.6945176e1, -7.9870401e-3, -4.6170461e-5, 1.0556302e-7, -2.8054253e-10, 1.6879850e-2] + density = c[0] + sum(c * temperature**i for i, c in enumerate(c[1:6], start=1)) + density_above_0 = density / (1 + c[6] * temperature) + + # Compute density below freezing temperature + c = [999.84, 0.086, -0.0108] + density_below_0 = c[0] + sum(c * temperature**i for i, c in enumerate(c[1:], start=1)) + + # Define final density + density = xr.where(above_freezing_mask, density_above_0, density_below_0) + return density + + +def get_pure_water_compressibility(temperature): + """ + Computes the isothermal compressibility of pure ordinary water. + + Reference: Kell, Weast & Astle 1980 + + Parameters + ---------- + temperature : float + Temperature in Kelvin. + + Returns + ------- + float + Compressibility of water in Pascals. + """ + # Convert to Celsius + temperature = temperature - 273.15 + + # Compute compressibility + c = [5.088496e1, 6.163813e-1, 1.459187e-3, 2.008438e-5, -5.857727e-8, 4.10411e-10, 1.967348e-2] + compressibility = c[0] + sum(c * temperature**i for i, c in enumerate(c[1:6], start=1)) + compressibility = compressibility / (1 + c[6] * temperature) * 1e-11 + return compressibility + + +def get_pure_water_surface_tension(temperature): + """ + Computes the surface tension of pure ordinary water against air. + + Reference: Pruppacher & Klett 1978 + + Parameters + ---------- + temperature : float + Temperature in Kelvin. + + Returns + ------- + float + Surface tension in N/m. + """ + sigma = 0.0761 - 0.000155 * (temperature - 273.15) + return sigma + + +def get_air_dynamic_viscosity(temperature): + """ + Computes the dynamic viscosity of dry air. + + Reference: Beard 1977; Pruppacher & Klett 1978 + + Parameters + ---------- + temperature : float + Temperature in Kelvin. + + Returns + ------- + float + Dynamic viscosity of dry air in kg/(m*s) (aka Pa*s). + """ + # Convert to Celsius + temperature = temperature - 273.15 + + # Define mask + above_freezing_mask = temperature > 0 + + # Compute viscosity above freezing temperature + viscosity_above_0 = (1.721 + 0.00487 * temperature) / 1e5 + + # Compute viscosity below freezing temperature + viscosity_below_0 = (1.718 + 0.0049 * temperature - 1.2 * temperature**2 / 1e5) / 1e5 + + # Define final viscosity + viscosity = xr.where(above_freezing_mask, viscosity_above_0, viscosity_below_0) + return viscosity + + +def get_air_density(temperature, air_pressure, vapor_pressure, gas_constant_dry_air=287.04): + """ + Computes the air density according to the equation of state for moist air. + + Reference: Brutsaert 1982 + + Parameters + ---------- + temperature : float + Temperature in Kelvin. + air_pressure : float + Air pressure in Pascals. + vapor_pressure : float + Vapor pressure in Pascals. + gas_constant_dry_air : float, optional + Gas constant for dry air in J/(kg*K). The default is 287.04 J/(kg*K). + + Returns + ------- + float + Air density in kg/m^3. 
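# Editorial example (not part of the patch): bulk air and water properties at 20 degC from the
# helpers above, compared against rough textbook magnitudes (water density ~998 kg/m3, dry-air
# dynamic viscosity ~1.8e-5 Pa s, surface tension ~0.073 N/m).
from disdrodb.l1.beard_model import (
    get_air_dynamic_viscosity,
    get_pure_water_density,
    get_pure_water_surface_tension,
)

temperature = 293.15  # K
print(float(get_pure_water_density(temperature)))          # ~998 kg/m3
print(float(get_air_dynamic_viscosity(temperature)))       # ~1.8e-5 kg/(m s)
print(float(get_pure_water_surface_tension(temperature)))  # ~0.073 N/m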
+ """ + # # Define constant for water vapor in J/(kg·K) + # gas_constant_water_vapor=461.5 + + # # Partial pressure of dry air (Pa) + # pressure_dry_air = air_pressure - vapor_pressure + + # # Density of dry air (kg/m^3) + # density_dry_air = pressure_dry_air / (gas_constant_dry_air * temperature) + + # # Density of water vapor (kg/m^3) + # density_water_vapor = vapor_pressure / (gas_constant_water_vapor * temperature) + + # # Total air density (kg/m^3) + # air_density = density_dry_air + density_water_vapor + + return air_pressure * (1 - 0.378 * vapor_pressure / air_pressure) / (gas_constant_dry_air * temperature) + + +def get_water_density(temperature, air_pressure, sea_level_air_pressure=101_325): + """ + Computes the density of water according to Weast & Astle 1980. + + Parameters + ---------- + temperature : float + Temperature in Kelvin. + air_pressure : float + Air pressure in Pascals. + sea_level_air_pressure : float + Standard atmospheric pressure at sea level in Pascals. + The default is 101_325 Pascal. + freezing_temperature : float, optional + Freezing temperature of water in Kelvin. The default is 273.15 K. + + Returns + ------- + float + Water density in kg/m^3. + """ + delta_pressure = sea_level_air_pressure - air_pressure + water_compressibility = get_pure_water_compressibility(temperature) + return get_pure_water_density(temperature) * np.exp(-1 * water_compressibility * delta_pressure) + + +def get_raindrop_reynolds_number(diameter, temperature, air_density, water_density, g): + """Compute raindrop Reynolds number. + + It quantifies the relative strength of the convective inertia and linear viscous + forces acting on the drop at terminal velocity. + + Estimates Reynolds number for drops with diameter between 19 um and 7 mm. + Coefficients are taken from Table 1 of Beard 1976. + + Reference: Beard 1976; Pruppacher & Klett 1978 + + Parameters + ---------- + diameter : float + Diameter of the raindrop in meters. + temperature : float + Temperature in Kelvin. + air_density : float + Density of air in kg/m^3. + water_density : float + Density of water in kg/m^3. + g : float + Gravitational acceleration in m/s^2. + + Returns + ------- + float + Reynolds number for the raindrop. + """ + # Define mask for small and large particles + small_diam_mask = diameter < 1.07e-3 # < 1mm + + # Compute properties + pure_water_surface_tension = get_pure_water_surface_tension(temperature) # N/m + air_viscosity = get_air_dynamic_viscosity(temperature) # kg/(m*s) (aka Pa*s). + delta_density = water_density - air_density + + # Compute Davis number for small droplets + davis_number = 4 * air_density * delta_density * g * diameter**3 / (3 * air_viscosity**2) + + # Compute the slip correction (is approx 1 and can be discarded) + # l0 = 6.62*1e-8 # m + # v0 = 0.01818 # g / m / s + # p0 = 101_325_25 # Pa + # t0 = 293.15 # K + # c_sc = 1 + 2.51*l0*(air_viscosity/v0)*(air_pressure/p0)*((temperature/t0)**3)/diameter + + # Compute modified Bond and physical property numbers for large droplets + bond_number = 4 * delta_density * g * diameter**2 / (3 * pure_water_surface_tension) + property_number = pure_water_surface_tension**3 * air_density**2 / (air_viscosity**4 * delta_density * g) + + # Compute Reynolds_number_for small particles (diameter < 0.00107) (1 mm) + # --> First 9 bins of Parsivel ... 
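    # The two branches below follow Beard (1976):
    # - for drops with diameter < 1.07 mm, ln(Re) is evaluated as a 6th-degree polynomial in
    #   ln(Davies number), with the slip-correction factor taken as ~1 at these sizes;
    # - for drops with diameter >= 1.07 mm, Re = Np**(1/6) * exp(poly(ln(Bo * Np**(1/6)))),
    #   where Bo is the modified Bond number and Np the physical property number defined above.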
+ b = [-3.18657, 0.992696, -0.00153193, -0.000987059, -0.000578878, 0.0000855176, -0.00000327815] + x = np.log(davis_number) + y = b[0] + sum(b * x**i for i, b in enumerate(b[1:], start=1)) + reynolds_number_small = np.exp(y) # TODO: miss C_sc = slip correction factor ? + + # Compute Reynolds_number_for large particles (diameter >= 0.00107) + b = [-5.00015, 5.23778, -2.04914, 0.475294, -0.0542819, 0.00238449] + log_property_number = np.log(property_number) / 6 + x = np.log(bond_number) + log_property_number + y = b[0] + y = b[0] + sum(b * x**i for i, b in enumerate(b[1:], start=1)) + reynolds_number_large = np.exp(log_property_number + y) + + # Define final reynolds number + reynolds_number = xr.where(small_diam_mask, reynolds_number_small, reynolds_number_large) + return reynolds_number + + +def get_fall_velocity_beard_1976(diameter, temperature, air_density, water_density, g): + """ + Computes the terminal fall velocity of a raindrop in still air. + + Reference: Beard 1976; Pruppacher & Klett 1978 + + Parameters + ---------- + diameter : float + Diameter of the raindrop in meters. + temperature : float + Temperature in Kelvin. + air_density : float + Density of air in kg/m^3. + water_density : float + Density of water in kg/m^3. + g : float + Gravitational acceleration in m/s^2. + + Returns + ------- + float + Terminal fall velocity of the raindrop in m/s. + """ + air_viscosity = get_air_dynamic_viscosity(temperature) + reynolds_number = get_raindrop_reynolds_number( + diameter=diameter, + temperature=temperature, + air_density=air_density, + water_density=water_density, + g=g, + ) + fall_velocity = air_viscosity * reynolds_number / (air_density * diameter) + return fall_velocity + + +def get_drag_coefficient(diameter, air_density, water_density, fall_velocity, g=9.81): + """ + Computes the drag coefficient for a raindrop. + + Parameters + ---------- + diameter : float + Diameter of the raindrop in meters. + air_density : float + Density of air in kg/m^3. + water_density : float + Density of water in kg/m^3. + fall_velocity : float + Terminal fall velocity of the raindrop in m/s. + g : float + Gravitational acceleration in m/s^2. + + Returns + ------- + float + Drag coefficient of the raindrop. + """ + delta_density = water_density - air_density + drag_coefficient = 4 * delta_density * g * diameter / (3 * air_density * fall_velocity**2) + return drag_coefficient + + +def retrieve_fall_velocity( + diameter, + altitude, + latitude, + temperature, + relative_humidity, + air_pressure=None, + sea_level_air_pressure=101_325, + gas_constant_dry_air=287.04, + lapse_rate=0.0065, +): + """ + Computes the terminal fall velocity and drag coefficients for liquid raindrops. + + Parameters + ---------- + diameter : float + Diameter of the raindrop in meters. + altitude : float + Altitude in meters. + temperature : float + Temperature in Kelvin. + relative_humidity : float + Relative humidity. A value between 0 and 1. + latitude : float + Latitude in degrees. + air_pressure : float + Air pressure in Pascals. + If None, air_pressure at altitude is inferred assuming + a standard atmospheric pressure at sea level. + sea_level_air_pressure : float + Standard atmospheric pressure at sea level in Pascals. + The default is 101_325 Pascal. + gas_constant_dry_air : float, optional + Gas constant for dry air in J/(kg*K). The default is 287.04 is J/(kg*K). + lapse_rate : float, optional + Standard atmospheric lapse rate in K/m. The default is 0.0065 K/m. 
+ + Returns + ------- + tuple + Terminal fall velocity and drag coefficients for liquid raindrops. + """ + # Retrieve air pressure at altitude if not specified + if air_pressure is None: + air_pressure = get_air_pressure_at_height( + altitude=altitude, + latitude=latitude, + temperature=temperature, + sea_level_air_pressure=sea_level_air_pressure, + lapse_rate=lapse_rate, + gas_constant_dry_air=gas_constant_dry_air, + ) + + # Retrieve vapour pressure (from relative humidity) + vapor_pressure = get_vapor_actual_pressure( + relative_humidity=relative_humidity, + temperature=temperature, + ) + + # Retrieve air density and water density + air_density = get_air_density( + temperature=temperature, + air_pressure=air_pressure, + vapor_pressure=vapor_pressure, + gas_constant_dry_air=gas_constant_dry_air, + ) + water_density = get_water_density( + temperature=temperature, + air_pressure=air_pressure, + sea_level_air_pressure=sea_level_air_pressure, + ) + + # Retrieve accurate gravitational_acceleration + g = get_gravitational_acceleration(altitude=altitude, latitude=latitude) + + # Compute fall velocity + fall_velocity = get_fall_velocity_beard_1976( + diameter=diameter, + temperature=temperature, + air_density=air_density, + water_density=water_density, + g=g, + ) + + # drag_coefficient = get_drag_coefficient(diameter=diameter, + # air_density=air_density, + # water_density=water_density, + # g=g. + # fall_velocity=fall_velocity) + + return fall_velocity + + +####----------------------------------------------------------------------------------------- +#### OLD CODE + + +# def get_fall_velocity_beard_1977(diameter): +# """ +# Compute the fall velocity of raindrops using the Beard (1977) relationship. + +# Parameters +# ---------- +# diameter : array-like +# Diameter of the raindrops in millimeters. +# Valid up to 7 mm (0.7 cm). + +# Returns +# ------- +# fall_velocity : array-like +# Fall velocities in meters per second. + +# Notes +# ----- +# This method uses an exponential function based on the work of Beard (1977), +# valid at sea level conditions (pressure = 1 atm, temperature = 20°C, +# air density = 1.194 kg/m³). + +# References +# ---------- +# Beard, K. V. (1977). +# Terminal velocity adjustment for cloud and precipitation drops aloft. +# Journal of the Atmospheric Sciences, 34(8), 1293-1298. +# https://doi.org/10.1175/1520-0469(1977)034<1293:TVAFCA>2.0.CO;2 + +# """ +# diameter_cm = diameter/1000 +# c = [7.06037, 1.74951, 4.86324, 6.60631, 4.84606, 2.14922, 0.58714, 0.096348, 0.00869209, 0.00033089] +# log_diameter = np.log(diameter_cm) +# y = c[0] + sum(c * log_diameter**i for i, c in enumerate(c[1:], start=1)) +# fall_velocity = np.exp(y) +# return fall_velocity + + +# def get_fall_velocity_beard_1977(diameter, temperature, air_pressure, gas_constant_dry_air=287.04): +# """ +# Computes the terminal fall velocity of a raindrop in still air. + +# This function is based on the Table 4 coefficients of Kenneth V. Beard (1977), +# "Terminal Velocity and Shape of Cloud and Precipitation Drops Aloft", +# Journal of the Atmospheric Sciences, Vol. 34, pp. 1293-1298. + +# Note: This approximation is valid at sea level with conditions: +# Pressure = 1 atm, Temperature = 20°C, (saturated) air density = 1.194 kg/m³. + +# Parameters +# ---------- +# diameter : array-like +# Array of equivolume drop diameters in meters. + +# Returns +# ------- +# fall_velocity : array-like +# Array of terminal fall velocity in meters per second (m/s). +# For diameters greater than 7 mm, the function returns NaN. 
+ +# """ +# # PROBLEMATIC +# # Compute sea level velocity +# c = [7.06037, 1.74951, 4.86324, 6.60631, 4.84606, 2.14922, 0.58714, 0.096348, 0.00869209, 0.00033089] +# log_diameter = np.log(diameter / 1000 * 10) +# y = c[0] + sum(c * log_diameter**i for i, c in enumerate(c[1:], start=1)) +# v0 = np.exp(y) + +# # Compute fall velocity +# t_20 = 273.15 + 20 +# eps_s = get_air_dynamic_viscosity(t_20) / get_air_dynamic_viscosity(temperature) - 1 +# eps_c = -1 + ( +# np.sqrt( +# get_air_density( +# temperature=t_20, +# air_pressure=101325, +# vapor_pressure=0, +# gas_constant_dry_air=gas_constant_dry_air, +# ) +# / get_air_density( +# temperature=temperature, +# air_pressure=air_pressure, +# vapor_pressure=0, +# gas_constant_dry_air=gas_constant_dry_air, +# ), +# ) +# ) +# a = 1.104 * eps_s +# b = (1.058 * eps_c - 1.104 * eps_s) / 5.01 +# x = np.log(diameter) + 5.52 +# f = (a + b * x) + 1 +# fall_velocity = v0 * f +# # fall_velocity.plot() + +# eps = 1.104 * eps_s + (1.058 * eps_c - 1.104 * eps_s) * np.log(diameter / 1e-3) / 5.01 +# # eps = 1.104 * eps_s + (1.058 * eps_c - 1.104 * eps_s) * np.log(diameter / 4e-5) / 5.01 +# fall_velocity = 0.01 * v0 * (1 + eps) +# return fall_velocity diff --git a/disdrodb/l1/encoding_attrs.py b/disdrodb/l1/encoding_attrs.py new file mode 100644 index 00000000..35a4abab --- /dev/null +++ b/disdrodb/l1/encoding_attrs.py @@ -0,0 +1,605 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. 
+"""Attributes and encoding options for DISDRODB products.""" + + +def get_attrs_dict(): + """Temporary attributes.""" + attrs_dict = { + #### L1 + "drop_number": { + "description": "Counts of drops per diameter and velocity class", + "long_name": "Drop counts per diameter and velocity class", + "units": "", + }, + "drop_counts": { + "description": "Counts of drops per diameter class", + "long_name": "Drop counts per diameter class", + "units": "", + }, + "Dmin": { + "description": "Minimum drop diameter", + "long_name": "Minimum drop diameter", + "units": "mm", + }, + "Dmax": { + "description": "Maximum drop diameter", + "long_name": "Maximum drop diameter", + "units": "mm", + }, + "fall_velocity": { + "description": "Estimated drop fall velocity per diameter class", + "long_name": "Estimated drop fall velocity", + "units": "m s-1", + }, + "drop_average_velocity": { + "description": "Average measured drop fall velocity per diameter class", + "long_name": "Measured average drop fall velocity", + "units": "m s-1", + }, + "n_drops_selected": { + "description": "Total number of selected drops", + "long_name": "Total number of selected drops", + "units": "", + }, + "n_drops_discarded": { + "description": "Total number of discarded drops", + "long_name": "Total number of discarded drops", + "units": "", + }, + #### L2 + "drop_number_concentration": { + "description": "Number concentration of drops per diameter class per unit volume", + "long_name": "Drop number concentration per diameter class", + "units": "m-3 mm-1", + }, + "drop_volume": { + "standard_name": "", + "units": "mm3", + "long_name": "Volume of Drops per Diameter Class", + }, + "drop_total_volume": { + "standard_name": "", + "units": "mm3", + "long_name": "Total Volume of Drops", + }, + "drop_relative_volume_ratio": { + "standard_name": "", + "units": "", + "long_name": "Relative Volume Ratio of Drops", + }, + "KEmin": { + "standard_name": "", + "units": "J", + "long_name": "Minimum Drop Kinetic Energy", + }, + "KEmax": { + "standard_name": "", + "units": "J", + "long_name": "Maximum Drop Kinetic Energy", + }, + "E": { + "description": "Kinetic energy per unit rainfall depth", + "standard_name": "", + "units": "J m-2 mm-1", + "long_name": "Rainfall Kinetic Energy", + }, + "KE": { + "standard_name": "", + "units": "J m-2 h-1", + "long_name": "Kinetic Energy Density Flux", + }, + "M1": { + "standard_name": "", + "units": "m-3 mm", + "long_name": "First Moment of the Drop Size Distribution", + }, + "M2": { + "standard_name": "", + "units": "m-3 mm2", + "long_name": "Second Moment of the Drop Size Distribution", + }, + "M3": { + "standard_name": "", + "units": "m-3 mm3", + "long_name": "Third Moment of the Drop Size Distribution", + }, + "M4": { + "standard_name": "", + "units": "m-3 mm4", + "long_name": "Fourth Moment of the Drop Size Distribution", + }, + "M5": { + "standard_name": "", + "units": "m-3 mm5", + "long_name": "Fifth Moment of the Drop Size Distribution", + }, + "M6": { + "standard_name": "", + "units": "m-3 mm6", + "long_name": "Sixth Moment of the Drop Size Distribution", + }, + "Nt": { + "standard_name": "number_concentration_of_rain_drops_in_air", + "units": "m-3", + "long_name": "Total Number Concentration", + }, + "R": { + "standard_name": "rainfall_rate", + "units": "mm h-1", + "long_name": "Instantaneous Rainfall Rate", + }, + "P": { + "standard_name": "precipitation_amount", + "units": "mm", + "long_name": "Rain Accumulation", + }, + "Z": { + "standard_name": "equivalent_reflectivity_factor", + "units": "dBZ", + 
"long_name": "Equivalent Radar Reflectivity Factor", + }, + "W": { + "description": "Water Mass of the Drop Size Distribution", + "standard_name": "mass_concentration_of_liquid_water_in_air", + "units": "g m-3", + "long_name": "Liquid Water Content", + }, + "D10": { + "standard_name": "", + "units": "mm", + "long_name": "10th Percentile Drop Diameter", + }, + "D50": { + "standard_name": "median_volume_diameter", + "units": "mm", + "long_name": "Median Volume Drop Diameter", + }, + "D90": { + "standard_name": "", + "units": "mm", + "long_name": "90th Percentile Drop Diameter", + }, + "Dmode": { + "standard_name": "", + "units": "mm", + "long_name": "Mode Diameter of the Drop Size Distribution", + }, + "Dm": { + "standard_name": "Dm", + "units": "mm", + "long_name": "Mean Volume Diameter", + }, + "sigma_m": { + "standard_name": "", + "units": "mm", + "long_name": "Standard Deviation of Mass Spectrum", + }, + "Nw": { + "standard_name": "normalized_intercept_parameter", + "units": "mm-1 m-3", # TODO + "long_name": "Normalized Intercept Parameter of a Normalized Gamma Distribution", + }, + "N0": { + "standard_name": "intercept_parameter", + "units": "mm-1 m-3", # TODO + "long_name": "Intercept Parameter of the Modeled Drop Size Distribution", + }, + "mu": { + "standard_name": "shape_parameter", + "units": "1", # TODO + "long_name": "Shape Parameter of the Modeled Drop Size Distribution", + }, + "Lambda": { + "standard_name": "distribution_slope", + "units": "1/mm", # TODO + "long_name": "Slope Parameter of the Modeled Drop Size Distribution", + }, + "sigma": { + "standard_name": "distribution_slope", + "units": "1/mm", # TODO + "long_name": "Slope Parameter of the Modeled Lognormal Distribution", + }, + # Radar variables + "Zh": { + "description": "Radar reflectivity factor at horizontal polarization", + "long_name": "Horizontal Reflectivity", + "units": "dBZ", + }, + "Zdr": { + "description": "Differential reflectivity", + "long_name": "Differential Reflectivity", + "units": "dB", + }, + "rho_hv": { + "description": "Correlation coefficient between horizontally and vertically polarized reflectivity", + "long_name": "Copolarized Correlation Coefficient", + "units": "", + }, + "ldr": { + "description": "Linear depolarization ratio", + "long_name": "Linear Depolarization Ratio", + "units": "dB", + }, + "Kdp": { + "description": "Specific differential phase", + "long_name": "Specific Differential Phase", + "units": "deg/km", + }, + "Ai": { + "description": "Specific attenuation", + "long_name": "Specific attenuation", + "units": "dB/km", + }, + } + return attrs_dict + + +def get_encoding_dict(): + """Temporary encoding dictionary.""" + encoding_dict = { + "M1": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "M2": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "M3": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "M4": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "M5": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "M6": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "Nt": { + "dtype": "float32", + "zlib": 
True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "R": { + "dtype": "uint16", + "scale_factor": 0.01, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "P": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "Z": { + "dtype": "uint16", + "scale_factor": 0.01, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "W": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "Dm": { + "dtype": "uint16", + "scale_factor": 0.001, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "sigma_m": { + "dtype": "uint16", + "scale_factor": 0.001, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "Dmode": { + "dtype": "uint16", + "scale_factor": 0.001, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "Nw": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "D50": { + "dtype": "uint16", + "scale_factor": 0.001, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "D10": { + "dtype": "uint16", + "scale_factor": 0.001, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "D90": { + "dtype": "uint16", + "scale_factor": 0.001, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "drop_number": { + "dtype": "uint32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + "_FillValue": 4294967295, + }, + "drop_counts": { + "dtype": "uint32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + "_FillValue": 4294967295, + }, + "n_drops_selected": { + "dtype": "uint32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + "_FillValue": 4294967295, + }, + "n_drops_discarded": { + "dtype": "uint32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + "_FillValue": 4294967295, + }, + "Dmin": { + "dtype": "uint16", + "scale_factor": 0.001, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "Dmax": { + "dtype": "uint16", + "scale_factor": 0.001, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "drop_average_velocity": { + "dtype": "uint16", + "scale_factor": 0.001, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "fall_velocity": { + "dtype": "uint16", + "scale_factor": 0.001, + "_FillValue": 65535, + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "drop_number_concentration": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + 
"fletcher32": False, + "contiguous": False, + }, + "drop_volume": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "drop_total_volume": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "drop_relative_volume_ratio": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "KEmin": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "KEmax": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "E": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "KE": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + # Radar variables + "Zh": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "Zdr": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "rho_hv": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "ldr": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "Kdp": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + "Ai": { + "dtype": "float32", + "zlib": True, + "complevel": 3, + "shuffle": True, + "fletcher32": False, + "contiguous": False, + }, + } + return encoding_dict diff --git a/disdrodb/l1/fall_velocity.py b/disdrodb/l1/fall_velocity.py new file mode 100644 index 00000000..6e7d8dc4 --- /dev/null +++ b/disdrodb/l1/fall_velocity.py @@ -0,0 +1,260 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Theoretical models to estimate the drop fall velocity.""" + + +import numpy as np + + +def get_fall_velocity_atlas_1973(diameter): + """ + Compute the fall velocity of raindrops using the Atlas et al. (1973) relationship. + + Parameters + ---------- + diameter : array-like + Diameter of the raindrops in millimeters. + + Returns + ------- + fall_velocity : array-like + Fall velocities corresponding to the input diameters, in meters per second. + + References + ---------- + Atlas, D., Srivastava, R. C., & Sekhon, R. S. (1973). + Doppler radar characteristics of precipitation at vertical incidence. 
+ Reviews of Geophysics, 11(1), 1-35. + https://doi.org/10.1029/RG011i001p00001 + + Atlas, D., & Ulbrich, C. W. (1977). + Path- and area-integrated rainfall measurement by microwave attenuation in the 1-3 cm band. + Journal of Applied Meteorology, 16(12), 1322-1331. + https://doi.org/10.1175/1520-0450(1977)016<1322:PAAIRM>2.0.CO;2 + + Gunn, R., & Kinzer, G. D. (1949). + The terminal velocity of fall for water droplets in stagnant air. + Journal of Meteorology, 6(4), 243-248. + https://doi.org/10.1175/1520-0469(1949)006<0243:TTVOFF>2.0.CO;2 + + """ + fall_velocity = 9.65 - 10.3 * np.exp(-0.6 * diameter) # clip to 0 ! + fall_velocity = np.clip(fall_velocity, 0, None) + return fall_velocity + + +def get_fall_velocity_brandes_2002(diameter): + """ + Compute the fall velocity of raindrops using the Brandes et al. (2002) relationship. + + Parameters + ---------- + diameter : array-like + Diameter of the raindrops in millimeters. + + Returns + ------- + fall_velocity : array-like + Fall velocities in meters per second. + + References + ---------- + Brandes, E. A., Zhang, G., & Vivekanandan, J. (2002). + Experiments in rainfall estimation with a polarimetric radar in a subtropical environment. + Journal of Applied Meteorology, 41(6), 674-685. + https://doi.org/10.1175/1520-0450(2002)041<0674:EIREWA>2.0.CO;2 + + """ + fall_velocity = -0.1021 + 4.932 * diameter - 0.9551 * diameter**2 + 0.07934 * diameter**3 - 0.002362 * diameter**4 + return fall_velocity + + +def get_fall_velocity_uplinger_1981(diameter): + """ + Compute the fall velocity of raindrops using Uplinger (1981) relationship. + + Parameters + ---------- + diameter : array-like + Diameter of the raindrops in millimeters. + Valid for diameters between 0.1 mm and 7 mm. + + Returns + ------- + fall_velocity : array-like + Fall velocities in meters per second. + + References + ---------- + Uplinger, C. W. (1981). A new formula for raindrop terminal velocity. + In Proceedings of the 20th Conference on Radar Meteorology (pp. 389-391). + AMS. + + """ + # Valid between 0.1 and 7 mm + fall_velocity = 4.874 * diameter * np.exp(-0.195 * diameter) + return fall_velocity + + +def get_fall_velocity_van_dijk_2002(diameter): + """ + Compute the fall velocity of raindrops using van Dijk et al. (2002) relationship. + + Parameters + ---------- + diameter : array-like + Diameter of the raindrops in millimeters. + + Returns + ------- + fall_velocity : array-like + Fall velocities in meters per second. + + References + ---------- + van Dijk, A. I. J. M., Bruijnzeel, L. A., & Rosewell, C. J. (2002). + Rainfall intensity-kinetic energy relationships: a critical literature appraisal. + Journal of Hydrology, 261(1-4), 1-23. + https://doi.org/10.1016/S0022-1694(02)00020-3 + + """ + fall_velocity = -0.254 + 5.03 * diameter - 0.912 * diameter**2 + 0.0561 * diameter**3 + return fall_velocity + + +def get_fall_velocity_beard_1976(diameter, ds_env): + """Calculate the fall velocity of a particle using the Beard (1976) model. + + Parameters + ---------- + diameter : array-like + Diameter of the raindrops in millimeters. + ds_env : xr.Dataset + A dataset containing the following environmental variables: + - 'altitude' : Altitude in meters (m). + - 'latitude' : Latitude in degrees. + - 'temperature' : Temperature in degrees Celsius (°C). + - 'relative_humidity' : Relative humidity in percentage (%). + - 'sea_level_air_pressure' : Sea level air pressure in Pascals (Pa). + - 'lapse_rate' : Lapse rate in degrees Celsius per meter (°C/m). 
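# Editorial example (not part of the patch): the empirical relationships above give very similar
# terminal velocities for mid-sized drops; for a 2 mm drop they all fall roughly in the
# 6.5-6.6 m/s range. Values below are rounded and only meant as a quick comparison.
import numpy as np

from disdrodb.l1.fall_velocity import (
    get_fall_velocity_atlas_1973,
    get_fall_velocity_brandes_2002,
    get_fall_velocity_uplinger_1981,
    get_fall_velocity_van_dijk_2002,
)

diameters = np.array([0.5, 1.0, 2.0, 4.0])  # mm
for func in (
    get_fall_velocity_atlas_1973,
    get_fall_velocity_brandes_2002,
    get_fall_velocity_uplinger_1981,
    get_fall_velocity_van_dijk_2002,
):
    print(func.__name__, np.round(func(diameters), 2))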
+ + Returns + ------- + fall_velocity : array-like + The calculated fall velocities of the raindrops. + """ + from disdrodb.l1.beard_model import retrieve_fall_velocity + + # Input diameter in mmm + fall_velocity = retrieve_fall_velocity( + diameter=diameter / 1000, # diameter expected in m !!! + altitude=ds_env["altitude"], + latitude=ds_env["latitude"], + temperature=ds_env["temperature"], + relative_humidity=ds_env["relative_humidity"], + # TODO: add air_pressure # TODO + sea_level_air_pressure=ds_env["sea_level_air_pressure"], + lapse_rate=ds_env["lapse_rate"], + ) + return fall_velocity + + +def ensure_valid_coordinates(ds, default_altitude=0, default_latitude=0, default_longitude=0): + """Ensure dataset valid coordinates for altitude, latitude, and longitude. + + Invalid values are np.nan and -9999. + + Parameters + ---------- + ds : xarray.Dataset + The dataset for which to ensure valid geolocation coordinates. + default_altitude : float, optional + The default value to use for invalid altitude values. Defaults to 0. + default_latitude : float, optional + The default value to use for invalid latitude values. Defaults to 0. + default_longitude : float, optional + The default value to use for invalid longitude values. Defaults to 0. + + Returns + ------- + xarray.Dataset + The dataset with invalid coordinates replaced by default values. + + """ + invalid_altitude = np.logical_or(np.isnan(ds["altitude"]), ds["altitude"] == -9999) + ds["altitude"] = ds["altitude"].where(~invalid_altitude, default_altitude) + + invalid_lat = np.logical_or(np.isnan(ds["latitude"]), ds["latitude"] == -9999) + ds["latitude"] = ds["latitude"].where(~invalid_lat, default_latitude) + + invalid_lon = np.logical_or(np.isnan(ds["longitude"]), ds["longitude"] == -9999) + ds["longitude"] = ds["longitude"].where(~invalid_lon, default_longitude) + return ds + + +def get_raindrop_fall_velocity(diameter, method, ds_env=None): + """Calculate the fall velocity of raindrops based on their diameter. + + Parameters + ---------- + diameter : array-like + The diameter of the raindrops in millimeters. + method : str + The method to use for calculating the fall velocity. Must be one of the following: + 'Atlas1973', 'Beard1976', 'Brandes2002', 'Uplinger1981', 'VanDijk2002'. + ds_env : xr.Dataset, optional + A dataset containing the following environmental variables: + - 'altitude' : Altitude in meters (m). + - 'latitude' : Latitude in degrees. + - 'temperature' : Temperature in degrees Celsius (°C). + - 'relative_humidity' : Relative humidity. A value between 0 and 1. + - 'sea_level_air_pressure' : Sea level air pressure in Pascals (Pa). + - 'lapse_rate' : Lapse rate in degrees Celsius per meter (°C/m). + It is required for for the 'Beard1976' method. + + Returns + ------- + fall_velocity : array-like + The calculated fall velocities of the raindrops. + + Notes + ----- + The 'Beard1976' method requires additional environmental parameters such as altitude and latitude. + These parameters can be provided through the `ds_env` argument. If not provided, default values will be used. + """ + # Input diameter in mm + dict_methods = { + "Atlas1973": get_fall_velocity_atlas_1973, + "Beard1976": get_fall_velocity_beard_1976, + "Brandes2002": get_fall_velocity_brandes_2002, + "Uplinger1981": get_fall_velocity_uplinger_1981, + "VanDijk2002": get_fall_velocity_van_dijk_2002, + } + # Check valid method + available_methods = list(dict_methods) + if method not in dict_methods: + raise ValueError(f"{method} is an invalid fall velocity method. 
Valid methods: {available_methods}.") + # Copy diameter + diameter = diameter.copy() + # Ensure valid altitude and geolocation (if missing set defaults) + # - altitude required by Beard + # - latitude required for gravity + ds_env = ensure_valid_coordinates(ds_env) + # Retrieve fall velocity + func = dict_methods[method] + fall_velocity = func(diameter, ds_env=ds_env) if method == "Beard1976" else func(diameter) + return fall_velocity diff --git a/disdrodb/l1/filters.py b/disdrodb/l1/filters.py new file mode 100644 index 00000000..b72f0dfd --- /dev/null +++ b/disdrodb/l1/filters.py @@ -0,0 +1,192 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Utilities for filtering the disdrometer raw drop spectra.""" + +import numpy as np +import xarray as xr + + +def filter_diameter_bins(ds, minimum_diameter=None, maximum_diameter=None): + """ + Filter the dataset to include only diameter bins within specified bounds. + + Parameters + ---------- + ds : xarray.Dataset + The dataset containing diameter bin data. + minimum_diameter : float, optional + The minimum diameter to include in the filter, in millimeters. + Defaults to the minimum value in `ds["diameter_bin_lower"]`. + maximum_diameter : float, optional + The maximum diameter to include in the filter, in millimeters. + Defaults to the maximum value in `ds["diameter_bin_upper"]`. + + Returns + ------- + xarray.Dataset + The filtered dataset containing only the specified diameter bins. + """ + # Initialize default arguments + if minimum_diameter is None: + minimum_diameter = ds["diameter_bin_lower"].min().item() + if maximum_diameter is None: + maximum_diameter = ds["diameter_bin_upper"].max().item() + # Select valid bins + valid_indices = np.logical_and( + ds["diameter_bin_lower"] >= minimum_diameter, + ds["diameter_bin_upper"] <= maximum_diameter, + ) + ds = ds.isel({"diameter_bin_center": valid_indices}) + # Update history + history = ds.attrs.get("history", "") + ds.attrs["history"] = ( + history + f" Selected drops with diameters between {minimum_diameter} and {maximum_diameter} mm \n" + ) + return ds + + +def filter_velocity_bins(ds, minimum_velocity=0, maximum_velocity=12): + """ + Filter the dataset to include only velocity bins within specified bounds. + + Parameters + ---------- + ds : xarray.Dataset + The dataset containing velocity bin data. + minimum_velocity : float, optional + The minimum velocity to include in the filter, in meters per second. + Defaults to 0 m/s. + maximum_velocity : float, optional + The maximum velocity to include in the filter, in meters per second. + Defaults to 12 m/s. + + Returns + ------- + xarray.Dataset + The filtered dataset containing only the specified velocity bins. 
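# --------------------------------------------------------------------------
# Illustrative sketch (editorial): how the boolean bin selection used by the
# filter functions above behaves on a tiny synthetic dataset. Bin edges and
# counts are hypothetical.
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"raw_drop_number": ("diameter_bin_center", np.array([5, 12, 3, 0]))},
    coords={
        "diameter_bin_center": [0.1, 0.3, 0.6, 1.0],
        "diameter_bin_lower": ("diameter_bin_center", [0.0, 0.2, 0.4, 0.8]),
        "diameter_bin_upper": ("diameter_bin_center", [0.2, 0.4, 0.8, 1.2]),
    },
)

# Keep only bins fully contained in [0.2, 0.8] mm, as filter_diameter_bins does
valid = np.logical_and(ds["diameter_bin_lower"] >= 0.2, ds["diameter_bin_upper"] <= 0.8)
ds_subset = ds.isel({"diameter_bin_center": valid})
# -> two bins remain: [0.2, 0.4] and [0.4, 0.8]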
+ """ + # Initialize default arguments + if minimum_velocity is None: + minimum_velocity = ds["velocity_bin_lower"].min().item() + if maximum_velocity is None: + maximum_velocity = ds["velocity_bin_upper"].max().item() + # Select valid bins + valid_indices = np.logical_and( + ds["velocity_bin_lower"] >= minimum_velocity, + ds["velocity_bin_upper"] <= maximum_velocity, + ) + ds = ds.isel({"velocity_bin_center": valid_indices}) + # Update history + history = ds.attrs.get("history", "") + ds.attrs["history"] = ( + history + f" Selected drops with fall velocity between {minimum_velocity} and {maximum_velocity} m/s \n" + ) + return ds + + +def define_spectrum_mask( + drop_number, + fall_velocity, + above_velocity_fraction=None, + above_velocity_tolerance=None, + below_velocity_fraction=None, + below_velocity_tolerance=None, + small_diameter_threshold=1, # 1, # 2 + small_velocity_threshold=2.5, # 2.5, # 3 + maintain_smallest_drops=False, +): + """Define a mask for the drop spectrum based on fall velocity thresholds. + + Parameters + ---------- + drop_number : xarray.DataArray + Array of drop counts per diameter and velocity bins. + fall_velocity : array-like + The expected terminal fall velocities for drops of given sizes. + above_velocity_fraction : float, optional + Fraction of terminal fall velocity above which drops are considered too fast. + Either specify ``above_velocity_fraction`` or ``above_velocity_tolerance``. + above_velocity_tolerance : float, optional + Absolute tolerance above which drops terminal fall velocities are considered too fast. + Either specify ``above_velocity_fraction`` or ``above_velocity_tolerance``. + below_velocity_fraction : float, optional + Fraction of terminal fall velocity below which drops are considered too slow. + Either specify ``below_velocity_fraction`` or ``below_velocity_tolerance``. + below_velocity_tolerance : float, optional + Absolute tolerance below which drops terminal fall velocities are considered too slow. + Either specify ``below_velocity_fraction`` or ``below_velocity_tolerance``. + maintain_smallest : bool, optional + If True, ensures that the small drops in the spectrum are retained in the mask. + The smallest drops are characterized by ``small_diameter_threshold`` + and ``small_velocity_threshold`` arguments. + Defaults to False. + small_diameter_threshold : float, optional + The diameter threshold to use for keeping the smallest drop. + Defaults to 1 mm. + small_velocity_threshold : float, optional + The fall velocity threshold to use for keeping the smallest drops. + Defaults to 2.5 m/s. + + Returns + ------- + xarray.DataArray + A boolean mask array indicating valid bins according to the specified criteria. 
+ + """ + # Ensure it creates a 2D mask if the fall_velocity does not vary over time + if "time" in drop_number.dims and "time" not in fall_velocity.dims: + drop_number = drop_number.isel(time=0) + + # Check arguments + if above_velocity_fraction is not None and above_velocity_tolerance is not None: + raise ValueError("Either specify 'above_velocity_fraction' or 'above_velocity_tolerance'.") + if below_velocity_fraction is not None and below_velocity_tolerance is not None: + raise ValueError("Either specify 'below_velocity_fraction' or 'below_velocity_tolerance'.") + + # Define above/below velocity thresholds + if above_velocity_fraction is not None: + above_fall_velocity = fall_velocity * (1 + above_velocity_fraction) + elif above_velocity_tolerance is not None: + above_fall_velocity = fall_velocity + above_velocity_tolerance + else: + above_fall_velocity = np.inf + if below_velocity_fraction is not None: + below_fall_velocity = fall_velocity * (1 - below_velocity_fraction) + elif below_velocity_tolerance is not None: + below_fall_velocity = fall_velocity - below_velocity_tolerance + else: + below_fall_velocity = 0 + + # Define velocity 2D array + velocity_lower = xr.ones_like(drop_number) * drop_number["velocity_bin_lower"] + velocity_upper = xr.ones_like(drop_number) * drop_number["velocity_bin_upper"] + + # Define mask + mask = np.logical_and( + velocity_lower >= below_fall_velocity, + velocity_upper <= above_fall_velocity, + ) + + # Maintant smallest drops + if maintain_smallest_drops: + mask_smallest = np.logical_and( + drop_number["diameter_bin_upper"] < small_diameter_threshold, + drop_number["velocity_bin_upper"] < small_velocity_threshold, + ) + mask = np.logical_or(mask, mask_smallest) + + return mask diff --git a/disdrodb/l1/processing.py b/disdrodb/l1/processing.py new file mode 100644 index 00000000..7783ad99 --- /dev/null +++ b/disdrodb/l1/processing.py @@ -0,0 +1,194 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. 
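# --------------------------------------------------------------------------
# Illustrative usage sketch (editorial) for define_spectrum_mask defined above.
# The tiny spectrum below is hypothetical: 2 diameter bins x 3 velocity bins.
import numpy as np
import xarray as xr

drop_number = xr.DataArray(
    np.ones((2, 3)),
    dims=("diameter_bin_center", "velocity_bin_center"),
    coords={
        "diameter_bin_center": [0.5, 1.0],
        "velocity_bin_center": [1.0, 3.0, 5.0],
        "velocity_bin_lower": ("velocity_bin_center", [0.0, 2.0, 4.0]),
        "velocity_bin_upper": ("velocity_bin_center", [2.0, 4.0, 6.0]),
    },
)
# Hypothetical terminal fall velocities for the two diameter bins (m/s)
fall_velocity = xr.DataArray([2.1, 4.0], dims="diameter_bin_center",
                             coords={"diameter_bin_center": [0.5, 1.0]})

# Keep drops whose velocity bin lies within +/- 50% of the terminal velocity
mask = define_spectrum_mask(
    drop_number=drop_number,
    fall_velocity=fall_velocity,
    above_velocity_fraction=0.5,
    below_velocity_fraction=0.5,
)
# mask is True where velocity_bin_lower >= 0.5*v(D) and velocity_bin_upper <= 1.5*v(D)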
+"""Core functions for DISDRODB L1 production.""" + + +import xarray as xr + +from disdrodb.l1.encoding_attrs import get_attrs_dict, get_encoding_dict +from disdrodb.l1.fall_velocity import get_raindrop_fall_velocity +from disdrodb.l1.filters import define_spectrum_mask, filter_diameter_bins, filter_velocity_bins +from disdrodb.l1.resampling import add_sample_interval +from disdrodb.l1_env.routines import load_env_dataset +from disdrodb.l2.empirical_dsd import get_drop_average_velocity, get_min_max_diameter # TODO: maybe move out of L2 +from disdrodb.utils.attrs import set_attrs +from disdrodb.utils.encoding import set_encodings +from disdrodb.utils.time import ensure_sample_interval_in_seconds, infer_sample_interval + + +def generate_l1( + ds, + # Fall velocity option + fall_velocity_method="Beard1976", + # Diameter-Velocity Filtering Options + minimum_diameter=0, + maximum_diameter=10, + minimum_velocity=0, + maximum_velocity=12, + above_velocity_fraction=0.5, + above_velocity_tolerance=None, + below_velocity_fraction=0.5, + below_velocity_tolerance=None, + small_diameter_threshold=1, # 2 + small_velocity_threshold=2.5, # 3 + maintain_smallest_drops=True, +): + """Generate the DISDRODB L1 dataset from the DISDRODB L0C dataset. + + Parameters + ---------- + ds : xarray.Dataset + DISDRODB L0C dataset. + fall_velocity_method : str, optional + Method to compute fall velocity. + The default method is ``"Beard1976"``. + minimum_diameter : float, optional + Minimum diameter for filtering. The default value is 0 mm. + maximum_diameter : float, optional + Maximum diameter for filtering. The default value is 10 mm. + minimum_velocity : float, optional + Minimum velocity for filtering. The default value is 0 m/s. + maximum_velocity : float, optional + Maximum velocity for filtering. The default value is 12 m/s. + above_velocity_fraction : float, optional + Fraction of drops above velocity threshold. The default value is 0.5. + above_velocity_tolerance : float or None, optional + Tolerance for above velocity filtering. The default is ``None``. + below_velocity_fraction : float, optional + Fraction of drops below velocity threshold. The default value is 0.5. + below_velocity_tolerance : float or None, optional + Tolerance for below velocity filtering. The default is ``None``. + small_diameter_threshold : float, optional + Threshold for small diameter drops. The default value is 1. + small_velocity_threshold : float, optional + Threshold for small velocity drops. The default value is 2.5. + maintain_smallest_drops : bool, optional + Whether to maintain the smallest drops. The default is ``True``. + + Returns + ------- + xarray.Dataset + DISRODB L1 dataset. + """ + # Take as input an L0 ! + + # Retrieve source attributes + attrs = ds.attrs.copy() + + # Determine if the velocity dimension is available + has_velocity_dimension = "velocity_bin_center" in ds.dims + + # Initialize L2 dataset + ds_l1 = xr.Dataset() + + # Retrieve sample interval + # --> sample_interval is a coordinate of L0C products + if "sample_interval" in ds: + sample_interval = ensure_sample_interval_in_seconds(ds["sample_interval"].data) + else: + # This line is not called in the DISDRODB processing chain ! 
+ sample_interval = infer_sample_interval(ds, verbose=False) + + # Re-add sample interval as coordinate (in seconds) + ds = add_sample_interval(ds, sample_interval=sample_interval) + + # --------------------------------------------------------------------------- + # Retrieve ENV dataset or take defaults + # --> Used only for Beard fall velocity currently ! + ds_env = load_env_dataset(ds) + + # ------------------------------------------------------------------------------------------- + # Filter dataset by diameter and velocity bins + # - Filter diameter bins + ds = filter_diameter_bins(ds=ds, minimum_diameter=minimum_diameter, maximum_diameter=maximum_diameter) + # - Filter velocity bins + if has_velocity_dimension: + ds = filter_velocity_bins(ds=ds, minimum_velocity=minimum_velocity, maximum_velocity=maximum_velocity) + + # ------------------------------------------------------------------------------------------- + # Compute fall velocity + fall_velocity = get_raindrop_fall_velocity( + diameter=ds["diameter_bin_center"], + method=fall_velocity_method, + ds_env=ds_env, # mm + ) + + # Add fall velocity + ds_l1["fall_velocity"] = fall_velocity + + # ------------------------------------------------------------------------------------------- + # Define filtering mask according to fall velocity + if has_velocity_dimension: + mask = define_spectrum_mask( + drop_number=ds["raw_drop_number"], + fall_velocity=fall_velocity, + above_velocity_fraction=above_velocity_fraction, + above_velocity_tolerance=above_velocity_tolerance, + below_velocity_fraction=below_velocity_fraction, + below_velocity_tolerance=below_velocity_tolerance, + small_diameter_threshold=small_diameter_threshold, + small_velocity_threshold=small_velocity_threshold, + maintain_smallest_drops=maintain_smallest_drops, + ) + + # ------------------------------------------------------------------------------------------- + # Retrieve drop number and drop_counts arrays + if has_velocity_dimension: + drop_number = ds["raw_drop_number"].where(mask) # 2D (diameter, velocity) + drop_counts = drop_number.sum(dim="velocity_bin_center") # 1D (diameter) + + else: + drop_number = ds["raw_drop_number"] # 1D (diameter) + drop_counts = ds["raw_drop_number"] # 1D (diameter) + + # Add drop number and drop_counts + ds_l1["drop_number"] = drop_number + ds_l1["drop_counts"] = drop_counts + + # ------------------------------------------------------------------------------------------- + # Compute and add drop average velocity if an optical disdrometer (i.e OTT Parsivel or ThiesLPM) + if has_velocity_dimension: + ds_l1["drop_average_velocity"] = get_drop_average_velocity(drop_number) + + # ------------------------------------------------------------------------------------------- + # Compute minimum and max drop diameter observed + min_drop_diameter, max_drop_diameter = get_min_max_diameter(drop_counts) + + # Add drop statistics + ds_l1["Dmin"] = min_drop_diameter + ds_l1["Dmax"] = max_drop_diameter + ds_l1["n_drops_selected"] = drop_counts.sum(dim=["diameter_bin_center"]) + ds_l1["n_drops_discarded"] = drop_counts.sum(dim=["diameter_bin_center"]) + + # ------------------------------------------------------------------------------------------- + #### Add L0C coordinates that might got lost + if "time_qc" in ds: + ds_l1 = ds_l1.assign_coords({"time_qc": ds["time_qc"]}) + + #### ----------------------------------------------------------------------------. 
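# --------------------------------------------------------------------------
# Illustrative sketch (editorial): how a masked 2D spectrum collapses to the
# 1D drop_counts per diameter bin. Arrays are hypothetical; the discarded-drop
# total shown here is one possible way to derive it from the raw counts.
import numpy as np
import xarray as xr

raw = xr.DataArray(
    np.array([[4, 1], [0, 3]]),
    dims=("diameter_bin_center", "velocity_bin_center"),
)
mask = xr.DataArray(
    np.array([[True, False], [True, True]]),
    dims=("diameter_bin_center", "velocity_bin_center"),
)

drop_number = raw.where(mask)                             # 2D, masked bins -> NaN
drop_counts = drop_number.sum(dim="velocity_bin_center")  # 1D per diameter: [4., 3.]
n_selected = drop_counts.sum(dim="diameter_bin_center")   # 7
n_discarded = raw.sum(dim=("diameter_bin_center", "velocity_bin_center")) - n_selected  # 1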
+ #### Add encodings and attributes + # Add variables attributes + attrs_dict = get_attrs_dict() + ds_l1 = set_attrs(ds_l1, attrs_dict=attrs_dict) + + # Add variables encoding + encoding_dict = get_encoding_dict() + ds_l1 = set_encodings(ds_l1, encoding_dict=encoding_dict) + + # Add global attributes + ds_l1.attrs = attrs + return ds_l1 diff --git a/disdrodb/l1/resampling.py b/disdrodb/l1/resampling.py new file mode 100644 index 00000000..3cfcabbf --- /dev/null +++ b/disdrodb/l1/resampling.py @@ -0,0 +1,236 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Utilities for temporal resampling.""" + + +import pandas as pd +import xarray as xr + +from disdrodb.utils.time import regularize_dataset + +DEFAULT_ACCUMULATIONS = ["10s", "30s", "1min", "2min", "5min", "10min", "30min", "1hour"] + + +def add_sample_interval(ds, sample_interval): + """Add a sample_interval coordinate to the dataset. + + Parameters + ---------- + ds : xarray.Dataset + The input dataset to which the sample_interval coordinate will be added. + sample_interval : int or float + The dataset sample interval in seconds. + + Returns + ------- + xarray.Dataset + The dataset with the added sample interval coordinate. + + Notes + ----- + The function adds a new coordinate named 'sample_interval' to the dataset and + updates the 'measurement_interval' attribute. + """ + # Add sample_interval coordinate + ds["sample_interval"] = sample_interval + ds["sample_interval"].attrs["description"] = "Sample interval" + ds["sample_interval"].attrs["long_name"] = "Sample interval" + ds["sample_interval"].attrs["units"] = "seconds" + ds = ds.set_coords("sample_interval") + # Update measurement_interval attribute + ds.attrs = ds.attrs.copy() + ds.attrs["measurement_interval"] = int(sample_interval) + return ds + + +def define_window_size(sample_interval, accumulation_interval): + """ + Calculate the rolling window size based on sampling and accumulation intervals. + + Parameters + ---------- + sampling_interval : int + The sampling interval in seconds. + accumulation_interval : int + The desired accumulation interval in seconds. + + Returns + ------- + int + The calculated window size as the number of sampling intervals required to cover the accumulation interval. + + Raises + ------ + ValueError + If the accumulation interval is not a multiple of the sampling interval. 
+ + Examples + -------- + >>> define_window_size(60, 300) + 5 + + >>> define_window_size(120, 600) + 5 + """ + # Check compatitiblity + if accumulation_interval % sample_interval != 0: + raise ValueError("The accumulation interval must be a multiple of the sample interval.") + + # Calculate the window size + window_size = accumulation_interval // sample_interval + + return window_size + + +def resample_dataset(ds, sample_interval, accumulation_interval, rolling=True): + """ + Resample the dataset to a specified accumulation interval. + + Parameters + ---------- + ds : xarray.Dataset + The input dataset to be resampled. + sample_interval : int + The sample interval of the input dataset. + accumulation_interval : int + The interval in seconds over which to accumulate the data. + rolling : bool, optional + If True, apply a rolling window before resampling. Default is True. + If True, forward rolling is performed. + The output timesteps correspond to the starts of the periods over which + the resampling operation has been performed ! + + Returns + ------- + xarray.Dataset + The resampled dataset with updated attributes. + + Notes + ----- + - The function regularizes the dataset (infill possible missing timesteps) + before performing the resampling operation. + - Variables are categorized into those to be averaged, accumulated, minimized, and maximized. + - Custom processing for quality flags and handling of NaNs is defined. + - The function updates the dataset attributes and the sample_interval coordinate. + + """ + # Retrieve attributes + attrs = ds.attrs.copy() + + # TODO: here infill NaN with zero if necessary before regularizing ! + + # Ensure regular dataset without missing timesteps + ds = regularize_dataset(ds, freq=f"{sample_interval}s") + + # Initialize resample dataset + ds_resampled = xr.Dataset() + + # Retrieve variables to average/sum + var_to_average = ["fall_velocity"] + var_to_cumulate = ["raw_drop_number", "drop_number", "drop_counts", "n_drops_selected", "n_drops_discarded"] + var_to_min = ["Dmin"] + var_to_max = ["Dmax"] + + # Retrieve available variables + var_to_average = [var for var in var_to_average if var in ds] + var_to_cumulate = [var for var in var_to_cumulate if var in ds] + var_to_min = [var for var in var_to_min if var in ds] + var_to_max = [var for var in var_to_max if var in ds] + + # TODO Define custom processing + # - quality_flag --> take worst + # - skipna if less than fraction (to not waste lot of data when aggregating over i.e. hours) + + # Resample the dataset + # - Rolling currently does not allow direct rolling forward. + # - We currently use center=False which means search for data backward (right-aligned) ! + # - We then drop the first 'window_size' NaN timesteps and we shift backward the timesteps. 
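# --------------------------------------------------------------------------
# Illustrative sketch (editorial) of the forward-rolling trick described in
# the comments above, on a hypothetical 30 s series resampled to 60 s.
import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range("2024-01-01", periods=4, freq="30s")
da = xr.DataArray([1.0, 2.0, 3.0, 4.0], dims="time", coords={"time": time})

window_size = 2  # 60 s accumulation / 30 s sample interval
rolled = da.rolling(time=window_size, center=False).sum()  # right-aligned: [nan, 3, 5, 7]
# Drop the leading NaNs and relabel timesteps with the start of each window
rolled = rolled.isel(time=slice(window_size - 1, None)).assign_coords(
    time=da["time"].data[: -window_size + 1],
)
# -> values [3, 5, 7] labelled 00:00:00, 00:00:30, 00:01:00 (period start times)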
+ # - https://github.com/pydata/xarray/issues/9773 + # - https://github.com/pydata/xarray/issues/8958 + if not rolling: + # Resample + if len(var_to_average) > 0: + ds_resampled.update( + ds[var_to_average].resample({"time": pd.Timedelta(seconds=accumulation_interval)}).mean(skipna=False), + ) + if len(var_to_cumulate) > 0: + ds_resampled.update( + ds[var_to_cumulate].resample({"time": pd.Timedelta(seconds=accumulation_interval)}).sum(skipna=False), + ) + if len(var_to_min) > 0: + ds_resampled.update( + ds[var_to_min].resample({"time": pd.Timedelta(seconds=accumulation_interval)}).min(skipna=False), + ) + if len(var_to_max) > 0: + ds_resampled.update( + ds[var_to_max].resample({"time": pd.Timedelta(seconds=accumulation_interval)}).max(skipna=False), + ) + + else: + # Roll and Resample + window_size = define_window_size(sample_interval=sample_interval, accumulation_interval=accumulation_interval) + if len(var_to_average) > 0: + ds_resampled.update(ds[var_to_average].rolling({"time": window_size}, center=False).mean(skipna=False)) + if len(var_to_cumulate) > 0: + ds_resampled.update(ds[var_to_cumulate].rolling({"time": window_size}, center=False).sum(skipna=False)) + + if len(var_to_min) > 0: + ds_resampled.update(ds[var_to_min].rolling({"time": window_size}, center=False).min(skipna=False)) + if len(var_to_max) > 0: + ds_resampled.update(ds[var_to_max].rolling({"time": window_size}, center=False).max(skipna=False)) + # Ensure time to correspond to the start time of the integration + ds_resampled = ds_resampled.isel(time=slice(window_size - 1, None)).assign_coords( + {"time": ds_resampled["time"].data[: -window_size + 1]}, + ) + + # Add attributes + ds_resampled.attrs = attrs + if rolling: + ds_resampled.attrs["rolled"] = "True" + else: + ds_resampled.attrs["rolled"] = "False" + + # Add accumulation_interval as new sample_interval coordinate + ds_resampled = add_sample_interval(ds_resampled, sample_interval=accumulation_interval) + return ds_resampled + + +def get_possible_accumulations(sample_interval, accumulations=None): + """ + Get a list of valid accumulation intervals based on the sampling time. + + Parameters + ---------- + - sample_interval (int): The inferred sampling time in seconds. + - accumulations (list of int or string): List of desired accumulation intervals. + If provide integers, specify accumulation in seconds. + + Returns + ------- + - list of int: Valid accumulation intervals in seconds. + """ + # Select default accumulations + if accumulations is None: + accumulations = DEFAULT_ACCUMULATIONS + + # Get accumulations in seconds + accumulations = [int(pd.Timedelta(acc).total_seconds()) if isinstance(acc, str) else acc for acc in accumulations] + + # Filter candidate accumulations to include only those that are multiples of the sampling time + possible_accumulations = [acc for acc in accumulations if acc % sample_interval == 0] + + return possible_accumulations diff --git a/disdrodb/l1/routines.py b/disdrodb/l1/routines.py new file mode 100644 index 00000000..96477aa5 --- /dev/null +++ b/disdrodb/l1/routines.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python3 + +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Implement DISDRODB L1 processing.""" + +import datetime +import logging +import os +import time +from typing import Optional + +import dask +import xarray as xr + +# Directory +from disdrodb.api.create_directories import ( + create_logs_directory, + create_product_directory, +) +from disdrodb.api.io import get_filepaths, get_required_product +from disdrodb.api.path import ( + define_l1_filename, +) +from disdrodb.configs import get_base_dir +from disdrodb.l1.processing import generate_l1 +from disdrodb.utils.decorator import delayed_if_parallel, single_threaded_if_parallel + +# Logger +from disdrodb.utils.logger import ( + close_logger, + create_logger_file, + create_product_logs, + log_error, + log_info, +) +from disdrodb.utils.writer import write_product + +logger = logging.getLogger(__name__) + + +def get_l1_options(): + """Get L1 options.""" + # - TODO: from YAML + # - TODO: as function of sensor name + + # minimum_diameter + # --> OTT_Parsivel: 0.2495 + # --> RD80: 0.313 + # --> LPM: 0.125 (we currently discard first bin with this setting) + + # maximum_diameter + # LPM: 8 mm + # RD80: 5.6 mm + # OTT: 26 mm + + l1_options = { + # Fall velocity option + "fall_velocity_method": "Beard1976", + # Diameter-Velocity Filtering Options + "minimum_diameter": 0.2495, # OTT Parsivel first two bin no data ! + "maximum_diameter": 8, + "minimum_velocity": 0, + "maximum_velocity": 12, + "above_velocity_fraction": 0.5, + "above_velocity_tolerance": None, + "below_velocity_fraction": 0.5, + "below_velocity_tolerance": None, + "small_diameter_threshold": 1, # 2 + "small_velocity_threshold": 2.5, # 3 + "maintain_smallest_drops": True, + } + return l1_options + + +@delayed_if_parallel +@single_threaded_if_parallel +def _generate_l1( + filepath, + data_dir, + logs_dir, + campaign_name, + station_name, + # Processing options + force, + verbose, + parallel, # this is used only to initialize the correct logger ! +): + """Generate the L1 product from the DISRODB L0C netCDF file. + + Parameters + ---------- + filepath : str + Path to the L0C netCDF file. + data_dir : str + Directory where the L1 netCDF file will be saved. + logs_dir : str + Directory where the log file will be saved. + campaign_name : str + Name of the campaign. + station_name : str + Name of the station. + force : bool + If True, overwrite existing files. + verbose : bool + Whether to verbose the processing. + + Returns + ------- + str + Path to the log file generated during processing. + + Notes + ----- + If an error occurs during processing, it is caught and logged, + but no error is raised to interrupt the execution. + """ + # -----------------------------------------------------------------. + # Define product name + product = "L1" + + # -----------------------------------------------------------------. + # Create file logger + filename = os.path.basename(filepath) + logger, logger_filepath = create_logger_file( + logs_dir=logs_dir, + filename=filename, + parallel=parallel, + ) + + ##------------------------------------------------------------------------. 
+ # Log start processing + msg = f"{product} processing of {filename} has started." + log_info(logger, msg, verbose=verbose) + + ##------------------------------------------------------------------------. + # Retrieve L1 configurations + l1_options = get_l1_options() + + ##------------------------------------------------------------------------. + ### Core computation + try: + # Open the raw netCDF + with xr.open_dataset(filepath, chunks={}, cache=False) as ds: + ds = ds[["raw_drop_number"]].load() + + # Produce L1 dataset + ds = generate_l1(ds=ds, **l1_options) + + # Write L1 netCDF4 dataset + if ds["time"].size > 1: + # Define filepath + filename = define_l1_filename(ds, campaign_name=campaign_name, station_name=station_name) + filepath = os.path.join(data_dir, filename) + # Write to disk + write_product(ds, product=product, filepath=filepath, force=force) + + ##--------------------------------------------------------------------. + # Clean environment + del ds + + # Log end processing + msg = f"{product} processing of {filename} has ended." + log_info(logger, msg, verbose=verbose) + + ##--------------------------------------------------------------------. + # Otherwise log the error + except Exception as e: + error_type = str(type(e).__name__) + msg = f"{error_type}: {e}" + log_error(logger, msg, verbose=verbose) + + # Close the file logger + close_logger(logger) + + # Return the logger file path + return logger_filepath + + +def run_l1_station( + # Station arguments + data_source, + campaign_name, + station_name, + # Processing options + force: bool = False, + verbose: bool = True, + parallel: bool = True, + debugging_mode: bool = False, + base_dir: Optional[str] = None, +): + """ + Run the L1 processing of a specific DISDRODB station when invoked from the terminal. + + The L1 routines just filter the raw drop spectrum and compute basic statistics. + The L1 routine expects as input L0C files where each file has a unique sample interval. + + This function is intended to be called through the ``disdrodb_run_l1_station`` + command-line interface. + + Parameters + ---------- + data_source : str + The name of the institution (for campaigns spanning multiple countries) or + the name of the country (for campaigns or sensor networks within a single country). + Must be provided in UPPER CASE. + campaign_name : str + The name of the campaign. Must be provided in UPPER CASE. + station_name : str + The name of the station. + force : bool, optional + If ``True``, existing data in the destination directories will be overwritten. + If ``False`` (default), an error will be raised if data already exists in the destination directories. + verbose : bool, optional + If ``True`` (default), detailed processing information will be printed to the terminal. + If ``False``, less information will be displayed. + parallel : bool, optional + If ``True``, files will be processed in multiple processes simultaneously, + with each process using a single thread to avoid issues with the HDF/netCDF library. + If ``False`` (default), files will be processed sequentially in a single process, + and multi-threading will be automatically exploited to speed up I/O tasks. + debugging_mode : bool, optional + If ``True``, the amount of data processed will be reduced. + Only the first 3 files will be processed. By default, ``False``. + base_dir : str, optional + The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. + If not specified, the path specified in the DISDRODB active configuration will be used. 
+ + """ + # Define product + product = "L1" + + # Define base directory + base_dir = get_base_dir(base_dir) + + # Define logs directory + logs_dir = create_logs_directory( + product=product, + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + + # ------------------------------------------------------------------------. + # Start processing + if verbose: + t_i = time.time() + msg = f"{product} processing of station {station_name} has started." + log_info(logger=logger, msg=msg, verbose=verbose) + + # ------------------------------------------------------------------------. + # Create directory structure + data_dir = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + product=product, + force=force, + ) + + # -------------------------------------------------------------------------. + # List files to process + required_product = get_required_product(product) + flag_not_available_data = False + try: + filepaths = get_filepaths( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + product=required_product, + # Processing options + debugging_mode=debugging_mode, + ) + except Exception as e: + print(str(e)) # Case where no file paths available + flag_not_available_data = True + + # -------------------------------------------------------------------------. + # If no data available, print error message and return None + if flag_not_available_data: + msg = ( + f"{product} processing of {data_source} {campaign_name} {station_name}" + + f"has not been launched because of missing {required_product} data." + ) + print(msg) + return + + # -----------------------------------------------------------------. + # Generate L1 files + # - Loop over the L0 netCDF files and generate L1 files. + # - If parallel=True, it does that in parallel using dask.delayed + list_tasks = [ + _generate_l1( + filepath=filepath, + data_dir=data_dir, + logs_dir=logs_dir, + campaign_name=campaign_name, + station_name=station_name, + # Processing options + force=force, + verbose=verbose, + parallel=parallel, + ) + for filepath in filepaths + ] + list_logs = dask.compute(*list_tasks) if parallel else list_tasks + + # -----------------------------------------------------------------. + # Define L1 summary logs + create_product_logs( + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=base_dir, + # Logs list + list_logs=list_logs, + ) + + # ---------------------------------------------------------------------. + # End L1 processing + if verbose: + timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i))) + msg = f"{product} processing of station {station_name} completed in {timedelta_str}" + log_info(logger=logger, msg=msg, verbose=verbose) + + +####-------------------------------------------------------------------------------------------------------------------. diff --git a/disdrodb/l1_env/__init__.py b/disdrodb/l1_env/__init__.py new file mode 100644 index 00000000..b6330547 --- /dev/null +++ b/disdrodb/l1_env/__init__.py @@ -0,0 +1,17 @@ +# -----------------------------------------------------------------------------. 
+# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Core functions for DISDRODB L1 ENV production.""" diff --git a/disdrodb/l1_env/routines.py b/disdrodb/l1_env/routines.py new file mode 100644 index 00000000..5acc40f1 --- /dev/null +++ b/disdrodb/l1_env/routines.py @@ -0,0 +1,38 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Core functions for DISDRODB ENV production.""" + +import xarray as xr + + +def get_default_environment_dataset(): + """Define defaults values for the ENV dataset.""" + ds_env = xr.Dataset() + ds_env["sea_level_air_pressure"] = 101_325 + ds_env["gas_constant_dry_air"] = 287.04 + ds_env["lapse_rate"] = 0.0065 + ds_env["relative_humidity"] = 0.95 # Value between 0 and 1 ! + ds_env["temperature"] = 20 + 273.15 + return ds_env + + +def load_env_dataset(ds): + """Load the ENV dataset.""" + # TODO - Retrieve relative_humidity and temperature from L1-ENV + ds_env = get_default_environment_dataset() + ds_env = ds_env.assign_coords({"altitude": ds["altitude"], "latitude": ds["latitude"]}) + return ds_env diff --git a/disdrodb/l2/__init__.py b/disdrodb/l2/__init__.py new file mode 100644 index 00000000..36d681b8 --- /dev/null +++ b/disdrodb/l2/__init__.py @@ -0,0 +1,17 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. 
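# --------------------------------------------------------------------------
# Illustrative sketch (editorial) for the ENV defaults in l1_env/routines.py
# above: the scalar defaults are combined with the station geolocation before
# being passed to the Beard (1976) fall velocity model. Station values below
# are hypothetical.
import xarray as xr

ds_station = xr.Dataset(coords={"altitude": 450.0, "latitude": 46.5})
ds_env = get_default_environment_dataset()
ds_env = ds_env.assign_coords({"altitude": ds_station["altitude"],
                               "latitude": ds_station["latitude"]})
# ds_env now carries the default atmospheric scalars plus the station
# altitude/latitude coordinates required by get_raindrop_fall_velocity("Beard1976").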
+"""Module for DISDRODB L2 production.""" diff --git a/disdrodb/l2/empirical_dsd.py b/disdrodb/l2/empirical_dsd.py new file mode 100644 index 00000000..49d9ef90 --- /dev/null +++ b/disdrodb/l2/empirical_dsd.py @@ -0,0 +1,1330 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Functions for computation of DSD parameters.""" + +import numpy as np +import xarray as xr + + +def get_effective_sampling_area(sensor_name, diameter): + """Compute the effective sampling area of the disdrometer.""" + if sensor_name in ["OTT_Parsivel", "OTT_Parsivel2"]: + # Calculate sampling area for each diameter bin (S_i) + L = 180 / 1000 # Length of the Parsivel beam in m (180 mm) + B = 30 / 1000 # Width of the Parsivel beam in m (30mm) + sampling_area = L * (B - diameter / 1000 / 2) + elif sensor_name in "Thies_LPM": + # TODO: provided as variable varying with time? + L = 228 / 1000 # Length of the Parsivel beam in m (228 mm) + B = 20 / 1000 # Width of the Parsivel beam in m (20 mm) + sampling_area = L * (B - diameter / 1000 / 2) + elif sensor_name in "RD80": + sampling_area = 1 # TODO + else: + raise NotImplementedError + return sampling_area + + +def _get_spectrum_dims(ds): + if "velocity_bin_center" in ds.dims: + dims = ["diameter_bin_center", "velocity_bin_center"] + else: + dims = ["diameter_bin_center"] + return dims + + +def get_drop_volume(diameter): + """ + Compute the volume of a droplet assuming it is spherical. + + Parameters + ---------- + diameter : float or array-like + The diameter of the droplet(s). Can be a scalar or an array of diameters. + + Returns + ------- + array-like + The volume of the droplet(s) calculated in cubic units based on the input diameter(s). + + Notes + ----- + The volume is calculated using the formula for the volume of a sphere: + V = (π/6) * d^3, where d is the diameter of the droplet. + """ + return np.pi / 6 * diameter**3 # /6 = 4/3*(0.5**3) + + +####-------------------------------------------------------------------------------------------------------------------. + + +def get_drop_average_velocity(drop_number): + r""" + Calculate the drop average velocity \\( v_m(D))) \\) per diameter class. + + Parameters + ---------- + drop_number : xarray.DataArray + Array of drop counts \\( n(D,v) \\) per diameter (and velocity, if available) bins + over the time integration period. + + Returns + ------- + average_velocity : xarray.DataArray + Array of drop average velocity \\( v_m(D))) \\) in m·s⁻¹ . 
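# --------------------------------------------------------------------------
# Illustrative numeric check (editorial) of the Parsivel effective sampling
# area formula above, A(D) = L * (B - D/2), with the diameter converted from
# mm to m. The drop diameter is hypothetical.
L = 180 / 1000          # beam length [m]
B = 30 / 1000           # beam width  [m]
diameter_mm = 1.0       # hypothetical 1 mm drop
sampling_area = L * (B - diameter_mm / 1000 / 2)
# -> 0.18 * (0.030 - 0.0005) = 0.00531 m**2
#    (compared with 0.0054 m**2 for a vanishingly small drop)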
+ """ + velocity = xr.ones_like(drop_number) * drop_number["velocity_bin_center"] + average_velocity = ((velocity * drop_number).sum(dim="velocity_bin_center")) / drop_number.sum( + dim="velocity_bin_center", + ) + # average_velocity = average_velocity.where(average_velocity > 0, 0) + return average_velocity + + +def get_drop_number_concentration(drop_number, velocity, diameter_bin_width, sampling_area, sample_interval): + r""" + Calculate the volumetric drop number concentration \\( N(D) \\) per diameter class. + + Computes the drop number concentration \\( N(D) \\) [m⁻³·mm⁻¹] for each diameter + class based on the measured drop counts and sensor parameters. This represents + the number of drops per unit volume per unit diameter interval. + It is also referred to as the drop size distribution N(D) per cubic metre per millimetre [m-3 mm-1] + + Parameters + ---------- + velocity : xarray.DataArray + Array of drop fall velocities \\( v(D) \\) corresponding to each diameter bin in meters per second (m/s). + diameter_bin_width : xarray.DataArray + Width of each diameter bin \\( \\Delta D \\) in millimeters (mm). + drop_number : xarray.DataArray + Array of drop counts \\( n(D,v) \\) per diameter (and velocity, if available) + bins over the time integration period. + sample_interval : float or xarray.DataArray + Time over which the drops are counted \\( \\Delta t \\) in seconds (s). + sampling_area : float or xarray.DataArray + The effective sampling area \\( A \\) of the sensor in square meters (m²). + + Returns + ------- + drop_number_concentration : xarray.DataArray or ndarray + Array of drop number concentrations \\( N(D) \\) in m⁻³·mm⁻¹, representing + the number of drops per unit volume per unit diameter interval. + + Notes + ----- + The drop number concentration \\( N(D) \\) is calculated using: + + .. math:: + + N(D) = \frac{n(D)}{A_{\text{eff}}(D) \\cdot \\Delta D \\cdot \\Delta t \\cdot v(D)} + + where: + + - \\( n(D,v) \\): Number of drops counted in diameter (and velocity) bins. + - \\( A_{\text{eff}}(D) \\): Effective sampling area of the sensor for diameter \\( D \\) in square meters (m²). + - \\( \\Delta D \\): Diameter bin width in millimeters (mm). + - \\( \\Delta t \\): Time integration period in seconds (s). + - \\( v(D) \\): Fall velocity of drops in diameter bin \\( D \\) in meters per second (m/s). + + The effective sampling area \\( A_{\text{eff}}(D) \\) depends on the sensor and may vary with drop diameter. + """ + # Ensure velocity is 2D (diameter, velocity) + velocity = xr.ones_like(drop_number) * velocity + + # Compute drop number concentration + # - For disdrometer with velocity bins + if "velocity_bin_center" in drop_number.dims: + drop_number_concentration = (drop_number / velocity).sum(dim=["velocity_bin_center"]) / ( + sampling_area * diameter_bin_width * sample_interval + ) + # - For impact disdrometers + else: + drop_number_concentration = drop_number / (sampling_area * diameter_bin_width * sample_interval * velocity) + return drop_number_concentration + + +# def get_drop_number_concentration1(drop_counts, velocity, diameter_bin_width, sampling_area, sample_interval): +# r""" +# Calculate the volumetric drop number concentration \\( N(D) \\) per diameter class. + +# Computes the drop number concentration \\( N(D) \\) [m⁻³·mm⁻¹] for each diameter +# class based on the measured drop counts and sensor parameters. This represents +# the number of drops per unit volume per unit diameter interval. 
+# It is also referred to as the drop size distribution N(D) per cubic metre per millimetre [m-3 mm-1] + +# Parameters +# ---------- +# velocity : xarray.DataArray +# Array of drop fall velocities \\( v(D) \\) corresponding to each diameter bin in meters per second (m/s). +# diameter_bin_width : xarray.DataArray +# Width of each diameter bin \\( \\Delta D \\) in millimeters (mm). +# drop_counts : xarray.DataArray +# Array of drop counts \\( n(D) \\) per diameter bin over the time integration period. +# sample_interval : float or xarray.DataArray +# Time over which the drops are counted \\( \\Delta t \\) in seconds (s). +# sampling_area : xarray.DataArray + +# Returns +# ------- +# drop_number_concentration : xarray.DataArray or ndarray +# Array of drop number concentrations \\( N(D) \\) in m⁻³·mm⁻¹, representing +# the number of drops per unit volume per unit diameter interval. + +# Notes +# ----- +# The drop number concentration \\( N(D) \\) is calculated using: + +# .. math:: + +# N(D) = \frac{n(D)}{A_{\text{eff}}(D) \\cdot \\Delta D \\cdot \\Delta t \\cdot v(D)} + +# where: + +# - \\( n(D) \\): Number of drops counted in diameter bin \\( D \\). +# - \\( A_{\text{eff}}(D) \\): Effective sampling area of the sensor for diameter \\( D \\) in square meters (m²). +# - \\( \\Delta D \\): Diameter bin width in millimeters (mm). +# - \\( \\Delta t \\): Time integration period in seconds (s). +# - \\( v(D) \\): Fall velocity of drops in diameter bin \\( D \\) in meters per second (m/s). + +# The effective sampling area \\( A_{\text{eff}}(D) \\) depends on the sensor and may vary with drop diameter. +# """ +# drop_number_concentration = drop_counts / (sampling_area * diameter_bin_width * sample_interval * velocity) +# return drop_number_concentration + + +def get_total_number_concentration(drop_number_concentration, diameter_bin_width): + r""" + Compute the total number concentration \\( N_t \\) from the drop size distribution. + + Calculates the total number concentration \\( N_t \\) [m⁻³] by integrating the + drop number concentration over all diameter bins. + + Parameters + ---------- + drop_number_concentration : xarray.DataArray + Array of drop number concentrations \\( N(D) \\) in m⁻³·mm⁻¹. + diameter_bin_width : xarray.DataArray + Width of each diameter bin \\( \\Delta D \\) in millimeters (mm). + + Returns + ------- + total_number_concentration : xarray.DataArray or ndarray + Total number concentration \\( N_t \\) in m⁻³, representing the total number + of drops per unit volume. + + Notes + ----- + The total number concentration \\( N_t \\) is calculated by integrating over the diameter bins: + + .. math:: + + N_t = \\sum_{\text{bins}} N(D) \\cdot \\Delta D + + where: + + - \\( N(D) \\): Drop number concentration in each diameter bin [m⁻³·mm⁻¹]. + - \\( \\Delta D \\): Diameter bin width in millimeters (mm). + + """ + total_number_concentration = (drop_number_concentration * diameter_bin_width).sum(dim="diameter_bin_center") + return total_number_concentration + + +def get_moment(drop_number_concentration, diameter, diameter_bin_width, moment): + r""" + Calculate the m-th moment of the drop size distribution. + + Computes the m-th moment of the drop size distribution (DSD), denoted as E[D**m], + where D is the drop diameter and m is the order of the moment. This is useful + in meteorology and hydrology for characterizing precipitation. For example, + weather radar measurements correspond to the sixth moment of the DSD (m = 6). 
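# --------------------------------------------------------------------------
# Illustrative sketch (editorial): computing DSD moments on a toy spectrum.
# N(D) values and bins are hypothetical; diameters are expressed in mm, so the
# m-th moment has units of mm**m per m**3 (the convention used by get_moment).
import numpy as np

diameter_mm = np.array([0.5, 1.0, 2.0])          # bin centers [mm]
bin_width_mm = np.array([0.5, 0.5, 1.0])         # bin widths  [mm]
nd = np.array([2000.0, 800.0, 50.0])             # N(D) [m-3 mm-1]

m3 = np.sum(nd * diameter_mm**3 * bin_width_mm)  # third moment  [mm3 m-3]
m6 = np.sum(nd * diameter_mm**6 * bin_width_mm)  # sixth moment  [mm6 m-3], i.e. linear reflectivity z
nt = np.sum(nd * bin_width_mm)                   # total number concentration [m-3]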
+ + Parameters + ---------- + drop_number_concentration : xarray.DataArray + The drop number concentration N(D) for each diameter bin, + typically in units of number per cubic meter per millimeter (m⁻³ mm⁻¹). + diameter : xarray.DataArray + The equivalent volume diameters D of the drops in each bin, in meters (m). + diameter_bin_width : xarray.DataArray + The width dD of each diameter bin, in millimeters (mm). + moment : int or float + The order m of the moment to compute. + + Returns + ------- + moment_value : xarray.DataArray + The computed m-th moment of the drop size distribution, typically in units + dependent on the input units, such as mmᵐ m⁻³. + + Notes + ----- + The m-th moment is calculated using the formula: + + .. math:: + + M_m = \\sum_{\text{bins}} N(D) \\cdot D^m \\cdot dD + + where: + + - \\( M_m \\) is the m-th moment of the DSD. + - \\( N(D) \\) is the drop number concentration for diameter \\( D \\). + - \\( D^m \\) is the diameter raised to the power of \\( m \\). + - \\( dD \\) is the diameter bin width. + + This computation integrates over the drop size distribution to provide a + scalar value representing the statistical momen + """ + return ((diameter * 1000) ** moment * drop_number_concentration * diameter_bin_width).sum(dim="diameter_bin_center") + + +####------------------------------------------------------------------------------------------------------------------ +#### Rain and Reflectivity + + +def get_rain_rate(drop_counts, sampling_area, diameter, sample_interval): + r""" + Compute the rain rate \\( R \\) [mm/h] based on the drop size distribution and drop velocities. + + This function calculates the rain rate by integrating over the drop size distribution (DSD), + considering the volume of water falling per unit time and area. It uses the number of drops + counted in each diameter class, the effective sampling area of the sensor, the diameters of the + drops, and the time interval over which the drops are counted. + + Parameters + ---------- + drop_counts : xarray.DataArray + Array representing the number of drops per diameter class \\( n(D) \\) in each bin. + sample_interval : float or xarray.DataArray + The time duration over which drops are counted \\( \\Delta t \\) in seconds (s). + sampling_area : float or xarray.DataArray + The effective sampling area \\( A \\) of the sensor in square meters (m²). + diameter : xarray.DataArray + Array of drop diameters \\( D \\) in meters (m). + + Returns + ------- + rain_rate : xarray.DataArray + The computed rain rate \\( R \\) in millimeters per hour (mm/h), which represents the volume + of water falling per unit area per unit time. + + Notes + ----- + The rain rate \\( R \\) is calculated using the following formula: + + .. math:: + + R = \frac{\\pi}{6} \times 10^{-3} \times 3600 \times + \\sum_{\text{bins}} n(D) \cdot A(D) \cdot D^3 \cdot \\Delta t + + Where: + - \\( n(D) \\) is the number of drops in each diameter class. + - \\( A(D) \\) is the effective sampling area. + - \\( D \\) is the drop diameter. + - \\( \\Delta t \\) is the time interval for drop counts. + + This formula incorporates a conversion factor to express the rain rate in millimeters per hour. + """ + rain_rate = ( + np.pi + / 6 + / sample_interval + * (drop_counts / sampling_area * diameter**3).sum(dim="diameter_bin_center") + * 3600 + * 1000 + ) + + # 0.6 or / 6 --> Different variant across articles and codes !!! (pydsd 0.6, raupach 2015, ...) 
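# --------------------------------------------------------------------------
# Editorial unit check (illustrative): with n(D) drops counted over dt seconds,
# A in m**2 and D in m, the summand n(D) * D**3 / A has units of m, so
# (pi/6) / dt * sum * 3600 * 1000 yields mm/h. For a hypothetical single bin
# with n = 30 drops, D = 1e-3 m, A = 0.0054 m**2 and dt = 30 s:
#   (pi/6) / 30 * (30 * 1e-9 / 0.0054) * 3.6e6 ~= 0.35 mm/h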
+ # --> 1/6 * 3600 = 600 = 0.6 * 1e3 = 6 * 1e2 + # --> 1/6 * 3600 * 1000 = 0.6 * 1e6 = 6 * 1e5 --> 6 * 1e-4 (if diameter in mm) + # rain_rate = np.pi * 0.6 * 1e3 / sample_interval * ( + # (drop_counts * diameter**3 / sampling_area).sum(dim="diameter_bin_center") * 1000)) + # rain_rate = np.pi / 6 / sample_interval * ( + # (drop_counts * diameter**3 / sampling_area).sum(dim="diameter_bin_center") * 1000 * 3600) + + return rain_rate + + +def get_rain_rate_from_dsd(drop_number_concentration, velocity, diameter, diameter_bin_width): + r""" + Compute the rain rate \\( R \\) [mm/h] based on the drop size distribution and raindrop velocities. + + Calculates the rain rate by integrating over the drop size distribution (DSD), + considering the volume of water falling per unit time and area. + + Parameters + ---------- + drop_number_concentration : xarray.DataArray + Array of drop number concentrations \\( N(D) \\) in m⁻³·mm⁻¹. + velocity : xarray.DataArray + Array of drop fall velocities \\( v(D) \\) corresponding to each diameter bin in meters per second (m/s). + diameter : xarray.DataArray + Array of drop diameters \\( D \\) in meters (m). + diameter_bin_width : xarray.DataArray + Width of each diameter bin \\( \\Delta D \\) in millimeters (mm). + + Returns + ------- + rain_rate : xarray.DataArray + The rain rate \\( R \\) in millimeters per hour (mm/h), representing the volume + of water falling per unit area per unit time. + + Notes + ----- + The rain rate \\( R \\) is calculated using: + + .. math:: + + R = \frac{\\pi}{6} \times 10^{-3} \times 3600 \times + \\sum_{\text{bins}} N(D) \\cdot v(D) \\cdot D^3 \\cdot \\Delta D + + where: + + - \\( N(D) \\): Drop number concentration [m⁻³·mm⁻¹]. + - \\( v(D) \\): Fall velocity of drops in diameter bin \\( D \\) [m/s]. + - \\( D \\): Drop diameter [mm]. + - \\( \\Delta D \\): Diameter bin width [mm]. + - The factor \\( \frac{\\pi}{6} \\) converts the diameter cubed to volume of a sphere. + - The factor \\( 10^{-3} \\) converts from mm³ to m³. + - The factor \\( 3600 \\) converts from seconds to hours. + + """ + # The following formula assume diameter in mm !!! + rain_rate = ( + np.pi + / 6 + * (drop_number_concentration * velocity * diameter**3 * diameter_bin_width).sum(dim="diameter_bin_center") + * 3600 + * 1000 + ) + + # Alternative formulation + # 3600*1000/6 = 6e5 + # 1e-9 for mm to meters conversion + # --> 6 * 1 e-4 + # rain_rate = 6 * np.pi * 1e-4 * ( + # (drop_number_concentration * velocity * diameter**3 * diameter_bin_width).sum(dim="diameter_bin_center") + # ) + return rain_rate + + +def get_rain_accumulation(rain_rate, sample_interval): + """ + Calculate the total rain accumulation over a specified time period. + + Parameters + ---------- + rain_rate : float or array-like + The rain rate in millimeters per hour (mm/h). + sample_interval : int + The time over which to accumulate rain, specified in seconds. + + Returns + ------- + float or numpy.ndarray + The total rain accumulation in millimeters (mm) over the specified time period. + + """ + rain_accumulation = rain_rate / 3600 * sample_interval + return rain_accumulation + + +def get_equivalent_reflectivity_factor(drop_number_concentration, diameter, diameter_bin_width): + r""" + Compute the equivalent reflectivity factor in decibels relative to 1 mm⁶·m⁻³ (dBZ). + + The equivalent reflectivity (in mm⁶·m⁻³) is obtained from the sixth moment of the drop size distribution (DSD). + The reflectivity factor is expressed in decibels relative to 1 mm⁶·m⁻³ using the formula: + + .. 
math:: + + Z = 10 \cdot \log_{10}(z) + + where \\( z \\) is the reflectivity in linear units of the DSD. + + To convert back the reflectivity factor to linear units (mm⁶·m⁻³), use the formula: + + .. math:: + + z = 10^{(Z/10)} + + Parameters + ---------- + drop_number_concentration : xarray.DataArray + Array representing the concentration of droplets per diameter class in number per unit volume. + diameter : xarray.DataArray + Array of droplet diameters in meters (m). + diameter_bin_width : xarray.DataArray + Array representing the width of each diameter bin in millimeters (mm). + + Returns + ------- + xarray.DataArray + The equivalent reflectivity factor in decibels (dBZ). + + Notes + ----- + The function computes the sixth moment of the DSD using the formula: + + .. math:: + + z = \\sum n(D) \cdot D^6 \cdot \\Delta D + + where \\( n(D) \\) is the drop number concentration, \\( D \\) is the drop diameter, and + \\( \\Delta D \\) is the diameter bin width. + + """ + # Compute reflectivity in mm⁶·m⁻³ + z = ((diameter * 1000) ** 6 * drop_number_concentration * diameter_bin_width).sum(dim="diameter_bin_center") + invalid_mask = z > 0 + z = z.where(invalid_mask) + # Compute equivalent reflectivity factor in dBZ + # - np.log10(np.nan) returns -Inf ! + # --> We mask again after the log + Z = 10 * np.log10(z) + Z = Z.where(invalid_mask) + return Z + + +####------------------------------------------------------------------------------------------------------------------ +#### Liquid Water Content / Mass Parameters + + +def get_mass_spectrum(drop_number_concentration, diameter, water_density=1000): + """ + Calculate the rain drop mass spectrum m(D) in g/m3 mm-1. + + It represents the mass of liquid water as a function of raindrop diameter. + + Parameters + ---------- + drop_number_concentration : array-like + The concentration of droplets (number of droplets per unit volume) in each diameter bin. + diameter : array-like + The diameters of the droplets for each bin, in meters (m). + + + Returns + ------- + array-like + The calculated rain drop mass spectrum in grams per cubic meter per diameter (g/m3 mm-1). + + """ + # Convert water density from kg/m3 to g/m3 + water_density = water_density * 1000 + + # Calculate the volume constant for the water droplet formula + vol_constant = np.pi / 6.0 * water_density + + # Calculate the mass spectrum (lwc per diameter bin) + return vol_constant * (diameter**3 * drop_number_concentration) # [g/m3 mm-1] + + +def get_liquid_water_content(drop_number_concentration, diameter, diameter_bin_width, water_density=1000): + """ + Calculate the liquid water content based on drop number concentration and drop diameter. + + Parameters + ---------- + drop_number_concentration : array-like + The concentration of droplets (number of droplets per unit volume) in each diameter bin. + diameter : array-like + The diameters of the droplets for each bin, in meters (m). + diameter_bin_width : array-like + The width of each diameter bin, in millimeters (mm). + water_density : float, optional + The density of water in kg/m^3. The default is 1000 kg/m3. + + Returns + ------- + array-like + The calculated liquid water content in grams per cubic meter (g/m3). 
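# --------------------------------------------------------------------------
# Illustrative sketch (editorial): linear reflectivity vs. dBZ, following the
# convention of get_equivalent_reflectivity_factor above. The toy z value is
# hypothetical.
import numpy as np

z = 3600.0               # sixth moment of the DSD [mm6 m-3]
Z = 10 * np.log10(z)     # equivalent reflectivity factor, ~35.6 dBZ
z_back = 10 ** (Z / 10)  # back to linear units, recovers 3600 mm6 m-3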
+
+ """
+ # Convert water density from kg/m3 to g/m3
+ water_density = water_density * 1000
+
+ # Calculate the volume constant for the water droplet formula
+ vol_constant = np.pi / 6.0 * water_density
+
+ # Calculate the liquid water content
+ lwc = vol_constant * (diameter**3 * drop_number_concentration * diameter_bin_width).sum(dim="diameter_bin_center")
+ return lwc
+
+
+def get_mom_liquid_water_content(moment_3, water_density=1000):
+ r"""
+ Calculate the liquid water content (LWC) from the third moment of the DSD.
+
+ LWC represents the mass of liquid water per unit volume of air.
+
+ Parameters
+ ----------
+ moment_3 : float or array-like
+ The third moment of the drop size distribution, \\( M_3 \\), in units of
+ [m⁻³·mm³] (number per cubic meter times diameter cubed).
+ water_density : float, optional
+ The density of water in kilograms per cubic meter (kg/m³).
+ Default is 1000 kg/m³ (approximate density of water at 20°C).
+
+ Returns
+ -------
+ lwc : float or array-like
+ The liquid water content in grams per cubic meter (g/m³).
+
+ Notes
+ -----
+ The liquid water content is calculated using the formula:
+
+ .. math::
+
+ \text{LWC} = \frac{\\pi \rho_w}{6} \\cdot M_3
+
+ where:
+
+ - \\( \text{LWC} \\) is the liquid water content [g/m³].
+ - \\( \rho_w \\) is the density of water [g/mm³].
+ - \\( M_3 \\) is the third moment of the DSD [m⁻³·mm³].
+
+ Examples
+ --------
+ Compute the liquid water content from the third moment:
+
+ >>> moment_3 = 1e3 # Example value in [m⁻³·mm³]
+ >>> lwc = get_mom_liquid_water_content(moment_3)
+ >>> print(f"LWC: {lwc:.4f} g/m³")
+ LWC: 0.5236 g/m³
+ """
+ # Convert water density from kg/m³ to g/mm³
+ water_density = water_density * 1e-6 # [kg/m³] * 1e-6 = [g/mm³]
+ # Calculate LWC [g/m3]
+ lwc = (np.pi * water_density / 6) * moment_3 # [g/mm³] * [m⁻³·mm³] = [g/m³]
+ return lwc
+
+
+####--------------------------------------------------------------------------------------------------------
+#### Diameter parameters
+
+
+def _get_last_xr_valid_idx(da_condition, dim, fill_value=None):
+ """
+ Get the index of the last True value along a specified dimension in an xarray DataArray.
+
+ This function finds the last index along the given dimension where the condition is True.
+ If all values are False or NaN along that dimension, the function returns ``fill_value``.
+
+ Parameters
+ ----------
+ da_condition : xarray.DataArray
+ A boolean DataArray where True indicates valid or desired values.
+ Should have the dimension specified in `dim`.
+ dim : str
+ The name of the dimension along which to find the last True index.
+ fill_value : int or float, optional
+ The fill value when all values are False or NaN along the specified dimension.
+ The default is ``dim_size - 1``.
+
+ Returns
+ -------
+ last_idx : xarray.DataArray
+ An array containing the index of the last True value along the specified dimension.
+ If all values are False or NaN, the corresponding entry in `last_idx` is set to ``fill_value``.
+
+ Notes
+ -----
+ The function works by reversing the DataArray along the specified dimension and using
+ `argmax` to find the first True value in the reversed array. It then calculates the
+ corresponding index in the original array. To handle cases where all values are False
+ or NaN (and `argmax` would return 0), the function checks if there is any True value
+ along the dimension and assigns ``fill_value`` to `last_idx` where appropriate.
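+ For example, with ``dim_size = 5`` and the last True value found by ``argmax`` at
+ reversed position 1, the original index is recovered as ``5 - 1 - 1 = 3``.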
+ + Examples + -------- + >>> import xarray as xr + >>> da = xr.DataArray([[False, False, True], [False, False, False]], dims=["time", "diameter_bin_center"]) + >>> last_idx = _get_last_xr_valid_idx(da, "diameter_bin_center") + >>> print(last_idx) + + array([2., nan]) + Dimensions without coordinates: time + + In this example, for the first time step, the last True index is 2. + For the second time step, all values are False, so the function returns NaN. + + """ + # Get the size of the 'diameter_bin_center' dimension + dim_size = da_condition.sizes[dim] + + # Define default fillvalue + if fill_value is None: + fill_value = dim_size - 1 + + # Reverse the mask along 'diameter_bin_center' + da_condition_reversed = da_condition.isel({dim: slice(None, None, -1)}) + + # Check if there is any True value along the dimension for each slice + has_true = da_condition.any(dim=dim) + + # Find the first non-zero index in the reversed array + last_idx_from_end = da_condition_reversed.argmax(dim=dim) + + # Calculate the last True index in the original array + last_idx = xr.where( + has_true, + dim_size - last_idx_from_end - 1, + fill_value, + ) + return last_idx + + +def get_min_max_diameter(drop_counts): + """ + Get the minimum and maximum diameters where drop_counts is non-zero. + + Parameters + ---------- + drop_counts : xarray.DataArray + Drop counts with dimensions ("time", "diameter_bin_center") and + coordinate "diameter_bin_center". + + Returns + ------- + min_drop_diameter : xarray.DataArray + Minimum diameter where drop_counts is non-zero, for each time step. + max_drop_diameter : xarray.DataArray + Maximum diameter where drop_counts is non-zero, for each time step. + """ + # Create a boolean mask where drop_counts is non-zero + non_zero_mask = drop_counts > 0 + + # Find the first non-zero index along 'diameter_bin_center' for each time + # - Return 0 if all False, zero or NaN + first_non_zero_idx = non_zero_mask.argmax(dim="diameter_bin_center") + + # Calculate the last non-zero index in the original array + last_non_zero_idx = _get_last_xr_valid_idx(da_condition=non_zero_mask, dim="diameter_bin_center") + + # Get the 'diameter_bin_center' coordinate + diameters = drop_counts["diameter_bin_center"] + + # Retrieve the diameters corresponding to the first and last non-zero indices + min_drop_diameter = diameters.isel(diameter_bin_center=first_non_zero_idx.astype(int)) + max_drop_diameter = diameters.isel(diameter_bin_center=last_non_zero_idx.astype(int)) + + # Identify time steps where all drop_counts are zero + is_all_zero_or_nan = ~non_zero_mask.any(dim="diameter_bin_center") + + # Mask with NaN where no drop or all values are NaN + min_drop_diameter = min_drop_diameter.where(~is_all_zero_or_nan) + max_drop_diameter = max_drop_diameter.where(~is_all_zero_or_nan) + + return min_drop_diameter, max_drop_diameter + + +def get_mode_diameter(drop_number_concentration): + """Get raindrop diameter with highest occurrence.""" + diameter = drop_number_concentration["diameter_bin_center"] + # If all NaN, set to 0 otherwise argmax fail when all NaN data + idx_all_nan_mask = np.isnan(drop_number_concentration).all(dim="diameter_bin_center") + drop_number_concentration = drop_number_concentration.where(~idx_all_nan_mask, 0) + # Find index where all 0 + # --> argmax will return 0 + idx_all_zero = (drop_number_concentration == 0).all(dim="diameter_bin_center") + # Find the diameter index corresponding the "mode" + idx_observed_mode = drop_number_concentration.argmax(dim="diameter_bin_center") + # Find the 
diameter corresponding to the "mode" + diameter_mode = diameter.isel({"diameter_bin_center": idx_observed_mode}) + diameter_mode = diameter_mode.drop( + ["diameter_bin_width", "diameter_bin_lower", "diameter_bin_upper", "diameter_bin_center"], + ) + # Set to np.nan where data where all NaN or all 0 + idx_mask = np.logical_or(idx_all_nan_mask, idx_all_zero) + diameter_mode = diameter_mode.where(~idx_mask) + return diameter_mode + + +####-------------------------------------------------------------------------------------------------------------------. +#### Mass diameters + + +def get_mean_volume_drop_diameter(moment_3, moment_4): + r""" + Calculate the volume-weighted mean volume diameter \\( D_m \\) from DSD moments. + + The mean volume diameter of a drop size distribution (DSD) is computed using + the third and fourth moments. + + The volume-weighted mean volume diameter is also referred as the mass mean diameter. + It represents the first moment of the mass spectrum. + + Parameters + ---------- + moment_3 : float or array-like + The third moment of the drop size distribution, \\( M_3 \\), in units of + [m⁻³·mm³]. + moment_4 : float or array-like + The fourth moment of the drop size distribution, \\( M_4 \\), in units of + [m⁻³·mm⁴]. + + Returns + ------- + D_m : float or array-like + The mean volume diameter in millimeters (mm). + + Notes + ----- + The mean volume diameter is calculated using the formula: + + .. math:: + + D_m = \frac{M_4}{M_3} + + where: + + - \\( D_m \\) is the mean volume diameter [mm]. + - \\( M_3 \\) is the third moment of the DSD [m⁻³·mm³]. + - \\( M_4 \\) is the fourth moment of the DSD [m⁻³·mm⁴]. + + Examples + -------- + Compute the mean volume diameter from the third and fourth moments: + + >>> moment_3 = 1e6 # Example value in [m⁻³·mm³] + >>> moment_4 = 5e6 # Example value in [m⁻³·mm⁴] + >>> D_m = get_mean_volume_drop_diameter(moment_3, moment_4) + >>> print(f"Mean Volume Diameter D_m: {D_m:.4f} mm") + Mean Volume Diameter D_m: 5.0000 mm + + """ + D_m = moment_4 / moment_3 # Units: [mm⁴] / [mm³] = [mm] + return D_m + + +def get_std_volume_drop_diameter(drop_number_concentration, diameter_bin_width, diameter, mean_volume_diameter): + r""" + Calculate the standard deviation of the mass-weighted drop diameter (σₘ). + + This parameter is often also referred as the mass spectrum standard deviation. + It quantifies the spread or variability of DSD. + + Parameters + ---------- + drop_number_concentration : xarray.DataArray + The drop number concentration \\( N(D) \\) for each diameter bin, typically in units of + number per cubic meter per millimeter (m⁻³·mm⁻¹). + diameter : xarray.DataArray + The equivalent volume diameters \\( D \\) of the drops in each bin, in meters (m). + diameter_bin_width : xarray.DataArray + The width \\( \\Delta D \\) of each diameter bin, in millimeters (mm). + mean_volume_diameter : xarray.DataArray + The mean volume diameter \\( D_m \\), in millimeters (mm). This is typically computed using the + third and fourth moments or directly from the DSD. + + Returns + ------- + sigma_m : xarray.DataArray or float + The standard deviation of the mass-weighted drop diameter, \\( \\sigma_m \\), + in millimeters (mm). + + Notes + ----- + The standard deviation of the mass-weighted drop diameter is calculated using the formula: + + .. 
math:: + + \\sigma_m = \\sqrt{\frac{\\sum [N(D) \\cdot (D - D_m)^2 \\cdot D^3 + \\cdot \\Delta D]}{\\sum [N(D) \\cdot D^3 \\cdot \\Delta D]}} + + where: + + - \\( N(D) \\) is the drop number concentration for diameter \\( D \\) [m⁻³·mm⁻¹]. + - \\( D \\) is the drop diameter [mm]. + - \\( D_m \\) is the mean volume diameter [mm]. + - \\( \\Delta D \\) is the diameter bin width [mm]. + - The numerator computes the weighted variance of diameters. + - The weighting factor \\( D^3 \\) accounts for mass (since mass ∝ \\( D^3 \\)). + + **Physical Interpretation:** + + - A smaller \\( \\sigma_m \\) indicates that the mass is concentrated around the + mean mass-weighted diameter, implying less variability in drop sizes. + - A larger \\( \\sigma_m \\) suggests a wider spread of drop sizes contributing + to the mass, indicating greater variability. + + References + ---------- + - Smith, P. L., Johnson, R. W., & Kliche, D. V. (2019). On Use of the Standard + Deviation of the Mass Distribution as a Parameter in Raindrop Size Distribution + Functions. *Journal of Applied Meteorology and Climatology*, 58(4), 787-796. + https://doi.org/10.1175/JAMC-D-18-0086.1 + - Williams, C. R., and Coauthors, 2014: Describing the Shape of Raindrop Size Distributions Using Uncorrelated + Raindrop Mass Spectrum Parameters. J. Appl. Meteor. Climatol., 53, 1282-1296, https://doi.org/10.1175/JAMC-D-13-076.1. + """ + const = drop_number_concentration * diameter_bin_width * diameter**3 + numerator = ((diameter * 1000 - mean_volume_diameter) ** 2 * const).sum(dim="diameter_bin_center") + sigma_m = np.sqrt(numerator / const.sum(dim="diameter_bin_center")) + return sigma_m + + +def get_median_volume_drop_diameter(drop_number_concentration, diameter, diameter_bin_width, water_density=1000): + r""" + Compute the median volume drop diameter (D50). + + The median volume drop diameter (D50) is defined as the diameter at which half of the total liquid water content + is contributed by drops smaller than D50, and half by drops larger than D50. + + Drops smaller (respectively larger) than D50 contribute to half of the + total rainwater content in the sampled volume. + D50 is sensitive to the concentration of large drops. + + Often referred also as D50 (50 for 50 percentile of the distribution). + + Parameters + ---------- + drop_number_concentration : xarray.DataArray + The drop number concentration \( N(D) \) for each diameter bin, typically in units of + number per cubic meter per millimeter (m⁻³·mm⁻¹). + diameter : xarray.DataArray + The equivalent volume diameters \( D \) of the drops in each bin, in meters (m). + diameter_bin_width : xarray.DataArray + The width \( \Delta D \) of each diameter bin, in millimeters (mm). + water_density : float, optional + The density of water in kg/m^3. The default is 1000 kg/m3. + + Returns + ------- + xarray.DataArray + Median volume drop diameter (D50) [mm]. + The drop diameter that divides the volume of water contained in the sample into two equal parts. + + """ + d50 = get_quantile_volume_drop_diameter( + drop_number_concentration=drop_number_concentration, + diameter=diameter, + diameter_bin_width=diameter_bin_width, + fraction=0.5, + water_density=water_density, + ) + return d50 + + +def get_quantile_volume_drop_diameter( + drop_number_concentration, + diameter, + diameter_bin_width, + fraction, + water_density=1000, +): + r""" + Compute the diameter corresponding to a specified fraction of the cumulative liquid water content (LWC). 
+ + This function calculates the diameter \( D_f \) at which the cumulative LWC reaches + a specified fraction \( f \) of the total LWC for each drop size distribution (DSD). + When \( f = 0.5 \), it computes the median volume drop diameter. + + + Parameters + ---------- + drop_number_concentration : xarray.DataArray + The drop number concentration \( N(D) \) for each diameter bin, typically in units of + number per cubic meter per millimeter (m⁻³·mm⁻¹). + diameter : xarray.DataArray + The equivalent volume diameters \( D \) of the drops in each bin, in meters (m). + diameter_bin_width : xarray.DataArray + The width \( \Delta D \) of each diameter bin, in millimeters (mm). + fraction : float + The fraction \( f \) of the total liquid water content to compute the diameter for. + Default is 0.5, which computes the median volume diameter (D50). + For other percentiles, use 0.1 for D10, 0.9 for D90, etc. Must be between 0 and 1 (exclusive). + water_density : float, optional + The density of water in kg/m^3. The default is 1000 kg/m3. + + Returns + ------- + D_f : xarray.DataArray + The diameter \( D_f \) corresponding to the specified fraction \( f \) of cumulative LWC, + in millimeters (mm). For `fraction=0.5`, this is the median volume drop diameter D50. + + Notes + ----- + The calculation involves computing the cumulative sum of the liquid water content + contributed by each diameter bin and finding the diameter at which the cumulative + sum reaches the specified fraction \( f \) of the total liquid water content. + + Linear interpolation is used between the two diameter bins where the cumulative LWC + crosses the target LWC fraction. + + """ + # Check fraction + if not (0 < fraction < 1): + raise ValueError("Fraction must be between 0 and 1 (exclusive)") + + # Convert water density from kg/m3 to g/m3 + water_density = water_density * 1000 + + # Compute LWC per diameter bin [g/m3] + lwc_per_diameter = np.pi / 6.0 * water_density * (diameter**3 * drop_number_concentration * diameter_bin_width) + + # Compute rain rate per diameter [mm/hr] + # rain_rate_per_diameter = np.pi / 6 * ( + # (drop_number_concentration * velocity * diameter**3 * diameter_bin_width) * 3600 * 1000 + # ) + + # Compute the cumulative sum of LWC along the diameter bins + cumulative_lwc = lwc_per_diameter.cumsum(dim="diameter_bin_center") + + # ------------------------------------------------------. + # Retrieve total lwc and target lwc + total_lwc = cumulative_lwc.isel(diameter_bin_center=-1) + target_lwc = total_lwc * fraction + + # Retrieve idx half volume is reached + # --> If all NaN or False, argmax and _get_last_xr_valid_idx(fill_value=0) return 0 ! + idx_upper = (cumulative_lwc >= target_lwc).argmax(dim="diameter_bin_center") + idx_lower = _get_last_xr_valid_idx( + da_condition=(cumulative_lwc <= target_lwc), + dim="diameter_bin_center", + fill_value=0, + ) + + # Define mask when fraction fall exactly at a diameter bin center + # - Also related to the case of only values in the first bin. 
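+ # Otherwise, the quantile diameter is obtained below by linear interpolation between
+ # the bracketing bin centers: D_f = d1 + (d2 - d1) * (target_lwc - y1) / (y2 - y1).
+ # For instance (hypothetical numbers), with y1=0.4, y2=0.6, target_lwc=0.5,
+ # d1=0.0010 m and d2=0.0012 m, the interpolated diameter is 0.0011 m
+ # (1.1 mm after the final conversion to millimeters).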
+ solution_is_bin_center = idx_upper == idx_lower
+
+ # Define diameter increment from lower bin center
+ y1 = cumulative_lwc.isel(diameter_bin_center=idx_lower)
+ y2 = cumulative_lwc.isel(diameter_bin_center=idx_upper)
+ yt = target_lwc
+ d1 = diameter.isel(diameter_bin_center=idx_lower) # m
+ d2 = diameter.isel(diameter_bin_center=idx_upper) # m
+ d_increment = (d2 - d1) * (yt - y1) / (y2 - y1)
+
+ # Define quantile diameter
+ d = xr.where(solution_is_bin_center, d1, d1 + d_increment)
+
+ # Set NaN where total sum is 0 or all NaN
+ mask_invalid = np.logical_or(total_lwc == 0, np.isnan(total_lwc))
+ d = d.where(~mask_invalid)
+
+ # Convert diameter to mm
+ d = d * 1000
+
+ return d
+
+
+####-----------------------------------------------------------------------------------------------------
+#### Normalized Gamma Parameters
+
+
+def get_normalized_intercept_parameter(liquid_water_content, mean_volume_diameter, water_density=1000):
+ r"""
+ Calculate the normalized intercept parameter \\( N_w \\) of the drop size distribution.
+
+ A higher \\( N_w \\) indicates a higher concentration of smaller drops.
+ The \\( N_w \\) is used in models to represent the DSD when assuming a normalized gamma distribution.
+
+ Parameters
+ ----------
+ liquid_water_content : float or array-like
+ Liquid water content \\( LWC \\) in grams per cubic meter (g/m³).
+ mean_volume_diameter : float or array-like
+ Mean volume diameter \\( D_m \\) in millimeters (mm).
+ water_density : float, optional
+ Density of water \\( \rho_w \\) in kilograms per cubic meter (kg/m³).
+ The default is 1000 kg/m³.
+
+ Returns
+ -------
+ Nw : xarray.DataArray or float
+ Normalized intercept parameter \\( N_w \\) in units of m⁻³·mm⁻¹.
+
+ Notes
+ -----
+ The normalized intercept parameter \\( N_w \\) is calculated using the formula:
+
+ .. math::
+
+ N_w = \frac{256}{\\pi \rho_w} \\cdot \frac{W}{D_m^4}
+
+ where:
+
+ - \\( N_w \\) is the normalized intercept parameter.
+ - \\( W \\) is the liquid water content in g/m³.
+ - \\( D_m \\) is the mean volume diameter in mm.
+ - \\( \rho_w \\) is the density of water in kg/m³.
+ """
+ # Conversion to g/m3
+ water_density = water_density * 1000 # g/m3
+
+ # Compute Nw
+ # --> 1e9 is used to convert from mm-4 to m-3 mm-1
+ # - 256 = 4**4
+ # - lwc = (np.pi * water_density / 6) * moment_3
+ Nw = (256.0 / (np.pi * water_density)) * liquid_water_content / mean_volume_diameter**4 * 1e9
+ return Nw
+
+
+def get_mom_normalized_intercept_parameter(moment_3, moment_4):
+ r"""
+ Calculate the normalized intercept parameter \\( N_w \\) of the drop size distribution.
+
+ Parameters
+ ----------
+ moment_3 : float or array-like
+ The third moment of the drop size distribution, \\( M_3 \\), in units of
+ [m⁻³·mm³] (number per cubic meter times diameter cubed).
+ moment_4 : float or array-like
+ The fourth moment of the drop size distribution, \\( M_4 \\), in units of
+ [m⁻³·mm⁴].
+
+ Returns
+ -------
+ Nw : xarray.DataArray or float
+ Normalized intercept parameter \\( N_w \\) in units of m⁻³·mm⁻¹.
+
+ References
+ ----------
+ Testud, J., S. Oury, R. A. Black, P. Amayenc, and X. Dou, 2001:
+ The Concept of “Normalized” Distribution to Describe Raindrop Spectra:
+ A Tool for Cloud Physics and Cloud Remote Sensing.
+ J. Appl. Meteor.
Climatol., 40, 1118-1140, + https://doi.org/10.1175/1520-0450(2001)040<1118:TCONDT>2.0.CO;2 + + """ + Nw = 256 / 6 * moment_3**5 / moment_4**4 + return Nw + + +####-------------------------------------------------------------------------------------------------------- +#### Kinetic Energy Parameters + + +def get_min_max_drop_kinetic_energy(drop_number, diameter, velocity, water_density=1000): + r""" + Calculate the minimum and maximum kinetic energy of raindrops in a drop size distribution (DSD). + + This function computes the kinetic energy of individual raindrops based on their diameters and + fall velocities and returns the minimum and maximum values among these drops for each time step. + + Parameters + ---------- + drop_number : xarray.DataArray + The number of drops in each diameter (and velocity, if available) bin(s). + diameter : xarray.DataArray + The equivalent volume diameters \\( D \\) of the drops in each bin, in meters (m). + velocity : xarray.DataArray or float + The fall velocities \\( v \\) of the drops in each bin, in meters per second (m/s). + water_density : float, optional + The density of water \\( \rho_w \\) in kilograms per cubic meter (kg/m³). + Default is 1000 kg/m³. + + Returns + ------- + min_drop_kinetic_energy : xarray.DataArray + The minimum kinetic energy among the drops present in the DSD, in joules (J). + max_drop_kinetic_energy : xarray.DataArray + The maximum kinetic energy among the drops present in the DSD, in joules (J). + + Notes + ----- + The kinetic energy \\( KE \\) of an individual drop is calculated using: + + .. math:: + + KE = \frac{1}{2} \\cdot m \\cdot v^2 + + where: + + - \\( m \\) is the mass of the drop, calculated as: + + .. math:: + + m = \frac{\\pi}{6} \\cdot \rho_w \\cdot D^3 + + with \\( D \\) being the drop diameter. + + - \\( v \\) is the fall velocity of the drop. + """ + # Ensure velocity is 2D (diameter, velocity) + velocity = xr.ones_like(drop_number) * velocity + + # # Compute the mass of each drop: m = (π/6) * rho_w * D^3 + # mass = (np.pi / 6) * water_density * diameter**3 # Units: kg + + # # Compute kinetic energy: KE = 0.5 * m * v^2 + # ke = 0.5 * mass * velocity**2 # Units: J + + # Compute kinetic energy + ke = 1 / 12 * water_density * np.pi * diameter**3 * velocity**2 + + # Select kinetic energies where drops are present + ke = ke.where(drop_number > 0) + + # Compute min, mean and maximum drop kinetic energy + max_drop_kinetic_energy = ke.max(dim=_get_spectrum_dims(ke)) + min_drop_kinetic_energy = ke.min(dim=_get_spectrum_dims(ke)) + return min_drop_kinetic_energy, max_drop_kinetic_energy + + +def get_kinetic_energy_density_flux( + drop_number, + diameter, + velocity, + sampling_area, + sample_interval, + water_density=1000, +): + r""" + Calculate the kinetic energy flux density (KE) of rainfall over time. + + This function computes the total kinetic energy of raindrops passing through the sensor's sampling area + per unit time and area, resulting in the kinetic energy flux density + in joules per square meter per hour (J·m⁻²·h⁻¹). + + Typical values range between 0 and 5000 J·m⁻²·h⁻¹ . + KE = E * R + + Parameters + ---------- + drop_number : xarray.DataArray + The number of drops in each diameter (and velocity, if available) bin(s). + diameter : xarray.DataArray + The equivalent volume diameters \\( D \\) of the drops in each bin, in meters (m). + velocity : xarray.DataArray or float + The fall velocities \\( v \\) of the drops in each bin, in meters per second (m/s). 
+ Values are broadcasted to match the dimensions of `drop_number`. + sampling_area : float + The effective sampling area \\( A \\) of the sensor in square meters (m²). + sample_interval : float + The time over which the drops are counted \\( \\Delta t \\) in seconds (s). + water_density : float, optional + The density of water \\( \rho_w \\) in kilograms per cubic meter (kg/m³). + Default is 1000 kg/m³. + + Returns + ------- + kinetic_energy_flux : xarray.DataArray + The kinetic energy flux density of rainfall in joules per square meter per hour (J·m⁻²·h⁻¹). + Dimensions are reduced to ('time',). + + Notes + ----- + The kinetic energy flux density \\( KE \\) is calculated using: + + .. math:: + + KE = \frac{1}{2} \\cdot \frac{\rho_w \\pi}{6} \\cdot \frac{1}{\\Delta t} \\cdot 3600 \\cdot \\sum_{i,j} + \\left( \frac{n_{ij} \\cdot D_i^3 \\cdot v_j^2}{A} \right) + + where: + + - \\( n_{ij} \\) is the number of drops in diameter bin \\( i \\) and velocity bin \\( j \\). + - \\( D_i \\) is the diameter of bin \\( i \\). + - \\( v_j \\) is the velocity of bin \\( j \\). + - \\( A \\) is the sampling area. + - \\( \\Delta t \\) is the time integration period in seconds. + - The factor \\( 3600 \\) converts the rate to per hour. + + """ + # Ensure velocity is 2D (diameter, velocity) + velocity = xr.ones_like(drop_number) * velocity + + # # Compute rain drop kinetic energy [J] + # ke = 0.5 * water_density * np.pi / 6 * diameter **3 * velocity**2 + # # Compute total kinetic energy in [J / m2] + # total_kinetic_energy = (ke * drop_number / sampling_area).sum(dim=["diameter_bin_center", "velocity_bin_center"]) + # # Compute kinetic energy density flux (KE) (J/m2/h) + # kinetic_energy_flux = total_kinetic_energy / sample_interval * 3600 + + # Compute kinetic energy flux density (KE) (J/m2/h) + kinetic_energy_flux = ( + water_density + * np.pi + / 12 + / sample_interval + * 3600 + * ((drop_number * diameter**3 * velocity**2) / sampling_area).sum( + dim=_get_spectrum_dims(drop_number), + ) + ) + return kinetic_energy_flux + + +def get_rainfall_kinetic_energy(drop_number, diameter, velocity, rain_accumulation, sampling_area, water_density=1000): + r""" + Calculate the kinetic energy per unit rainfall depth (E) in joules per square meter per millimeter (J·m⁻²·mm⁻¹). + + This function computes the kinetic energy of the rainfall per millimeter of rain, providing a measure of the + energy associated with each unit of rainfall depth. This parameter is useful for understanding the potential + impact of raindrop erosion and the intensity of rainfall events. + + The values typically range between 0 and 40 J·m⁻²·mm⁻¹. + E is related to the kinetic energy flux density (KE) by the rain rate: E = KE/R . + + Parameters + ---------- + drop_number : xarray.DataArray + The number of drops in each diameter (and velocity, if available) bin(s). + diameter : xarray.DataArray + The equivalent volume diameters \\( D \\) of the drops in each bin, in meters (m). + velocity : xarray.DataArray or float + The fall velocities \\( v \\) of the drops in each bin, in meters per second (m/s). + Values are broadcasted to match the dimensions of `drop_number`. + rain_accumulation : xarray.DataArray or float + The total rainfall accumulation over the time integration period, in millimeters (mm). + sampling_area : float + The effective sampling area \\( A \\) of the sensor in square meters (m²). + water_density : float, optional + The density of water \\( \rho_w \\) in kilograms per cubic meter (kg/m³). + Default is 1000 kg/m³. 
+ + Returns + ------- + E : xarray.DataArray + The kinetic energy per unit rainfall depth in joules per square meter per millimeter (J·m⁻²·mm⁻¹). + Dimensions are reduced to ('time',). + + Notes + ----- + The kinetic energy per unit rainfall depth \\( E \\) is calculated using: + + .. math:: + + E = \frac{1}{2} \\cdot \frac{\\pi}{6} \\cdot \frac{\rho_w}{R} \\cdot \\sum_{i,j} + \\left( \frac{n_{ij} \\cdot D_i^3 \\cdot v_j^2}{A} \right) + + where: + + - \\( n_{ij} \\) is the number of drops in diameter bin \\( i \\) and velocity bin \\( j \\). + - \\( D_i \\) is the diameter of bin \\( i \\). + - \\( v_j \\) is the velocity of bin \\( j \\). + - \\( A \\) is the sampling area. + - \\( R \\) is the rainfall accumulation over the integration period (mm). + """ + # Ensure velocity has the same dimensions as drop_number + velocity = xr.ones_like(drop_number) * velocity + # Compute rainfall kinetic energy per unit rainfall depth + E = ( + 0.5 + * np.pi + / 6 + * water_density + / rain_accumulation + * ((drop_number * diameter**3 * velocity**2) / sampling_area).sum( + dim=_get_spectrum_dims(drop_number), + ) + ) + return E diff --git a/disdrodb/l2/event.py b/disdrodb/l2/event.py new file mode 100644 index 00000000..41472008 --- /dev/null +++ b/disdrodb/l2/event.py @@ -0,0 +1,388 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Functions for event definition.""" +import dask +import numpy as np +import pandas as pd +import xarray as xr + +from disdrodb.api.info import get_start_end_time_from_filepaths +from disdrodb.utils.time import acronym_to_seconds, ensure_sorted_by_time + + +@dask.delayed +def _delayed_open_dataset(filepath): + with dask.config.set(scheduler="synchronous"): + ds = xr.open_dataset(filepath, chunks={}, autoclose=True, cache=False) + return ds + + +def identify_events( + filepaths, + parallel=False, + min_n_drops=5, + neighbor_min_size=2, + neighbor_time_interval="5MIN", + intra_event_max_time_gap="6H", + event_min_duration="5MIN", + event_min_size=3, +): + """Return a list of rainy events. + + Rainy timesteps are defined when n_drops_selected > min_n_drops. + Any rainy isolated timesteps (based on neighborhood criteria) is removed. + Then, consecutive rainy timesteps are grouped into the same event if the time gap between them does not + exceed `intra_event_max_time_gap`. Finally, events that do not meet minimum size or duration + requirements are filtered out. + + Parameters + ---------- + filepaths: list + List of L1C file paths. + parallel: bool + Whether to load the files in parallel. + Set parallel=True only in a multiprocessing environment. + The default is False. + neighbor_time_interval : str + The time interval around a given a timestep defining the neighborhood. 
+ Only timesteps that fall within this time interval before or after a timestep are considered neighbors. + neighbor_min_size : int, optional + The minimum number of neighboring timesteps required within `neighbor_time_interval` for a + timestep to be considered non-isolated. Isolated timesteps are removed ! + - If `neighbor_min_size=0, then no timestep is considered isolated and no filtering occurs. + - If `neighbor_min_size=1`, the timestep must have at least one neighbor within `neighbor_time_interval`. + - If `neighbor_min_size=2`, the timestep must have at least two timesteps within `neighbor_time_interval`. + Defaults to 1. + intra_event_max_time_gap: str + The maximum time interval between two timesteps to be considered part of the same event. + This parameters is used to group timesteps into events ! + event_min_duration : str + The minimum duration an event must span. Events shorter than this duration are discarded. + event_min_size : int, optional + The minimum number of valid timesteps required for an event. Defaults to 1. + + Returns + ------- + list of dict + A list of events, where each event is represented as a dictionary with keys: + - "start_time": np.datetime64, start time of the event + - "end_time": np.datetime64, end time of the event + - "duration": np.timedelta64, duration of the event + - "n_timesteps": int, number of valid timesteps in the event + """ + # Open datasets in parallel + if parallel: + list_ds = dask.compute([_delayed_open_dataset(filepath) for filepath in filepaths])[0] + else: + list_ds = [xr.open_dataset(filepath, chunks={}, cache=False) for filepath in filepaths] + # Filter dataset for requested variables + variables = ["time", "n_drops_selected"] + list_ds = [ds[variables] for ds in list_ds] + # Concat datasets + ds = xr.concat(list_ds, dim="time", compat="no_conflicts", combine_attrs="override") + # Read in memory the variable needed + ds = ds.compute() + # Close file on disk + _ = [ds.close() for ds in list_ds] + del list_ds + # Sort dataset by time + ds = ensure_sorted_by_time(ds) + # Define candidate timesteps to group into events + idx_valid = ds["n_drops_selected"].data > min_n_drops + timesteps = ds["time"].data[idx_valid] + # Define event list + event_list = group_timesteps_into_event( + timesteps=timesteps, + neighbor_min_size=neighbor_min_size, + neighbor_time_interval=neighbor_time_interval, + intra_event_max_time_gap=intra_event_max_time_gap, + event_min_duration=event_min_duration, + event_min_size=event_min_size, + ) + return event_list + + +def group_timesteps_into_event( + timesteps, + intra_event_max_time_gap, + event_min_size=0, + event_min_duration="0S", + neighbor_min_size=0, + neighbor_time_interval="0S", +): + """ + Group candidate timesteps into events based on temporal criteria. + + This function groups valid candidate timesteps into events by considering how they cluster + in time. Any isolated timesteps (based on neighborhood criteria) are first removed. Then, + consecutive timesteps are grouped into the same event if the time gap between them does not + exceed `intra_event_max_time_gap`. Finally, events that do not meet minimum size or duration + requirements are filtered out. + + Please note that neighbor_min_size and neighbor_time_interval are very sensitive to the + actual sample interval of the data ! + + Parameters + ---------- + timesteps: np.ndarray + Candidate timesteps to be grouped into events. + neighbor_time_interval : str + The time interval around a given a timestep defining the neighborhood. 
+ Only timesteps that fall within this time interval before or after a timestep are considered neighbors. + neighbor_min_size : int, optional + The minimum number of neighboring timesteps required within `neighbor_time_interval` for a + timestep to be considered non-isolated. Isolated timesteps are removed ! + - If `neighbor_min_size=0, then no timestep is considered isolated and no filtering occurs. + - If `neighbor_min_size=1`, the timestep must have at least one neighbor within `neighbor_time_interval`. + - If `neighbor_min_size=2`, the timestep must have at least two timesteps within `neighbor_time_interval`. + Defaults to 1. + intra_event_max_time_gap: str + The maximum time interval between two timesteps to be considered part of the same event. + This parameters is used to group timesteps into events ! + event_min_duration : str + The minimum duration an event must span. Events shorter than this duration are discarded. + event_min_size : int, optional + The minimum number of valid timesteps required for an event. Defaults to 1. + + Returns + ------- + list of dict + A list of events, where each event is represented as a dictionary with keys: + - "start_time": np.datetime64, start time of the event + - "end_time": np.datetime64, end time of the event + - "duration": np.timedelta64, duration of the event + - "n_timesteps": int, number of valid timesteps in the event + """ + # Retrieve datetime arguments + neighbor_time_interval = pd.Timedelta(acronym_to_seconds(neighbor_time_interval), unit="seconds") + intra_event_max_time_gap = pd.Timedelta(acronym_to_seconds(intra_event_max_time_gap), unit="seconds") + event_min_duration = pd.Timedelta(acronym_to_seconds(event_min_duration), unit="seconds") + + # Remove isolated timesteps + timesteps = remove_isolated_timesteps( + timesteps, + neighbor_min_size=neighbor_min_size, + neighbor_time_interval=neighbor_time_interval, + ) + + # Group timesteps into events + # - If two timesteps are separated by less than intra_event_max_time_gap, are considered the same event + events = group_timesteps_into_events(timesteps, intra_event_max_time_gap) + + # Define list of event + event_list = [ + { + "start_time": event[0], + "end_time": event[-1], + "duration": (event[-1] - event[0]).astype("m8[m]"), + "n_timesteps": len(event), + } + for event in events + ] + + # Filter event list by duration + event_list = [event for event in event_list if event["duration"] >= event_min_duration] + + # Filter event list by duration + event_list = [event for event in event_list if event["n_timesteps"] >= event_min_size] + + return event_list + + +def remove_isolated_timesteps(timesteps, neighbor_min_size, neighbor_time_interval): + """ + Remove isolated timesteps that do not have enough neighboring timesteps within a specified time gap. + + A timestep is considered isolated (and thus removed) if it does not have at least `neighbor_min_size` other + timesteps within the `neighbor_time_interval` before or after it. + In other words, for each timestep, we look for how many other timesteps fall into the + time interval [t - neighbor_time_interval, t + neighbor_time_interval], excluding it itself. + If the count of such neighbors is less than `neighbor_min_size`, that timestep is removed. + + Parameters + ---------- + timesteps : array-like of np.datetime64 + Sorted or unsorted array of valid timesteps. + neighbor_time_interval : np.timedelta64 + The time interval around a given a timestep defining the neighborhood. 
+ Only timesteps that fall within this time interval before or after a timestep are considered neighbors. + neighbor_min_size : int, optional + The minimum number of neighboring timesteps required within `neighbor_time_interval` for a + timestep to be considered non-isolated. + - If `neighbor_min_size=0, then no timestep is considered isolated and no filtering occurs. + - If `neighbor_min_size=1`, the timestep must have at least one neighbor within `neighbor_time_interval`. + - If `neighbor_min_size=2`, the timestep must have at least two timesteps within `neighbor_time_interval`. + Defaults to 1. + + Returns + ------- + np.ndarray + Array of timesteps with isolated entries removed. + """ + # Sort timesteps + timesteps = np.array(timesteps) + timesteps.sort() + + # Do nothing if neighbor_min_size is 0 + if neighbor_min_size == 0: + return timesteps + + # Compute the start and end of the interval for each timestep + t_starts = timesteps - neighbor_time_interval + t_ends = timesteps + neighbor_time_interval + + # Use searchsorted to find the positions where these intervals would be inserted + # to keep the array sorted. This effectively gives us the bounds of timesteps + # within the neighbor interval. + left_indices = np.searchsorted(timesteps, t_starts, side="left") + right_indices = np.searchsorted(timesteps, t_ends, side="right") + + # The number of neighbors is the difference in indices minus one (to exclude the timestep itself) + n_neighbors = right_indices - left_indices - 1 + valid_mask = n_neighbors >= neighbor_min_size + + non_isolated_timesteps = timesteps[valid_mask] + + # NON VECTORIZED CODE + # non_isolated_timesteps = [] + # n_neighbours_arr = [] + # for i, t in enumerate(timesteps): + # n_neighbours = np.sum(np.logical_and(timesteps >= (t - neighbor_time_interval), + # timesteps <= (t + neighbor_time_interval))) - 1 + # n_neighbours_arr.append(n_neighbours) + # if n_neighbours > neighbor_min_size: + # non_isolated_timesteps.append(t) + # non_isolated_timesteps = np.array(non_isolated_timesteps) + return non_isolated_timesteps + + +def group_timesteps_into_events(timesteps, intra_event_max_time_gap): + """ + Group valid timesteps into events based on a maximum allowed dry interval. + + Parameters + ---------- + timesteps : array-like of np.datetime64 + Sorted array of valid timesteps. + intra_event_max_time_gap : np.timedelta64 + Maximum time interval allowed between consecutive valid timesteps for them + to be considered part of the same event. + + Returns + ------- + list of np.ndarray + A list of events, where each event is an array of timesteps. 
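+
+ Examples
+ --------
+ A small illustrative sketch with hypothetical timestamps: a 2-hour gap, compared against
+ a 10-minute ``intra_event_max_time_gap``, splits the series into two events.
+
+ >>> import numpy as np
+ >>> t = np.array(["2024-01-01T00:00", "2024-01-01T00:01", "2024-01-01T02:00"], dtype="datetime64[s]")
+ >>> events = group_timesteps_into_events(t, np.timedelta64(10, "m"))
+ >>> len(events)
+ 2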
+ """ + # Deal with case with no timesteps + if len(timesteps) == 0: + return [] + + # Ensure timesteps are sorted + timesteps.sort() + + # Compute differences between consecutive timesteps + diffs = np.diff(timesteps) + + # Identify the indices where the gap is larger than intra_event_max_time_gap + # These indices represent boundaries between events + break_indices = np.where(diffs > intra_event_max_time_gap)[0] + 1 + + # Split the timesteps at the identified break points + events = np.split(timesteps, break_indices) + + # NON VECTORIZED CODE + # events = [] + # current_event = [timesteps[0]] + # for i in range(1, len(timesteps)): + # current_t = timesteps[i] + # previous_t = timesteps[i - 1] + + # if current_t - previous_t <= intra_event_max_time_gap: + # current_event.append(current_t) + # else: + # events.append(current_event) + # current_event = [current_t] + + # events.append(current_event) + return events + + +####-----------------------------------------------------------------------------------. + + +def get_events_info(list_events, filepaths, accumulation_interval, rolling): + """ + Provide information about the required files for each event. + + For each event in `list_events`, this function identifies the file paths from `filepaths` that + overlap with the event period, adjusted by the `accumulation_interval`. The event period is + extended backward or forward based on the `rolling` parameter. + + Parameters + ---------- + list_events : list of dict + List of events, where each event is a dictionary containing at least 'start_time' and 'end_time' + keys with `numpy.datetime64` values. + filepaths : list of str + List of file paths corresponding to data files. + accumulation_interval : numpy.timedelta64 or int + Time interval to adjust the event period for accumulation. If an integer is provided, it is + assumed to be in seconds. + rolling : bool + If True, adjust the event period backward by `accumulation_interval` (rolling backward). + If False, adjust forward (aggregate forward). + + Returns + ------- + list of dict + A list where each element is a dictionary containing: + - 'start_time': Adjusted start time of the event (`numpy.datetime64`). + - 'end_time': Adjusted end time of the event (`numpy.datetime64`). + - 'filepaths': List of file paths overlapping with the adjusted event period. 
+ + """ + # Ensure accumulation_interval is numpy.timedelta64 + if not isinstance(accumulation_interval, np.timedelta64): + accumulation_interval = np.timedelta64(accumulation_interval, "s") + + # Retrieve file start_time and end_time + files_start_time, files_end_time = get_start_end_time_from_filepaths(filepaths) + + # Retrieve information for each event + event_info = [] + for event_dict in list_events: + # Retrieve event time period + event_start_time = event_dict["start_time"] + event_end_time = event_dict["end_time"] + + # Add buffer to account for accumulation interval + if rolling: # backward + event_start_time = event_start_time - np.array(accumulation_interval, dtype="m8[s]") + else: # aggregate forward + event_end_time = event_end_time + np.array(accumulation_interval, dtype="m8[s]") + + # Derive event filepaths + overlaps = (files_start_time <= event_end_time) & (files_end_time >= event_start_time) + event_filepaths = np.array(filepaths)[overlaps].tolist() + + # Create dictionary + if len(event_filepaths) > 0: + event_info.append( + {"start_time": event_start_time, "end_time": event_end_time, "filepaths": event_filepaths}, + ) + + return event_info diff --git a/disdrodb/l2/processing.py b/disdrodb/l2/processing.py new file mode 100644 index 00000000..03bcb687 --- /dev/null +++ b/disdrodb/l2/processing.py @@ -0,0 +1,683 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. 
+"""Implement DISDRODB L2 processing.""" + +import numpy as np +import xarray as xr + +from disdrodb.l1.encoding_attrs import get_attrs_dict, get_encoding_dict +from disdrodb.l1.fall_velocity import get_raindrop_fall_velocity +from disdrodb.l1_env.routines import load_env_dataset +from disdrodb.l2.empirical_dsd import ( + get_drop_average_velocity, + get_drop_number_concentration, + get_drop_volume, + get_effective_sampling_area, + get_equivalent_reflectivity_factor, + get_kinetic_energy_density_flux, + get_liquid_water_content, + get_mean_volume_drop_diameter, + get_median_volume_drop_diameter, + get_min_max_drop_kinetic_energy, + get_mode_diameter, + get_moment, + get_normalized_intercept_parameter, + get_quantile_volume_drop_diameter, + get_rain_accumulation, + get_rain_rate, + get_rain_rate_from_dsd, + get_rainfall_kinetic_energy, + get_std_volume_drop_diameter, + get_total_number_concentration, +) +from disdrodb.psd import create_psd, estimate_model_parameters +from disdrodb.psd.fitting import compute_gof_stats +from disdrodb.scattering import get_radar_parameters +from disdrodb.utils.attrs import set_attrs +from disdrodb.utils.encoding import set_encodings +from disdrodb.utils.time import ensure_sample_interval_in_seconds + + +def define_diameter_array(diameter_min=0, diameter_max=10, diameter_spacing=0.05): + """ + Define an array of diameters and their corresponding bin properties. + + Parameters + ---------- + diameter_min : float, optional + The minimum diameter value. The default value is 0 mm. + diameter_max : float, optional + The maximum diameter value. The default value is 10 mm. + diameter_spacing : float, optional + The spacing between diameter values. The default value is 0.05 mm. + + Returns + ------- + xr.DataArray + A DataArray containing the center of each diameter bin, with coordinates for + the bin width, lower bound, upper bound, and center. + + """ + diameters_bounds = np.arange(diameter_min, diameter_max + diameter_spacing / 2, step=diameter_spacing) + diameters_bin_lower = diameters_bounds[:-1] + diameters_bin_upper = diameters_bounds[1:] + diameters_bin_width = diameters_bin_upper - diameters_bin_lower + diameters_bin_center = diameters_bin_lower + diameters_bin_width / 2 + da = xr.DataArray( + diameters_bin_center, + dims="diameter_bin_center", + coords={ + "diameter_bin_width": ("diameter_bin_center", diameters_bin_width), + "diameter_bin_lower": ("diameter_bin_center", diameters_bin_lower), + "diameter_bin_upper": ("diameter_bin_center", diameters_bin_upper), + "diameter_bin_center": ("diameter_bin_center", diameters_bin_center), + }, + ) + return da + + +def define_velocity_array(ds): + """ + Create the fall velocity DataArray using various methods. + + If 'velocity_bin_center' is a dimension in the dataset, returns a Dataset + with 'measured_velocity', 'average_velocity', and 'fall_velocity' as variables. + Otherwise, returns the 'fall_velocity' DataArray from the input dataset. + + Parameters + ---------- + ds : xarray.Dataset + The input dataset containing velocity variables. 
+ + Returns + ------- + velocity: xarray.DataArray + """ + drop_number = ds["drop_number"] + if "velocity_bin_center" in ds.dims: + velocity = xr.Dataset( + { + "measured_velocity": xr.ones_like(drop_number) * ds["velocity_bin_center"], + "average_velocity": xr.ones_like(drop_number) * ds["drop_average_velocity"], + "fall_velocity": xr.ones_like(drop_number) * ds["fall_velocity"], + }, + ).to_array(dim="velocity_method") + else: + velocity = ds["fall_velocity"] + return velocity + + +def compute_integral_parameters( + drop_number_concentration, + velocity, + diameter, + diameter_bin_width, + sample_interval, + water_density, +): + """ + Compute integral parameters of a drop size distribution (DSD). + + Parameters + ---------- + drop_number_concentration : array-like + Drop number concentration in each diameter bin [#/m3/mm]. + velocity : array-like + Fall velocity of drops in each diameter bin [m/s]. + diameter : array-like + Diameter of drops in each bin in m. + diameter_bin_width : array-like + Width of each diameter bin in mm. + sample_interval : float + Time interval over which the samples are collected in seconds. + water_density : float or array-like + Density of water [kg/m3]. + + Returns + ------- + ds : xarray.Dataset + Dataset containing the computed integral parameters: + - Nt : Total number concentration [#/m3] + - R : Rain rate [mm/h] + - P : Rain accumulation [mm] + - Z : Reflectivity factor [dBZ] + - W : Liquid water content [g/m3] + - D10 : Diameter at the 10th quantile of the cumulative LWC distribution [mm] + - D50 : Median volume drop diameter [mm] + - D90 : Diameter at the 90th quantile of the cumulative LWC distribution [mm] + - Dmode : Diameter at which the distribution peaks [mm] + - Dm : Mean volume drop diameter [mm] + - sigma_m : Standard deviation of the volume drop diameter [mm] + - Nw : Normalized intercept parameter [m-3·mm⁻¹] + - M1 to M6 : Moments of the drop size distribution + """ + # diameter in m! 
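+ # NOTE: the helper functions called below expect `diameter` in meters,
+ # `diameter_bin_width` in millimeters and `sample_interval` in seconds,
+ # consistently with the parameter descriptions above.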
+ + # Initialize dataset + ds = xr.Dataset() + + # Compute total number concentration (Nt) [#/m3] + total_number_concentration = get_total_number_concentration( + drop_number_concentration=drop_number_concentration, + diameter_bin_width=diameter_bin_width, + ) + + # Compute rain rate + rain_rate = get_rain_rate_from_dsd( + drop_number_concentration=drop_number_concentration, + velocity=velocity, + diameter=diameter, + diameter_bin_width=diameter_bin_width, + ) + + # Compute rain accumulation (P) [mm] + rain_accumulation = get_rain_accumulation(rain_rate=rain_rate, sample_interval=sample_interval) + + # Compute moments (m0 to m6) + for moment in range(0, 7): + ds[f"M{moment}"] = get_moment( + drop_number_concentration=drop_number_concentration, + diameter=diameter, + diameter_bin_width=diameter_bin_width, + moment=moment, + ) + + # Compute Liquid Water Content (LWC) (W) [g/m3] + liquid_water_content = get_liquid_water_content( + drop_number_concentration=drop_number_concentration, + diameter=diameter, + diameter_bin_width=diameter_bin_width, + water_density=water_density, + ) + + # lwc_m = get_mom_liquid_water_content(moment_3=ds_l2["M3"], + # water_density=water_density) + + # Compute reflectivity in dBZ + reflectivity_factor = get_equivalent_reflectivity_factor( + drop_number_concentration=drop_number_concentration, + diameter=diameter, + diameter_bin_width=diameter_bin_width, + ) + + # Compute the diameter at which the distribution peak + mode_diameter = get_mode_diameter(drop_number_concentration) + + # Compute mean_volume_diameter (Dm) [mm] + mean_volume_diameter = get_mean_volume_drop_diameter(moment_3=ds["M3"], moment_4=ds["M4"]) + + # Compute σₘ[mm] + sigma_m = get_std_volume_drop_diameter( + drop_number_concentration=drop_number_concentration, + diameter=diameter, + diameter_bin_width=diameter_bin_width, + mean_volume_diameter=mean_volume_diameter, + ) + + # Compute normalized_intercept_parameter (Nw) [m-3·mm⁻¹] + normalized_intercept_parameter = get_normalized_intercept_parameter( + liquid_water_content=liquid_water_content, + mean_volume_diameter=mean_volume_diameter, + water_density=water_density, + ) + + # Nw = get_mom_normalized_intercept_parameter(moment_3=ds_l2["M3"], + # moment_4=ds_l2["M4"]) + + # Compute median volume_drop_diameter + d50 = get_median_volume_drop_diameter( + drop_number_concentration=drop_number_concentration, + diameter=diameter, + diameter_bin_width=diameter_bin_width, + water_density=water_density, + ) + + # Compute volume_drop_diameter for the 10th and 90th quantile of the cumulative LWC distribution + d10 = get_quantile_volume_drop_diameter( + drop_number_concentration=drop_number_concentration, + diameter=diameter, + diameter_bin_width=diameter_bin_width, + fraction=0.1, + water_density=water_density, + ) + + d90 = get_quantile_volume_drop_diameter( + drop_number_concentration=drop_number_concentration, + diameter=diameter, + diameter_bin_width=diameter_bin_width, + fraction=0.9, + water_density=water_density, + ) + + ds["Nt"] = total_number_concentration + ds["R"] = rain_rate + ds["P"] = rain_accumulation + ds["Z"] = reflectivity_factor + ds["W"] = liquid_water_content + + ds["D10"] = d10 + ds["D50"] = d50 + ds["D90"] = d90 + ds["Dmode"] = mode_diameter + ds["Dm"] = mean_volume_diameter + ds["sigma_m"] = sigma_m + + ds["Nw"] = normalized_intercept_parameter + + return ds + + +####-------------------------------------------------------------------------- +#### L2 Empirical Parameters + + +def generate_l2_empirical(ds, ds_env=None): + """Generate the 
DISDRODB L2E dataset from the DISDRODB L1 dataset.
+
+ Parameters
+ ----------
+ ds : xarray.Dataset
+ DISDRODB L1 dataset.
+ ds_env : xarray.Dataset, optional
+ Environmental dataset used for fall velocity and water density estimates.
+ If None, a default environment dataset will be loaded.
+
+ Returns
+ -------
+ xarray.Dataset
+ DISDRODB L2E dataset.
+ """
+ # Retrieve attributes
+ attrs = ds.attrs.copy()
+
+ # -------------------------------------------------------
+ #### Preprocessing
+ # Discard all timesteps without measured drops
+ # - This allows speeding up the processing
+ # - Regularization can be done at the end
+ ds = ds.isel(time=ds["n_drops_selected"] > 0)
+
+ # Retrieve ENV dataset or take defaults
+ # --> Used for fall velocity and water density estimates
+ if ds_env is None:
+ ds_env = load_env_dataset(ds)
+
+ # TODO: Derive water density as function of ENV (temperature, ...)
+ # --> (T == 10){density_water <- 999.7}else if(T == 20){density_water <- 998.2}else{density_water <- 995.7}
+ water_density = 1000 # kg / m3
+
+ # Determine if the velocity dimension is available
+ has_velocity_dimension = "velocity_bin_center" in ds.dims
+
+ # -------------------------------------------------------
+ # Extract variables from L1
+ sensor_name = ds.attrs["sensor_name"]
+ diameter = ds["diameter_bin_center"] / 1000 # m
+ diameter_bin_width = ds["diameter_bin_width"] # mm
+ drop_number = ds["drop_number"]
+ drop_counts = ds["drop_counts"]
+ sample_interval = ensure_sample_interval_in_seconds(ds["sample_interval"]) # s
+
+ # Compute sampling area [m2]
+ sampling_area = get_effective_sampling_area(sensor_name=sensor_name, diameter=diameter) # m2
+
+ # Select relevant L1 variables to copy into the L2 product
+ variables = [
+ "drop_number",
+ "drop_counts",
+ "drop_number_concentration",
+ "sample_interval",
+ "n_drops_selected",
+ "n_drops_discarded",
+ "Dmin",
+ "Dmax",
+ "drop_average_velocity",
+ "fall_velocity",
+ ]
+
+ variables = [var for var in variables if var in ds]
+ ds_l1_subset = ds[variables]
+
+ # -------------------------------------------------------------------------------------------
+ # Compute and add drop average velocity for optical disdrometers (e.g., OTT Parsivel or ThiesLPM)
+ # - Recompute it because, if the input dataset is aggregated, it must be updated !
+ if has_velocity_dimension: + ds["drop_average_velocity"] = get_drop_average_velocity(ds["drop_number"]) + + # ------------------------------------------------------------------------------------------- + # Define velocity array with dimension 'velocity_method' + velocity = define_velocity_array(ds) + + # ------------------------------------------------------- + #### Compute L2 variables + # Compute drop number concentration (Nt) [#/m3/mm] + drop_number_concentration = get_drop_number_concentration( + drop_number=drop_number, + velocity=velocity, + diameter_bin_width=diameter_bin_width, + sample_interval=sample_interval, + sampling_area=sampling_area, + ) + + # Compute rain rate (R) [mm/hr] + rain_rate = get_rain_rate( + drop_counts=drop_counts, + sampling_area=sampling_area, + diameter=diameter, + sample_interval=sample_interval, + ) + + # Compute rain accumulation (P) [mm] + rain_accumulation = get_rain_accumulation(rain_rate=rain_rate, sample_interval=sample_interval) + + # Compute drop volume information (per diameter bin) + drop_volume = drop_counts * get_drop_volume(diameter) # (np.pi/6 * diameter**3 * drop_counts) + drop_total_volume = drop_volume.sum(dim="diameter_bin_center") + drop_relative_volume_ratio = drop_volume / drop_total_volume + + # Compute kinetic energy variables + # --> TODO: implement from_dsd (using drop_concentration!) + min_drop_kinetic_energy, max_drop_kinetic_energy = get_min_max_drop_kinetic_energy( + drop_number=drop_number, + diameter=diameter, + velocity=velocity, + water_density=water_density, + ) + + kinetic_energy_density_flux = get_kinetic_energy_density_flux( + drop_number=drop_number, + diameter=diameter, + velocity=velocity, + sample_interval=sample_interval, + sampling_area=sampling_area, + water_density=water_density, + ) + + rainfall_kinetic_energy = get_rainfall_kinetic_energy( + drop_number=drop_number, + diameter=diameter, + velocity=velocity, + sampling_area=sampling_area, + rain_accumulation=rain_accumulation, + water_density=water_density, + ) + + # ---------------------------------------------------------------------------- + # Compute integral parameters + ds_l2 = compute_integral_parameters( + drop_number_concentration=drop_number_concentration, + velocity=velocity, + diameter=diameter, + diameter_bin_width=diameter_bin_width, + sample_interval=sample_interval, + water_density=water_density, + ) + + # ---------------------------------------------------------------------------- + #### Create L2 Dataset + # Update with L1 parameters + ds_l2.update(ds_l1_subset) + + ds_l2["drop_number"] = drop_number # 2D V x D + ds_l2["drop_counts"] = drop_counts # 1D D + ds_l2["drop_number_concentration"] = drop_number_concentration + + ds_l2["drop_volume"] = drop_volume + ds_l2["drop_total_volume"] = drop_total_volume + ds_l2["drop_relative_volume_ratio"] = drop_relative_volume_ratio + + ds_l2["R"] = rain_rate + ds_l2["P"] = rain_accumulation + + # TODO: adapt code to compute from drop_number_concentration + ds_l2["KEmin"] = min_drop_kinetic_energy + ds_l2["KEmax"] = max_drop_kinetic_energy + ds_l2["E"] = rainfall_kinetic_energy + ds_l2["KE"] = kinetic_energy_density_flux + + # ---------------------------------------------------------------------------- + + # ----------------------------------------------------------------------------. + # Remove timesteps where rain rate is 0 + ds_l2 = ds_l2.isel(time=ds_l2["R"] > 0) + + # ----------------------------------------------------------------------------. 
+    #### Add encodings and attributes
+    # Add variables attributes
+    attrs_dict = get_attrs_dict()
+    ds_l2 = set_attrs(ds_l2, attrs_dict=attrs_dict)
+
+    # Add variables encoding
+    encoding_dict = get_encoding_dict()
+    ds_l2 = set_encodings(ds_l2, encoding_dict=encoding_dict)
+
+    # Add global attributes
+    ds_l2.attrs = attrs
+
+    return ds_l2
+
+
+####--------------------------------------------------------------------------
+#### L2 Model Parameters
+
+
+def generate_l2_model(
+    ds,
+    ds_env=None,
+    fall_velocity_method="Beard1976",
+    # PSD discretization
+    diameter_min=0,
+    diameter_max=8,
+    diameter_spacing=0.05,
+    # Fitting options
+    psd_model=None,
+    optimization=None,
+    optimization_kwargs=None,
+    # GOF metrics options
+    gof_metrics=True,
+):
+    """
+    Generate the DISDRODB L2M dataset from a DISDRODB L2E dataset.
+
+    This function estimates the PSD model parameters and then computes the DSD integral parameters.
+    Optionally, radar variables at various bands are simulated using T-matrix simulations.
+    Goodness-of-fit metrics of the PSD fit can optionally be included in the output dataset.
+
+    Parameters
+    ----------
+    ds : xarray.Dataset
+        DISDRODB L2E dataset.
+    ds_env : xarray.Dataset, optional
+        Environmental dataset used for fall velocity and water density estimates.
+        If None, a default environment dataset will be loaded.
+    fall_velocity_method : str, optional
+        Method used to estimate the raindrop fall velocity. The default is "Beard1976".
+    diameter_min : float, optional
+        Minimum PSD diameter. The default value is 0 mm.
+    diameter_max : float, optional
+        Maximum PSD diameter. The default value is 8 mm.
+    diameter_spacing : float, optional
+        PSD diameter spacing. The default value is 0.05 mm.
+    psd_model : str, optional
+        The PSD model to fit. See ``available_psd_models()``.
+    optimization : str, optional
+        The fitting optimization procedure. Either "GS" (Grid Search), "ML" (Maximum Likelihood)
+        or "MOM" (Method of Moments).
+    optimization_kwargs : dict, optional
+        Dictionary with arguments to customize the fitting procedure.
+    gof_metrics : bool, optional
+        Whether to add goodness-of-fit metrics to the output dataset. The default is True.
+
+    Returns
+    -------
+    xarray.Dataset
+        DISDRODB L2M dataset.
+    """
+    # ----------------------------------------------------------------------------.
+    #### NOTES
+    # - Final processing: optionally filter the dataset to keep only timesteps where the PSD fit succeeded?
+    #   --> but it is good to keep everything to compare across models
+
+    # ----------------------------------------------------------------------------.
+    # Retrieve attributes
+    attrs = ds.attrs.copy()
+
+    # -------------------------------------------------------
+    # Derive water density as a function of ENV (temperature, ...)
+    # TODO --> Add into ds_env !
+    # --> (T == 10){density_water <- 999.7}else if(T == 20){density_water <- 998.2}else{density_water <- 995.7}
+    water_density = 1000  # kg / m3
+
+    # Retrieve ENV dataset or take defaults
+    # --> Used for fall velocity and water density estimates
+    if ds_env is None:
+        ds_env = load_env_dataset(ds)
+
+    ####------------------------------------------------------.
+    #### Preprocessing
+    # - Filtering criteria for when fitting a PSD
+    # TODO --> try to fit and define reasonable criteria based on R2, max deviation, rain_rate abs/relative error
+
+    ####------------------------------------------------------.
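+    # Example (sketch): generate_l2_model(ds_l2e, psd_model="GammaPSD", optimization="ML") fits a gamma
+    # PSD by maximum likelihood on a (hypothetical) L2E dataset ds_l2e; leaving both psd_model and
+    # optimization to None falls back to the NormalizedGammaPSD / Grid Search defaults defined below.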
+ #### Define default PSD optimization arguments + if psd_model is None and optimization is None: + psd_model = "NormalizedGammaPSD" + optimization = "GS" + optimization_kwargs = { + "target": "ND", + "transformation": "identity", + "error_order": 1, # MAE + } + + ####------------------------------------------------------. + #### Retrieve PSD parameters + ds_psd_params = estimate_model_parameters( + ds=ds, + psd_model=psd_model, + optimization=optimization, + optimization_kwargs=optimization_kwargs, + ) + psd_name = ds_psd_params.attrs["disdrodb_psd_model"] + psd = create_psd(psd_name, parameters=ds_psd_params) + + ####------------------------------------------------------- + #### Compute integral parameters + # Define diameter array + diameter = define_diameter_array( + diameter_min=diameter_min, + diameter_max=diameter_max, + diameter_spacing=diameter_spacing, + ) + diameter_bin_width = diameter["diameter_bin_width"] + + # Retrieve time of integration + sample_interval = ensure_sample_interval_in_seconds(ds["sample_interval"]) + + # Retrieve drop number concentration + drop_number_concentration = psd(diameter) + + # Retrieve fall velocity for each new diameter bin + velocity = get_raindrop_fall_velocity(diameter=diameter, method=fall_velocity_method, ds_env=ds_env) # mm + + # Compute integral parameters + ds_params = compute_integral_parameters( + drop_number_concentration=drop_number_concentration, + velocity=velocity, + diameter=diameter / 1000, # in meters ! + diameter_bin_width=diameter_bin_width, + sample_interval=sample_interval, + water_density=water_density, + ) + + #### ---------------------------------------------------------------------------- + #### Create L2 Dataset + # Update with PSD parameters + ds_params.update(ds_psd_params) + + # Add GOF statistics if asked + # TODO: Add metrics variables or GOF DataArray ? + if gof_metrics: + ds_gof = compute_gof_stats(drop_number_concentration=ds["drop_number_concentration"], psd=psd) + ds_params.update(ds_gof) + + #### ----------------------------------------------------------------------------. + #### Add encodings and attributes + # Add variables attributes + attrs_dict = get_attrs_dict() + ds_params = set_attrs(ds_params, attrs_dict=attrs_dict) + + # Add variables encoding + encoding_dict = get_encoding_dict() + ds_params = set_encodings(ds_params, encoding_dict=encoding_dict) + + # Add global attributes + ds_params.attrs = attrs + ds_params.attrs["disdrodb_psd_model"] = psd_name + + # Return dataset + return ds_params + + +####-------------------------------------------------------------------------------------------------------------------. +#### L2 Radar Parameters + + +def generate_l2_radar(ds, radar_band=None, canting_angle_std=7, diameter_max=8, axis_ratio="Thurai2007", parallel=True): + """Simulate polarimetric radar variables from empirical drop number concentration or the estimated PSD. + + Parameters + ---------- + ds : xarray.Dataset + Dataset containing the drop number concentration variable or the PSD parameters. + radar_band : str or list of str, optional + Radar band(s) to be used. + If ``None`` (the default), all available radar bands are used. + canting_angle_std : float or list of float, optional + Standard deviation of the canting angle. The default value is 7. + diameter_max : float or list of float, optional + Maximum diameter. The default value is 8 mm. + axis_ratio : str or list of str, optional + Method to compute the axis ratio. The default method is ``Thurai2007``. 
+ parallel : bool, optional + Whether to compute radar variables in parallel. + The default value is ``True``. + + Returns + ------- + xarray.Dataset + Dataset containing the computed radar parameters. + """ + # Retrieve radar variables from L2E drop number concentration or from estimated L2M PSD model + ds_radar = get_radar_parameters( + ds=ds, + radar_band=radar_band, + canting_angle_std=canting_angle_std, + diameter_max=diameter_max, + axis_ratio=axis_ratio, + parallel=parallel, + ) + + #### ----------------------------------------------------------------------------. + #### Add encodings and attributes + # Add variables attributes + attrs_dict = get_attrs_dict() + ds_radar = set_attrs(ds_radar, attrs_dict=attrs_dict) + + # Add variables encoding + encoding_dict = get_encoding_dict() + ds_radar = set_encodings(ds_radar, encoding_dict=encoding_dict) + + # Return dataset + return ds_radar diff --git a/disdrodb/l2/processing_options.py b/disdrodb/l2/processing_options.py new file mode 100644 index 00000000..fd4639e1 --- /dev/null +++ b/disdrodb/l2/processing_options.py @@ -0,0 +1,109 @@ +# TODO: Write to YAML +# TODO: radar_simulation_enabled: differentiate between L2E and L2M: + +config = { + "global_settings": { + "time_integration": [ + "1MIN", + "10MIN", + "ROLL1MIN", + "ROLL10MIN", + ], # ["10S", "30S", "1MIN", "5MIN", "10MIN", "15MIN", "30MIN", "1H", "ROLL5MIN", "ROLL10MIN"], + # Radar options + "radar_simulation_enabled": True, + "radar_simulation_options": { + "radar_band": ["S", "C", "X", "Ku", "Ka", "W"], + "canting_angle_std": 7, + "diameter_max": 8, + "axis_ratio": "Thurai2007", + }, + # L2E options + # "l2e_options": {} + # L2M options + "l2m_options": { + "fall_velocity_method": "Beard1976", + "diameter_min": 0, + "diameter_max": 8, + "diameter_spacing": 0.05, + "gof_metrics": True, + "models": { + # PSD models fitting options + "GAMMA_ML": { + "psd_model": "GammaPSD", + "optimization": "ML", + "optimization_kwargs": { + "init_method": "M246", + "probability_method": "cdf", + "likelihood": "multinomial", + "truncated_likelihood": True, + "optimizer": "Nelder-Mead", + }, + }, + "NGAMMA_GS_LOG_ND_MAE": { + "psd_model": "NormalizedGammaPSD", + "optimization": "GS", + "optimization_kwargs": { + "target": "ND", + "transformation": "log", + "error_order": 1, # MAE + }, + }, + # "NGAMMA_GS_ND_MAE": { + # "psd_model": "NormalizedGammaPSD", + # "optimization": "GS", + # "optimization_kwargs": { + # "target": "ND", + # "transformation": "identity", + # "error_order": 1, # MAE + # }, + # }, + # "NGAMMA_GS_Z": { + # "psd_model": "NormalizedGammaPSD", + # "optimization": "GS", + # "optimization_kwargs": { + # "target": "Z", + # "transformation": "identity", # unused + # "error_order": 1, # unused + # }, + # }, + }, + }, + }, + "specific_settings": { + "10S": { + "radar_simulation_enabled": False, + }, + "30S": { + "radar_simulation_enabled": False, + }, + "10MIN": { + "radar_simulation_enabled": False, + }, + "15MIN": { + "radar_simulation_enabled": False, + }, + "30MIN": { + "radar_simulation_enabled": False, + }, + "1H": { + "radar_simulation_enabled": False, + }, + "ROLL10MIN": { + "radar_simulation_enabled": False, + }, + }, +} + + +def get_l2_processing_options(): + """Retrieve L2 processing options.""" + # TODO: Implement validation ! 
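+    # Sketch of the returned mapping (with the config above): one options dictionary per time
+    # integration acronym, e.g. {"1MIN": {...}, "ROLL10MIN": {...}}, where each value holds the
+    # global settings (without the 'time_integration' key) updated with any matching specific settings.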
+ l2_options_dict = {} + for tt in config["global_settings"]["time_integration"]: + l2_options_dict[tt] = config["global_settings"].copy() + _ = l2_options_dict[tt].pop("time_integration", None) + # Add specific settings + for tt, product_options in config["specific_settings"].items(): + if tt in l2_options_dict: + l2_options_dict[tt].update(product_options) + return l2_options_dict diff --git a/disdrodb/l2/routines.py b/disdrodb/l2/routines.py new file mode 100644 index 00000000..df2663a1 --- /dev/null +++ b/disdrodb/l2/routines.py @@ -0,0 +1,843 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Implements routines for DISDRODB L2 processing.""" + +import datetime +import logging +import os +import time +from typing import Optional + +import dask +import numpy as np +import pandas as pd +import xarray as xr + +# Directory +from disdrodb.api.create_directories import ( + create_logs_directory, + create_product_directory, +) +from disdrodb.api.info import group_filepaths +from disdrodb.api.io import get_filepaths, get_required_product +from disdrodb.api.path import ( + define_accumulation_acronym, + define_l2e_filename, + define_l2m_filename, +) +from disdrodb.configs import get_base_dir +from disdrodb.l1.resampling import ( + regularize_dataset, + resample_dataset, +) +from disdrodb.l2.event import get_events_info, identify_events +from disdrodb.l2.processing import ( + generate_l2_empirical, + generate_l2_model, + generate_l2_radar, +) +from disdrodb.l2.processing_options import get_l2_processing_options +from disdrodb.metadata import read_station_metadata +from disdrodb.utils.decorator import delayed_if_parallel, single_threaded_if_parallel + +# Logger +from disdrodb.utils.logger import ( + close_logger, + create_logger_file, + create_product_logs, + log_error, + log_info, +) +from disdrodb.utils.time import ensure_sample_interval_in_seconds, get_resampling_information +from disdrodb.utils.writer import write_product + +logger = logging.getLogger(__name__) + + +####----------------------------------------------------------------------------. +#### L2E + + +@delayed_if_parallel +@single_threaded_if_parallel +def _generate_l2e( + start_time, + end_time, + filepaths, + data_dir, + logs_dir, + campaign_name, + station_name, + # Sampling options + accumulation_interval, + rolling, + # Radar options + radar_simulation_enabled, + radar_simulation_options, + # Processing options + force, + verbose, + parallel, # this is used by the decorator and to initialize correctly the logger ! +): + # -----------------------------------------------------------------. + # Define product name + product = "L2E" + + # -----------------------------------------------------------------. 
+ # Create file logger + sample_interval_acronym = define_accumulation_acronym(seconds=accumulation_interval, rolling=rolling) + starting_time = pd.to_datetime(start_time).strftime("%Y%m%d%H%M%S") + ending_time = pd.to_datetime(end_time).strftime("%Y%m%d%H%M%S") + filename = f"L2E.{sample_interval_acronym}.{campaign_name}.{station_name}.s{starting_time}.e{ending_time}" + logger, logger_filepath = create_logger_file( + logs_dir=logs_dir, + filename=filename, + parallel=parallel, + ) + ##------------------------------------------------------------------------. + # Log start processing + msg = f"{product} processing of {filename} has started." + log_info(logger, msg, verbose=verbose) + + ##------------------------------------------------------------------------. + ### Core computation + try: + # ------------------------------------------------------------------------. + #### Open the dataset over the period of interest + # - Open the netCDFs + list_ds = [xr.open_dataset(filepath, chunks={}, cache=False, autoclose=True) for filepath in filepaths] + # - Concatenate datasets + ds = xr.concat(list_ds, dim="time", compat="no_conflicts", combine_attrs="override") + ds = ds.sel(time=slice(start_time, end_time)).compute() + # - Close file on disk + _ = [ds.close() for ds in list_ds] + + ##------------------------------------------------------------------------. + #### Resample dataset + # Here we set NaN in the raw_drop_number to 0 + # - We assume that NaN corresponds to 0 + # - When we regularize, we infill with NaN + # - When we aggregate with sum, we don't skip NaN + # --> Aggregation with original missing timesteps currently results in NaN ! + # TODO: Add tolerance on fraction of missing timesteps for large accumulation_intervals + ds["drop_number"] = xr.where(np.isnan(ds["drop_number"]), 0, ds["drop_number"]) + + # - Regularize dataset + # --> Infill missing timesteps with np.Nan + sample_interval = ensure_sample_interval_in_seconds(ds["sample_interval"]).item() + ds = regularize_dataset(ds, freq=f"{sample_interval}s") + + # - Resample dataset + ds = resample_dataset( + ds=ds, + sample_interval=sample_interval, + accumulation_interval=accumulation_interval, + rolling=rolling, + ) + + ##------------------------------------------------------------------------. + # Remove timesteps with no drops or NaN (from L2E computations) + # timestep_zero_drops = ds["time"].data[ds["n_drops_selected"].data == 0] + # timestep_nan = ds["time"].data[np.isnan(ds["n_drops_selected"].data)] + indices_valid_timesteps = np.where( + ~np.logical_or(ds["n_drops_selected"].data == 0, np.isnan(ds["n_drops_selected"].data)), + )[0] + ds = ds.isel(time=indices_valid_timesteps) + + ##------------------------------------------------------------------------. + #### Generate L2E product + ds = generate_l2_empirical(ds=ds) + + # Simulate L2M-based radar variables if asked + if radar_simulation_enabled: + ds_radar = generate_l2_radar(ds, parallel=not parallel, **radar_simulation_options) + ds.update(ds_radar) + ds.attrs = ds_radar.attrs.copy() + + ##------------------------------------------------------------------------. + #### Regularize back dataset + # TODO: infill timestep_zero_drops and timestep_nan differently ? + # --> R, P, LWC = 0, + # --> Z, D, with np.nan? + + ##------------------------------------------------------------------------. 
+        # Write netCDF4 dataset
+        if ds["time"].size > 1:
+            filename = define_l2e_filename(
+                ds,
+                campaign_name=campaign_name,
+                station_name=station_name,
+                sample_interval=accumulation_interval,
+                rolling=rolling,
+            )
+            filepath = os.path.join(data_dir, filename)
+            write_product(ds, product=product, filepath=filepath, force=force)
+
+        ##--------------------------------------------------------------------.
+        # Clean environment
+        del ds
+
+        # Log end processing
+        msg = f"{product} processing of {filename} has ended."
+        log_info(logger, msg, verbose=verbose)
+
+    ##--------------------------------------------------------------------.
+    # Otherwise log the error
+    except Exception as e:
+        error_type = str(type(e).__name__)
+        msg = f"{error_type}: {e}"
+        log_error(logger, msg, verbose=verbose)
+
+    # Close the file logger
+    close_logger(logger)
+
+    # Return the logger file path
+    return logger_filepath
+
+
+def is_possible_product(accumulation_interval, sample_interval, rolling):
+    """Assess whether production is possible given the requested accumulation interval and source sample_interval."""
+    # Avoid rolling product generation at the source sample interval
+    if rolling and accumulation_interval == sample_interval:
+        return False
+    # Avoid product generation if the accumulation_interval is less than the sample interval
+    if accumulation_interval < sample_interval:
+        return False
+    # Avoid product generation if accumulation_interval is not a multiple of sample_interval
+    return accumulation_interval % sample_interval == 0
+
+
+def flatten_list(nested_list):
+    """Flatten a nested list into a single-level list."""
+    if isinstance(nested_list, list) and len(nested_list) == 0:
+        return nested_list
+    # If the list is already flat, return it as is to avoid flattening to chars
+    if isinstance(nested_list, list) and not isinstance(nested_list[0], list):
+        return nested_list
+    return [item for sublist in nested_list for item in sublist] if isinstance(nested_list, list) else [nested_list]
+
+
+def run_l2e_station(
+    # Station arguments
+    data_source,
+    campaign_name,
+    station_name,
+    # Processing options
+    force: bool = False,
+    verbose: bool = True,
+    parallel: bool = True,
+    debugging_mode: bool = False,
+    base_dir: Optional[str] = None,
+):
+    """
+    Generate the L2E product of a specific DISDRODB station when invoked from the terminal.
+
+    This function is intended to be called through the ``disdrodb_run_l2e_station``
+    command-line interface.
+
+    The DISDRODB L2E routine generates an L2E file for each event.
+    Events are defined based on the DISDRODB event settings options.
+    The DISDRODB event settings allow producing L2E files either
+    per custom block of time (i.e. day/month/year) or per block of rainy events.
+
+    For stations with varying measurement intervals, DISDRODB defines a separate list of 'events'
+    for each measurement interval option. In other words, DISDRODB does not
+    mix files with data acquired at different sample intervals when resampling the data.
+
+    L0C product generation ensures the creation of files with unique sample intervals.
+
+    Parameters
+    ----------
+    data_source : str
+        The name of the institution (for campaigns spanning multiple countries) or
+        the name of the country (for campaigns or sensor networks within a single country).
+        Must be provided in UPPER CASE.
+    campaign_name : str
+        The name of the campaign. Must be provided in UPPER CASE.
+    station_name : str
+        The name of the station.
+    force : bool, optional
+        If ``True``, existing data in the destination directories will be overwritten.
+        If ``False`` (default), an error will be raised if data already exists in the destination directories.
+    verbose : bool, optional
+        If ``True`` (default), detailed processing information will be printed to the terminal.
+        If ``False``, less information will be displayed.
+    parallel : bool, optional
+        If ``True``, files will be processed in multiple processes simultaneously,
+        with each process using a single thread to avoid issues with the HDF/netCDF library.
+        If ``False`` (default), files will be processed sequentially in a single process,
+        and multi-threading will be automatically exploited to speed up I/O tasks.
+    debugging_mode : bool, optional
+        If ``True``, the amount of data processed will be reduced.
+        Only the first 3 files will be processed. By default, ``False``.
+    base_dir : str, optional
+        The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``.
+        If not specified, the path specified in the DISDRODB active configuration will be used.
+
+    """
+    # Define product
+    product = "L2E"
+
+    # Define base directory
+    base_dir = get_base_dir(base_dir)
+
+    # ------------------------------------------------------------------------.
+    # Start processing
+    if verbose:
+        t_i = time.time()
+        msg = f"{product} processing of station {station_name} has started."
+        log_info(logger=logger, msg=msg, verbose=verbose)
+
+    # -------------------------------------------------------------------------.
+    # List L1 files to process
+    required_product = get_required_product(product)
+    flag_not_available_data = False
+    try:
+        filepaths = get_filepaths(
+            base_dir=base_dir,
+            data_source=data_source,
+            campaign_name=campaign_name,
+            station_name=station_name,
+            product=required_product,
+            # Processing options
+            debugging_mode=False,
+        )
+    except Exception as e:
+        print(str(e))  # Case where no file paths are available
+        flag_not_available_data = True
+
+    # -------------------------------------------------------------------------.
+    # If no data is available, print an error message and return None
+    if flag_not_available_data:
+        msg = (
+            f"{product} processing of {data_source} {campaign_name} {station_name} "
+            + f"has not been launched because of missing {required_product} data."
+        )
+        print(msg)
+        return
+
+    # -------------------------------------------------------------------------.
+    # Retrieve L2 processing options
+    # - Each dictionary item contains the processing options for a given rolling/accumulation_interval combo
+    l2_processing_options = get_l2_processing_options()
+
+    # ---------------------------------------------------------------------.
+    # Group filepaths by sample interval
+    # - Typically the sample interval is fixed
+    # - Some stations might change the sample interval over the years
+    # - For each sample interval, separate processing takes place hereafter!
+    dict_filepaths = group_filepaths(filepaths, groups="sample_interval")
+
+    # -------------------------------------------------------------------------.
+    # Define list of events
+    # - [(start_time, end_time)]
+    # TODO: Here pass event option list !
+    # TODO: Implement more general define_events function
+    # - Either rainy events
+    # - Either time blocks (day/month/year)
+    # TODO: Define events identification settings based on accumulation
+    # - This is currently done at the source sample interval !
+ # - Should we allow event definition for each accumulation interval and + # move this code inside the loop below + + # sample_interval = list(dict_filepaths)[0] + # filepaths = dict_filepaths[sample_interval] + + dict_list_events = { + sample_interval: identify_events(filepaths, parallel=parallel) + for sample_interval, filepaths in dict_filepaths.items() + } + + # ---------------------------------------------------------------------. + # Subset for debugging mode + if debugging_mode: + dict_list_events = { + sample_interval: list_events[0 : min(len(list_events), 3)] + for sample_interval, list_events in dict_list_events.items() + } + + # ---------------------------------------------------------------------. + # Loop + # rolling = False + # accumulation_interval = 60 + # sample_interval_acronym = "1MIN" + # l2_options = l2_processing_options["1MIN"] + for sample_interval_acronym, l2_options in l2_processing_options.items(): + + # Retrieve accumulation_interval and rolling option + accumulation_interval, rolling = get_resampling_information(sample_interval_acronym) + + # Retrieve radar simulation options + radar_simulation_enabled = l2_options.get("radar_simulation_enabled", False) + radar_simulation_options = l2_options["radar_simulation_options"] + + # ------------------------------------------------------------------. + # Group filepaths by events + # - This is done separately for each possible source sample interval + # - It groups filepaths by start_time and end_time provided by list_events + # - Here 'events' can also simply be period of times ('day', 'months', ...) + # - When aggregating/resampling/accumulating data, we need to load also + # some data before/after the actual event start_time/end_time + # - get_events_info adjust the event times to accounts for the required "border" data. + events_info = [ + get_events_info( + list_events=list_events, + filepaths=dict_filepaths[sample_interval], + accumulation_interval=accumulation_interval, + rolling=rolling, + ) + for sample_interval, list_events in dict_list_events.items() + if is_possible_product( + accumulation_interval=accumulation_interval, + sample_interval=sample_interval, + rolling=rolling, + ) + ] + events_info = flatten_list(events_info) + + # ------------------------------------------------------------------. + # Skip processing if no files available + # - When not compatible accumulation_interval with source sample_interval + if len(events_info) == 0: + continue + + # ------------------------------------------------------------------. + # Create product directory + data_dir = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + product=product, + force=force, + # Option for L2E + sample_interval=accumulation_interval, + rolling=rolling, + ) + + # Define logs directory + logs_dir = create_logs_directory( + product=product, + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Option for L2E + sample_interval=accumulation_interval, + rolling=rolling, + ) + + # ------------------------------------------------------------------. 
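+        # Note: _generate_l2e below is wrapped by @delayed_if_parallel, so with parallel=True each call
+        # only builds a dask.delayed task and the actual work happens in dask.compute(*list_tasks);
+        # with parallel=False the calls run eagerly and list_tasks already holds the log file paths.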
+ # Generate files + # - L2E product generation is optionally parallelized over events + # - If parallel=True, it does that in parallel using dask.delayed + list_tasks = [ + _generate_l2e( + start_time=event_info["start_time"], + end_time=event_info["end_time"], + filepaths=event_info["filepaths"], + data_dir=data_dir, + logs_dir=logs_dir, + campaign_name=campaign_name, + station_name=station_name, + # Sampling options + rolling=rolling, + accumulation_interval=accumulation_interval, + # Radar options + radar_simulation_enabled=radar_simulation_enabled, + radar_simulation_options=radar_simulation_options, + # Processing options + force=force, + verbose=verbose, + parallel=parallel, + ) + for event_info in events_info + ] + list_logs = dask.compute(*list_tasks) if parallel else list_tasks + + # -----------------------------------------------------------------. + # Define product summary logs + create_product_logs( + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=base_dir, + # Product options + sample_interval=accumulation_interval, + rolling=rolling, + # Logs list + list_logs=list_logs, + ) + + # ---------------------------------------------------------------------. + # End product processing + if verbose: + timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i))) + msg = f"{product} processing of station {station_name} completed in {timedelta_str}" + log_info(logger=logger, msg=msg, verbose=verbose) + + +####----------------------------------------------------------------------------. +#### L2M + + +@delayed_if_parallel +@single_threaded_if_parallel +def _generate_l2m( + filepath, + data_dir, + logs_dir, + campaign_name, + station_name, + # L2M options + sample_interval, + rolling, + model_name, + l2m_options, + # Radar options + radar_simulation_enabled, + radar_simulation_options, + # Processing options + force, + verbose, + parallel, # this is used only to initialize the correct logger ! +): + # -----------------------------------------------------------------. + # Define product name + product = "L2M" + + # -----------------------------------------------------------------. + # Define model options + psd_model = l2m_options["models"][model_name]["psd_model"] + optimization = l2m_options["models"][model_name]["optimization"] + optimization_kwargs = l2m_options["models"][model_name]["optimization_kwargs"] + other_options = {k: v for k, v in l2m_options.items() if k != "models"} + + # -----------------------------------------------------------------. + # Create file logger + filename = os.path.basename(filepath) + logger, logger_filepath = create_logger_file( + logs_dir=logs_dir, + filename=filename, + parallel=parallel, + ) + + ##------------------------------------------------------------------------. + # Log start processing + msg = f"{product} processing of {filename} has started." + log_info(logger, msg, verbose=verbose) + + ##------------------------------------------------------------------------. 
+ ### Core computation + try: + # Open the raw netCDF + with xr.open_dataset(filepath, chunks={}, cache=False) as ds: + variables = [ + "drop_number_concentration", + "fall_velocity", + "D50", + "Nw", + "Nt", + "M1", + "M2", + "M3", + "M4", + "M5", + "M6", + ] + ds = ds[variables].load() + + # Produce L2M dataset + ds = generate_l2_model( + ds=ds, + psd_model=psd_model, + optimization=optimization, + optimization_kwargs=optimization_kwargs, + **other_options, + ) + + # Simulate L2M-based radar variables if asked + if radar_simulation_enabled: + ds_radar = generate_l2_radar(ds, parallel=not parallel, **radar_simulation_options) + ds.update(ds_radar) + ds.attrs = ds_radar.attrs.copy() + + # Write L2M netCDF4 dataset + if ds["time"].size > 1: + # Define filepath + filename = define_l2m_filename( + ds, + campaign_name=campaign_name, + station_name=station_name, + sample_interval=sample_interval, + rolling=rolling, + model_name=model_name, + ) + filepath = os.path.join(data_dir, filename) + # Write to disk + write_product(ds, product=product, filepath=filepath, force=force) + + ##--------------------------------------------------------------------. + # Clean environment + del ds + + # Log end processing + msg = f"{product} processing of {filename} has ended." + log_info(logger, msg, verbose=verbose) + + ##--------------------------------------------------------------------. + # Otherwise log the error + except Exception as e: + error_type = str(type(e).__name__) + msg = f"{error_type}: {e}" + log_error(logger, msg, verbose=verbose) + + # Close the file logger + close_logger(logger) + + # Return the logger file path + return logger_filepath + + +def run_l2m_station( + # Station arguments + data_source, + campaign_name, + station_name, + # Processing options + force: bool = False, + verbose: bool = True, + parallel: bool = True, + debugging_mode: bool = False, + base_dir: Optional[str] = None, +): + """ + Run the L2M processing of a specific DISDRODB station when invoked from the terminal. + + This function is intended to be called through the ``disdrodb_run_l2m_station`` + command-line interface. + + Parameters + ---------- + data_source : str + The name of the institution (for campaigns spanning multiple countries) or + the name of the country (for campaigns or sensor networks within a single country). + Must be provided in UPPER CASE. + campaign_name : str + The name of the campaign. Must be provided in UPPER CASE. + station_name : str + The name of the station. + force : bool, optional + If ``True``, existing data in the destination directories will be overwritten. + If ``False`` (default), an error will be raised if data already exists in the destination directories. + verbose : bool, optional + If ``True`` (default), detailed processing information will be printed to the terminal. + If ``False``, less information will be displayed. + parallel : bool, optional + If ``True``, files will be processed in multiple processes simultaneously, + with each process using a single thread to avoid issues with the HDF/netCDF library. + If ``False`` (default), files will be processed sequentially in a single process, + and multi-threading will be automatically exploited to speed up I/O tasks. + debugging_mode : bool, optional + If ``True``, the amount of data processed will be reduced. + Only the first 3 files will be processed. By default, ``False``. + base_dir : str, optional + The base directory of DISDRODB, expected in the format ``<...>/DISDRODB``. 
+ If not specified, the path specified in the DISDRODB active configuration will be used. + + """ + # Define product + product = "L2M" + + # Define base directory + base_dir = get_base_dir(base_dir) + + # ------------------------------------------------------------------------. + # Start processing + if verbose: + t_i = time.time() + msg = f"{product} processing of station {station_name} has started." + log_info(logger=logger, msg=msg, verbose=verbose) + + # -------------------------------------------------------------------------. + # Retrieve L2 processing options + # - Each dictionary item contains the processing options for a given rolling/accumulation_interval combo + l2_processing_options = get_l2_processing_options() + + # ---------------------------------------------------------------------. + # Retrieve source sampling interval + # - If a station has varying measurement interval over time, choose the smallest one ! + metadata = read_station_metadata( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + sample_interval = metadata["measurement_interval"] + if isinstance(sample_interval, list): + sample_interval = min(sample_interval) + + # ---------------------------------------------------------------------. + # Loop + # sample_interval_acronym = "1MIN" + # l2_options = l2_processing_options["1MIN"] + for sample_interval_acronym, l2_options in l2_processing_options.items(): + + # Retrieve accumulation_interval and rolling option + accumulation_interval, rolling = get_resampling_information(sample_interval_acronym) + + # Retrieve L2M processing options + l2m_options = l2_options["l2m_options"] + + # Retrieve radar simulation options + radar_simulation_enabled = l2_options.get("radar_simulation_enabled", False) + radar_simulation_options = l2_options["radar_simulation_options"] + + # ------------------------------------------------------------------. + # Avoid generation of rolling products for source sample interval ! + if rolling and accumulation_interval == sample_interval: + continue + + # Avoid product generation if the accumulation_interval is less than the sample interval + if accumulation_interval < sample_interval: + continue + + # -----------------------------------------------------------------. + # List files to process + required_product = get_required_product(product) + flag_not_available_data = False + try: + filepaths = get_filepaths( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + product=required_product, + sample_interval=accumulation_interval, + rolling=rolling, + # Processing options + debugging_mode=debugging_mode, + ) + except Exception as e: + print(str(e)) # Case where no file paths available + flag_not_available_data = True + + # If no data available, try with other L2E accumulation intervals + if flag_not_available_data: + msg = ( + f"{product} processing of {data_source} {campaign_name} {station_name}" + + f"has not been launched because of missing {required_product} {sample_interval_acronym} data ." + ) + print(msg) + continue + + # -----------------------------------------------------------------. 
+ # Loop over distributions to fit + # model_name = "GAMMA_ML" + # model_options = l2m_options["models"][model_name] + for model_name, model_options in l2m_options["models"].items(): + + # Retrieve model options + psd_model = model_options["psd_model"] + optimization = model_options["optimization"] + + # -----------------------------------------------------------------. + msg = f" - Production of L2M_{model_name} for sample interval {accumulation_interval} s has started." + log_info(logger=logger, msg=msg, verbose=verbose) + msg = f" - Estimating {psd_model} parameters using {optimization}." + log_info(logger=logger, msg=msg, verbose=verbose) + + # -------------------------------------------------------------. + # Create product directory + data_dir = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + product=product, + force=force, + # Option for L2E + sample_interval=accumulation_interval, + rolling=rolling, + # Option for L2M + model_name=model_name, + ) + + # Define logs directory + logs_dir = create_logs_directory( + product=product, + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Option for L2E + sample_interval=accumulation_interval, + rolling=rolling, + # Option for L2M + model_name=model_name, + ) + + # Generate L2M files + # - Loop over the L2E netCDF files and generate L2M files. + # - If parallel=True, it does that in parallel using dask.delayed + list_tasks = [ + _generate_l2m( + filepath=filepath, + data_dir=data_dir, + logs_dir=logs_dir, + campaign_name=campaign_name, + station_name=station_name, + # L2M option + sample_interval=accumulation_interval, + rolling=rolling, + model_name=model_name, + l2m_options=l2m_options, + # Radar options + radar_simulation_enabled=radar_simulation_enabled, + radar_simulation_options=radar_simulation_options, + # Processing options + force=force, + verbose=verbose, + parallel=parallel, + ) + for filepath in filepaths + ] + list_logs = dask.compute(*list_tasks) if parallel else list_tasks + + # -----------------------------------------------------------------. + # Define L2M summary logs + create_product_logs( + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=base_dir, + # Product options + model_name=model_name, + sample_interval=sample_interval, + rolling=rolling, + # Logs list + list_logs=list_logs, + ) + + # ---------------------------------------------------------------------. + # End L2M processing + if verbose: + timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i))) + msg = f"{product} processing of station {station_name} completed in {timedelta_str}" + log_info(logger=logger, msg=msg, verbose=verbose) diff --git a/disdrodb/metadata/geolocation.py b/disdrodb/metadata/geolocation.py new file mode 100644 index 00000000..8ee1cc76 --- /dev/null +++ b/disdrodb/metadata/geolocation.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Metadata tools to verify/complete geolocation information.""" +import time + +import numpy as np +import requests + + +def infer_altitude(latitude, longitude, dem="aster30m"): + """Infer station altitude using a Digital Elevation Model (DEM). + + This function uses the OpenTopoData API to infer the altitude of a given + location specified by latitude and longitude. + By default, it uses the ASTER DEM at 30m resolution. + + Parameters + ---------- + latitude : float + The latitude of the location for which to infer the altitude. + longitude : float + The longitude of the location for which to infer the altitude. + dem : str, optional + The DEM to use for altitude inference. Options are "aster30m" (default), + "srtm30", and "mapzen". + + Returns + ------- + elevation : float + The inferred altitude of the specified location. + + Raises + ------ + ValueError + If the altitude retrieval fails. + + Notes + ----- + - The OpenTopoData API has a limit of 1000 calls per day. + - Each request can include up to 100 locations. + - The API allows a maximum of 1 call per second. + + References + ---------- + https://www.opentopodata.org/api/ + """ + import requests + + url = f"https://api.opentopodata.org/v1/{dem}?locations={latitude},{longitude}" + r = requests.get(url) + + data = r.json() + if data["status"] == "OK": + elevation = data["results"][0]["elevation"] + else: + raise ValueError("Altitude retrieval failed.") + return elevation + + +def infer_altitudes(lats, lons, dem="aster30m"): + """ + Infer altitude of a given location using OpenTopoData API. + + Parameters + ---------- + lats : list or array-like + List or array of latitude coordinates. + lons : list or array-like + List or array of longitude coordinates. + dem : str, optional + Digital Elevation Model (DEM) to use for altitude inference. + The default DEM is "aster30m". + + Returns + ------- + elevations : numpy.ndarray + Array of inferred altitudes corresponding to the input coordinates. + + Raises + ------ + ValueError + If the latitude and longitude arrays do not have the same length. + If altitude retrieval fails for any block of coordinates. + + Notes + ----- + - The OpenTopoData API has a limit of 1000 calls per day. + - Each request can include up to 100 locations. + - The API allows a maximum of 1 call per second. + - The API requests are made in blocks of up to 100 coordinates, + with a 2-second delay between requests. 
+ """ + # Check that lats and lons have the same length + if len(lats) != len(lons): + raise ValueError("Latitude and longitude arrays must have the same length.") + + # Maximum number of locations per API request + max_locations = 100 + elevations = [] + + # Total number of coordinates + total_coords = len(lats) + + # Loop over the coordinates in blocks of max_locations + for i in range(0, total_coords, max_locations): + + # Wait 2 seconds before another API request + time.sleep(2) + + # Get the block of coordinates + block_lats = lats[i : i + max_locations] + block_lons = lons[i : i + max_locations] + + # Create the list_coords string in the format "lat1,lon1|lat2,lon2|..." + list_coords = "|".join([f"{lat},{lon}" for lat, lon in zip(block_lats, block_lons)]) + + # Define API URL + url = f"https://api.opentopodata.org/v1/{dem}?locations={list_coords}&interpolation=nearest" + + # Retrieve info + r = requests.get(url) + data = r.json() + + # Parse info + if data.get("status") == "OK": + elevations.extend([result["elevation"] for result in data["results"]]) + else: + raise ValueError(f"Altitude retrieval failed for block starting at index {i}.") + elevations = np.array(elevations).astype(float) + return elevations diff --git a/disdrodb/metadata/manipulation.py b/disdrodb/metadata/manipulation.py index a8f15454..9d9370a3 100644 --- a/disdrodb/metadata/manipulation.py +++ b/disdrodb/metadata/manipulation.py @@ -17,6 +17,11 @@ # along with this program. If not, see . # -----------------------------------------------------------------------------. """Metadata Manipulation Tools.""" +import shutil + +from disdrodb.api.io import available_stations +from disdrodb.api.path import define_metadata_filepath +from disdrodb.configs import get_base_dir def remove_invalid_metadata_keys(metadata): @@ -46,3 +51,40 @@ def sort_metadata_dictionary(metadata): list_metadata_keys = get_valid_metadata_keys() metadata = {k: metadata[k] for k in list_metadata_keys} return metadata + + +def update_processed_metadata(): + """Update metadata in the 'DISDRODB/Processed' directory.""" + base_dir = get_base_dir() + # Retrieve list of all processed stations + # --> (data_source, campaign_name, station_name) + list_info = available_stations( + product="L0B", + ) + + # Retrieve metadata filepaths + list_src_dst_path = [ + ( + # Source + define_metadata_filepath( + product="RAW", + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=base_dir, + check_exists=False, + ), + # Destination + define_metadata_filepath( + product="L0B", + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=base_dir, + check_exists=False, + ), + ) + for data_source, campaign_name, station_name in list_info + ] + # Copy file from RAW directory to Processed directory + _ = [shutil.copyfile(src_path, dst_path) for (src_path, dst_path) in list_src_dst_path] diff --git a/disdrodb/psd/__init__.py b/disdrodb/psd/__init__.py new file mode 100644 index 00000000..f5068957 --- /dev/null +++ b/disdrodb/psd/__init__.py @@ -0,0 +1,38 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Implement PSD model and fitting routines.""" + + +from disdrodb.psd.fitting import estimate_model_parameters +from disdrodb.psd.models import ( + ExponentialPSD, + GammaPSD, + LognormalPSD, + NormalizedGammaPSD, + available_psd_models, + create_psd, +) + +__all__ = [ + "available_psd_models", + "create_psd", + "estimate_model_parameters", + "LognormalPSD", + "ExponentialPSD", + "GammaPSD", + "NormalizedGammaPSD", +] diff --git a/disdrodb/psd/fitting.py b/disdrodb/psd/fitting.py new file mode 100644 index 00000000..314bda62 --- /dev/null +++ b/disdrodb/psd/fitting.py @@ -0,0 +1,2132 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Routines for PSD fitting.""" +import numpy as np +import scipy.stats as ss +import xarray as xr +from scipy.integrate import quad +from scipy.optimize import minimize +from scipy.special import gamma, gammainc, gammaln # Regularized lower incomplete gamma function + +from disdrodb.psd.models import ExponentialPSD, GammaPSD, LognormalPSD, NormalizedGammaPSD +from disdrodb.utils.warnings import suppress_warnings + + +####--------------------------------------------------------------------------------------. +#### Goodness of fit (GOF) +def compute_gof_stats(drop_number_concentration, psd): + """ + Compute various goodness-of-fit (GoF) statistics between observed and predicted values. 
+ + Parameters + ---------- + - drop_number_concentration: xarray.DataArray with dimensions ('time', 'diameter_bin_center') + - psd: instance of PSD class + + Returns + ------- + - ds: xarray.Dataset containing the computed GoF statistics + """ + from disdrodb.l2.empirical_dsd import get_mode_diameter + + # Retrieve diameter bin width + diameter = drop_number_concentration["diameter_bin_center"] + diameter_bin_width = drop_number_concentration["diameter_bin_width"] + + # Define observed and predicted values and compute errors + observed_values = drop_number_concentration + fitted_values = psd(diameter) # .transpose(*observed_values.dims) + error = observed_values - fitted_values + + # Compute GOF statistics + with suppress_warnings(): + # Compute Pearson correlation + pearson_r = xr.corr(observed_values, fitted_values, dim="diameter_bin_center") + + # Compute MSE + mse = (error**2).mean(dim="diameter_bin_center") + + # Compute maximum error + max_error = error.max(dim="diameter_bin_center") + relative_max_error = error.max(dim="diameter_bin_center") / observed_values.max(dim="diameter_bin_center") + + # Compute difference in total number concentration + total_number_concentration_obs = (observed_values * diameter_bin_width).sum(dim="diameter_bin_center") + total_number_concentration_pred = (fitted_values * diameter_bin_width).sum(dim="diameter_bin_center") + total_number_concentration_difference = total_number_concentration_pred - total_number_concentration_obs + + # Compute Kullback-Leibler divergence + # - Compute pdf per bin + pk_pdf = observed_values / total_number_concentration_obs + qk_pdf = fitted_values / total_number_concentration_pred + + # - Compute probabilities per bin + pk = pk_pdf * diameter_bin_width + pk = pk / pk.sum(dim="diameter_bin_center") # this might not be necessary + qk = qk_pdf * diameter_bin_width + qk = qk / qk.sum(dim="diameter_bin_center") # this might not be necessary + + # - Compute divergence + log_prob_ratio = np.log(pk / qk) + log_prob_ratio = log_prob_ratio.where(np.isfinite(log_prob_ratio)) + kl_divergence = (pk * log_prob_ratio).sum(dim="diameter_bin_center") + + # Other statistics that can be computed also from different diameter discretization + # - Compute max deviation at distribution mode + max_deviation = observed_values.max(dim="diameter_bin_center") - fitted_values.max(dim="diameter_bin_center") + max_relative_deviation = max_deviation / fitted_values.max(dim="diameter_bin_center") + + # - Compute diameter difference of the distribution mode + diameter_mode_deviation = get_mode_diameter(observed_values) - get_mode_diameter(fitted_values) + + # Create an xarray.Dataset to hold the computed statistics + ds = xr.Dataset( + { + "r2": pearson_r**2, # Squared Pearson correlation coefficient + "mse": mse, # Mean Squared Error + "max_error": max_error, # Maximum Absolute Error + "relative_max_error": relative_max_error, # Relative Maximum Error + "total_number_concentration_difference": total_number_concentration_difference, + "kl_divergence": kl_divergence, # Kullback-Leibler divergence + "max_deviation": max_deviation, # Deviation at distribution mode + "max_relative_deviation": max_relative_deviation, # Relative deviation at mode + "diameter_mode_deviation": diameter_mode_deviation, # Difference in mode diameters + }, + ) + return ds + + +####--------------------------------------------------------------------------------------. 
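+# Usage sketch (mirroring disdrodb/l2/processing.py): given an L2E dataset `ds` and a fitted PSD
+# object `psd` (e.g. returned by create_psd), goodness-of-fit metrics can be obtained with
+#   ds_gof = compute_gof_stats(drop_number_concentration=ds["drop_number_concentration"], psd=psd)
+# and merged into the L2M output via ds_params.update(ds_gof).
+
+
+####--------------------------------------------------------------------------------------.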
+#### Maximum Likelihood (ML) + + +def get_expected_probabilities(params, cdf_func, pdf_func, bin_edges, probability_method, normalized=False): + """ + Compute the expected probabilities for each bin given the distribution parameters. + + Parameters + ---------- + params : array-like + Parameters for the CDF or PDF function. + cdf_func : callable + Cumulative distribution function (CDF) that takes bin edges and parameters as inputs. + pdf_func : callable + Probability density function (PDF) that takes a value and parameters as inputs. + bin_edges : array-like + Edges of the bins for which to compute the probabilities. + probability_method : {'cdf', 'pdf'} + Method to compute the probabilities. If 'cdf', use the CDF to compute probabilities. + If 'pdf', integrate the PDF over each bin range. + normalized : bool, optional + If True, normalize the probabilities to sum to 1. Default is False. + + Returns + ------- + expected_probabilities : numpy.ndarray + Array of expected probabilities for each bin. + + Notes + ----- + - If the 'cdf' method is used, the probabilities are computed as the difference in CDF values at the bin edges. + - If the 'pdf' method is used, the probabilities are computed by integrating the PDF over each bin range. + - Any zero or negative probabilities are replaced with a very small positive number (1e-10) to ensure optimization. + - If `normalized` is True, the probabilities are normalized to sum to 1. + + """ + if probability_method == "cdf": + # Compute the CDF at bin edges + cdf_vals = cdf_func(bin_edges, params) + # Compute probabilities for each bin + expected_probabilities = np.diff(cdf_vals) + # Replace any zero or negative probabilities with a very small positive number + # --> Otherwise do not optimize ... + expected_probabilities = np.maximum(expected_probabilities, 1e-10) + # Or integrate PDF over the bin range + else: # probability_method == "pdf": + # For each bin, integrate the PDF over the bin range + expected_probabilities = np.array( + [quad(lambda x: pdf_func(x, params), bin_edges[i], bin_edges[i + 1])[0] for i in range(len(bin_edges) - 1)], + ) + if normalized: + # Normalize probabilities to sum to 1 + total_probability = np.sum(expected_probabilities) + expected_probabilities /= total_probability + return expected_probabilities + + +def get_adjusted_nt(cdf, params, Nt, bin_edges): + """Adjust Nt for the proportion of missing drops. See Johnson's et al., 2013 Eqs. 3 and 4.""" + # Estimate proportion of missing drops (Johnson's 2011 Eqs. 3) + # --> Alternative: p = 1 - np.sum(pdf(diameter, params)* diameter_bin_width) # [-] + p = 1 - np.diff(cdf([bin_edges[0], bin_edges[-1]], params)).item() # [-] + # Adjusts Nt for the proportion of drops not observed + # p = np.clip(p, 0, 1 - 1e-12) + if np.isclose(p, 1, atol=1e-12): + return np.nan + return Nt / (1 - p) # [m-3] + + +def compute_negative_log_likelihood( + params, + bin_edges, + counts, + cdf_func, + pdf_func, + param_constraints=None, + probability_method="cdf", + likelihood="multinomial", + truncated_likelihood=True, +): + """ + General negative log-likelihood function for fitting distributions to binned data. + + Parameters + ---------- + params : array-like + Parameters of the distribution. + bin_edges : array-like + Edges of the bins (length N+1). + counts : array-like + Observed counts in each bin (length N). + cdf_func : callable + Cumulative distribution function of the distribution. + pdf_func : callable + Probability density function of the distribution. 
+ param_constraints : callable, optional + Function that checks if parameters are valid. + probability_method : str, optional + Method to compute expected probabilities, either 'cdf' or 'pdf'. Default is 'cdf'. + likelihood : str, optional + Type of likelihood to compute, either 'multinomial' or 'poisson'. Default is 'multinomial'. + truncated_likelihood : bool, optional + Whether to normalize the expected probabilities. Default is True. + nll : float + Negative log-likelihood value. + + Returns + ------- + nll: float + The negative log-likelihood value. + """ + # Check if parameters are valid + if param_constraints is not None and not param_constraints(params): + return np.inf + + # Compute (unormalized) expected probabilities using CDF + expected_probabilities = get_expected_probabilities( + params=params, + cdf_func=cdf_func, + pdf_func=pdf_func, + bin_edges=bin_edges, + probability_method=probability_method, + normalized=truncated_likelihood, + ) + + # Ensure expected probabilities are valid + if np.any(expected_probabilities <= 0): + return np.inf + + # Compute negative log-likelihood + if likelihood == "poisson": + n_total = np.sum(counts) + expected_counts = expected_probabilities * n_total + expected_counts = np.maximum(expected_counts, 1e-10) # Avoid zero expected counts + nll = -np.sum(counts * np.log(expected_counts) - expected_counts) + else: # likelihood == "multinomial": + # Compute likelihood + nll = -np.sum(counts * np.log(expected_probabilities)) + return nll + + +def estimate_lognormal_parameters( + counts, + bin_edges, + probability_method="cdf", + likelihood="multinomial", + truncated_likelihood=True, + output_dictionary=True, + optimizer="Nelder-Mead", +): + """ + Estimate the parameters of a lognormal distribution given histogram data. + + Parameters + ---------- + counts : array-like + The counts for each bin in the histogram. + bin_edges : array-like + The edges of the bins. + probability_method : str, optional + The method to compute probabilities, either ``"cdf"`` or ``"pdf"``. The default is ``"cdf"``. + likelihood : str, optional + The likelihood function to use, either ``"multinomial"`` or ``"poisson"``. + The default is ``"multinomial"``. + truncated_likelihood : bool, optional + Whether to use truncated likelihood. The default is ``True``. + output_dictionary : bool, optional + Whether to return the output as a dictionary. + If False, returns a numpy array. The default is ``True`` + optimizer : str, optional + The optimization method to use. Default is ``"Nelder-Mead"``. + + Returns + ------- + dict or numpy.ndarray + The estimated parameters of the lognormal distribution. + If ``output_dictionary`` is ``True``, returns a dictionary with keys ``Nt``, ``mu``, and ``sigma``. + If ``output_dictionary`` is ``False``,returns a numpy array with values [Nt, mu, sigma]. + + Notes + ----- + The lognormal distribution is defined as: + N(D) = Nt / (sqrt(2 * pi) * sigma * D) * exp(-(ln(D) - mu)**2 / (2 * sigma**2)) + where Nt is the total number of counts, mu is the mean of the log of the distribution, + and sigma is the standard deviation of the log of the distribution. + + References + ---------- + .. 
[1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.lognorm.html#scipy.stats.lognorm + """ + # LogNormal + # - mu = log(scale) + # - loc = 0 + + # Initialize bad results + null_output = ( + {"Nt": np.nan, "mu": np.nan, "sigma": np.nan} if output_dictionary else np.array([np.nan, np.nan, np.nan]) + ) + + # Define the CDF and PDF functions for the lognormal distribution + def lognorm_cdf(x, params): + sigma, scale = params + return ss.lognorm.cdf(x, sigma, loc=0, scale=scale) + + def lognorm_pdf(x, params): + sigma, scale = params + return ss.lognorm.pdf(x, sigma, loc=0, scale=scale) + + # Define valid parameters for the lognormal distribution + def param_constraints(params): + sigma, scale = params + return sigma > 0 and scale > 0 + + # Definite initial guess for the parameters + initial_params = [1.0, 1.0] # sigma, scale + + # Define bounds for sigma and scale + bounds = [(1e-6, None), (1e-6, None)] + + # Minimize the negative log-likelihood + with suppress_warnings(): + result = minimize( + compute_negative_log_likelihood, + initial_params, + args=( + bin_edges, + counts, + lognorm_cdf, + lognorm_pdf, + param_constraints, + probability_method, + likelihood, + truncated_likelihood, + ), + bounds=bounds, + method=optimizer, + ) + + # Check if the fit had success + if not result.success: + return null_output + + # Define Nt + Nt = np.sum(counts).item() + + # Retrieve parameters + params = result.x + if truncated_likelihood: + Nt = get_adjusted_nt(cdf=lognorm_cdf, params=params, Nt=Nt, bin_edges=bin_edges) + sigma, scale = params + mu = np.log(scale) + + # Define output + output = {"Nt": Nt, "mu": mu, "sigma": sigma} if output_dictionary else np.array([Nt, mu, sigma]) + return output + + +def estimate_exponential_parameters( + counts, + bin_edges, + probability_method="cdf", + likelihood="multinomial", + truncated_likelihood=True, + output_dictionary=True, + optimizer="Nelder-Mead", +): + """ + Estimate the parameters of an exponential distribution given histogram data. + + Parameters + ---------- + counts : array-like + The counts for each bin in the histogram. + bin_edges : array-like + The edges of the bins. + probability_method : str, optional + The method to compute probabilities, either ``"cdf"`` or ``"pdf"``. The default is ``"cdf"``. + likelihood : str, optional + The likelihood function to use, either ``"multinomial"`` or ``"poisson"``. + The default is ``"multinomial"``. + truncated_likelihood : bool, optional + Whether to use truncated likelihood. The default is ``True``. + output_dictionary : bool, optional + Whether to return the output as a dictionary. + If False, returns a numpy array. The default is ``True`` + optimizer : str, optional + The optimization method to use. Default is ``"Nelder-Mead"``. + + Returns + ------- + dict or numpy.ndarray + The estimated parameters of the exponential distribution. + If ``output_dictionary`` is ``True``, returns a dictionary with keys ``N0`` and ``Lambda``. + If `output_dictionary` is ``False``, returns a numpy array with [N0, Lambda]. + + Notes + ----- + The exponential distribution is defined as: + N(D) = N0 * exp(-Lambda * D) = Nt * Lambda * exp(-Lambda * D) + where Lambda = 1 / scale and N0 = Nt * Lambda. + + References + ---------- + .. 
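The lognormal estimator above boils down to minimizing a truncated multinomial negative log-likelihood built from CDF differences. The following self-contained sketch reproduces that recipe on synthetic binned counts (it does not call the package function; bin edges and parameter values are illustrative):

```python
# Standalone sketch of the truncated multinomial ML recipe on synthetic counts.
import numpy as np
import scipy.stats as ss
from scipy.optimize import minimize

rng = np.random.default_rng(0)
bin_edges = np.linspace(0.2, 6.0, 30)
true_sigma, true_scale = 0.4, 1.5
samples = ss.lognorm.rvs(true_sigma, scale=true_scale, size=5000, random_state=rng)
counts, _ = np.histogram(samples, bins=bin_edges)

def nll(params):
    sigma, scale = params
    if sigma <= 0 or scale <= 0:
        return np.inf
    probs = np.diff(ss.lognorm.cdf(bin_edges, sigma, loc=0, scale=scale))
    probs = np.maximum(probs, 1e-10)
    probs = probs / probs.sum()          # truncated likelihood: renormalize
    return -np.sum(counts * np.log(probs))

result = minimize(nll, x0=[1.0, 1.0], method="Nelder-Mead")
sigma_hat, scale_hat = result.x
mu_hat = np.log(scale_hat)
print(sigma_hat, mu_hat)  # roughly 0.4 and log(1.5) ~= 0.41
```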
[1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.expon.html + """ + # Initialize bad results + null_output = {"N0": np.nan, "Lambda": np.nan} if output_dictionary else np.array([np.nan, np.nan]) + + # Define the CDF and PDF functions for the exponential distribution + def exp_cdf(x, params): + scale = params[0] + return ss.expon.cdf(x, loc=0, scale=scale) + + def exp_pdf(x, params): + scale = params[0] + return ss.expon.pdf(x, loc=0, scale=scale) + + # Define valid parameters for the exponential distribution + def param_constraints(params): + scale = params[0] + return scale > 0 + + # Definite initial guess for the scale parameter + initial_params = [1.0] # scale + + # Define bounds for scale + bounds = [(1e-6, None)] + + # Minimize the negative log-likelihood + with suppress_warnings(): + result = minimize( + compute_negative_log_likelihood, + initial_params, + args=( + bin_edges, + counts, + exp_cdf, + exp_pdf, + param_constraints, + probability_method, + likelihood, + truncated_likelihood, + ), + bounds=bounds, + method=optimizer, + ) + + # Check if the fit had success + if not result.success: + return null_output + + # Define Nt + Nt = np.sum(counts).item() + + # Retrieve parameters + params = result.x + if truncated_likelihood: + Nt = get_adjusted_nt(cdf=exp_cdf, params=params, Nt=Nt, bin_edges=bin_edges) + scale = params[0] + Lambda = 1 / scale + N0 = Nt * Lambda + + # Define output + output = {"N0": N0, "Lambda": Lambda} if output_dictionary else np.array([N0, Lambda]) + return output + + +def estimate_gamma_parameters( + counts, + a, + scale, + bin_edges, + probability_method="cdf", + likelihood="multinomial", + truncated_likelihood=True, + output_dictionary=True, + optimizer="Nelder-Mead", +): + """ + Estimate the parameters of a gamma distribution given histogram data. + + Parameters + ---------- + counts : array-like + The counts for each bin in the histogram. + a: float + The shape parameter of the scipy.stats.gamma distribution. + A good default value is 1. + scale: float + The scale parameter of the scipy.stats.gamma distribution. + A good default value is 1. + bin_edges : array-like + The edges of the bins. + probability_method : str, optional + The method to compute probabilities, either ``"cdf"`` or ``"pdf"``. The default is ``"cdf"``. + likelihood : str, optional + The likelihood function to use, either ``"multinomial"`` or ``"poisson"``. + The default is ``"multinomial"``. + truncated_likelihood : bool, optional + Whether to use truncated likelihood. The default is ``True``. + output_dictionary : bool, optional + Whether to return the output as a dictionary. + If False, returns a numpy array. The default is ``True`` + optimizer : str, optional + The optimization method to use. Default is ``"Nelder-Mead"``. + + Returns + ------- + dict or numpy.ndarray + The estimated parameters of the gamma distribution. + If ``output_dictionary`` is ``True``, returns a dictionary with keys ``N0``, ``mu`` and ``Lambda``. + If `output_dictionary` is ``False``, returns a numpy array with [N0, mu, Lambda]. + + Notes + ----- + The gamma distribution is defined as: + N(D) = N0 * D**mu * exp(-Lambda*D) + where Lambda = 1/scale, and mu = a - 1 with ``a`` being the shape parameter of the gamma distribution. + N0 is defined as N0 = Nt*Lambda**(mu+1)/gamma(mu+1). + + References + ---------- + .. 
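Once the exponential fit returns scipy's `scale`, the DSD parameters follow directly from the relations in the exponential estimator above: Lambda = 1/scale and N0 = Nt * Lambda. A small numeric check with made-up values:

```python
# Illustrative conversion from a fitted scipy.stats.expon "scale" to DSD parameters:
# N(D) = N0 * exp(-Lambda * D) with Lambda = 1 / scale and N0 = Nt * Lambda.
Nt = 1200.0           # hypothetical adjusted total concentration [m-3]
scale = 0.45          # hypothetical fitted scale [mm]
Lambda = 1.0 / scale  # [mm-1]
N0 = Nt * Lambda      # [m-3 mm-1]

# Sanity check: the model integrates back to Nt, since int N0*exp(-Lambda*D) dD = N0 / Lambda
print(Lambda, N0, N0 / Lambda)  # 2.22..., 2666.7, 1200.0
```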
[1] https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gamma.html + + """ + # Initialize bad results + null_output = ( + {"N0": np.nan, "mu": np.nan, "lambda": np.nan} if output_dictionary else np.array([np.nan, np.nan, np.nan]) + ) + + # Define the CDF and PDF functions for the gamma distribution + def gamma_cdf(x, params): + a, scale = params + return ss.gamma.cdf(x, a, loc=0, scale=scale) + + def gamma_pdf(x, params): + a, scale = params + return ss.gamma.pdf(x, a, loc=0, scale=scale) + + # Define valid parameters for the gamma distribution + # mu = -0.99 is a vertical line essentially ... + def param_constraints(params): + a, scale = params + return a > 0.1 and scale > 0 # using a > 0 cause some troubles + + # Definite initial guess for the parameters + initial_params = [a, scale] # (mu=a-1, a=mu+1) + + # Define bounds for a and scale + bounds = [(1e-6, None), (1e-6, None)] + + # Minimize the negative log-likelihood + with suppress_warnings(): + result = minimize( + compute_negative_log_likelihood, + initial_params, + args=( + bin_edges, + counts, + gamma_cdf, + gamma_pdf, + param_constraints, + probability_method, + likelihood, + truncated_likelihood, + ), + method=optimizer, + bounds=bounds, + ) + + # Check if the fit had success + if not result.success: + return null_output + + # Define Nt + Nt = np.sum(counts).item() + + # Retrieve parameters + params = result.x + if truncated_likelihood: + Nt = get_adjusted_nt(cdf=gamma_cdf, params=params, Nt=Nt, bin_edges=bin_edges) + a, scale = params + mu = a - 1 + Lambda = 1 / scale + + # Compute N0 + # - Use logarithmic computations to prevent overflow + # - N0 = Nt * Lambda ** (mu + 1) / gamma(mu + 1) + with suppress_warnings(): + log_N0 = np.log(Nt) + (mu + 1) * np.log(Lambda) - gammaln(mu + 1) + N0 = np.exp(log_N0) + if not np.isfinite(N0): + N0 = np.nan + + # Define output + output = {"N0": N0, "mu": mu, "Lambda": Lambda} if output_dictionary else np.array([N0, mu, Lambda]) + return output + + +def _get_initial_gamma_parameters(ds, mom_method=None): + if mom_method is None: + ds_init = xr.Dataset( + { + "a": xr.ones_like(ds["M1"]), + "scale": xr.ones_like(ds["M1"]), + }, + ) + else: + ds_init = get_mom_parameters( + ds=ds, + psd_model="GammaPSD", + mom_methods=mom_method, + ) + ds_init["a"] = ds_init["mu"] + 1 + ds_init["scale"] = 1 / ds_init["Lambda"] + return ds_init + + +def get_gamma_parameters( + ds, + init_method=None, + probability_method="cdf", + likelihood="multinomial", + truncated_likelihood=True, + optimizer="Nelder-Mead", +): + """ + Estimate gamma distribution parameters for drop size distribution (DSD) data. + + Parameters + ---------- + ds : xarray.Dataset + Input dataset containing drop size distribution data. It must include the following variables: + - ``drop_number_concentration``: The number concentration of drops. + - ``diameter_bin_width``": The width of each diameter bin. + - ``diameter_bin_lower``: The lower bounds of the diameter bins. + - ``diameter_bin_upper``: The upper bounds of the diameter bins. + - ``diameter_bin_center``: The center values of the diameter bins. + - The moments M0...M6 variables required to compute the initial parameters + with the specified mom_method. + init_method: str or list + The method(s) of moments used to initialize the gamma parameters. + If None, the scale parameter is set to 1 and mu to 0 (a=1). + probability_method : str, optional + Method to compute probabilities. The default is ``cdf``. + likelihood : str, optional + Likelihood function to use for fitting. 
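The gamma estimator above maps scipy's `(a, scale)` to the DSD form `(N0, mu, Lambda)` and computes N0 in log space with `gammaln` so that large `mu` does not overflow. A standalone sketch of that conversion, with illustrative values:

```python
# Sketch of the log-domain conversion from scipy's (a, scale) to (N0, mu, Lambda).
import numpy as np
from scipy.special import gammaln

Nt = 800.0            # hypothetical adjusted total concentration [m-3]
a, scale = 4.2, 0.35  # hypothetical fitted scipy.stats.gamma parameters
mu = a - 1
Lambda = 1.0 / scale

# N0 = Nt * Lambda**(mu + 1) / gamma(mu + 1), evaluated in log space to avoid overflow
log_N0 = np.log(Nt) + (mu + 1) * np.log(Lambda) - gammaln(mu + 1)
N0 = np.exp(log_N0)
print(mu, Lambda, N0)
```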
The default is ``multinomial``. + truncated_likelihood : bool, optional + Whether to use truncated likelihood. The default is ``True``. + optimizer : str, optional + Optimization method to use. The default is ``Nelder-Mead``. + + Returns + ------- + xarray.Dataset + Dataset containing the estimated gamma distribution parameters: + - ``N0``: Intercept parameter. + - ``mu``: Shape parameter. + - ``Lambda``: Scale parameter. + The dataset will also have an attribute ``disdrodb_psd_model`` set to ``GammaPSD``. + + Notes + ----- + The function uses `xr.apply_ufunc` to fit the lognormal distribution parameters + in parallel, leveraging Dask for parallel computation. + + """ + # Define inputs + counts = ds["drop_number_concentration"] * ds["diameter_bin_width"] + diameter_breaks = np.append(ds["diameter_bin_lower"].data, ds["diameter_bin_upper"].data[-1]) + + # Define initial parameters (a, scale) + ds_init = _get_initial_gamma_parameters(ds, mom_method=init_method) + + # Define kwargs + kwargs = { + "output_dictionary": False, + "bin_edges": diameter_breaks, + "probability_method": probability_method, + "likelihood": likelihood, + "truncated_likelihood": truncated_likelihood, + "optimizer": optimizer, + } + + # Fit distribution in parallel + da_params = xr.apply_ufunc( + estimate_gamma_parameters, + counts, + ds_init["a"], + ds_init["scale"], + kwargs=kwargs, + input_core_dims=[["diameter_bin_center"], [], []], + output_core_dims=[["parameters"]], + vectorize=True, + dask="parallelized", + dask_gufunc_kwargs={"output_sizes": {"parameters": 3}}, # lengths of the new output_core_dims dimensions. + output_dtypes=["float64"], + ) + + # Add parameters coordinates + da_params = da_params.assign_coords({"parameters": ["N0", "mu", "Lambda"]}) + + # Create parameters dataset + ds_params = da_params.to_dataset(dim="parameters") + + # Add DSD model name to the attribute + ds_params.attrs["disdrodb_psd_model"] = "GammaPSD" + return ds_params + + +def get_lognormal_parameters( + ds, + probability_method="cdf", + likelihood="multinomial", + truncated_likelihood=True, + optimizer="Nelder-Mead", +): + """ + Estimate lognormal distribution parameters for drop size distribution (DSD) data. + + Parameters + ---------- + ds : xarray.Dataset + Input dataset containing drop size distribution data. It must include the following variables: + - ``drop_number_concentration``: The number concentration of drops. + - ``diameter_bin_width``": The width of each diameter bin. + - ``diameter_bin_lower``: The lower bounds of the diameter bins. + - ``diameter_bin_upper``: The upper bounds of the diameter bins. + - ``diameter_bin_center``: The center values of the diameter bins. + probability_method : str, optional + Method to compute probabilities. The default is ``cdf``. + likelihood : str, optional + Likelihood function to use for fitting. The default is ``multinomial``. + truncated_likelihood : bool, optional + Whether to use truncated likelihood. The default is ``True``. + optimizer : str, optional + Optimization method to use. The default is ``Nelder-Mead``. + + Returns + ------- + xarray.Dataset + Dataset containing the estimated lognormal distribution parameters: + - ``Nt``: Total number concentration. + - ``mu``: Mean of the lognormal distribution. + - ``sigma``: Standard deviation of the lognormal distribution. + The resulting dataset will have an attribute ``disdrodb_psd_model`` set to ``LognormalPSD``. 
+ + Notes + ----- + The function uses `xr.apply_ufunc` to fit the lognormal distribution parameters + in parallel, leveraging Dask for parallel computation. + + """ + # Define inputs + counts = ds["drop_number_concentration"] * ds["diameter_bin_width"] + diameter_breaks = np.append(ds["diameter_bin_lower"].data, ds["diameter_bin_upper"].data[-1]) + + # Define kwargs + kwargs = { + "output_dictionary": False, + "bin_edges": diameter_breaks, + "probability_method": probability_method, + "likelihood": likelihood, + "truncated_likelihood": truncated_likelihood, + "optimizer": optimizer, + } + + # Fit distribution in parallel + da_params = xr.apply_ufunc( + estimate_lognormal_parameters, + counts, + kwargs=kwargs, + input_core_dims=[["diameter_bin_center"]], + output_core_dims=[["parameters"]], + vectorize=True, + dask="parallelized", + dask_gufunc_kwargs={"output_sizes": {"parameters": 3}}, # lengths of the new output_core_dims dimensions. + output_dtypes=["float64"], + ) + + # Add parameters coordinates + da_params = da_params.assign_coords({"parameters": ["Nt", "mu", "sigma"]}) + + # Create parameters dataset + ds_params = da_params.to_dataset(dim="parameters") + + # Add DSD model name to the attribute + ds_params.attrs["disdrodb_psd_model"] = "LognormalPSD" + + return ds_params + + +def get_exponential_parameters( + ds, + probability_method="cdf", + likelihood="multinomial", + truncated_likelihood=True, + optimizer="Nelder-Mead", +): + """ + Estimate the parameters of an exponential particle size distribution (PSD) from the given dataset. + + Fitting this model is equivalent to fitting a GammaPSD model fixing ``mu`` to 0. + + Parameters + ---------- + ds : xarray.Dataset + Input dataset containing drop number concentration data and diameter information. + It must include the following variables: + - ``drop_number_concentration``: The number concentration of drops. + - ``diameter_bin_width``": The width of each diameter bin. + - ``diameter_bin_lower``: The lower bounds of the diameter bins. + - ``diameter_bin_upper``: The upper bounds of the diameter bins. + - ``diameter_bin_center``: The center values of the diameter bins. + probability_method : str, optional + Method to compute probabilities. The default is ``cdf``. + likelihood : str, optional + Likelihood function to use for fitting. The default is ``multinomial``. + truncated_likelihood : bool, optional + Whether to use truncated likelihood. The default is ``True``. + optimizer : str, optional + Optimization method to use. The default is ``Nelder-Mead``. + + Returns + ------- + xarray.Dataset + Dataset containing the estimated expontial distribution parameters: + - ``N0``: Intercept parameter. + - ``Lambda``: Scale parameter. + The resulting dataset will have an attribute ``disdrodb_psd_model`` set to ``ExponentialPSD``. + + Notes + ----- + The function uses `xr.apply_ufunc` to fit the exponential distribution parameters + in parallel, leveraging Dask for parallel computation. 
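All of the `get_*_parameters` wrappers rely on the same `xr.apply_ufunc` pattern: a per-spectrum estimator that returns a fixed-length parameter vector is vectorized over the remaining dimensions, with extra options forwarded through `kwargs`. The generic sketch below uses a dummy estimator and random data, so it is only an illustration of the plumbing, not of the disdrodb estimators themselves:

```python
# Generic sketch of the apply_ufunc pattern used by the wrappers above.
import numpy as np
import xarray as xr

def dummy_estimator(counts, bin_edges):
    # Placeholder estimator: total count and count-weighted mean diameter
    centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
    total = counts.sum()
    mean_d = (counts * centers).sum() / max(total, 1)
    return np.array([total, mean_d], dtype="float64")

ds = xr.Dataset(
    {"counts": (("time", "diameter_bin_center"), np.random.default_rng(0).poisson(5, size=(10, 32)))},
)

da_params = xr.apply_ufunc(
    dummy_estimator,
    ds["counts"],
    kwargs={"bin_edges": np.linspace(0.1, 8, 33)},
    input_core_dims=[["diameter_bin_center"]],
    output_core_dims=[["parameters"]],
    vectorize=True,
    dask="parallelized",
    dask_gufunc_kwargs={"output_sizes": {"parameters": 2}},
    output_dtypes=["float64"],
)
da_params = da_params.assign_coords({"parameters": ["total", "mean_diameter"]})
ds_params = da_params.to_dataset(dim="parameters")
print(ds_params)
```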
+ + """ + # Define inputs + counts = ds["drop_number_concentration"] * ds["diameter_bin_width"] + diameter_breaks = np.append(ds["diameter_bin_lower"].data, ds["diameter_bin_upper"].data[-1]) + + # Define kwargs + kwargs = { + "output_dictionary": False, + "bin_edges": diameter_breaks, + "probability_method": probability_method, + "likelihood": likelihood, + "truncated_likelihood": truncated_likelihood, + "optimizer": optimizer, + } + + # Fit distribution in parallel + da_params = xr.apply_ufunc( + estimate_exponential_parameters, + counts, + kwargs=kwargs, + input_core_dims=[["diameter_bin_center"]], + output_core_dims=[["parameters"]], + vectorize=True, + dask="parallelized", + dask_gufunc_kwargs={"output_sizes": {"parameters": 2}}, # lengths of the new output_core_dims dimensions. + output_dtypes=["float64"], + ) + + # Add parameters coordinates + da_params = da_params.assign_coords({"parameters": ["N0", "Lambda"]}) + + # Create parameters dataset + ds_params = da_params.to_dataset(dim="parameters") + + # Add DSD model name to the attribute + ds_params.attrs["disdrodb_psd_model"] = "ExponentialPSD" + return ds_params + + +####-------------------------------------------------------------------------------------------------------------------. + + +def _estimate_gamma_parameters_johnson( + drop_number_concentration, + diameter, + diameter_breaks, + output_dictionary=True, + method="Nelder-Mead", + mu=0.5, + Lambda=3, + **kwargs, +): + """Deprecated Maximum likelihood estimation of Gamma model. + + N(D) = N_t * lambda**(mu+1) / gamma(mu+1) D**mu exp(-lambda*D) + + Args: + spectra: The DSD for which to find parameters [mm-1 m-3]. + widths: Class widths for each DSD bin [mm]. + diams: Class-centre diameters for each DSD bin [mm]. + mu: Initial value for shape parameter mu [-]. + lambda_param: Initial value for slope parameter lambda [mm^-1]. + kwargs: Extra arguments for the optimization process. + + Returns + ------- + Dictionary with estimated mu, lambda, and N0. + mu (shape) N0 (scale) lambda(slope) + + Notes + ----- + The last bin counts are not accounted in the fitting procedure ! + + References + ---------- + Johnson, R. W., D. V. Kliche, and P. L. Smith, 2011: Comparison of Estimators for Parameters of Gamma Distributions + with Left-Truncated Samples. J. Appl. Meteor. Climatol., 50, 296-310, https://doi.org/10.1175/2010JAMC2478.1 + + Johnson, R.W., Kliche, D., & Smith, P.L. (2010). + Maximum likelihood estimation of gamma parameters for coarsely binned and truncated raindrop size data. + Quarterly Journal of the Royal Meteorological Society, 140. DOI:10.1002/qj.2209 + + """ + # Initialize bad results + if output_dictionary: + null_output = {"mu": np.nan, "lambda": np.nan, "N0": np.nan} + else: + null_output = np.array([np.nan, np.nan, np.nan]) + + # Initialize parameters + # --> Ideally with method of moments estimate + # --> See equation 8 of Johnson's 2013 + x0 = [mu, Lambda] + + # Compute diameter_bin_width + diameter_bin_width = np.diff(diameter_breaks) + + # Convert drop_number_concentration from mm-1 m-3 to m-3. + spectra = np.asarray(drop_number_concentration) * diameter_bin_width + + # Define cost function + # - Parameter to be optimized on first positions + def _cost_function(parameters, spectra, diameter_breaks): + # Assume spectra to be in unit [m-3] (drop_number_concentration*diameter_bin_width) ! + mu, Lambda = parameters + # Precompute gamma integrals between various diameter bins + # - gamminc(mu+1) already divides the integral by gamma(mu+1) ! 
+ pgamma_d = gammainc(mu + 1, Lambda * diameter_breaks) + # Compute probability with interval + delta_pgamma_bins = pgamma_d[1:] - pgamma_d[:-1] + # Compute normalization over interval + denominator = pgamma_d[-1] - pgamma_d[0] + # Compute cost function + # a = mu - 1, x = lambda + if mu > -1 and Lambda > 0: + cost = np.sum(-spectra * np.log(delta_pgamma_bins / denominator)) + return cost + return np.inf + + # Minimize the cost function + with suppress_warnings(): + bounds = [(0, None), (0, None)] # Force mu and lambda to be non-negative + res = minimize( + _cost_function, + x0=x0, + args=(spectra, diameter_breaks), + method=method, + bounds=bounds, + **kwargs, + ) + + # Check if the fit had success + if not res.success: + return null_output + + # Extract parameters + mu = res.x[0] # [-] + Lambda = res.x[1] # [mm-1] + + # Estimate tilde_N_T using the total drop concentration + tilde_N_T = np.sum(drop_number_concentration * diameter_bin_width) # [m-3] + + # Estimate proportion of missing drops (Johnson's 2011 Eqs. 3) + with suppress_warnings(): + D = diameter + p = 1 - np.sum((Lambda ** (mu + 1)) / gamma(mu + 1) * D**mu * np.exp(-Lambda * D) * diameter_bin_width) # [-] + + # Convert tilde_N_T to N_T using Johnson's 2013 Eqs. 3 and 4. + # - Adjusts for the proportion of drops not observed + N_T = tilde_N_T / (1 - p) # [m-3] + + # Compute N0 + N0 = N_T * (Lambda ** (mu + 1)) / gamma(mu + 1) # [m-3 * mm^(-mu-1)] + + # Compute Dm + # Dm = (mu + 4)/ Lambda + + # Compute Nw + # Nw = N0* D^mu / f(mu) , with f(mu of the Normalized PSD) + + # Define output + output = {"mu": mu, "Lambda": Lambda, "N0": N0} if output_dictionary else np.array([mu, Lambda, N0]) + return output + + +def get_gamma_parameters_johnson2014(ds, method="Nelder-Mead"): + """Deprecated model. See Gamma Model with truncated_likelihood and 'pdf'.""" + drop_number_concentration = ds["drop_number_concentration"] + diameter = ds["diameter_bin_center"] + diameter_breaks = np.append(ds["diameter_bin_lower"].data, ds["diameter_bin_upper"].data[-1]) + # Define kwargs + kwargs = { + "output_dictionary": False, + "diameter_breaks": diameter_breaks, + "method": method, + } + da_params = xr.apply_ufunc( + _estimate_gamma_parameters_johnson, + drop_number_concentration, + diameter, + # diameter_bin_width, + kwargs=kwargs, + input_core_dims=[["diameter_bin_center"], ["diameter_bin_center"]], # ["diameter_bin_center"], + output_core_dims=[["parameters"]], + vectorize=True, + ) + + # Add parameters coordinates + da_params = da_params.assign_coords({"parameters": ["mu", "Lambda", "N0"]}) + + # Convert to skill Dataset + ds_params = da_params.to_dataset(dim="parameters") + return ds_params + + +####-----------------------------------------------------------------------------------------. 
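The truncation adjustment in the Johnson-style routine above can be checked in isolation: the regularized incomplete gamma function gives the probability mass of the fitted gamma model inside the observed diameter range, and the observed concentration is divided by that mass. The snippet below is a standalone sketch with made-up parameter values:

```python
# Sketch of the truncation adjustment (Johnson et al. style), illustrative values only.
import numpy as np
from scipy.special import gammainc

mu, Lambda = 2.0, 3.0                    # fitted shape and slope [-], [mm-1]
diameter_breaks = np.array([0.25, 7.5])  # observed diameter range [mm]

# P(D_min < D < D_max) for a gamma(mu + 1, scale=1/Lambda) model
p_inside = np.diff(gammainc(mu + 1, Lambda * diameter_breaks)).item()
p_missing = 1 - p_inside

tilde_N_T = 950.0                  # concentration actually observed [m-3]
N_T = tilde_N_T / (1 - p_missing)  # adjusted for unobserved drops
print(p_missing, N_T)
```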
+#### Grid Search (GS) + + +def _compute_rain_rate(ND, D, dD, V): + axis = 1 if ND.ndim == 2 else None + rain_rate = np.pi / 6 * np.sum(ND * V * (D / 1000) ** 3 * dD, axis=axis) * 3600 * 1000 + return rain_rate # mm/h + + +def _compute_lwc(ND, D, dD, rho_w=1000): + axis = 1 if ND.ndim == 2 else None + lwc = np.pi / 6.0 * (rho_w * 1000) * np.sum((D / 1000) ** 3 * ND * dD, axis=axis) + return lwc # g/m3 + + +def _compute_z(ND, D, dD): + axis = 1 if ND.ndim == 2 else None + z = np.sum(((D) ** 6 * ND * dD), axis=axis) # mm⁶·m⁻³ + Z = 10 * np.log10(z) + return Z + + +def _compute_cost_function(ND_obs, ND_preds, D, dD, V, target, transformation, error_order): + # Assume ND_obs of shape (D bins) and ND_preds of shape (# params, D bins) + if target == "ND": + if transformation == "identity": + errors = np.mean(np.abs(ND_obs[None, :] - ND_preds) ** error_order, axis=1) + if transformation == "log": + errors = np.mean(np.abs(np.log(ND_obs[None, :] + 1) - np.log(ND_preds + 1)) ** error_order, axis=1) + if transformation == "np.sqrt": + errors = np.mean(np.abs(np.sqrt(ND_obs[None, :]) - np.sqrt(ND_preds)) ** error_order, axis=1) + elif target == "Z": + errors = np.abs(_compute_z(ND_obs, D, dD) - _compute_z(ND_preds, D, dD)) + elif target == "R": + errors = np.abs(_compute_rain_rate(ND_obs, D, dD, V) - _compute_rain_rate(ND_preds, D, dD, V)) + elif target == "LWC": + errors = np.abs(_compute_lwc(ND_obs, D, dD) - _compute_lwc(ND_preds, D, dD)) + else: + raise ValueError("Invalid target") + return errors + + +def apply_exponential_gs( + Nt, + ND_obs, + V, + # Coords + D, + dD, + # Error options + target, + transformation, + error_order, +): + """Apply Grid Search for the ExponentialPSD distribution.""" + # Define set of mu values + lambda_arr = np.arange(0.01, 20, step=0.01) + + # Perform grid search + with suppress_warnings(): + # Compute ND + N0_arr = Nt * lambda_arr + ND_preds = ExponentialPSD.formula(D=D[None, :], N0=N0_arr[:, None], Lambda=lambda_arr[:, None]) + + # Compute errors + errors = _compute_cost_function( + ND_obs=ND_obs, + ND_preds=ND_preds, + D=D, + dD=dD, + V=V, + target=target, + transformation=transformation, + error_order=error_order, + ) + + # Identify best parameter set + best_index = np.argmin(errors) + return np.array([N0_arr[best_index].item(), lambda_arr[best_index].item()]) + + +def _apply_gamma_gs(mu_values, lambda_values, Nt, ND_obs, D, dD, V, target, transformation, error_order): + """Routine for GammaPSD parameters grid search.""" + # Define combinations of parameters for grid search + combo = np.meshgrid(mu_values, lambda_values, indexing="xy") + mu_arr = combo[0].ravel() + lambda_arr = combo[1].ravel() + + # Perform grid search + with suppress_warnings(): + # Compute ND + N0 = np.exp(np.log(Nt) + (mu_arr[:, None] + 1) * np.log(lambda_arr[:, None]) - gammaln(mu_arr[:, None] + 1)) + ND_preds = GammaPSD.formula(D=D[None, :], N0=N0, Lambda=lambda_arr[:, None], mu=mu_arr[:, None]) + + # Compute errors + errors = _compute_cost_function( + ND_obs=ND_obs, + ND_preds=ND_preds, + D=D, + dD=dD, + V=V, + target=target, + transformation=transformation, + error_order=error_order, + ) + + # Best parameter + best_index = np.argmin(errors) + return N0[best_index].item(), mu_arr[best_index].item(), lambda_arr[best_index].item() + + +def apply_gamma_gs( + Nt, + ND_obs, + V, + # Coords + D, + dD, + # Error options + target, + transformation, + error_order, +): + """Estimate GammaPSD model parameters using Grid Search.""" + # Define initial set of parameters + mu_step = 0.5 + lambda_step = 
0.5 + mu_values = np.arange(0.01, 20, step=mu_step) + lambda_values = np.arange(0, 60, step=lambda_step) + + # First round of GS + N0, mu, Lambda = _apply_gamma_gs( + mu_values=mu_values, + lambda_values=lambda_values, + Nt=Nt, + ND_obs=ND_obs, + D=D, + dD=dD, + V=V, + target=target, + transformation=transformation, + error_order=error_order, + ) + + # Second round of GS + mu_values = np.arange(mu - mu_step * 2, mu + mu_step * 2, step=mu_step / 20) + lambda_values = np.arange(Lambda - lambda_step * 2, Lambda + lambda_step * 2, step=lambda_step / 20) + N0, mu, Lambda = _apply_gamma_gs( + mu_values=mu_values, + lambda_values=lambda_values, + Nt=Nt, + ND_obs=ND_obs, + D=D, + dD=dD, + V=V, + target=target, + transformation=transformation, + error_order=error_order, + ) + + return np.array([N0, mu, Lambda]) + + +def _apply_lognormal_gs(mu_values, sigma_values, Nt, ND_obs, D, dD, V, target, transformation, error_order): + """Routine for LognormalPSD parameters grid search.""" + # Define combinations of parameters for grid search + combo = np.meshgrid(mu_values, sigma_values, indexing="xy") + mu_arr = combo[0].ravel() + sigma_arr = combo[1].ravel() + + # Perform grid search + with suppress_warnings(): + # Compute ND + ND_preds = LognormalPSD.formula(D=D[None, :], Nt=Nt, mu=mu_arr[:, None], sigma=sigma_arr[:, None]) + + # Compute errors + errors = _compute_cost_function( + ND_obs=ND_obs, + ND_preds=ND_preds, + D=D, + dD=dD, + V=V, + target=target, + transformation=transformation, + error_order=error_order, + ) + + # Best parameter + best_index = np.argmin(errors) + return Nt, mu_arr[best_index].item(), sigma_arr[best_index].item() + + +def apply_lognormal_gs( + Nt, + ND_obs, + V, + # Coords + D, + dD, + # Error options + target, + transformation, + error_order, +): + """Estimate LognormalPSD model parameters using Grid Search.""" + # Define initial set of parameters + mu_step = 0.5 + sigma_step = 0.5 + mu_values = np.arange(0.01, 20, step=mu_step) # TODO: define realistic values + sigma_values = np.arange(0, 20, step=sigma_step) # TODO: define realistic values + + # First round of GS + Nt, mu, sigma = _apply_lognormal_gs( + mu_values=mu_values, + sigma_values=sigma_values, + Nt=Nt, + ND_obs=ND_obs, + D=D, + dD=dD, + V=V, + target=target, + transformation=transformation, + error_order=error_order, + ) + + # Second round of GS + mu_values = np.arange(mu - mu_step * 2, mu + mu_step * 2, step=mu_step / 20) + sigma_values = np.arange(sigma - sigma_step * 2, sigma + sigma_step * 2, step=sigma_step / 20) + Nt, mu, sigma = _apply_lognormal_gs( + mu_values=mu_values, + sigma_values=sigma_values, + Nt=Nt, + ND_obs=ND_obs, + D=D, + dD=dD, + V=V, + target=target, + transformation=transformation, + error_order=error_order, + ) + + return np.array([Nt, mu, sigma]) + + +def apply_normalized_gamma_gs( + Nw, + D50, + ND_obs, + V, + # Coords + D, + dD, + # Error options + target, + transformation, + error_order, +): + """Estimate NormalizedGammaPSD model parameters using Grid Search.""" + # Define set of mu values + mu_arr = np.arange(0.01, 20, step=0.01) + + # Perform grid search + with suppress_warnings(): + # Compute ND + ND_preds = NormalizedGammaPSD.formula(D=D[None, :], D50=D50, Nw=Nw, mu=mu_arr[:, None]) + + # Compute errors + errors = _compute_cost_function( + ND_obs=ND_obs, + ND_preds=ND_preds, + D=D, + dD=dD, + V=V, + target=target, + transformation=transformation, + error_order=error_order, + ) + + # Identify best parameter set + mu = mu_arr[np.argmin(errors)] + return np.array([Nw, mu, D50]) + + +def 
get_exponential_parameters_gs(ds, target="ND", transformation="log", error_order=1): + """Estimate the parameters of an Exponential distribution using Grid Search.""" + # "target": ["ND", "LWC", "Z", "R"] + # "transformation": "log", "identity", "sqrt", # only for drop_number_concentration + # "error_order": 1, # MAE/MSE ... only for drop_number_concentration + + # Define kwargs + kwargs = { + "D": ds["diameter_bin_center"].data, + "dD": ds["diameter_bin_width"].data, + "target": target, + "transformation": transformation, + "error_order": error_order, + } + + # Fit distribution in parallel + da_params = xr.apply_ufunc( + apply_exponential_gs, + # Variables varying over time + ds["Nt"], + ds["drop_number_concentration"], + ds["fall_velocity"], + # Other options + kwargs=kwargs, + # Settings + input_core_dims=[[], ["diameter_bin_center"], ["diameter_bin_center"]], + output_core_dims=[["parameters"]], + vectorize=True, + dask="parallelized", + dask_gufunc_kwargs={"output_sizes": {"parameters": 2}}, # lengths of the new output_core_dims dimensions. + output_dtypes=["float64"], + ) + + # Add parameters coordinates + da_params = da_params.assign_coords({"parameters": ["N0", "Lambda"]}) + + # Create parameters dataset + ds_params = da_params.to_dataset(dim="parameters") + + # Add DSD model name to the attribute + ds_params.attrs["disdrodb_psd_model"] = "ExponentialPSD" + return ds_params + + +def get_gamma_parameters_gs(ds, target="ND", transformation="log", error_order=1): + """Compute Grid Search to identify mu and Lambda Gamma distribution parameters.""" + # "target": ["ND", "LWC", "Z", "R"] + # "transformation": "log", "identity", "sqrt", # only for drop_number_concentration + # "error_order": 1, # MAE/MSE ... only for drop_number_concentration + + # Define kwargs + kwargs = { + "D": ds["diameter_bin_center"].data, + "dD": ds["diameter_bin_width"].data, + "target": target, + "transformation": transformation, + "error_order": error_order, + } + + # Fit distribution in parallel + da_params = xr.apply_ufunc( + apply_gamma_gs, + # Variables varying over time + ds["Nt"], + ds["drop_number_concentration"], + ds["fall_velocity"], + # Other options + kwargs=kwargs, + # Settings + input_core_dims=[[], ["diameter_bin_center"], ["diameter_bin_center"]], + output_core_dims=[["parameters"]], + vectorize=True, + dask="parallelized", + dask_gufunc_kwargs={"output_sizes": {"parameters": 3}}, # lengths of the new output_core_dims dimensions. + output_dtypes=["float64"], + ) + + # Add parameters coordinates + da_params = da_params.assign_coords({"parameters": ["N0", "mu", "Lambda"]}) + + # Create parameters dataset + ds_params = da_params.to_dataset(dim="parameters") + + # Add DSD model name to the attribute + ds_params.attrs["disdrodb_psd_model"] = "GammaPSD" + return ds_params + + +def get_lognormal_parameters_gs(ds, target="ND", transformation="log", error_order=1): + """Compute Grid Search to identify mu and sigma lognormal distribution parameters.""" + # "target": ["ND", "LWC", "Z", "R"] + # "transformation": "log", "identity", "sqrt", # only for drop_number_concentration + # "error_order": 1, # MAE/MSE ... 
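For intuition on the grid-search routines above (see `apply_exponential_gs`), the idea reduces to scanning candidate parameters, building the model N(D) on the observed bin centers, and keeping the candidate with the smallest error on the chosen target. A minimal one-parameter sketch on noise-free synthetic data (values are illustrative, and only the log-transformed ND target is shown):

```python
# Minimal 1-D grid search for an exponential PSD: scan Lambda and minimize the
# mean absolute error on log-transformed concentrations (synthetic data).
import numpy as np

D = np.linspace(0.2, 6.0, 30)    # bin centers [mm]
Nt = 1000.0                      # total concentration [m-3]
true_lambda = 2.4
ND_obs = Nt * true_lambda * np.exp(-true_lambda * D)  # noise-free "observation"

lambda_arr = np.arange(0.01, 20, 0.01)
ND_preds = (Nt * lambda_arr)[:, None] * np.exp(-lambda_arr[:, None] * D[None, :])

errors = np.mean(np.abs(np.log(ND_obs[None, :] + 1) - np.log(ND_preds + 1)), axis=1)
best = np.argmin(errors)
print(lambda_arr[best])  # ~2.4
```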
only for drop_number_concentration + + # Define kwargs + kwargs = { + "D": ds["diameter_bin_center"].data, + "dD": ds["diameter_bin_width"].data, + "target": target, + "transformation": transformation, + "error_order": error_order, + } + + # Fit distribution in parallel + da_params = xr.apply_ufunc( + apply_lognormal_gs, + # Variables varying over time + ds["Nt"], + ds["drop_number_concentration"], + ds["fall_velocity"], + # Other options + kwargs=kwargs, + # Settings + input_core_dims=[[], ["diameter_bin_center"], ["diameter_bin_center"]], + output_core_dims=[["parameters"]], + vectorize=True, + dask="parallelized", + dask_gufunc_kwargs={"output_sizes": {"parameters": 3}}, # lengths of the new output_core_dims dimensions. + output_dtypes=["float64"], + ) + + # Add parameters coordinates + da_params = da_params.assign_coords({"parameters": ["Nt", "mu", "sigma"]}) + + # Create parameters dataset + ds_params = da_params.to_dataset(dim="parameters") + + # Add DSD model name to the attribute + ds_params.attrs["disdrodb_psd_model"] = "LognormalPSD" + return ds_params + + +def get_normalized_gamma_parameters_gs(ds, target="ND", transformation="log", error_order=1): + r"""Estimate $\mu$ of a Normalized Gamma distribution using Grid Search. + + The D50 and Nw parameters of the Normalized Gamma distribution are derived empirically from the observed DSD. + $\mu$ is derived by minimizing the errors between the observed DSD and modelled Normalized Gamma distribution. + + Parameters + ---------- + Nd : array_like + A drop size distribution + D50: optional, float + Median drop diameter in mm. If none is given, it will be estimated. + Nw: optional, float + Normalized Intercept Parameter. If none is given, it will be estimated. + order: optional, float + Order to which square the error when computing the sum of errors. + Order = 2 is equivalent to minimize the mean squared error (MSE) (L2 norm). The default is 2. + Order = 1 is equivalent to minimize the mean absolute error (MAE) (L1 norm). + Higher orders typically stretch higher the gamma distribution. + + Returns + ------- + + + """ + # "target": ["ND", "LWC", "Z", "R"] + # "transformation": "log", "identity", "sqrt", # only for drop_number_concentration + # "error_order": 1, # MAE/MSE ... only for drop_number_concentration + + # Define kwargs + kwargs = { + "D": ds["diameter_bin_center"].data, + "dD": ds["diameter_bin_width"].data, + "target": target, + "transformation": transformation, + "error_order": error_order, + } + + # Fit distribution in parallel + da_params = xr.apply_ufunc( + apply_normalized_gamma_gs, + # Variables varying over time + ds["Nw"], + ds["D50"], + ds["drop_number_concentration"], + ds["fall_velocity"], + # Other options + kwargs=kwargs, + # Settings + input_core_dims=[[], [], ["diameter_bin_center"], ["diameter_bin_center"]], + output_core_dims=[["parameters"]], + vectorize=True, + dask="parallelized", + dask_gufunc_kwargs={"output_sizes": {"parameters": 3}}, # lengths of the new output_core_dims dimensions. + output_dtypes=["float64"], + ) + + # Add parameters coordinates + da_params = da_params.assign_coords({"parameters": ["Nw", "mu", "D50"]}) + + # Create parameters dataset + ds_params = da_params.to_dataset(dim="parameters") + + # Add DSD model name to the attribute + ds_params.attrs["disdrodb_psd_model"] = "NormalizedGammaPSD" + return ds_params + + +####-----------------------------------------------------------------. +#### Methods of Moments (MOM) +# - M246 DEFAULT FOR GAMMA ? 
+# - LMOM (Johnson et al., 2014) + + +def get_exponential_parameters_Zhang2008(moment_l, moment_m, l, m): # noqa: E741 + """Calculate Exponential DSD parameters using the method of moments (MOM). + + The choice of moments is given in the parameters. + + Parameters + ---------- + moment_l: float + First moment to use. + moment_l: float + Second moment to use. + l : float + Moment order. + m : float + Moment order, + + References + ---------- + [1] Zhang, et. al., 2008, Diagnosing the Intercept Parameter for Exponential Raindrop Size + Distribution Based on Video Disdrometer Observations: Model Development. J. Appl. + Meteor. Climatol., + https://doi.org/10.1175/2008JAMC1876.1 + """ + num = moment_l * gamma(m + 1) + den = moment_m * gamma(l + 1) + Lambda = np.power(num / den, (1 / (m - l))) + N0 = moment_l * np.power(Lambda, l + 1) / gamma(l + 1) + return N0, Lambda + + +def get_exponential_parameters_M34(moment_3, moment_4): + """Compute exponential distribution parameters following Testud 2001. + + References + ---------- + Testud, J., S. Oury, R. A. Black, P. Amayenc, and X. Dou, 2001: + The Concept of “Normalized” Distribution to Describe Raindrop Spectra: + A Tool for Cloud Physics and Cloud Remote Sensing. + J. Appl. Meteor. Climatol., 40, 1118-1140, + https://doi.org/10.1175/1520-0450(2001)040<1118:TCONDT>2.0.CO;2 + """ + N0 = 256 / gamma(4) * moment_3**5 / moment_4**4 + Dm = moment_4 / moment_3 + Lambda = 4 / Dm + return N0, Lambda + + +def get_gamma_parameters_M012(M0, M1, M2): + """Compute gamma distribution parameters following Cao et al., 2009. + + References + ---------- + Cao, Q., and G. Zhang, 2009: + Errors in Estimating Raindrop Size Distribution Parameters Employing Disdrometer and Simulated Raindrop Spectra. + J. Appl. Meteor. Climatol., 48, 406-425, https://doi.org/10.1175/2008JAMC2026.1. + """ + # TODO: really bad results. check formula ! + G = M1**3 / M0 / M2 + mu = 1 / (1 - G) - 2 + Lambda = M0 / M1 * (mu + 1) + N0 = Lambda ** (mu + 1) * M0 / gamma(mu + 1) + return N0, mu, Lambda + + +def get_gamma_parameters_M234(M2, M3, M4): + """Compute gamma distribution parameters following Cao et al., 2009. + + References + ---------- + Cao, Q., and G. Zhang, 2009: + Errors in Estimating Raindrop Size Distribution Parameters Employing Disdrometer and Simulated Raindrop Spectra. + J. Appl. Meteor. Climatol., 48, 406-425, https://doi.org/10.1175/2008JAMC2026.1. + """ + G = M3**2 / M2 / M4 + mu = 1 / (1 - G) - 4 + Lambda = M2 / M3 * (mu + 3) + N0 = Lambda ** (mu + 3) * M2 / gamma(mu + 3) + return N0, mu, Lambda + + +def get_gamma_parameters_M246(M2, M4, M6): + """Compute gamma distribution parameters following Ulbrich 1998. + + References + ---------- + Ulbrich, C. W., and D. Atlas, 1998: + Rainfall Microphysics and Radar Properties: Analysis Methods for Drop Size Spectra. + J. Appl. Meteor. Climatol., 37, 912-923, + https://doi.org/10.1175/1520-0450(1998)037<0912:RMARPA>2.0.CO;2 + + Cao, Q., and G. Zhang, 2009: + Errors in Estimating Raindrop Size Distribution Parameters Employing Disdrometer and Simulated Raindrop Spectra. + J. Appl. Meteor. Climatol., 48, 406-425, https://doi.org/10.1175/2008JAMC2026.1. + + Thurai, M., Williams, C.R., Bringi, V.N., 2014: + Examining the correlations between drop size distribution parameters using data + from two side-by-side 2D-video disdrometers. + Atmospheric Research, 144, 95-110, https://doi.org/10.1016/j.atmosres.2014.01.002. + """ + G = M4**2 / M2 / M6 + + # TODO: Different formulas ! 
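The method-of-moments estimators below invert analytic relations between the moments Mk and the gamma parameters. As a standalone check (not package code; the DSD parameters and diameter grid are illustrative), the M234 relations can be verified by computing M2, M3 and M4 from a synthetic binned spectrum as Mk = sum(N(D) * D**k * dD) and inverting them:

```python
# Sketch: recover (N0, mu, Lambda) from M2, M3, M4 of a synthetic gamma DSD.
import numpy as np
from scipy.special import gamma as gamma_f

N0_true, mu_true, lambda_true = 8000.0, 2.0, 3.0
D = np.linspace(0.05, 10, 200)        # bin centers [mm]
dD = np.full_like(D, D[1] - D[0])     # bin widths [mm]
ND = N0_true * D**mu_true * np.exp(-lambda_true * D)

M2, M3, M4 = (np.sum(ND * D**k * dD) for k in (2, 3, 4))

# M234 relations (Cao and Zhang, 2009)
G = M3**2 / M2 / M4
mu = 1 / (1 - G) - 4
Lambda = M2 / M3 * (mu + 3)
N0 = Lambda ** (mu + 3) * M2 / gamma_f(mu + 3)
print(mu, Lambda, N0)  # close to 2.0, 3.0, 8000 (up to discretization error)
```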
+ # Thurai et al., 2014 (A4), Ulbrich et al., 1998 (2) + # mu = ((7.0 - 11.0 * G) - + # np.sqrt((7.0 - 11.0 * G) ** 2.0 - 4.0 * (G - 1.0) * (30.0 * G - 12.0)) / (2.0 * (G - 1.0))) + mu = (7.0 - 11.0 * G) - np.sqrt(G**2 + 89 * G + 1) / (2.0 * (G - 1.0)) + + # Cao et al., 2009 (B3) + # --> Wrong ??? + mu = (7.0 - 11.0 * G) - np.sqrt(G**2 + 14 * G + 1) / (2.0 * (G - 1.0)) + + Lambda = np.sqrt((4 + mu) * (3 + mu) * M2 / M4) + # Cao et al., 2009 + N0 = M2 * Lambda ** (3 + mu) / gamma(3 + mu) + # # Thurai et al., 2014 + # N0 = M3 * Lambda ** (4 + mu) / gamma(4 + mu) + # # Ulbrich et al., 1998 + # N0 = M6 * Lambda ** (7.0 + mu) / gamma(7 + mu) + return N0, mu, Lambda + + +def get_gamma_parameters_M456(M4, M5, M6): + """Compute gamma distribution parameters following Cao et al., 2009. + + References + ---------- + Cao, Q., and G. Zhang, 2009: + Errors in Estimating Raindrop Size Distribution Parameters Employing Disdrometer and Simulated Raindrop Spectra. + J. Appl. Meteor. Climatol., 48, 406-425, https://doi.org/10.1175/2008JAMC2026.1. + """ + G = M5**2 / M4 / M6 + mu = 1 / (1 - G) - 6 + Lambda = M4 / M5 * (mu + 5) + N0 = Lambda ** (mu + 5) * M4 / gamma(mu + 5) + return N0, mu, Lambda + + +def get_gamma_parameters_M346(M3, M4, M6): + """Compute gamma distribution parameters following Kozu 1991. + + References + ---------- + Kozu, T., and K. Nakamura, 1991: + Rainfall Parameter Estimation from Dual-Radar Measurements + Combining Reflectivity Profile and Path-integrated Attenuation. + J. Atmos. Oceanic Technol., 8, 259-270, https://doi.org/10.1175/1520-0426(1991)008<0259:RPEFDR>2.0.CO;2 + + Tokay, A., and D. A. Short, 1996: + Evidence from Tropical Raindrop Spectra of the Origin of Rain from + Stratiform versus Convective Clouds. + J. Appl. Meteor. Climatol., 35, 355-371, + https://doi.org/10.1175/1520-0450(1996)035<0355:EFTRSO>2.0.CO;2 + + Cao, Q., and G. Zhang, 2009: + Errors in Estimating Raindrop Size Distribution Parameters Employing Disdrometer and Simulated Raindrop Spectra. + J. Appl. Meteor. Climatol., 48, 406-425, https://doi.org/10.1175/2008JAMC2026.1. + """ + G = M4**3 / M3**2 / M6 + + # Kozu + mu = (5.5 * G - 4 + np.sqrt(G * (G * 0.25 + 2))) / (1 - G) + + # Cao et al., 2009 (equivalent) + # mu = (11 * G - 8 + np.sqrt(G * (G + 8))) / (2 * (1 - G)) + + Lambda = (mu + 4) * M3 / M4 + N0 = Lambda ** (mu + 4) * M3 / gamma(mu + 4) + return N0, mu, Lambda + + +def get_lognormal_parameters_M346(M3, M4, M6): + """Compute lognormal distribution parameters following Kozu1991. + + References + ---------- + Kozu, T., and K. Nakamura, 1991: + Rainfall Parameter Estimation from Dual-Radar Measurements + Combining Reflectivity Profile and Path-integrated Attenuation. + J. Atmos. Oceanic Technol., 8, 259-270, https://doi.org/10.1175/1520-0426(1991)008<0259:RPEFDR>2.0.CO;2 + """ + L3 = np.log(M3) + L4 = np.log(M4) + L6 = np.log(M6) + Nt = np.exp((24 * L3 - 27 * L4 - 6 * L6) / 3) + mu = (-10 * L3 + 13.5 * L4 - 3.5 * L6) / 3 + sigma = (2 * L3 - 3 * L4 + L6) / 3 + return Nt, mu, sigma + + +def _get_gamma_parameters_mom(ds: xr.Dataset, mom_method: str) -> xr.Dataset: + # Get the correct function and list of variables for the requested method + func, needed_moments = MOM_METHODS_DICT["GammaPSD"][mom_method] + + # Extract the required arrays from the dataset + arrs = [ds[var_name] for var_name in needed_moments] + + # Apply the function. 
This will produce (mu, Lambda, N0) with the same coords/shapes as input data + N0, mu, Lambda = func(*arrs) + + # Return a new Dataset containing the results + ds = xr.Dataset( + { + "N0": N0, + "mu": mu, + "Lambda": Lambda, + }, + coords=ds.coords, + ) + return ds + + +def _get_lognormal_parameters_mom(ds: xr.Dataset, mom_method: str) -> xr.Dataset: + # Get the correct function and list of variables for the requested method + func, needed_moments = MOM_METHODS_DICT["LognormalPSD"][mom_method] + + # Extract the required arrays from the dataset + arrs = [ds[var_name] for var_name in needed_moments] + + # Apply the function. This will produce (mu, Lambda, N0) with the same coords/shapes as input data + Nt, mu, sigma = func(*arrs) + + # Return a new Dataset containing the results + ds = xr.Dataset( + { + "Nt": Nt, + "mu": mu, + "sigma": sigma, + }, + coords=ds.coords, + ) + return ds + + +def _get_exponential_parameters_mom(ds: xr.Dataset, mom_method: str) -> xr.Dataset: + # Get the correct function and list of variables for the requested method + func, needed_moments = MOM_METHODS_DICT["ExponentialPSD"][mom_method] + + # Extract the required arrays from the dataset + arrs = [ds[var_name] for var_name in needed_moments] + + # Apply the function. This will produce (mu, Lambda, N0) with the same coords/shapes as input data + N0, Lambda = func(*arrs) + + # Return a new Dataset containing the results + ds = xr.Dataset( + { + "N0": N0, + "Lambda": Lambda, + }, + coords=ds.coords, + ) + return ds + + +####--------------------------------------------------------------------------------------. +#### Routines dictionary + + +MOM_METHODS_DICT = { + "GammaPSD": { + # "M012": (get_gamma_parameters_M012, ["M0", "M1", "M2"]), + "M234": (get_gamma_parameters_M234, ["M2", "M3", "M4"]), + "M246": (get_gamma_parameters_M246, ["M2", "M4", "M6"]), + "M456": (get_gamma_parameters_M456, ["M4", "M5", "M6"]), + "M346": (get_gamma_parameters_M346, ["M3", "M4", "M6"]), + }, + "LognormalPSD": { + "M346": (get_lognormal_parameters_M346, ["M3", "M4", "M6"]), + }, + "ExponentialPSD": { + "M234": (get_exponential_parameters_M34, ["M3", "M4"]), + }, +} + + +OPTIMIZATION_ROUTINES_DICT = { + "MOM": { + "GammaPSD": _get_gamma_parameters_mom, + "LognormalPSD": _get_lognormal_parameters_mom, + "ExponentialPSD": _get_exponential_parameters_mom, + }, + "GS": { + "GammaPSD": get_gamma_parameters_gs, + "NormalizedGammaPSD": get_normalized_gamma_parameters_gs, + "LognormalPSD": get_lognormal_parameters_gs, + "ExponentialPSD": get_exponential_parameters_gs, + }, + "ML": { + "GammaPSD": get_gamma_parameters, + "LognormalPSD": get_lognormal_parameters, + "ExponentialPSD": get_exponential_parameters, + }, +} + + +def available_mom_methods(psd_model): + """Implemented MOM methods for a given PSD model.""" + return list(MOM_METHODS_DICT[psd_model]) + + +def available_optimization(psd_model): + """Implemented fitting methods for a given PSD model.""" + return [opt for opt in list(OPTIMIZATION_ROUTINES_DICT) if psd_model in OPTIMIZATION_ROUTINES_DICT[opt]] + + +####--------------------------------------------------------------------------------------. +#### Argument checkers + + +def check_psd_model(psd_model, optimization): + """Check valid psd_model argument.""" + valid_psd_models = list(OPTIMIZATION_ROUTINES_DICT[optimization]) + if psd_model not in valid_psd_models: + msg = ( + f"{optimization} optimization is not available for 'psd_model' {psd_model}. " + f"Accepted PSD models are {valid_psd_models}." 
+    )
+    raise ValueError(msg)
+
+
+def check_target(target):
+    """Check valid target argument."""
+    valid_targets = ["ND", "R", "Z", "LWC"]
+    if target not in valid_targets:
+        raise ValueError(f"Invalid 'target' {target}. Valid targets are {valid_targets}.")
+    return target
+
+
+def check_transformation(transformation):
+    """Check valid transformation argument."""
+    valid_transformation = ["identity", "log", "sqrt"]
+    if transformation not in valid_transformation:
+        raise ValueError(
+            f"Invalid 'transformation' {transformation}. Valid transformations are {valid_transformation}.",
+        )
+    return transformation
+
+
+def check_likelihood(likelihood):
+    """Check valid likelihood argument."""
+    valid_likelihood = ["multinomial", "poisson"]
+    if likelihood not in valid_likelihood:
+        raise ValueError(f"Invalid 'likelihood' {likelihood}. Valid values are {valid_likelihood}.")
+    return likelihood
+
+
+def check_truncated_likelihood(truncated_likelihood):
+    """Check valid truncated_likelihood argument."""
+    if not isinstance(truncated_likelihood, bool):
+        raise TypeError(f"Invalid 'truncated_likelihood' argument {truncated_likelihood}. Must be True or False.")
+    return truncated_likelihood
+
+
+def check_probability_method(probability_method):
+    """Check valid probability_method argument."""
+    # Check valid probability_method
+    valid_probability_method = ["cdf", "pdf"]
+    if probability_method not in valid_probability_method:
+        raise ValueError(
+            f"Invalid 'probability_method' {probability_method}. Valid values are {valid_probability_method}.",
+        )
+    return probability_method
+
+
+def check_optimizer(optimizer):
+    """Check valid optimizer argument."""
+    # Check valid optimizer
+    valid_optimizer = ["Nelder-Mead", "Powell", "L-BFGS-B"]
+    if optimizer not in valid_optimizer:
+        raise ValueError(
+            f"Invalid 'optimizer' {optimizer}. Valid values are {valid_optimizer}.",
+        )
+    return optimizer
+
+
+def check_mom_methods(mom_methods, psd_model):
+    """Check valid mom_methods arguments."""
+    if isinstance(mom_methods, str):
+        mom_methods = [mom_methods]
+    valid_mom_methods = available_mom_methods(psd_model)
+    invalid_mom_methods = np.array(mom_methods)[np.isin(mom_methods, valid_mom_methods, invert=True)]
+    if len(invalid_mom_methods) > 0:
+        raise ValueError(
+            f"Unknown mom_methods '{invalid_mom_methods}' for {psd_model}. Choose from {valid_mom_methods}.",
+        )
+    return mom_methods
+
+
+def check_optimization(optimization):
+    """Check valid optimization argument."""
+    valid_optimization = list(OPTIMIZATION_ROUTINES_DICT)
+    if optimization not in valid_optimization:
+        raise ValueError(
+            f"Invalid 'optimization' {optimization}. 
Valid procedure are {valid_optimization}.", + ) + return optimization + + +def check_optimization_kwargs(optimization_kwargs, optimization, psd_model): + """Check valid optimization_kwargs.""" + dict_arguments = { + "ML": { + "init_method": None, + "probability_method": check_probability_method, + "likelihood": check_likelihood, + "truncated_likelihood": check_truncated_likelihood, + "optimizer": check_optimizer, + }, + "GS": { + "target": check_target, + "transformation": check_transformation, + "error_order": None, + }, + "MOM": { + "mom_methods": None, + }, + } + optimization = check_optimization(optimization) + check_psd_model(psd_model=psd_model, optimization=optimization) + + # Retrieve the expected arguments for the given optimization method + expected_arguments = dict_arguments.get(optimization, {}) + + # Check for missing arguments in optimization_kwargs + missing_args = [arg for arg in expected_arguments if arg not in optimization_kwargs] + if missing_args: + raise ValueError(f"Missing required arguments for {optimization} optimization: {missing_args}") + + # Validate argument values + _ = [check(optimization_kwargs[arg]) for arg, check in expected_arguments.items() if callable(check)] + + # Further special checks + if optimization == "MOM": + _ = check_mom_methods(mom_methods=optimization_kwargs["mom_methods"], psd_model=psd_model) + if optimization == "ML": + if optimization_kwargs["init_method"] is not None: + _ = check_mom_methods(mom_methods=optimization_kwargs["init_method"], psd_model=psd_model) + + +####--------------------------------------------------------------------------------------. +#### Wrappers for fitting + + +def get_mom_parameters(ds: xr.Dataset, psd_model: str, mom_methods: str) -> xr.Dataset: + """ + Compute PSD model parameters using various method-of-moments (MOM) approaches. + + The method is specified by the `mom_methods` acronym, e.g. 'M012', 'M234', 'M246'. + + Parameters + ---------- + ds : xr.Dataset + An xarray Dataset with the required moments M0...M6 as data variables. + mom_methods: str or list + Valid MOM methods are {'M012', 'M234', 'M246', 'M456', 'M346'}. + + Returns + ------- + xr.Dataset + A Dataset containing mu, Lambda, and N0 variables. + If multiple mom_methods are specified, the dataset has the dimension mom_method. + + """ + # Check inputs + check_psd_model(psd_model=psd_model, optimization="MOM") + mom_methods = check_mom_methods(mom_methods, psd_model=psd_model) + + # Retrieve function + func = OPTIMIZATION_ROUTINES_DICT["MOM"][psd_model] + + # Compute parameters + if len(mom_methods) == 1: + ds = func(ds=ds, mom_method=mom_methods[0]) + ds.attrs["mom_method"] = mom_methods[0] + return ds + list_ds = [func(ds=ds, mom_method=mom_method) for mom_method in mom_methods] + ds = xr.concat(list_ds, dim="mom_method") + ds = ds.assign_coords({"mom_method": mom_methods}) + return ds + + +def get_ml_parameters( + ds, + psd_model, + init_method=None, + probability_method="cdf", + likelihood="multinomial", + truncated_likelihood=True, + optimizer="Nelder-Mead", +): + """ + Estimate model parameters for a given distribution using Maximum Likelihood. + + Parameters + ---------- + ds : xarray.Dataset + Input dataset containing drop number concentration data and diameter information. + It must include the following variables: + - ``drop_number_concentration``: The number concentration of drops. + - ``diameter_bin_width``": The width of each diameter bin. + - ``diameter_bin_lower``: The lower bounds of the diameter bins. 
+ - ``diameter_bin_upper``: The upper bounds of the diameter bins. + - ``diameter_bin_center``: The center values of the diameter bins. + psd_model : str + The PSD model to fit. See ``available_psd_models()``. + init_method: str or list + The method(s) of moments used to initialize the PSD model parameters. + See ``available_mom_methods(psd_model)``. + probability_method : str, optional + Method to compute probabilities. The default is ``cdf``. + likelihood : str, optional + Likelihood function to use for fitting. The default is ``multinomial``. + truncated_likelihood : bool, optional + Whether to use Truncated Maximum Likelihood (TML). The default is ``True``. + optimizer : str, optional + Optimization method to use. The default is ``Nelder-Mead``. + + Returns + ------- + xarray.Dataset + The dataset containing the estimated parameters. + + """ + # -----------------------------------------------------------------------------. + # Check arguments + check_psd_model(psd_model, optimization="ML") + likelihood = check_likelihood(likelihood) + probability_method = check_probability_method(probability_method) + optimizer = check_optimizer(optimizer) + + # Check valid init_method + if init_method is not None: + init_method = check_mom_methods(mom_methods=init_method, psd_model=psd_model) + + # Retrieve estimation function + func = OPTIMIZATION_ROUTINES_DICT["ML"][psd_model] + + # Retrieve parameters + ds_params = func( + ds=ds, + init_method=init_method, + probability_method=probability_method, + likelihood=likelihood, + truncated_likelihood=truncated_likelihood, + optimizer=optimizer, + ) + # Return dataset with parameters + return ds_params + + +def get_gs_parameters(ds, psd_model, target="ND", transformation="log", error_order=1): + # Check valid psd_model + check_psd_model(psd_model, optimization="GS") + + # Check valid target + target = check_target(target) + + # Check valid transformation + transformation = check_transformation(transformation) + + # Retrieve estimation function + func = OPTIMIZATION_ROUTINES_DICT["GS"][psd_model] + + # Estimate parameters + ds_params = func(ds, target=target, transformation=transformation, error_order=error_order) + + # Return dataset with parameters + return ds_params + + +def estimate_model_parameters( + ds, + psd_model, + optimization, + optimization_kwargs, +): + + optimization = check_optimization(optimization) + check_optimization_kwargs(optimization_kwargs=optimization_kwargs, optimization=optimization, psd_model=psd_model) + + # Define function + dict_func = { + "ML": get_ml_parameters, + "MOM": get_mom_parameters, + "GS": get_gs_parameters, + } + func = dict_func[optimization] + + # Retrieve parameters + ds_params = func(ds, psd_model=psd_model, **optimization_kwargs) + + # Finalize attributes + ds_params.attrs["disdrodb_psd_model"] = psd_model + ds_params.attrs["disdrodb_psd_optimization"] = optimization + if optimization == "GS": + ds_params.attrs["disdrodb_psd_optimization_target"] = optimization_kwargs["target"] + + return ds_params diff --git a/disdrodb/psd/models.py b/disdrodb/psd/models.py new file mode 100644 index 00000000..41e1e28b --- /dev/null +++ b/disdrodb/psd/models.py @@ -0,0 +1,729 @@ +# -----------------------------------------------------------------------------. 
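A possible end-to-end use of the `estimate_model_parameters` wrapper defined in the fitting module above, here with the MOM path so that only the moment variables are needed. The import path is an assumption (the fitting module's location is not shown in this hunk; adjust to wherever the file lives in the package), and the moments are generated analytically for a known gamma DSD so the recovered parameters can be checked:

```python
# Hypothetical usage sketch; the module path "disdrodb.psd.fitting" is assumed.
import numpy as np
import xarray as xr
from scipy.special import gamma as gamma_f
from disdrodb.psd.fitting import estimate_model_parameters  # assumed path

# Analytic moments of a gamma DSD with N0=8000, mu=2, Lambda=3, repeated over 2 timesteps
N0, mu, lam = 8000.0, 2.0, 3.0
moments = {f"M{k}": N0 * gamma_f(mu + k + 1) / lam ** (mu + k + 1) for k in (2, 3, 4)}
ds = xr.Dataset(
    {name: ("time", np.full(2, value)) for name, value in moments.items()},
    coords={"time": [0, 1]},
)

ds_params = estimate_model_parameters(
    ds,
    psd_model="GammaPSD",
    optimization="MOM",
    optimization_kwargs={"mom_methods": "M234"},
)
print(ds_params[["N0", "mu", "Lambda"]])          # ~8000, ~2, ~3
print(ds_params.attrs["disdrodb_psd_model"])      # "GammaPSD"
```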
+# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Definition of PSD models. + +The class implementation is inspired by pytmatrix.psd and pyradsim.psd modules +and adapted to allow efficient vectorized computations with xarray. + +Source code: +- https://github.com/jleinonen/pytmatrix/blob/master/pytmatrix/psd.py +- https://github.com/wolfidan/pyradsim/blob/master/pyradsim/psd.py + +""" + +import numpy as np +import xarray as xr +from pytmatrix.psd import PSD +from scipy.special import gamma as gamma_f +from scipy.stats import expon, gamma, lognorm + +# psd.log_likelihood +# psd.moment(order) +# psd.mean +# psd.variance +# psd.mode + +# TODO +# - psd.isel(**kwargs) +# - psd.sel(**kwargs) + +# __eq__ +# --> Generalize using self.parameters and deep diff + + +# ------------------------------------------------------------------------------------------------------------. + + +def available_psd_models(): + """Return a list of available PSD models.""" + return list(PSD_MODELS_DICT) + + +def check_psd_model(psd_model): + """Check validity of a PSD model.""" + available_models = available_psd_models() + if psd_model not in available_models: + raise ValueError(f"{psd_model} is an invalid PSD model. 
Valid models are: {available_models}.") + return psd_model + + +def get_psd_model(psd_model): + """Retrieve the PSD Class.""" + return PSD_MODELS_DICT[psd_model] + + +def get_psd_model_formula(psd_model): + """Retrieve the PSD formula.""" + return PSD_MODELS_DICT[psd_model].formula + + +def create_psd(psd_model, parameters): # TODO: check name around + """Define a PSD from a dictionary or xr.Dataset of parameters.""" + psd_class = get_psd_model(psd_model) + psd = psd_class.from_parameters(parameters) + return psd + + +def get_required_parameters(psd_model): + """Retrieve the list of parameters required by a PSD model.""" + psd_class = get_psd_model(psd_model) + return psd_class.required_parameters() + + +def clip_values(D, values, Dmax=np.inf): + """Clip values outside the [Dmin,Dmax) interval to 0.""" + # Handle scalar input + if np.isscalar(D): + if Dmax < D or D == 0.0: + return 0.0 + return values + + # Handle numpy array input + if isinstance(values, np.ndarray): + mask = (Dmax < D) | (D == 0) + values = np.where(mask, 0, values) + + # Handle xarray.DataArray input + elif isinstance(values, xr.DataArray): + values = xr.where(np.logical_or(Dmax < D, D == 0), 0, values) + values = values.where(~np.isnan(values).any(dim="diameter_bin_center")) + else: + raise TypeError("Input 'D' and 'values' must be a scalar, numpy array or an xarray.DataArray.") + return values + + +def is_scalar(value): + """Determines if the input value is a scalar.""" + return isinstance(value, (float, int)) or isinstance(value, (np.ndarray, xr.DataArray)) and value.size == 1 + + +class XarrayPSD(PSD): + """PSD class template allowing vectorized computations with xarray. + + We currently inherit from pytmatrix PSD to allow scattering simulations: + --> https://github.com/ltelab/pytmatrix-lte/blob/880170b4ca62a04e8c843619fa1b8713b9e11894/pytmatrix/psd.py#L321 + """ + + def __eq__(self, other): + """Check if two objects are equal.""" + return False + + def has_scalar_parameters(self): + """Check if the PSD object contains only a single set of parameters.""" + return np.all(is_scalar(value) for param, value in self.parameters.items()) + + def formula(self, D, **parameters): + """PSD formula.""" + pass + + def __call__(self, D): + """Compute the PSD.""" + values = self.formula(D=D, **self.parameters) + return clip_values(D=D, values=values, Dmax=self.Dmax) + + def moment(self, order, nbins_diam=1024): + """ + Compute the moments of the Particle Size Distribution (PSD). + + Parameters + ---------- + order : int + The order of the moment to compute. + nbins_diam : int, optional + The number of bins to use for the diameter range (default is 1024). + + Returns + ------- + float + The computed moment of the PSD. + + Notes + ----- + The method uses numerical integration (trapezoidal rule) to compute the moment. + """ + dbins = np.linspace(self.Dmin, self.Dmax, nbins_diam) + dD = dbins[1] - dbins[0] + return np.trapz(dbins**order * self.__call__(dbins), dx=dD) + + +class LognormalPSD(XarrayPSD): + """Lognormal drop size distribution (DSD). + + Callable class to provide a lognormal PSD with the given parameters. 
+ + The PSD form is: + + N(D) = Nt/(sqrt(2*pi)*sigma*D)) * exp(-(ln(D)-mu)**2 / (2*sigma**2)) + + # g = sigma + # theta = 0 + + Attributes + ---------- + Nt: + g: + theta: + mu: + sigma: + + """ + + def __init__(self, Nt=1.0, mu=0.0, sigma=1.0, Dmin=0, Dmax=None, coverage=0.999): + self.Nt = Nt + self.mu = mu + self.sigma = sigma + self.parameters = {"Nt": self.Nt, "mu": self.mu, "sigma": self.sigma} + # Define Dmin and Dmax + self.Dmin = Dmin + if Dmax is not None: + self.Dmax = Dmax + else: + dmax = lognorm.ppf(coverage, s=self.sigma, scale=np.exp(self.mu)) + if isinstance(self.sigma, xr.DataArray): + self.Dmax = xr.DataArray(dmax, dims=self.sigma.dims, coords=self.sigma.coords) + else: + self.Dmax = dmax + + @staticmethod + def required_parameters(): + """Return the required parameters of the PSD.""" + return ["Nt", "mu", "sigma"] + + @property + def name(self): + """Return name of the PSD.""" + return "LognormalPSD" + + @staticmethod + def from_parameters(parameters): + """Initialize LognormalPSD from a dictionary or xr.Dataset. + + Args: + parameters (dict or xr.Dataset): Parameters to initialize the class. + + Returns + ------- + LognormalPSD: An instance of LognormalPSD initialized with the parameters. + """ + Nt = parameters["Nt"] + mu = parameters["mu"] + sigma = parameters["sigma"] + return LognormalPSD(Nt=Nt, mu=mu, sigma=sigma) + + def parameters_summary(self): + """Return a string with the parameter summary.""" + if self.has_scalar_parameters(): + summary = "".join( + [ + f"{self.name}\n", + f"$Nt = {self.Nt:.2f}$\n", + f"$\\sigma = {self.sigma:.2f}$\n" f"$\\mu = {self.mu:.2f}$\n\n", + ], + ) + else: + summary = "" f"{self.name} with N-d parameters \n" + return summary + + @staticmethod + def formula(D, Nt, mu, sigma): + """Calculates the Lognormal PSD values.""" + coeff = Nt / (np.sqrt(2.0 * np.pi) * sigma * (D)) + expon = np.exp(-((np.log(D) - mu) ** 2) / (2.0 * sigma**2)) + return coeff * expon + + # def __eq__(self, other): + # try: + # return isinstance(other, ExponentialPSD) and \ + # (self.N0 == other.N0) and (self.Lambda == other.Lambda) and \ + # (self.Dmax == other.Dmax) + # except AttributeError: + # return False + + # params dictionary ! + + +class ExponentialPSD(XarrayPSD): + """Exponential particle size distribution (PSD). + + Callable class to provide an exponential PSD with the given + parameters. The attributes can also be given as arguments to the + constructor. + + The PSD form is: + N(D) = N0 * exp(-Lambda*D) + + Attributes + ---------- + N0: the intercept parameter. + Lambda: the inverse scale parameter + Dmax: the maximum diameter to consider (defaults to 11/Lambda, i.e. approx. 3*D50, if None) + + Args (call): + D: the particle diameter. + + Returns (call): + The PSD value for the given diameter. + Returns 0 for all diameters larger than Dmax. 
+ """ + + def __init__(self, N0=1.0, Lambda=1.0, Dmin=0, Dmax=None, coverage=0.999): + # Define parameters + self.N0 = N0 + self.Lambda = Lambda + self.parameters = {"N0": self.N0, "Lambda": self.Lambda} + + # Define Dmin and Dmax + self.Dmin = Dmin + if Dmax is not None: + self.Dmax = Dmax + else: + dmax = expon.ppf(coverage, scale=1 / self.Lambda) + if isinstance(self.Lambda, xr.DataArray): + self.Dmax = xr.DataArray(dmax, dims=self.Lambda.dims, coords=self.Lambda.coords) + else: + self.Dmax = dmax + + @staticmethod + def required_parameters(): + """Return the required parameters of the PSD.""" + return ["N0", "Lambda"] + + @property + def name(self): + """Return name of the PSD.""" + return "ExponentialPSD" + + @staticmethod + def from_parameters(parameters): + """Initialize ExponentialPSD from a dictionary or xr.Dataset. + + Args: + parameters (dict or xr.Dataset): Parameters to initialize the class. + + Returns + ------- + ExponentialPSD: An instance of ExponentialPSD initialized with the parameters. + """ + N0 = parameters["N0"] + Lambda = parameters["Lambda"] + return ExponentialPSD(N0=N0, Lambda=Lambda) + + def parameters_summary(self): + """Return a string with the parameter summary.""" + if self.has_scalar_parameters(): + summary = "".join( + [ + f"{self.name}\n", + f"$N0 = {self.N0:.2f}$\n", + f"$\\lambda = {self.Lambda:.2f}$\n\n", + ], + ) + else: + summary = "" f"{self.name} with N-d parameters \n" + return summary + + @staticmethod + def formula(D, N0, Lambda): + """Calculates the Exponential PSD values.""" + return N0 * np.exp(-Lambda * D) + + def __eq__(self, other): + """Check if two objects are equal.""" + try: + return ( + isinstance(other, ExponentialPSD) + and (self.N0 == other.N0) + and (self.Lambda == other.Lambda) + and (self.Dmax == other.Dmax) + ) + except AttributeError: + return False + + +class GammaPSD(ExponentialPSD): + """Gamma particle size distribution (PSD). + + Callable class to provide an gamma PSD with the given + parameters. The attributes can also be given as arguments to the + constructor. + + The PSD form is: + N(D) = N0 * D**mu * exp(-Lambda*D) + + Attributes + ---------- + N0: the intercept parameter [mm**(-1-mu) m**-3] (scale parameter) + Lambda: the inverse scale parameter [mm-1] (slope parameter) + mu: the shape parameter [-] + Dmax: the maximum diameter to consider (defaults to 11/Lambda, + i.e. approx. 3*D50, if None) + + Args (call): + D: the particle diameter. + + Returns (call): + The PSD value for the given diameter. + Returns 0 for all diameters larger than Dmax. + + References + ---------- + Ulbrich, C. W., 1985: The Effects of Drop Size Distribution Truncation on + Rainfall Integral Parameters and Empirical Relations. + J. Appl. Meteor. 
Climatol., 24, 580-590, https://doi.org/10.1175/1520-0450(1985)024<0580:TEODSD>2.0.CO;2 + """ + + def __init__(self, N0=1.0, mu=0.0, Lambda=1.0, Dmin=0, Dmax=None, coverage=0.999): + # Define parameters + self.N0 = N0 + self.Lambda = Lambda + self.mu = mu + self.parameters = {"N0": self.N0, "mu": self.mu, "Lambda": self.Lambda} + # Define Dmin and Dmax + self.Dmin = Dmin + if Dmax is not None: + self.Dmax = Dmax + else: + dmax = gamma.ppf(coverage, a=self.mu + 1.0, scale=1.0 / self.Lambda) + if isinstance(self.Lambda, xr.DataArray): + self.Dmax = xr.DataArray(dmax, dims=self.Lambda.dims, coords=self.Lambda.coords) + else: + self.Dmax = dmax + + @staticmethod + def required_parameters(): + """Return the required parameters of the PSD.""" + return ["N0", "mu", "Lambda"] + + @property + def name(self): + """Return name of the PSD.""" + return "GammaPSD" + + @staticmethod + def from_parameters(parameters): + """Initialize GammaPSD from a dictionary or xr.Dataset. + + Args: + parameters (dict or xr.Dataset): Parameters to initialize the class. + + Returns + ------- + GammaPSD: An instance of GammaPSD initialized with the parameters. + """ + N0 = parameters["N0"] + Lambda = parameters["Lambda"] + mu = parameters["mu"] + return GammaPSD(N0=N0, Lambda=Lambda, mu=mu) + + def parameters_summary(self): + """Return a string with the parameter summary.""" + if self.has_scalar_parameters(): + summary = "".join( + [ + f"{self.name}\n", + f"$\\mu = {self.mu:.2f}$\n", + f"$N0 = {self.N0:.2f}$\n", + f"$\\lambda = {self.Lambda:.2f}$\n\n", + ], + ) + else: + summary = "" f"{self.name} with N-d parameters \n" + return summary + + @staticmethod + def formula(D, N0, Lambda, mu): + """Calculates the Gamma PSD values.""" + return N0 * np.exp(mu * np.log(D) - Lambda * D) + + def __eq__(self, other): + """Check if two objects are equal.""" + try: + return super().__eq__(other) and self.mu == other.mu + except AttributeError: + return False + + +class NormalizedGammaPSD(XarrayPSD): + """Normalized gamma particle size distribution (PSD). + + Callable class to provide a normalized gamma PSD with the given + parameters. The attributes can also be given as arguments to the + constructor. + + The PSD form is: + + N(D) = Nw * f(mu) * (D/D50)**mu * exp(-(mu+3.67)*D/D50) + f(mu) = 6/(3.67**4) * (mu+3.67)**(mu+4)/Gamma(mu+4) + + An alternative formulation as function of Dm: + # Testud (2001), Bringi (2001), Williams et al., 2014, Dolan 2018 + # --> Normalized with respect to liquid water content (mass) --> Nx=D3/Dm4 + N(D) = Nw * f1(mu) * (D/Dm)**mu * exp(-(mu+4)*D/Dm) # Nw * f(D; Dm, mu) + f1(mu) = 6/(4**4) * (mu+4)**(mu+4)/Gamma(mu+4) + + Note: gamma(4) = 6 + + An alternative formulation as function of Dm: + # Tokay et al., 2010 + # Illingworth et al., 2002 (see eq10 to derive full formulation!) + # --> Normalized with respect to total concentration --> Nx = #/Dm + N(D) = Nt* * f2(mu) * (D/Dm)**mu * exp(-(mu+4)*D/Dm) + f2(mu) = (mu+4)**(mu+1)/Gamma(mu+1) + + Attributes + ---------- + D50: the median volume diameter. + Nw: the intercept parameter. + mu: the shape parameter. + Dmax: the maximum diameter to consider (defaults to 3*D50 when + if None) + + Args (call): + D: the particle diameter. + + Returns (call): + The PSD value for the given diameter. + Returns 0 for all diameters larger than Dmax. + + References + ---------- + Willis, P. T., 1984: Functional Fits to Some Observed Drop Size Distributions and Parameterization of Rain. + J. Atmos. 
Sci., 41, 1648-1661, https://doi.org/10.1175/1520-0469(1984)041<1648:FFTSOD>2.0.CO;2 + + Testud, J., S. Oury, R. A. Black, P. Amayenc, and X. Dou, 2001: The Concept of “Normalized” Distribution + to Describe Raindrop Spectra: A Tool for Cloud Physics and Cloud Remote Sensing. + J. Appl. Meteor. Climatol., 40, 1118-1140, https://doi.org/10.1175/1520-0450(2001)040<1118:TCONDT>2.0.CO;2 + + Illingworth, A. J., and T. M. Blackman, 2002: + The Need to Represent Raindrop Size Spectra as Normalized Gamma Distributions for + the Interpretation of Polarization Radar Observations. + J. Appl. Meteor. Climatol., 41, 286-297, https://doi.org/10.1175/1520-0450(2002)041<0286:TNTRRS>2.0.CO;2 + + Bringi, V. N., G. Huang, V. Chandrasekar, and E. Gorgucci, 2002: + A Methodology for Estimating the Parameters of a Gamma Raindrop Size Distribution Model from + Polarimetric Radar Data: Application to a Squall-Line Event from the TRMM/Brazil Campaign. + J. Atmos. Oceanic Technol., 19, 633-645, https://doi.org/10.1175/1520-0426(2002)019<0633:AMFETP>2.0.CO;2 + + Bringi, V. N., V. Chandrasekar, J. Hubbert, E. Gorgucci, W. L. Randeu, and M. Schoenhuber, 2003: + Raindrop Size Distribution in Different Climatic Regimes from Disdrometer and Dual-Polarized Radar Analysis. + J. Atmos. Sci., 60, 354-365, https://doi.org/10.1175/1520-0469(2003)060<0354:RSDIDC>2.0.CO;2 + + Tokay, A., and P. G. Bashor, 2010: An Experimental Study of Small-Scale Variability of Raindrop Size Distribution. + J. Appl. Meteor. Climatol., 49, 2348-2365, https://doi.org/10.1175/2010JAMC2269.1 + + """ + + def __init__(self, Nw=1.0, D50=1.0, mu=0.0, Dmin=0, Dmax=None): + self.D50 = D50 + self.mu = mu + self.Dmin = Dmin + self.Dmax = 3.0 * D50 if Dmax is None else Dmax + self.Nw = Nw + self.parameters = {"Nw": Nw, "D50": D50, "mu": mu} + + @staticmethod + def required_parameters(): + """Return the required parameters of the PSD.""" + return ["Nw", "D50", "mu"] + + @property + def name(self): + """Return the PSD name.""" + return "NormalizedGammaPSD" + + @staticmethod + def from_parameters(parameters): + """Initialize NormalizedGammaPSD from a dictionary or xr.Dataset. + + Args: + parameters (dict or xr.Dataset): Parameters to initialize the class. + + Returns + ------- + NormalizedGammaPSD: An instance of NormalizedGammaPSD initialized with the parameters. 
+ """ + D50 = parameters["D50"] + Nw = parameters["Nw"] + mu = parameters["mu"] + return NormalizedGammaPSD(D50=D50, Nw=Nw, mu=mu) + + @staticmethod + def formula(D, Nw, D50, mu): + """Calculates the NormalizedGamma PSD values.""" + d_ratio = D / D50 + nf = Nw * 6.0 / 3.67**4 * (3.67 + mu) ** (mu + 4) / gamma_f(mu + 4) + return nf * np.exp(mu * np.log(d_ratio) - (3.67 + mu) * d_ratio) + + def parameters_summary(self): + """Return a string with the parameter summary.""" + if self.has_scalar_parameters(): + summary = "".join( + [ + f"{self.name}\n", + f"$\\mu = {self.mu:.2f}$\n", + f"$Nw = {self.Nw:.2f}$\n", + f"$D50 = {self.D50:.2f}$\n", + ], + ) + else: + summary = "" f"{self.name} with N-d parameters \n" + return summary + + def __eq__(self, other): + """Check if two objects are equal.""" + try: + return ( + isinstance(other, NormalizedGammaPSD) + and (self.D50 == other.D50) + and (self.Nw == other.Nw) + and (self.mu == other.mu) + and (self.Dmax == other.Dmax) + ) + except AttributeError: + return False + + +PSD_MODELS_DICT = { + "LognormalPSD": LognormalPSD, + "ExponentialPSD": ExponentialPSD, + "GammaPSD": GammaPSD, + "NormalizedGammaPSD": NormalizedGammaPSD, +} + + +class BinnedPSD(PSD): + """Binned gamma particle size distribution (PSD). + + Callable class to provide a binned PSD with the given bin edges and PSD + values. + + Args (constructor): + The first argument to the constructor should specify n+1 bin edges, + and the second should specify n bin_psd values. + + Args (call): + D: the particle diameter. + + Returns (call): + The PSD value for the given diameter. + Returns 0 for all diameters outside the bins. + """ + + def __init__(self, bin_edges, bin_psd): + if len(bin_edges) != len(bin_psd) + 1: + raise ValueError("There must be n+1 bin edges for n bins.") + + self.bin_edges = bin_edges + self.bin_psd = bin_psd + + def psd_for_D(self, D): + """ + Calculate the particle size distribution (PSD) for a given diameter D. + + Parameters + ---------- + D : float + The diameter for which to calculate the PSD. + + Returns + ------- + float + The PSD value corresponding to the given diameter D. Returns 0.0 if D is outside the range of bin edges. + + Notes + ----- + This method uses a binary search algorithm to find the appropriate bin for the given diameter D. + """ + if not (self.bin_edges[0] < D <= self.bin_edges[-1]): + return 0.0 + + # binary search for the right bin + start = 0 + end = len(self.bin_edges) + while end - start > 1: + half = (start + end) // 2 + if self.bin_edges[start] < D <= self.bin_edges[half]: + end = half + else: + start = half + + return self.bin_psd[start] + + def __call__(self, D): + """Compute the PSD.""" + if np.shape(D) == (): # D is a scalar + return self.psd_for_D(D) + return np.array([self.psd_for_D(d) for d in D]) + + def __eq__(self, other): + """Check PSD equality.""" + if other is None: + return False + return ( + len(self.bin_edges) == len(other.bin_edges) + and (self.bin_edges == other.bin_edges).all() + and (self.bin_psd == other.bin_psd).all() + ) + + +####-----------------------------------------------------------------. +#### Moments Computation + + +def get_exponential_moment(N0, Lambda, moment): + """Compute exponential distribution moments.""" + return N0 * gamma_f(moment + 1) / Lambda ** (moment + 1) + + +def get_gamma_moment_v1(N0, mu, Lambda, moment): + """Compute gamma distribution moments. + + References + ---------- + Kozu, T., and K. 
Nakamura, 1991: + Rainfall Parameter Estimation from Dual-Radar Measurements + Combining Reflectivity Profile and Path-integrated Attenuation. + J. Atmos. Oceanic Technol., 8, 259-270, https://doi.org/10.1175/1520-0426(1991)008<0259:RPEFDR>2.0.CO;2 + """ + # Zhang et al 2001: N0 * gamma_f(mu + moment + 1) * Lambda ** (-(mu + moment + 1)) + return N0 * gamma_f(mu + moment + 1) / Lambda ** (mu + moment + 1) + + +def get_gamma_moment_v2(Nt, mu, Lambda, moment): + """Compute gamma distribution moments. + + References + ---------- + Kozu, T., and K. Nakamura, 1991: + Rainfall Parameter Estimation from Dual-Radar Measurements + Combining Reflectivity Profile and Path-integrated Attenuation. + J. Atmos. Oceanic Technol., 8, 259-270, https://doi.org/10.1175/1520-0426(1991)008<0259:RPEFDR>2.0.CO;2 + """ + return Nt * gamma_f(mu + moment + 1) / gamma_f(mu + 1) / Lambda**moment + + +def get_lognormal_moment(Nt, sigma, mu, moment): + """Compute lognormal distribution moments. + + References + ---------- + Kozu, T., and K. Nakamura, 1991: + Rainfall Parameter Estimation from Dual-Radar Measurements + Combining Reflectivity Profile and Path-integrated Attenuation. + J. Atmos. Oceanic Technol., 8, 259-270, https://doi.org/10.1175/1520-0426(1991)008<0259:RPEFDR>2.0.CO;2 + """ + return Nt * np.exp(moment * mu + 1 / 2 * moment * sigma**2) diff --git a/disdrodb/routines.py b/disdrodb/routines.py new file mode 100644 index 00000000..1d70c8ed --- /dev/null +++ b/disdrodb/routines.py @@ -0,0 +1,1058 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""DISDRODB CLI routine wrappers.""" +import datetime +import time +from typing import Optional + +from disdrodb.api.io import available_stations, get_required_product +from disdrodb.utils.cli import _execute_cmd + +####--------------------------------------------------------------------------. +#### Run DISDRODB Station Processing + + +def run_disdrodb_l0_station( + data_source, + campaign_name, + station_name, + # L0 archive options + l0a_processing: bool = True, + l0b_processing: bool = True, + l0c_processing: bool = True, + remove_l0a: bool = False, + remove_l0b: bool = False, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L0 processing of a specific DISDRODB station from the terminal. + + Parameters + ---------- + data_source : str + Institution name (when campaign data spans more than 1 country), + or country (when all campaigns (or sensor networks) are inside a given country). + Must be UPPER CASE. + campaign_name : str + Campaign name. Must be UPPER CASE. 
+ station_name : str + Station name + l0a_processing : bool + Whether to launch processing to generate L0A Apache Parquet file(s) from raw data. + The default is ``True``. + l0b_processing : bool + Whether to launch processing to generate L0B netCDF4 file(s) from L0A data. + The default is ``True``. + l0b_processing : bool + Whether to launch processing to generate L0C netCDF4 file(s) from L0B data. + The default is ``True``. + l0c_processing : bool + Whether to launch processing to generate L0C netCDF4 file(s) from L0C data. + The default is True. + remove_l0a : bool + Whether to keep the L0A files after having generated the L0B netCDF products. + The default is ``False``. + remove_l0b : bool + Whether to remove the L0B files after having produced L0C netCDF files. + The default is False. + force : bool + If ``True``, overwrite existing data into destination directories. + If ``False``, raise an error if there are already data into destination directories. + The default is ``False``. + verbose : bool + Whether to print detailed processing information into terminal. + The default is ``True``. + parallel : bool + If ``True``, the files are processed simultaneously in multiple processes. + Each process will use a single thread to avoid issues with the HDF/netCDF library. + By default, the number of process is defined with ``os.cpu_count()``. + If ``False``, the files are processed sequentially in a single process. + If ``False``, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If ``True``, it reduces the amount of data to process. + For L0A, it processes just the first 3 raw data files for each station. + For L0B, it processes just the first 100 rows of 3 L0A files for each station. + The default is ``False``. + base_dir : str (optional) + Base directory of DISDRODB. Format: ``<...>/DISDRODB``. + If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. + """ + # ---------------------------------------------------------------------. + t_i = time.time() + print(f"L0 processing of station {station_name} has started.") + + # ------------------------------------------------------------------. + # L0A processing + if l0a_processing: + run_disdrodb_l0a_station( + # Station arguments + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Processing options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + # ------------------------------------------------------------------. + # L0B processing + if l0b_processing: + run_disdrodb_l0b_station( + # Station arguments + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # L0B processing options + remove_l0a=remove_l0a, + # Processing options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + + # ------------------------------------------------------------------. + # L0C processing + if l0c_processing: + run_disdrodb_l0c_station( + # Station arguments + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # L0C processing options + remove_l0b=remove_l0b, + # Processing options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + + # -------------------------------------------------------------------------. 
+ # End of L0 processing for all stations + timedelta_str = str(datetime.timedelta(seconds=round(time.time() - t_i))) + print(f"L0 processing of stations {station_name} completed in {timedelta_str}") + + +def run_disdrodb_l0a_station( + # Station arguments + data_source, + campaign_name, + station_name, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L0A processing of a station calling the disdrodb_l0a_station in the terminal.""" + # Define command + cmd = " ".join( + [ + "disdrodb_run_l0a_station", + # Station arguments + data_source, + campaign_name, + station_name, + # Processing options + "--force", + str(force), + "--verbose", + str(verbose), + "--debugging_mode", + str(debugging_mode), + "--parallel", + str(parallel), + "--base_dir", + str(base_dir), + ], + ) + # Execute command + _execute_cmd(cmd) + + +def run_disdrodb_l0b_station( + # Station arguments + data_source, + campaign_name, + station_name, + # L0B processing options + remove_l0a: bool = False, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L0B processing of a station calling disdrodb_run_l0b_station in the terminal.""" + # Define command + cmd = " ".join( + [ + "disdrodb_run_l0b_station", + # Station arguments + data_source, + campaign_name, + station_name, + # L0B processing options + "--remove_l0a", + str(remove_l0a), + # Processing options + "--force", + str(force), + "--verbose", + str(verbose), + "--debugging_mode", + str(debugging_mode), + "--parallel", + str(parallel), + "--base_dir", + str(base_dir), + ], + ) + # Execute command + _execute_cmd(cmd) + + +def run_disdrodb_l0c_station( + # Station arguments + data_source, + campaign_name, + station_name, + # L0C options + remove_l0b: bool = False, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L0C processing of a station calling the disdrodb_l0c_station in the terminal.""" + # TODO: implement remove_l0b! 
+ + # Define command + cmd = " ".join( + [ + "disdrodb_run_l0c_station", + # Station arguments + data_source, + campaign_name, + station_name, + # L0C processing options + "--remove_l0b", + str(remove_l0b), + # Processing options + "--force", + str(force), + "--verbose", + str(verbose), + "--debugging_mode", + str(debugging_mode), + "--parallel", + str(parallel), + "--base_dir", + str(base_dir), + ], + ) + # Execute command + _execute_cmd(cmd) + + +def run_disdrodb_l1_station( + # Station arguments + data_source, + campaign_name, + station_name, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L1 processing of a station calling the disdrodb_l1_station in the terminal.""" + # Define command + cmd = " ".join( + [ + "disdrodb_run_l1_station", + # Station arguments + data_source, + campaign_name, + station_name, + # Processing options + "--force", + str(force), + "--verbose", + str(verbose), + "--debugging_mode", + str(debugging_mode), + "--parallel", + str(parallel), + "--base_dir", + str(base_dir), + ], + ) + # Execute command + _execute_cmd(cmd) + + +def run_disdrodb_l2e_station( + # Station arguments + data_source, + campaign_name, + station_name, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L2E processing of a station calling the disdrodb_l1_station in the terminal.""" + # Define command + cmd = " ".join( + [ + "disdrodb_run_l2e_station", + # Station arguments + data_source, + campaign_name, + station_name, + # Processing options + "--force", + str(force), + "--verbose", + str(verbose), + "--debugging_mode", + str(debugging_mode), + "--parallel", + str(parallel), + "--base_dir", + str(base_dir), + ], + ) + # Execute command + _execute_cmd(cmd) + + +def run_disdrodb_l2m_station( + # Station arguments + data_source, + campaign_name, + station_name, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L2M processing of a station calling the disdrodb_l2m_station in the terminal.""" + # Define command + cmd = " ".join( + [ + "disdrodb_run_l2m_station", + # Station arguments + data_source, + campaign_name, + station_name, + # Processing options + "--force", + str(force), + "--verbose", + str(verbose), + "--debugging_mode", + str(debugging_mode), + "--parallel", + str(parallel), + "--base_dir", + str(base_dir), + ], + ) + # Execute command + _execute_cmd(cmd) + + +####--------------------------------------------------------------------------. +#### Run DISDRODB Archive Processing + + +def run_disdrodb_l0a( + data_sources=None, + campaign_names=None, + station_names=None, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L0A processing of DISDRODB stations. + + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB stations, it runs the processing of the + stations matching the provided data_sources, campaign_names and station_names. + + Parameters + ---------- + data_sources : list + Name of data source(s) to process. + The name(s) must be UPPER CASE. 
+ If campaign_names and station are not specified, process all stations. + The default is ``None``. + campaign_names : list + Name of the campaign(s) to process. + The name(s) must be UPPER CASE. + The default is ``None``. + station_names : list + Station names to process. + The default is ``None``. + force : bool + If ``True``, overwrite existing data into destination directories. + If ``False``, raise an error if there are already data into destination directories. + The default is ``False``. + verbose : bool + Whether to print detailed processing information into terminal. + The default is ``True``. + parallel : bool + If ``True``, the files are processed simultaneously in multiple processes. + By default, the number of process is defined with ``os.cpu_count()``. + If ``False``, the files are processed sequentially in a single process. + debugging_mode : bool + If ``True``, it reduces the amount of data to process. + For L0A, it processes just the first 3 raw data files. + The default is ``False``. + base_dir : str (optional) + Base directory of DISDRODB. Format: ``<...>/DISDRODB``. + If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. + """ + # Define products + product = "L0A" + required_product = get_required_product(product) + + # Get list of available stations + list_info = available_stations( + base_dir=base_dir, + product=required_product, + data_sources=data_sources, + campaign_names=campaign_names, + station_names=station_names, + raise_error_if_empty=True, + ) + + # Print message + n_stations = len(list_info) + print(f"{product} processing of {n_stations} stations started.") + + # Loop over stations + for data_source, campaign_name, station_name in list_info: + print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.") + # Run processing + run_disdrodb_l0a_station( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Process options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.") + + +def run_disdrodb_l0b( + data_sources=None, + campaign_names=None, + station_names=None, + # L0B processing options + remove_l0a: bool = False, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L0B processing of DISDRODB stations. + + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB L0A stations, it runs the processing of the + stations matching the provided data_sources, campaign_names and station_names. + + Parameters + ---------- + data_sources : list + Name of data source(s) to process. + The name(s) must be UPPER CASE. + If campaign_names and station are not specified, process all stations. + The default is ``None``. + campaign_names : list + Name of the campaign(s) to process. + The name(s) must be UPPER CASE. + The default is ``None``. + station_names : list + Station names to process. + The default is ``None``. + remove_l0a : bool + Whether to keep the L0A files after having generated the L0B netCDF products. + The default is ``False``. + force : bool + If ``True``, overwrite existing data into destination directories. 
+ If ``False``, raise an error if there are already data into destination directories. + The default is ``False``. + verbose : bool + Whether to print detailed processing information into terminal. + The default is ``True``. + parallel : bool + If ``True``, the files are processed simultaneously in multiple processes. + By default, the number of process is defined with ``os.cpu_count()``. + If ``False``, the files are processed sequentially in a single process. + debugging_mode : bool + If ``True``, it reduces the amount of data to process. + For L0B, it processes just the first 100 rows of 3 L0A files. + The default is ``False``. + base_dir : str (optional) + Base directory of DISDRODB. Format: ``<...>/DISDRODB``. + If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. + """ + # Define products + product = "L0B" + required_product = get_required_product(product) + + # Get list of available stations + list_info = available_stations( + base_dir=base_dir, + product=required_product, + data_sources=data_sources, + campaign_names=campaign_names, + station_names=station_names, + raise_error_if_empty=True, + ) + + # Print message + n_stations = len(list_info) + print(f"{product} processing of {n_stations} stations started.") + + # Loop over stations + for data_source, campaign_name, station_name in list_info: + print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.") + # Run processing + run_disdrodb_l0b_station( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # L0B options + remove_l0a=remove_l0a, + # Process options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.") + + +def run_disdrodb_l0c( + data_sources=None, + campaign_names=None, + station_names=None, + # L0C options + remove_l0b: bool = False, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L0C processing of DISDRODB stations. + + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB stations, it runs the processing of the + stations matching the provided data_sources, campaign_names and station_names. + + Parameters + ---------- + data_sources : list + Name of data source(s) to process. + The name(s) must be UPPER CASE. + If campaign_names and station are not specified, process all stations. + The default is ``None``. + campaign_names : list + Name of the campaign(s) to process. + The name(s) must be UPPER CASE. + The default is ``None``. + station_names : list + Station names to process. + The default is ``None``. + remove_l0b : bool + Whether to remove the L0B files after having produced L0C netCDF files. + The default is False. + force : bool + If ``True``, overwrite existing data into destination directories. + If ``False``, raise an error if there are already data into destination directories. + The default is ``False``. + verbose : bool + Whether to print detailed processing information into terminal. + The default is ``False``. + parallel : bool + If ``True``, the files are processed simultaneously in multiple processes. + Each process will use a single thread to avoid issues with the HDF/netCDF library. 
+ By default, the number of process is defined with ``os.cpu_count()``. + If ``False``, the files are processed sequentially in a single process. + If ``False``, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If ``True``, it reduces the amount of data to process. + For L1B, it processes just 3 L0B files. + The default is ``False``. + base_dir : str (optional) + Base directory of DISDRODB. Format: ``<...>/DISDRODB``. + If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. + """ + # Define products + product = "L0C" + required_product = get_required_product(product) + + # Get list of available stations + list_info = available_stations( + base_dir=base_dir, + product=required_product, + data_sources=data_sources, + campaign_names=campaign_names, + station_names=station_names, + raise_error_if_empty=True, + ) + + # Print message + n_stations = len(list_info) + print(f"{product} processing of {n_stations} stations started.") + + # Loop over stations + for data_source, campaign_name, station_name in list_info: + print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.") + # Run processing + run_disdrodb_l0c_station( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # L0C options + remove_l0b=remove_l0b, + # Process options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.") + + +def run_disdrodb_l0( + data_sources=None, + campaign_names=None, + station_names=None, + # L0 archive options + l0a_processing: bool = True, + l0b_processing: bool = True, + l0c_processing: bool = True, + remove_l0a: bool = False, + remove_l0b: bool = False, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L0 processing of DISDRODB stations. + + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB stations, it runs the processing of the + stations matching the provided data_sources, campaign_names and station_names. + + Parameters + ---------- + data_sources : list + Name of data source(s) to process. + The name(s) must be UPPER CASE. + If campaign_names and station are not specified, process all stations. + The default is ``None``. + campaign_names : list + Name of the campaign(s) to process. + The name(s) must be UPPER CASE. + The default is ``None``. + station_names : list + Station names to process. + The default is ``None``. + l0a_processing : bool + Whether to launch processing to generate L0A Apache Parquet file(s) from raw data. + The default is ``True``. + l0b_processing : bool + Whether to launch processing to generate L0B netCDF4 file(s) from L0A data. + The default is ``True``. + l0c_processing : bool + Whether to launch processing to generate L0C netCDF4 file(s) from L0B data. + The default is ``True``. + remove_l0a : bool + Whether to keep the L0A files after having generated the L0B netCDF products. + The default is ``False``. + remove_l0b : bool + Whether to remove the L0B files after having produced all L0C netCDF files. + The default is ``False``. + force : bool + If ``True``, overwrite existing data into destination directories. 
+ If ``False``, raise an error if there are already data into destination directories. + The default is ``False``. + verbose : bool + Whether to print detailed processing information into terminal. + The default is ``False``. + parallel : bool + If ``True``, the files are processed simultaneously in multiple processes. + Each process will use a single thread to avoid issues with the HDF/netCDF library. + By default, the number of process is defined with ``os.cpu_count()``. + If ``False``, the files are processed sequentially in a single process. + If ``False``, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If ``True``, it reduces the amount of data to process. + For L0A, it processes just the first 3 raw data files. + For L0B, it processes just the first 100 rows of 3 L0A files. + The default is ``False``. + base_dir : str (optional) + Base directory of DISDRODB. Format: ``<...>/DISDRODB``. + If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. + """ + # Define starting product + if l0c_processing: + required_product = get_required_product("L0C") + if l0b_processing: + required_product = get_required_product("L0B") + if l0a_processing: + required_product = get_required_product("L0A") + + # Get list of available stations + list_info = available_stations( + base_dir=base_dir, + product=required_product, + data_sources=data_sources, + campaign_names=campaign_names, + station_names=station_names, + raise_error_if_empty=True, + ) + + # Print message + n_stations = len(list_info) + print(f"L0 processing of {n_stations} stations started.") + + # Loop over stations + for data_source, campaign_name, station_name in list_info: + print(f"L0 processing of {data_source} {campaign_name} {station_name} station started.") + # Run processing + run_disdrodb_l0_station( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # L0 archive options + l0a_processing=l0a_processing, + l0b_processing=l0b_processing, + l0c_processing=l0c_processing, + remove_l0a=remove_l0a, + remove_l0b=remove_l0b, + # Process options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + print(f"L0 processing of {data_source} {campaign_name} {station_name} station ended.") + + +def run_disdrodb_l1( + data_sources=None, + campaign_names=None, + station_names=None, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L1 processing of DISDRODB stations. + + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB stations, it runs the processing of the + stations matching the provided data_sources, campaign_names and station_names. + + Parameters + ---------- + data_sources : list + Name of data source(s) to process. + The name(s) must be UPPER CASE. + If campaign_names and station are not specified, process all stations. + The default is ``None``. + campaign_names : list + Name of the campaign(s) to process. + The name(s) must be UPPER CASE. + The default is ``None``. + station_names : list + Station names to process. + The default is ``None``. + force : bool + If ``True``, overwrite existing data into destination directories. + If ``False``, raise an error if there are already data into destination directories. 
+ The default is ``False``. + verbose : bool + Whether to print detailed processing information into terminal. + The default is ``False``. + parallel : bool + If ``True``, the files are processed simultaneously in multiple processes. + Each process will use a single thread to avoid issues with the HDF/netCDF library. + By default, the number of process is defined with ``os.cpu_count()``. + If ``False``, the files are processed sequentially in a single process. + If ``False``, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If ``True``, it reduces the amount of data to process. + For L1B, it processes just 3 L0B files. + The default is ``False``. + base_dir : str (optional) + Base directory of DISDRODB. Format: ``<...>/DISDRODB``. + If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. + """ + product = "L1" + required_product = get_required_product(product) + + # Get list of available stations + list_info = available_stations( + base_dir=base_dir, + product=required_product, + data_sources=data_sources, + campaign_names=campaign_names, + station_names=station_names, + raise_error_if_empty=True, + ) + + # Print message + n_stations = len(list_info) + print(f"{product} processing of {n_stations} stations started.") + + # Loop over stations + for data_source, campaign_name, station_name in list_info: + print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.") + # Run processing + run_disdrodb_l1_station( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Process options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.") + + +def run_disdrodb_l2e( + data_sources=None, + campaign_names=None, + station_names=None, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L2E processing of DISDRODB stations. + + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB stations, it runs the processing of the + stations matching the provided data_sources, campaign_names and station_names. + + Parameters + ---------- + data_sources : list + Name of data source(s) to process. + The name(s) must be UPPER CASE. + If campaign_names and station are not specified, process all stations. + The default is ``None``. + campaign_names : list + Name of the campaign(s) to process. + The name(s) must be UPPER CASE. + The default is ``None``. + station_names : list + Station names to process. + The default is ``None``. + force : bool + If ``True``, overwrite existing data into destination directories. + If ``False``, raise an error if there are already data into destination directories. + The default is ``False``. + verbose : bool + Whether to print detailed processing information into terminal. + The default is ``False``. + parallel : bool + If ``True``, the files are processed simultaneously in multiple processes. + Each process will use a single thread to avoid issues with the HDF/netCDF library. + By default, the number of process is defined with ``os.cpu_count()``. + If ``False``, the files are processed sequentially in a single process. 
+ If ``False``, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If ``True``, it reduces the amount of data to process. + For L2E, it processes just 3 L1 files. + The default is ``False``. + base_dir : str (optional) + Base directory of DISDRODB. Format: ``<...>/DISDRODB``. + If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. + """ + product = "L2E" + required_product = get_required_product(product) + + # Get list of available stations + list_info = available_stations( + base_dir=base_dir, + product=required_product, + data_sources=data_sources, + campaign_names=campaign_names, + station_names=station_names, + raise_error_if_empty=True, + ) + + # Print message + n_stations = len(list_info) + print(f"{product} processing of {n_stations} stations started.") + + # Loop over stations + for data_source, campaign_name, station_name in list_info: + print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.") + # Run processing + run_disdrodb_l2e_station( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Process options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.") + + +def run_disdrodb_l2m( + data_sources=None, + campaign_names=None, + station_names=None, + # Processing options + force: bool = False, + verbose: bool = False, + debugging_mode: bool = False, + parallel: bool = True, + base_dir: Optional[str] = None, +): + """Run the L2M processing of DISDRODB stations. + + This function allows to launch the processing of many DISDRODB stations with a single command. + From the list of all available DISDRODB stations, it runs the processing of the + stations matching the provided data_sources, campaign_names and station_names. + + Parameters + ---------- + data_sources : list + Name of data source(s) to process. + The name(s) must be UPPER CASE. + If campaign_names and station are not specified, process all stations. + The default is ``None``. + campaign_names : list + Name of the campaign(s) to process. + The name(s) must be UPPER CASE. + The default is ``None``. + station_names : list + Station names to process. + The default is ``None``. + force : bool + If ``True``, overwrite existing data into destination directories. + If ``False``, raise an error if there are already data into destination directories. + The default is ``False``. + verbose : bool + Whether to print detailed processing information into terminal. + The default is ``False``. + parallel : bool + If ``True``, the files are processed simultaneously in multiple processes. + Each process will use a single thread to avoid issues with the HDF/netCDF library. + By default, the number of process is defined with ``os.cpu_count()``. + If ``False``, the files are processed sequentially in a single process. + If ``False``, multi-threading is automatically exploited to speed up I/0 tasks. + debugging_mode : bool + If ``True``, it reduces the amount of data to process. + For L2MB, it processes just 3 L0B files. + The default is ``False``. + base_dir : str (optional) + Base directory of DISDRODB. Format: ``<...>/DISDRODB``. + If ``None`` (the default), the ``base_dir`` path specified in the DISDRODB active configuration will be used. 
+ """ + product = "L2M" + required_product = get_required_product(product) + + # Get list of available stations + list_info = available_stations( + base_dir=base_dir, + product=required_product, + data_sources=data_sources, + campaign_names=campaign_names, + station_names=station_names, + raise_error_if_empty=True, + ) + + # Print message + n_stations = len(list_info) + print(f"{product} processing of {n_stations} stations started.") + + # Loop over stations + for data_source, campaign_name, station_name in list_info: + print(f"{product} processing of {data_source} {campaign_name} {station_name} station started.") + # Run processing + run_disdrodb_l2m_station( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Process options + force=force, + verbose=verbose, + debugging_mode=debugging_mode, + parallel=parallel, + ) + print(f"{product} processing of {data_source} {campaign_name} {station_name} station ended.") + + +####--------------------------------------------------------------------------. diff --git a/disdrodb/scattering/__init__.py b/disdrodb/scattering/__init__.py new file mode 100644 index 00000000..e79aa02d --- /dev/null +++ b/disdrodb/scattering/__init__.py @@ -0,0 +1,28 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Implement PSD scattering routines.""" + + +from disdrodb.scattering.axis_ratio import available_axis_ratio, get_axis_ratio +from disdrodb.scattering.routines import available_radar_bands, get_radar_parameters + +__all__ = [ + "available_radar_bands", + "available_axis_ratio", + "get_axis_ratio", + "get_radar_parameters", +] diff --git a/disdrodb/scattering/axis_ratio.py b/disdrodb/scattering/axis_ratio.py new file mode 100644 index 00000000..9542c2a0 --- /dev/null +++ b/disdrodb/scattering/axis_ratio.py @@ -0,0 +1,345 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. 
+"""Implement drop axis ratio theoretical models.""" + +import numpy as np +import xarray as xr + + +def available_axis_ratio(): + """Return a list of the available drop axis ratio methods.""" + return list(AXIS_RATIO_METHODS) + + +def get_axis_ratio_method(method): + """Return the specified drop axis ratio method.""" + method = check_axis_ratio(method) + return AXIS_RATIO_METHODS[method] + + +def check_axis_ratio(method): + """Check validity of the specified drop axis ratio method.""" + available_methods = available_axis_ratio() + if method not in available_methods: + raise ValueError(f"{method} is an invalid axis-ratio method. Valid methods: {available_methods}.") + return method + + +def get_axis_ratio(diameter, method): + """ + Compute the axis ratio of raindrops using the specified method. + + Parameters + ---------- + diameter : array-like + Raindrops diameter in mm. + method : str + The method to use for calculating the axis ratio. Available methods are: + 'Thurai2005', 'Thurai2007', 'Battaglia2010', 'Brandes2002', + 'Pruppacher1970', 'Beard1987', 'Andsager1999'. + + Returns + ------- + axis_ratio : array-like + Calculated axis ratios corresponding to the input diameters. + + Raises + ------ + ValueError + If the specified method is not one of the available methods. + + Notes + ----- + This function serves as a wrapper to various axis ratio models for raindrops. + It selects and applies the appropriate model based on the `method` parameter. + + Examples + -------- + >>> diameter = np.array([0.5, 1.0, 2.0, 3.0]) + >>> axis_ratio = get_axis_ratio(diameter, method="Brandes2002") + + """ + # Retrieve axis ratio function + func = get_axis_ratio_method(method) + + # Retrieve axis ratio + axis_ratio = func(diameter) + + # Clip values between 0 and 1 + axis_ratio = np.clip(axis_ratio, 0, 1) + return axis_ratio + + +def get_axis_ratio_andsager_1999(diameter): + """ + Compute the axis ratio of raindrops using the Andsager et al. (1999) method. + + Parameters + ---------- + diameter : array-like + Diameter of the raindrops in millimeters. + + Returns + ------- + axis_ratio : array-like + Calculated axis ratios corresponding to the input diameters. + + Notes + ----- + This function calculates the axis ratio of raindrops based on the method described + in Andsager et al. (1999). For diameters between 1.1 mm and 4.4 mm, it uses the + average axis-ratio relationship given by Kubesh and Beard (1993): + + axis_ratio = 1.012 - 0.144 * D - 1.03 * D^2 + + For diameters outside this range (0.1 mm to 1.1 mm and 4.4 mm to 7.0 mm), + it uses the equilibrium shape equation from Beard and Chuang (1987). + + References + ---------- + Andsager, K., Beard, K. V., & Laird, N. F. (1999). + Laboratory measurements of axis ratios for large raindrops. + Journal of the Atmospheric Sciences, 56(15), 2673-2683. + + Kubesh, R. J., & Beard, K. V. (1993). + Laboratory measurements of spontaneous oscillations for moderate-size raindrops. + Journal of the Atmospheric Sciences, 50(7), 1089-1098. + + Beard, K. V., & Chuang, C. (1987). + A new model for the equilibrium shape of raindrops. + Journal of the Atmospheric Sciences, 44(11), 1509-1524. 
+
+    """
+    # Convert diameter to centimeters
+    diameter_cm = diameter * 0.1
+
+    # Axis ratio for diameters outside 1.1 mm to 4.4 mm using equilibrium model
+    axis_ratio_equilibrium = get_axis_ratio_beard_1987(diameter)
+
+    # Axis ratio for diameters between 1.1 mm and 4.4 mm using Kubesh & Beard (1993) model (polynomial in cm)
+    axis_ratio_kubesh = 1.012 - 0.144 * diameter_cm - 1.03 * diameter_cm**2
+
+    # Combine models based on diameter ranges (thresholds in millimeters)
+    axis_ratio = xr.where(
+        (diameter >= 1.1) & (diameter < 4.4),
+        axis_ratio_kubesh,
+        axis_ratio_equilibrium,
+    )
+
+    return axis_ratio
+
+
+def get_axis_ratio_battaglia_2010(diameter):
+    """
+    Compute the axis ratio of raindrops using the Battaglia et al. (2010) method.
+
+    Parameters
+    ----------
+    diameter : array-like
+        Diameter of the raindrops in millimeters.
+
+    Returns
+    -------
+    axis_ratio : array-like
+        Calculated axis ratios corresponding to the input diameters.
+
+    Notes
+    -----
+    - For diameters less than or equal to 1 mm, the axis ratio is constant at 1.0.
+    - For diameters greater than or equal to 5 mm, the axis ratio is constant at 0.7.
+    - Between 1 mm and 5 mm, the axis ratio varies linearly.
+
+    The axis ratio is calculated using the equation:
+
+        axis_ratio = 1.075 - 0.075 * D
+
+    where **D** is the diameter in millimeters.
+
+    References
+    ----------
+    Battaglia, A., Rustemeier, E., Tokay, A., Blahak, U., & Simmer, C. (2010).
+    PARSIVEL Snow Observations: A Critical Assessment.
+    Journal of Atmospheric and Oceanic Technology, 27(2), 333-344.
+    https://doi.org/10.1175/2009JTECHA1332.1
+
+    """
+    axis_ratio = 1.075 - 0.075 * diameter
+    axis_ratio = xr.where(diameter <= 1, 1.0, axis_ratio)
+    axis_ratio = xr.where(diameter >= 5, 0.7, axis_ratio)
+    return axis_ratio
+
+
+def get_axis_ratio_beard_1987(diameter):
+    """
+    Compute the axis ratio of raindrops using the Beard and Chuang (1987) method.
+
+    Parameters
+    ----------
+    diameter : array-like
+        Diameter of the raindrops in millimeters.
+
+    Returns
+    -------
+    axis_ratio : array-like
+        Calculated axis ratios corresponding to the input diameters.
+
+    Notes
+    -----
+    The formula is a polynomial fit to the numerical model of Beard and Chuang (1987), valid for
+    drop diameters between 1 and 7 mm.
+
+    References
+    ----------
+    Beard, K. V., & Chuang, C. (1987).
+    A new model for the equilibrium shape of raindrops.
+    Journal of the Atmospheric Sciences, 44(11), 1509-1524.
+    https://doi.org/10.1175/1520-0469(1987)044<1509:ANMFTE>2.0.CO;2
+    """
+    return 1.0048 + 5.7e-04 * diameter - 2.628e-02 * diameter**2 + 3.682e-03 * diameter**3 - 1.677e-04 * diameter**4
+
+
+def get_axis_ratio_brandes_2002(diameter):
+    """
+    Compute the axis ratio of raindrops using the Brandes et al. (2002) method.
+
+    Parameters
+    ----------
+    diameter : array-like
+        Diameter of the raindrops in millimeters.
+
+    Returns
+    -------
+    axis_ratio : array-like
+        Calculated axis ratios corresponding to the input diameters.
+
+    References
+    ----------
+    Brandes, E. A., Zhang, G., & Vivekanandan, J. (2002).
+    Experiments in rainfall estimation with a polarimetric radar in a subtropical environment.
+    Journal of Applied Meteorology, 41(6), 674-685.
+    https://doi.org/10.1175/1520-0450(2002)041<0674:EIREWA>2.0.CO;2
+
+    Brandes, E. A., et al., 2005: On the Influence of Assumed Drop Size Distribution Form
+    on Radar-Retrieved Thunderstorm Microphysics. J. Appl. Meteor. Climatol., 45, 259-268.
+ """ + # Valid for drop diameters between 0.1 to 8.1 mm + axis_ratio = 0.9951 + 0.0251 * diameter - 0.03644 * diameter**2 + 0.005303 * diameter**3 - 0.0002492 * diameter**4 + return axis_ratio + + +def get_axis_ratio_pruppacher_1970(diameter): + """ + Compute the axis ratio of raindrops using the Pruppacher and Pitter (1971) method. + + Parameters + ---------- + diameter : array-like + Diameter of the raindrops in millimeters. + + Returns + ------- + axis_ratio : array-like + Calculated axis ratios corresponding to the input diameters. + + Notes + ----- + This formula is a linear fit to wind tunnel data of Pruppacher and Pitter (1971) with + drop diameters between 1 and 9 mm. + + References + ---------- + Pruppacher, H. R., & Pitter, R. L. (1971). + A Semi-Empirical Determination of the Shape of Cloud and Precipitation Drops. + Journal of the Atmospheric Sciences, 28(1), 86-94. + https://doi.org/10.1175/1520-0469(1971)028<0086:ASEDOT>2.0.CO;2 + """ + axis_ratio = 1.03 - 0.062 * diameter + return axis_ratio + + +def get_axis_ratio_thurai_2005(diameter): + """ + Compute the axis ratio of raindrops using the Thurai et al. (2005) method. + + Parameters + ---------- + diameter : array-like + Diameter of the raindrops in millimeters. + + Returns + ------- + axis_ratio : array-like + Calculated axis ratios corresponding to the input diameters. + + References + ---------- + Thurai, M., and V. N. Bringi, 2005: Drop Axis Ratios from a 2D Video Disdrometer. + J. Atmos. Oceanic Technol., 22, 966-978, https://doi.org/10.1175/JTECH1767.1 + + """ + # Valid between 1 and 5 mm + axis_ratio = 0.9707 + 4.26e-2 * diameter - 4.29e-2 * diameter**2 + 6.5e-3 * diameter**3 - 3e-4 * diameter**4 + return axis_ratio + + +def get_axis_ratio_thurai_2007(diameter): + """ + Compute the axis ratio of raindrops using the Thurai et al. (2007) method. + + Parameters + ---------- + diameter : array-like + Diameter of the raindrops in millimeters. + + Returns + ------- + axis_ratio : array-like + Calculated axis ratios corresponding to the input diameters. + + References + ---------- + Thurai, M., G. J. Huang, V. N. Bringi, W. L. Randeu, and M. Schönhuber, 2007: + Drop Shapes, Model Comparisons, and Calculations of Polarimetric Radar Parameters in Rain. + J. Atmos. Oceanic Technol., 24, 1019-1032, https://doi.org/10.1175/JTECH2051.1 + + """ + # Assume spherical drop when diameter < 0.7 mm + axis_ratio_below_0_7 = 1 + # Beard and Kubesh (1991) for drops diameter between 0.7 mm and 1.5 mm + axis_ratio_below_1_5 = ( + 1.173 - 0.5165 * diameter + 0.4698 * diameter**2 - 0.1317 * diameter**3 - 8.5e-3 * diameter**4 + ) + # Formula fitted on measurements of Thurai et al., 2005 for drop diameter above 1.5 mm + # --> This is very similar to Pruppacher1970 ! 
+ axis_ratio_above_1_5 = ( + 1.065 - 6.25e-2 * diameter - 3.99e-3 * diameter**2 + 7.66e-4 * diameter**3 - 4.095e-5 * diameter**4 + ) + # Combine axis ratio + axis_ratio_below_1_5 = xr.where(diameter > 0.7, axis_ratio_below_1_5, axis_ratio_below_0_7) + axis_ratio = xr.where(diameter > 1.5, axis_ratio_above_1_5, axis_ratio_below_1_5) + return axis_ratio + + +AXIS_RATIO_METHODS = { + "Thurai2005": get_axis_ratio_thurai_2005, + "Thurai2007": get_axis_ratio_thurai_2007, + "Battaglia2010": get_axis_ratio_battaglia_2010, + "Brandes2002": get_axis_ratio_brandes_2002, + "Pruppacher1970": get_axis_ratio_pruppacher_1970, + "Beard1987": get_axis_ratio_beard_1987, + "Andsager1999": get_axis_ratio_andsager_1999, +} diff --git a/disdrodb/scattering/routines.py b/disdrodb/scattering/routines.py new file mode 100644 index 00000000..acb9b571 --- /dev/null +++ b/disdrodb/scattering/routines.py @@ -0,0 +1,450 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Implement PSD scattering routines.""" + +import itertools + +import dask +import numpy as np +import xarray as xr +from pytmatrix import orientation, radar, refractive, tmatrix_aux +from pytmatrix.psd import BinnedPSD, PSDIntegrator +from pytmatrix.tmatrix import Scatterer + +from disdrodb.psd.models import create_psd, get_required_parameters +from disdrodb.scattering.axis_ratio import check_axis_ratio, get_axis_ratio_method +from disdrodb.utils.warnings import suppress_warnings + +# Wavelengths for which the refractive index is defined in pytmatrix (in mm) +wavelength_dict = { + "S": tmatrix_aux.wl_S, + "C": tmatrix_aux.wl_C, + "X": tmatrix_aux.wl_X, + "Ku": tmatrix_aux.wl_Ku, + "Ka": tmatrix_aux.wl_Ka, + "W": tmatrix_aux.wl_W, +} + + +def available_radar_bands(): + """Return a list of the available radar bands.""" + return list(wavelength_dict) + + +def check_radar_band(radar_band): + """Check the validity of the specified radar band.""" + available_bands = available_radar_bands() + if radar_band not in available_bands: + raise ValueError(f"{radar_band} is an invalid radar band. Valid radar bands: {available_bands}.") + return radar_band + + +def get_radar_wavelength(radar_band): + """Get the wavelength of a radar band.""" + wavelength = wavelength_dict[radar_band] + return wavelength + + +def initialize_scatterer(wavelength, canting_angle_std=7, D_max=8, axis_ratio="Thurai2007"): + """Initialize T-matrix scatterer object for a given wavelength.""" + # Retrieve custom axis ratio function + axis_ratio_func = get_axis_ratio_method(axis_ratio) + + # Retrieve water complex refractive index + # - Here we currently assume 10 °C + # - m_w_0C and m_w_20C are also available + # TODO: should be another dimension ? Or use scatterer.psd_integrator.m_func? 
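+    # A possible sketch (not implemented here) for making the assumed water temperature
+    # configurable from the precomputed pytmatrix tables; ``water_temperature`` would be
+    # a hypothetical extra argument of this function:
+    #     m_w_tables = {0: refractive.m_w_0C, 10: refractive.m_w_10C, 20: refractive.m_w_20C}
+    #     water_refractive_index = m_w_tables[water_temperature][wavelength]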
+    water_refractive_index = refractive.m_w_10C[wavelength]
+
+    # ---------------------------------------------------------------.
+    # Initialize Scatterer class
+    scatterer = Scatterer(wavelength=wavelength, m=water_refractive_index)
+    # - Define particle orientation PDF for orientational averaging
+    # --> The standard deviation of the angle with respect to vertical orientation (the canting angle).
+    scatterer.or_pdf = orientation.gaussian_pdf(std=canting_angle_std)
+    # - Define orientation method
+    # --> Alternatives: orient_averaged_fixed, orient_single
+    scatterer.orient = orientation.orient_averaged_fixed
+
+    # ---------------------------------------------------------------.
+    # Initialize PSDIntegrator
+    scatterer.psd_integrator = PSDIntegrator()
+    # - Define axis_ratio_func
+    # --> The Scatterer class expects the horizontal-to-vertical axis ratio
+    scatterer.psd_integrator.axis_ratio_func = lambda D: 1.0 / axis_ratio_func(D)
+    # - Define function to compute the refractive index (as a function of D)
+    # scatterer.psd_integrator.m_func = None # Use constant value of scatterer.m
+    # - Define number of points over which to integrate
+    scatterer.psd_integrator.num_points = 1024
+    # - Define maximum drop diameter
+    scatterer.psd_integrator.D_max = D_max
+    # - Define geometries
+    scatterer.psd_integrator.geometries = (tmatrix_aux.geom_horiz_back, tmatrix_aux.geom_horiz_forw)
+    # ---------------------------------------------------------------.
+    # Initialize scattering table
+    scatterer.psd_integrator.init_scatter_table(scatterer)
+    return scatterer
+
+
+def compute_radar_variables(scatterer):
+    """Compute radar variables for a given scatterer object with a specified PSD.
+
+    To speed up computations, this function should be given a scatterer object with
+    a preinitialized scattering table.
+    """
+    # Compute radar parameters
+    radar_vars = {}
+    scatterer.set_geometry(tmatrix_aux.geom_horiz_back)
+    radar_vars["Zh"] = 10 * np.log10(radar.refl(scatterer, h_pol=True))  # dBZ
+    radar_vars["Zdr"] = 10 * np.log10(radar.Zdr(scatterer))  # dB
+    radar_vars["rho_hv"] = radar.rho_hv(scatterer)
+    radar_vars["ldr"] = radar.ldr(scatterer)
+    scatterer.set_geometry(tmatrix_aux.geom_horiz_forw)
+    radar_vars["Kdp"] = radar.Kdp(scatterer)
+    radar_vars["Ai"] = radar.Ai(scatterer)
+    return radar_vars
+
+
+def _estimate_empirical_radar_parameters(
+    drop_number_concentration,
+    bin_edges,
+    scatterer,
+    output_dictionary,
+):
+    # Initialize null output (returned if the computation fails)
+    if output_dictionary:
+        null_output = {"Zh": np.nan, "Zdr": np.nan, "rho_hv": np.nan, "ldr": np.nan, "Kdp": np.nan, "Ai": np.nan}
+    else:
+        null_output = np.array([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
+
+    # Assign PSD model to the scatterer object
+    scatterer.psd = BinnedPSD(bin_edges, drop_number_concentration)
+
+    # Get radar variables
+    with suppress_warnings():
+        try:
+            radar_vars = compute_radar_variables(scatterer)
+            output = radar_vars if output_dictionary else np.array(list(radar_vars.values()))
+        except Exception:
+            output = null_output
+    return output
+
+
+def _estimate_model_radar_parameters(
+    parameters,
+    psd_model,
+    psd_parameters_names,
+    scatterer,
+    output_dictionary,
+):
+    # Initialize null output (returned if the computation fails)
+    if output_dictionary:
+        null_output = {"Zh": np.nan, "Zdr": np.nan, "rho_hv": np.nan, "ldr": np.nan, "Kdp": np.nan, "Ai": np.nan}
+    else:
+        null_output = np.array([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
+
+    # Assign PSD model to the scatterer object
+    parameters = dict(zip(psd_parameters_names, parameters))
+    scatterer.psd = create_psd(psd_model, parameters)
+
+    # Get radar variables
+    with suppress_warnings():
+        try:
+            radar_vars = compute_radar_variables(scatterer)
+            output = radar_vars if output_dictionary else np.array(list(radar_vars.values()))
+        except Exception:
+            output = null_output
+    return output
+
+
+def get_psd_parameters(ds):
+    """Return an ``xarray.Dataset`` with the PSD parameters."""
+    psd_model = ds.attrs["disdrodb_psd_model"]
+    required_parameters = get_required_parameters(psd_model)
+    missing_parameters = [param for param in required_parameters if param not in ds]
+    if len(missing_parameters) > 0:
+        raise ValueError(f"The {psd_model} parameters {missing_parameters} are not present in the dataset.")
+    return ds[required_parameters]
+
+
+def get_model_radar_parameters(
+    ds,
+    radar_band,
+    canting_angle_std=7,
+    diameter_max=8,
+    axis_ratio="Thurai2007",
+):
+    """Compute radar parameters from a PSD model.
+
+    Parameters
+    ----------
+    ds : xarray.Dataset
+        Dataset containing the parameters of the PSD model.
+        The dataset attribute ``disdrodb_psd_model`` specifies the PSD model to use.
+    radar_band : str
+        Radar band to be used.
+    canting_angle_std : float, optional
+        Standard deviation of the canting angle. The default value is 7.
+    diameter_max : float, optional
+        Maximum diameter. The default value is 8 mm.
+    axis_ratio : str, optional
+        Method to compute the axis ratio. The default method is ``Thurai2007``.
+
+    Returns
+    -------
+    xarray.Dataset
+        Dataset containing the computed radar parameters.
+    """
+    # Retrieve PSD model and parameters.
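+    # Sketch of the expected dataset layout (actual names are defined by disdrodb.psd.models
+    # and are not assumed here):
+    #     ds.attrs["disdrodb_psd_model"]      --> name of the PSD model fitted upstream
+    #     get_required_parameters(psd_model)  --> parameter variables expected in ``ds``
+    #     create_psd(psd_model, parameters)   --> PSD object assigned to ``scatterer.psd``
+    #                                             in ``_estimate_model_radar_parameters``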
+ psd_model = ds.attrs["disdrodb_psd_model"] + required_parameters = get_required_parameters(psd_model) + ds_parameters = get_psd_parameters(ds) + + # Check argument validity + axis_ratio = check_axis_ratio(axis_ratio) + radar_band = check_radar_band(radar_band) + + # Retrieve wavelengths in mm + wavelength = get_radar_wavelength(radar_band) + + # Create DataArray with PSD parameters + da_parameters = ds_parameters.to_array(dim="psd_parameters").compute() + + # Initialize scattering table + scatterer = initialize_scatterer( + wavelength=wavelength, + canting_angle_std=canting_angle_std, + D_max=diameter_max, + axis_ratio=axis_ratio, + ) + + # Define kwargs + kwargs = { + "output_dictionary": False, + "psd_model": psd_model, + "psd_parameters_names": required_parameters, + "scatterer": scatterer, + } + + # Loop over each PSD (not in parallel --> dask="forbidden") + # - It costs much more to initiate the scatterer rather than looping over timesteps ! + da_radar = xr.apply_ufunc( + _estimate_model_radar_parameters, + da_parameters, + kwargs=kwargs, + input_core_dims=[["psd_parameters"]], + output_core_dims=[["radar_variables"]], + vectorize=True, + dask="forbidden", + dask_gufunc_kwargs={"output_sizes": {"radar_variables": 5}}, # lengths of the new output_core_dims dimensions. + output_dtypes=["float64"], + ) + + # Add parameters coordinates + da_radar = da_radar.assign_coords({"radar_variables": ["Zh", "Zdr", "rho_hv", "ldr", "Kdp", "Ai"]}) + + # Create parameters dataset + ds_radar = da_radar.to_dataset(dim="radar_variables") + + # Expand dimensions for later merging + dims_dict = { + "radar_band": [radar_band], + "axis_ratio": [axis_ratio], + "canting_angle_std": [canting_angle_std], + "diameter_max": [diameter_max], + } + ds_radar = ds_radar.expand_dims(dim=dims_dict) + return ds_radar + + +def get_empirical_radar_parameters( + ds, + radar_band=None, + canting_angle_std=7, + diameter_max=8, + axis_ratio="Thurai2007", +): + """Compute radar parameters from empirical drop number concentration. + + Parameters + ---------- + ds : xarray.Dataset + Dataset containing the drop number concentration variable. + radar_band : str + Radar band to be used. + canting_angle_std : float, optional + Standard deviation of the canting angle. The default value is 7. + diameter_max : float, optional + Maximum diameter. The default value is 8 mm. + axis_ratio : str, optional + Method to compute the axis ratio. The default method is ``Thurai2007``. + + Returns + ------- + xarray.Dataset + Dataset containing the computed radar parameters. + """ + # Define inputs + da_drop_number_concentration = ds["drop_number_concentration"].compute() + + # Define bin edges + bin_edges = np.append(ds["diameter_bin_lower"].compute().data, ds["diameter_bin_upper"].compute().data[-1]) + + # Check argument validity + axis_ratio = check_axis_ratio(axis_ratio) + radar_band = check_radar_band(radar_band) + + # Retrieve wavelengths in mm + wavelength = get_radar_wavelength(radar_band) + + # Initialize scattering table + scatterer = initialize_scatterer( + wavelength=wavelength, + canting_angle_std=canting_angle_std, + D_max=diameter_max, + axis_ratio=axis_ratio, + ) + + # Define kwargs + kwargs = { + "output_dictionary": False, + "bin_edges": bin_edges, + "scatterer": scatterer, + } + + # Loop over each PSD (not in parallel --> dask="forbidden") + # - It costs much more to initiate the scatterer rather than looping over timesteps ! 
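+    # Roughly, for each timestep the vectorized call below evaluates the equivalent of
+    # (``nd`` standing for a 1D slice of drop_number_concentration along the diameter bins):
+    #     _estimate_empirical_radar_parameters(nd, bin_edges=bin_edges, scatterer=scatterer,
+    #                                          output_dictionary=False)
+    # and stacks the six returned values along the new "radar_variables" dimension.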
+ da_radar = xr.apply_ufunc( + _estimate_empirical_radar_parameters, + da_drop_number_concentration, + kwargs=kwargs, + input_core_dims=[["diameter_bin_center"]], + output_core_dims=[["radar_variables"]], + vectorize=True, + dask="forbidden", + dask_gufunc_kwargs={"output_sizes": {"radar_variables": 5}}, # lengths of the new output_core_dims dimensions. + output_dtypes=["float64"], + ) + + # Add parameters coordinates + da_radar = da_radar.assign_coords({"radar_variables": ["Zh", "Zdr", "rho_hv", "ldr", "Kdp", "Ai"]}) + + # Create parameters dataset + ds_radar = da_radar.to_dataset(dim="radar_variables") + + # Expand dimensions for later merging + dims_dict = { + "radar_band": [radar_band], + "axis_ratio": [axis_ratio], + "canting_angle_std": [canting_angle_std], + "diameter_max": [diameter_max], + } + ds_radar = ds_radar.expand_dims(dim=dims_dict) + return ds_radar + + +def get_radar_parameters( + ds, + radar_band=None, + canting_angle_std=7, + diameter_max=8, + axis_ratio="Thurai2007", + parallel=True, +): + """Compute radar parameters from empirical drop number concentration or PSD model. + + Parameters + ---------- + ds : xarray.Dataset + Dataset containing the drop number concentration variable. + radar_band : str or list of str, optional + Radar band(s) to be used. + If ``None`` (the default), all available radar bands are used. + canting_angle_std : float or list of float, optional + Standard deviation of the canting angle. The default value is 7. + diameter_max : float or list of float, optional + Maximum diameter. The default value is 8 mm. + axis_ratio : str or list of str, optional + Method to compute the axis ratio. The default method is ``Thurai2007``. + parallel : bool, optional + Whether to compute radar variables in parallel. + The default value is ``True``. + + Returns + ------- + xarray.Dataset + Dataset containing the computed radar parameters. + """ + # Decide whether to simulate radar parameters based on empirical PSD or model PSD + if "disdrodb_psd_model" not in ds.attrs and "drop_number_concentration" not in ds: + raise ValueError("The input dataset is not a DISDRODB L2E or L2M product.") + # Model-based simulation + if "disdrodb_psd_model" in ds.attrs: + func = get_model_radar_parameters + ds_subset = get_psd_parameters(ds).compute() + # Empirical PSD simulation + else: + func = get_empirical_radar_parameters + ds_subset = ds[["drop_number_concentration"]].compute() + + # Initialize radar band if not provided + if radar_band is None: + radar_band = available_radar_bands() + + # Ensure parameters are list + diameter_max = np.atleast_1d(diameter_max) + canting_angle_std = np.atleast_1d(canting_angle_std) + axis_ratio = np.atleast_1d(axis_ratio) + radar_band = np.atleast_1d(radar_band) + + # Check parameters validity + axis_ratio = [check_axis_ratio(method) for method in axis_ratio] + radar_band = [check_radar_band(band) for band in radar_band] + + # Retrieve combination of parameters + list_params = [ + { + "radar_band": rb.item(), + "canting_angle_std": cas.item(), + "axis_ratio": ar.item(), + "diameter_max": d_max.item(), + } + for rb, cas, ar, d_max in itertools.product(radar_band, canting_angle_std, axis_ratio, diameter_max) + ] + + # Compute radar variables for each configuration in parallel + # - The function expects the data into memory (no dask arrays !) 
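+    # For reference, a typical call might look like (sketch; ``ds`` being any DISDRODB
+    # L2E or L2M dataset loaded by the user):
+    #     ds_radar = get_radar_parameters(ds, radar_band=["C", "X"], canting_angle_std=[5, 7])
+    # which evaluates 2 x 2 = 4 configurations and returns them along the "radar_band"
+    # and "canting_angle_std" dimensions.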
+ if parallel: + list_ds = [dask.delayed(func)(ds_subset, **params) for params in list_params] + list_ds = dask.compute(*list_ds) + else: + list_ds = [func(ds_subset, **params) for params in list_params] + + # Merge into a single dataset + ds_radar = xr.merge(list_ds) + + # Copy global attributes from input dataset + ds_radar.attrs = ds.attrs.copy() + + # Remove single dimensions (add info to attributes) + parameters = ["radar_band", "canting_angle_std", "axis_ratio", "diameter_max"] + for param in parameters: + if ds_radar.sizes[param] == 1: + ds_radar.attrs[f"disdrodb_scattering_{param}"] = ds_radar[param].item() + ds_radar = ds_radar.squeeze() + return ds_radar diff --git a/disdrodb/tests/conftest.py b/disdrodb/tests/conftest.py index 97e28bed..0263538c 100644 --- a/disdrodb/tests/conftest.py +++ b/disdrodb/tests/conftest.py @@ -151,7 +151,7 @@ def create_fake_raw_data_file( return str(filepath) -@pytest.fixture() +@pytest.fixture def create_test_config_files(request): # noqa PT004 """Create the specified config files into a temporary "test" directory. diff --git a/disdrodb/tests/data/check_readers/DISDRODB/Raw/EPFL/PARSIVEL_2007/metadata/10.yml b/disdrodb/tests/data/check_readers/DISDRODB/Raw/EPFL/PARSIVEL_2007/metadata/10.yml index 8e63baff..3e334611 100644 --- a/disdrodb/tests/data/check_readers/DISDRODB/Raw/EPFL/PARSIVEL_2007/metadata/10.yml +++ b/disdrodb/tests/data/check_readers/DISDRODB/Raw/EPFL/PARSIVEL_2007/metadata/10.yml @@ -37,7 +37,7 @@ firmware_version: "" sensor_beam_length: "" sensor_beam_width: "" sensor_nominal_width: "" -measurement_interval: "" +measurement_interval: 10 calibration_sensitivity: "" calibration_certification_date: "" calibration_certification_url: "" diff --git a/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/ground_truth/CAIRNGORM/L0B.DIVEN.CAIRNGORM.s20170210000000.e20170210000400.V0.nc b/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/ground_truth/CAIRNGORM/L0B.DIVEN.CAIRNGORM.s20170210000000.e20170210000400.V0.nc index fe41f9b0..1d407e18 100644 Binary files a/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/ground_truth/CAIRNGORM/L0B.DIVEN.CAIRNGORM.s20170210000000.e20170210000400.V0.nc and b/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/ground_truth/CAIRNGORM/L0B.DIVEN.CAIRNGORM.s20170210000000.e20170210000400.V0.nc differ diff --git a/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/metadata/CAIRNGORM.yml b/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/metadata/CAIRNGORM.yml index 54be96d8..492c0c4a 100755 --- a/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/metadata/CAIRNGORM.yml +++ b/disdrodb/tests/data/check_readers/DISDRODB/Raw/UK/DIVEN/metadata/CAIRNGORM.yml @@ -38,7 +38,7 @@ firmware_version: "" sensor_beam_length: "" sensor_beam_width: "" sensor_nominal_width: "" -measurement_interval: "" +measurement_interval: 60 calibration_sensitivity: "" calibration_certification_date: "" calibration_certification_url: "" diff --git a/disdrodb/tests/data/test_dir_structure/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml b/disdrodb/tests/data/test_dir_structure/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml index 4d04c7be..0d55359d 100644 --- a/disdrodb/tests/data/test_dir_structure/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml +++ b/disdrodb/tests/data/test_dir_structure/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml @@ -28,7 +28,7 @@ firmware_dsp: "" firmware_version: "" sensor_beam_width: "" 
sensor_nominal_width: "" -measurement_interval: "" +measurement_interval: 30 contributors: "" authors: "" institution: "" diff --git a/disdrodb/tests/data/test_dir_structure/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml b/disdrodb/tests/data/test_dir_structure/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml index 838e17f9..f9741785 100644 --- a/disdrodb/tests/data/test_dir_structure/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml +++ b/disdrodb/tests/data/test_dir_structure/DISDRODB/Raw/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml @@ -37,7 +37,7 @@ firmware_version: "" sensor_beam_length: "" sensor_beam_width: "" sensor_nominal_width: "" -measurement_interval: "" +measurement_interval: 30 calibration_sensitivity: "" calibration_certification_date: "" calibration_certification_url: "" diff --git a/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATIONID.yml b/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATIONID.yml index 19bd7cf0..5ce6e661 100644 --- a/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATIONID.yml +++ b/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATIONID.yml @@ -28,7 +28,7 @@ firmware_dsp: "" firmware_version: "" sensor_beam_width: "" sensor_nominal_width: "" -measurement_interval: "" +measurement_interval: 30 contributors: "" authors: "" institution: "" diff --git a/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml b/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml index 5dfd8800..953b97c7 100644 --- a/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml +++ b/disdrodb/tests/pytest_files/test_folders_files_creation/DISDRODB/Processed/DATA_SOURCE/CAMPAIGN_NAME/metadata/STATION_NAME.yml @@ -36,7 +36,7 @@ firmware_version: "" sensor_beam_length: "" sensor_beam_width: "" sensor_nominal_width: "" -measurement_interval: "" +measurement_interval: 30 calibration_sensitivity: "" calibration_certification_date: "" calibration_certification_url: "" diff --git a/disdrodb/tests/pytest_files/test_folders_files_creation/metadata/123.yml b/disdrodb/tests/pytest_files/test_folders_files_creation/metadata/123.yml index a5e873db..cfd7a2cf 100644 --- a/disdrodb/tests/pytest_files/test_folders_files_creation/metadata/123.yml +++ b/disdrodb/tests/pytest_files/test_folders_files_creation/metadata/123.yml @@ -36,7 +36,7 @@ firmware_version: "" sensor_beam_length: "" sensor_beam_width: "" sensor_nominal_width: "" -measurement_interval: "" +measurement_interval: 30 calibration_sensitivity: "" calibration_certification_date: "" calibration_certification_url: "" diff --git a/disdrodb/tests/test_api/test_api_create_directories.py b/disdrodb/tests/test_api/test_api_create_directories.py index b7dd8544..0a747433 100644 --- a/disdrodb/tests/test_api/test_api_create_directories.py +++ b/disdrodb/tests/test_api/test_api_create_directories.py @@ -26,21 +26,22 @@ _check_campaign_name_consistency, _check_data_source_consistency, _copy_station_metadata, - create_directory_structure, create_initial_station_structure, create_issue_directory, create_l0_directory_structure, 
create_metadata_directory, + create_product_directory, create_test_archive, ) from disdrodb.api.path import ( define_campaign_dir, + define_data_dir, define_issue_filepath, define_metadata_dir, define_metadata_filepath, define_station_dir, ) -from disdrodb.api.scripts.disdrodb_initialize_station import disdrodb_initialize_station +from disdrodb.cli.disdrodb_initialize_station import disdrodb_initialize_station from disdrodb.tests.conftest import ( create_fake_issue_file, create_fake_metadata_directory, @@ -139,7 +140,7 @@ def test_create_l0_directory_structure(tmp_path, mocker, product): ) # Execute create_l0_directory_structure - create_l0_directory_structure( + data_dir = create_l0_directory_structure( product=product, force=False, raw_dir=raw_dir, @@ -148,6 +149,8 @@ def test_create_l0_directory_structure(tmp_path, mocker, product): ) # Test product, metadata and station directories have been created + assert os.path.exists(data_dir) + assert os.path.isdir(data_dir) assert os.path.exists(dst_station_dir) assert os.path.isdir(dst_station_dir) assert os.path.exists(dst_metadata_dir) @@ -177,7 +180,7 @@ def test_create_l0_directory_structure(tmp_path, mocker, product): assert os.path.exists(product_filepath) # Test delete file if already data in L0A (if force=True) - create_l0_directory_structure( + data_dir = create_l0_directory_structure( product=product, force=True, raw_dir=raw_dir, @@ -185,6 +188,8 @@ def test_create_l0_directory_structure(tmp_path, mocker, product): station_name=station_name, ) assert not os.path.exists(product_filepath) + assert os.path.exists(data_dir) + assert os.path.isdir(data_dir) assert os.path.exists(dst_station_dir) assert os.path.isdir(dst_station_dir) assert os.path.exists(dst_metadata_dir) @@ -193,7 +198,7 @@ def test_create_l0_directory_structure(tmp_path, mocker, product): assert os.path.isfile(dst_metadata_filepath) -def test_create_directory_structure(tmp_path, mocker): +def test_create_product_directory(tmp_path): start_product = "L0A" dst_product = "L0B" # Define station info @@ -205,20 +210,15 @@ def test_create_directory_structure(tmp_path, mocker): metadata_dict["sensor_name"] = "OTT_Parsivel" metadata_dict["reader"] = "GPM/IFLOODS" - processed_dir = define_campaign_dir( - base_dir=base_dir, - product=start_product, - data_source=data_source, - campaign_name=campaign_name, - ) - # Test raise error without data with pytest.raises(ValueError): - create_directory_structure( + _ = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, product=dst_product, force=False, - processed_dir=processed_dir, - station_name=station_name, ) # Add fake file @@ -232,11 +232,13 @@ def test_create_directory_structure(tmp_path, mocker): # Test raise error without metadata file with pytest.raises(ValueError): - create_directory_structure( + _ = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, product=dst_product, force=False, - processed_dir=processed_dir, - station_name=station_name, ) # Add metadata @@ -249,18 +251,27 @@ def test_create_directory_structure(tmp_path, mocker): metadata_dict=metadata_dict, ) - # Execute create_directory_structure - create_directory_structure( - processed_dir=processed_dir, - product=dst_product, + # Execute create_product_directory + data_dir = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, station_name=station_name, + 
product=dst_product, force=False, ) - # Test product directory has been created - dst_station_dir = os.path.join(processed_dir, dst_product) - assert os.path.exists(dst_station_dir) - assert os.path.isdir(dst_station_dir) + # Test product data directory has been created + expected_data_dir = define_data_dir( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + product=dst_product, + ) + assert expected_data_dir == data_dir + assert os.path.exists(data_dir) + assert os.path.isdir(data_dir) # Test raise error if already data in dst_product (if force=False) dst_product_file_filepath = create_fake_raw_data_file( @@ -272,32 +283,39 @@ def test_create_directory_structure(tmp_path, mocker): ) with pytest.raises(ValueError): - create_directory_structure( + _ = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, product=dst_product, force=False, - processed_dir=processed_dir, - station_name=station_name, ) assert os.path.exists(dst_product_file_filepath) # Test delete file if already data in L0A (if force=True) - create_directory_structure( + data_dir = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, product=dst_product, force=True, - processed_dir=processed_dir, - station_name=station_name, ) + assert expected_data_dir == data_dir assert not os.path.exists(dst_product_file_filepath) - assert os.path.exists(dst_station_dir) - assert os.path.isdir(dst_station_dir) + assert os.path.exists(data_dir) + assert os.path.isdir(data_dir) # Test raise error if bad station_name with pytest.raises(ValueError): - create_directory_structure( + _ = create_product_directory( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name="INEXISTENT_STATION", product=dst_product, force=False, - processed_dir=processed_dir, - station_name="INEXISTENT_STATION", ) diff --git a/disdrodb/tests/test_api/test_api_info.py b/disdrodb/tests/test_api/test_api_info.py index a9284e1b..bd031350 100644 --- a/disdrodb/tests/test_api/test_api_info.py +++ b/disdrodb/tests/test_api/test_api_info.py @@ -68,7 +68,7 @@ # valid_filepath = VALID_FNAME -@pytest.fixture() +@pytest.fixture def valid_filepath(tmp_path): # Create a valid filepath for testing filepath = tmp_path / VALID_FNAME @@ -76,7 +76,7 @@ def valid_filepath(tmp_path): return str(filepath) -@pytest.fixture() +@pytest.fixture def invalid_filepath(tmp_path): # Create an invalid filepath for testing filepath = tmp_path / INVALID_FNAME @@ -275,5 +275,5 @@ def test_get_end_time_from_filepaths(valid_filepath): def test_get_start_end_time_from_filepaths(valid_filepath): start_time, end_time = get_start_end_time_from_filepaths(valid_filepath) - assert np.array_equal(start_time, np.array([START_TIME])) - assert np.array_equal(end_time, np.array([END_TIME])) + assert np.array_equal(start_time, np.array([START_TIME]).astype("M8[s]")) + assert np.array_equal(end_time, np.array([END_TIME]).astype("M8[s]")) diff --git a/disdrodb/tests/test_api/test_api_path.py b/disdrodb/tests/test_api/test_api_path.py index b2146451..dd08141a 100644 --- a/disdrodb/tests/test_api/test_api_path.py +++ b/disdrodb/tests/test_api/test_api_path.py @@ -18,7 +18,6 @@ # -----------------------------------------------------------------------------. 
"""Test DISDRODB path.""" import datetime -import os import numpy as np import pandas as pd @@ -26,97 +25,59 @@ import xarray as xr from disdrodb.api.path import ( - define_campaign_dir, - define_l0a_filepath, - define_l0a_station_dir, - define_l0b_filepath, - define_l0b_station_dir, + # define_campaign_dir, + define_l0a_filename, + define_l0b_filename, + define_l0c_filename, ) -PROCESSED_FOLDER_WINDOWS = "\\DISDRODB\\Processed" -PROCESSED_FOLDER_LINUX = "/DISDRODB/Processed" +# PROCESSED_FOLDER_WINDOWS = "\\DISDRODB\\Processed" +# PROCESSED_FOLDER_LINUX = "/DISDRODB/Processed" -@pytest.mark.parametrize("processed_folder", [PROCESSED_FOLDER_WINDOWS, PROCESSED_FOLDER_LINUX]) -def test_define_l0a_station_dir(processed_folder): - res = ( - define_l0a_station_dir(processed_folder, "STATION_NAME") - .replace(processed_folder, "") - .replace("\\", "") - .replace("/", "") - ) - assert res == "L0ASTATION_NAME" - +# @pytest.mark.parametrize("processed_folder", [PROCESSED_FOLDER_WINDOWS, PROCESSED_FOLDER_LINUX]) +# def test_define_l0a_station_dir(processed_folder): +# res = ( +# define_l0a_station_dir(processed_folder, "STATION_NAME") +# .replace(processed_folder, "") +# .replace("\\", "") +# .replace("/", "") +# ) +# assert res == "L0ASTATION_NAME" -@pytest.mark.parametrize("processed_folder", [PROCESSED_FOLDER_WINDOWS, PROCESSED_FOLDER_LINUX]) -def test_define_l0b_station_dir(processed_folder): - res = ( - define_l0b_station_dir(processed_folder, "STATION_NAME") - .replace(processed_folder, "") - .replace("\\", "") - .replace("/", "") - ) - assert res == "L0BSTATION_NAME" - -def test_define_l0a_filepath(tmp_path): - from disdrodb.l0.standards import PRODUCT_VERSION +def test_define_l0a_filename(): + from disdrodb import PRODUCT_VERSION # Set variables product = "L0A" - base_dir = tmp_path / "DISDRODB" - data_source = "DATA_SOURCE" campaign_name = "CAMPAIGN_NAME" station_name = "STATION_NAME" start_date = datetime.datetime(2019, 3, 26, 0, 0, 0) end_date = datetime.datetime(2021, 2, 8, 0, 0, 0) - start_date_str = start_date.strftime("%Y%m%d%H%M%S") - end_date_str = end_date.strftime("%Y%m%d%H%M%S") - - # Set paths - processed_dir = define_campaign_dir( - base_dir=base_dir, - product=product, - data_source=data_source, - campaign_name=campaign_name, - ) # Create dataframe df = pd.DataFrame({"time": pd.date_range(start=start_date, end=end_date)}) - # Test the function - res = define_l0a_filepath(df, processed_dir, station_name) - # Define expected results - expected_name = ( - f"{product}.{campaign_name.upper()}.{station_name}.s{start_date_str}.e{end_date_str}.{PRODUCT_VERSION}.parquet" - ) - expected_path = os.path.join(processed_dir, product, station_name, expected_name) - assert res == expected_path + expected_name = f"{product}.CAMPAIGN_NAME.STATION_NAME.s20190326000000.e20210208000000.{PRODUCT_VERSION}.parquet" + # Test the function + res = define_l0a_filename(df, campaign_name, station_name) + assert res == expected_name -def test_define_l0b_filepath(tmp_path): - from disdrodb.l0.standards import PRODUCT_VERSION - # Set variables +@pytest.mark.parametrize("product", ["L0B", "L0C"]) +def test_define_l0b_filename(product): + from disdrodb import PRODUCT_VERSION - product = "L0B" - base_dir = tmp_path / "DISDRODB" - data_source = "DATA_SOURCE" + # Set variables campaign_name = "CAMPAIGN_NAME" station_name = "STATION_NAME" + sample_interval = 10 + sample_interval_str = "10S" start_date = datetime.datetime(2019, 3, 26, 0, 0, 0) end_date = datetime.datetime(2021, 2, 8, 0, 0, 0) - start_date_str = 
start_date.strftime("%Y%m%d%H%M%S") - end_date_str = end_date.strftime("%Y%m%d%H%M%S") - - # Set paths - processed_dir = define_campaign_dir( - base_dir=base_dir, - product=product, - data_source=data_source, - campaign_name=campaign_name, - ) # Create xarray object timesteps = pd.date_range(start=start_date, end=end_date) @@ -124,15 +85,17 @@ def test_define_l0b_filepath(tmp_path): ds = xr.DataArray( data=data, dims=["time"], - coords={"time": pd.date_range(start=start_date, end=end_date)}, + coords={"time": pd.date_range(start=start_date, end=end_date), "sample_interval": sample_interval}, ) - # Test the function - res = define_l0b_filepath(ds, processed_dir, station_name) - # Define expected results - expected_name = ( - f"{product}.{campaign_name.upper()}.{station_name}.s{start_date_str}.e{end_date_str}.{PRODUCT_VERSION}.nc" - ) - expected_path = os.path.join(processed_dir, product, station_name, expected_name) - assert res == expected_path + # TODO: MODIFY ! + if product == "L0B": + expected_name = f"{product}.CAMPAIGN_NAME.STATION_NAME.s20190326000000.e20210208000000.{PRODUCT_VERSION}.nc" + else: + expected_name = f"{product}.{sample_interval_str}.CAMPAIGN_NAME.STATION_NAME.s20190326000000.e20210208000000.{PRODUCT_VERSION}.nc" + + # Test the function + define_filename_func = define_l0b_filename if product == "L0B" else define_l0c_filename + res = define_filename_func(ds, campaign_name, station_name) + assert res == expected_name diff --git a/disdrodb/tests/test_data_transfer/test_data_transfer_scripts.py b/disdrodb/tests/test_data_transfer/test_data_transfer_scripts.py index 5b74d534..0526dcb9 100644 --- a/disdrodb/tests/test_data_transfer/test_data_transfer_scripts.py +++ b/disdrodb/tests/test_data_transfer/test_data_transfer_scripts.py @@ -20,10 +20,10 @@ from click.testing import CliRunner -from disdrodb.data_transfer.scripts.disdrodb_download_archive import disdrodb_download_archive -from disdrodb.data_transfer.scripts.disdrodb_download_station import disdrodb_download_station -from disdrodb.data_transfer.scripts.disdrodb_upload_archive import disdrodb_upload_archive -from disdrodb.data_transfer.scripts.disdrodb_upload_station import disdrodb_upload_station +from disdrodb.cli.disdrodb_download_archive import disdrodb_download_archive +from disdrodb.cli.disdrodb_download_station import disdrodb_download_station +from disdrodb.cli.disdrodb_upload_archive import disdrodb_upload_archive +from disdrodb.cli.disdrodb_upload_station import disdrodb_upload_station from disdrodb.tests.conftest import create_fake_metadata_file TEST_ZIP_FPATH = ( diff --git a/disdrodb/tests/test_issue/test_issue_checks.py b/disdrodb/tests/test_issue/test_issue_checks.py index c9beb94f..2bc3496d 100644 --- a/disdrodb/tests/test_issue/test_issue_checks.py +++ b/disdrodb/tests/test_issue/test_issue_checks.py @@ -40,10 +40,6 @@ def test__is_numpy_array_string(): arr = np.array(["foo", "bar"], dtype=np.str_) assert _is_numpy_array_string(arr) - # Test unicode array - arr = np.array(["foo", "bar"], dtype=np.unicode_) - assert _is_numpy_array_string(arr) - # Test nonstring array arr = np.array([1, 2, 3]) assert not _is_numpy_array_string(arr) diff --git a/disdrodb/tests/test_l0/test_check_readers.py b/disdrodb/tests/test_l0/test_check_readers.py index 5472e45d..2d79cb77 100644 --- a/disdrodb/tests/test_l0/test_check_readers.py +++ b/disdrodb/tests/test_l0/test_check_readers.py @@ -92,7 +92,7 @@ def _check_station_reader_results( campaign_name=campaign_name, station_name=station_name, force=True, - verbose=False, + 
verbose=True, debugging_mode=False, parallel=False, ) @@ -164,6 +164,8 @@ def test_check_all_readers(tmp_path) -> None: base_dir=test_base_dir, ) + # data_source, campaign_name, station_name = list_stations_info[0] + # data_source, campaign_name, station_name = list_stations_info[1] for data_source, campaign_name, station_name in list_stations_info: _check_station_reader_results( base_dir=test_base_dir, diff --git a/disdrodb/tests/test_l0/test_cmd_processing.py b/disdrodb/tests/test_l0/test_cmd_processing.py index 759edd27..7fd220cf 100644 --- a/disdrodb/tests/test_l0/test_cmd_processing.py +++ b/disdrodb/tests/test_l0/test_cmd_processing.py @@ -25,137 +25,430 @@ from click.testing import CliRunner from disdrodb import __root_path__ -from disdrodb.api.path import define_station_dir -from disdrodb.l0.scripts.disdrodb_run_l0 import disdrodb_run_l0 -from disdrodb.l0.scripts.disdrodb_run_l0_station import disdrodb_run_l0_station -from disdrodb.l0.scripts.disdrodb_run_l0a import disdrodb_run_l0a -from disdrodb.l0.scripts.disdrodb_run_l0a_station import disdrodb_run_l0a_station -from disdrodb.l0.scripts.disdrodb_run_l0b import disdrodb_run_l0b -from disdrodb.l0.scripts.disdrodb_run_l0b_station import disdrodb_run_l0b_station +from disdrodb.api.path import define_data_dir +from disdrodb.cli.disdrodb_run_l0 import disdrodb_run_l0 +from disdrodb.cli.disdrodb_run_l0_station import disdrodb_run_l0_station +from disdrodb.cli.disdrodb_run_l0a import disdrodb_run_l0a +from disdrodb.cli.disdrodb_run_l0a_station import disdrodb_run_l0a_station +from disdrodb.cli.disdrodb_run_l0b import disdrodb_run_l0b +from disdrodb.cli.disdrodb_run_l0b_station import disdrodb_run_l0b_station +from disdrodb.routines import ( + run_disdrodb_l0_station, + run_disdrodb_l0a, + run_disdrodb_l0a_station, + run_disdrodb_l0b, + run_disdrodb_l0b_station, +) from disdrodb.utils.directories import count_files BASE_DIR = os.path.join(__root_path__, "disdrodb", "tests", "data", "check_readers", "DISDRODB") DATA_SOURCE = "EPFL" CAMPAIGN_NAME = "PARSIVEL_2007" STATION_NAME = "10" +DEBUGGING_MODE = True +VERBOSE = False +FORCE = False +# test_base_dir = "/tmp/new/DISDRODB" +# shutil.copytree(BASE_DIR, test_base_dir) +# parallel = False + +@pytest.mark.parametrize("cli", [True, False]) @pytest.mark.parametrize("parallel", [True, False]) -def test_disdrodb_run_l0a_station(tmp_path, parallel): +def test_disdrodb_run_l0a_station(tmp_path, parallel, cli): """Test the disdrodb_run_l0a_station command.""" test_base_dir = tmp_path / "DISDRODB" shutil.copytree(BASE_DIR, test_base_dir) - runner = CliRunner() - runner.invoke( - disdrodb_run_l0a_station, - [DATA_SOURCE, CAMPAIGN_NAME, STATION_NAME, "--base_dir", str(test_base_dir), "--parallel", parallel], - ) - - station_dir = define_station_dir( + # Produce data + if cli: + runner = CliRunner() + runner.invoke( + disdrodb_run_l0a_station, + [ + DATA_SOURCE, + CAMPAIGN_NAME, + STATION_NAME, + "--base_dir", + test_base_dir, + "--parallel", + parallel, + "--debugging_mode", + DEBUGGING_MODE, + "--verbose", + VERBOSE, + "--force", + FORCE, + ], + ) + else: + run_disdrodb_l0a_station( + # Station arguments + data_source=DATA_SOURCE, + campaign_name=CAMPAIGN_NAME, + station_name=STATION_NAME, + # Processing options + parallel=parallel, + force=FORCE, + verbose=VERBOSE, + debugging_mode=DEBUGGING_MODE, + base_dir=test_base_dir, + ) + + # Check files are produced + data_dir = define_data_dir( base_dir=test_base_dir, product="L0A", data_source=DATA_SOURCE, campaign_name=CAMPAIGN_NAME, 
station_name=STATION_NAME, ) - assert count_files(station_dir, glob_pattern="*.parquet", recursive=True) > 0 + assert count_files(data_dir, glob_pattern="*.parquet", recursive=True) > 0 +@pytest.mark.parametrize("cli", [True, False]) @pytest.mark.parametrize("parallel", [True, False]) -def test_disdrodb_run_l0b_station(tmp_path, parallel): +def test_disdrodb_run_l0b_station(tmp_path, parallel, cli): """Test the disdrodb_run_l0b_station command.""" test_base_dir = tmp_path / "DISDRODB" shutil.copytree(BASE_DIR, test_base_dir) - runner = CliRunner() - runner.invoke( - disdrodb_run_l0a_station, - [DATA_SOURCE, CAMPAIGN_NAME, STATION_NAME, "--base_dir", test_base_dir, "--parallel", parallel], + # Produce data + if cli: + runner = CliRunner() + runner.invoke( + disdrodb_run_l0a_station, + [ + DATA_SOURCE, + CAMPAIGN_NAME, + STATION_NAME, + "--base_dir", + test_base_dir, + "--parallel", + parallel, + "--debugging_mode", + DEBUGGING_MODE, + "--verbose", + VERBOSE, + "--force", + FORCE, + ], + ) + runner.invoke( + disdrodb_run_l0b_station, + [ + DATA_SOURCE, + CAMPAIGN_NAME, + STATION_NAME, + "--base_dir", + test_base_dir, + "--parallel", + parallel, + "--debugging_mode", + DEBUGGING_MODE, + "--force", + FORCE, + ], + ) + else: + run_disdrodb_l0a_station( + # Station arguments + data_source=DATA_SOURCE, + campaign_name=CAMPAIGN_NAME, + station_name=STATION_NAME, + # Processing options + parallel=parallel, + force=FORCE, + verbose=VERBOSE, + debugging_mode=DEBUGGING_MODE, + base_dir=test_base_dir, + ) + + run_disdrodb_l0b_station( + # Station arguments + data_source=DATA_SOURCE, + campaign_name=CAMPAIGN_NAME, + station_name=STATION_NAME, + # Processing options + parallel=parallel, + force=FORCE, + verbose=VERBOSE, + debugging_mode=DEBUGGING_MODE, + base_dir=test_base_dir, + ) + + # Check files are produced + data_dir = define_data_dir( + base_dir=test_base_dir, + product="L0B", + data_source=DATA_SOURCE, + campaign_name=CAMPAIGN_NAME, + station_name=STATION_NAME, ) + assert count_files(data_dir, glob_pattern="*.nc", recursive=True) > 0 - runner.invoke( - disdrodb_run_l0b_station, - [DATA_SOURCE, CAMPAIGN_NAME, STATION_NAME, "--base_dir", test_base_dir, "--parallel", parallel], - ) - station_dir = define_station_dir( +@pytest.mark.parametrize("cli", [True, False]) +@pytest.mark.parametrize("parallel", [True, False]) +@pytest.mark.parametrize("verbose", [True, False]) +def test_disdrodb_run_l0_nc_station(tmp_path, verbose, parallel, cli): + """Test the disdrodb_run_l0_station process correctly raw netCDF files.""" + BASE_DIR = os.path.join(__root_path__, "disdrodb", "tests", "data", "check_readers", "DISDRODB") + DATA_SOURCE = "UK" + CAMPAIGN_NAME = "DIVEN" + STATION_NAME = "CAIRNGORM" + + test_base_dir = tmp_path / "DISDRODB" + shutil.copytree(BASE_DIR, test_base_dir) + + # Produce data + if cli: + runner = CliRunner() + runner.invoke( + disdrodb_run_l0_station, + [ + DATA_SOURCE, + CAMPAIGN_NAME, + STATION_NAME, + "--base_dir", + test_base_dir, + "--verbose", + verbose, + "--parallel", + parallel, + ], + ) + else: + run_disdrodb_l0_station( + # Station arguments + data_source=DATA_SOURCE, + campaign_name=CAMPAIGN_NAME, + station_name=STATION_NAME, + # Processing options + parallel=parallel, + force=FORCE, + verbose=VERBOSE, + debugging_mode=DEBUGGING_MODE, + base_dir=test_base_dir, + ) + + # Check files are produced + data_dir = define_data_dir( base_dir=test_base_dir, product="L0B", data_source=DATA_SOURCE, campaign_name=CAMPAIGN_NAME, station_name=STATION_NAME, ) - assert count_files(station_dir, 
glob_pattern="*.nc", recursive=True) > 0 + assert count_files(data_dir, glob_pattern="*.nc", recursive=True) > 0 +@pytest.mark.parametrize("cli", [True, False]) @pytest.mark.parametrize("verbose", [True, False]) -def test_disdrodb_run_l0_station(tmp_path, verbose): +def test_disdrodb_run_l0_station(tmp_path, verbose, cli): """Test the disdrodb_run_l0_station command.""" test_base_dir = tmp_path / "DISDRODB" shutil.copytree(BASE_DIR, test_base_dir) - runner = CliRunner() - runner.invoke( - disdrodb_run_l0_station, - [DATA_SOURCE, CAMPAIGN_NAME, STATION_NAME, "--base_dir", test_base_dir, "--verbose", verbose], - ) - - station_dir = define_station_dir( + # Produce data + if cli: + runner = CliRunner() + runner.invoke( + disdrodb_run_l0_station, + [ + DATA_SOURCE, + CAMPAIGN_NAME, + STATION_NAME, + "--base_dir", + test_base_dir, + "--verbose", + verbose, + "--parallel", + False, + "--debugging_mode", + DEBUGGING_MODE, + "--force", + FORCE, + ], + ) + else: + run_disdrodb_l0_station( + # Station arguments + data_source=DATA_SOURCE, + campaign_name=CAMPAIGN_NAME, + station_name=STATION_NAME, + # Processing options + parallel=False, + force=FORCE, + verbose=verbose, + debugging_mode=DEBUGGING_MODE, + base_dir=test_base_dir, + ) + + # Check files are produced + data_dir = define_data_dir( base_dir=test_base_dir, product="L0B", data_source=DATA_SOURCE, campaign_name=CAMPAIGN_NAME, station_name=STATION_NAME, ) - assert count_files(station_dir, glob_pattern="*.nc", recursive=True) > 0 + assert count_files(data_dir, glob_pattern="*.nc", recursive=True) > 0 -def test_disdrodb_run_l0a(tmp_path): +@pytest.mark.parametrize("cli", [True, False]) +def test_disdrodb_run_l0a(tmp_path, cli): """Test the disdrodb_run_l0a command.""" test_base_dir = tmp_path / "DISDRODB" shutil.copytree(BASE_DIR, test_base_dir) - runner = CliRunner() - runner.invoke(disdrodb_run_l0a, ["--base_dir", test_base_dir]) - station_dir = define_station_dir( + # Produce data + if cli: + runner = CliRunner() + runner.invoke( + disdrodb_run_l0a, + [ + "--base_dir", + test_base_dir, + "--data_sources", + DATA_SOURCE, + "--campaign_names", + CAMPAIGN_NAME, + "--station_names", + STATION_NAME, + "--verbose", + VERBOSE, + "--parallel", + False, + "--debugging_mode", + DEBUGGING_MODE, + "--force", + FORCE, + ], + ) + else: + run_disdrodb_l0a( + # Station arguments + data_sources=DATA_SOURCE, + campaign_names=CAMPAIGN_NAME, + station_names=STATION_NAME, + # Processing options + parallel=False, + force=FORCE, + verbose=VERBOSE, + debugging_mode=DEBUGGING_MODE, + base_dir=test_base_dir, + ) + + # Check files are produced + data_dir = define_data_dir( base_dir=test_base_dir, product="L0A", data_source=DATA_SOURCE, campaign_name=CAMPAIGN_NAME, station_name=STATION_NAME, ) - assert count_files(station_dir, glob_pattern="*.parquet", recursive=True) > 0 + assert count_files(data_dir, glob_pattern="*.parquet", recursive=True) > 0 -def test_disdrodb_run_l0b(tmp_path): +@pytest.mark.parametrize("cli", [True, False]) +def test_disdrodb_run_l0b(tmp_path, cli): """Test the disdrodb_run_l0b command.""" test_base_dir = tmp_path / "DISDRODB" shutil.copytree(BASE_DIR, test_base_dir) - runner = CliRunner() - runner.invoke(disdrodb_run_l0a, ["--base_dir", test_base_dir]) - - runner.invoke(disdrodb_run_l0b, ["--base_dir", test_base_dir]) - - station_dir = define_station_dir( + # Produce data + if cli: + runner = CliRunner() + runner.invoke( + disdrodb_run_l0a, + [ + "--base_dir", + test_base_dir, + "--data_sources", + DATA_SOURCE, + "--campaign_names", + 
CAMPAIGN_NAME, + "--station_names", + STATION_NAME, + "--verbose", + VERBOSE, + "--parallel", + False, + "--debugging_mode", + DEBUGGING_MODE, + "--force", + FORCE, + ], + ) + + runner.invoke( + disdrodb_run_l0b, + [ + "--base_dir", + test_base_dir, + "--data_sources", + DATA_SOURCE, + "--campaign_names", + CAMPAIGN_NAME, + "--station_names", + STATION_NAME, + "--verbose", + VERBOSE, + "--parallel", + False, + "--debugging_mode", + DEBUGGING_MODE, + "--force", + FORCE, + ], + ) + else: + run_disdrodb_l0a( + # Station arguments + data_sources=DATA_SOURCE, + campaign_names=CAMPAIGN_NAME, + station_names=STATION_NAME, + # Processing options + parallel=False, + force=FORCE, + verbose=VERBOSE, + debugging_mode=DEBUGGING_MODE, + base_dir=test_base_dir, + ) + run_disdrodb_l0b( + # Station arguments + data_sources=DATA_SOURCE, + campaign_names=CAMPAIGN_NAME, + station_names=STATION_NAME, + # Processing options + parallel=False, + force=FORCE, + verbose=VERBOSE, + debugging_mode=DEBUGGING_MODE, + base_dir=test_base_dir, + ) + + # Check files are produced + data_dir = define_data_dir( base_dir=test_base_dir, product="L0B", data_source=DATA_SOURCE, campaign_name=CAMPAIGN_NAME, station_name=STATION_NAME, ) - assert count_files(station_dir, glob_pattern="*.nc", recursive=True) > 0 + assert count_files(data_dir, glob_pattern="*.nc", recursive=True) > 0 @pytest.mark.parametrize("remove_l0a", [True, False]) @pytest.mark.parametrize("remove_l0b", [True, False]) -@pytest.mark.parametrize("l0b_concat", [True, False]) -def test_disdrodb_run_l0(tmp_path, remove_l0a, remove_l0b, l0b_concat): +def test_disdrodb_run_l0(tmp_path, remove_l0a, remove_l0b): """Test the disdrodb_run_l0b command.""" test_base_dir = tmp_path / "DISDRODB" shutil.copytree(BASE_DIR, test_base_dir) + # Produce data runner = CliRunner() runner.invoke( disdrodb_run_l0, @@ -168,75 +461,39 @@ def test_disdrodb_run_l0(tmp_path, remove_l0a, remove_l0b, l0b_concat): remove_l0a, "--remove_l0b", remove_l0b, - "--l0b_concat", - l0b_concat, ], ) - l0a_station_dir = define_station_dir( + # Check files are produced + l0a_data_dir = define_data_dir( base_dir=test_base_dir, product="L0A", data_source=DATA_SOURCE, campaign_name=CAMPAIGN_NAME, station_name=STATION_NAME, ) - l0b_station_dir = define_station_dir( + l0b_data_dir = define_data_dir( base_dir=test_base_dir, product="L0B", data_source=DATA_SOURCE, campaign_name=CAMPAIGN_NAME, station_name=STATION_NAME, ) - if remove_l0a: - assert count_files(l0a_station_dir, glob_pattern="*.parquet", recursive=True) == 0 - - if not remove_l0a: - assert count_files(l0a_station_dir, glob_pattern="*.parquet", recursive=True) > 0 - - if l0b_concat: - if remove_l0b: - assert count_files(l0b_station_dir, glob_pattern="*.nc", recursive=True) == 0 - else: - assert count_files(l0b_station_dir, glob_pattern="*.nc", recursive=True) > 0 - - # If not L0B concat, do not remove L0B also if remove_l0b is specified ! 
- if not l0b_concat and remove_l0b: - assert count_files(l0b_station_dir, glob_pattern="*.nc", recursive=True) > 0 - - -@pytest.mark.parametrize("parallel", [True, False]) -@pytest.mark.parametrize("verbose", [True, False]) -def test_disdrodb_run_l0_nc_station(tmp_path, verbose, parallel): - """Test the disdrodb_run_l0_station process correctly raw netCDF files.""" - BASE_DIR = os.path.join(__root_path__, "disdrodb", "tests", "data", "check_readers", "DISDRODB") - DATA_SOURCE = "UK" - CAMPAIGN_NAME = "DIVEN" - STATION_NAME = "CAIRNGORM" - - test_base_dir = tmp_path / "DISDRODB" - shutil.copytree(BASE_DIR, test_base_dir) - - runner = CliRunner() - runner.invoke( - disdrodb_run_l0_station, - [ - DATA_SOURCE, - CAMPAIGN_NAME, - STATION_NAME, - "--base_dir", - test_base_dir, - "--verbose", - verbose, - "--parallel", - parallel, - ], - ) - - station_dir = define_station_dir( + l0c_data_dir = define_data_dir( base_dir=test_base_dir, - product="L0B", + product="L0C", data_source=DATA_SOURCE, campaign_name=CAMPAIGN_NAME, station_name=STATION_NAME, ) - assert count_files(station_dir, glob_pattern="*.nc", recursive=True) > 0 + if remove_l0a: + assert count_files(l0a_data_dir, glob_pattern="*.parquet", recursive=True) == 0 + else: + assert count_files(l0a_data_dir, glob_pattern="*.parquet", recursive=True) > 0 + + if remove_l0b: + assert count_files(l0b_data_dir, glob_pattern="*.nc", recursive=True) == 0 + else: + assert count_files(l0b_data_dir, glob_pattern="*.nc", recursive=True) > 0 + + assert count_files(l0c_data_dir, glob_pattern="*.nc", recursive=True) > 0 diff --git a/disdrodb/tests/test_l0/test_io.py b/disdrodb/tests/test_l0/test_io.py index f598e4ac..6905a9a1 100644 --- a/disdrodb/tests/test_l0/test_io.py +++ b/disdrodb/tests/test_l0/test_io.py @@ -23,11 +23,11 @@ import pandas as pd import pytest +from disdrodb.api.io import get_filepaths from disdrodb.api.path import define_campaign_dir from disdrodb.l0.io import ( _check_glob_pattern, _read_l0a, - get_l0a_filepaths, get_raw_filepaths, read_l0a_dataframe, ) @@ -102,18 +102,14 @@ def test_get_l0a_filepaths(tmp_path): campaign_name = "CAMPAIGN_NAME" station_name = "STATION_NAME" - processed_dir = define_campaign_dir( - base_dir=base_dir, - product="L0A", - data_source=data_source, - campaign_name=campaign_name, - ) - # Test that the function raises an error if no files presenet with pytest.raises(ValueError): - get_l0a_filepaths( - processed_dir=processed_dir, + _ = get_filepaths( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, station_name=station_name, + product="L0A", ) # Add fake data files @@ -128,15 +124,24 @@ def test_get_l0a_filepaths(tmp_path): ) # Test that the function returns the correct number of files in debugging mode - filepaths = get_l0a_filepaths( - processed_dir=processed_dir, + filepaths = get_filepaths( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, station_name=station_name, + product="L0A", debugging_mode=True, ) assert len(filepaths) == 2 # max(2, 3) # Test that the function returns the correct number of files in normal mode - filepaths = get_l0a_filepaths(processed_dir=processed_dir, station_name=station_name) + filepaths = get_filepaths( + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + product="L0A", + ) assert len(filepaths) == 2 diff --git a/disdrodb/tests/test_l0/test_l0a_processing.py b/disdrodb/tests/test_l0/test_l0a_processing.py index b45decb1..c87059ad 100644 --- 
a/disdrodb/tests/test_l0/test_l0a_processing.py +++ b/disdrodb/tests/test_l0/test_l0a_processing.py @@ -151,7 +151,8 @@ def test_remove_corrupted_rows(): remove_corrupted_rows(pd.DataFrame()) # Test case 3: Check if the function raises ValueError when only one row remains - with pytest.raises(ValueError, match=r"Only 1 row remains after data corruption checks. Check the file."): + msg = r"Only 1 row remains after data corruption checks. Check the raw file and maybe delete it." + with pytest.raises(ValueError, match=msg): remove_corrupted_rows(pd.DataFrame({"raw_drop_number": ["1"]})) @@ -569,7 +570,7 @@ def test_write_l0a(tmp_path): # create dummy dataframe data = [{"a": "1", "b": "2", "c": "3"}, {"a": "2", "b": "2", "c": "3"}] df = pd.DataFrame(data).set_index("a") - df["time"] = pd.Timestamp.now() + df["time"] = pd.Timestamp.now().to_numpy().astype("M8[ns]") # open by default as [ns]. Now() returns as [us] # Write parquet file filepath = os.path.join(tmp_path, "fake_data_sample.parquet") diff --git a/disdrodb/tests/test_l0/test_l0b_concat.py b/disdrodb/tests/test_l0/test_l0b_concat.py deleted file mode 100644 index 2e4e34b6..00000000 --- a/disdrodb/tests/test_l0/test_l0b_concat.py +++ /dev/null @@ -1,362 +0,0 @@ -#!/usr/bin/env python3 - -# -----------------------------------------------------------------------------. -# Copyright (c) 2021-2023 DISDRODB developers -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -----------------------------------------------------------------------------. 
-"""Test DISDRODB L0B netCDF concatenation routines.""" - -import os - -import numpy as np -import pandas as pd -import pytest -import xarray as xr - -from disdrodb.api.path import define_campaign_dir -from disdrodb.l0.l0_processing import run_l0b_concat, run_l0b_concat_station -from disdrodb.l0.routines import run_disdrodb_l0b_concat -from disdrodb.tests.conftest import create_fake_metadata_file, create_fake_station_dir -from disdrodb.utils.directories import count_files, list_files -from disdrodb.utils.netcdf import xr_concat_datasets - - -def create_dummy_l0b_file(filepath: str, time): - # Define the size of the dimensions - n_lat = 10 - n_lon = 10 - - # Assign lat/lon coordinates - lat_data = np.linspace(-90, 90, n_lat, dtype=np.float32) - lon_data = np.linspace(-180, 180, n_lon, dtype=np.float32) - - # Define variable dictionary - data = np.random.rand(len(time), len(lat_data), len(lon_data)).astype(np.float32) - data_vars = { - "rainfall_rate_32bit": (("time", "lat", "lon"), data), - } - # Create the coordinate dictionary - coords_dict = { - "lat": ("lat", lat_data), - "lon": ("lon", lon_data), - "time": ("time", time), - } - # Create a dataset with dimensions lat, lon, and time - ds = xr.Dataset(data_vars, coords=coords_dict) - # Set global attribute - ds.attrs["sensor_name"] = "OTT_Parsivel" - - # Set variable attributes - ds["lat"].attrs["long_name"] = "latitude" - ds["lat"].attrs["units"] = "degrees_north" - ds["lon"].attrs["long_name"] = "longitude" - ds["lon"].attrs["units"] = "degrees_east" - ds["time"].attrs["long_name"] = "time" - # ds["time"].attrs["units"] = "days since 2023-01-01" - - # Write the dataset to a new NetCDF file - ds.to_netcdf(filepath) - ds.close() - return filepath - - -def test_xr_concat_datasets(tmp_path): - # Write L0B files - filepath1 = os.path.join(tmp_path, "test_1.nc") - filepath2 = os.path.join(tmp_path, "test_2.nc") - - time_data_1 = np.array(pd.date_range(start="2023-01-01", periods=3, freq="D")) - time_data_2 = np.array(pd.date_range(start="2023-01-04", periods=3, freq="D")) - - _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1) - _ = create_dummy_l0b_file(filepath=filepath2, time=time_data_2) - - # Check with file in correct orders - filepaths = [filepath1, filepath2] - ds = xr_concat_datasets(filepaths) - time_values = ds["time"].to_numpy() - assert len(time_values) == 6 - np.testing.assert_allclose(time_values.astype(float), np.concatenate((time_data_1, time_data_2)).astype(float)) - - # Check with file in reverse orders - filepaths = [filepath2, filepath1] - ds = xr_concat_datasets(filepaths) - time_values = ds["time"].to_numpy() - assert len(time_values) == 6 - np.testing.assert_allclose(time_values.astype(float), np.concatenate((time_data_1, time_data_2)).astype(float)) - - -def test_xr_concat_completely_overlapped_datasets(tmp_path): - # Write L0B files - filepath1 = os.path.join(tmp_path, "test_1.nc") - filepath2 = os.path.join(tmp_path, "test_2.nc") - filepath3 = os.path.join(tmp_path, "test_3.nc") - - time_data_1 = np.array(pd.date_range(start="2023-01-01", periods=6, freq="D")) - time_data_2 = np.array(pd.date_range(start="2023-01-04", periods=3, freq="D")) - - _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1) - _ = create_dummy_l0b_file(filepath=filepath2, time=time_data_2) - _ = create_dummy_l0b_file(filepath=filepath3, time=time_data_2[::-1]) - - # Check with file in correct orders - filepaths = [filepath1, filepath2] - ds = xr_concat_datasets(filepaths) - time_values = ds["time"].to_numpy() - assert 
len(time_values) == 6 - np.testing.assert_allclose(time_values.astype(float), time_data_1.astype(float)) - - # Check with file in reverse orders - filepaths = [filepath2, filepath1] - ds = xr_concat_datasets(filepaths) - time_values = ds["time"].to_numpy() - assert len(time_values) == 6 - np.testing.assert_allclose(time_values.astype(float), time_data_1.astype(float)) - - # Check if completely overlapped but reversed order - filepaths = [filepath2, filepath3] - ds = xr_concat_datasets(filepaths) - time_values = ds["time"].to_numpy() - assert len(time_values) == 3 - np.testing.assert_allclose(time_values.astype(float), time_data_2.astype(float)) - - -def test_xr_concat_completely_partial_overlapped_datasets(tmp_path): - # Write L0B files - filepath1 = os.path.join(tmp_path, "test_1.nc") - filepath2 = os.path.join(tmp_path, "test_2.nc") - - time_data_1 = np.array(pd.date_range(start="2023-01-01", periods=4, freq="D")) - time_data_2 = np.array(pd.date_range(start="2023-01-04", periods=3, freq="D")) - - unique_time_data = np.sort(np.unique(np.concatenate((time_data_1, time_data_2)))) - - _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1) - _ = create_dummy_l0b_file(filepath=filepath2, time=time_data_2) - - # Check with file in correct orders - filepaths = [filepath1, filepath2] - ds = xr_concat_datasets(filepaths) - time_values = ds["time"].to_numpy() - assert len(time_values) == 6 - np.testing.assert_allclose(time_values.astype(float), unique_time_data.astype(float)) - - # Check with file in reverse orders - filepaths = [filepath2, filepath1] - ds = xr_concat_datasets(filepaths) - time_values = ds["time"].to_numpy() - assert len(time_values) == 6 - np.testing.assert_allclose(time_values.astype(float), unique_time_data.astype(float)) - - -def test_run_l0b_concat(tmp_path): - # Define station info - base_dir = tmp_path / "DISDRODB" - data_source = "DATA_SOURCE" - campaign_name = "CAMPAIGN_NAME" - station_name = "test_station" - - processed_dir = define_campaign_dir( - base_dir=base_dir, - product="L0B", - data_source=data_source, - campaign_name=campaign_name, - ) - # Define fake L0B directory structure - station_dir = create_fake_station_dir( - base_dir=base_dir, - product="L0B", - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name, - ) - - # Add dummy L0B files - filepath1 = os.path.join(station_dir, "test_1.nc") - filepath2 = os.path.join(station_dir, "test_2.nc") - - time_data_1 = np.array([0.0, 1.0, 2.0], dtype=np.float64) - time_data_2 = np.array([3.0, 4.0, 5.0], dtype=np.float64) - - _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1) - _ = create_dummy_l0b_file(filepath=filepath2, time=time_data_2) - - # Monkey patch the write_l0b function - def mock_write_l0b(ds: xr.Dataset, filepath: str, force=False) -> None: - ds.to_netcdf(filepath, engine="netcdf4") - - from disdrodb.l0 import l0b_processing - - l0b_processing.write_l0b = mock_write_l0b - - # Run concatenation command - run_l0b_concat(processed_dir=processed_dir, station_name=station_name, verbose=False) - - # Assert only 1 file is created - list_concatenated_files = list_files(os.path.join(processed_dir, "L0B"), glob_pattern="*.nc", recursive=False) - assert len(list_concatenated_files) == 1 - - # Read concatenated netCDF file - ds = xr.open_dataset(list_concatenated_files[0]) - assert len(ds["time"].to_numpy()) == 6 - - -def test_run_l0b_concat_station(tmp_path): - # Define stations info - base_dir = tmp_path / "DISDRODB" - data_source = "DATA_SOURCE" - campaign_name = 
"CAMPAIGN_NAME" - station_name1 = "test_station_1" - - # Define fake directory structure for the two L0B stations - # # Define fake L0B directory structure - station1_dir = create_fake_station_dir( - base_dir=base_dir, - product="L0B", - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name1, - ) - _ = create_fake_metadata_file( - base_dir=base_dir, - product="L0B", - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name1, - ) - - # Add dummy L0B files for two stations - filepath1 = os.path.join(station1_dir, f"{station_name1}_file.nc") - time_data_1 = np.array([0.0, 1.0, 2.0], dtype=np.float64) - - _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1) - - # Run concatenation command - run_l0b_concat_station( - base_dir=str(base_dir), - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name1, - remove_l0b=True, - verbose=False, - ) - - # Assert files where removed - assert not os.path.exists(filepath1) - - # Assert the presence of 2 concatenated netcdf files (one for each station) - processed_dir = define_campaign_dir( - base_dir=base_dir, - product="L0B", - data_source=data_source, - campaign_name=campaign_name, - ) - - assert count_files(os.path.join(processed_dir, "L0B"), glob_pattern="*.nc", recursive=False) == 1 - - # Check that if L0B files are removed, raise error if no stations available - with pytest.raises(ValueError): - run_l0b_concat_station( - base_dir=str(base_dir), - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name1, - remove_l0b=True, - verbose=False, - ) - - -def test_run_disdrodb_l0b_concat(tmp_path): - # Define stations info - base_dir = tmp_path / "DISDRODB" - data_source = "DATA_SOURCE" - campaign_name = "CAMPAIGN_NAME" - station_name1 = "test_station_1" - station_name2 = "test_station_2" - - # Define fake directory structure for the two L0B stations - # # Define fake L0B directory structure - station1_dir = create_fake_station_dir( - base_dir=base_dir, - product="L0B", - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name1, - ) - station2_dir = create_fake_station_dir( - base_dir=base_dir, - product="L0B", - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name2, - ) - _ = create_fake_metadata_file( - base_dir=base_dir, - product="L0B", - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name1, - ) - _ = create_fake_metadata_file( - base_dir=base_dir, - product="L0B", - data_source=data_source, - campaign_name=campaign_name, - station_name=station_name2, - ) - # Add dummy L0B files for two stations - filepath1 = os.path.join(station1_dir, f"{station_name1}_file.nc") - filepath2 = os.path.join(station2_dir, f"{station_name2}_file.nc") - - time_data_1 = np.array([0.0, 1.0, 2.0], dtype=np.float64) - time_data_2 = np.array([3.0, 4.0, 5.0], dtype=np.float64) - - _ = create_dummy_l0b_file(filepath=filepath1, time=time_data_1) - _ = create_dummy_l0b_file(filepath=filepath2, time=time_data_2) - - # Run concatenation command - run_disdrodb_l0b_concat( - base_dir=str(base_dir), - data_sources=data_source, - campaign_names=campaign_name, - station_names=[station_name1, station_name2], - remove_l0b=True, - verbose=False, - ) - - # Assert files where removed - assert not os.path.exists(filepath1) - assert not os.path.exists(filepath2) - - # Assert the presence of 2 concatenated netcdf files (one for each station) - processed_dir = define_campaign_dir( - 
base_dir=base_dir, - product="L0B", - data_source=data_source, - campaign_name=campaign_name, - ) - - assert count_files(os.path.join(processed_dir, "L0B"), glob_pattern="*.nc", recursive=False) == 2 - - # Check that if L0B files are removed, raise error if no stations available - with pytest.raises(ValueError): - run_disdrodb_l0b_concat( - base_dir=str(base_dir), - data_sources=data_source, - campaign_names=campaign_name, - station_names=[station_name1, station_name2], - remove_l0b=True, - verbose=False, - ) diff --git a/disdrodb/tests/test_l0/test_l0b_processing.py b/disdrodb/tests/test_l0/test_l0b_processing.py index 89ad123a..f01413b4 100644 --- a/disdrodb/tests/test_l0/test_l0b_processing.py +++ b/disdrodb/tests/test_l0/test_l0b_processing.py @@ -26,8 +26,6 @@ from disdrodb.l0 import l0b_processing from disdrodb.l0.l0b_processing import ( - _set_attrs_dict, - _set_coordinate_attributes, _set_variable_attributes, add_dataset_crs_coords, create_l0b_from_l0a, @@ -168,43 +166,6 @@ def test_add_dataset_crs_coords(): assert ds_out["crs"].to_numpy() == "WGS84" -def test_set_attrs_dict(): - ds = xr.Dataset({"var1": xr.DataArray([1, 2, 3], dims="time")}) - attrs_dict = {"var1": {"attr1": "value1"}} - ds = _set_attrs_dict(ds, attrs_dict) - assert ds["var1"].attrs["attr1"] == "value1" - - attrs_dict = {"var2": {"attr1": "value1"}} - ds = _set_attrs_dict(ds, attrs_dict) - assert "var2" not in ds - - attrs_dict = {"var1": {"attr1": "value1"}, "var2": {"attr2": "value2"}} - ds = _set_attrs_dict(ds, attrs_dict) - assert ds["var1"].attrs["attr1"] == "value1" - assert "var2" not in ds - - -def test__set_coordinate_attributes(): - # Create example dataset - ds = xr.Dataset( - { - "var1": xr.DataArray([1, 2, 3], dims="time"), - "lat": xr.DataArray([0, 1, 2], dims="time"), - "lon": xr.DataArray([0, 1, 2], dims="time"), - }, - ) - ds.lat.attrs["units"] = "degrees_north" - ds.lon.attrs["units"] = "degrees_east" - - # Call the function and check the output - ds_out = _set_coordinate_attributes(ds) - assert "units" in ds_out["lat"].attrs - assert ds_out["lat"].attrs["units"] == "degrees_north" - assert "units" in ds_out["lon"].attrs - assert ds_out["lon"].attrs["units"] == "degrees_east" - assert "units" not in ds_out["var1"].attrs - - def test__set_variable_attributes(mocker): # Create a sample dataset data = np.random.rand(10, 10) @@ -472,79 +433,3 @@ def test__convert_object_variables_to_string(): # Check that variable 'b' is of type 'float' assert ds["b"].dtype == "float" - - -@pytest.fixture() -def encoding_dict_1(): - # create a test encoding dictionary - return { - "var1": {"dtype": "float32", "chunksizes": (10, 10, 10)}, - "var2": {"dtype": "int16", "chunksizes": (5, 5, 5)}, - "var3": {"dtype": "float64", "chunksizes": (100, 100, 100)}, - } - - -@pytest.fixture() -def encoding_dict_2(): - # create a test encoding dictionary - return { - "var1": {"dtype": "float32", "chunksizes": (100, 100, 100)}, - "var2": {"dtype": "int16", "chunksizes": (100, 100, 100)}, - "var3": {"dtype": "float64", "chunksizes": (100, 100, 100)}, - } - - -@pytest.fixture() -def ds(): - # create a test xr.Dataset - data = { - "var1": (["time", "x", "y"], np.random.random((10, 20, 30))), - "var2": (["time", "x", "y"], np.random.randint(0, 10, size=(10, 20, 30))), - "var3": (["time", "x", "y"], np.random.random((10, 20, 30))), - } - coords = {"time": np.arange(10), "x": np.arange(20), "y": np.arange(30)} - return xr.Dataset(data, coords) - - -def test_sanitize_encodings_dict(encoding_dict_1, encoding_dict_2, ds): - result = 
l0b_processing.sanitize_encodings_dict(encoding_dict_1, ds) - - assert isinstance(result, dict) - - # Test that the dictionary contains the same keys as the input dictionary - assert set(result.keys()) == set(encoding_dict_1.keys()) - - # Test that the chunk sizes in the returned dictionary are smaller than or equal to the corresponding array shapes - # in the dataset - for var in result: - assert tuple(result[var]["chunksizes"]) <= ds[var].shape - - result = l0b_processing.sanitize_encodings_dict(encoding_dict_2, ds) - - assert isinstance(result, dict) - - # Test that the dictionary contains the same keys as the input dictionary - assert set(result.keys()) == set(encoding_dict_2.keys()) - - # Test that the chunk sizes in the returned dictionary are smaller than or equal to the corresponding array shapes - # in the dataset - for var in result: - assert tuple(result[var]["chunksizes"]) <= ds[var].shape - - -def test_rechunk_dataset(): - # Create a sample xarray dataset - data = { - "a": (["x", "y"], [[1, 2, 3], [4, 5, 6]]), - "b": (["x", "y"], [[7, 8, 9], [10, 11, 12]]), - } - coords = {"x": [0, 1], "y": [0, 1, 2]} - ds = xr.Dataset(data, coords=coords) - - # Define the encoding dictionary - encoding_dict = {"a": {"chunksizes": (1, 2)}, "b": {"chunksizes": (2, 1)}} - - # Test the rechunk_dataset function - ds_rechunked = l0b_processing.rechunk_dataset(ds, encoding_dict) - assert ds_rechunked["a"].chunks == ((1, 1), (2, 1)) - assert ds_rechunked["b"].chunks == ((2,), (1, 1, 1)) diff --git a/disdrodb/tests/test_l0/test_standards.py b/disdrodb/tests/test_l0/test_standards.py index 10b8436a..ab54864d 100644 --- a/disdrodb/tests/test_l0/test_standards.py +++ b/disdrodb/tests/test_l0/test_standards.py @@ -33,7 +33,6 @@ get_l0a_encodings_dict, get_n_velocity_bins, get_nan_flags_dict, - get_time_encoding, get_valid_coordinates_names, get_valid_dimension_names, get_valid_names, @@ -105,10 +104,6 @@ def test_get_valid_names(sensor_name): assert isinstance(get_valid_names(sensor_name), list) -def test_get_time_encoding(): - assert isinstance(get_time_encoding(), dict) - - def test_get_n_velocity_bins(): # Impact disdrometer sensor_name = "RD_80" diff --git a/disdrodb/tests/test_utils/test_utils_attrs.py b/disdrodb/tests/test_utils/test_utils_attrs.py new file mode 100644 index 00000000..b592c96a --- /dev/null +++ b/disdrodb/tests/test_utils/test_utils_attrs.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. 
+"""Test DISDRODB netCDF4 attributes utilities.""" +import xarray as xr + +from disdrodb.utils.attrs import set_attrs, set_coordinate_attributes + + +def test_set_attrs(): + ds = xr.Dataset({"var1": xr.DataArray([1, 2, 3], dims="time")}) + attrs_dict = {"var1": {"attr1": "value1"}} + ds = set_attrs(ds, attrs_dict) + assert ds["var1"].attrs["attr1"] == "value1" + + attrs_dict = {"var2": {"attr1": "value1"}} + ds = set_attrs(ds, attrs_dict) + assert "var2" not in ds + + attrs_dict = {"var1": {"attr1": "value1"}, "var2": {"attr2": "value2"}} + ds = set_attrs(ds, attrs_dict) + assert ds["var1"].attrs["attr1"] == "value1" + assert "var2" not in ds + + +def test_set_coordinate_attributes(): + # Create example dataset + ds = xr.Dataset( + { + "var1": xr.DataArray([1, 2, 3], dims="time"), + "lat": xr.DataArray([0, 1, 2], dims="time"), + "lon": xr.DataArray([0, 1, 2], dims="time"), + }, + ) + ds.lat.attrs["units"] = "degrees_north" + ds.lon.attrs["units"] = "degrees_east" + + # Call the function and check the output + ds_out = set_coordinate_attributes(ds) + assert "units" in ds_out["lat"].attrs + assert ds_out["lat"].attrs["units"] == "degrees_north" + assert "units" in ds_out["lon"].attrs + assert ds_out["lon"].attrs["units"] == "degrees_east" + assert "units" not in ds_out["var1"].attrs diff --git a/disdrodb/tests/test_utils/test_utils_encoding.py b/disdrodb/tests/test_utils/test_utils_encoding.py new file mode 100644 index 00000000..0af75882 --- /dev/null +++ b/disdrodb/tests/test_utils/test_utils_encoding.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. 
+"""Test DISDRODB netCDF4 encoding utilities.""" +import numpy as np +import pytest +import xarray as xr + +from disdrodb.utils.encoding import get_time_encoding, rechunk_dataset, sanitize_encodings_dict + + +def test_rechunk_dataset(): + # Create a sample xarray dataset + data = { + "a": (["x", "y"], [[1, 2, 3], [4, 5, 6]]), + "b": (["x", "y"], [[7, 8, 9], [10, 11, 12]]), + } + coords = {"x": [0, 1], "y": [0, 1, 2]} + ds = xr.Dataset(data, coords=coords) + + # Define the encoding dictionary + encoding_dict = {"a": {"chunksizes": (1, 2)}, "b": {"chunksizes": (2, 1)}} + + # Test the rechunk_dataset function + ds_rechunked = rechunk_dataset(ds, encoding_dict) + assert ds_rechunked["a"].chunks == ((1, 1), (2, 1)) + assert ds_rechunked["b"].chunks == ((2,), (1, 1, 1)) + + +@pytest.fixture +def encoding_dict_1(): + # create a test encoding dictionary + return { + "var1": {"dtype": "float32", "chunksizes": (10, 10, 10)}, + "var2": {"dtype": "int16", "chunksizes": (5, 5, 5)}, + "var3": {"dtype": "float64", "chunksizes": (100, 100, 100)}, + } + + +@pytest.fixture +def encoding_dict_2(): + # create a test encoding dictionary + return { + "var1": {"dtype": "float32", "chunksizes": (100, 100, 100)}, + "var2": {"dtype": "int16", "chunksizes": (100, 100, 100)}, + "var3": {"dtype": "float64", "chunksizes": (100, 100, 100)}, + } + + +@pytest.fixture +def ds(): + # create a test xr.Dataset + data = { + "var1": (["time", "x", "y"], np.random.random((10, 20, 30))), + "var2": (["time", "x", "y"], np.random.randint(0, 10, size=(10, 20, 30))), + "var3": (["time", "x", "y"], np.random.random((10, 20, 30))), + } + coords = {"time": np.arange(10), "x": np.arange(20), "y": np.arange(30)} + return xr.Dataset(data, coords) + + +def test_sanitize_encodings_dict(encoding_dict_1, encoding_dict_2, ds): + result = sanitize_encodings_dict(encoding_dict_1, ds) + + assert isinstance(result, dict) + + # Test that the dictionary contains the same keys as the input dictionary + assert set(result.keys()) == set(encoding_dict_1.keys()) + + # Test that the chunk sizes in the returned dictionary are smaller than or equal to the corresponding array shapes + # in the dataset + for var in result: + assert tuple(result[var]["chunksizes"]) <= ds[var].shape + + result = sanitize_encodings_dict(encoding_dict_2, ds) + + assert isinstance(result, dict) + + # Test that the dictionary contains the same keys as the input dictionary + assert set(result.keys()) == set(encoding_dict_2.keys()) + + # Test that the chunk sizes in the returned dictionary are smaller than or equal to the corresponding array shapes + # in the dataset + for var in result: + assert tuple(result[var]["chunksizes"]) <= ds[var].shape + + +def test_get_time_encoding(): + assert isinstance(get_time_encoding(), dict) diff --git a/disdrodb/tests/test_utils/test_utils_logger.py b/disdrodb/tests/test_utils/test_utils_logger.py index d9438935..f72c86a1 100644 --- a/disdrodb/tests/test_utils/test_utils_logger.py +++ b/disdrodb/tests/test_utils/test_utils_logger.py @@ -22,10 +22,11 @@ import pytest +from disdrodb.api.path import define_campaign_dir, define_logs_dir from disdrodb.utils.logger import ( close_logger, - create_file_logger, - define_summary_log, + create_logger_file, + create_product_logs, log_debug, log_error, log_info, @@ -40,20 +41,42 @@ def create_dummy_log_file(filepath, contents): return filepath -def test_define_summary_log(tmp_path): +def test_create_product_logs(tmp_path): + test_base_dir = tmp_path / "DISDRODB" + data_source = "DATA_SOURCE" + campaign_name = 
"CAMPAIGN_NAME" station_name = "STATION_NAME" - logs_dir = tmp_path / "PRODUCT" / "logs" - logs_dir.mkdir(parents=True) - - logs_station_dir = logs_dir / station_name - logs_station_dir.mkdir(parents=True, exist_ok=True) - - log1_fpath = logs_station_dir / "log1.log" - log2_fpath = logs_station_dir / "log2.log" + product = "L0A" + + # Define directory where logs files are saved + logs_dir = define_logs_dir( + product=product, + base_dir=test_base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + os.makedirs(logs_dir, exist_ok=True) + + # Define paths of logs files + log1_fpath = os.path.join(logs_dir, "log1.log") + log2_fpath = os.path.join(logs_dir, "log2.log") + + # Define /summary and /problem directory + campaign_dir = define_campaign_dir( + base_dir=test_base_dir, + product=product, + data_source=data_source, + campaign_name=campaign_name, + ) + logs_summary_dir = os.path.join(campaign_dir, "logs", "summary") + logs_problem_dir = os.path.join(campaign_dir, "logs", "problems") - summary_log_path = logs_dir / f"logs_summary_{station_name}.log" - problem_log_path = logs_dir / f"logs_problem_{station_name}.log" + # Define summary and problem filepath + summary_log_path = os.path.join(logs_summary_dir, f"SUMMARY.{product}.{campaign_name}.{station_name}.log") + problem_log_path = os.path.join(logs_problem_dir, f"PROBLEMS.{product}.{campaign_name}.{station_name}.log") + ####-------------------------------------. # Create dummy log files log_contents1 = ( "INFO: DUMMY MESSAGE \nProcess has started \nWARNING: Potential issue detected \nNOTHING TO SUMMARIZE \n" @@ -65,15 +88,25 @@ def test_define_summary_log(tmp_path): # Call the function with the list of log files list_logs = [str(log_file1), str(log_file2)] - define_summary_log(list_logs) + create_product_logs( + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=test_base_dir, + # Logs list + list_logs=list_logs, + ) # Check summary log file with open(str(summary_log_path)) as f: summary_contents = f.read() - assert "WARNING: Potential issue detected" in summary_contents - assert "ERROR: Critical failure occurred" in summary_contents + assert "Process has started" in summary_contents assert "Process has ended" in summary_contents + assert "WARNING: Potential issue detected" in summary_contents + assert "ERROR: Critical failure occurred" in summary_contents + assert "INFO: DUMMY MESSAGE" not in summary_contents assert "NOTHING TO SUMMARIZE" not in summary_contents @@ -91,32 +124,63 @@ def test_define_summary_log(tmp_path): def test_define_summary_log_when_no_problems(tmp_path): """Test that not problem log file is created if no errors occurs.""" + test_base_dir = tmp_path / "DISDRODB" + data_source = "DATA_SOURCE" + campaign_name = "CAMPAIGN_NAME" station_name = "STATION_NAME" - logs_dir = tmp_path / "PRODUCT" / "logs" - logs_dir.mkdir(parents=True) - - logs_station_dir = logs_dir / station_name - logs_station_dir.mkdir(parents=True, exist_ok=True) - - log1_fpath = logs_station_dir / "log1.log" - log2_fpath = logs_station_dir / "log2.log" + product = "L0A" + + # Define directory where logs files are saved + logs_dir = define_logs_dir( + product=product, + base_dir=test_base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + ) + os.makedirs(logs_dir, exist_ok=True) + + # Define paths of logs files + log1_fpath = os.path.join(logs_dir, "log1.log") + log2_fpath = os.path.join(logs_dir, 
"log2.log") + + # Define /summary and /problem directory + campaign_dir = define_campaign_dir( + base_dir=test_base_dir, + product=product, + data_source=data_source, + campaign_name=campaign_name, + ) + logs_summary_dir = os.path.join(campaign_dir, "logs", "summary") + logs_problem_dir = os.path.join(campaign_dir, "logs", "problems") - summary_log_path = logs_dir / f"logs_summary_{station_name}.log" - problem_log_path = logs_dir / f"logs_problem_{station_name}.log" + # Define summary and problem filepath + summary_log_path = os.path.join(logs_summary_dir, f"SUMMARY.{product}.{campaign_name}.{station_name}.log") + problem_log_path = os.path.join(logs_problem_dir, f"PROBLEMS.{product}.{campaign_name}.{station_name}.log") + ####-------------------------------------. # Check that if no problems, the problems log is not created log_contents1 = "INFO: DUMMY MESSAGE \nProcess has started \n Process has ended \n" log_contents2 = "INFO: DUMMY MESSAGE \nProcess has started \n Process has ended \n" log_file1 = create_dummy_log_file(log1_fpath, log_contents1) log_file2 = create_dummy_log_file(log2_fpath, log_contents2) - list_logs = [str(log_file1), str(log_file2)] - define_summary_log(list_logs) + list_logs = [str(log_file1), str(log_file2)] # noqa + + # List logs direc + create_product_logs( + product=product, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + base_dir=test_base_dir, + list_logs=None, # search for logs based on inputs + ) assert os.path.exists(summary_log_path) assert not os.path.exists(problem_log_path) -@pytest.fixture() +@pytest.fixture def test_logger(): logger = logging.getLogger("test_logger") logger.setLevel(logging.DEBUG) # Capture all log levels @@ -155,7 +219,7 @@ def test_log_error(caplog, test_logger, capfd): assert " - Error message" in out -@pytest.fixture() +@pytest.fixture def log_environment(tmp_path): processed_dir = tmp_path / "processed" os.makedirs(processed_dir, exist_ok=True) @@ -165,9 +229,10 @@ def log_environment(tmp_path): return processed_dir, product, station_name, filename -def test_create_file_logger_paralle_false(log_environment): +def test_create_logger_file_paralle_false(log_environment): processed_dir, product, station_name, filename = log_environment - logger = create_file_logger(str(processed_dir), product, station_name, filename, parallel=False) + logs_dir = os.path.join(str(processed_dir), "logs", product, station_name) + logger, logger_filepath = create_logger_file(logs_dir, filename, parallel=False) assert isinstance(logger, logging.Logger) @@ -193,6 +258,7 @@ def test_create_file_logger_paralle_false(log_environment): def test_close_logger(log_environment): processed_dir, product, station_name, filename = log_environment - logger = create_file_logger(str(processed_dir), product, station_name, filename, parallel=False) + logs_dir = os.path.join(str(processed_dir), "logs", product, station_name) + logger, logger_filepath = create_logger_file(logs_dir, filename, parallel=False) close_logger(logger) assert not logger.handlers diff --git a/disdrodb/tests/test_utils/test_utils_scripts.py b/disdrodb/tests/test_utils/test_utils_scripts.py index 240b4a59..27e41c7f 100644 --- a/disdrodb/tests/test_utils/test_utils_scripts.py +++ b/disdrodb/tests/test_utils/test_utils_scripts.py @@ -16,9 +16,9 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . # -----------------------------------------------------------------------------. 
-"""Test DISDRODB scripts utility.""" +"""Test DISDRODB command-line interface scripts utilities.""" -from disdrodb.utils.scripts import parse_arg_to_list, parse_base_dir +from disdrodb.utils.cli import parse_arg_to_list, parse_base_dir def test_parse_arg_to_list_empty_string(): diff --git a/disdrodb/utils/__init__.py b/disdrodb/utils/__init__.py index e69de29b..9fe0f797 100644 --- a/disdrodb/utils/__init__.py +++ b/disdrodb/utils/__init__.py @@ -0,0 +1,17 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""DISDRODB Utils Module.""" diff --git a/disdrodb/utils/attrs.py b/disdrodb/utils/attrs.py new file mode 100644 index 00000000..c52ade13 --- /dev/null +++ b/disdrodb/utils/attrs.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 + +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""DISDRODB netCDF4 attributes utilities.""" +import datetime + +from disdrodb import CONVENTIONS, PRODUCT_VERSION, SOFTWARE_VERSION + +####---------------------------------------------------------------------. +#### Variable attributes + + +def set_attrs(ds, attrs_dict): + """Set attributes to the variables of the xr.Dataset.""" + for var in attrs_dict: + if var in ds: + ds[var].attrs.update(attrs_dict[var]) + return ds + + +####---------------------------------------------------------------------. 
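+# Minimal usage sketch (editor's illustration, not part of the patch): ``set_attrs``
+# only updates attributes of variables already present in the dataset; keys of
+# ``attrs_dict`` that are missing from ``ds`` are silently skipped, e.g.:
+#
+#   import xarray as xr
+#   ds = xr.Dataset({"var1": xr.DataArray([1, 2, 3], dims="time")})
+#   ds = set_attrs(ds, {"var1": {"units": "mm"}, "var2": {"units": "mm"}})
+#   assert ds["var1"].attrs["units"] == "mm"
+#   assert "var2" not in ds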
+#### Coordinates attributes
+
+
+def get_coords_attrs_dict():
+    """Return dictionary with DISDRODB coordinates attributes."""
+    attrs_dict = {}
+    # Define diameter attributes
+    attrs_dict["diameter_bin_center"] = {
+        "name": "diameter_bin_center",
+        "standard_name": "diameter_bin_center",
+        "long_name": "diameter_bin_center",
+        "units": "mm",
+        "description": "Bin center drop diameter value",
+    }
+    attrs_dict["diameter_bin_width"] = {
+        "name": "diameter_bin_width",
+        "standard_name": "diameter_bin_width",
+        "long_name": "diameter_bin_width",
+        "units": "mm",
+        "description": "Drop diameter bin width",
+    }
+    attrs_dict["diameter_bin_upper"] = {
+        "name": "diameter_bin_upper",
+        "standard_name": "diameter_bin_upper",
+        "long_name": "diameter_bin_upper",
+        "units": "mm",
+        "description": "Bin upper bound drop diameter value",
+    }
+    attrs_dict["diameter_bin_lower"] = {
+        "name": "diameter_bin_lower",
+        "standard_name": "diameter_bin_lower",
+        "long_name": "diameter_bin_lower",
+        "units": "mm",
+        "description": "Bin lower bound drop diameter value",
+    }
+    # Define velocity attributes
+    attrs_dict["velocity_bin_center"] = {
+        "name": "velocity_bin_center",
+        "standard_name": "velocity_bin_center",
+        "long_name": "velocity_bin_center",
+        "units": "m/s",
+        "description": "Bin center drop fall velocity value",
+    }
+    attrs_dict["velocity_bin_width"] = {
+        "name": "velocity_bin_width",
+        "standard_name": "velocity_bin_width",
+        "long_name": "velocity_bin_width",
+        "units": "m/s",
+        "description": "Drop fall velocity bin width",
+    }
+    attrs_dict["velocity_bin_upper"] = {
+        "name": "velocity_bin_upper",
+        "standard_name": "velocity_bin_upper",
+        "long_name": "velocity_bin_upper",
+        "units": "m/s",
+        "description": "Bin upper bound drop fall velocity value",
+    }
+    attrs_dict["velocity_bin_lower"] = {
+        "name": "velocity_bin_lower",
+        "standard_name": "velocity_bin_lower",
+        "long_name": "velocity_bin_lower",
+        "units": "m/s",
+        "description": "Bin lower bound drop fall velocity value",
+    }
+    # Define geolocation attributes
+    attrs_dict["latitude"] = {
+        "name": "latitude",
+        "standard_name": "latitude",
+        "long_name": "Latitude",
+        "units": "degrees_north",
+    }
+    attrs_dict["longitude"] = {
+        "name": "longitude",
+        "standard_name": "longitude",
+        "long_name": "Longitude",
+        "units": "degrees_east",
+    }
+    attrs_dict["altitude"] = {
+        "name": "altitude",
+        "standard_name": "altitude",
+        "long_name": "Altitude",
+        "units": "m",
+        "description": "Elevation above sea level",
+    }
+    # Define time attributes
+    attrs_dict["time"] = {
+        "name": "time",
+        "standard_name": "time",
+        "long_name": "time",
+        "description": "UTC Time",
+    }
+
+    return attrs_dict
+
+
+def set_coordinate_attributes(ds):
+    """Set coordinate attributes."""
+    # Get attributes dictionary
+    attrs_dict = get_coords_attrs_dict()
+    # Set attributes
+    ds = set_attrs(ds, attrs_dict)
+    return ds
+
+
+####-------------------------------------------------------------------------.
+#### DISDRODB Global Attributes
+
+
+def set_disdrodb_attrs(ds, product: str):
+    """Add DISDRODB processing information to the netCDF global attributes.
+
+    It assumes station metadata have already been added to the dataset.
+
+    Parameters
+    ----------
+    ds : xarray.Dataset
+        Dataset
+    product : str
+        DISDRODB product.
+
+    Returns
+    -------
+    xarray.Dataset
+        Dataset.
+    """
+    # Add dataset conventions
+    ds.attrs["Conventions"] = CONVENTIONS
+
+    # Add featureType
+    if "platform_type" in ds.attrs:
+        platform_type = ds.attrs["platform_type"]
+        if platform_type == "fixed":
+            ds.attrs["featureType"] = "timeSeries"
+        else:
+            ds.attrs["featureType"] = "trajectory"
+
+    # Update DISDRODB attributes
+    ds = update_disdrodb_attrs(ds=ds, product=product)
+    return ds
+
+
+def update_disdrodb_attrs(ds, product: str):
+    """Add DISDRODB processing information to the netCDF global attributes.
+
+    It assumes station metadata have already been added to the dataset.
+
+    Parameters
+    ----------
+    ds : xarray.Dataset
+        Dataset
+    product : str
+        DISDRODB product.
+
+    Returns
+    -------
+    xarray.Dataset
+        Dataset.
+    """
+    # Add time_coverage_start and time_coverage_end
+    ds.attrs["time_coverage_start"] = str(ds["time"].data[0])
+    ds.attrs["time_coverage_end"] = str(ds["time"].data[-1])
+
+    # DISDRODB attributes
+    # - Add DISDRODB processing info
+    now = datetime.datetime.utcnow()
+    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
+    ds.attrs["disdrodb_processing_date"] = current_time
+    # - Add DISDRODB product and version
+    ds.attrs["disdrodb_product_version"] = PRODUCT_VERSION
+    ds.attrs["disdrodb_software_version"] = SOFTWARE_VERSION
+    ds.attrs["disdrodb_product"] = product
+    return ds
diff --git a/disdrodb/utils/cli.py b/disdrodb/utils/cli.py
new file mode 100644
index 00000000..bbe62715
--- /dev/null
+++ b/disdrodb/utils/cli.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""DISDRODB command-line interface scripts utilities."""
+
+import click
+
+
+def _execute_cmd(cmd, raise_error=False):
+    """Execute command in the terminal, streaming output to the Python console."""
+    from subprocess import PIPE, CalledProcessError, Popen
+
+    with Popen(cmd, shell=True, stdout=PIPE, bufsize=1, universal_newlines=True) as p:
+        for line in p.stdout:
+            print(line, end="")
+
+    # Raise error if command didn't run successfully
+    if p.returncode != 0 and raise_error:
+        raise CalledProcessError(p.returncode, p.args)
+
+
+def _parse_empty_string_and_none(args):
+    """Utility to parse argument passed from the command line.
+
+    If ``args = ''``, returns None.
+    If ``args = 'None'`` returns None.
+    Otherwise return ``args``.
+    """
+    # If the empty string, return None
+    args = None if args == "" else args
+    # If the 'None' string, return None
+    if isinstance(args, str) and args == "None":
+        args = None
+    return args
+
+
+def parse_arg_to_list(args):
+    """Utility to parse a list argument passed from the command line.
+
+    If ``args = ''`` returns ``None``.
+    If ``args = 'None'`` returns ``None``.
+    If ``args = 'variable'`` returns ``[variable]``.
+    If ``args = 'variable1 variable2'`` returns ``[variable1, variable2]``.
+ """ + # If '' or 'None' --> Set to None + args = _parse_empty_string_and_none(args) + # - If multiple arguments, split by space + if isinstance(args, str): + # - Split by space + list_args = args.split(" ") + # - Remove '' (deal with multi space) + args = [args for args in list_args if len(args) > 0] + return args + + +def parse_base_dir(base_dir): + """Utility to parse base_dir provided by command line. + + If ``base_dir = 'None'`` returns ``None``. + If ``base_dir = ''`` returns ``None``. + """ + # If '', set to 'None' + return _parse_empty_string_and_none(base_dir) + + +def click_station_arguments(function: object): + """Click command line arguments for DISDRODB station processing. + + Parameters + ---------- + function : object + Function. + """ + function = click.argument("station_name", metavar="")(function) + function = click.argument("campaign_name", metavar="")(function) + function = click.argument("data_source", metavar="")(function) + return function + + +def click_base_dir_option(function: object): + """Click command line argument for DISDRODB ``base_dir``. + + Parameters + ---------- + function : object + Function. + """ + function = click.option( + "--base_dir", + type=str, + show_default=True, + default=None, + help="DISDRODB base directory", + )(function) + return function + + +def click_stations_options(function: object): + """Click command line options for DISDRODB archive L0 processing. + + Parameters + ---------- + function : object + Function. + """ + function = click.option( + "--data_sources", + type=str, + show_default=True, + default="", + help="DISDRODB data sources to process", + )(function) + function = click.option( + "--campaign_names", + type=str, + show_default=True, + default="", + help="DISDRODB campaign names to process", + )(function) + function = click.option( + "--station_names", + type=str, + show_default=True, + default="", + help="DISDRODB station names to process", + )(function) + return function + + +def click_processing_options(function: object): + """Click command line default parameters for L0 processing options. + + Parameters + ---------- + function : object + Function. + """ + function = click.option( + "-p", + "--parallel", + type=bool, + show_default=True, + default=False, + help="Process files in parallel", + )(function) + function = click.option( + "-d", + "--debugging_mode", + type=bool, + show_default=True, + default=False, + help="Switch to debugging mode", + )(function) + function = click.option("-v", "--verbose", type=bool, show_default=True, default=True, help="Verbose")(function) + function = click.option( + "-f", + "--force", + type=bool, + show_default=True, + default=False, + help="Force overwriting", + )(function) + return function + + +def click_remove_l0a_option(function: object): + """Click command line argument for ``remove_l0a``.""" + function = click.option( + "--remove_l0a", + type=bool, + show_default=True, + default=False, + help="If true, remove the L0A files once the L0B processing is terminated.", + )(function) + return function + + +def click_remove_l0b_option(function: object): + """Click command line argument for ``remove_l0b``.""" + function = click.option( + "--remove_l0b", + type=bool, + show_default=True, + default=False, + help="If true, remove the L0B files once the L0C processing is terminated.", + )(function) + return function + + +def click_l0_archive_options(function: object): + """Click command line arguments for L0 processing archiving of a station. 
+ + Parameters + ---------- + function : object + Function. + """ + function = click.option( + "--remove_l0b", + type=bool, + show_default=True, + default=False, + help="If true, remove all source L0B files once L0B concatenation is terminated.", + )(function) + function = click.option( + "--remove_l0a", + type=bool, + show_default=True, + default=False, + help="If true, remove the L0A files once the L0B processing is terminated.", + )(function) + function = click.option( + "-l0c", + "--l0c_processing", + type=bool, + show_default=True, + default=True, + help="Perform L0C processing.", + )(function) + function = click.option( + "-l0b", + "--l0b_processing", + type=bool, + show_default=True, + default=True, + help="Perform L0B processing.", + )(function) + function = click.option( + "-l0a", + "--l0a_processing", + type=bool, + show_default=True, + default=True, + help="Perform L0A processing.", + )(function) + return function diff --git a/disdrodb/utils/dask.py b/disdrodb/utils/dask.py new file mode 100644 index 00000000..ee3c5aae --- /dev/null +++ b/disdrodb/utils/dask.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Utilities for Dask Distributed computations.""" +import logging +import os + + +def initialize_dask_cluster(): + """Initialize Dask Cluster.""" + import dask + from dask.distributed import Client, LocalCluster + + # Set HDF5_USE_FILE_LOCKING to avoid going stuck with HDF + os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE" + # Retrieve the number of process to run + available_workers = os.cpu_count() - 2 # if not set, all CPUs + num_workers = dask.config.get("num_workers", available_workers) + # Silence dask warnings + dask.config.set({"logging.distributed": "error"}) + # dask.config.set({"distributed.admin.system-monitor.gil.enabled": False}) + # Create dask.distributed local cluster + cluster = LocalCluster( + n_workers=num_workers, + threads_per_worker=1, + processes=True, + # memory_limit='8GB', + # silence_logs=False, + ) + client = Client(cluster) + return cluster, client + + +def close_dask_cluster(cluster, client): + """Close Dask Cluster.""" + logger = logging.getLogger() + # Backup current log level + original_level = logger.level + logger.setLevel(logging.CRITICAL + 1) # Set level to suppress all logs + # Close cluster + # - Avoid log 'distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.' 
+    try:
+        cluster.close()
+        client.close()
+    finally:
+        # Restore the original log level
+        logger.setLevel(original_level)
diff --git a/disdrodb/utils/decorator.py b/disdrodb/utils/decorator.py
new file mode 100644
index 00000000..64bd76e1
--- /dev/null
+++ b/disdrodb/utils/decorator.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------.
+# Copyright (c) 2021-2023 DISDRODB developers
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+# -----------------------------------------------------------------------------.
+"""DISDRODB decorators."""
+import functools
+
+import dask
+
+
+def delayed_if_parallel(function):
+    """Decorator to make the function delayed if its ``parallel`` argument is ``True``."""
+
+    @functools.wraps(function)
+    def wrapper(*args, **kwargs):
+        # Check if it must be a delayed function
+        parallel = kwargs.get("parallel")
+        # If parallel is True
+        if parallel:
+            # Enforce verbose to be False
+            kwargs["verbose"] = False
+            # Define the delayed task
+            result = dask.delayed(function)(*args, **kwargs)
+        else:
+            # Else run the function
+            result = function(*args, **kwargs)
+        return result
+
+    return wrapper
+
+
+def single_threaded_if_parallel(function):
+    """Decorator to make a function run on a single thread if its ``parallel`` argument is ``True``."""
+
+    @functools.wraps(function)
+    def wrapper(*args, **kwargs):
+        # Check if it must be a delayed function
+        parallel = kwargs.get("parallel")
+        # If parallel is True
+        if parallel:
+            # Call function with single thread
+            # with dask.config.set(scheduler='single-threaded'):
+            with dask.config.set(scheduler="synchronous"):
+                result = function(*args, **kwargs)
+        else:
+            # Else run the function as usual
+            result = function(*args, **kwargs)
+        return result
+
+    return wrapper
diff --git a/disdrodb/utils/directories.py b/disdrodb/utils/directories.py
index 8eba18b6..2db94043 100644
--- a/disdrodb/utils/directories.py
+++ b/disdrodb/utils/directories.py
@@ -90,21 +90,18 @@ def create_directory(path: str, exist_ok=True) -> None:
         os.makedirs(path, exist_ok=exist_ok)
         logger.debug(f"Created directory {path}.")
     except Exception as e:
+        dir_path = os.path.dirname(path)
         dir_name = os.path.basename(path)
-        msg = f"Can not create directory {dir_name} inside . Error: {e}"
+        msg = f"Can not create directory {dir_name} inside {dir_path}. Error: {e}"
         logger.exception(msg)
         raise FileNotFoundError(msg)


-def create_required_directory(dir_path, dir_name):
+def create_required_directory(dir_path, dir_name, exist_ok=True):
     """Create directory ``dir_name`` inside the ``dir_path`` directory."""
-    try:
-        new_dir = os.path.join(dir_path, dir_name)
-        os.makedirs(new_dir, exist_ok=True)
-    except Exception as e:
-        msg = f"Can not create directory {dir_name} at {new_dir}. 
Error: {e}" - logger.exception(msg) - raise FileNotFoundError(msg) + dir_path = ensure_string_path(dir_path, msg="'path' must be a string", accepth_pathlib=True) + new_dir_path = os.path.join(dir_path, dir_name) + create_directory(path=new_dir_path, exist_ok=exist_ok) def is_empty_directory(path): @@ -119,9 +116,7 @@ def is_empty_directory(path): return False paths = os.listdir(path) - if len(paths) == 0: - return True - return False + return len(paths) == 0 def _remove_file_or_directories(path): diff --git a/disdrodb/utils/encoding.py b/disdrodb/utils/encoding.py new file mode 100644 index 00000000..7b052a56 --- /dev/null +++ b/disdrodb/utils/encoding.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 + +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""DISDRODB netCDF4 encoding utilities.""" +import xarray as xr + +EPOCH = "seconds since 1970-01-01 00:00:00" + + +def set_encodings(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset: + """Apply the encodings to the xarray Dataset. + + Parameters + ---------- + ds : xarray.Dataset + Input xarray dataset. + encoding_dict : dict + Dictionary with encoding specifications. + + Returns + ------- + xr.Dataset + Output xarray dataset. + """ + # Subset encoding dictionary + # - Here below encoding_dict contains only keys (variables) within the dataset + encoding_dict = {var: encoding_dict[var] for var in ds.data_vars if var in encoding_dict} + + # Ensure chunksize smaller than the array shape + encoding_dict = sanitize_encodings_dict(encoding_dict, ds) + + # Rechunk variables for fast writing ! + # - This pop the chunksize argument from the encoding dict ! + ds = rechunk_dataset(ds, encoding_dict) + + # Set time encoding + ds["time"].encoding.update(get_time_encoding()) + + # Set the variable encodings + for var, encoding in encoding_dict.items(): + ds[var].encoding.update(encoding) + + # Ensure no deprecated "missing_value" attribute + # - When source dataset is netcdf (i.e. ARM) + for var in list(ds.variables): + _ = ds[var].encoding.pop("missing_value", None) + + return ds + + +def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict: + """Ensure chunk size to be smaller than the array shape. + + Parameters + ---------- + encoding_dict : dict + Dictionary containing the variable encodings. + ds : xarray.Dataset + Input dataset. + + Returns + ------- + dict + Encoding dictionary. 
+ """ + for var in ds.data_vars: + if var in encoding_dict: + shape = ds[var].shape + chunks = encoding_dict[var].get("chunksizes", None) + if chunks is not None: + chunks = [shape[i] if chunks[i] > shape[i] else chunks[i] for i in range(len(chunks))] + encoding_dict[var]["chunksizes"] = chunks + return encoding_dict + + +def rechunk_dataset(ds: xr.Dataset, encoding_dict: dict) -> xr.Dataset: + """Coerce the dataset arrays to have the chunk size specified in the encoding dictionary. + + Parameters + ---------- + ds : xarray.Dataset + Input xarray dataset + encoding_dict : dict + Dictionary containing the encoding to write the xarray dataset as a netCDF. + + Returns + ------- + xr.Dataset + Output xarray dataset + """ + for var in ds.data_vars: + if var in encoding_dict: + chunks = encoding_dict[var].pop("chunksizes", None) + if chunks is not None: + dims = list(ds[var].dims) + chunks_dict = dict(zip(dims, chunks)) + ds[var] = ds[var].chunk(chunks_dict) + return ds + + +def get_time_encoding() -> dict: + """Create time encoding. + + Returns + ------- + dict + Time encoding. + """ + encoding = {} + encoding["units"] = EPOCH + encoding["calendar"] = "proleptic_gregorian" + return encoding diff --git a/disdrodb/utils/logger.py b/disdrodb/utils/logger.py index 2fae3d30..42f55080 100644 --- a/disdrodb/utils/logger.py +++ b/disdrodb/utils/logger.py @@ -24,10 +24,9 @@ from asyncio.log import logger -def create_file_logger(processed_dir, product, station_name, filename, parallel): - """Create file logger.""" +def create_logger_file(logs_dir, filename, parallel): + """Create logger file.""" # Create logs directory - logs_dir = os.path.join(processed_dir, "logs", product, station_name) os.makedirs(logs_dir, exist_ok=True) # Define logger filepath @@ -44,10 +43,14 @@ def create_file_logger(processed_dir, product, station_name, filename, parallel) handler.setFormatter(logging.Formatter(format_type)) logger.addHandler(handler) logger.setLevel(logging.DEBUG) - return logger + + # Define logger filepath + # - LogCaptureHandler of pytest does not have baseFilename attribute --> So set None + logger_filepath = logger.handlers[0].baseFilename if not os.environ.get("PYTEST_CURRENT_TEST") else None + return logger, logger_filepath -def close_logger(logger: logger) -> None: +def close_logger(logger) -> None: """Close the logger. 
Parameters @@ -80,7 +83,8 @@ def log_debug(logger: logger, msg: str, verbose: bool = False) -> None: """ if verbose: print(" - " + msg) - logger.debug(msg) + if logger is not None: + logger.debug(msg) def log_info(logger: logger, msg: str, verbose: bool = False) -> None: @@ -98,7 +102,8 @@ def log_info(logger: logger, msg: str, verbose: bool = False) -> None: """ if verbose: print(" - " + msg) - logger.info(msg) + if logger is not None: + logger.info(msg) def log_warning(logger: logger, msg: str, verbose: bool = False) -> None: @@ -116,7 +121,8 @@ def log_warning(logger: logger, msg: str, verbose: bool = False) -> None: """ if verbose: print(" - " + msg) - logger.warning(msg) + if logger is not None: + logger.warning(msg) def log_error(logger: logger, msg: str, verbose: bool = False) -> None: @@ -134,15 +140,12 @@ def log_error(logger: logger, msg: str, verbose: bool = False) -> None: """ if verbose: print(" - " + msg) - logger.error(msg) + if logger is not None: + logger.error(msg) -def _get_logs_dir(list_logs): - list_logs = sorted(list_logs) - station_logs_dir = os.path.dirname(list_logs[0]) - station_name = station_logs_dir.split(os.path.sep)[-1] - logs_dir = os.path.dirname(station_logs_dir) - return station_name, logs_dir +####---------------------------------------------------------------------------. +#### SUMMARY LOGS def _define_station_summary_log_file(list_logs, summary_filepath): @@ -163,16 +166,27 @@ def _define_station_summary_log_file(list_logs, summary_filepath): def _define_station_problem_log_file(list_logs, problem_filepath): # - Copy the log of files with warnings and error list_keywords = ["ERROR"] # "WARNING" + list_patterns = ["ValueError: Less than 5 timesteps available for day"] re_keyword = re.compile("|".join(list_keywords)) + # Compile patterns to ignore, escaping any special regex characters + re_patterns = re.compile("|".join(map(re.escape, list_patterns))) if list_patterns else None + # Initialize problem log file any_problem = False + n_files = len(list_logs) + n_files_with_problems = 0 with open(problem_filepath, "w") as output_file: + # Loop over log files and collect problems for log_filepath in list_logs: log_with_problem = False # Check if an error is reported with open(log_filepath) as input_file: for line in input_file: if re_keyword.search(line): + # If the line matches an ignore pattern, skip it + if re_patterns and re_patterns.search(line): + continue log_with_problem = True + n_files_with_problems += 1 any_problem = True break # If it is reported, copy the log file in the logs_problem file @@ -180,34 +194,154 @@ def _define_station_problem_log_file(list_logs, problem_filepath): with open(log_filepath) as input_file: output_file.write(input_file.read()) + # Add number of files with problems + msg = f"SUMMARY: {n_files_with_problems} of {n_files} files had problems." + output_file.write(msg) + # If no problems occurred, remove the logs_problem_.log file if not any_problem: os.remove(problem_filepath) -def define_summary_log(list_logs): - """Define a station summary and a problems log file from the list of input logs. - - The summary log select only logged lines with ``root``, ``WARNING`` and ``ERROR`` keywords. - The problems log file select only logged lines with the ``ERROR`` keyword. - The two log files are saved in the parent directory of the input ``list_logs``. 
- - The function assume that the files logs are located at: +def create_product_logs( + product, + data_source, + campaign_name, + station_name, + base_dir=None, + # Product options + sample_interval=None, + rolling=None, + model_name=None, + # Logs list + list_logs=None, # If none, list it ! +): + """Create station summary and station problems log files. + + The summary log selects only logged lines with ``root``, ``WARNING``, and ``ERROR`` keywords. + The problems log file selects only logged lines with the ``ERROR`` keyword. + + The logs directory structure is the follow: + /logs + - /files// (same structure as data ... a log for each processed file) + - /summary + --> SUMMARY....log + - /problems + --> PROBLEMS....log - ``/DISDRODB/Processed///logs///.log`` + Parameters + ---------- + product : str + The DISDRODB product. + data_source : str + The data source name. + campaign_name : str + The campaign name. + station_name : str + The station name. + base_dir : str, optional + The base directory path. Default is None. + sample_interval : str, optional + The sample interval for L2E option. Default is None. + rolling : str, optional + The rolling option for L2E. Default is None. + model_name : str, optional + The model name for L2M. Default is None. + list_logs : list, optional + List of log file paths. If None, the function will list the log files. + + Returns + ------- + None """ + from disdrodb.api.path import define_campaign_dir, define_filename, define_logs_dir + from disdrodb.utils.directories import list_files + + # --------------------------------------------------------. + # Search for logs file + if list_logs is None: + # Define product logs directory within /files/.... + logs_dir = define_logs_dir( + product=product, + base_dir=base_dir, + data_source=data_source, + campaign_name=campaign_name, + station_name=station_name, + # Option for L2E + sample_interval=sample_interval, + rolling=rolling, + # Option for L2M + model_name=model_name, + ) + list_logs = list_files(logs_dir, glob_pattern="*", recursive=True) + + # --------------------------------------------------------. # LogCaptureHandler of pytest does not have baseFilename attribute, so it returns None if list_logs[0] is None: return - station_name, logs_dir = _get_logs_dir(list_logs) - + # --------------------------------------------------------. + # Define /summary and /problem directory + campaign_dir = define_campaign_dir( + base_dir=base_dir, + product=product, + data_source=data_source, + campaign_name=campaign_name, + ) + logs_summary_dir = os.path.join(campaign_dir, "logs", "summary") + logs_problem_dir = os.path.join(campaign_dir, "logs", "problems") + + os.makedirs(logs_summary_dir, exist_ok=True) + os.makedirs(logs_problem_dir, exist_ok=True) + + # --------------------------------------------------------. 
# Define station summary log file name - summary_filepath = os.path.join(logs_dir, f"logs_summary_{station_name}.log") + summary_filename = define_filename( + product=product, + campaign_name=campaign_name, + station_name=station_name, + # L2E option + sample_interval=sample_interval, + rolling=rolling, + # L2M option + model_name=model_name, + # Filename options + add_version=False, + add_time_period=False, + add_extension=False, + prefix="SUMMARY", + suffix="log", + ) + summary_filepath = os.path.join(logs_summary_dir, summary_filename) + # Define station problem logs file name - problem_filepath = os.path.join(logs_dir, f"logs_problem_{station_name}.log") - # Create station summary log file + problem_filename = define_filename( + product=product, + campaign_name=campaign_name, + station_name=station_name, + # L2E option + sample_interval=sample_interval, + rolling=rolling, + # L2M option + model_name=model_name, + # Filename options + add_version=False, + add_time_period=False, + add_extension=False, + prefix="PROBLEMS", + suffix="log", + ) + problem_filepath = os.path.join(logs_problem_dir, problem_filename) + + # --------------------------------------------------------. + # Create summary log file _define_station_summary_log_file(list_logs, summary_filepath) - # Create station ptoblems log file (if no problems, no file) + + # Create problem log file (if no problems, no file created) _define_station_problem_log_file(list_logs, problem_filepath) + + # --------------------------------------------------------. + # Remove /problem directory if empty ! + if len(os.listdir(logs_problem_dir)) == 0: + os.rmdir(logs_problem_dir) diff --git a/disdrodb/utils/scripts.py b/disdrodb/utils/scripts.py deleted file mode 100644 index 86d35924..00000000 --- a/disdrodb/utils/scripts.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python3 - -# -----------------------------------------------------------------------------. -# Copyright (c) 2021-2023 DISDRODB developers -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# -----------------------------------------------------------------------------. -"""DISDRODB scripts utility.""" - -import click - - -def _execute_cmd(cmd, raise_error=False): - """Execute command in the terminal, streaming output in python console.""" - from subprocess import PIPE, CalledProcessError, Popen - - with Popen(cmd, shell=True, stdout=PIPE, bufsize=1, universal_newlines=True) as p: - for line in p.stdout: - print(line, end="") - - # Raise error if command didn't run successfully - if p.returncode != 0 and raise_error: - raise CalledProcessError(p.returncode, p.args) - - -def _parse_empty_string_and_none(args): - """Utility to parse argument passed from the command line. - - If ``args = ''``, returns None. - If ``args = 'None'`` returns None. - Otherwise return ``args``. 
- """ - # If '', set to 'None' - args = None if args == "" else args - # - If multiple arguments, split by space - if isinstance(args, str) and args == "None": - args = None - return args - - -def parse_arg_to_list(args): - """Utility to pass list to command line scripts. - - If ``args = ''`` returns ``None``. - If ``args = 'None'`` returns ``None``. - If ``args = 'variable'`` returns ``[variable]``. - If ``args = 'variable1 variable2'`` returns ``[variable1, variable2]``. - """ - # If '' or 'None' --> Set to None - args = _parse_empty_string_and_none(args) - # - If multiple arguments, split by space - if isinstance(args, str): - # - Split by space - list_args = args.split(" ") - # - Remove '' (deal with multi space) - args = [args for args in list_args if len(args) > 0] - return args - - -def parse_base_dir(base_dir): - """Utility to parse base_dir provided by command line. - - If ``base_dir = 'None'`` returns ``None``. - If ``base_dir = ''`` returns ``None``. - """ - # If '', set to 'None' - return _parse_empty_string_and_none(base_dir) - - -def click_station_arguments(function: object): - """Click command line arguments for DISDRODB station processing. - - Parameters - ---------- - function : object - Function. - """ - function = click.argument("station_name", metavar="")(function) - function = click.argument("campaign_name", metavar="")(function) - function = click.argument("data_source", metavar="")(function) - return function - - -def click_base_dir_option(function: object): - """Click command line argument for DISDRODB ``base_dir``. - - Parameters - ---------- - function : object - Function. - """ - function = click.option( - "--base_dir", - type=str, - show_default=True, - default=None, - help="DISDRODB base directory", - )(function) - return function diff --git a/disdrodb/utils/time.py b/disdrodb/utils/time.py new file mode 100644 index 00000000..2da1aa1b --- /dev/null +++ b/disdrodb/utils/time.py @@ -0,0 +1,545 @@ +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""This module contains utilities related to the processing of temporal dataset.""" +import logging +import re +from typing import Optional + +import numpy as np +import pandas as pd +import xarray as xr +from xarray.core import dtypes + +from disdrodb.utils.logger import log_info, log_warning + +logger = logging.getLogger(__name__) + +####------------------------------------------------------------------------------------. +#### Sampling Interval Acronyms + + +def seconds_to_acronym(seconds): + """ + Convert a duration in seconds to a readable string format (e.g., "1H30", "1D2H"). + + Parameters + ---------- + - seconds (int): The time duration in seconds. 
+ + Returns + ------- + - str: The duration as a string in a format like "30S", "1MIN30S", "1H30MIN", or "1D2H". + """ + timedelta = pd.Timedelta(seconds=seconds) + components = timedelta.components + + parts = [] + if components.days > 0: + parts.append(f"{components.days}D") + if components.hours > 0: + parts.append(f"{components.hours}H") + if components.minutes > 0: + parts.append(f"{components.minutes}MIN") + if components.seconds > 0: + parts.append(f"{components.seconds}S") + acronym = "".join(parts) + return acronym + + +def get_resampling_information(sample_interval_acronym): + """ + Extract resampling information from the sample interval acronym. + + Parameters + ---------- + sample_interval_acronym: str + A string representing the sample interval: e.g., "1H30MIN", "ROLL1H30MIN". + + Returns + ------- + sample_interval_seconds, rolling: tuple + Sample_interval in seconds and whether rolling is enabled. + """ + rolling = sample_interval_acronym.startswith("ROLL") + if rolling: + sample_interval_acronym = sample_interval_acronym[4:] # Remove "ROLL" + + # Allowed pattern: one or more occurrences of "" + # where unit is exactly one of D, H, MIN, or S. + # Examples: 1H, 30MIN, 2D, 45S, and any concatenation like 1H30MIN. + pattern = r"^(\d+(?:D|H|MIN|S))+$" + + # Check if the entire string matches the pattern + if not re.match(pattern, sample_interval_acronym): + raise ValueError( + f"Invalid sample interval acronym '{sample_interval_acronym}'. " + "Must be composed of one or more groups, where unit is D, H, MIN, or S.", + ) + + # Regular expression to match duration components and extract all (value, unit) pairs + pattern = r"(\d+)(D|H|MIN|S)" + matches = re.findall(pattern, sample_interval_acronym) + + # Conversion factors for each unit + unit_to_seconds = { + "D": 86400, # Seconds in a day + "H": 3600, # Seconds in an hour + "MIN": 60, # Seconds in a minute + "S": 1, # Seconds in a second + } + + # Parse matches and calculate total seconds + sample_interval = 0 + for value, unit in matches: + value = int(value) + if unit in unit_to_seconds: + sample_interval += value * unit_to_seconds[unit] + return sample_interval, rolling + + +def acronym_to_seconds(acronym): + """ + Extract the interval in seconds from the duration acronym. + + Parameters + ---------- + acronym: str + A string representing a duration: e.g., "1H30MIN", "ROLL1H30MIN". + + Returns + ------- + seconds + Duration in seconds. + """ + seconds, _ = get_resampling_information(acronym) + return seconds + + +####------------------------------------------------------------------------------------. +#### Xarray utilities + + +def get_dataset_start_end_time(ds: xr.Dataset, time_dim="time"): + """Retrieves dataset starting and ending time. + + Parameters + ---------- + ds : xarray.Dataset + Input dataset + time_dim: str + Name of the time dimension. + The default is "time". 
+
+    Returns
+    -------
+    tuple
+        (``starting_time``, ``ending_time``)
+
+    """
+    starting_time = ds[time_dim].to_numpy()[0]
+    ending_time = ds[time_dim].to_numpy()[-1]
+    return (starting_time, ending_time)
+
+
+def _define_fill_value(ds, fill_value):
+    fill_value = {}
+    for var in ds.data_vars:
+        if np.issubdtype(ds[var].dtype, np.floating):
+            fill_value[var] = dtypes.NA
+        elif np.issubdtype(ds[var].dtype, np.integer):
+            if "_FillValue" in ds[var].attrs:
+                fill_value[var] = ds[var].attrs["_FillValue"]
+            else:
+                fill_value[var] = np.iinfo(ds[var].dtype).max
+    return fill_value
+
+
+def _check_time_sorted(ds, time_dim):
+    time_diff = np.diff(ds[time_dim].data.astype(int))
+    if np.any(time_diff == 0):
+        raise ValueError(f"In the {time_dim} dimension there are duplicated timesteps !")
+    if not np.all(time_diff > 0):
+        print(f"The {time_dim} dimension was not sorted. Sorting it now !")
+        ds = ds.sortby(time_dim)
+    return ds
+
+
+def regularize_dataset(
+    ds: xr.Dataset,
+    freq: str,
+    time_dim: str = "time",
+    method: Optional[str] = None,
+    fill_value=None,
+):
+    """Regularize a dataset across the time dimension with uniform resolution.
+
+    Parameters
+    ----------
+    ds : xarray.Dataset
+        xarray Dataset.
+    time_dim : str, optional
+        The time dimension in the xarray.Dataset. The default is ``"time"``.
+    freq : str
+        The ``freq`` string to pass to ``pd.date_range()`` to define the new time coordinates.
+        Example: ``freq="2min"``.
+    method : str, optional
+        Method to use for filling missing timesteps.
+        If ``None``, fill with ``fill_value``. The default is ``None``.
+        For other possible methods, see ``xarray.Dataset.reindex()``.
+    fill_value : (float, dict), optional
+        Fill value to fill missing timesteps.
+        If not specified, it uses ``dtypes.NA`` for float variables, while for
+        integer variables it uses the maximum allowed integer value or,
+        in case of undecoded variables, the ``_FillValue`` DataArray attribute.
+
+    Returns
+    -------
+    ds_reindexed : xarray.Dataset
+        Regularized dataset.
+
+    """
+    ds = _check_time_sorted(ds, time_dim=time_dim)
+    start_time, end_time = get_dataset_start_end_time(ds, time_dim=time_dim)
+    new_time_index = pd.date_range(
+        start=pd.to_datetime(start_time),
+        end=pd.to_datetime(end_time),
+        freq=freq,
+    )
+
+    # Define fill_value dictionary
+    if fill_value is None:
+        fill_value = _define_fill_value(ds, fill_value)
+
+    # Regularize dataset and fill with NA values
+    ds = ds.reindex(
+        {time_dim: new_time_index},
+        method=method,  # do not fill gaps
+        # tolerance=tolerance,  # mismatch in seconds
+        fill_value=fill_value,
+    )
+    return ds
+
+
+def ensure_sorted_by_time(ds):
+    """Ensure a dataset is sorted by time."""
+    # Check sorted by time and sort if necessary
+    is_sorted = np.all(ds["time"].data[:-1] <= ds["time"].data[1:])
+    if not is_sorted:
+        ds = ds.sortby("time")
+    return ds
+
+
+####------------------------------------------
+#### Sampling interval utilities
+
+
+def ensure_sample_interval_in_seconds(sample_interval):
+    """
+    Ensure the sample interval is in seconds.
+
+    Parameters
+    ----------
+    sample_interval : int, numpy.ndarray, xarray.DataArray, or numpy.timedelta64
+        The sample interval to be converted to seconds.
+        It can be:
+        - An integer representing the interval in seconds.
+        - A numpy array or xarray DataArray of integers representing intervals in seconds.
+        - A numpy.timedelta64 object representing the interval.
+        - A numpy array or xarray DataArray of numpy.timedelta64 objects representing intervals.
+
+    Returns
+    -------
+    int, numpy.ndarray, or xarray.DataArray
+        The sample interval converted to seconds. The return type matches the input type:
+        - If the input is an integer, the output is an integer.
+        - If the input is a numpy array, the output is a numpy array of integers.
+        - If the input is an xarray DataArray, the output is an xarray DataArray of integers.
+
+    """
+    if (
+        isinstance(sample_interval, int)
+        or isinstance(sample_interval, (np.ndarray, xr.DataArray))
+        and np.issubdtype(sample_interval.dtype, int)
+    ):
+        return sample_interval
+    if isinstance(sample_interval, np.timedelta64):
+        return sample_interval / np.timedelta64(1, "s")
+    if isinstance(sample_interval, np.ndarray) and np.issubdtype(sample_interval.dtype, np.timedelta64):
+        return sample_interval.astype("timedelta64[s]").astype(int)
+    if isinstance(sample_interval, xr.DataArray) and np.issubdtype(sample_interval.dtype, np.timedelta64):
+        sample_interval = sample_interval.copy()
+        sample_interval_int = sample_interval.data.astype("timedelta64[s]").astype(int)
+        sample_interval.data = sample_interval_int
+        return sample_interval
+    raise TypeError(
+        "sample_interval must be an int, numpy.timedelta64, or numpy array of timedelta64.",
+    )
+
+
+def infer_sample_interval(ds, robust=False, verbose=False, logger=None):
+    """Infer the sample interval of a dataset.
+
+    NOTE: This function is not used in the DISDRODB processing chain.
+    """
+    # Check sorted by time and sort if necessary
+    ds = ensure_sorted_by_time(ds)
+
+    # Calculate number of timesteps
+    n_timesteps = len(ds["time"].data)
+
+    # Calculate time differences in seconds
+    deltadt = np.diff(ds["time"].data).astype("timedelta64[s]").astype(int)
+
+    # Round each delta to the nearest multiple of 5 (because the smallest possible sample interval is 10 s)
+    # Example: for sample_interval = 10, deltat values like 8, 9, 11, 12 become 10 ...
+    # Example: for sample_interval = 10, deltat values like 6, 7 or 13, 14 become respectively 5 and 15 ...
+    # Example: for sample_interval = 30, deltat values like 28, 29, 30, 31, 32 become 30 ...
+    # Example: for sample_interval = 30, deltat values like 26, 27 or 33, 34 become respectively 25 and 35 ...
+    # --> A second rounding, once the most frequent sample interval is identified, coerces such values to 30
+    min_sample_interval = 10
+    min_half_sample_interval = min_sample_interval / 2
+    deltadt = np.round(deltadt / min_half_sample_interval) * min_half_sample_interval
+
+    # Identify unique time intervals and their occurrences
+    unique_deltas, counts = np.unique(deltadt, return_counts=True)
+
+    # Determine the most frequent time interval (mode)
+    most_frequent_delta_idx = np.argmax(counts)
+    sample_interval = unique_deltas[most_frequent_delta_idx]
+
+    # Re-round deltadt once the sample interval is known
+    # - If the sample interval is 10: all values between 6 and 14 are rounded to 10, below 6 to 0, above 14 to 20
+    # - If the sample interval is 30: all values between 16 and 44 are rounded to 30, below 16 to 0, above 44 to 60
+    deltadt = np.round(deltadt / sample_interval) * sample_interval
+
+    # Identify unique time intervals and their occurrences
+    unique_deltas, counts = np.unique(deltadt, return_counts=True)
+    fractions = np.round(counts / len(deltadt) * 100, 2)
+
+    # Identify the minimum delta (except 0)
+    min_delta = unique_deltas[unique_deltas != 0].min()
+
+    # Determine the most frequent time interval (mode)
+    most_frequent_delta_idx = np.argmax(counts)
+    sample_interval = unique_deltas[most_frequent_delta_idx]
+    sample_interval_fraction = fractions[most_frequent_delta_idx]
+
+    # Inform about irregular sampling
+    unexpected_intervals = unique_deltas[unique_deltas != sample_interval]
+    unexpected_intervals_counts = counts[unique_deltas != sample_interval]
+    unexpected_intervals_fractions = fractions[unique_deltas != sample_interval]
+    if verbose and len(unexpected_intervals) > 0:
+        msg = "Irregular timesteps detected."
+        log_info(logger=logger, msg=msg, verbose=verbose)
+        for interval, count, fraction in zip(
+            unexpected_intervals,
+            unexpected_intervals_counts,
+            unexpected_intervals_fractions,
+        ):
+            msg = f"  Interval: {interval} seconds, Occurrence: {count}, Frequency: {fraction} %"
+            log_info(logger=logger, msg=msg, verbose=verbose)
+
+    # Perform checks
+    # - Raise an error if negative or zero time intervals are present
+    # - If robust = False, still return the estimated sample_interval
+    if robust and np.any(deltadt == 0):
+        raise ValueError("Likely presence of duplicated timesteps.")
+
+    ####-------------------------------------------------------------------------.
+    #### Informative messages
+    # - Log a warning if the estimated sample interval has a frequency lower than 60 %
+    sample_interval_fraction_threshold = 60
+    msg = (
+        f"The most frequent sampling interval ({sample_interval} s) "
+        + f"has a frequency lower than {sample_interval_fraction_threshold}%: {sample_interval_fraction} %. "
+        + f"Total number of timesteps: {n_timesteps}."
+    )
+    if sample_interval_fraction < sample_interval_fraction_threshold:
+        log_warning(logger=logger, msg=msg, verbose=verbose)
+
+    # - Log a warning if an unexpected interval has a frequency larger than 20 percent
+    frequent_unexpected_intervals = unexpected_intervals[unexpected_intervals_fractions > 20]
+    if len(frequent_unexpected_intervals) != 0:
+        frequent_unexpected_intervals_str = ", ".join(
+            f"{interval} seconds" for interval in frequent_unexpected_intervals
+        )
+        msg = (
+            "The following unexpected intervals have a frequency "
+            + f"greater than 20%: {frequent_unexpected_intervals_str}. "
+            + f"Total number of timesteps: {n_timesteps}."
+ ) + log_warning(logger=logger, msg=msg, verbose=verbose) + + # - Raise error if the most frequent interval is not the expected one ! + if sample_interval != min_delta: + raise ValueError( + f"The most frequent sampling interval ({sample_interval} seconds) " + f"is not the smallest interval ({min_delta} seconds). " + "Inconsistent sampling intervals in the dataset !", + ) + + return int(sample_interval) + + +####--------------------------------------------------------------------------------- +#### Timesteps regularization + + +def get_problematic_timestep_indices(timesteps, sample_interval): + """Identify timesteps with missing previous or following timesteps.""" + previous_time = timesteps - pd.Timedelta(seconds=sample_interval) + next_time = timesteps + pd.Timedelta(seconds=sample_interval) + idx_previous_missing = np.where(~np.isin(previous_time, timesteps))[0][1:] + idx_next_missing = np.where(~np.isin(next_time, timesteps))[0][:-1] + idx_isolated_missing = np.intersect1d(idx_previous_missing, idx_next_missing) + idx_previous_missing = idx_previous_missing[np.isin(idx_previous_missing, idx_isolated_missing, invert=True)] + idx_next_missing = idx_next_missing[np.isin(idx_next_missing, idx_isolated_missing, invert=True)] + return idx_previous_missing, idx_next_missing, idx_isolated_missing + + +def regularize_timesteps(ds, sample_interval, robust=False, add_quality_flag=True, logger=None, verbose=True): + """Ensure timesteps match with the sample_interval.""" + # Check sorted by time and sort if necessary + ds = ensure_sorted_by_time(ds) + + # Convert time to pandas.DatetimeIndex for easier manipulation + times = pd.to_datetime(ds["time"].values) + + # Determine the start and end times + start_time = times[0].floor(f"{sample_interval}s") + end_time = times[-1].ceil(f"{sample_interval}s") + + # Create the expected time grid + expected_times = pd.date_range(start=start_time, end=end_time, freq=f"{sample_interval}s") + + # Convert to numpy arrays + times = times.to_numpy(dtype="M8[s]") + expected_times = expected_times.to_numpy(dtype="M8[s]") + + # Map original times to the nearest expected times + # Calculate the difference between original times and expected times + time_deltas = np.abs(times - expected_times[:, None]).astype(int) + + # Find the index of the closest expected time for each original time + nearest_indices = np.argmin(time_deltas, axis=0) + adjusted_times = expected_times[nearest_indices] + + # Check for duplicates in adjusted times + unique_times, counts = np.unique(adjusted_times, return_counts=True) + duplicates = unique_times[counts > 1] + + # Initialize time quality flag + # - 0 when ok or just rounded to closest 00 + # - 1 if previous timestep is missing + # - 2 if next timestep is missing + # - 3 if previous and next timestep is missing + # - 4 if solved duplicated timesteps + # - 5 if needed to drop duplicated timesteps and select the last + flag_previous_missing = 1 + flag_next_missing = 2 + flag_isolated_timestep = 3 + flag_solved_duplicated_timestep = 4 + flag_dropped_duplicated_timestep = 5 + qc_flag = np.zeros(adjusted_times.shape) + + # Initialize list with the duplicated timesteps index to drop + # - We drop the first occurrence because is likely the shortest interval + idx_to_drop = [] + + # Attempt to resolve for duplicates + if duplicates.size > 0: + # Handle duplicates + for dup_time in duplicates: + # Indices of duplicates + dup_indices = np.where(adjusted_times == dup_time)[0] + n_duplicates = len(dup_indices) + # Define previous and following timestep 
+ prev_time = dup_time - pd.Timedelta(seconds=sample_interval) + next_time = dup_time + pd.Timedelta(seconds=sample_interval) + # Try to find missing slots before and after + # - If more than 3 duplicates, impossible to solve ! + count_solved = 0 + # If the previous timestep is available, set that one + if n_duplicates == 2: + if prev_time not in adjusted_times: + adjusted_times[dup_indices[0]] = prev_time + qc_flag[dup_indices[0]] = flag_solved_duplicated_timestep + count_solved += 1 + elif next_time not in adjusted_times: + adjusted_times[dup_indices[-1]] = next_time + qc_flag[dup_indices[-1]] = flag_solved_duplicated_timestep + count_solved += 1 + else: + pass + elif n_duplicates == 3: + if prev_time not in adjusted_times: + adjusted_times[dup_indices[0]] = prev_time + qc_flag[dup_indices[0]] = flag_dropped_duplicated_timestep + count_solved += 1 + if next_time not in adjusted_times: + adjusted_times[dup_indices[-1]] = next_time + qc_flag[dup_indices[-1]] = flag_solved_duplicated_timestep + count_solved += 1 + if count_solved != n_duplicates - 1: + idx_to_drop = np.append(idx_to_drop, dup_indices[0:-1]) + qc_flag[dup_indices[-1]] = flag_dropped_duplicated_timestep + msg = ( + f"Cannot resolve {n_duplicates} duplicated timesteps" + f"(after trailing seconds correction) around {dup_time}." + ) + log_warning(logger=logger, msg=msg, verbose=verbose) + if robust: + raise ValueError(msg) + + # Update the time coordinate (Convert to ns for xarray compatibility) + ds = ds.assign_coords({"time": adjusted_times.astype("datetime64[ns]")}) + + # Update quality flag values for next and previous timestep is missing + if add_quality_flag: + idx_previous_missing, idx_next_missing, idx_isolated_missing = get_problematic_timestep_indices( + adjusted_times, + sample_interval, + ) + qc_flag[idx_previous_missing] = np.maximum(qc_flag[idx_previous_missing], flag_previous_missing) + qc_flag[idx_next_missing] = np.maximum(qc_flag[idx_next_missing], flag_next_missing) + qc_flag[idx_isolated_missing] = np.maximum(qc_flag[idx_isolated_missing], flag_isolated_timestep) + + # If the first timestep is at 00:00 and currently flagged as previous missing (1), reset to 0 + # first_time = pd.to_datetime(adjusted_times[0]).time() + # first_expected_time = pd.Timestamp("00:00:00").time() + # if first_time == first_expected_time and qc_flag[0] == flag_previous_missing: + # qc_flag[0] = 0 + + # # If the last timestep is flagged and currently flagged as next missing (2), reset it to 0 + # last_time = pd.to_datetime(adjusted_times[-1]).time() + # last_time_expected = (pd.Timestamp("00:00:00") - pd.Timedelta(30, unit="seconds")).time() + # # Check if adding one interval would go beyond the end_time + # if last_time == last_time_expected and qc_flag[-1] == flag_next_missing: + # qc_flag[-1] = 0 + + # Assign time quality flag coordinate + ds["time_qc"] = xr.DataArray(qc_flag, dims="time") + ds = ds.set_coords("time_qc") + + # Drop duplicated timesteps + if len(idx_to_drop) > 0: + idx_to_drop = idx_to_drop.astype(int) + idx_valid_timesteps = np.arange(0, ds["time"].size) + idx_valid_timesteps = np.delete(idx_valid_timesteps, idx_to_drop) + ds = ds.isel(time=idx_valid_timesteps) + # Return dataset + return ds diff --git a/disdrodb/utils/warnings.py b/disdrodb/utils/warnings.py new file mode 100644 index 00000000..e9e1546f --- /dev/null +++ b/disdrodb/utils/warnings.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +# -----------------------------------------------------------------------------. 
+# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Warning utilities.""" +import warnings +from contextlib import contextmanager + + +@contextmanager +def suppress_warnings(): + """Context manager suppressing RuntimeWarnings and UserWarnings.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + warnings.simplefilter("ignore", UserWarning) + yield diff --git a/disdrodb/utils/writer.py b/disdrodb/utils/writer.py new file mode 100644 index 00000000..81f3e839 --- /dev/null +++ b/disdrodb/utils/writer.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2023 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""DISDRODB product writers.""" + +import os + +import xarray as xr + +from disdrodb.utils.attrs import set_disdrodb_attrs +from disdrodb.utils.directories import create_directory, remove_if_exists + + +def write_product(ds: xr.Dataset, filepath: str, product: str, force: bool = False) -> None: + """Save the xarray dataset into a NetCDF file. + + Parameters + ---------- + ds : xarray.Dataset + Input xarray dataset. + filepath : str + Output file path. + product: str + DISDRODB product name. + force : bool, optional + Whether to overwrite existing data. + If ``True``, overwrite existing data into destination directories. + If ``False``, raise an error if there are already data into destination directories. This is the default. 
+ """ + # Create station directory if does not exist + create_directory(os.path.dirname(filepath)) + + # Check if the file already exists + # - If force=True --> Remove it + # - If force=False --> Raise error + remove_if_exists(filepath, force=force) + + # Update attributes + ds = set_disdrodb_attrs(ds, product=product) + + # Write netcdf + ds.to_netcdf(filepath, engine="netcdf4") diff --git a/docs/source/l0_processing.rst b/docs/source/l0_processing.rst index f8604ad5..91df610a 100644 --- a/docs/source/l0_processing.rst +++ b/docs/source/l0_processing.rst @@ -55,7 +55,7 @@ Example : # L0 processing settings l0a_processing = True l0b_processing = True - l0b_concat = True + l0c_processing = True remove_l0a = False remove_l0b = False @@ -74,7 +74,7 @@ Example : # L0 processing settings l0a_processing=l0a_processing, l0b_processing=l0b_processing, - l0b_concat=l0b_concat, + l0c_processing=l0c_processing, remove_l0a=remove_l0a, remove_l0b=remove_l0b, # L0 processing options @@ -151,7 +151,7 @@ Example : # L0 processing settings l0a_processing = True l0b_processing = True - l0b_concat = False + l0c_processing = True remove_l0a = False remove_l0b = False # L0 processing options @@ -168,7 +168,7 @@ Example : # L0 processing settings l0a_processing=l0a_processing, l0b_processing=l0b_processing, - l0b_concat=l0b_concat, + l0c_processing=l0c_processing, remove_l0a=remove_l0a, remove_l0b=remove_l0b, # L0 processing options diff --git a/docs/source/metadata_csv/Sensor_Info.csv b/docs/source/metadata_csv/Sensor_Info.csv index 5a08ba5f..bf59ae17 100644 --- a/docs/source/metadata_csv/Sensor_Info.csv +++ b/docs/source/metadata_csv/Sensor_Info.csv @@ -9,7 +9,7 @@ firmware_version,Firmware version sensor_beam_length,Length of the laser beam's measurement area in mm sensor_beam_width,Width of the laser beam's measurement area in mm sensor_nominal_width,Expected width of the sensor beam under typical operating conditions -measurement_interval,Number of seconds over which measurements are taken +measurement_interval,Number of seconds over which measurements are taken. 
calibration_sensitivity,Sensor sensitivity calibration_certification_date,Sensor calibration date(s) calibration_certification_url,Sensor calibration certification url diff --git a/docs/source/software_structure.rst b/docs/source/software_structure.rst index 0a90a87d..c3afd058 100644 --- a/docs/source/software_structure.rst +++ b/docs/source/software_structure.rst @@ -15,7 +15,6 @@ The current software structure is described below: | ├── 📜 io.py | ├── 📜 path.py | ├── 📁 metadata -| ├── 📁 scripts | ├── 📜 disdrodb_check_metadata_archive.py | ├── 📜 checks.py | ├── 📜 info.py @@ -53,8 +52,6 @@ The current software structure is described below: | ├── 📜 disdrodb_run_l0a_station.py | ├── 📜 disdrodb_run_l0b.py | ├── 📜 disdrodb_run_l0b_station.py -| ├── 📜 disdrodb_run_l0b_concat.py -| ├── 📜 disdrodb_run_l0b_concat_station.py | ├── 📜 check_configs.py | ├── 📜 check_standards.py | ├── 📜 io.py diff --git a/docs/source/tutorials/.gitkeep b/docs/source/tutorials/.gitkeep index 139597f9..e69de29b 100644 --- a/docs/source/tutorials/.gitkeep +++ b/docs/source/tutorials/.gitkeep @@ -1,2 +0,0 @@ - - diff --git a/pyproject.toml b/pyproject.toml index 8678144a..02a6604d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,26 +70,36 @@ include = ["disdrodb*"] [project.scripts] # Initialization -disdrodb_initialize_station="disdrodb.api.scripts.disdrodb_initialize_station:disdrodb_initialize_station" +disdrodb_initialize_station="disdrodb.cli.disdrodb_initialize_station:disdrodb_initialize_station" # Metadata archive -disdrodb_check_metadata_archive="disdrodb.metadata.scripts.disdrodb_check_metadata_archive:disdrodb_check_metadata_archive" +disdrodb_check_metadata_archive="disdrodb.cli.disdrodb_check_metadata_archive:disdrodb_check_metadata_archive" # Data transfer -disdrodb_download_archive="disdrodb.data_transfer.scripts.disdrodb_download_archive:disdrodb_download_archive" -disdrodb_download_station="disdrodb.data_transfer.scripts.disdrodb_download_station:disdrodb_download_station" -disdrodb_upload_archive="disdrodb.data_transfer.scripts.disdrodb_upload_archive:disdrodb_upload_archive" -disdrodb_upload_station="disdrodb.data_transfer.scripts.disdrodb_upload_station:disdrodb_upload_station" +disdrodb_download_archive="disdrodb.cli.disdrodb_download_archive:disdrodb_download_archive" +disdrodb_download_station="disdrodb.cli.disdrodb_download_station:disdrodb_download_station" +disdrodb_upload_archive="disdrodb.cli.disdrodb_upload_archive:disdrodb_upload_archive" +disdrodb_upload_station="disdrodb.cli.disdrodb_upload_station:disdrodb_upload_station" # L0A -disdrodb_run_l0a_station="disdrodb.l0.scripts.disdrodb_run_l0a_station:disdrodb_run_l0a_station" -disdrodb_run_l0a="disdrodb.l0.scripts.disdrodb_run_l0a:disdrodb_run_l0a" +disdrodb_run_l0a_station="disdrodb.cli.disdrodb_run_l0a_station:disdrodb_run_l0a_station" +disdrodb_run_l0a="disdrodb.cli.disdrodb_run_l0a:disdrodb_run_l0a" # L0B -disdrodb_run_l0b_station="disdrodb.l0.scripts.disdrodb_run_l0b_station:disdrodb_run_l0b_station" -disdrodb_run_l0_station="disdrodb.l0.scripts.disdrodb_run_l0_station:disdrodb_run_l0_station" -# L0B concatenation -disdrodb_run_l0b_concat_station="disdrodb.l0.scripts.disdrodb_run_l0b_concat_station:disdrodb_run_l0b_concat_station" -disdrodb_run_l0b_concat="disdrodb.l0.scripts.disdrodb_run_l0b_concat:disdrodb_run_l0b_concat" +disdrodb_run_l0b_station="disdrodb.cli.disdrodb_run_l0b_station:disdrodb_run_l0b_station" +disdrodb_run_l0b="disdrodb.cli.disdrodb_run_l0b:disdrodb_run_l0b" +# L0C 
+disdrodb_run_l0c_station="disdrodb.cli.disdrodb_run_l0c_station:disdrodb_run_l0c_station" +disdrodb_run_l0c="disdrodb.cli.disdrodb_run_l0c:disdrodb_run_l0c" # L0 -disdrodb_run_l0b="disdrodb.l0.scripts.disdrodb_run_l0b:disdrodb_run_l0b" -disdrodb_run_l0="disdrodb.l0.scripts.disdrodb_run_l0:disdrodb_run_l0" +disdrodb_run_l0_station="disdrodb.cli.disdrodb_run_l0_station:disdrodb_run_l0_station" +disdrodb_run_l0="disdrodb.cli.disdrodb_run_l0:disdrodb_run_l0" +# L1 +disdrodb_run_l1_station="disdrodb.cli.disdrodb_run_l1_station:disdrodb_run_l1_station" +disdrodb_run_l1="disdrodb.cli.disdrodb_run_l1_station:disdrodb_run_l1_station" +# L2E +disdrodb_run_l2e_station="disdrodb.cli.disdrodb_run_l2e_station:disdrodb_run_l2e_station" +disdrodb_run_l2e="disdrodb.cli.disdrodb_run_l2e_station:disdrodb_run_l2e_station" +# L2M +disdrodb_run_l2m_station="disdrodb.cli.disdrodb_run_l2m_station:disdrodb_run_l2m_station" +disdrodb_run_l2m="disdrodb.cli.disdrodb_run_l2m_station:disdrodb_run_l2m_station" + [tool.pytest.ini_options] diff --git a/tutorials/reader_preparation.ipynb b/tutorials/reader_preparation.ipynb index b47cf41d..15764625 100644 --- a/tutorials/reader_preparation.ipynb +++ b/tutorials/reader_preparation.ipynb @@ -123,6 +123,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "from IPython.display import display\n", "\n", "from disdrodb.api.checks import check_sensor_name\n", "\n", @@ -262,7 +263,8 @@ "source": [ "**3. Initialization**\n", "\n", - "We initiate some checks, and get some variable. *Nothing must be changed here.*" + "We initiate some checks, and get some variable. *Nothing must be changed here.*\n", + "The `data_dir` is the directory path where the processed data will be stored." ] }, { @@ -273,7 +275,7 @@ "outputs": [], "source": [ "# Create directory structure\n", - "create_l0_directory_structure(\n", + "data_dir = create_l0_directory_structure(\n", " raw_dir=raw_dir,\n", " processed_dir=processed_dir,\n", " station_name=station_name,\n", @@ -819,7 +821,7 @@ "df_raw = read_raw_file(filepath, column_names=None, reader_kwargs=reader_kwargs)\n", "# Print the dataframe\n", "print(f\"Dataframe for the file {os.path.basename(filepath)} :\")\n", - "display(df_raw) # noqa F821" + "display(df_raw)" ] }, { @@ -2432,7 +2434,7 @@ " verbose=verbose,\n", " df_sanitizer_fun=df_sanitizer_fun,\n", ")\n", - "display(df) # noqa F821" + "display(df)" ] }, { @@ -2529,7 +2531,7 @@ "metadata": {}, "outputs": [], "source": [ - "# ds = set_encodings(ds, sensor_name)\n", + "# ds = set_l0b_encodings(ds, sensor_name)\n", "# ds.to_netcdf(\"/path/where/to/save/the/file.nc\")" ] },
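
A minimal usage sketch (not part of the patch) illustrating the sampling-interval helpers added above in disdrodb/utils/time.py. The import path follows the new file layout, and the toy "precip" dataset is an assumption made only for illustration; the expected values follow from the function bodies shown in the diff.

# Hypothetical usage sketch for the new DISDRODB time utilities (assumed importable as shown).
import numpy as np
import pandas as pd
import xarray as xr

from disdrodb.utils.time import (
    ensure_sample_interval_in_seconds,
    get_resampling_information,
    regularize_dataset,
    seconds_to_acronym,
)

# Acronym <-> seconds round trip: 5400 s corresponds to "1H30MIN",
# and the "ROLL" prefix flags a rolling aggregation.
assert seconds_to_acronym(5400) == "1H30MIN"
assert get_resampling_information("ROLL1H30MIN") == (5400, True)

# Timedelta intervals are coerced to seconds.
assert ensure_sample_interval_in_seconds(np.timedelta64(30, "s")) == 30

# Fill gaps of an irregular time series onto a uniform 30 s grid.
times = pd.to_datetime(["2023-01-01 00:00:00", "2023-01-01 00:00:30", "2023-01-01 00:01:30"])
ds = xr.Dataset({"precip": ("time", [0.1, 0.2, 0.4])}, coords={"time": times})  # toy dataset
ds_regular = regularize_dataset(ds, freq="30s")
print(ds_regular["precip"].values)  # the missing 00:01:00 timestep is filled with NaN

The acronym grammar accepts any concatenation of D/H/MIN/S groups, optionally prefixed by ROLL to indicate rolling aggregation, which is what the L2E filenames rely on.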