From 0c5fc46c1eba68300d439086db5eae91b48d5e34 Mon Sep 17 00:00:00 2001 From: ghiggi Date: Wed, 1 Nov 2023 20:07:54 +0100 Subject: [PATCH] Move processing of raw netCDF in separate module --- disdrodb/l0/l0_processing.py | 5 +- .../l0/{l0b_concat.py => l0b_nc_concat.py} | 0 disdrodb/l0/l0b_nc_processing.py | 399 +++++++++++++++++ disdrodb/l0/l0b_processing.py | 415 +----------------- disdrodb/l0/readers/ARM/ARM_LD.py | 2 +- disdrodb/l0/readers/ARM/ARM_LPM.py | 2 +- disdrodb/l0/readers/MEXICO/OH_IIUNAM_nc.py | 2 +- .../l0/scripts/run_disdrodb_l0b_concat.py | 2 +- .../run_disdrodb_l0b_concat_station.py | 2 +- disdrodb/tests/test_l0/test_l0b_concat.py | 2 +- docs/source/l0_processing.rst | 1 - docs/source/software_structure.rst | 3 +- 12 files changed, 430 insertions(+), 405 deletions(-) rename disdrodb/l0/{l0b_concat.py => l0b_nc_concat.py} (100%) create mode 100644 disdrodb/l0/l0b_nc_processing.py diff --git a/disdrodb/l0/l0_processing.py b/disdrodb/l0/l0_processing.py index ff98a476..7fd29155 100644 --- a/disdrodb/l0/l0_processing.py +++ b/disdrodb/l0/l0_processing.py @@ -257,7 +257,8 @@ def _generate_l0b_from_nc( parallel, ): from disdrodb.l0.io import get_L0B_fpath - from disdrodb.l0.l0b_processing import process_raw_nc, write_l0b + from disdrodb.l0.l0b_nc_processing import process_raw_nc + from disdrodb.l0.l0b_processing import write_l0b # -----------------------------------------------------------------. # Create file logger @@ -902,7 +903,7 @@ def run_disdrodb_l0_station( The default is False. """ from disdrodb.api.io import _get_disdrodb_directory - from disdrodb.l0.l0b_concat import run_disdrodb_l0b_concat_station + from disdrodb.l0.l0b_nc_concat import run_disdrodb_l0b_concat_station # ---------------------------------------------------------------------. t_i = time.time() diff --git a/disdrodb/l0/l0b_concat.py b/disdrodb/l0/l0b_nc_concat.py similarity index 100% rename from disdrodb/l0/l0b_concat.py rename to disdrodb/l0/l0b_nc_concat.py diff --git a/disdrodb/l0/l0b_nc_processing.py b/disdrodb/l0/l0b_nc_processing.py new file mode 100644 index 00000000..779c75d1 --- /dev/null +++ b/disdrodb/l0/l0b_nc_processing.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python3 +# -----------------------------------------------------------------------------. +# Copyright (c) 2021-2022 DISDRODB developers +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# -----------------------------------------------------------------------------. +"""Functions to process DISDRODB raw netCDF files into DISDRODB L0B netCDF files.""" + +import copy +import logging + +import numpy as np +import xarray as xr + +from disdrodb.l0.l0b_processing import finalize_dataset +from disdrodb.l0.standards import ( + get_bin_coords, + get_data_range_dict, + get_nan_flags_dict, + get_valid_dimension_names, + get_valid_names, + get_valid_values_dict, + get_valid_variable_names, +) +from disdrodb.utils.logger import ( + # log_warning, + # log_debug, + log_error, + log_info, +) + +logger = logging.getLogger(__name__) +####--------------------------------------------------------------------------. +#### L0B Raw netCDFs Preprocessing + + +def _check_dict_names_validity(dict_names, sensor_name): + """Check dict_names dictionary values validity.""" + valid_names = get_valid_names(sensor_name) + keys = np.array(list(dict_names.keys())) + values = np.array(list(dict_names.values())) + # Get invalid keys + invalid_keys = keys[np.isin(values, valid_names, invert=True)] + if len(invalid_keys) > 0: + # Report invalid keys and raise error + invalid_dict = {k: dict_names[k] for k in invalid_keys} + msg = f"The following dict_names values are not valid: {invalid_dict}" + log_error(logger=logger, msg=msg, verbose=False) + raise ValueError(msg) + return None + + +def _get_dict_names_variables(dict_names, sensor_name): + """Get DISDRODB variables specified in dict_names.""" + possible_variables = get_valid_variable_names(sensor_name) + dictionary_names = list(dict_names.values()) + variables = [name for name in dictionary_names if name in possible_variables] + return variables + + +def _get_dict_names_dimensions(dict_names, sensor_name): + """Get DISDRODB dimensions specified in dict_names.""" + possible_dims = get_valid_dimension_names(sensor_name) + dictionary_names = list(dict_names.values()) + dims = [name for name in dictionary_names if name in possible_dims] + return dims + + +def _get_dict_dims(dict_names, sensor_name): + dims = _get_dict_names_dimensions(dict_names, sensor_name) + dict_dims = {k: v for k, v in dict_names.items() if v in dims} + return dict_dims + + +def rename_dataset(ds, dict_names): + """Rename Dataset variables, coordinates and dimensions.""" + # Get dataset variables, coordinates and dimensions of the dataset + ds_vars = list(ds.data_vars) + ds_dims = list(ds.dims) + ds_coords = list(ds.coords) + # Possible keys + possible_keys = ds_vars + ds_coords + ds_dims + # Get keys that are dimensions but not coordinates + rename_dim_keys = [dim for dim in ds_dims if dim not in ds_coords] + # Get rename keys (coords + variables) + rename_keys = [k for k in possible_keys if k not in rename_dim_keys] + # Get rename dictionary + # - Remove keys which are missing from the dataset + rename_dict = {k: v for k, v in dict_names.items() if k in rename_keys} + # Rename dataset + ds = ds.rename(rename_dict) + # Rename dimensions + rename_dim_dict = {k: v for k, v in dict_names.items() if k in rename_dim_keys} + ds = ds.rename_dims(rename_dim_dict) + return ds + + +def subset_dataset(ds, dict_names, sensor_name): + # Get valid variable names + possible_variables = get_valid_variable_names(sensor_name) + # Get variables availables in the dict_names and dataset + dataset_variables = list(ds.data_vars) + dictionary_names = list(dict_names.values()) + # Get subset variables + subset_variables = [] + for var in dataset_variables: + if var in dictionary_names and var in possible_variables: + subset_variables.append(var) + # Subset the dataset + ds = ds[subset_variables] + return ds + + +def add_dataset_missing_variables(ds, missing_vars, sensor_name): + """Add missing Dataset variables as nan DataArrays.""" + from disdrodb.l0.standards import get_variables_dimension + + # Get dimension of each variables + var_dims_dict = get_variables_dimension(sensor_name) + # Attach a nan DataArray to the Dataset for each missing variable + for var in missing_vars: + # Get variable dimension + dims = var_dims_dict[var] + # Retrieve expected shape + expected_shape = [ds.dims[dim] for dim in dims] + # Create DataArray + arr = np.zeros(expected_shape) * np.nan + da = xr.DataArray(arr, dims=dims) + # Attach to dataset + ds[var] = da + return ds + + +def preprocess_raw_netcdf(ds, dict_names, sensor_name): + """This function preprocess raw netCDF to improve compatibility with DISDRODB standards. + + This function checks validity of the dict_names, rename and subset the data accordingly. + If some variables specified in the dict_names are missing, it adds a NaN DataArray ! + + Parameters + ---------- + ds : xr.Dataset + Raw netCDF to be converted to DISDRODB standards. + dict_names : dict + Dictionary mapping raw netCDF variables/coordinates/dimension names + to DISDRODB standards. + sensor_name : str + Sensor name. + + Returns + ------- + ds : xr.Dataset + xarray Dataset with DISDRODB-compliant variable naming conventions. + + """ + # Check variable_dict has valid values + # - Check valid DISDRODB variables + dimensions + coords + _check_dict_names_validity(dict_names=dict_names, sensor_name=sensor_name) + + # Rename dataset variables and coordinates + ds = rename_dataset(ds=ds, dict_names=dict_names) + + # Subset dataset with expected variables + ds = subset_dataset(ds=ds, dict_names=dict_names, sensor_name=sensor_name) + + # If missing variables, infill with NaN array + expected_vars = set(_get_dict_names_variables(dict_names, sensor_name)) + dataset_vars = set(ds.data_vars) + missing_vars = expected_vars.difference(dataset_vars) + if len(missing_vars) > 0: + ds = add_dataset_missing_variables(ds=ds, missing_vars=missing_vars, sensor_name=sensor_name) + + # Update the coordinates for (diameter and velocity) + coords = get_bin_coords(sensor_name) + ds = ds.assign_coords(coords) + + # Return dataset + return ds + + +def replace_custom_nan_flags(ds, dict_nan_flags): + """Set values corresponding to nan_flags to np.nan. + + Parameters + ---------- + df : xr.Dataset + Input xarray dataset + dict_nan_flags : dict + Dictionary with nan flags value to set as np.nan + + Returns + ------- + xr.Dataset + Dataset without nan_flags values. + """ + # Loop over the needed variable, and replace nan_flags values with np.nan + for var, nan_flags in dict_nan_flags.items(): + # If the variable is in the dataframe + if var in ds: + # Get occurrence of nan_flags + is_a_nan_flag = ds[var].isin(nan_flags) + # Replace with np.nan + ds[var] = ds[var].where(~is_a_nan_flag) + + # Return dataset + return ds + + +def replace_nan_flags(ds, sensor_name, verbose): + """Set values corresponding to nan_flags to np.nan. + + Parameters + ---------- + ds : xr.Dataset + Input xarray dataset + dict_nan_flags : dict + Dictionary with nan flags value to set as np.nan + verbose : bool + Whether to verbose the processing. + + Returns + ------- + xr.Dataset + Dataset without nan_flags values. + """ + # Get dictionary of nan flags + dict_nan_flags = get_nan_flags_dict(sensor_name) + + # Loop over the needed variable, and replace nan_flags values with np.nan + for var, nan_flags in dict_nan_flags.items(): + # If the variable is in the dataframe + if var in ds: + # Get occurrence of nan_flags + is_a_nan_flag = ds[var].isin(nan_flags) + n_nan_flags_values = np.sum(is_a_nan_flag.data) + if n_nan_flags_values > 0: + msg = f"In variable {var}, {n_nan_flags_values} values were nan_flags and were replaced to np.nan." + log_info(logger=logger, msg=msg, verbose=verbose) + # Replace with np.nan + ds[var] = ds[var].where(~is_a_nan_flag) + + # Return dataset + return ds + + +def set_nan_outside_data_range(ds, sensor_name, verbose): + """Set values outside the data range as np.nan. + + Parameters + ---------- + ds : xr.Dataset + Input xarray dataset + sensor_name : str + Name of the sensor. + verbose : bool + Whether to verbose the processing. + + Returns + ------- + xr.Dataset + Dataset without values outside the expected data range. + """ + # Get dictionary of data_range + dict_data_range = get_data_range_dict(sensor_name) + # Loop over the variable with a defined data_range + for var, data_range in dict_data_range.items(): + # If the variable is in the dataframe + if var in ds: + # Get min and max value + min_val = data_range[0] + max_val = data_range[1] + # Check within data range or already np.nan + is_valid = (ds[var] >= min_val) & (ds[var] <= max_val) | np.isnan(ds[var]) + # If there are values outside the data range, set to np.nan + n_invalid = np.sum(~is_valid.data) + if n_invalid > 0: + msg = f"{n_invalid} {var} values were outside the data range and were set to np.nan." + log_info(logger=logger, msg=msg, verbose=verbose) + ds[var] = ds[var].where(is_valid) # set not valid to np.nan + # Return dataset + return ds + + +def set_nan_invalid_values(ds, sensor_name, verbose): + """Set invalid (class) values to np.nan. + + Parameters + ---------- + ds : xr.Dataset + Input xarray dataset + sensor_name : str + Name of the sensor. + verbose : bool + Whether to verbose the processing. + + Returns + ------- + xr.Dataset + Dataset without invalid values. + """ + # Get dictionary of valid values + dict_valid_values = get_valid_values_dict(sensor_name) + # Loop over the variable with a defined data_range + for var, valid_values in dict_valid_values.items(): + # If the variable is in the dataframe + if var in ds: + # Get array with occurrence of correct values (or already np.nan) + is_valid_values = ds[var].isin(valid_values) | np.isnan(ds[var]) + # If invalid values are present, replace with np.nan + n_invalid_values = np.sum(~is_valid_values.data) + if n_invalid_values > 0: + msg = f"{n_invalid_values} {var} values were invalid and were replaced to np.nan." + log_info(logger=logger, msg=msg, verbose=verbose) + ds[var] = ds[var].where(is_valid_values) # set not valid to np.nan + + # Return dataset + return ds + + +def process_raw_nc( + filepath, + dict_names, + ds_sanitizer_fun, + sensor_name, + verbose, + attrs, +): + """Read and convert a raw netCDF into a DISDRODB L0B netCDF. + + Parameters + ---------- + filepath : str + netCDF file path. + dict_names : dict + Dictionary mapping raw netCDF variables/coordinates/dimension names + to DISDRODB standards. + ds_sanitizer_fun : function + Sanitizer function to do ad-hoc processing of the xr.Dataset. + attrs: dict + Global metadata to attach as global attributes to the xr.Dataset. + sensor_name : str + Name of the sensor. + verbose : bool + Whether to verbose the processing. + + + Returns + ------- + xr.Dataset + L0B xr.Dataset + """ + # Open the netCDF + with xr.open_dataset(filepath, cache=False) as data: + ds = data.load() + + # Preprocess netcdf + ds = preprocess_raw_netcdf(ds=ds, dict_names=dict_names, sensor_name=sensor_name) + + # Add CRS and geolocation information + attrs = copy.deepcopy(attrs) + coords = {} + geolocation_vars = ["latitude", "longitude", "altitude"] + for var in geolocation_vars: + if var not in ds: + coords[var] = attrs[var] + _ = attrs.pop(var) + ds = ds.assign_coords(coords) + + # Add global attributes + ds.attrs = attrs + + # Apply dataset sanitizer function + ds = ds_sanitizer_fun(ds) + + # Replace nan flags values with np.nans + ds = replace_nan_flags(ds, sensor_name=sensor_name, verbose=verbose) + + # Set values outside the data range to np.nan + ds = set_nan_outside_data_range(ds, sensor_name=sensor_name, verbose=verbose) + + # Replace invalid values with np.nan + ds = set_nan_invalid_values(ds, sensor_name=sensor_name, verbose=verbose) + + # Finalize dataset + ds = finalize_dataset(ds, sensor_name=sensor_name) + + # Return dataset + return ds diff --git a/disdrodb/l0/l0b_processing.py b/disdrodb/l0/l0b_processing.py index 337e43ec..7bab09e1 100644 --- a/disdrodb/l0/l0b_processing.py +++ b/disdrodb/l0/l0b_processing.py @@ -18,8 +18,6 @@ # -----------------------------------------------------------------------------. """Functions to process DISDRODB L0A files into DISDRODB L0B netCDF files.""" -# -----------------------------------------------------------------------------. -import copy import logging import os @@ -45,15 +43,10 @@ get_dims_size_dict, get_l0b_encodings_dict, get_long_name_dict, - get_nan_flags_dict, get_raw_array_dims_order, get_raw_array_nvalues, get_time_encoding, get_units_dict, - get_valid_dimension_names, - get_valid_names, - get_valid_values_dict, - get_valid_variable_names, get_velocity_bin_center, get_velocity_bin_lower, get_velocity_bin_upper, @@ -451,25 +444,6 @@ def add_dataset_crs_coords(ds): #### L0B Raw DataFrame Preprocessing -def _finalize_dataset(ds, sensor_name): - # Add dataset CRS coordinate - ds = add_dataset_crs_coords(ds) - - # Ensure variables with dtype object are converted to string - ds = convert_object_variables_to_string(ds) - - # Set netCDF dimension order - ds = ds.transpose("time", "diameter_bin_center", ...) - - # Add netCDF variable and coordinate attributes - ds = set_dataset_attrs(ds, sensor_name) - - # Check L0B standards - check_l0b_standards(ds) - - return ds - - def _define_dataset_variables(df, sensor_name, verbose): # Preprocess raw_spectrum, diameter and velocity arrays if available if np.any( @@ -573,7 +547,7 @@ def create_l0b_from_l0a( log_error(logger=logger, msg=msg, verbose=False) raise ValueError(msg) - ds = _finalize_dataset(ds, sensor_name=sensor_name) + ds = finalize_dataset(ds, sensor_name=sensor_name) # ----------------------------------------------------------- return ds @@ -583,6 +557,25 @@ def create_l0b_from_l0a( #### L0B netCDF4 Writer +def finalize_dataset(ds, sensor_name): + """Finalize DISDRODB L0B Dataset.""" + # Add dataset CRS coordinate + ds = add_dataset_crs_coords(ds) + + # Set netCDF dimension order + ds = ds.transpose("time", "diameter_bin_center", ...) + + # Add netCDF variable and coordinate attributes + ds = set_dataset_attrs(ds, sensor_name) + + # Ensure variables with dtype object are converted to string + ds = convert_object_variables_to_string(ds) + + # Check L0B standards + check_l0b_standards(ds) + return ds + + def sanitize_encodings_dict(encoding_dict: dict, ds: xr.Dataset) -> dict: """Ensure chunk size to be smaller than the array shape. @@ -700,372 +693,4 @@ def write_l0b(ds: xr.Dataset, fpath: str, force=False) -> None: ds.to_netcdf(fpath, engine="netcdf4") -####--------------------------------------------------------------------------. -#### L0B Raw netCDFs Preprocessing - - -def _check_dict_names_validity(dict_names, sensor_name): - """Check dict_names dictionary values validity.""" - valid_names = get_valid_names(sensor_name) - keys = np.array(list(dict_names.keys())) - values = np.array(list(dict_names.values())) - # Get invalid keys - invalid_keys = keys[np.isin(values, valid_names, invert=True)] - if len(invalid_keys) > 0: - # Report invalid keys and raise error - invalid_dict = {k: dict_names[k] for k in invalid_keys} - msg = f"The following dict_names values are not valid: {invalid_dict}" - log_error(logger=logger, msg=msg, verbose=False) - raise ValueError(msg) - return None - - -def _get_dict_names_variables(dict_names, sensor_name): - """Get DISDRODB variables specified in dict_names.""" - possible_variables = get_valid_variable_names(sensor_name) - dictionary_names = list(dict_names.values()) - variables = [name for name in dictionary_names if name in possible_variables] - return variables - - -def _get_dict_names_dimensions(dict_names, sensor_name): - """Get DISDRODB dimensions specified in dict_names.""" - possible_dims = get_valid_dimension_names(sensor_name) - dictionary_names = list(dict_names.values()) - dims = [name for name in dictionary_names if name in possible_dims] - return dims - - -def _get_dict_dims(dict_names, sensor_name): - dims = _get_dict_names_dimensions(dict_names, sensor_name) - dict_dims = {k: v for k, v in dict_names.items() if v in dims} - return dict_dims - - -def rename_dataset(ds, dict_names): - """Rename Dataset variables, coordinates and dimensions.""" - # Get dataset variables, coordinates and dimensions of the dataset - ds_vars = list(ds.data_vars) - ds_dims = list(ds.dims) - ds_coords = list(ds.coords) - # Possible keys - possible_keys = ds_vars + ds_coords + ds_dims - # Get keys that are dimensions but not coordinates - rename_dim_keys = [dim for dim in ds_dims if dim not in ds_coords] - # Get rename keys (coords + variables) - rename_keys = [k for k in possible_keys if k not in rename_dim_keys] - # Get rename dictionary - # - Remove keys which are missing from the dataset - rename_dict = {k: v for k, v in dict_names.items() if k in rename_keys} - # Rename dataset - ds = ds.rename(rename_dict) - # Rename dimensions - rename_dim_dict = {k: v for k, v in dict_names.items() if k in rename_dim_keys} - ds = ds.rename_dims(rename_dim_dict) - return ds - - -def subset_dataset(ds, dict_names, sensor_name): - # Get valid variable names - possible_variables = get_valid_variable_names(sensor_name) - # Get variables availables in the dict_names and dataset - dataset_variables = list(ds.data_vars) - dictionary_names = list(dict_names.values()) - # Get subset variables - subset_variables = [] - for var in dataset_variables: - if var in dictionary_names and var in possible_variables: - subset_variables.append(var) - # Subset the dataset - ds = ds[subset_variables] - return ds - - -def add_dataset_missing_variables(ds, missing_vars, sensor_name): - """Add missing Dataset variables as nan DataArrays.""" - from disdrodb.l0.standards import get_variables_dimension - - # Get dimension of each variables - var_dims_dict = get_variables_dimension(sensor_name) - # Attach a nan DataArray to the Dataset for each missing variable - for var in missing_vars: - # Get variable dimension - dims = var_dims_dict[var] - # Retrieve expected shape - expected_shape = [ds.dims[dim] for dim in dims] - # Create DataArray - arr = np.zeros(expected_shape) * np.nan - da = xr.DataArray(arr, dims=dims) - # Attach to dataset - ds[var] = da - return ds - - -def preprocess_raw_netcdf(ds, dict_names, sensor_name): - """This function preprocess raw netCDF to improve compatibility with DISDRODB standards. - - This function checks validity of the dict_names, rename and subset the data accordingly. - If some variables specified in the dict_names are missing, it adds a NaN DataArray ! - - Parameters - ---------- - ds : xr.Dataset - Raw netCDF to be converted to DISDRODB standards. - dict_names : dict - Dictionary mapping raw netCDF variables/coordinates/dimension names - to DISDRODB standards. - sensor_name : str - Sensor name. - - Returns - ------- - ds : xr.Dataset - xarray Dataset with DISDRODB-compliant variable naming conventions. - - """ - # Check variable_dict has valid values - # - Check valid DISDRODB variables + dimensions + coords - _check_dict_names_validity(dict_names=dict_names, sensor_name=sensor_name) - - # Rename dataset variables and coordinates - ds = rename_dataset(ds=ds, dict_names=dict_names) - - # Subset dataset with expected variables - ds = subset_dataset(ds=ds, dict_names=dict_names, sensor_name=sensor_name) - - # If missing variables, infill with NaN array - expected_vars = set(_get_dict_names_variables(dict_names, sensor_name)) - dataset_vars = set(ds.data_vars) - missing_vars = expected_vars.difference(dataset_vars) - if len(missing_vars) > 0: - ds = add_dataset_missing_variables(ds=ds, missing_vars=missing_vars, sensor_name=sensor_name) - - # Update the coordinates for (diameter and velocity) - coords = get_bin_coords(sensor_name) - ds = ds.assign_coords(coords) - - # Return dataset - return ds - - -def process_raw_nc( - filepath, - dict_names, - ds_sanitizer_fun, - sensor_name, - verbose, - attrs, -): - """Read and convert a raw netCDF into a DISDRODB L0B netCDF. - - Parameters - ---------- - filepath : str - netCDF file path. - dict_names : dict - Dictionary mapping raw netCDF variables/coordinates/dimension names - to DISDRODB standards. - ds_sanitizer_fun : function - Sanitizer function to do ad-hoc processing of the xr.Dataset. - attrs: dict - Global metadata to attach as global attributes to the xr.Dataset. - sensor_name : str - Name of the sensor. - verbose : bool - Whether to verbose the processing. - - - Returns - ------- - xr.Dataset - L0B xr.Dataset - """ - # Open the netCDF - with xr.open_dataset(filepath, cache=False) as data: - ds = data.load() - - # Preprocess netcdf - ds = preprocess_raw_netcdf(ds=ds, dict_names=dict_names, sensor_name=sensor_name) - - # Add CRS and geolocation information - attrs = copy.deepcopy(attrs) - coords = {} - geolocation_vars = ["latitude", "longitude", "altitude"] - for var in geolocation_vars: - if var not in ds: - coords[var] = attrs[var] - _ = attrs.pop(var) - ds = ds.assign_coords(coords) - ds = add_dataset_crs_coords(ds) - - # Add global attributes - ds.attrs = attrs - - # Apply dataset sanitizer function - ds = ds_sanitizer_fun(ds) - - # - Replace nan flags values with np.nans - ds = replace_nan_flags(ds, sensor_name=sensor_name, verbose=verbose) - - # - Set values outside the data range to np.nan - ds = set_nan_outside_data_range(ds, sensor_name=sensor_name, verbose=verbose) - - # - Replace invalid values with np.nan - ds = set_nan_invalid_values(ds, sensor_name=sensor_name, verbose=verbose) - - # Ensure variables with dtype object are converted to string - ds = convert_object_variables_to_string(ds) - - # Set netCDF dimension order - ds = ds.transpose("time", "diameter_bin_center", ...) - - # Add netCDF variable and coordinate attributes - ds = set_dataset_attrs(ds, sensor_name) - - # Check L0B standards - check_l0b_standards(ds) - - # Return dataset - return ds - - -def replace_custom_nan_flags(ds, dict_nan_flags): - """Set values corresponding to nan_flags to np.nan. - - Parameters - ---------- - df : xr.Dataset - Input xarray dataset - dict_nan_flags : dict - Dictionary with nan flags value to set as np.nan - - Returns - ------- - xr.Dataset - Dataset without nan_flags values. - """ - # Loop over the needed variable, and replace nan_flags values with np.nan - for var, nan_flags in dict_nan_flags.items(): - # If the variable is in the dataframe - if var in ds: - # Get occurrence of nan_flags - is_a_nan_flag = ds[var].isin(nan_flags) - # Replace with np.nan - ds[var] = ds[var].where(~is_a_nan_flag) - - # Return dataset - return ds - - -def replace_nan_flags(ds, sensor_name, verbose): - """Set values corresponding to nan_flags to np.nan. - - Parameters - ---------- - ds : xr.Dataset - Input xarray dataset - dict_nan_flags : dict - Dictionary with nan flags value to set as np.nan - verbose : bool - Whether to verbose the processing. - - Returns - ------- - xr.Dataset - Dataset without nan_flags values. - """ - # Get dictionary of nan flags - dict_nan_flags = get_nan_flags_dict(sensor_name) - - # Loop over the needed variable, and replace nan_flags values with np.nan - for var, nan_flags in dict_nan_flags.items(): - # If the variable is in the dataframe - if var in ds: - # Get occurrence of nan_flags - is_a_nan_flag = ds[var].isin(nan_flags) - n_nan_flags_values = np.sum(is_a_nan_flag.data) - if n_nan_flags_values > 0: - msg = f"In variable {var}, {n_nan_flags_values} values were nan_flags and were replaced to np.nan." - log_info(logger=logger, msg=msg, verbose=verbose) - # Replace with np.nan - ds[var] = ds[var].where(~is_a_nan_flag) - - # Return dataset - return ds - - -def set_nan_outside_data_range(ds, sensor_name, verbose): - """Set values outside the data range as np.nan. - - Parameters - ---------- - ds : xr.Dataset - Input xarray dataset - sensor_name : str - Name of the sensor. - verbose : bool - Whether to verbose the processing. - - Returns - ------- - xr.Dataset - Dataset without values outside the expected data range. - """ - # Get dictionary of data_range - dict_data_range = get_data_range_dict(sensor_name) - # Loop over the variable with a defined data_range - for var, data_range in dict_data_range.items(): - # If the variable is in the dataframe - if var in ds: - # Get min and max value - min_val = data_range[0] - max_val = data_range[1] - # Check within data range or already np.nan - is_valid = (ds[var] >= min_val) & (ds[var] <= max_val) | np.isnan(ds[var]) - # If there are values outside the data range, set to np.nan - n_invalid = np.sum(~is_valid.data) - if n_invalid > 0: - msg = f"{n_invalid} {var} values were outside the data range and were set to np.nan." - log_info(logger=logger, msg=msg, verbose=verbose) - ds[var] = ds[var].where(is_valid) # set not valid to np.nan - # Return dataset - return ds - - -def set_nan_invalid_values(ds, sensor_name, verbose): - """Set invalid (class) values to np.nan. - - Parameters - ---------- - ds : xr.Dataset - Input xarray dataset - sensor_name : str - Name of the sensor. - verbose : bool - Whether to verbose the processing. - - Returns - ------- - xr.Dataset - Dataset without invalid values. - """ - # Get dictionary of valid values - dict_valid_values = get_valid_values_dict(sensor_name) - # Loop over the variable with a defined data_range - for var, valid_values in dict_valid_values.items(): - # If the variable is in the dataframe - if var in ds: - # Get array with occurrence of correct values (or already np.nan) - is_valid_values = ds[var].isin(valid_values) | np.isnan(ds[var]) - # If invalid values are present, replace with np.nan - n_invalid_values = np.sum(~is_valid_values.data) - if n_invalid_values > 0: - msg = f"{n_invalid_values} {var} values were invalid and were replaced to np.nan." - log_info(logger=logger, msg=msg, verbose=verbose) - ds[var] = ds[var].where(is_valid_values) # set not valid to np.nan - - # Return dataset - return ds - - ####--------------------------------------------------------------------------. diff --git a/disdrodb/l0/readers/ARM/ARM_LD.py b/disdrodb/l0/readers/ARM/ARM_LD.py index 5fa85e38..2628f2bc 100644 --- a/disdrodb/l0/readers/ARM/ARM_LD.py +++ b/disdrodb/l0/readers/ARM/ARM_LD.py @@ -86,7 +86,7 @@ def reader( # Define dataset sanitizer def ds_sanitizer_fun(ds): - from disdrodb.l0.l0b_processing import replace_custom_nan_flags + from disdrodb.l0.l0b_nc_processing import replace_custom_nan_flags # Replace nan flags with np.nan # - ARM use the -9999 flags diff --git a/disdrodb/l0/readers/ARM/ARM_LPM.py b/disdrodb/l0/readers/ARM/ARM_LPM.py index 03399ece..6d110490 100644 --- a/disdrodb/l0/readers/ARM/ARM_LPM.py +++ b/disdrodb/l0/readers/ARM/ARM_LPM.py @@ -101,7 +101,7 @@ def reader( # Define dataset sanitizer def ds_sanitizer_fun(ds): - from disdrodb.l0.l0b_processing import replace_custom_nan_flags + from disdrodb.l0.l0b_nc_processing import replace_custom_nan_flags # Replace nan flags with np.nan # - ARM use the -9999 flags diff --git a/disdrodb/l0/readers/MEXICO/OH_IIUNAM_nc.py b/disdrodb/l0/readers/MEXICO/OH_IIUNAM_nc.py index 67d7fc52..6588d200 100644 --- a/disdrodb/l0/readers/MEXICO/OH_IIUNAM_nc.py +++ b/disdrodb/l0/readers/MEXICO/OH_IIUNAM_nc.py @@ -50,7 +50,7 @@ def reader( # Define dataset sanitizer def ds_sanitizer_fun(ds): - # from disdrodb.l0.l0b_processing import replace_custom_nan_flags + # from disdrodb.l0.l0b_nc_processing import replace_custom_nan_flags # # Replace nan flags with np.nan # # - ARM use the -9999 flags diff --git a/disdrodb/l0/scripts/run_disdrodb_l0b_concat.py b/disdrodb/l0/scripts/run_disdrodb_l0b_concat.py index 479f4a41..0cb69c3f 100644 --- a/disdrodb/l0/scripts/run_disdrodb_l0b_concat.py +++ b/disdrodb/l0/scripts/run_disdrodb_l0b_concat.py @@ -72,7 +72,7 @@ def run_disdrodb_l0b_concat( Whether to print detailed processing information into terminal. The default is False. """ - from disdrodb.l0.L0B_concat import run_disdrodb_l0b_concat + from disdrodb.l0.l0b_nc_concat import run_disdrodb_l0b_concat # Parse data_sources, campaign_names and station arguments data_sources = parse_arg_to_list(data_sources) diff --git a/disdrodb/l0/scripts/run_disdrodb_l0b_concat_station.py b/disdrodb/l0/scripts/run_disdrodb_l0b_concat_station.py index 6b2cfcd8..f5778fde 100644 --- a/disdrodb/l0/scripts/run_disdrodb_l0b_concat_station.py +++ b/disdrodb/l0/scripts/run_disdrodb_l0b_concat_station.py @@ -65,7 +65,7 @@ def run_disdrodb_l0b_concat_station( The default is False. """ from disdrodb.api.io import _get_disdrodb_directory - from disdrodb.l0.l0b_concat import _concatenate_netcdf_files + from disdrodb.l0.l0b_nc_concat import _concatenate_netcdf_files # Retrieve processed_dir processed_dir = _get_disdrodb_directory( diff --git a/disdrodb/tests/test_l0/test_l0b_concat.py b/disdrodb/tests/test_l0/test_l0b_concat.py index fa2ebdf4..284735f9 100644 --- a/disdrodb/tests/test_l0/test_l0b_concat.py +++ b/disdrodb/tests/test_l0/test_l0b_concat.py @@ -6,7 +6,7 @@ import xarray as xr import yaml -from disdrodb.l0.l0b_concat import _concatenate_netcdf_files, run_disdrodb_l0b_concat +from disdrodb.l0.l0b_nc_concat import _concatenate_netcdf_files, run_disdrodb_l0b_concat def create_dummy_netcdf_file(filename: str, data: tuple): diff --git a/docs/source/l0_processing.rst b/docs/source/l0_processing.rst index fdce5b0a..a3236507 100644 --- a/docs/source/l0_processing.rst +++ b/docs/source/l0_processing.rst @@ -143,7 +143,6 @@ For example, if only ``--campaign_names`` are specified, DISDRODB will process o verbose=True debugging_mode=True parallel=False - l0b_concat=True run_disdrodb_l0( disdrodb_dir=disdrodb_dir, diff --git a/docs/source/software_structure.rst b/docs/source/software_structure.rst index 0cc72c20..f2537da6 100644 --- a/docs/source/software_structure.rst +++ b/docs/source/software_structure.rst @@ -40,8 +40,9 @@ The current software structure is described below: | ├── 📜 l0_processing.py | ├── 📜 l0a_processing.py | ├── 📜 l0b_processing.py -| ├── 📜 l0b_concat.py | ├── 📜 l0b_processing.py +| ├── 📜 l0b_nc_processing.py +| ├── 📜 l0b_nc_concat.py | ├── 📜 l0_reader.py | ├── 📜 metadata.py | ├── 📜 standards.py