Skip to content

Commit

Permalink
Add initial new code for bucket partitioning
Browse files Browse the repository at this point in the history
  • Loading branch information
ghiggi committed May 24, 2024
1 parent 2330085 commit ce4b8fb
Show file tree
Hide file tree
Showing 14 changed files with 1,513 additions and 301 deletions.
2 changes: 1 addition & 1 deletion docs/source/03_quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ as well as other parameters such as the directory on your local machine where to
save the GPM dataset of interest.

To facilitate the creation of the configuration file, you can adapt and run the following script in Python.
The configuration file will be created in the user's home directory under the name ``.config_gpm.yaml``.
The configuration file will be created in the user's home directory under the name ``.config_gpm_api.yaml``.

.. code-block:: python
Expand Down
2 changes: 1 addition & 1 deletion gpm/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@


def _get_default_configs():
"""Retrieve the default GPM-API settings from the ``.config_gpm.yaml`` file."""
"""Retrieve the default GPM-API settings from the ``.config_gpm_api.yaml`` file."""
try:
config_dict = read_configs()
config_dict = {key: value for key, value in config_dict.items() if value is not None}
Expand Down
4 changes: 2 additions & 2 deletions gpm/accessor/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,9 @@ def sel(self, indexers=None, drop=False, **indexers_kwargs):

@auto_wrap_docstring
def extent(self, padding=0, size=None):
from gpm.utils.geospatial import get_extent
from gpm.utils.geospatial import get_geographic_extent_from_xarray

return get_extent(self._obj, padding=padding, size=size)
return get_geographic_extent_from_xarray(self._obj, padding=padding, size=size)

@auto_wrap_docstring
def crop(self, extent):
Expand Down
223 changes: 223 additions & 0 deletions gpm/bucket/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,118 @@
import pandas as pd
import polars as pl

from gpm.utils.geospatial import _check_size

# TODO: in processing.py, replace assign_spatial_partitions and get_bin_partition
# (presumably with the implementations defined below - to be confirmed).


def get_bin_partition(values, bin_size):
    """Compute the lower edge of the bin containing each value.

    Each value is mapped to ``bin_size * floor(value / bin_size)``, i.e. the
    largest multiple of ``bin_size`` that is lower than or equal to the value.

    Parameters
    ----------
    values : float or array-like
        Value(s) to assign to a bin. Plain lists and tuples are converted to
        a numpy array; any other input is passed to the arithmetic unchanged.
    bin_size : float
        Bin size.

    Returns
    -------
    float or array-like
        Lower edge of the bin of each value.

    Examples
    --------
    >>> get_bin_partition([-176, 166], 10)
    array([-180.,  160.])

    """
    # Built-in sequences do not support element-wise division, so convert them.
    # Other objects are deliberately left untouched.
    if isinstance(values, (list, tuple)):
        values = np.asarray(values)
    return bin_size * np.floor(values / bin_size)


# bin_size = 10
# values = np.array([-180,-176,-175, -174, -171, 170, 166])
# get_bin_partition(values, bin_size)


def assign_spatial_partitions(
    df,
    xbin_name,
    ybin_name,
    xbin_size,
    ybin_size,
    x_column="lat",
    y_column="lon",
):
    """Return the dataframe with two additional spatial partitioning columns.

    Rows with a missing value in ``x_column`` or ``y_column`` are discarded.
    The columns ``xbin_name`` and ``ybin_name`` are then added, holding the
    output of ``get_bin_partition`` for the x and y columns respectively.

    Works for both `dask.dataframe.DataFrame` and `pandas.DataFrame`.
    """
    # NOTE(review): the defaults map x to "lat" and y to "lon"; confirm this
    # is intended before relying on them.
    # Discard the rows with invalid coordinates
    for coordinate in (x_column, y_column):
        df = df[~df[coordinate].isna()]

    # Compute the bin value of each remaining row
    x_partition = get_bin_partition(df[x_column], bin_size=xbin_size)
    y_partition = get_bin_partition(df[y_column], bin_size=ybin_size)
    return df.assign(**{xbin_name: x_partition, ybin_name: y_partition})

Check notice on line 86 in gpm/bucket/analysis.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

ℹ New issue: Excess Number of Function Arguments

assign_spatial_partitions has 7 arguments, threshold = 4. This function has too many arguments, indicating a lack of encapsulation. Avoid adding more arguments.


def _get_bin_edges(vmin, vmax, size):
    """Return evenly spaced bin edges starting at ``vmin`` with step ``size``."""
    # np.arange excludes its stop value: the tiny offset allows vmax itself
    # to be returned as the last edge when it falls on the grid.
    stop = vmax + 1e-10
    return np.arange(vmin, stop, step=size)


def _get_bin_midpoints(vmin, vmax, size):
    """Return the midpoint of each bin delimited by the bin edges."""
    bin_edges = _get_bin_edges(vmin=vmin, vmax=vmax, size=size)
    # Midpoint = left edge of the bin + half of the bin width
    left_edges = bin_edges[:-1]
    half_widths = np.diff(bin_edges) / 2
    return left_edges + half_widths


def create_spatial_bin_empty_df(
    xbin_size=1,
    ybin_size=1,
    xlim=(-180, 180),
    ylim=(-90, 90),
    xbin_name="xbin",
    ybin_name="ybin",
):
    """Create a column-less DataFrame indexed by every (x, y) bin midpoint pair.

    The index is a ``pandas.MultiIndex`` built from the cartesian product of
    the x and y bin midpoints, with levels named ``xbin_name`` and ``ybin_name``.
    """
    # Bin midpoints along x and y (in this order)
    midpoints = [
        _get_bin_midpoints(vmin=limits[0], vmax=limits[1], size=bin_size)
        for limits, bin_size in ((xlim, xbin_size), (ylim, ybin_size))
    ]

    # All the combinations of x and y bins
    index = pd.MultiIndex.from_product(midpoints, names=[xbin_name, ybin_name])

    # DataFrame without columns, carrying only the MultiIndex
    return pd.DataFrame(index=index)

Check notice on line 120 in gpm/bucket/analysis.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

ℹ New issue: Excess Number of Function Arguments

create_spatial_bin_empty_df has 6 arguments, threshold = 4. This function has too many arguments, indicating a lack of encapsulation. Avoid adding more arguments.


def add_bin_column(df, column, bin_size, vmin, vmax, bin_name, add_midpoint=True):
    """Bin the values of a column and store the result in a new column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataframe. It is not modified.
    column : str
        Name of the column to bin.
    bin_size : float
        Bin size.
    vmin, vmax : float
        Limits of the binning. Rows whose value is not within ``[vmin, vmax]``
        (both limits included) are discarded.
    bin_name : str
        Name of the column to add.
    add_midpoint : bool, optional
        If ``True`` (the default), the new column holds the bin midpoint.
        Otherwise it holds the 0-based bin index.

    Returns
    -------
    pandas.DataFrame
        New dataframe with the valid rows and the ``bin_name`` column.

    """
    # Keep rows within [vmin, vmax].
    # - Take a copy so that adding the column below does not write on a slice
    #   of the input dataframe (SettingWithCopyWarning).
    valid_rows = df[column].between(left=vmin, right=vmax, inclusive="both")
    df = df.loc[valid_rows, :].copy()

    # Get bin edges and midpoints
    bin_edges = _get_bin_edges(vmin=vmin, vmax=vmax, size=bin_size)
    bin_midpoints = _get_bin_midpoints(vmin=vmin, vmax=vmax, size=bin_size)

    # Get bin index
    # - np.digitize(right=False) returns i such that edges[i-1] <= x < edges[i]
    # - 0 is outside to the left of the bins
    # - len(edges) is on, or to the right of, the last edge
    # --> Subtract 1 to obtain a 0-based bin index
    bin_idx = np.digitize(df[column], bins=bin_edges, right=False) - 1

    # Values equal to vmax pass the inclusive filter above but lie on the last
    # edge, which yields an index one past the last bin. Fold them into the
    # last bin instead of failing with an IndexError.
    last_bin = max(len(bin_midpoints) - 1, 0)
    bin_idx = np.minimum(bin_idx, last_bin)

    # Add bin index/midpoint values
    df[bin_name] = bin_midpoints[bin_idx] if add_midpoint else bin_idx
    return df

Check notice on line 143 in gpm/bucket/analysis.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

ℹ New issue: Excess Number of Function Arguments

add_bin_column has 7 arguments, threshold = 4. This function has too many arguments, indicating a lack of encapsulation. Avoid adding more arguments.


def get_n_decimals(number):
number_str = str(number)
Expand Down Expand Up @@ -95,6 +207,40 @@ def get_cut_lon_breaks_labels(bin_spacing):
return cut_lon_breaks, cut_lon_labels


def add_spatial_bins(
    df,
    x="x",
    y="y",
    xbin_size=1,
    ybin_size=1,
    xlim=(-180, 180),
    ylim=(-90, 90),
    xbin_name="xbin",
    ybin_name="ybin",
    add_bin_midpoint=True,
):
    """Add the x and y bin columns to the dataframe.

    ``add_bin_column`` is applied first to the ``x`` column and then, on its
    output, to the ``y`` column. The first element of ``xlim``/``ylim`` is
    used as lower limit and the second as upper limit.
    """
    # (column, bin size, limits, output column name) for x and then y
    axes = (
        (x, xbin_size, xlim, xbin_name),
        (y, ybin_size, ylim, ybin_name),
    )
    for column, bin_size, limits, bin_name in axes:
        df = add_bin_column(
            df=df,
            column=column,
            bin_size=bin_size,
            vmin=limits[0],
            vmax=limits[1],
            bin_name=bin_name,
            add_midpoint=add_bin_midpoint,
        )
    return df

Check notice on line 241 in gpm/bucket/analysis.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

ℹ New issue: Excess Number of Function Arguments

add_spatial_bins has 10 arguments, threshold = 4. This function has too many arguments, indicating a lack of encapsulation. Avoid adding more arguments.


def pl_add_geographic_bins(
df,
xbin_column,
Expand All @@ -113,6 +259,49 @@ def pl_add_geographic_bins(
# df.filter(pl.col(xbin_column) == "outside_right")


def add_geographic_bins(
    df,
    x,
    y,
    xbin,
    ybin,
    size,
    extent,
    add_bin_midpoint=True,
):
    """Add the x and y bin columns to a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame or other dataframe
        A pandas dataframe is processed with ``add_spatial_bins``. Any other
        object is passed to ``pl_add_geographic_bins``.
    x, y : str
        Names of the columns with the x and y coordinates.
    xbin, ybin : str
        Names of the bin columns to add.
    size : object
        Bin size. It is validated with ``_check_size``, whose output is
        indexed as ``(x bin size, y bin size)``.
    extent : sequence
        Binning limits, indexed as ``[xmin, xmax, ymin, ymax]``.
        Only used for pandas dataframes.
    add_bin_midpoint : bool, optional
        Forwarded to ``add_spatial_bins``. Only used for pandas dataframes.

    Returns
    -------
    Dataframe with the bin columns.

    """
    size = _check_size(size)
    if isinstance(df, pd.DataFrame):
        # extent[0:2] are the x limits and extent[2:4] the y limits
        df = add_spatial_bins(
            df=df,
            x=x,
            y=y,
            xbin_name=xbin,
            ybin_name=ybin,
            xbin_size=size[0],
            ybin_size=size[1],
            xlim=extent[0:2],
            ylim=extent[2:4],
            add_bin_midpoint=add_bin_midpoint,
        )
    else:
        # TODO: extent and add_bin_midpoint are not passed on in this branch
        df = pl_add_geographic_bins(
            df=df,
            xbin_column=xbin,
            ybin_column=ybin,
            bin_spacing=size,
            x_column=x,
            y_column=y,
        )
    return df

Check notice on line 298 in gpm/bucket/analysis.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

ℹ New issue: Excess Number of Function Arguments

add_geographic_bins has 8 arguments, threshold = 4. This function has too many arguments, indicating a lack of encapsulation. Avoid adding more arguments.


####----------------------------------------------------------------.
#### Conversion to xarray Dataset


def pl_df_to_xarray(df, xbin_column, ybin_column, bin_spacing):
df_stats_pd = df.to_pandas()

Expand All @@ -138,3 +327,37 @@ def pl_df_to_xarray(df, xbin_column, ybin_column, bin_spacing):
# Reshape to xarray
ds = df_stats_pd.to_xarray()
return ds.rename({xbin_column: "longitude", ybin_column: "latitude"})


def pd_df_to_xarray(df, xbin, ybin, size):
    """Reshape a binned pandas dataframe into an xarray object.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe either indexed by ``xbin`` and ``ybin`` or holding them as
        columns. It is not modified.
    xbin, ybin : str
        Names of the x and y bin columns (or index levels).
    size : object
        Bin size. It is validated with ``_check_size``, whose output is
        indexed as ``(x bin size, y bin size)``.

    Returns
    -------
    Output of ``pandas.DataFrame.to_xarray`` applied to the dataframe
    left-joined onto the full (xbin, ybin) grid of labels.

    """
    size = _check_size(size)
    if set(df.index.names) != {xbin, ybin}:
        # Work on a copy: casting the bin columns must not alter the
        # dataframe owned by the caller.
        df = df.copy()
        df[xbin] = df[xbin].astype(float)
        df[ybin] = df[ybin].astype(float)
        df = df.set_index([xbin, ybin])

    # Create an empty DataFrame indexed by all label combinations
    # NOTE(review): get_lon_labels/get_lat_labels presumably return the bin
    # labels of the whole longitude/latitude range for the given size - confirm.
    lon_labels = get_lon_labels(size[0])
    lat_labels = get_lat_labels(size[1])
    multi_index = pd.MultiIndex.from_product(
        [lon_labels, lat_labels],
        names=[xbin, ybin],
    )
    empty_df = pd.DataFrame(index=multi_index)

    # Create final dataframe (label combinations missing in df get NaN values)
    df_full = empty_df.join(df, how="left")

    # Reshape to xarray
    return df_full.to_xarray()


def df_to_dataset(df, xbin, ybin, size, extent):
    """Convert a binned pandas or polars dataframe to an xarray object.

    Parameters
    ----------
    df : pandas.DataFrame or polars.DataFrame
        Binned dataframe. A polars dataframe is first converted to pandas.
    xbin, ybin : str
        Names of the x and y bin columns.
    size : object
        Bin size. It is validated with ``_check_size``.
    extent : object
        Currently unused. ``pd_df_to_xarray`` takes only ``df``, ``xbin``,
        ``ybin`` and ``size``; forwarding ``extent`` to it raises a TypeError.
        The parameter is kept so that existing callers keep working.

    Returns
    -------
    Output of ``pd_df_to_xarray``.

    Raises
    ------
    TypeError
        If ``df`` is neither a pandas nor a polars dataframe.

    """
    if isinstance(df, pl.DataFrame):
        df = df.to_pandas()
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Expecting a pandas or polars DataFrame.")
    size = _check_size(size)
    # TODO: restrict the output grid to `extent` once pd_df_to_xarray supports it
    return pd_df_to_xarray(df, xbin=xbin, ybin=ybin, size=size)

Check notice on line 363 in gpm/bucket/analysis.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

ℹ New issue: Excess Number of Function Arguments

df_to_dataset has 5 arguments, threshold = 4. This function has too many arguments, indicating a lack of encapsulation. Avoid adding more arguments.
Loading

0 comments on commit ce4b8fb

Please sign in to comment.