Generalize add_labels, add_centroids and to_xarray methods

ghiggi · May 31, 2024 · 9b468ce · 9b468ce
1 parent 281e56b
commit 9b468ce
Show file tree

Hide file tree

Showing 6 changed files with 586 additions and 471 deletions.
diff --git a/gpm/bucket/__init__.py b/gpm/bucket/__init__.py
@@ -25,11 +25,11 @@
 
 # -----------------------------------------------------------------------------.
 """This directory defines the GPM-API geographic binning toolbox."""
-from gpm.bucket.partitioning import LonLatPartitioning  # , TilePartitioning
+from gpm.bucket.partitioning import LonLatPartitioning, TilePartitioning
 from gpm.bucket.readers import read_bucket as read
 
 __all__ = [
     "LonLatPartitioning",
-    # "TilePartitioning"
+    "TilePartitioning",
     "read",
 ]
diff --git a/gpm/bucket/dataframe.py b/gpm/bucket/dataframe.py
@@ -0,0 +1,88 @@
+# -----------------------------------------------------------------------------.
+# MIT License
+
+# Copyright (c) 2024 GPM-API developers
+#
+# This file is part of GPM-API.
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# -----------------------------------------------------------------------------.
+"""This module implements manipulation wrappers for multiple DataFrame classes."""
+import dask.dataframe as dd
+import numpy as np
+import pandas as pd
+import polars as pl
+import pyarrow as pa
+
+
+def check_valid_dataframe(df):
+    """Raise a ``TypeError`` if ``df`` is not a supported dataframe class.
+
+    The accepted classes are ``polars.DataFrame``, ``polars.LazyFrame``,
+    ``pyarrow.Table``, ``dask.dataframe.DataFrame`` and ``pandas.DataFrame``.
+    """
+    if isinstance(df, (pl.DataFrame, pl.LazyFrame, pa.Table, dd.DataFrame, pd.DataFrame)):
+        return
+    class_name = repr(df.__class__)
+    raise TypeError(f"Dataframe operations not yet implemented for {class_name}")
+
+
+def df_is_column_in(df, column):
+    """Return whether ``column`` is one of the columns of the dataframe."""
+    # A pyarrow.Table is queried through its ``column_names`` attribute, while
+    # the other dataframe classes are tested directly with the ``in`` operator.
+    container = df.column_names if isinstance(df, pa.Table) else df
+    return column in container
+
+
+def df_get_column(df, column):
+    """Return the requested column of the dataframe.
+
+    For a polars LazyFrame, only the requested column is selected and
+    collected before being returned.
+    """
+    if not isinstance(df, pl.LazyFrame):
+        return df[column]
+    collected = df.select(column).collect()
+    return collected[column]
+
+
+def df_select_valid_rows(df, valid_rows):
+    """Keep only the dataframe rows flagged as valid by a boolean array."""
+    if isinstance(df, dd.DataFrame):
+        # Providing the boolean array directly is reported as buggy when
+        # npartitions>1, so the mask is first converted to integer positions.
+        row_positions = np.where(valid_rows)[0]
+        return df.loc[row_positions]
+    if isinstance(df, (pl.DataFrame, pl.LazyFrame, pa.Table)):
+        return df.filter(valid_rows)
+    # Remaining case (expected to be a pandas.DataFrame)
+    return df.loc[valid_rows]
+
+
+def df_add_column(df, column, values):
+    """Add a column to the dataframe.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame, dask.dataframe.DataFrame, polars.DataFrame, polars.LazyFrame or pyarrow.Table
+        Dataframe to which the column is added.
+    column : str
+        Name of the new column.
+    values : array-like
+        Values of the new column, one per row and in row order.
+
+    Returns
+    -------
+    df
+        Dataframe with the new column. For pandas and dask the column is
+        assigned on the input object (``df[column] = ...``), which is then
+        returned. For polars and pyarrow the object returned by
+        ``with_columns`` / ``append_column`` is returned.
+    """
+    if isinstance(df, pd.DataFrame):
+        # Plain array-likes must be assigned by position. A bare
+        # pd.Series(values) carries a 0..n-1 index and pandas aligns on index
+        # labels during assignment, which yields NaN/misplaced values whenever
+        # the dataframe index is not the default RangeIndex.
+        # A pandas Series keeps its own index and is aligned as before.
+        if not isinstance(values, pd.Series):
+            values = pd.Series(np.asarray(values), index=df.index)
+        df[column] = values
+        return df
+    if isinstance(df, (pl.DataFrame, pl.LazyFrame)):
+        return df.with_columns(pl.Series(column, values))
+    if isinstance(df, dd.DataFrame):
+        df[column] = pd.Series(values)
+        return df
+    # else: # pyarrow.Table
+    return df.append_column(column, pa.array(values))
+
+
+def df_to_pandas(df):
+    """Return the dataframe as a pandas.DataFrame.
+
+    A pandas.DataFrame is returned unchanged. A polars LazyFrame is collected
+    and a dask DataFrame is computed before the conversion.
+    """
+    if isinstance(df, pd.DataFrame):
+        return df
+    if isinstance(df, pl.LazyFrame):
+        return df.collect().to_pandas()
+    if isinstance(df, dd.DataFrame):
+        return df.compute()
+    # polars.DataFrame and pyarrow.Table both expose ``to_pandas``
+    return df.to_pandas()
diff --git a/gpm/bucket/filters.py b/gpm/bucket/filters.py
@@ -2,8 +2,16 @@
 import polars as pl
 import pyproj
 
+from gpm.bucket.dataframe import (
+    df_add_column,
+    df_get_column,
+    df_select_valid_rows,
+)
+
 
 def get_geodesic_distance_from_point(lons, lats, lon, lat):
+    """Compute the geodesic distance between each (lons, lats) pair and the point (lon, lat).
+
+    The computation uses ``pyproj.Geod.inv`` on the WGS84 ellipsoid, with the
+    coordinates passed as degrees (``radians=False``).
+    The returned value is the third output of ``Geod.inv``, the distance
+    (in meters according to the pyproj documentation).
+    """
+    # Convert the inputs to arrays so that ``.shape`` is available below
+    lons = np.asanyarray(lons)
+    lats = np.asanyarray(lats)
     geod = pyproj.Geod(ellps="WGS84")
     _, _, distance = geod.inv(lons, lats, np.ones(lons.shape) * lon, np.ones(lats.shape) * lat, radians=False)
     return distance
@@ -12,23 +20,15 @@ def get_geodesic_distance_from_point(lons, lats, lon, lat):
 def filter_around_point(df, lon, lat, distance):
+    """Keep only the dataframe rows located within ``distance`` from the point (lon, lat).
+
+    The coordinates are read from the ``"lon"`` and ``"lat"`` columns.
+    The geodesic distance from the point is added as a ``"distance"`` column,
+    and only the rows with a distance smaller than or equal to ``distance``
+    are returned.
+    """
     # https://stackoverflow.com/questions/76262681/i-need-to-create-a-column-with-the-distance-between-two-coordinates-in-polars
     # Retrieve coordinates
-    if isinstance(df, pl.LazyFrame):
-        df_coords = df.select("lon", "lat").collect()
-        lons = np.asanyarray(df_coords["lon"])
-        lats = np.asanyarray(df_coords["lat"])
-    else:
-        lons = np.asanyarray(df["lon"])
-        lats = np.asanyarray(df["lat"])
+    lons = df_get_column(df, column="lon")
+    lats = df_get_column(df, column="lat")
     # Compute geodesic distance
     distances = get_geodesic_distance_from_point(lons=lons, lats=lats, lon=lon, lat=lat)
     valid_indices = distances <= distance
-    # Filter dataframe
-    if isinstance(df, (pl.LazyFrame, pl.DataFrame)):
-        df = df.with_columns(pl.Series("distance", distances))
-        df = df.filter(valid_indices)
-    else:
-        df["distance"] = distances
-        df = df.loc[valid_indices]
+    # Add distance
+    df = df_add_column(df, column="distance", values=distances)
+    # Select only valid rows
+    df = df_select_valid_rows(df, valid_rows=valid_indices)
     return df