Rename chunks kwarg to partition_chunks into open_datatree method (#37)

ratt-ru · Oct 15, 2024 · bce6370 · bce6370
1 parent f151e81
commit bce6370
Show file tree

Hide file tree

Showing 5 changed files with 42 additions and 14 deletions.
diff --git a/README.rst b/README.rst
@@ -24,7 +24,7 @@ to be developed on well-understood MSv2 data.
   >>> import xarray_ms
   >>> from xarray.backends.api import datatree
   >>> dt = open_datatree("/data/L795830_SB001_uv.MS/",
-                         chunks={"time": 2000, "baseline": 1000})
+                         partition_chunks={"time": 2000, "baseline": 1000})
   >>> dt
   <xarray.DataTree>
   Group: /
@@ -47,7 +47,7 @@ to be developed on well-understood MSv2 data.
       │       VISIBILITY                  (time, baseline, frequency, polarization) complex64 41GB ...
       │       WEIGHT                      (time, baseline, frequency, polarization) float32 20GB ...
       │   Attributes:
-      │       version:              0.0.1
+      │       version:              4.0.0
       │       creation_date:        2024-09-18T10:49:55.133908+00:00
       │       data_description_id:  0
       └── Group: /DATA_DESC_ID=0,FIELD_ID=0,OBSERVATION_ID=0/ANTENNA

diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst
@@ -5,6 +5,8 @@ Changelog
 
 X.Y.Z (DD-MM-YYYY)
 ------------------
+* Move ``chunks`` kwarg functionality in MSv2PartitionEntryPoint.open_datatree
+  to ``partition_chunks`` (:pr:`35`)
 * Set MSv4 version to 4.0.0 (:pr:`34`)
 * Fix changelog highlighting in install instructions (:pr:`33`)
 * Add basic read tests (:pr:`32`)

diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst
@@ -102,18 +102,19 @@ Per-partition chunking
 ++++++++++++++++++++++
 
 Different chunking may be desired, especially when applied to
-different channelisation and polarisation configurations
-
+different channelisation and polarisation configurations.
+In these cases, the ``partition_chunks`` argument can be used
+to specify different chunking setups for each partition.
 
 .. ipython:: python
 
   dt = open_datatree(ms, partition_columns=[
     "DATA_DESC_ID", "FIELD_ID", "OBSERVATION_ID"],
-    chunks={
+    partition_chunks={
       (("DATA_DESC_ID", 0),): {"time": 2, "frequency": 4},
       (("DATA_DESC_ID", 1),): {"time": 3, "frequency": 2}})
 
-See the ``chunks`` argument of
+See the ``partition_chunks`` argument of
 :meth:`xarray_ms.backend.msv2.entrypoint.MSv2PartitionEntryPoint.open_datatree`
 for more information.
 
@@ -138,7 +139,7 @@ this to a zarr_ store.
 
   dt = open_datatree(ms, partition_columns=[
     "DATA_DESC_ID", "FIELD_ID", "OBSERVATION_ID"],
-    chunks={
+    partition_chunks={
       (("DATA_DESC_ID", 0),): {"time": 2, "frequency": 4},
       (("DATA_DESC_ID", 1),): {"time": 3, "frequency": 2}})
 

diff --git a/tests/test_backend.py b/tests/test_backend.py
@@ -156,7 +156,7 @@ def test_open_datatree(simmed_ms):
 
   # Works with default dask scheduler
   with ExitStack() as stack:
-    dt = open_datatree(simmed_ms, chunks=chunks)
+    dt = open_datatree(simmed_ms, partition_chunks=chunks)
     for ds in dt.values():
       del ds.attrs["creation_date"]
     xt.assert_identical(dt, mem_dt)
@@ -165,7 +165,7 @@ def test_open_datatree(simmed_ms):
   with ExitStack() as stack:
     cluster = stack.enter_context(LocalCluster(processes=True, n_workers=4))
     stack.enter_context(Client(cluster))
-    dt = open_datatree(simmed_ms, chunks=chunks)
+    dt = open_datatree(simmed_ms, partition_chunks=chunks)
     for ds in dt.values():
       del ds.attrs["creation_date"]
     xt.assert_identical(dt, mem_dt)
@@ -186,7 +186,7 @@ def test_open_datatree_chunking(simmed_ms):
   and partition-specific chunking"""
   dt = open_datatree(
     simmed_ms,
-    chunks={"time": 3, "frequency": 2},
+    partition_chunks={"time": 3, "frequency": 2},
   )
 
   for child in dt.children:
@@ -210,7 +210,10 @@ def test_open_datatree_chunking(simmed_ms):
 
   dt = open_datatree(
     simmed_ms,
-    chunks={"D=0": {"time": 2, "baseline": 2}, "D=1": {"time": 3, "frequency": 2}},
+    partition_chunks={
+      "D=0": {"time": 2, "baseline": 2},
+      "D=1": {"time": 3, "frequency": 2},
+    },
   )
 
   for child in dt.children:
@@ -231,3 +234,13 @@ def test_open_datatree_chunking(simmed_ms):
         "polarization": (2,),
         "uvw_label": (3,),
       }
+
+  with pytest.warns(UserWarning, match="`partition_chunks` overriding `chunks`"):
+    dt = open_datatree(
+      simmed_ms,
+      chunks={},
+      partition_chunks={
+        "D=0": {"time": 2, "baseline": 2},
+        "D=1": {"time": 3, "frequency": 2},
+      },
+    )
diff --git a/xarray_ms/backend/msv2/entrypoint.py b/xarray_ms/backend/msv2/entrypoint.py
@@ -298,7 +298,7 @@ def open_datatree(
     self,
     filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
     *,
-    chunks: Dict[str, Any] | None = None,
+    partition_chunks: Dict[str, Any] | None = None,
     drop_variables: str | Iterable[str] | None = None,
     partition_columns: List[str] | None = None,
     auto_corrs: bool = True,
@@ -311,7 +311,7 @@ def open_datatree(
 
     Args:
       filename_or_obj: The path to the MSv2 CASA Measurement Set file.
-      chunks: Chunk sizes along each dimension,
+      partition_chunks: Chunk sizes along each dimension,
         e.g. :code:`{{"time": 10, "frequency": 16}}`.
         Individual partitions can be chunked differently by
         partially (or fully) specifying a partition key: e.g.
@@ -331,6 +331,11 @@ def open_datatree(
             "D=0,F=1": {{"time": 20, "frequency": 32}},
           }}
 
+        .. note:: This argument overrides the reserved ``chunks`` argument
+          used by xarray to control chunking in Datasets and DataTrees.
+          It should be used instead of ``chunks`` when different
+          chunking is desired for different partitions.
+
       drop_variables: Variables to drop from the dataset.
       partition_columns: The columns to use for partitioning the Measurement set.
         Defaults to :code:`{DEFAULT_PARTITION_COLUMNS}`.
@@ -355,7 +360,14 @@ def open_datatree(
 
     structure = structure_factory()
     datasets = {}
-    pchunks = promote_chunks(structure, chunks)
+
+    if not partition_chunks:
+      partition_chunks = kwargs.pop("chunks", None)
+    elif "chunks" in kwargs:
+      kwargs.pop("chunks", None)
+      warnings.warn("`partition_chunks` overriding `chunks`")
+
+    pchunks = promote_chunks(structure, partition_chunks)
 
     for partition_key in structure:
       ds = xarray.open_dataset(