From bce6370603d6e22b1373a6c57a7b1c8e1c0831de Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Tue, 15 Oct 2024 16:24:05 +0200 Subject: [PATCH] Rename chunks kwarg to partition_chunks into open_datatree method (#37) --- README.rst | 4 ++-- doc/source/changelog.rst | 2 ++ doc/source/tutorial.rst | 11 ++++++----- tests/test_backend.py | 21 +++++++++++++++++---- xarray_ms/backend/msv2/entrypoint.py | 18 +++++++++++++++--- 5 files changed, 42 insertions(+), 14 deletions(-) diff --git a/README.rst b/README.rst index ddc8419..e5bd8b0 100644 --- a/README.rst +++ b/README.rst @@ -24,7 +24,7 @@ to be developed on well-understood MSv2 data. >>> import xarray_ms >>> from xarray.backends.api import datatree >>> dt = open_datatree("/data/L795830_SB001_uv.MS/", - chunks={"time": 2000, "baseline": 1000}) + partition_chunks={"time": 2000, "baseline": 1000}) >>> dt Group: / @@ -47,7 +47,7 @@ to be developed on well-understood MSv2 data. │ VISIBILITY (time, baseline, frequency, polarization) complex64 41GB ... │ WEIGHT (time, baseline, frequency, polarization) float32 20GB ... │ Attributes: - │ version: 0.0.1 + │ version: 4.0.0 │ creation_date: 2024-09-18T10:49:55.133908+00:00 │ data_description_id: 0 └── Group: /DATA_DESC_ID=0,FIELD_ID=0,OBSERVATION_ID=0/ANTENNA diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst index a038290..5180e76 100644 --- a/doc/source/changelog.rst +++ b/doc/source/changelog.rst @@ -5,6 +5,8 @@ Changelog X.Y.Z (DD-MM-YYYY) ------------------ +* Move ``chunks`` kwarg functionality in MSv2PartitionEntryPoint.open_datatree + to ``partition_chunks`` (:pr:`35`) * Set MSv4 version to 4.0.0 (:pr:`34`) * Fix changelog highlighting in install instructions (:pr:`33`) * Add basic read tests (:pr:`32`) diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index b134ff5..aabed49 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -102,18 +102,19 @@ Per-partition chunking ++++++++++++++++++++++ Different chunking may be desired, especially when applied to -different channelisation and polarisation configurations - +different channelisation and polarisation configurations. +In these cases, the ``partition_chunks`` argument can be used +to specify different chunking setups for each partition. .. ipython:: python dt = open_datatree(ms, partition_columns=[ "DATA_DESC_ID", "FIELD_ID", "OBSERVATION_ID"], - chunks={ + partition_chunks={ (("DATA_DESC_ID", 0),): {"time": 2, "frequency": 4}, (("DATA_DESC_ID", 1),): {"time": 3, "frequency": 2}}) -See the ``chunks`` argument of +See the ``partition_chunks`` argument of :meth:`xarray_ms.backend.msv2.entrypoint.MSv2PartitionEntryPoint.open_datatree` for more information. @@ -138,7 +139,7 @@ this to a zarr_ store. dt = open_datatree(ms, partition_columns=[ "DATA_DESC_ID", "FIELD_ID", "OBSERVATION_ID"], - chunks={ + partition_chunks={ (("DATA_DESC_ID", 0),): {"time": 2, "frequency": 4}, (("DATA_DESC_ID", 1),): {"time": 3, "frequency": 2}}) diff --git a/tests/test_backend.py b/tests/test_backend.py index f472f75..a08d1d4 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -156,7 +156,7 @@ def test_open_datatree(simmed_ms): # Works with default dask scheduler with ExitStack() as stack: - dt = open_datatree(simmed_ms, chunks=chunks) + dt = open_datatree(simmed_ms, partition_chunks=chunks) for ds in dt.values(): del ds.attrs["creation_date"] xt.assert_identical(dt, mem_dt) @@ -165,7 +165,7 @@ def test_open_datatree(simmed_ms): with ExitStack() as stack: cluster = stack.enter_context(LocalCluster(processes=True, n_workers=4)) stack.enter_context(Client(cluster)) - dt = open_datatree(simmed_ms, chunks=chunks) + dt = open_datatree(simmed_ms, partition_chunks=chunks) for ds in dt.values(): del ds.attrs["creation_date"] xt.assert_identical(dt, mem_dt) @@ -186,7 +186,7 @@ def test_open_datatree_chunking(simmed_ms): and partition-specific chunking""" dt = open_datatree( simmed_ms, - chunks={"time": 3, "frequency": 2}, + partition_chunks={"time": 3, "frequency": 2}, ) for child in dt.children: @@ -210,7 +210,10 @@ def test_open_datatree_chunking(simmed_ms): dt = open_datatree( simmed_ms, - chunks={"D=0": {"time": 2, "baseline": 2}, "D=1": {"time": 3, "frequency": 2}}, + partition_chunks={ + "D=0": {"time": 2, "baseline": 2}, + "D=1": {"time": 3, "frequency": 2}, + }, ) for child in dt.children: @@ -231,3 +234,13 @@ def test_open_datatree_chunking(simmed_ms): "polarization": (2,), "uvw_label": (3,), } + + with pytest.warns(UserWarning, match="`partition_chunks` overriding `chunks`"): + dt = open_datatree( + simmed_ms, + chunks={}, + partition_chunks={ + "D=0": {"time": 2, "baseline": 2}, + "D=1": {"time": 3, "frequency": 2}, + }, + ) diff --git a/xarray_ms/backend/msv2/entrypoint.py b/xarray_ms/backend/msv2/entrypoint.py index 35f8ee9..becb5e5 100644 --- a/xarray_ms/backend/msv2/entrypoint.py +++ b/xarray_ms/backend/msv2/entrypoint.py @@ -298,7 +298,7 @@ def open_datatree( self, filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore, *, - chunks: Dict[str, Any] | None = None, + partition_chunks: Dict[str, Any] | None = None, drop_variables: str | Iterable[str] | None = None, partition_columns: List[str] | None = None, auto_corrs: bool = True, @@ -311,7 +311,7 @@ def open_datatree( Args: filename_or_obj: The path to the MSv2 CASA Measurement Set file. - chunks: Chunk sizes along each dimension, + partition_chunks: Chunk sizes along each dimension, e.g. :code:`{{"time": 10, "frequency": 16}}`. Individual partitions can be chunked differently by partially (or fully) specifying a partition key: e.g. @@ -331,6 +331,11 @@ def open_datatree( "D=0,F=1": {{"time": 20, "frequency": 32}}, }} + .. note:: This argument overrides the reserved ``chunks`` argument + used by xarray to control chunking in Datasets and DataTrees. + It should be used instead of ``chunks`` when different + chunking is desired for different partitions. + drop_variables: Variables to drop from the dataset. partition_columns: The columns to use for partitioning the Measurement set. Defaults to :code:`{DEFAULT_PARTITION_COLUMNS}`. @@ -355,7 +360,14 @@ def open_datatree( structure = structure_factory() datasets = {} - pchunks = promote_chunks(structure, chunks) + + if not partition_chunks: + partition_chunks = kwargs.pop("chunks", None) + elif "chunks" in kwargs: + kwargs.pop("chunks", None) + warnings.warn("`partition_chunks` overriding `chunks`") + + pchunks = promote_chunks(structure, partition_chunks) for partition_key in structure: ds = xarray.open_dataset(