Skip to content

Commit

Permalink
Rename chunks kwarg to partition_chunks into open_datatree method (#37)
Browse files Browse the repository at this point in the history
  • Loading branch information
sjperkins authored Oct 15, 2024
1 parent f151e81 commit bce6370
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 14 deletions.
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ to be developed on well-understood MSv2 data.
>>> import xarray_ms
>>> from xarray.backends.api import open_datatree
>>> dt = open_datatree("/data/L795830_SB001_uv.MS/",
chunks={"time": 2000, "baseline": 1000})
partition_chunks={"time": 2000, "baseline": 1000})
>>> dt
<xarray.DataTree>
Group: /
Expand All @@ -47,7 +47,7 @@ to be developed on well-understood MSv2 data.
│ VISIBILITY (time, baseline, frequency, polarization) complex64 41GB ...
│ WEIGHT (time, baseline, frequency, polarization) float32 20GB ...
│ Attributes:
│ version: 0.0.1
│ version: 4.0.0
│ creation_date: 2024-09-18T10:49:55.133908+00:00
│ data_description_id: 0
└── Group: /DATA_DESC_ID=0,FIELD_ID=0,OBSERVATION_ID=0/ANTENNA
Expand Down
2 changes: 2 additions & 0 deletions doc/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ Changelog

X.Y.Z (DD-MM-YYYY)
------------------
* Move ``chunks`` kwarg functionality in MSv2PartitionEntryPoint.open_datatree
to ``partition_chunks`` (:pr:`37`)
* Set MSv4 version to 4.0.0 (:pr:`34`)
* Fix changelog highlighting in install instructions (:pr:`33`)
* Add basic read tests (:pr:`32`)
Expand Down
11 changes: 6 additions & 5 deletions doc/source/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,18 +102,19 @@ Per-partition chunking
++++++++++++++++++++++

Different chunking may be desired, especially when applied to
different channelisation and polarisation configurations

different channelisation and polarisation configurations.
In these cases, the ``partition_chunks`` argument can be used
to specify different chunking setups for each partition.

.. ipython:: python
dt = open_datatree(ms, partition_columns=[
"DATA_DESC_ID", "FIELD_ID", "OBSERVATION_ID"],
chunks={
partition_chunks={
(("DATA_DESC_ID", 0),): {"time": 2, "frequency": 4},
(("DATA_DESC_ID", 1),): {"time": 3, "frequency": 2}})
See the ``chunks`` argument of
See the ``partition_chunks`` argument of
:meth:`xarray_ms.backend.msv2.entrypoint.MSv2PartitionEntryPoint.open_datatree`
for more information.

Expand All @@ -138,7 +139,7 @@ this to a zarr_ store.
dt = open_datatree(ms, partition_columns=[
"DATA_DESC_ID", "FIELD_ID", "OBSERVATION_ID"],
chunks={
partition_chunks={
(("DATA_DESC_ID", 0),): {"time": 2, "frequency": 4},
(("DATA_DESC_ID", 1),): {"time": 3, "frequency": 2}})
Expand Down
21 changes: 17 additions & 4 deletions tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def test_open_datatree(simmed_ms):

# Works with default dask scheduler
with ExitStack() as stack:
dt = open_datatree(simmed_ms, chunks=chunks)
dt = open_datatree(simmed_ms, partition_chunks=chunks)
for ds in dt.values():
del ds.attrs["creation_date"]
xt.assert_identical(dt, mem_dt)
Expand All @@ -165,7 +165,7 @@ def test_open_datatree(simmed_ms):
with ExitStack() as stack:
cluster = stack.enter_context(LocalCluster(processes=True, n_workers=4))
stack.enter_context(Client(cluster))
dt = open_datatree(simmed_ms, chunks=chunks)
dt = open_datatree(simmed_ms, partition_chunks=chunks)
for ds in dt.values():
del ds.attrs["creation_date"]
xt.assert_identical(dt, mem_dt)
Expand All @@ -186,7 +186,7 @@ def test_open_datatree_chunking(simmed_ms):
and partition-specific chunking"""
dt = open_datatree(
simmed_ms,
chunks={"time": 3, "frequency": 2},
partition_chunks={"time": 3, "frequency": 2},
)

for child in dt.children:
Expand All @@ -210,7 +210,10 @@ def test_open_datatree_chunking(simmed_ms):

dt = open_datatree(
simmed_ms,
chunks={"D=0": {"time": 2, "baseline": 2}, "D=1": {"time": 3, "frequency": 2}},
partition_chunks={
"D=0": {"time": 2, "baseline": 2},
"D=1": {"time": 3, "frequency": 2},
},
)

for child in dt.children:
Expand All @@ -231,3 +234,13 @@ def test_open_datatree_chunking(simmed_ms):
"polarization": (2,),
"uvw_label": (3,),
}

with pytest.warns(UserWarning, match="`partition_chunks` overriding `chunks`"):
dt = open_datatree(
simmed_ms,
chunks={},
partition_chunks={
"D=0": {"time": 2, "baseline": 2},
"D=1": {"time": 3, "frequency": 2},
},
)
18 changes: 15 additions & 3 deletions xarray_ms/backend/msv2/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def open_datatree(
self,
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
*,
chunks: Dict[str, Any] | None = None,
partition_chunks: Dict[str, Any] | None = None,
drop_variables: str | Iterable[str] | None = None,
partition_columns: List[str] | None = None,
auto_corrs: bool = True,
Expand All @@ -311,7 +311,7 @@ def open_datatree(
Args:
filename_or_obj: The path to the MSv2 CASA Measurement Set file.
chunks: Chunk sizes along each dimension,
partition_chunks: Chunk sizes along each dimension,
e.g. :code:`{{"time": 10, "frequency": 16}}`.
Individual partitions can be chunked differently by
partially (or fully) specifying a partition key: e.g.
Expand All @@ -331,6 +331,11 @@ def open_datatree(
"D=0,F=1": {{"time": 20, "frequency": 32}},
}}
.. note:: This argument overrides the reserved ``chunks`` argument
used by xarray to control chunking in Datasets and DataTrees.
It should be used instead of ``chunks`` when different
chunking is desired for different partitions.
drop_variables: Variables to drop from the dataset.
partition_columns: The columns to use for partitioning the Measurement set.
Defaults to :code:`{DEFAULT_PARTITION_COLUMNS}`.
Expand All @@ -355,7 +360,14 @@ def open_datatree(

structure = structure_factory()
datasets = {}
pchunks = promote_chunks(structure, chunks)

if not partition_chunks:
partition_chunks = kwargs.pop("chunks", None)
elif "chunks" in kwargs:
kwargs.pop("chunks", None)
warnings.warn("`partition_chunks` overriding `chunks`")

pchunks = promote_chunks(structure, partition_chunks)

for partition_key in structure:
ds = xarray.open_dataset(
Expand Down

0 comments on commit bce6370

Please sign in to comment.