diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a763c66..1d576a6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # HDMF-ZARR Changelog +## 0.7.0 (Upcoming) + +### Enhancements + +* Changed default object_codec_class for ZarrIO to numcodecs.JSON. The issue with the old default (numcodecs.Pickle) was that it was not readable outside of Python. Exposed the object_codec_class as a parameter to the NWBZarrIO constructor. ZarrIO resorts to Pickle for complex cases such as structured arrays or compound datasets with refs. + ## 0.6.0 (February 21, 2024) ### Enhancements diff --git a/docs/source/storage.rst b/docs/source/storage.rst index 6d95246a..7f33d146 100644 --- a/docs/source/storage.rst +++ b/docs/source/storage.rst @@ -249,7 +249,7 @@ Zarr file. The individual object references are defined in the :py:class:`~hdmf_zarr.backend.ZarrIO` as py:class:`~hdmf_zarr.utils.ZarrReference` object created via the :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` helper function. -By default, :py:class:`~hdmf_zarr.backend.ZarrIO` uses the ``numcodecs.pickles.Pickle`` codec to +By default, :py:class:`~hdmf_zarr.backend.ZarrIO` uses the ``numcodecs.JSON`` codec to encode object references defined as py:class:`~hdmf_zarr.utils.ZarrReference` dicts in datasets. Users may set the codec used to encode objects in Zarr datasets via the ``object_codec_class`` parameter of the :py:func:`~hdmf_zarr.backend.ZarrIO.__init__` constructor of diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index 6ae27a25..0fed1601 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -94,7 +94,7 @@ def can_read(path): 'default': None}, {'name': 'object_codec_class', 'type': None, 'doc': 'Set the numcodec object codec class to be used to encode objects.' 
- 'Use numcodecs.pickles.Pickle by default.', + 'Use numcodecs.JSON by default.', 'default': None}, {'name': 'storage_options', 'type': dict, 'doc': 'Zarr storage options to read remote folders', @@ -120,8 +120,8 @@ def __init__(self, **kwargs): self.__built = dict() self._written_builders = WriteStatusTracker() # track which builders were written (or read) by this IO object self.__dci_queue = None # Will be initialized on call to io.write - # Codec class to be used. Alternates, e.g., =numcodecs.JSON - self.__codec_cls = numcodecs.pickles.Pickle if object_codec_class is None else object_codec_class + # Codec class to be used. Alternates, e.g., =numcodecs.pickles.Pickle + self.__codec_cls = numcodecs.JSON if object_codec_class is None else object_codec_class source_path = self.__path if isinstance(self.__path, SUPPORTED_ZARR_STORES): source_path = self.__path.path @@ -1050,13 +1050,18 @@ def write_dataset(self, **kwargs): # noqa: C901 new_dtype.append((field['name'], self.__resolve_dtype_helper__(field['dtype']))) dtype = np.dtype(new_dtype) + object_codec = self.__codec_cls() + if not isinstance(object_codec, numcodecs.Pickle): + warnings.warn(f'Resorting to Pickle codec for dataset {name} of {parent.name}') + object_codec = numcodecs.Pickle() + # cast and store compound dataset arr = np.array(new_items, dtype=dtype) dset = parent.require_dataset( name, shape=(len(arr),), dtype=dtype, - object_codec=self.__codec_cls(), + object_codec=object_codec, **options['io_settings'] ) dset.attrs['zarr_dtype'] = type_str @@ -1268,6 +1273,23 @@ def __list_fill__(self, parent, name, data, options=None): # noqa: C901 else: data_shape = get_data_shape(data) + # Let's check to see if we have a structured array somewhere in the data + # If we do, then we are going to resort to pickling the data and + # printing a warning. 
+ has_structured_array = False + if dtype == object: + for c in np.ndindex(data_shape): + o = data + for i in c: + o = o[i] + if isinstance(o, np.void) and o.dtype.names is not None: + has_structured_array = True + if has_structured_array: + object_codec = io_settings.get('object_codec') + if not isinstance(object_codec, numcodecs.Pickle): + warnings.warn(f'Resorting to Pickle codec for {name} of {parent.name}.') + io_settings['object_codec'] = numcodecs.Pickle() + # Create the dataset dset = parent.require_dataset(name, shape=data_shape, dtype=dtype, **io_settings) dset.attrs['zarr_dtype'] = type_str diff --git a/src/hdmf_zarr/nwb.py b/src/hdmf_zarr/nwb.py index ed3d831c..9983c9b2 100644 --- a/src/hdmf_zarr/nwb.py +++ b/src/hdmf_zarr/nwb.py @@ -27,9 +27,10 @@ class NWBZarrIO(ZarrIO): 'doc': 'a path to a namespace, a TypeMap, or a list consisting paths to namespaces and TypeMaps', 'default': None}) def __init__(self, **kwargs): - path, mode, manager, extensions, load_namespaces, synchronizer, storage_options = \ + path, mode, manager, extensions, load_namespaces, synchronizer, storage_options, object_codec_class = \ popargs('path', 'mode', 'manager', 'extensions', - 'load_namespaces', 'synchronizer', 'storage_options', kwargs) + 'load_namespaces', 'synchronizer', 'storage_options', + 'object_codec_class', kwargs) if load_namespaces: if manager is not None: warn("loading namespaces from file - ignoring 'manager'") @@ -53,7 +54,8 @@ def __init__(self, **kwargs): manager=manager, mode=mode, synchronizer=synchronizer, - storage_options=storage_options) + storage_options=storage_options, + object_codec_class=object_codec_class) @docval({'name': 'src_io', 'type': HDMFIO, 'doc': 'the HDMFIO object for reading the data to export'}, {'name': 'nwbfile', 'type': 'NWBFile', diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py index d142499d..98b149bb 100644 --- a/tests/unit/base_tests_zarrio.py +++ b/tests/unit/base_tests_zarrio.py @@ -17,7 +17,7 @@ # 
Try to import numcodecs and disable compression tests if it is not available try: - from numcodecs import Blosc, Delta, JSON + from numcodecs import Blosc, Delta, JSON, Pickle DISABLE_ZARR_COMPRESSION_TESTS = False except ImportError: DISABLE_ZARR_COMPRESSION_TESTS = True @@ -491,12 +491,12 @@ def setUp(self): # ZarrDataIO general ############################################# def test_set_object_codec(self): - # Test that the default codec is the Pickle store + # Test that the default codec is JSON tempIO = ZarrIO(self.store, mode='w') - self.assertEqual(tempIO.object_codec_class.__qualname__, 'Pickle') - del tempIO # also calls tempIO.close() - tempIO = ZarrIO(self.store, mode='w', object_codec_class=JSON) self.assertEqual(tempIO.object_codec_class.__qualname__, 'JSON') + del tempIO # also calls tempIO.close() + tempIO = ZarrIO(self.store, mode='w', object_codec_class=Pickle) + self.assertEqual(tempIO.object_codec_class.__qualname__, 'Pickle') tempIO.close() def test_synchronizer_constructor_arg_bool(self):