diff --git a/docs/README.md b/docs/README.md
index 35b7f16..bf222fe 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,5 +1,6 @@
 To build docs:
- - create an environment using the ``requirements.txt`` file in this directory
- - run ``make`` in this directory
- - find the docs is "./build/html", probably starting with file "index.html"
+ - `cd docs`
+ - create an environment using the `requirements.txt` file in this directory, e.g., `pip install -r requirements.txt`
+ - run `make html`
+ - open `build/html/index.html`
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1d5ad92..23d479c 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -21,7 +21,8 @@ project = 'kerchunk'
 copyright = '2021, Martin Durant'
 author = 'Martin Durant'
 
-version = kerchunk.__version__
+# No easy way to get the latest version based on how the github pages are built/deployed, so leave it blank or else it will be 9999
+version = ''
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 4572bd4..15a2589 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -68,6 +68,7 @@ so that blocks from one or more files can be arranged into aggregate datasets ac
    beyond
    nonzarr
    reference
+   reference_aggregation
    contributing
    advanced
 
diff --git a/docs/source/reference.rst b/docs/source/reference.rst
index ffe7c32..98e4cfb 100644
--- a/docs/source/reference.rst
+++ b/docs/source/reference.rst
@@ -10,6 +10,7 @@ File format backends
     kerchunk.fits.process_file
     kerchunk.tiff.tiff_to_zarr
     kerchunk.netCDF3.NetCDF3ToZarr
+    kerchunk.hdf4.HDF4ToZarr
 
 .. autoclass:: kerchunk.hdf.SingleHdf5ToZarr
     :members: __init__, translate
@@ -24,6 +25,9 @@ File format backends
 .. autoclass:: kerchunk.netCDF3.NetCDF3ToZarr
     :members: __init__, translate
 
+.. autoclass:: kerchunk.hdf4.HDF4ToZarr
+    :members: __init__, translate
+
 Codecs
 ------
 
@@ -50,6 +54,9 @@ Codecs
 .. autoclass:: kerchunk.codecs.RecordArrayMember
     :members: __init__
 
+.. autoclass:: kerchunk.codecs.ZlibCodec
+    :members: __init__
+
 Combining
 ---------
 
diff --git a/docs/source/reference_aggregation.rst b/docs/source/reference_aggregation.rst
new file mode 100644
index 0000000..ee8e28f
--- /dev/null
+++ b/docs/source/reference_aggregation.rst
@@ -0,0 +1,224 @@
+Aggregation special cases
+=============================
+
+As we have already seen in this `page `_, the main purpose of ``kerchunk`` is
+to generate references that expose whole archives of files (GRIB2, NetCDF,
+etc.) for direct access to the data. In this part of the documentation, we
+will look at some other efficient ways of combining references.
+
+GRIB Aggregations
+-----------------
+
+This method of aggregating GRIB file references, developed by `Camus Energy `_,
+works when accompanying ``.idx`` files are present. It involves creating a
+reference index for every GRIB message across the files that we want to
+aggregate.
+
+**The procedure has certain restrictions:**
+
+ - GRIB files must be paired with their ``.idx`` files.
+ - The ``.idx`` files must be plain *text*.
+ - It is specialised for time-series data, where the GRIB files
+   have an *identical* structure.
+ - Each horizon (forecast time) must be indexed separately.
+
+
+Using this method can cut the time required to combine references down to a
+fraction of what it would otherwise take. The original idea was showcased in
+this `talk `_. It follows a three-step approach.
+
+**Three-step approach:**
+
+ 1. Extract and persist metadata directly from a few arbitrary GRIB
+    files for a given product such as HRRR SUBH, GEFS, GFS etc.
+ 2. Use the metadata mapping to build an index table of every GRIB
+    message from the ``.idx`` files.
+ 3. Combine the index data with the metadata to build any FMRC
+    slice (Horizon, RunTime, ValidTime, BestAvailable).
+
+
+*How is it faster?*
+
+The ``.idx`` file, otherwise known as an *index* file, contains the key
+metadata of the messages in a GRIB file: the `index`, `offset`, `datetime`,
+`variable` and `forecast time` of each message. This metadata is used to
+index every GRIB message. By following this approach, we only have to
+``scan_grib`` a single GRIB file, not the whole archive.
+
+Building the index for a time horizon first requires a one-to-one mapping of
+GRIB/Zarr metadata to the attributes in the ``.idx`` file. The only constraint
+is that the mapping must be made from a single GRIB file belonging to the
+*same time horizon*. The indexing process primarily relies on the
+`pandas `_ library; see this `notebook `_ for a demonstration.
+After indexing a single time horizon, you can combine that index with the
+indexes of other time horizons and store the result.
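+
+As a rough sketch, the indexing step can look like the following. It assumes
+the idx helpers ``build_idx_grib_mapping``, ``parse_grib_idx`` and
+``map_from_index`` are importable from ``kerchunk.grib2``; the exact names,
+signatures and the placeholder URLs here are illustrative rather than
+definitive.
+
+.. code-block:: python
+
+    import pandas as pd
+    from kerchunk.grib2 import (
+        build_idx_grib_mapping,  # scan_grib one file and map it to its .idx
+        parse_grib_idx,          # cheap text parse of a single .idx file
+        map_from_index,          # join mapping + idx rows into a k_index
+    )
+
+    # One-off: build the GRIB/Zarr-to-idx mapping from a single file of this horizon.
+    mapping = build_idx_grib_mapping(
+        "s3://bucket/path/one-grib-file-of-this-horizon.grib2",  # placeholder
+        storage_options={"anon": True},
+    )
+
+    # Cheap per-file step: parse each .idx and map it; no further scan_grib needed.
+    pieces = []
+    for url, run_time in files_for_this_horizon:  # hypothetical (url, timestamp) pairs
+        idxdf = parse_grib_idx(url, storage_options={"anon": True})
+        pieces.append(map_from_index(pd.Timestamp(run_time), mapping, idxdf))
+
+    # The combined k_index for this horizon, e.g. persisted to Parquet for later use.
+    k_index = pd.concat(pieces, ignore_index=True)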
+
+.. note::
+   The index in an ``.idx`` file indexes the GRIB messages, whereas the
+   ``k_index`` (kerchunk index) indexes the variables in those messages.
+
+The table below is a *k_index* made from a single GRIB file.
+
+.. list-table:: k_index for a single GRIB file
+   :header-rows: 1
+   :widths: 5 10 15 10 20 15 10 20 20 30 10 10 10
+
+   * -
+     - varname
+     - typeOfLevel
+     - stepType
+     - name
+     - step
+     - level
+     - time
+     - valid_time
+     - uri
+     - offset
+     - length
+     - inline_value
+   * - 0
+     - gh
+     - isobaricInhPa
+     - instant
+     - Geopotential height
+     - 0 days 06:00:00
+     - 0.0
+     - 2017-01-01 06:00:00
+     - 2017-01-01 12:00:00
+     - s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
+     - 0
+     - 47493
+     - None
+   * - 1
+     - t
+     - isobaricInhPa
+     - instant
+     - Temperature
+     - 0 days 06:00:00
+     - 0.0
+     - 2017-01-01 06:00:00
+     - 2017-01-01 12:00:00
+     - s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
+     - 47493
+     - 19438
+     - None
+   * - 2
+     - r
+     - isobaricInhPa
+     - instant
+     - Relative humidity
+     - 0 days 06:00:00
+     - 0.0
+     - 2017-01-01 06:00:00
+     - 2017-01-01 12:00:00
+     - s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
+     - 66931
+     - 10835
+     - None
+   * - 3
+     - u
+     - isobaricInhPa
+     - instant
+     - U component of wind
+     - 0 days 06:00:00
+     - 0.0
+     - 2017-01-01 06:00:00
+     - 2017-01-01 12:00:00
+     - s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
+     - 77766
+     - 22625
+     - None
+   * - 4
+     - v
+     - isobaricInhPa
+     - instant
+     - V component of wind
+     - 0 days 06:00:00
+     - 0.0
+     - 2017-01-01 06:00:00
+     - 2017-01-01 12:00:00
+     - s3://noaa-gefs-pds/gefs.20170101/06/gec00.t06z...
+     - 100391
+     - 20488
+     - None
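+
+Each row of this table carries exactly what a kerchunk reference needs: the
+variable a message belongs to plus the ``uri``/``offset``/``length`` of that
+message. Conceptually (a toy illustration, not the actual aggregation code),
+one row becomes one chunk reference:
+
+.. code-block:: python
+
+    row = k_index.iloc[0]
+    # one k_index row -> one zarr chunk reference of the form [url, offset, length]
+    refs = {f"{row.varname}/0.0.0": [row.uri, int(row.offset), int(row.length)]}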
+
+*What now?*
+
+After creating the k_index for the desired duration, we can use the
+``DataTree`` model from `xarray-datatree `_ to view part of the aggregation
+(the desired variables) or all of it. Below is a tree model made from an
+aggregation of GRIB files produced by the **GEFS** model and hosted in an
+AWS S3 bucket.
+
+.. code-block:: text
+
+    DataTree('None', parent=None)
+    ├── DataTree('prmsl')
+    │   │   Dimensions:  ()
+    │   │   Data variables:
+    │   │       *empty*
+    │   │   Attributes:
+    │   │       name:     Pressure reduced to MSL
+    │   └── DataTree('instant')
+    │       │   Dimensions:  ()
+    │       │   Data variables:
+    │       │       *empty*
+    │       │   Attributes:
+    │       │       stepType:  instant
+    │       └── DataTree('meanSea')
+    │               Dimensions:  (latitude: 181, longitude: 360, time: 1, step: 1,
+    │                             model_horizons: 1, valid_times: 237)
+    │               Coordinates:
+    │                 * latitude    (latitude) float64 1kB 90.0 89.0 88.0 87.0 ... -88.0 -89.0 -90.0
+    │                 * longitude   (longitude) float64 3kB 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0
+    │                   meanSea     float64 8B ...
+    │                   number      (time, step) int64 8B ...
+    │                   step        (model_horizons, valid_times) timedelta64[ns] 2kB ...
+    │                   time        (model_horizons, valid_times) datetime64[ns] 2kB ...
+    │                   valid_time  (model_horizons, valid_times) datetime64[ns] 2kB ...
+    │               Dimensions without coordinates: model_horizons, valid_times
+    │               Data variables:
+    │                   prmsl       (model_horizons, valid_times, latitude, longitude) float64 124MB ...
+    │               Attributes:
+    │                   typeOfLevel:  meanSea
+    └── DataTree('ulwrf')
+        │   Dimensions:  ()
+        │   Data variables:
+        │       *empty*
+        │   Attributes:
+        │       name:     Upward long-wave radiation flux
+        └── DataTree('avg')
+            │   Dimensions:  ()
+            │   Data variables:
+            │       *empty*
+            │   Attributes:
+            │       stepType:  avg
+            └── DataTree('nominalTop')
+                    Dimensions:  (latitude: 181, longitude: 360, time: 1, step: 1,
+                                  model_horizons: 1, valid_times: 237)
+                    Coordinates:
+                      * latitude    (latitude) float64 1kB 90.0 89.0 88.0 87.0 ... -88.0 -89.0 -90.0
+                      * longitude   (longitude) float64 3kB 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0
+                        nominalTop  float64 8B ...
+                        number      (time, step) int64 8B ...
+                        step        (model_horizons, valid_times) timedelta64[ns] 2kB ...
+                        time        (model_horizons, valid_times) datetime64[ns] 2kB ...
+                        valid_time  (model_horizons, valid_times) datetime64[ns] 2kB ...
+                    Dimensions without coordinates: model_horizons, valid_times
+                    Data variables:
+                        ulwrf       (model_horizons, valid_times, latitude, longitude) float64 124MB ...
+                    Attributes:
+                        typeOfLevel:  nominalTop
+
+
+.. tip::
+   For a full tutorial on this workflow, refer to this `kerchunk cookbook `_
+   in `Project Pythia `_.
diff --git a/kerchunk/codecs.py b/kerchunk/codecs.py
index 3d206e7..852076e 100644
--- a/kerchunk/codecs.py
+++ b/kerchunk/codecs.py
@@ -5,6 +5,7 @@ from numcodecs.abc import Codec
 import numpy as np
 import threading
+import zlib
 
 
 class FillStringsCodec(Codec):
@@ -238,3 +239,19 @@ def decode(self, buf, out=None):
 
     def encode(self, buf):
         raise NotImplementedError
+
+
+class ZlibCodec(Codec):
+    codec_id = "zlib"
+
+    def __init__(self):
+        ...
+ + def decode(self, data, out=None): + if out: + out[:] = zlib.decompress(data) + return out + return zlib.decompress(data) + + def encode(self, buf): + return zlib.compress(buf) diff --git a/kerchunk/hdf4.py b/kerchunk/hdf4.py new file mode 100644 index 0000000..449e4e0 --- /dev/null +++ b/kerchunk/hdf4.py @@ -0,0 +1,509 @@ +import fsspec +import numpy as np +import ujson + + +decoders = {} + + +def reg(name): + def f(func): + decoders[name] = func + return func + + return f + + +class HDF4ToZarr: + """Experimental: interface to HDF4 archival files""" + + def __init__( + self, + path, + storage_options=None, + inline_threshold=100, + out=None, + ): + self.path = path + self.st = storage_options + self.thresh = inline_threshold + self.out = out or {} + + def read_int(self, n): + return int.from_bytes(self.f.read(n), "big") + + def read_ddh(self): + return {"ndd": self.read_int(2), "next": self.read_int(4)} + + def read_dd(self): + loc = self.f.tell() + i = int.from_bytes(self.f.read(2), "big") + if i & 0x4000: + extended = True + i = i - 0x4000 + else: + extended = False + tag = tags.get(i, i) + no_data = tag not in {"NULL"} + ref = (tag, int.from_bytes(self.f.read(2), "big")) + info = { + "offset": int.from_bytes(self.f.read(4), "big") * no_data, + "length": int.from_bytes(self.f.read(4), "big") * no_data, + "extended": extended, + "loc": loc, + } + return ref, info + + def decode(self, tag, info): + self.f.seek(info["offset"]) + ident = lambda _, __: info + return decoders.get(tag, ident)(self, info) + + def translate(self, filename=None, storage_options=None): + """Scan and return references + + Parameters + ---------- + filename: if given, write to this as JSON + storage_options: to interpret filename + + Returns + ------- + references + """ + import zarr + from kerchunk.codecs import ZlibCodec + + fo = fsspec.open(self.path, **(self.st or {})) + self.f = fo.open() + + # magic header + assert self.f.read(4) == b"\x0e\x03\x13\x01" + + # all the data descriptors in a linked list + self.tags = {} + while True: + ddh = self.read_ddh() + + for _ in range(ddh["ndd"]): + ident, info = self.read_dd() + self.tags[ident] = info + if ddh["next"] == 0: + # "finished" sentry + break + # or continue + self.f.seek(ddh["next"]) + + # basic decode + for tag, ref in self.tags: + self._dec(tag, ref) + + # global attributes + attrs = {} + for (tag, ref), info in self.tags.items(): + if tag == "VH" and info["names"][0].upper() == "VALUES": + # dtype = dtypes[info["types"][0]] + inf2 = self.tags[("VS", ref)] + self.f.seek(inf2["offset"]) + data = self.f.read(inf2["length"]) + # NASA conventions + if info["name"].startswith(("CoreMetadata.", "ArchiveMetadata.")): + obj = None + for line in data.decode().split("\n"): + if "OBJECT" in line: + obj = line.split()[-1] + if "VALUE" in line: + attrs[obj] = line.split()[-1].lstrip('"').rstrip('"') + + # there should be only one root, and it's probably the last VG + # so maybe this loop isn't needed + roots = set() + children = set() + child = {} + for (tag, ref), info in self.tags.items(): + if tag == "VG": + here = child.setdefault((tag, ref), set()) + for t, r in zip(info["tag"], info["refs"]): + if t == "VG": + children.add((t, r)) + roots.discard((t, r)) + here.add((t, r)) + if tag not in children: + roots.add((tag, ref)) + + # hierarchical output + output = self._descend_vg(*list(roots)[0]) + prot = fo.fs.protocol + prot = prot[0] if isinstance(prot, tuple) else prot + fs = fsspec.filesystem( + "reference", + fo=self.out, + remote_protocol=prot, + 
remote_options=self.st, + ) + g = zarr.open_group("reference://", storage_options=dict(fs=fs)) + refs = {} + for k, v in output.items(): + if isinstance(v, dict): + compression = ZlibCodec() if "refs" in v else None + arr = g.create_dataset( + name=k, + shape=v["dims"], + dtype=v["dtype"], + chunks=v.get("chunks", v["dims"]), + compressor=compression, + overwrite=True, + ) + arr.attrs.update( + dict( + _ARRAY_DIMENSIONS=[f"{k}_x", f"{k}_y"][: len(v["dims"])] + if "refs" in v + else ["0"], + **{ + i: j + for i, j in v.items() + if i not in {"chunk", "dims", "dtype", "refs"} + }, + ) + ) + for r in v.get("refs", []): + refs[f"{k}/{r[0]}"] = [self.path, r[1], r[2]] + else: + attrs[k] = v + fs.references.update(refs) + g.attrs.update(attrs) + + if filename is None: + return fs.references + with fsspec.open(filename, **(storage_options or {})) as f: + ujson.dumps(dict(fs.references), f) + + def _descend_vg(self, tag, ref): + info = self.tags[(tag, ref)] + out = {} + for t, r in zip(info["tag"], info["refs"]): + inf2 = self.tags[(t, r)] + if t == "VG": + tmp = self._descend_vg(t, r) + if list(tmp)[0] == inf2["name"]: + tmp = tmp[inf2["name"]] + out[inf2["name"]] = tmp + elif t == "VH": + if len(inf2["names"]) == 1 and inf2["names"][0].lower() == "values": + dtype = dtypes[inf2["types"][0]] + name = inf2["name"] + inf2 = self.tags[("VS", r)] + self.f.seek(inf2["offset"]) + data = self.f.read(inf2["length"]) + if dtype == "str": + out[name] = data.decode().lstrip('"').rstrip('"') # decode() ? + else: + out[name] = np.frombuffer(data, dtype)[0] + elif t == "NT": + out["dtype"] = inf2["typ"] + elif t == "SD": + out["refs"] = inf2["data"][:-1] + out["chunks"] = [_["chunk_length"] for _ in inf2["data"][-1]] + elif t == "SDD": + out["dims"] = inf2["dims"] + else: + # NDGs contain same info as NT, SD and SDD + pass + return out + + def _dec(self, tag, ref): + info = self.tags[(tag, ref)] + if not set(info) - {"length", "offset", "extended", "loc"}: + self.f.seek(info["offset"]) + if info["extended"]: + info["data"] = self._dec_extended() + else: + info.update(self.decode(tag, info)) + return info + + def _dec_extended(self): + ext_type = spec[self.read_int(2)] + if ext_type == "CHUNKED": + return self._dec_chunked() + elif ext_type == "LINKED": + return self._dec_linked_header() + elif ext_type == "COMP": + return self._dec_comp() + + def _dec_linked_header(self): + # get the bytes of a linked set - these will always be inlined + self.read_int(4) # length + self.read_int(4) # blk_len + self.read_int(4) # num_blk + next_ref = self.read_int(2) + out = [] + while next_ref: + next_ref, data = self._dec_linked_block(self.tags[("LINKED", next_ref)]) + out.extend([d for d in data if d]) + bits = [] + for ref in out: + info = self.tags[("LINKED", ref)] + self.f.seek(info["offset"]) + bits.append(self.f.read(info["length"])) + return b"".join(bits) + + def _dec_linked_block(self, block): + self.f.seek(block["offset"]) + next_ref = self.read_int(2) + refs = [self.read_int(2) for _ in range((block["length"] // 2) - 1)] + return next_ref, refs + + def _dec_chunked(self): + # we want to turn the chunks table into references + # tag_head_len = self.read_int(4) + # version = self.f.read(1)[0] + # flag = self.read_int(4) + # elem_tot_len = self.read_int(4) + # chunk_size = self.read_int(4) + # nt_size = self.read_int(4) + self.f.seek(21, 1) + chk_tbl_tag = tags[self.read_int(2)] # should be VH + chk_tbl_ref = self.read_int(2) + self.read_int(2) # sp_tab = tags[self.read_int(2)] + self.read_int(2) # sp_ref + ndims = 
self.read_int(4) + + dims = [ # we don't use these, could skip + { + "flag": self.read_int(4), + "dim_length": self.read_int(4), + "chunk_length": self.read_int(4), + } + for _ in range(ndims) + ] + self.f.read( # fill_value + self.read_int(4) + ) # to be interpreted as a number later; but chunk table probs has no fill + # self.f.seek(12*ndims + 4, 1) # if skipping + + header = self._dec(chk_tbl_tag, chk_tbl_ref) + data = self._dec("VS", chk_tbl_ref)["data"] # corresponding table + + # header gives the field pattern for the rows of data, one per chunk + # maybe faster to use struct and iter than numpy, since we iterate anyway + dt = [(f"ind{i}", ">u4") for i in range(ndims)] + [ + ("tag", ">u2"), + ("ref", ">u2"), + ] + rows = np.frombuffer(data, dtype=dt, count=header["nvert"]) + # rows["tag"] should always be 61 -> CHUNK + refs = [] + for *ind, tag, ref in rows: + # maybe ind needs reversing since everything is FORTRAN + chunk_tag = self.tags[("CHUNK", ref)] + if chunk_tag["extended"]: + self.f.seek(chunk_tag["offset"]) + # these are always COMP? + ctype, offset, length = self._dec_extended() + refs.append([".".join(str(_) for _ in ind), offset, length, ctype]) + else: + refs.append( + [ + ".".join(str(_) for _ in ind), + chunk_tag["offset"], + chunk_tag["length"], + ] + ) + refs.append(dims) + return refs + + def _dec_comp(self): + # version = self.read_int(2) # always 0 + # len_uncomp = self.read_int(4) + self.f.seek(6, 1) + + data_ref = self.read_int(2) + # model = self.read_int(2) # always 0 + ctype = "DEFLATE" # comp[self.read_int(2)] + tag = self.tags[("COMPRESSED", data_ref)] + return ctype, tag["offset"], tag["length"] + + +@reg("NDG") +def _dec_ndg(self, info): + # links together these things as a Data Group + return { + "tags": [ + (tags[self.read_int(2)], self.read_int(2)) + for _ in range(0, info["length"], 4) + ] + } + + +@reg("SDD") +def _dec_sdd(self, info): + rank = self.read_int(2) + dims = [self.read_int(4) for _ in range(rank)] + data_tag = (tags[self.read_int(2)], self.read_int(2)) + scale_tags = [(tags[self.read_int(2)], self.read_int(2)) for _ in range(rank)] + return _pl(locals()) + + +@reg("VERSION") +def _dec_version(self, info): + return { + "major": self.read_int(4), + "minor": self.read_int(4), + "release": self.read_int(4), + "string:": _null_str(self.f.read(info["length"] - 10).decode()), + } + + +@reg("VH") +def _dec_vh(self, info): + # virtual group ("table") header + interface = self.read_int(2) + nvert = self.read_int(4) + ivsize = self.read_int(2) + nfields = self.read_int(2) + types = [self.read_int(2) for _ in range(nfields)] + isize = [self.read_int(2) for _ in range(nfields)] + offsets = [self.read_int(2) for _ in range(nfields)] + order = [self.read_int(2) for _ in range(nfields)] + names = [self.f.read(self.read_int(2)).decode() for _ in range(nfields)] + namelen = self.read_int(2) + name = self.f.read(namelen).decode() + classlen = self.read_int(2) + cls = self.f.read(classlen).decode() + ref = (self.read_int(2), self.read_int(2)) + return _pl(locals()) + + +@reg("VG") +def _dec_vg(self, info): + nelt = self.read_int(2) + tag = [tags[self.read_int(2)] for _ in range(nelt)] + refs = [self.read_int(2) for _ in range(nelt)] + name = self.f.read(self.read_int(2)).decode() + cls = self.f.read(self.read_int(2)).decode() + return _pl(locals()) + + +@reg("NT") +def _dec_nt(self, info): + version, typ, width, cls = list(self.f.read(4)) + typ = dtypes[typ] + return _pl(locals()) + + +def _null_str(s): + return s.split("\00", 1)[0] + + +def _pl(l): + return 
{k: v for k, v in l.items() if k not in {"info", "f", "self"}} + + +# hdf/src/htags.h +tags = { + 1: "NULL", + 20: "LINKED", + 30: "VERSION", + 40: "COMPRESSED", + 50: "VLINKED", + 51: "VLINKED_DATA", + 60: "CHUNKED", + 61: "CHUNK", + 100: "FID", + 101: "FD", + 102: "TID", + 103: "TD", + 104: "DIL", + 105: "DIA", + 106: "NT", + 107: "MT", + 108: "FREE", + 200: "ID8", + 201: "IP8", + 202: "RI8", + 203: "CI8", + 204: "II8", + 300: "ID", + 301: "LUT", + 302: "RI", + 303: "CI", + 304: "NRI", + 306: "RIG", + 307: "LD", + 308: "MD", + 309: "MA", + 310: "CCN", + 311: "CFM", + 312: "AR", + 400: "DRAW", + 401: "RUN", + 500: "XYP", + 501: "MTO", + 602: "T14", + 603: "T105", + 700: "SDG", + 701: "SDD", + 702: "SD", + 703: "SDS", + 704: "SDL", + 705: "SDU", + 706: "SDF", + 707: "SDM", + 708: "SDC", + 709: "SDT", + 710: "SDLNK", + 720: "NDG", + 721: "RESERVED", + # "Objects of tag 721 are never actually written to the file. The tag is + # needed to make things easier mixing DFSD and SD style objects in the same file" + 731: "CAL", + 732: "FV", + 799: "BREQ", + 781: "SDRAG", + 780: "EREQ", + 1965: "VG", + 1962: "VH", + 1963: "VS", + 11: "RLE", + 12: "IMCOMP", + 13: "JPEG", + 14: "GREYJPEG", + 15: "JPEG5", + 16: "GREYJPEG5", +} +spec = { + 1: "LINKED", + 2: "EXT", + 3: "COMP", + 4: "VLINKED", + 5: "CHUNKED", + 6: "BUFFERED", + 7: "COMPRAS", +} + +# hdf4/hdf/src/hntdefs.h +dtypes = { + 5: "f4", + 6: "f8", + 20: "i1", + 21: "u1", + 4: "str", # special case, size given in header + 22: ">i2", + 23: ">u2", + 24: ">i4", + 25: ">u4", + 26: ">i8", + 27: ">u8", +} + +# hdf4/hdf/src/hcomp.h +comp = { + 0: "NONE", + 1: "RLE", + 2: "NBIT", + 3: "SKPHUFF", + 4: "DEFLATE", # called deflate, but code says "gzip" and doc says "GNU zip"; actually zlib? + # see codecs.ZlibCodec + 5: "SZIP", + 7: "JPEG", +} diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index a20be2a..69fd22b 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -279,18 +279,18 @@ def test_compound_string_encode(): assert (z.vlen_str["strs"][1:] == "").all() -def test_compact(): - pytest.importorskip("ipfsspec") - h = kerchunk.hdf.SingleHdf5ToZarr( - "ipfs://QmVZc4TzRP7zydgKzDX7CH2JpYw2LJKkWBm6jhCfigeon6" - ) - out = h.translate() - - m = fsspec.get_mapper("reference://", fo=out) - g = zarr.open(m) - assert np.allclose(g.ancillary_data.atlas_sdp_gps_epoch[:], 1.19880002e09) - - +# def test_compact(): +# pytest.importorskip("ipfsspec") +# h = kerchunk.hdf.SingleHdf5ToZarr( +# "ipfs://QmVZc4TzRP7zydgKzDX7CH2JpYw2LJKkWBm6jhCfigeon6" +# ) +# out = h.translate() +# +# m = fsspec.get_mapper("reference://", fo=out) +# g = zarr.open(m) +# assert np.allclose(g.ancillary_data.atlas_sdp_gps_epoch[:], 1.19880002e09) +# +# def test_compress(): import glob diff --git a/pyproject.toml b/pyproject.toml index c11e340..5eb7c0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ fill_hdf_strings = "kerchunk.codecs:FillStringsCodec" FITSAscii = "kerchunk.codecs:AsciiTableCodec" FITSVarBintable = "kerchunk.codecs:VarArrCodec" record_member = "kerchunk.codecs:RecordArrayMember" +zlib = "kerchunk.codecs:ZlibCodec" [project.entry-points."xarray.backends"] kerchunk = "kerchunk.xarray_backend:KerchunkBackend"
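
Usage sketch for the new ``kerchunk.hdf4.HDF4ToZarr`` backend (an illustrative
note, not part of the patch; the input path and S3 options are placeholders):

    import fsspec
    import xarray as xr
    from kerchunk.hdf4 import HDF4ToZarr

    # scan one HDF4 file and build its reference set
    refs = HDF4ToZarr(
        "s3://bucket/path/archive-file.hdf", storage_options={"anon": True}
    ).translate()

    # open the references through fsspec's reference filesystem; zlib-compressed
    # chunks are handled by the ZlibCodec registered via the numcodecs entry point
    fs = fsspec.filesystem(
        "reference", fo=refs, remote_protocol="s3", remote_options={"anon": True}
    )
    ds = xr.open_dataset(
        fs.get_mapper(), engine="zarr", backend_kwargs={"consolidated": False}
    )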