Skip to content

Commit

Permalink
mostly working
Browse files Browse the repository at this point in the history
  • Loading branch information
martindurant committed Aug 27, 2024
1 parent df61060 commit 93093e9
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 29 deletions.
17 changes: 17 additions & 0 deletions kerchunk/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from numcodecs.abc import Codec
import numpy as np
import threading
import zlib


class FillStringsCodec(Codec):
Expand Down Expand Up @@ -238,3 +239,19 @@ def decode(self, buf, out=None):

def encode(self, buf):
raise NotImplementedError


class ZlibCodec(numcodecs.abc.Codec):
    """Codec wrapping the standard-library ``zlib`` compress/decompress calls.

    The codec takes no configuration: ``encode`` uses ``zlib.compress`` with
    its default settings and ``decode`` uses ``zlib.decompress``.
    """

    codec_id = "zlib"

    def __init__(self):
        ...

    def decode(self, data, out=None):
        """Decompress ``data`` and return the decompressed bytes.

        Parameters
        ----------
        data : bytes-like
            A complete zlib stream.
        out : writable buffer, optional
            Destination to fill with the decompressed bytes. When given, it
            is filled in place and returned instead of a new ``bytes`` object.

        Returns
        -------
        ``out`` if it was supplied, otherwise a new ``bytes`` object.
        """
        decompressed = zlib.decompress(data)
        # Compare against None explicitly: truth-testing a multi-element numpy
        # array raises ValueError, and an empty buffer would be silently
        # treated as "no destination".
        if out is None:
            return decompressed
        # Copy through flat uint8 views so the destination may be an ndarray
        # of any dtype/shape or a plain bytearray; assigning a bytes object
        # directly into a numeric ndarray slice does not perform a byte copy.
        np.frombuffer(out, dtype="u1")[:] = np.frombuffer(decompressed, dtype="u1")
        return out

    def encode(self, buf):
        """Return ``buf`` compressed with ``zlib.compress`` (default level)."""
        return zlib.compress(buf)
77 changes: 48 additions & 29 deletions kerchunk/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,15 +743,9 @@ def decode(self, tag, info):

def translate(self):
import zarr
from kerchunk.codecs import ZlibCodec

self.f = fsspec.open(self.path, **(self.st or {})).open()
fs = fsspec.filesystem(
"reference",
fo=self.out,
remote_protocol=self.remote_protocol,
remote_options=self.remote_options,
)
g = zarr.open_group("reference://", storage_options=dict(fs=fs))

# magic header
assert self.f.read(4) == b"\x0e\x03\x13\x01"
Expand Down Expand Up @@ -790,7 +784,6 @@ def translate(self):
obj = line.split()[-1]
if "VALUE" in line:
attrs[obj] = line.split()[-1].lstrip('"').rstrip('"')
g.attrs.update(attrs)

# there should be only one root, and it's probably the last VG
# so maybe this loop isn't needed
Expand All @@ -807,9 +800,46 @@ def translate(self):
here.add((t, r))
if tag not in children:
roots.add((tag, ref))
for t, r in roots:
self.tags[(t, r)] = self._descend_vg(t, r)
return self.tags, roots

# hierarchical output
output = self._descend_vg(*list(roots)[0])
fs = fsspec.filesystem(
"reference",
fo=self.out,
remote_protocol=self.remote_protocol,
remote_options=self.remote_options,
)
g = zarr.open_group("reference://", storage_options=dict(fs=fs))
for k, v in output.items():
if isinstance(v, dict):
compression = ZlibCodec() if "refs" in v else None
arr = g.create_dataset(
name=k,
shape=v["dims"],
dtype=v["dtype"],
chunks=v.get("chunks", v["dims"]),
compressor=compression,
overwrite=True,
)
arr.attrs.update(
dict(
_ARRAY_DIMENSIONS=[f"{k}_x", f"{k}_y"][: len(v["dims"])]
if "refs" in v
else ["0"],
**{
i: j
for i, j in v.items()
if i not in {"chunk", "dims", "dtype", "refs"}
},
)
)
for r in v.get("refs", []):
self.out[f"{k}/{r[0]}"] = [self.path, r[1], r[2]]
else:
attrs[k] = v
g.attrs.update(attrs)

return fs.references

def _descend_vg(self, tag, ref):
info = self.tags[(tag, ref)]
Expand All @@ -824,15 +854,14 @@ def _descend_vg(self, tag, ref):
elif t == "VH":
if len(inf2["names"]) == 1 and inf2["names"][0].lower() == "values":
dtype = dtypes[inf2["types"][0]]
name = inf2["name"]
inf2 = self.tags[("VS", r)]
self.f.seek(inf2["offset"])
data = self.f.read(inf2["length"])
if dtype == "str":
out[info["name"]] = (
data.decode().lstrip('"').rstrip('"')
) # decode() ?
out[name] = data.decode().lstrip('"').rstrip('"') # decode() ?
else:
out[info["name"]] = np.frombuffer(data, dtype)[0]
out[name] = np.frombuffer(data, dtype)[0]
elif t == "NT":
out["dtype"] = inf2["typ"]
elif t == "SD":
Expand Down Expand Up @@ -902,7 +931,7 @@ def _dec_chunked(self):
sp_ref = self.read_int(2)
ndims = self.read_int(4)

dims = [ # we don't use these, could
dims = [ # we don't use these, could skip
{
"flag": self.read_int(4),
"dim_length": self.read_int(4),
Expand All @@ -913,7 +942,7 @@ def _dec_chunked(self):
fill_value = self.f.read(
self.read_int(4)
) # to be interpreted as a number later; but chunk table probs has no fill
# self.f.seek(12*ndims + 4, 1)
# self.f.seek(12*ndims + 4, 1) # if skipping

header = self._dec(chk_tbl_tag, chk_tbl_ref)
data = self._dec("VS", chk_tbl_ref)["data"] # corresponding table
Expand Down Expand Up @@ -1139,18 +1168,8 @@ def _pl(l):
1: "RLE",
2: "NBIT",
3: "SKPHUFF",
4: "DEFLATE", # called deflate, but code says "gzip" and doc says "GNU zip"
4: "DEFLATE", # called deflate, but code says "gzip" and doc says "GNU zip"; actually zlib?
# see codecs.ZlibCodec
5: "SZIP",
7: "JPEG",
}


class FLATECodec:  # (numcodecs.abc.Codec)
    """Decoder for raw DEFLATE streams, i.e. data without a zlib or gzip header."""

    def __init__(self):
        ...

    def decode(self, data, out=None):
        """Inflate ``data`` and return the decompressed bytes.

        Parameters
        ----------
        data : bytes-like
            Raw deflate stream.
        out : writable buffer, optional
            Destination to fill with the result. When given, it is assigned
            in place and returned instead of a new ``bytes`` object.

        Returns
        -------
        ``out`` if it was supplied, otherwise a new ``bytes`` object.
        """
        import zlib

        # A negative wbits value selects a raw stream (no header/trailer);
        # 15 is the largest window size.
        inflater = zlib.decompressobj(-15)
        result = inflater.decompress(data)
        if out is None:
            return result
        out[:] = result
        return out

0 comments on commit 93093e9

Please sign in to comment.