Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add some Zarr-based datatypes #19040

Merged
merged 23 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
52a816c
Add new Zarr datatypes
wm75 Mar 1, 2024
c383d69
Fix set_meta logic
wm75 Mar 1, 2024
1b51e1a
Rename generic classes for consistency
davelopez Oct 10, 2024
de2606e
Refactor Zarr datatypes
davelopez Oct 18, 2024
ea711f3
Drop zarr image datatypes for now
davelopez Oct 18, 2024
66cf066
Refactor ZarrDirectory class to improve handling of Zarr store root f…
davelopez Oct 18, 2024
4bdb09c
Add ZarrDirectory display_data method to support previewing metadata …
davelopez Oct 21, 2024
642c30d
Add remote S3 zarr datatype
davelopez Oct 21, 2024
02c6594
Refactor CompressedZarrZipArchive to support zarr format version meta…
davelopez Oct 22, 2024
dfbdbbe
Add some comments to clarify the use of store_root metadata
davelopez Oct 22, 2024
abfb220
Allow to convert/extract from zip directly to zarr
davelopez Oct 22, 2024
826d37a
Add compression metadata to CompressedZarrZipArchive
davelopez Oct 22, 2024
cc9b631
Fix linting
davelopez Oct 22, 2024
5f4197a
Refactor metadata file detection in CompressedZarrZipArchive and Zarr…
davelopez Oct 22, 2024
6855055
Add Compressed OME-Zarr Zip datatype
davelopez Oct 23, 2024
f12014c
Add OMEZarr directory datatype
davelopez Oct 23, 2024
3cd5cdb
Refactor ZarrRemoteS3Bucket to ZarrRemoteUri as base
davelopez Oct 23, 2024
6c3ff40
Add OME-Zarr remote URI datatype
davelopez Oct 23, 2024
3deb373
Remove ZarrRemoteS3Bucket and refactor ZarrRemoteUri
davelopez Oct 23, 2024
89e058d
Add properly formatted remote_uri to ZarrRemoteUri metadata
davelopez Oct 24, 2024
ee334f2
Refactor CompressedZarrZipArchive to handle zarr store in subfolders
davelopez Oct 25, 2024
3b7c640
Drop URI datatype in favor of deferred
davelopez Oct 31, 2024
68d1f31
Add missing auto_compressed_types to tar datatype
davelopez Nov 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions lib/galaxy/config/sample/datatypes_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -292,10 +292,18 @@
<converter file="archive_to_directory.xml" target_datatype="directory" />
</datatype>
<datatype extension="ncbi_genome_dataset.zip" type="galaxy.datatypes.binary:CompressedZipArchive" subclass="true" display_in_upload="true"/>
<!-- Zip archives that contain a Zarr store. The archive_to_directory converter turns them into the matching zarr / ome_zarr directory datatypes. -->
<datatype extension="zarr.zip" type="galaxy.datatypes.binary:CompressedZarrZipArchive" display_in_upload="true">
<converter file="archive_to_directory.xml" target_datatype="zarr" />
</datatype>
<datatype extension="ome_zarr.zip" type="galaxy.datatypes.binary:CompressedOMEZarrZipArchive" display_in_upload="true">
<converter file="archive_to_directory.xml" target_datatype="ome_zarr" />
</datatype>
<datatype extension="tar" auto_compressed_types="gz,bz2" type="galaxy.datatypes.binary:CompressedArchive" subclass="true" display_in_upload="true">
<converter file="archive_to_directory.xml" target_datatype="directory"/>
</datatype>
<datatype extension="directory" type="galaxy.datatypes.data:Directory"/>
<!-- Directory-based Zarr stores; these are the target datatypes of the zarr.zip / ome_zarr.zip converters. -->
<datatype extension="zarr" type="galaxy.datatypes.data:ZarrDirectory" />
<datatype extension="ome_zarr" type="galaxy.datatypes.images:OMEZarr" />
<datatype extension="yaml" type="galaxy.datatypes.text:Yaml" display_in_upload="true" />
<!-- Proteomics Datatypes -->
<datatype extension="mrm" type="galaxy.datatypes.tabular:Tabular" display_in_upload="true" subclass="true"/>
Expand Down Expand Up @@ -1108,6 +1116,8 @@
<sniffer type="galaxy.datatypes.qiime2:QIIME2Metadata"/>
<sniffer type="galaxy.datatypes.qiime2:QIIME2Artifact"/>
<sniffer type="galaxy.datatypes.qiime2:QIIME2Visualization"/>
<!-- Registered ahead of the generic CompressedZipArchive sniffer, OME-Zarr before plain Zarr. NOTE(review): presumably the most specific sniffer has to run first; confirm against the sniffer ordering rules. -->
<sniffer type="galaxy.datatypes.binary:CompressedOMEZarrZipArchive"/>
<sniffer type="galaxy.datatypes.binary:CompressedZarrZipArchive"/>
<sniffer type="galaxy.datatypes.binary:CompressedZipArchive"/>
<sniffer type="galaxy.datatypes.binary:Pretext"/>
<sniffer type="galaxy.datatypes.annotation:Augustus"/>
Expand Down
97 changes: 97 additions & 0 deletions lib/galaxy/datatypes/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,103 @@ def sniff(self, filename: str) -> bool:
return False


class CompressedZarrZipArchive(CompressedZipArchive):
    """A zarr store compressed in a zip file.

    The zarr store should be in the root of the zip file, but archives that
    keep the store inside a subfolder are accepted as well.
    """

    file_ext = "zarr.zip"

    # Value of the "zarr_format" key read from the store's metadata file.
    MetadataElement(
        name="zarr_format",
        default=None,
        desc="Zarr format version",
        readonly=True,
        optional=False,
        visible=False,
    )

    # zipfile compression constant (e.g. zipfile.ZIP_STORED) used by the archive members.
    MetadataElement(
        name="compression",
        default=None,
        desc="Compression type used in the Zip zarr store",
        readonly=True,
        optional=False,
        visible=False,
    )

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """Set the blurb to the file size plus the detected Zarr format version."""
        if not dataset.dataset.purged:
            dataset.blurb = f"{nice_size(dataset.get_size())}"
            dataset.blurb += f"\nFormat v{dataset.metadata.zarr_format}"
        else:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """Populate the ``compression`` and ``zarr_format`` metadata elements.

        ``zarr_format`` is left untouched when the archive has no Zarr metadata
        file, or when that file is not a JSON object with a ``zarr_format`` key.
        """
        with zipfile.ZipFile(dataset.get_file_name()) as zf:
            dataset.metadata.compression = self._detect_compression(zf)
            meta_file = self._find_zarr_metadata_file(zf)
            if not meta_file:
                return
            try:
                with zf.open(meta_file) as f:
                    meta = json.load(f)
            except ValueError:
                # Covers both invalid JSON and undecodable bytes.
                meta = None
        format_version = meta.get("zarr_format") if isinstance(meta, dict) else None
        if not format_version:
            log.debug("Could not determine Zarr format version")
            return
        dataset.metadata.zarr_format = format_version

    def sniff(self, filename: str) -> bool:
        """Return True if ``filename`` is a zip archive that contains a zarr store."""
        # In theory, the zarr store must be in the root of the zip file.
        # See: https://github.com/zarr-developers/zarr-python/issues/756#issuecomment-852134901
        # But in practice, many examples online have the zarr store in a subfolder in the zip file,
        # so we will check for that as well.
        try:
            with zipfile.ZipFile(filename) as zf:
                return self._find_zarr_metadata_file(zf) is not None
        except zipfile.BadZipFile:
            # Not a zip archive at all, so it cannot be a zipped zarr store.
            return False

    def _detect_compression(self, zip_file: zipfile.ZipFile) -> int:
        """Returns the compression constant used by most file members of the archive.

        ``ZipFile.compression`` only reflects the constructor argument (the
        default for *writing* new members), so the value is taken from the
        ``compress_type`` recorded for each existing member instead.
        """
        counts = {}
        for info in zip_file.infolist():
            if info.is_dir():
                continue
            counts[info.compress_type] = counts.get(info.compress_type, 0) + 1
        if not counts:
            # Empty archive: nothing to inspect, keep the previous behaviour.
            return zip_file.compression
        return max(counts, key=counts.get)

    def _find_zarr_metadata_file(self, zip_file: zipfile.ZipFile) -> Optional[str]:
        """Returns the path to the metadata file in the Zarr store if found."""
        # Depending on the Zarr version, the metadata file can be in different locations
        # In v1 the metadata is in a file named "meta" https://zarr-specs.readthedocs.io/en/latest/v1/v1.0.html
        # In v2 it can be in .zarray or .zgroup https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html
        # In v3 the metadata is in a file named "zarr.json" https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html
        possible_meta_files = ("meta", ".zarray", ".zgroup", "zarr.json")
        for file in zip_file.namelist():
            # Compare the member's base name, so unrelated entries that merely
            # end with one of the names (e.g. "notmeta") are not matched.
            if file.rsplit("/", 1)[-1] in possible_meta_files:
                return file
        return None


class CompressedOMEZarrZipArchive(CompressedZarrZipArchive):
    """An OME-Zarr store compressed in a zip file.

    Recognised by an ``OME/METADATA.ome.xml`` member next to the regular Zarr
    metadata file.
    """

    file_ext = "ome_zarr.zip"

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """Set peek and blurb (file size plus Zarr format version)."""
        if not dataset.dataset.purged:
            dataset.peek = "OME-Zarr directory"
            dataset.blurb = f"{nice_size(dataset.get_size())}"
            dataset.blurb += f"\nZarr Format v{dataset.metadata.zarr_format}"
        else:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"

    def sniff(self, filename: str) -> bool:
        """Return True if ``filename`` is a zipped Zarr store carrying OME metadata."""
        try:
            with zipfile.ZipFile(filename) as zf:
                # The OME XML alone is not enough: the archive must also hold a Zarr store.
                return (
                    self._find_ome_zarr_metadata_file(zf) is not None
                    and self._find_zarr_metadata_file(zf) is not None
                )
        except zipfile.BadZipFile:
            # Not a zip archive at all.
            return False

    def _find_ome_zarr_metadata_file(self, zip_file: zipfile.ZipFile) -> Optional[str]:
        """Returns the path to the OME metadata file in the archive if found."""
        expected_meta_file_name = "OME/METADATA.ome.xml"
        for file in zip_file.namelist():
            # Match on a path boundary so e.g. "xOME/METADATA.ome.xml" is rejected.
            if file == expected_meta_file_name or file.endswith(f"/{expected_meta_file_name}"):
                return file
        return None


class GenericAsn1Binary(Binary):
"""Class for generic ASN.1 binary format"""

Expand Down
2 changes: 2 additions & 0 deletions lib/galaxy/datatypes/converters/archive_to_directory.xml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
<param format="tar,zip" name="input1" type="data"/>
<param name="__target_datatype__" type="select" label="Target data type">
<option value="directory">directory</option>
<option value="zarr">zarr</option>
<option value="ome_zarr">ome_zarr</option>
</param>
</inputs>
<outputs provided_metadata_file="metadata_json">
Expand Down
126 changes: 126 additions & 0 deletions lib/galaxy/datatypes/data.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import logging
import mimetypes
import os
Expand Down Expand Up @@ -466,6 +467,7 @@ def _serve_file_download(self, headers, data, trans, to_ext, file_size, **kwd):
composite_extensions.append("html") # for archiving composite datatypes
composite_extensions.append("data_manager_json") # for downloading bundles if bundled.
composite_extensions.append("directory") # for downloading directories.
composite_extensions.append("zarr") # for downloading zarr directories.

if data.extension in composite_extensions:
return self._archive_composite_dataset(trans, data, headers, do_action=kwd.get("do_action", "zip"))
Expand Down Expand Up @@ -1226,6 +1228,130 @@ def _archive_main_file(
return error, msg, messagetype


class ZarrDirectory(Directory):
"""Class representing a zarr-format structure with general-purpose numeric content."""

edam_format = "format_3915"
file_ext = "zarr"

# The store_root metadata element wouldn't be needed if the CompressedFile.extract
# function didn't create an extra folder under the dataset's extra_files_path.
# Maybe this can be avoided somehow?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we merge with this or do we need to investigate this more?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would try to get rid of this ? Otherwise I'd like to see a tool actually use this metadata element.

Copy link
Contributor Author

@davelopez davelopez Nov 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me explain the issue in more detail to see if we can eliminate the need for store_root metadata. This store_root isn't true metadata—it's more of a workaround that indicates the folder containing the actual root of the Zarr directory.

Both tools and visualizations require a path to this root directory to access the correct contents.

When we upload a zip file containing a Zarr directory, it’s common for the zip to include the parent folder of the Zarr store. Many Zarr zips I’ve encountered are structured this way. Ideally, the Zarr store would be zipped without this extra parent folder, but even if it isn’t, when we extract it using the converter, it creates a new folder (like dataset_{uuid}) within extra_files_path, resulting in an additional layer.

Currently, to access the Zarr directory correctly, any tool needs to reference it as follows:

input_zarr = zarr.open('$zarrinput.extra_files_path/$zarrinput.metadata.store_root', mode='r')

This approach, however, is not fully reliable—what if the Zarr store is nested deeper within subdirectories? A better solution might be to use a dedicated converter (rather than archive_to_directory.xml) that finds and extracts the root store directly to extra_files_path, without any parent folders, would this be better?

Another drawback is that tool developers must remember to reference the $zarrinput.extra_files_path, and even add /$zarrinput.metadata.store_root to reach the actual Zarr store.

Any ideas on how to make this process more elegant and eliminate the store_root?

# Workaround rather than true metadata: name of the folder, relative to the
# dataset's extra_files_path, that holds the actual root of the Zarr store.
# set_meta stores "" when the store sits directly in extra_files_path.
MetadataElement(
name="store_root",
default=None,
desc="Name of the root folder where the zarr store is located",
readonly=True,
optional=False,
visible=False,
)

# Value of the "zarr_format" key read from the store's metadata file.
MetadataElement(
name="zarr_format",
default=None,
desc="Zarr format version",
readonly=True,
optional=False,
visible=False,
)

def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
    """Fill in the short description of the dataset.

    A purged dataset gets both ``peek`` and ``blurb`` set to a "file is gone"
    message; otherwise only ``blurb`` is set, to the Zarr format version kept
    in the dataset metadata.
    """
    if dataset.dataset.purged:
        dataset.peek = "file does not exist"
        dataset.blurb = "file purged from disk"
        return
    zarr_version = dataset.metadata.zarr_format
    dataset.blurb = f"Format v{zarr_version}"

def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
    """Record where the Zarr store lives and which format version it uses.

    ``store_root`` is set as soon as a store is located under the dataset's
    extra files path; ``zarr_format`` is only set when a version can be read
    from the store's metadata file.
    """
    root_name = self._find_store_root_folder_name(dataset)
    if root_name is not None:
        dataset.metadata.store_root = root_name
        version = self._get_format_version(os.path.join(dataset.extra_files_path, root_name))
        if version:
            dataset.metadata.zarr_format = version
        else:
            log.debug("Could not determine Zarr format version")
    else:
        log.debug("Directory structure does not look like Zarr format")

def sniff(self, filename: str) -> bool:
    """Always report "no match": this datatype is never auto-detected."""
    # TODO: Can the extra files path be reached from here? Without it the
    # directory content cannot be inspected, so auto-detection is not possible.
    return False

def display_data(
    self,
    trans,
    dataset: DatasetHasHidProtocol,
    preview: bool = False,
    filename: Optional[str] = None,
    to_ext: Optional[str] = None,
    **kwd,
):
    """Serve the store's metadata file as JSON when a preview is requested.

    Falls back to the parent implementation when no preview is requested,
    when ``store_root`` metadata was never set, or when the store has no
    metadata file.

    Returns the ``(content, headers)`` pair for the preview case, otherwise
    whatever the parent ``display_data`` returns.
    """
    if preview:
        store_root = dataset.metadata.store_root
        # store_root keeps its None default when set_meta could not locate a
        # Zarr store; joining a path with None would raise TypeError.
        if store_root is not None:
            store_root_path = os.path.join(dataset.extra_files_path, store_root)
            metadata_file_path = self._find_zarr_metadata_file(store_root_path)
            if metadata_file_path:
                headers = kwd.get("headers", {})
                headers["content-type"] = "application/json"
                return self._yield_user_file_content(trans, dataset, metadata_file_path, headers), headers

    return super().display_data(trans, dataset, preview, filename, to_ext, **kwd)

def _find_store_root_folder_name(self, dataset: DatasetProtocol) -> Optional[str]:
    """Returns the name of the root folder where the Zarr store is located.

    The Zarr store can be directly in the extra files folder or in a subfolder.

    Returns "" when the store is directly in the extra files folder, the
    subfolder name when that folder's only entry is a directory holding the
    store, and None when the structure does not look like Zarr (including an
    empty extra files folder).
    """
    extra_files_path = dataset.extra_files_path
    if self._find_zarr_metadata_file(extra_files_path):
        return ""  # The store is in the root of the extra files folder
    items_in_path = os.listdir(extra_files_path)
    # Only a single wrapping folder is supported. Checking the count before
    # indexing also keeps an empty folder from raising IndexError.
    if len(items_in_path) != 1:
        return None  # The directory structure does not look like Zarr format
    sub_folder_name = items_in_path[0]
    zarr_store_path = os.path.join(extra_files_path, sub_folder_name)
    if os.path.isdir(zarr_store_path) and self._find_zarr_metadata_file(zarr_store_path):
        return sub_folder_name  # The store is in a subfolder of the extra files folder
    return None  # The directory structure does not look like Zarr format

def _load_zarr_metadata_file(self, store_root_path: str) -> Optional[Dict[str, Any]]:
    """Returns the parsed content of the metadata file in the Zarr store.

    Returns None when the store has no metadata file, or when that file does
    not contain a JSON object.
    """
    meta_file = self._find_zarr_metadata_file(store_root_path)
    if not meta_file:
        return None
    try:
        # JSON is read as UTF-8 explicitly instead of relying on the platform default.
        with open(meta_file, encoding="utf-8") as f:
            content = json.load(f)
    except ValueError:
        # Covers both invalid JSON and undecodable bytes.
        log.debug("Could not parse Zarr metadata file %s", meta_file)
        return None
    # Callers use dict access on the result, so anything else counts as "no metadata".
    return content if isinstance(content, dict) else None

def _find_zarr_metadata_file(self, store_root_path: str) -> Optional[str]:
    """Returns the path to the metadata file in the Zarr store.

    Candidate names are tried in order and the first one that exists as a
    regular file wins. Returns None when none is found, including when
    ``store_root_path`` itself does not exist.
    """
    # Depending on the Zarr version, the metadata file can be in different locations
    # In v1 the metadata is in a file named "meta" https://zarr-specs.readthedocs.io/en/latest/v1/v1.0.html
    # In v2 it can be in .zarray or .zgroup https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html
    # In v3 the metadata is in a file named "zarr.json" https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html
    for meta_filename in ["meta", ".zarray", ".zgroup", "zarr.json"]:
        candidate = os.path.join(store_root_path, meta_filename)
        # Test each candidate individually: a directory that happens to be
        # called e.g. "meta" must not hide a real metadata file next to it.
        if os.path.isfile(candidate):
            return candidate
    return None

def _get_format_version(self, store_root_path: str) -> Optional[str]:
    """Returns the ``zarr_format`` value of the store's metadata file, or None.

    None is returned when no metadata could be loaded or the key is absent.
    """
    store_metadata = self._load_zarr_metadata_file(store_root_path)
    return store_metadata.get("zarr_format") if store_metadata else None


class GenericAsn1(Text):
"""Class for generic ASN.1 text format"""

Expand Down
18 changes: 18 additions & 0 deletions lib/galaxy/datatypes/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,24 @@ def sniff(self, filename: str) -> bool:
return False


class OMEZarr(data.ZarrDirectory):
    """Zarr directory holding multi-dimensional image data (OME-Zarr).

    Structurally this is a plain Zarr directory with custom metadata; it is
    kept with the image datatypes because its content is image information.
    """

    file_ext = "ome_zarr"

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """Describe the dataset, or report that its file has been purged."""
        if dataset.dataset.purged:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"
            return
        dataset.peek = "OME-Zarr directory"
        zarr_version = dataset.metadata.zarr_format
        dataset.blurb = f"Zarr Format v{zarr_version}"


class Hamamatsu(Image):
file_ext = "vms"

Expand Down
Loading