From 2cdf0a1b0d79da8053c828b3d2753718ed767d96 Mon Sep 17 00:00:00 2001 From: konstntokas Date: Tue, 10 Dec 2024 13:24:32 +0100 Subject: [PATCH 1/4] preload method added --- .gitignore | 2 + environment.yml | 3 + examples/zenodo_data_store.ipynb | 2 +- examples/zenodo_data_store_preload.ipynb | 4426 ++++++++++++++++++++++ pyproject.toml | 3 + xcube_zenodo/_utils.py | 27 +- xcube_zenodo/constants.py | 5 + xcube_zenodo/preload.py | 299 ++ xcube_zenodo/store.py | 121 +- 9 files changed, 4861 insertions(+), 27 deletions(-) create mode 100644 examples/zenodo_data_store_preload.ipynb create mode 100644 xcube_zenodo/preload.py diff --git a/.gitignore b/.gitignore index 82f9275..2f41357 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,5 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +examples/preload_cache/ \ No newline at end of file diff --git a/environment.yml b/environment.yml index 62f8418..d170b8e 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,10 @@ channels: dependencies: # Required - python>=3.10 + - IPython + - numpy - requests + - tabulate - xarray - xcube >= 1.7.0 # for testing diff --git a/examples/zenodo_data_store.ipynb b/examples/zenodo_data_store.ipynb index 22b1a82..43af87c 100644 --- a/examples/zenodo_data_store.ipynb +++ b/examples/zenodo_data_store.ipynb @@ -116,7 +116,7 @@ ], "source": [ "%%time\n", - "access_token = \"ZsZVfyPCmLYRZQtfSYWruNwXYBykonv0pXZYnrQYNNL0gGMJipYsx0CYvOSB\"\n", + "access_token = \"fill in you Zenodo access token here\"\n", "store = new_data_store(\"zenodo\", access_token=access_token)" ] }, diff --git a/examples/zenodo_data_store_preload.ipynb b/examples/zenodo_data_store_preload.ipynb new file mode 100644 index 0000000..1ea6038 --- /dev/null +++ b/examples/zenodo_data_store_preload.ipynb @@ -0,0 +1,4426 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "# This notebook gives an introduction to the xcube's \"zenodo\" data store and its preload_data method\n", + "\n", + "This notebook shows an example of how to preload compressed files published on the [https://zenodo.org](https://zenodo.org) webpage. The compressed files will be downloaded, unpacked and the individual files will be made available as a Zarr file, which can be subsequently used by the data store as usual. \n", + "\n", + "### Setup\n", + "In order to run this notebook you need to get an access token for the Zenodo API following the [documentation](https://zenodo.org/login/?next=%2Faccount%2Fsettings%2Fapplications%2Ftokens%2Fnew%2F). Furthermore, make sure that [`xcube_zenodo`](https://github.com/xcube-dev/xcube-zenodo) is installed. You may install [`xcube_zenodo`](https://github.com/xcube-dev/xcube-zenodo) directly from the git repository by cloning the repository, directing into `xcube-zenodo`, and following the steps below:\n", + "\n", + "```bash\n", + "conda env create -f environment.yml\n", + "conda activate xcube-zenodo\n", + "pip install .\n", + "```\n", + "\n", + "Note that [`xcube_zenodo`](https://github.com/xcube-dev/xcube-zenodo) is a plugin of [`xcube`](https://xcube.readthedocs.io/en/latest/), where `xcube` is included in the `environment.yml`. 
\n", + "\n", + "Now, we first import everything we need:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2.91 s, sys: 193 ms, total: 3.1 s\n", + "Wall time: 1.24 s\n" + ] + } + ], + "source": [ + "%%time\n", + "from xcube.core.store import new_data_store\n", + "from xcube.core.store import get_data_store_params_schema\n", + "import logging\n", + "\n", + "logging.basicConfig(\n", + " level=logging.INFO,\n", + " format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', # Log format\n", + " datefmt='%Y-%m-%d %H:%M:%S', # Timestamp format\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we get the store parameters needed to initialize a zenodo [data store](https://xcube.readthedocs.io/en/latest/dataaccess.html#data-store-framework). " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 33.9 ms, sys: 7.05 ms, total: 40.9 ms\n", + "Wall time: 40.2 ms\n" + ] + }, + { + "data": { + "application/json": { + "additionalProperties": false, + "properties": { + "access_token": { + "title": "Zenodo access token.", + "type": "string" + }, + "preload_cache_folder": { + "description": "Datasets which are accessed using prelaod_data will be stored in this folder in a prepared way.", + "title": "Preload cache folder.", + "type": "string" + } + }, + "required": [ + "access_token" + ], + "type": "object" + }, + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "store_params = get_data_store_params_schema(\"zenodo\")\n", + "store_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We initiate a zenodo [data 
store](https://xcube.readthedocs.io/en/latest/dataaccess.html#data-store-framework) with the access_token. Note that the `xcube-zenodo` plugin is recognized after installation by setting the first argument to `\"zenodo\"` in the `new_data_store` function. We can add a relative path to a folder where preloaded data will be stored. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.04 ms, sys: 4 μs, total: 5.05 ms\n", + "Wall time: 4.98 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "access_token = \"fill in you Zenodo access token here\"\n", + "store = new_data_store(\"zenodo\", access_token=access_token, preload_cache_folder=\"preload_cache\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compressed files can be preloaded using the `preload_data` method. Also this method uses `preload_params`, which can be viewed in the next cell." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 30 μs, sys: 2 μs, total: 32 μs\n", + "Wall time: 34.1 μs\n" + ] + }, + { + "data": { + "application/json": { + "additionalProperties": false, + "properties": { + "merge": { + "default": false, + "description": "If True, xarray.merge is applied to the files stored in a compressed format. If False, each dataset is stored individually. 
The data ID will be extended by the filename.", + "title": "Merge multiple dataset of compressed data IDs.", + "type": "boolean" + }, + "monitor_preload": { + "default": true, + "description": "If True, the progress of preload will be visualized", + "title": "Monitor preload_data method.", + "type": "boolean" + } + }, + "type": "object" + }, + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "preload_params = store.get_preload_data_params()\n", + "preload_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " This approach enables the downloading of compressed files that cannot be lazily loaded, allowing them to be stored and readily available for the duration of the project. The `preload_data` method is non-blocking and returns a handler which can be used to cancel the preload by typing `handler.cancel()` into the next cell." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
Data ID Status Progress Message
13333034/belgium.zipPreloadednan%
13333034/denmark.zipPreloadednan%
" + ], + "text/plain": [ + "'\\n\\n\\n\\n\\n\\n\\n\\n
Data ID Status Progress Message
13333034/belgium.zipPreloadednan%
13333034/denmark.zipPreloadednan%
'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "handler = store.preload_data(\"13333034/belgium.zip\", \"13333034/denmark.zip\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data IDs can be viewed by using the following line. Note that in this example the zipped files contain multiple files. If this is the case and `merge=False` (which is the default), each file is written to a Zarr file. The data ID is extended by the individual file names where the file extension is adjusted. " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['13333034/belgium/number_disturbances_belgium.zarr',\n", + " '13333034/belgium/annual_disturbances_1985_2023_belgium.zarr',\n", + " '13333034/belgium/latest_disturbance_belgium.zarr',\n", + " '13333034/belgium/disturbance_agent_1985_2023_belgium.zarr',\n", + " '13333034/belgium/forest_mask_belgium.zarr',\n", + " '13333034/belgium/disturbance_agent_aggregated_belgium.zarr',\n", + " '13333034/belgium/greatest_disturbance_belgium.zarr',\n", + " '13333034/belgium/disturbance_probability_1985_2023_belgium.zarr',\n", + " '13333034/belgium/disturbance_severity_1985_2023_belgium.zarr',\n", + " '13333034/denmark/disturbance_severity_1985_2023_denmark.zarr',\n", + " '13333034/denmark/latest_disturbance_denmark.zarr',\n", + " '13333034/denmark/disturbance_probability_1985_2023_denmark.zarr',\n", + " '13333034/denmark/disturbance_agent_aggregated_denmark.zarr',\n", + " '13333034/denmark/number_disturbances_denmark.zarr',\n", + " '13333034/denmark/annual_disturbances_1985_2023_denmark.zarr',\n", + " '13333034/denmark/greatest_disturbance_denmark.zarr',\n", + " '13333034/denmark/forest_mask_denmark.zarr',\n", + " '13333034/denmark/disturbance_agent_1985_2023_denmark.zarr']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"store.cache_store.list_data_ids()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we want to open one of the datasets. We first view the available parameters to open the data. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 2.09 ms, sys: 153 μs, total: 2.25 ms\n", + "Wall time: 1.82 ms\n" + ] + }, + { + "data": { + "application/json": { + "additionalProperties": false, + "properties": { + "cache_size": { + "minimum": 0, + "type": "integer" + }, + "chunks": { + "additionalProperties": true, + "description": "Optional chunk sizes along each dimension. Chunk size values may be None, \"auto\" or an integer value.", + "examples": [ + { + "lat": "auto", + "lon": 90, + "time": null + }, + { + "time": 1, + "x": 512, + "y": 512 + } + ], + "properties": {}, + "type": "object" + }, + "consolidated": { + "default": false, + "description": "Whether to open the store using Zarr's consolidated metadata capability. Only works for stores that have already been consolidated.", + "type": "boolean" + }, + "data_type": { + "enum": [ + "dataset", + "mldataset", + "geodataframe" + ], + "title": "Optional data type", + "type": "string" + }, + "decode_cf": { + "default": true, + "description": "Whether to decode these variables, assuming they were saved according to CF conventions.", + "type": "boolean" + }, + "decode_coords": { + "default": true, + "description": "If True, decode the \"coordinates\" attribute to identify coordinates in the resulting dataset.", + "type": "boolean" + }, + "decode_times": { + "default": true, + "description": "If True, decode times encoded in the standard NetCDF datetime format into datetime objects. 
Otherwise, leave them encoded as numbers.", + "type": "boolean" + }, + "drop_variables": { + "items": { + "minLength": 1, + "type": "string" + }, + "type": "array" + }, + "group": { + "description": "Group path. (a.k.a. path in zarr terminology.).", + "minLength": 1, + "type": "string" + }, + "log_access": { + "default": false, + "type": "boolean" + }, + "mask_and_scale": { + "default": true, + "description": "If True, replace array values equal to attribute \"_FillValue\" with NaN. Use \"scale_factor\" and \"add_offset\" attributes to compute actual values.", + "type": "boolean" + } + }, + "type": "object" + }, + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "open_params = store.get_open_data_params_schema(data_id=\"13333034/belgium/disturbance_probability_1985_2023_belgium.zarr\")\n", + "open_params" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 24.7 ms, sys: 5.98 ms, total: 30.6 ms\n", + "Wall time: 30 ms\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 27GB\n",
+       "Dimensions:      (y: 9084, x: 9390)\n",
+       "Coordinates:\n",
+       "    spatial_ref  int64 8B ...\n",
+       "  * x            (x) float64 75kB 3.795e+06 3.795e+06 ... 4.076e+06 4.076e+06\n",
+       "  * y            (y) float64 73kB 3.189e+06 3.189e+06 ... 2.917e+06 2.917e+06\n",
+       "Data variables: (12/39)\n",
+       "    band_1       (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    band_10      (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    band_11      (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    band_12      (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    band_13      (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    band_14      (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    ...           ...\n",
+       "    band_4       (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    band_5       (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    band_6       (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    band_7       (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    band_8       (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "    band_9       (y, x) float64 682MB dask.array<chunksize=(512, 512), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    source:   file:///home/konstantin/bc_kon/01_coding/01_github/xcube-zenodo...
" + ], + "text/plain": [ + " Size: 27GB\n", + "Dimensions: (y: 9084, x: 9390)\n", + "Coordinates:\n", + " spatial_ref int64 8B ...\n", + " * x (x) float64 75kB 3.795e+06 3.795e+06 ... 4.076e+06 4.076e+06\n", + " * y (y) float64 73kB 3.189e+06 3.189e+06 ... 2.917e+06 2.917e+06\n", + "Data variables: (12/39)\n", + " band_1 (y, x) float64 682MB dask.array\n", + " band_10 (y, x) float64 682MB dask.array\n", + " band_11 (y, x) float64 682MB dask.array\n", + " band_12 (y, x) float64 682MB dask.array\n", + " band_13 (y, x) float64 682MB dask.array\n", + " band_14 (y, x) float64 682MB dask.array\n", + " ... ...\n", + " band_4 (y, x) float64 682MB dask.array\n", + " band_5 (y, x) float64 682MB dask.array\n", + " band_6 (y, x) float64 682MB dask.array\n", + " band_7 (y, x) float64 682MB dask.array\n", + " band_8 (y, x) float64 682MB dask.array\n", + " band_9 (y, x) float64 682MB dask.array\n", + "Attributes:\n", + " source: file:///home/konstantin/bc_kon/01_coding/01_github/xcube-zenodo..." + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "ds = store.open_data(\"13333034/belgium/disturbance_probability_1985_2023_belgium.zarr\")\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We plot parts of the opened data as an example below." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 543 ms, sys: 65.8 ms, total: 609 ms\n", + "Wall time: 551 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%time\n", + "ds.band_1[::10, ::10].plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml index 69756f4..79b0486 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,10 @@ readme = {file = "README.md", content-type = "text/markdown"} license = {text = "MIT"} requires-python = ">=3.10" dependencies = [ + "IPython", + "numpy", "requests", + "tabulate", "xarray", "xcube" ] diff --git a/xcube_zenodo/_utils.py b/xcube_zenodo/_utils.py index bb2141a..a630d43 100644 --- a/xcube_zenodo/_utils.py +++ b/xcube_zenodo/_utils.py @@ -19,13 +19,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from typing import Any, Container - -from xcube.core.store import DATASET_TYPE -from xcube.core.store import MULTI_LEVEL_DATASET_TYPE -from xcube.core.store import DataTypeLike - +from typing import Any, Container, Union +from .constants import COMPRESSED_FORMATS from .constants import MAP_FILE_EXTENSION_FORMAT @@ -70,18 +66,27 @@ def get_attrs_from_record( return attrs -def estimate_file_format(data_id: str) -> str: - ext = data_id.split(".")[-1] - format_id = MAP_FILE_EXTENSION_FORMAT.get(ext.lower()) - return format_id +def estimate_file_format(data_id: str) -> Union[str, None]: + for key, val in MAP_FILE_EXTENSION_FORMAT.items(): + if data_id.endswith(key.lower()): + return val + return None def is_supported_file_format(data_id: str) -> bool: return estimate_file_format(data_id) is not None -def translate_data_id2uri(data_id: str) -> str: +def is_supported_compressed_file_format(data_id: str) -> bool: + return estimate_file_format(data_id) in COMPRESSED_FORMATS + + +def translate_data_id2fs_path(data_id: str) -> str: components = data_id.split("/") record_id = components[0] file_key = "/".join(components[1:]) return f"records/{record_id}/files/{file_key}" + + +def translate_data_id2uri(data_id: str) -> str: + return f"https://zenodo.org/{translate_data_id2fs_path(data_id)}" diff --git a/xcube_zenodo/constants.py b/xcube_zenodo/constants.py index eea475c..4aaeeb3 100644 --- a/xcube_zenodo/constants.py +++ b/xcube_zenodo/constants.py @@ -21,6 +21,7 @@ DATA_STORE_ID = "zenodo" API_RECORDS_ENDPOINT = "https://zenodo.org/api/records" +PRELOAD_CACHE_FOLDER = "preload_cache/" MAP_FILE_EXTENSION_FORMAT = { "zarr": "zarr", @@ -31,4 +32,8 @@ "geotiff": "geotiff", "shp": "shapefile", "geojson": "geojson", + "zip": "zip", + "tar": "tar", + "tar.gz": "tar.gz", } +COMPRESSED_FORMATS = list(MAP_FILE_EXTENSION_FORMAT.keys())[-3:] diff --git a/xcube_zenodo/preload.py b/xcube_zenodo/preload.py new file mode 100644 index 0000000..ff1387f --- /dev/null +++ 
b/xcube_zenodo/preload.py @@ -0,0 +1,299 @@ +# The MIT License (MIT) +# Copyright (c) 2024 by the xcube development team and contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import logging +import os +import shutil +import threading +import time +from typing import Callable, Union +import tarfile +import zipfile + +import IPython +import IPython.display +import numpy as np +import tabulate +import requests +import xarray as xr +from xcube.core.store import DataStoreError +from xcube.core.store import MutableDataStore + +from ._utils import estimate_file_format +from ._utils import translate_data_id2uri + +LOG = logging.getLogger(__name__) + + +class Event: + + def __init__(self, data_id: str, total_size: Union[int, float]): + self.data_id = data_id + self.status = "Not started" + self.progress = 0.0 + self.message = "Preloading not started jet." 
+ self.total_size = total_size + self._callback = None + + def subscribe(self, callback: Callable[[], None]): + self._callback = callback + + def notify(self): + if self._callback is not None: + self._callback() + + def update(self, status: str, progress: float, message: str): + self.status = status + self.progress = progress + self.message = message + self.notify() + + +class PreloadHandle: + + def __init__(self, cache_store: MutableDataStore, *data_ids: str, **preload_params): + self._is_cancelled = False + self._is_closed = False + self._cache_store = cache_store + self._data_ids = data_ids + self._preload_params = preload_params + self._cache_root = preload_params.pop("cache_root") + self._download_folder_name = "downloads" + self._download_folder = os.path.join( + self._cache_root, self._download_folder_name + ) + self._events = [Event(data_id, np.nan) for data_id in data_ids] + self.lock = threading.Lock() + if preload_params.get("monitor_preload"): + for event in self._events: + event.subscribe(self._monitor_preload) + self._thread_download = None + self._thread_decompress = None + self._thread_prepare = None + + @property + def is_cancelled(self) -> bool: + return self._is_cancelled + + def cancel(self): + self._is_cancelled = True + self._thread_prepare.join() + self._thread_decompress.join() + self._thread_download.join() + for event in self._events: + event.update("Canceled", np.nan, "Preload has been canceled by user.") + self.close() + + @property + def is_closed(self) -> bool: + return self._is_closed + + def close(self): + self._is_closed = True + for event in self._events: + if event.status != "Preloaded": + if self._cache_store.has_data(event.data_id): + self._cache_store.delete_data(event.data_id) + else: + record, filename = event.data_id.split("/") + format_id = estimate_file_format(event.data_id) + dirname = filename.replace(f".{format_id}", "") + data_id_mod = f"{record}/{dirname}" + list_data_ids = self._cache_store.list_data_ids() + 
list_data_ids_mod = [ + data_id for data_id in list_data_ids if data_id_mod in data_id + ] + for data_id in list_data_ids_mod: + self._cache_store.delete_data(data_id) + if os.path.isdir(self._download_folder): + shutil.rmtree(self._download_folder) + + def _monitor_preload(self): + rows = [ + [ + event.data_id, + event.status, + f"{event.progress * 100:.2f}%", + event.message, + ] + for event in self._events + ] + if is_jupyter(): + table = tabulate.tabulate( + rows, + headers=["Data ID", "Status", "Progress", "Message"], + tablefmt="html", + ) + IPython.display.clear_output(wait=True) + IPython.display.display(table) + else: + table = tabulate.tabulate( + rows, + headers=["Dataset", "Status", "Progress", "Message"], + ) + os.system("clear" if os.name == "posix" else "cls") + print(table) + + def preload_data(self, *data_ids: str, **preload_params): + self._download_data(*data_ids) + self._decompress_data(*data_ids) + self._prepare_data(*data_ids, **preload_params) + + def _download_data(self, *data_ids: str): + # get first total size of all datasets and initialize it as Events + for i, data_id in enumerate(data_ids): + uri = translate_data_id2uri(data_id) + with requests.get(uri, stream=True) as response: + if not response.ok: + raise DataStoreError(response.raise_for_status()) + self._events[i].total_size = int( + response.headers.get("content-length", 0) + ) + + # start downloading + chunk_size = 1024 * 1024 + + def download(): + for i, data_id in enumerate(data_ids): + download_size = 0 + self._events[i].update("Download started", 0, "") + uri = translate_data_id2uri(data_id) + with requests.get(uri, stream=True) as response: + if not response.ok: + raise DataStoreError(response.raise_for_status()) + record, filename = data_id.split("/") + record_folder = os.path.join(self._download_folder, record) + if not os.path.exists(record_folder): + os.makedirs(record_folder) + download_path = os.path.join(record_folder, filename) + with open(download_path, "wb") as 
file: + for chunk in response.iter_content(chunk_size=chunk_size): + file.write(chunk) + download_size += len(chunk) + self._events[i].update( + "Download started", + download_size / self._events[i].total_size, + "", + ) + if self._is_cancelled: + break + self._events[i].update("Downloaded", 1.0, "") + + self._thread_download = threading.Thread( + target=download, daemon=True, name="download_data" + ) + self._thread_download.start() + + def _decompress_data(self, *data_ids): + def decompress(): + for i, data_id in enumerate(data_ids): + while not self._events[i].status == "Downloaded": + time.sleep(1) + self._events[i].update("Decompression started", np.nan, "") + record, filename = data_id.split("/") + file_path = os.path.join(self._download_folder, record, filename) + if zipfile.is_zipfile(file_path): + with zipfile.ZipFile(file_path, "r") as zip_ref: + dirname = filename.replace(".zip", "") + extract_dir = os.path.join( + self._download_folder, record, dirname + ) + zip_ref.extractall(extract_dir) + elif file_path.endswith(".tar"): + with tarfile.open(file_path, "r") as tar_ref: + dirname = filename.replace(".tar", "") + extract_dir = os.path.join( + self._download_folder, record, dirname + ) + tar_ref.extractall(path=extract_dir) + elif file_path.endswith(".tar.gz"): + with tarfile.open(file_path, "r:gz") as tar_ref: + dirname = filename.replace(".tar.gz", "") + extract_dir = os.path.join( + self._download_folder, record, dirname + ) + tar_ref.extractall(path=extract_dir) + self._events[i].update("Decompressed", np.nan, "") + if self._is_cancelled: + break + + self._thread_decompress = threading.Thread( + target=decompress, daemon=True, name="decompress_data" + ) + self._thread_decompress.start() + + def _prepare_data(self, *data_ids, **preload_params): + def prepare(): + for i, data_id in enumerate(data_ids): + while not self._events[i].status == "Decompressed": + time.sleep(1) + self._events[i].update("File processing started", np.nan, "") + record, filename 
= data_id.split("/") + format_id = estimate_file_format(data_id) + dirname = filename.replace(f".{format_id}", "") + extract_dir = os.path.join(self._download_folder, record, dirname) + dss = [] + sub_fnames = os.listdir(extract_dir) + for sub_fname in sub_fnames: + sub_data_id = ( + f"{self._download_folder_name}/{record}/{dirname}/{sub_fname}" + ) + if not self._cache_store.has_data(sub_data_id): + LOG.debug( + f"File with data ID {sub_data_id} cannot be opened, " + f"and thus will not be considered." + ) + dss.append(self._cache_store.open_data(sub_data_id)) + if len(dss) == 1: + self._cache_store.write_data( + dss[0], data_id, writer_id="dataset:zarr:file" + ) + elif preload_params.get("merge"): + ds = xr.merge(dss) + self._cache_store.write_data( + ds, data_id, writer_id="dataset:zarr:file" + ) + else: + for ds, sub_fname in zip(dss, sub_fnames): + data_id = ( + f"{record}/{dirname}/" + f"{".".join(sub_fname.split(".")[:-1])}.zarr" + ) + self._cache_store.write_data( + ds, data_id, writer_id="dataset:zarr:file" + ) + LOG.info( + f"Merge is set to False. The sub-dataset is " + f"written to {data_id}" + ) + self._events[i].update("Preloaded", np.nan, "") + if self._is_cancelled: + break + self.close() + + self._thread_prepare = threading.Thread( + target=prepare, daemon=True, name="prepare_data" + ) + self._thread_prepare.start() + + +def is_jupyter(): + return "ZMQInteractiveShell" in IPython.get_ipython().__class__.__name__ diff --git a/xcube_zenodo/store.py b/xcube_zenodo/store.py index 3af893e..ea504f9 100644 --- a/xcube_zenodo/store.py +++ b/xcube_zenodo/store.py @@ -20,44 +20,63 @@ # SOFTWARE. 
import logging +import os from typing import Tuple, Iterator, Container, Any, Union import requests import xarray as xr from xcube.util.jsonschema import ( + JsonBooleanSchema, JsonObjectSchema, JsonStringSchema, ) from xcube.core.store import ( DataDescriptor, DataStore, + DataStoreError, DataTypeLike, new_data_store, ) from .constants import API_RECORDS_ENDPOINT +from .constants import COMPRESSED_FORMATS +from .constants import PRELOAD_CACHE_FOLDER +from .preload import PreloadHandle +from ._utils import estimate_file_format from ._utils import get_attrs_from_record from ._utils import is_supported_file_format -from ._utils import translate_data_id2uri +from ._utils import is_supported_compressed_file_format +from ._utils import translate_data_id2fs_path -_LOG = logging.getLogger("xcube") +LOG = logging.getLogger(__name__) class ZenodoDataStore(DataStore): """Implementation of the Zenodo data store defined in the ``xcube_zenodo`` plugin.""" - def __init__(self, access_token: str): + def __init__(self, access_token: str, preload_cache_folder: str = None): self._requests_params = {"access_token": access_token} self._https_data_store = new_data_store("https", root="zenodo.org") + self._cache_root = os.path.join( + os.getcwd(), preload_cache_folder or PRELOAD_CACHE_FOLDER + ) + self.cache_store = new_data_store("file", root=self._cache_root, max_depth=3) @classmethod def get_data_store_params_schema(cls) -> JsonObjectSchema: params = dict( access_token=JsonStringSchema( title="Zenodo access token.", - ) + ), + preload_cache_folder=JsonStringSchema( + title="Preload cache folder.", + description=( + "Datasets which are accessed using prelaod_data will be stored " + "in this folder in a prepared way." 
+ ), + ), ) return JsonObjectSchema( properties=dict(**params), @@ -71,7 +90,7 @@ def get_data_types(cls) -> Tuple[str, ...]: return store.get_data_types() def get_data_types_for_data(self, data_id: str) -> Tuple[str, ...]: - uri = translate_data_id2uri(data_id) + uri = translate_data_id2fs_path(data_id) return self._https_data_store.get_data_types_for_data(uri) def get_data_ids( @@ -104,13 +123,13 @@ def get_data_ids( page += 1 def has_data(self, data_id: str, data_type: str = None) -> bool: - uri = translate_data_id2uri(data_id) + uri = translate_data_id2fs_path(data_id) return self._https_data_store.has_data(data_id=uri, data_type=data_type) def describe_data( self, data_id: str, data_type: DataTypeLike = None ) -> DataDescriptor: - uri = translate_data_id2uri(data_id) + uri = translate_data_id2fs_path(data_id) descriptor = self._https_data_store.describe_data( data_id=uri, data_type=data_type ) @@ -121,7 +140,7 @@ def get_data_opener_ids( self, data_id: str = None, data_type: DataTypeLike = None ) -> Tuple[str, ...]: if data_id is not None: - uri = translate_data_id2uri(data_id) + uri = translate_data_id2fs_path(data_id) else: uri = data_id return self._https_data_store.get_data_opener_ids( @@ -132,7 +151,7 @@ def get_open_data_params_schema( self, data_id: str = None, opener_id: str = None ) -> JsonObjectSchema: if data_id is not None: - uri = translate_data_id2uri(data_id) + uri = translate_data_id2fs_path(data_id) else: uri = data_id return self._https_data_store.get_open_data_params_schema( @@ -142,14 +161,86 @@ def get_open_data_params_schema( def open_data( self, data_id: str, opener_id: str = None, **open_params ) -> xr.Dataset: - uri = translate_data_id2uri(data_id) - return self._https_data_store.open_data( - data_id=uri, opener_id=opener_id, **open_params + format_id = estimate_file_format(data_id) + if format_id in COMPRESSED_FORMATS: + if not self.cache_store.has_data(data_id): + raise DataStoreError( + f"The dataset {data_id} is stored in a 
compressed format. " + f"Please use store.preload_data({data_id}) first." + ) + return self.cache_store.open_data( + data_id=data_id, opener_id="dataset:zarr:file", **open_params + ) + elif self.cache_store.has_data(data_id): + return self.cache_store.open_data(data_id=data_id, **open_params) + else: + uri = translate_data_id2fs_path(data_id) + return self._https_data_store.open_data( + data_id=uri, opener_id=opener_id, **open_params + ) + + def preload_data(self, *data_ids: str, **preload_params) -> PreloadHandle: + schema = self.get_preload_data_params() + schema.validate_instance(preload_params) + preload_params["merge"] = preload_params.get("merge", False) + preload_params["monitor_preload"] = preload_params.get("monitor_preload", True) + data_ids_sel = [] + for data_id in data_ids: + format_id = estimate_file_format(data_id) + data_id_mod = data_id.replace(f".{format_id}", "/") + list_data_ids = self.cache_store.list_data_ids() + list_data_ids_mod = [i for i in list_data_ids if data_id_mod in i] + if list_data_ids_mod: + LOG.info( + f"{data_id} is already pre-loaded. The datasets can be " + f"opened with the following data IDs: " + f"\n{'\n'.join(str(item) for item in list_data_ids_mod)}" + ) + elif self.cache_store.has_data(data_id): + LOG.info(f"{data_id} is already pre-loaded.") + else: + if is_supported_compressed_file_format(data_id): + data_ids_sel.append(data_id) + else: + LOG.warning( + f"{data_id} cannot be preloaded. Only 'zip', 'tar', and " + "'tar.gz' compressed files are supported. The preload " + "request is discarded." 
+ ) + preload_handle = PreloadHandle( + self.cache_store, + *data_ids_sel, + cache_root=self._cache_root, + **preload_params, + ) + if data_ids_sel: + preload_handle.preload_data(*data_ids_sel, **preload_params) + return preload_handle + + def get_preload_data_params(self) -> JsonObjectSchema: + params = dict( + merge=JsonBooleanSchema( + title="Merge multiple dataset of compressed data IDs.", + description=( + "If True, xarray.merge is applied to the files stored in a " + "compressed format. If False, each dataset is stored individually. " + "The data ID will be extended by the filename." + ), + default=False, + ), + monitor_preload=JsonBooleanSchema( + title="Monitor preload_data method.", + description="If True, the progress of preload will be visualized", + default=True, + ), + ) + return JsonObjectSchema( + properties=dict(**params), + required=[], + additional_properties=False, ) - def search_data( - self, data_type: DataTypeLike = None, **search_params - ) -> Iterator[DataDescriptor]: + def search_data(self, data_type: DataTypeLike = None, **search_params): schema = self.get_search_params_schema() schema.validate_instance(search_params) raise NotImplementedError("search_data() operation is not supported.") From a7409f6c49f2479a44c98896cc93c1a7b404d202 Mon Sep 17 00:00:00 2001 From: konstntokas Date: Wed, 11 Dec 2024 16:32:17 +0100 Subject: [PATCH 2/4] yogesh pr addressed --- examples/zenodo_data_store_preload.ipynb | 28 ++++++++++++------------ xcube_zenodo/_utils.py | 6 ++--- xcube_zenodo/preload.py | 8 +++---- xcube_zenodo/store.py | 6 ++--- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/examples/zenodo_data_store_preload.ipynb b/examples/zenodo_data_store_preload.ipynb index 1ea6038..2bf61b1 100644 --- a/examples/zenodo_data_store_preload.ipynb +++ b/examples/zenodo_data_store_preload.ipynb @@ -31,8 +31,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 2.91 s, sys: 193 ms, total: 3.1 s\n", - "Wall time: 1.24 
s\n" + "CPU times: user 2.5 s, sys: 422 ms, total: 2.92 s\n", + "Wall time: 2.65 s\n" ] } ], @@ -65,8 +65,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 33.9 ms, sys: 7.05 ms, total: 40.9 ms\n", - "Wall time: 40.2 ms\n" + "CPU times: user 40.8 ms, sys: 10.6 ms, total: 51.4 ms\n", + "Wall time: 61.2 ms\n" ] }, { @@ -90,7 +90,7 @@ "type": "object" }, "text/plain": [ - "" + "" ] }, "execution_count": 2, @@ -113,15 +113,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.04 ms, sys: 4 μs, total: 5.05 ms\n", - "Wall time: 4.98 ms\n" + "CPU times: user 8.1 ms, sys: 43 μs, total: 8.14 ms\n", + "Wall time: 7.66 ms\n" ] } ], @@ -140,15 +140,15 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 30 μs, sys: 2 μs, total: 32 μs\n", - "Wall time: 34.1 μs\n" + "CPU times: user 68 μs, sys: 11 μs, total: 79 μs\n", + "Wall time: 83.7 μs\n" ] }, { @@ -172,10 +172,10 @@ "type": "object" }, "text/plain": [ - "" + "" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -195,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { diff --git a/xcube_zenodo/_utils.py b/xcube_zenodo/_utils.py index a630d43..6468a4f 100644 --- a/xcube_zenodo/_utils.py +++ b/xcube_zenodo/_utils.py @@ -66,7 +66,7 @@ def get_attrs_from_record( return attrs -def estimate_file_format(data_id: str) -> Union[str, None]: +def identify_file_format(data_id: str) -> Union[str, None]: for key, val in MAP_FILE_EXTENSION_FORMAT.items(): if data_id.endswith(key.lower()): return val @@ -74,11 +74,11 @@ def estimate_file_format(data_id: str) -> Union[str, None]: def is_supported_file_format(data_id: str) -> bool: - return 
estimate_file_format(data_id) is not None + return identify_file_format(data_id) is not None def is_supported_compressed_file_format(data_id: str) -> bool: - return estimate_file_format(data_id) in COMPRESSED_FORMATS + return identify_file_format(data_id) in COMPRESSED_FORMATS def translate_data_id2fs_path(data_id: str) -> str: diff --git a/xcube_zenodo/preload.py b/xcube_zenodo/preload.py index ff1387f..656b36e 100644 --- a/xcube_zenodo/preload.py +++ b/xcube_zenodo/preload.py @@ -37,7 +37,7 @@ from xcube.core.store import DataStoreError from xcube.core.store import MutableDataStore -from ._utils import estimate_file_format +from ._utils import identify_file_format from ._utils import translate_data_id2uri LOG = logging.getLogger(__name__) @@ -49,7 +49,7 @@ def __init__(self, data_id: str, total_size: Union[int, float]): self.data_id = data_id self.status = "Not started" self.progress = 0.0 - self.message = "Preloading not started jet." + self.message = "Preloading not started yet." self.total_size = total_size self._callback = None @@ -114,7 +114,7 @@ def close(self): self._cache_store.delete_data(event.data_id) else: record, filename = event.data_id.split("/") - format_id = estimate_file_format(event.data_id) + format_id = identify_file_format(event.data_id) dirname = filename.replace(f".{format_id}", "") data_id_mod = f"{record}/{dirname}" list_data_ids = self._cache_store.list_data_ids() @@ -247,7 +247,7 @@ def prepare(): time.sleep(1) self._events[i].update("File processing started", np.nan, "") record, filename = data_id.split("/") - format_id = estimate_file_format(data_id) + format_id = identify_file_format(data_id) dirname = filename.replace(f".{format_id}", "") extract_dir = os.path.join(self._download_folder, record, dirname) dss = [] diff --git a/xcube_zenodo/store.py b/xcube_zenodo/store.py index ea504f9..76e046b 100644 --- a/xcube_zenodo/store.py +++ b/xcube_zenodo/store.py @@ -42,7 +42,7 @@ from .constants import COMPRESSED_FORMATS from .constants 
import PRELOAD_CACHE_FOLDER from .preload import PreloadHandle -from ._utils import estimate_file_format +from ._utils import identify_file_format from ._utils import get_attrs_from_record from ._utils import is_supported_file_format from ._utils import is_supported_compressed_file_format @@ -161,7 +161,7 @@ def get_open_data_params_schema( def open_data( self, data_id: str, opener_id: str = None, **open_params ) -> xr.Dataset: - format_id = estimate_file_format(data_id) + format_id = identify_file_format(data_id) if format_id in COMPRESSED_FORMATS: if not self.cache_store.has_data(data_id): raise DataStoreError( @@ -186,7 +186,7 @@ def preload_data(self, *data_ids: str, **preload_params) -> PreloadHandle: preload_params["monitor_preload"] = preload_params.get("monitor_preload", True) data_ids_sel = [] for data_id in data_ids: - format_id = estimate_file_format(data_id) + format_id = identify_file_format(data_id) data_id_mod = data_id.replace(f".{format_id}", "/") list_data_ids = self.cache_store.list_data_ids() list_data_ids_mod = [i for i in list_data_ids if data_id_mod in i] From 03973f93cd1121bc78d7266a062a86448c725fe2 Mon Sep 17 00:00:00 2001 From: konstntokas Date: Fri, 13 Dec 2024 17:37:58 +0100 Subject: [PATCH 3/4] reveiw addressed --- .gitignore | 2 +- environment.yml | 2 +- examples/zenodo_data_store.ipynb | 188 +++++++++++------------ examples/zenodo_data_store_preload.ipynb | 156 +++++++++---------- pyproject.toml | 2 +- xcube_zenodo/_utils.py | 4 +- xcube_zenodo/constants.py | 6 +- xcube_zenodo/preload.py | 73 +++++---- xcube_zenodo/store.py | 27 ++-- 9 files changed, 231 insertions(+), 229 deletions(-) diff --git a/.gitignore b/.gitignore index 2f41357..435dc68 100644 --- a/.gitignore +++ b/.gitignore @@ -161,4 +161,4 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
#.idea/ -examples/preload_cache/ \ No newline at end of file +examples/zenodo_cache/ \ No newline at end of file diff --git a/environment.yml b/environment.yml index d170b8e..f0202df 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: dependencies: # Required - python>=3.10 - - IPython + - fsspec - numpy - requests - tabulate diff --git a/examples/zenodo_data_store.ipynb b/examples/zenodo_data_store.ipynb index 43af87c..76d0a1c 100644 --- a/examples/zenodo_data_store.ipynb +++ b/examples/zenodo_data_store.ipynb @@ -24,15 +24,15 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.04 s, sys: 234 ms, total: 3.27 s\n", - "Wall time: 1.41 s\n" + "CPU times: user 7 μs, sys: 1 μs, total: 8 μs\n", + "Wall time: 10 μs\n" ] } ], @@ -52,15 +52,15 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 31.6 ms, sys: 13.8 ms, total: 45.5 ms\n", - "Wall time: 44.6 ms\n" + "CPU times: user 33.4 ms, sys: 8.93 ms, total: 42.3 ms\n", + "Wall time: 41.6 ms\n" ] }, { @@ -71,6 +71,11 @@ "access_token": { "title": "Zenodo access token.", "type": "string" + }, + "preload_cache_folder": { + "description": "Datasets which are accessed using prelaod_data will be stored in this folder in a prepared way.", + "title": "Preload cache folder.", + "type": "string" } }, "required": [ @@ -79,10 +84,10 @@ "type": "object" }, "text/plain": [ - "" + "" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -102,21 +107,21 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 3.44 ms, sys: 1.04 ms, total: 4.48 ms\n", - "Wall time: 4.43 ms\n" + "CPU times: 
user 5.17 ms, sys: 15 μs, total: 5.18 ms\n", + "Wall time: 5.1 ms\n" ] } ], "source": [ "%%time\n", - "access_token = \"fill in you Zenodo access token here\"\n", + "access_token = \"XVYlYi840itHkdbVXN2PtQk9cBeIY2WQ9OlVQeIssaP9YgMDAAyjlnIc6H6l\"\n", "store = new_data_store(\"zenodo\", access_token=access_token)" ] }, @@ -129,33 +134,33 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 365 ms, sys: 30.8 ms, total: 396 ms\n", - "Wall time: 2min 18s\n" + "CPU times: user 43.9 ms, sys: 5.57 ms, total: 49.4 ms\n", + "Wall time: 25.5 s\n" ] }, { "data": { "text/plain": [ - "['14054126/AVIRIS-NG-methane-emissions_v2024-11-04.nc',\n", - " '14052963/CESM.b.T31_g37.HighPrec.SO.pop.DIC.nc',\n", - " '14052963/CESM.b.T31_g37.Equilibration.500-1000.pop.nc',\n", - " '14052963/CESM.b.T31_g37.NoPrec.SH.cam.nc',\n", - " '14052963/CESM.b.T31_g37.HighPrec.SH.cam.nc',\n", - " '14052963/CESM.b.T31_g37.Equilibration.0-500.pop.nc',\n", - " '14052963/CESM.b.T31_g37.NoPrec.SO.pop.nc',\n", - " '14052963/CESM.b.T31_g37.NoPrec.SO.pop.DIC.nc',\n", - " '14052963/CESM.b.T31_g37.HighPrec.SO.pop.nc',\n", - " '14051280/Mitoflash_CM-8bit.tif']" + "['14446470/pretrained.zip',\n", + " '14446770/mastodon-sc/mastodon-deep-lineage-mastodon-deep-lineage-0.4.3.zip',\n", + " '14446677/MeTech/haptic-oriring-public-v1.1.zip',\n", + " '14446628/magicrjk/treeofrobots-ToR-V1.0.zip',\n", + " '14446620/Ahmet-Agaoglu/The-Corridor-Method-0.1.0.zip',\n", + " '14446612/FAIRmat-NFDI/pynxtools-xps-v0.4.9.zip',\n", + " '14446605/PowerGridModel/power-grid-model-v1.10.22.zip',\n", + " '14446598/FerdinandKlingenberg/tree-cover-density-comparison-V1.0.0.zip',\n", + " '14446582/KatieWillis/DriveSelectionBalance-DriveSelectionBalance.zip',\n", + " '14441477/simulation_data.zip']" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -182,7 +187,7 @@ 
}, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -271,10 +276,10 @@ ] }, "text/plain": [ - "" + "" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -292,15 +297,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 192 μs, sys: 19 μs, total: 211 μs\n", - "Wall time: 214 μs\n" + "CPU times: user 217 μs, sys: 20 μs, total: 237 μs\n", + "Wall time: 239 μs\n" ] }, { @@ -348,10 +353,10 @@ "type": "object" }, "text/plain": [ - "" + "" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -364,15 +369,15 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 203 ms, sys: 3.83 ms, total: 207 ms\n", - "Wall time: 206 ms\n" + "CPU times: user 21.2 ms, sys: 4.01 ms, total: 25.2 ms\n", + "Wall time: 24.1 ms\n" ] }, { @@ -750,9 +755,9 @@ "Data variables:\n", " band_1 (y, x) uint8 25GB dask.array<chunksize=(1024, 1024), meta=np.ndarray>\n", "Attributes:\n", - " source: https://zenodo.org/records/8154445/files/planet_canopy_cover_30..." + " dtype='float64', name='y', length=149363))
  • source :
    https://zenodo.org/records/8154445/files/planet_canopy_cover_30m_v0.1.tif
  • " ], "text/plain": [ " Size: 25GB\n", @@ -876,7 +881,7 @@ " source: https://zenodo.org/records/8154445/files/planet_canopy_cover_30..." ] }, - "execution_count": 14, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -899,24 +904,24 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 321 ms, sys: 68.4 ms, total: 389 ms\n", - "Wall time: 3.06 s\n" + "CPU times: user 661 ms, sys: 122 ms, total: 783 ms\n", + "Wall time: 4 s\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 15, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, @@ -945,15 +950,15 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.15 ms, sys: 0 ns, total: 5.15 ms\n", - "Wall time: 4.9 ms\n" + "CPU times: user 5.73 ms, sys: 1.12 ms, total: 6.85 ms\n", + "Wall time: 161 ms\n" ] }, { @@ -962,7 +967,7 @@ "1" ] }, - "execution_count": 16, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -979,7 +984,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1357,9 +1362,9 @@ "Data variables:\n", " band_1 (y, x) uint8 25GB dask.array<chunksize=(1024, 1024), meta=np.ndarray>\n", "Attributes:\n", - " source: https://zenodo.org/records/8154445/files/planet_canopy_cover_30..." + " dtype='float64', name='y', length=149363))
  • source :
    https://zenodo.org/records/8154445/files/planet_canopy_cover_30m_v0.1.tif
  • " ], "text/plain": [ " Size: 25GB\n", @@ -1483,7 +1488,7 @@ " source: https://zenodo.org/records/8154445/files/planet_canopy_cover_30..." ] }, - "execution_count": 17, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1509,15 +1514,15 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 83.2 ms, sys: 15.3 ms, total: 98.4 ms\n", - "Wall time: 9.03 s\n" + "CPU times: user 65.9 ms, sys: 21.8 ms, total: 87.7 ms\n", + "Wall time: 8.05 s\n" ] }, { @@ -1910,7 +1915,7 @@ " Modelrun: ERA5weather\n", " Modelconfig: newfriction\n", " TimeStamp: 11-Jun-2024 13:01:44\n", - " Notes: Statistics based on ERA5 hindcast run from 1980 to 2022. \\n...