diff --git a/.github/workflows/deploy_recipe.yaml b/.github/workflows/deploy_recipe.yaml index 02c92b8..d3ec80f 100644 --- a/.github/workflows/deploy_recipe.yaml +++ b/.github/workflows/deploy_recipe.yaml @@ -42,6 +42,7 @@ jobs: # AT that point, screw it, not worth it. run: | jobname="${{ env.JOB_NAME }}" + echo "$JOB_NAME" while true; do count=$(gcloud dataflow jobs list --status=active --filter="name:${jobname}" --format="value(id)" | wc -l) echo "Active Dataflow jobs: $count" diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py index 8817f77..ffd7b02 100644 --- a/configs/config_dataflow.py +++ b/configs/config_dataflow.py @@ -4,10 +4,12 @@ repo_path = os.environ['GITHUB_REPOSITORY'] FEEDSTOCK_NAME = repo_path.split('/')[-1] -c.Bake.prune = 1 +c.Bake.prune = True c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery" -c.DataflowBakery.use_dataflow_prime = True -c.DataflowBakery.max_workers = 50 +c.DataflowBakery.use_dataflow_prime = False +c.DataflowBakery.machine_type = "e2-highmem-8" # A 1-year run needed at most 50 GB of RAM on a single worker; this machine type has 64 GB. +c.DataflowBakery.disk_size_gb = 400 +c.DataflowBakery.max_num_workers = 1 c.DataflowBakery.use_public_ips = True c.DataflowBakery.service_account_email = ( "leap-community-bakery@leap-pangeo.iam.gserviceaccount.com" diff --git a/feedstock/catalog.yaml b/feedstock/catalog.yaml index 8a7d348..4370f86 100644 --- a/feedstock/catalog.yaml +++ b/feedstock/catalog.yaml @@ -1,16 +1,9 @@ # All the information important to cataloging. 
-"ncviewjs:meta_yaml_url": "https://github.com/leap-stc/proto_feedstock/blob/main/feedstock/meta.yaml" +"ncviewjs:meta_yaml_url": "https://github.com/leap-stc/eNATL_feedstock/blob/main/feedstock/meta.yaml" tags: - - my-custom-tag - zarr + - ocean stores: - - id: "small" - name: "The cool small Proto Dataset" - url: "gs://leap-scratch/data-library/feedstocks/proto_feedstock/small.zarr" - "ncviewjs:rechunking": - - path: "gs://some-bucket/small.zarr" - use_case: "multiscales" - - - id: "large" - name: "The even cooler large Proto Dataset" # no pyramids - url: "gs://leap-scratch/data-library/feedstocks/proto_feedstock/large.zarr" + - id: "enatl60-blbt02" + name: "Needs a name" + url: "gs://leap-persistent/data-library/feedstocks/eNATL_feedstock/eNATL60-BLBT02.zarr" diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py index 19f7434..487f0fd 100644 --- a/feedstock/eNATL60.py +++ b/feedstock/eNATL60.py @@ -1,24 +1,68 @@ import xarray as xr +import pandas as pd import apache_beam as beam -from pangeo_forge_recipes.patterns import pattern_from_file_sequence +import pooch +from pangeo_forge_recipes.patterns import ConcatDim, FilePattern from pangeo_forge_recipes.transforms import ( ConsolidateMetadata, ConsolidateDimensionCoordinates, - OpenURLWithFSSpec, OpenWithXarray, StoreToZarr, ) +from leap_data_management_utils.data_management_transforms import ( + Copy, + get_catalog_store_urls, +) + +catalog_store_urls = get_catalog_store_urls("feedstock/catalog.yaml") + + +dates = pd.date_range("2009-07-01", "2010-06-30", freq="D") + +records = { + 1: "10261988", + 2: "10260907", + 3: "10260980", + 4: "10261078", + 5: "10261126", + 6: "10261192", + 7: "10261274", + 8: "10261349", + 9: "10261461", + 10: "10261540", + 11: "10262356", + 12: "10261643", +} + + +def make_full_path(time): + record = str(records[time.month]) + date = ( + "y" + + str(time.year) + + "m" + + str("{:02d}".format(time.month)) + + "d" + + str("{:02d}".format(time.day)) + ) + return ( + 
f"https://zenodo.org/records/{record}/files/eNATL60-BLBT02_{date}.1d_TSW_60m.nc" + ) + + +time_concat_dim = ConcatDim("time", dates) +pattern = FilePattern(make_full_path, time_concat_dim) +# pattern = pattern.prune(60) + -# Common Parameters -days = range(1, 32) -dataset_url = "https://zenodo.org/records/10513552/files" +class OpenWithPooch(beam.PTransform): + @staticmethod + def _open_pooch(url: str) -> str: + return pooch.retrieve(url=url, known_hash=None) -## Monthly version -input_urls = [ - f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc" for d in days -] -pattern = pattern_from_file_sequence(input_urls, concat_dim="time") + def expand(self, pcoll: beam.PCollection) -> beam.PCollection: + return pcoll | "open" >> beam.MapTuple(lambda k, v: (k, self._open_pooch(v))) class Preprocess(beam.PTransform): @@ -26,11 +70,11 @@ class Preprocess(beam.PTransform): @staticmethod def _set_coords(ds: xr.Dataset) -> xr.Dataset: - t_new = xr.DataArray(ds.time_counter.data, dims=["time"]) - ds = ds.assign_coords(time=t_new) - ds = ds.drop(["time_counter"]) - ds = ds.set_coords(["deptht", "depthw", "nav_lon", "nav_lat", "tmask"]) - + ds = ds.rename({"time_counter": "time"}) + ds = ds.set_coords(("nav_lat", "nav_lon")) + ds.attrs["deptht"] = ds.deptht.values[0] + ds = ds.drop("deptht") + ds = ds[["vosaline", "votemper", "vovecrtz"]] return ds def expand(self, pcoll: beam.PCollection) -> beam.PCollection: @@ -39,20 +83,21 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection: ) -eNATL60_BLBT02 = ( +eNATL60BLBT02 = ( beam.Create(pattern.items()) - | OpenURLWithFSSpec() - | OpenWithXarray( - xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"}, - load=True, - copy_to_local=True, - ) + # | OpenURLWithFSSpec(max_concurrency=1) + | OpenWithPooch() + | OpenWithXarray() + # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"}, + # load=True, + # copy_to_local=True,) | Preprocess() | StoreToZarr( - store_name="eNATL60_BLBT02.zarr", + 
store_name="eNATL60-BLBT02.zarr", combine_dims=pattern.combine_dim_keys, - target_chunks={"x": 2000, "y": 2000, "time": 2}, + target_chunks={"time": 30, "y": 900, "x": 900}, ) | ConsolidateDimensionCoordinates() | ConsolidateMetadata() + | Copy(target=catalog_store_urls["enatl60-blbt02"]) ) diff --git a/feedstock/meta.yaml b/feedstock/meta.yaml index fdc6c71..7d65db9 100644 --- a/feedstock/meta.yaml +++ b/feedstock/meta.yaml @@ -2,8 +2,8 @@ title: "LEAP Data Library" description: > eNATL60-TSW-60m is an extraction of a very high resolution oceanic simulation of the North Atlantic performed at MEOM, IGE (FRANCE) recipes: - - id: eNATL60_BLBT02 - object: "eNATL60:eNATL60_BLBT02" + - id: enatl60-blbt02 + object: "eNATL60:eNATL60BLBT02" provenance: providers: - name: "Zenodo" @@ -23,4 +23,4 @@ maintainers: github: jbusecke - name: "Charles Stern" orcid: 0000-0002-4078-0852 - github: cisaacstern \ No newline at end of file + github: cisaacstern diff --git a/feedstock/requirements.txt b/feedstock/requirements.txt index 2250fc2..81609bf 100644 --- a/feedstock/requirements.txt +++ b/feedstock/requirements.txt @@ -1,5 +1,6 @@ -pangeo-forge-recipes==0.10.4 +pangeo-forge-recipes==0.10.7 gcsfs -apache-beam[gcp] +apache-beam[gcp] >= 2.58.0 leap-data-management-utils==0.0.12 -xarray=2024.05.0 \ No newline at end of file +xarray==2024.05.0 +pooch