leap-stc · jbusecke · Aug 23, 2024 · Jul 12, 2024 · Jul 12, 2024 · Jul 12, 2024
diff --git a/.github/workflows/deploy_recipe.yaml b/.github/workflows/deploy_recipe.yaml
@@ -25,7 +25,7 @@ jobs:
       - name: "Install dependencies"
         run: |
           python -m pip install --upgrade pip
-          pip install pangeo-forge-runner
+          pip install git+https://github.com/leap-stc/pangeo-forge-runner
       - name: "Deploy recipes"
         run: |
           pangeo-forge-runner bake \
@@ -42,6 +42,7 @@ jobs:
         # AT that point, screw it, not worth it.
         run: |
           jobname="${{ env.JOB_NAME }}"
+          echo "$JOB_NAME"
           while true; do
             count=$(gcloud dataflow jobs list --status=active --filter="name:${jobname}" --format="value(id)" | wc -l)
             echo "Active Dataflow jobs: $count"

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
@@ -4,10 +4,13 @@
 repo_path = os.environ['GITHUB_REPOSITORY']
 FEEDSTOCK_NAME = repo_path.split('/')[-1]
 
-c.Bake.prune = 1
+c.Bake.prune = False
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
-c.DataflowBakery.use_dataflow_prime = True
-c.DataflowBakery.max_workers = 50
+c.DataflowBakery.use_dataflow_prime = False
+c.DataflowBakery.machine_type = "e2-highmem-16"
+c.DataflowBakery.disk_size_gb = 400
+c.DataflowBakery.use_shuffle = False
+c.DataflowBakery.max_num_workers = 1
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (
     "[email protected]"

diff --git a/feedstock/catalog.yaml b/feedstock/catalog.yaml
@@ -1,16 +1,9 @@
 # All the information important to cataloging.
-"ncviewjs:meta_yaml_url": "https://github.com/leap-stc/proto_feedstock/blob/main/feedstock/meta.yaml"
+"ncviewjs:meta_yaml_url": "https://github.com/leap-stc/eNATL_feedstock/blob/main/feedstock/meta.yaml"
 tags:
-  - my-custom-tag
   - zarr
+  - ocean
 stores:
-  - id: "small"
-    name: "The cool small Proto Dataset"
-    url: "gs://leap-scratch/data-library/feedstocks/proto_feedstock/small.zarr"
-    "ncviewjs:rechunking":
-      - path: "gs://some-bucket/small.zarr"
-        use_case: "multiscales"
-
-  - id: "large"
-    name: "The even cooler large Proto Dataset" # no pyramids
-    url: "gs://leap-scratch/data-library/feedstocks/proto_feedstock/large.zarr"
+  - id: "enatl60-blbt02"
+    name: "Needs a name"
+    url: "gs://leap-persistent/data-library/feedstocks/eNATL_feedstock/eNATL60-BLBT02.zarr"
diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
@@ -1,36 +1,80 @@
 import xarray as xr
+import pandas as pd
 import apache_beam as beam
-from pangeo_forge_recipes.patterns import pattern_from_file_sequence
+import pooch
+from pangeo_forge_recipes.patterns import ConcatDim, FilePattern
 from pangeo_forge_recipes.transforms import (
     ConsolidateMetadata,
     ConsolidateDimensionCoordinates,
-    OpenURLWithFSSpec,
     OpenWithXarray,
     StoreToZarr,
 )
 
+from leap_data_management_utils.data_management_transforms import (
+    Copy,
+    get_catalog_store_urls,
+)
+
+catalog_store_urls = get_catalog_store_urls("feedstock/catalog.yaml")  # indexed by store id further down; presumably {id: url} read from the catalog — confirm
+
+
+dates = pd.date_range("2009-07-01", "2010-06-30", freq="D")  # one timestamp per day, both end dates included
+
+records = {  # calendar month number -> Zenodo record id used in that month's file URLs (see make_full_path)
+    1: "10261988",
+    2: "10260907",
+    3: "10260980",
+    4: "10261078",
+    5: "10261126",
+    6: "10261192",
+    7: "10261274",
+    8: "10261349",
+    9: "10261461",
+    10: "10261540",
+    11: "10262356",
+    12: "10261643",
+}
+
+
+def make_full_path(time):
+    """Return the Zenodo URL of the daily eNATL60-BLBT02 file for ``time``.
+
+    ``time`` must expose integer ``year``, ``month`` and ``day`` attributes;
+    the month selects the record id from the module-level ``records`` mapping.
+    """
+    # File names embed the date as y<year>m<MM>d<DD>, with month and day
+    # zero-padded to two digits.
+    date = f"y{time.year}m{time.month:02d}d{time.day:02d}"
+    record = records[time.month]
+    return (
+        f"https://zenodo.org/records/{record}/files/eNATL60-BLBT02_{date}.1d_TSW_60m.nc"
+    )
+
+
+time_concat_dim = ConcatDim("time", dates)  # files are concatenated along "time", one key per daily timestamp
+pattern = FilePattern(make_full_path, time_concat_dim)  # make_full_path turns each key into its file URL
+# pattern = pattern.prune(60)  # NOTE(review): disabled; presumably shortens the pattern for test runs — confirm or delete
+
 
-# Common Parameters
-days = range(1, 32)
-dataset_url = "https://zenodo.org/records/10513552/files"
+class OpenWithPooch(beam.PTransform):  # Beam transform: swaps the URL in each (key, url) element for what pooch.retrieve returns
+    @staticmethod
+    def _open_pooch(url: str) -> str:
+        return pooch.retrieve(url=url, known_hash=None)  # known_hash=None: per the pooch docs the download is not checksum-verified
 
-## Monthly version
-input_urls = [
-    f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc" for d in days
-]
-pattern = pattern_from_file_sequence(input_urls, concat_dim="time")
+    def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
+        return pcoll | "open" >> beam.MapTuple(lambda k, v: (k, self._open_pooch(v)))  # keys pass through unchanged
 
 
 class Preprocess(beam.PTransform):
     """Custom transform to fix invalid time dimension"""
 
     @staticmethod
     def _set_coords(ds: xr.Dataset) -> xr.Dataset:
-        t_new = xr.DataArray(ds.time_counter.data, dims=["time"])
-        ds = ds.assign_coords(time=t_new)
-        ds = ds.drop(["time_counter"])
-        ds = ds.set_coords(["deptht", "depthw", "nav_lon", "nav_lat", "tmask"])
-
+        ds = ds.rename({"time_counter": "time"})  # match the "time" name used by the pattern's ConcatDim
+        ds = ds.set_coords(("nav_lat", "nav_lon"))
+        ds.attrs["deptht"] = ds.deptht.values[0]  # keep the first deptht value as an attribute before the variable is dropped
+        ds = ds.drop_vars("deptht")  # drop_vars is the non-deprecated way to drop a variable (Dataset.drop is deprecated)
+        ds = ds[["vosaline", "votemper", "vovecrtz"]]  # retain only these three data variables
         return ds
 
     def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
@@ -39,20 +83,21 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
         )
 
 
-eNATL60_BLBT02 = (
+eNATL60BLBT02 = (  # recipe object referenced from feedstock/meta.yaml as "eNATL60:eNATL60BLBT02"
     beam.Create(pattern.items())
-    | OpenURLWithFSSpec()
-    | OpenWithXarray(
-        xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
-        load=True,
-        copy_to_local=True,
-    )
+    # | OpenURLWithFSSpec(max_concurrency=1)
+    | OpenWithPooch()  # replaces each URL with the result of pooch.retrieve
+    | OpenWithXarray()
+    # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
+    # load=True,
+    # copy_to_local=True,)
     | Preprocess()
     | StoreToZarr(
-        store_name="eNATL60_BLBT02.zarr",
+        store_name="eNATL60-BLBT02.zarr",  # same file name as the last path segment of the store url in feedstock/catalog.yaml
         combine_dims=pattern.combine_dim_keys,
-        target_chunks={"x": 2000, "y": 2000, "time": 2},
+        target_chunks={"time": 30, "y": 900, "x": 900},
     )
     | ConsolidateDimensionCoordinates()
     | ConsolidateMetadata()
+    | Copy(target=catalog_store_urls["enatl60-blbt02"])  # target looked up by the store id declared in feedstock/catalog.yaml
 )
diff --git a/feedstock/meta.yaml b/feedstock/meta.yaml
@@ -2,8 +2,8 @@ title: "LEAP Data Library"
 description: >
   eNATL60-TSW-60m is an extraction of a very high resolution oceanic simulation of the North Atlantic performed at MEOM, IGE (FRANCE)
 recipes:
-  - id: eNATL60_BLBT02
-    object: "eNATL60:eNATL60_BLBT02"
+  - id: enatl60-blbt02
+    object: "eNATL60:eNATL60BLBT02"
 provenance:
   providers:
     - name: "Zenodo"
@@ -23,4 +23,4 @@ maintainers:
     github: jbusecke
   - name: "Charles Stern"
     orcid: 0000-0002-4078-0852
-    github: cisaacstern
+    github: cisaacstern
diff --git a/feedstock/requirements.txt b/feedstock/requirements.txt
@@ -1,5 +1,6 @@
-pangeo-forge-recipes==0.10.4
+pangeo-forge-recipes==0.10.7
 gcsfs
-apache-beam[gcp]
+apache-beam[gcp] >= 2.58.0
 leap-data-management-utils==0.0.12
-xarray=2024.05.0
+xarray==2024.05.0
+pooch