From 71140d0bc1b1de117e7607786ecad446e2d23ee0 Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 11:17:41 +0100
Subject: [PATCH 01/65] Fix recipe_id naming

---
 feedstock/meta.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/feedstock/meta.yaml b/feedstock/meta.yaml
index fdc6c71..4d0928d 100644
--- a/feedstock/meta.yaml
+++ b/feedstock/meta.yaml
@@ -2,7 +2,7 @@ title: "LEAP Data Library"
 description: >
   eNATL60-TSW-60m is an extraction of a very high resolution oceanic simulation of the North Atlantic performed at MEOM, IGE (FRANCE)
 recipes:
-  - id: eNATL60_BLBT02
+  - id: eNATL60-BLBT02
     object: "eNATL60:eNATL60_BLBT02"
 provenance:
   providers:
@@ -23,4 +23,4 @@ maintainers:
     github: jbusecke
   - name: "Charles Stern"
     orcid: 0000-0002-4078-0852
-    github: cisaacstern
\ No newline at end of file
+    github: cisaacstern

From 1ed8bba54e1ae0b15ad7420f0f45abe5ff883965 Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 11:28:13 +0100
Subject: [PATCH 02/65] Update catalog.yaml

---
 feedstock/catalog.yaml | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/feedstock/catalog.yaml b/feedstock/catalog.yaml
index 8a7d348..79f91f4 100644
--- a/feedstock/catalog.yaml
+++ b/feedstock/catalog.yaml
@@ -1,16 +1,9 @@
 # All the information important to cataloging.
-"ncviewjs:meta_yaml_url": "https://github.com/leap-stc/proto_feedstock/blob/main/feedstock/meta.yaml"
+"ncviewjs:meta_yaml_url": "https://github.com/leap-stc/eNATL_feedstock/blob/main/feedstock/meta.yaml"
 tags:
-  - my-custom-tag
   - zarr
+  - ocean
 stores:
-  - id: "small"
-    name: "The cool small Proto Dataset"
-    url: "gs://leap-scratch/data-library/feedstocks/proto_feedstock/small.zarr"
-    "ncviewjs:rechunking":
-      - path: "gs://some-bucket/small.zarr"
-        use_case: "multiscales"
-
-  - id: "large"
-    name: "The even cooler large Proto Dataset" # no pyramids
-    url: "gs://leap-scratch/data-library/feedstocks/proto_feedstock/large.zarr"
+  - id: "eNATL60-BLBT02"
+    name: "Needs a name"
+    url: "gs://leap-scratch/data-library/feedstocks/eNATL_feedstock/eNATL60-BLBT02.zarr"

From 6c4f75d19b11d3831955ad721ad2d698da9411e1 Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 11:43:41 +0100
Subject: [PATCH 03/65] Update catalog.yaml

---
 feedstock/catalog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/catalog.yaml b/feedstock/catalog.yaml
index 79f91f4..74a6520 100644
--- a/feedstock/catalog.yaml
+++ b/feedstock/catalog.yaml
@@ -4,6 +4,6 @@ tags:
   - zarr
   - ocean
 stores:
-  - id: "eNATL60-BLBT02"
+  - id: "enatl60-blbt02"
     name: "Needs a name"
     url: "gs://leap-scratch/data-library/feedstocks/eNATL_feedstock/eNATL60-BLBT02.zarr"

From cc9938526c77e3bd2b3b090cc3f71af65d2618e2 Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 11:43:56 +0100
Subject: [PATCH 04/65] Update meta.yaml

---
 feedstock/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/meta.yaml b/feedstock/meta.yaml
index 4d0928d..ec449bb 100644
--- a/feedstock/meta.yaml
+++ b/feedstock/meta.yaml
@@ -2,7 +2,7 @@ title: "LEAP Data Library"
 description: >
   eNATL60-TSW-60m is an extraction of a very high resolution oceanic simulation of the North Atlantic performed at MEOM, IGE (FRANCE)
 recipes:
-  - id: eNATL60-BLBT02
+  - id: enatl60-blbt02
     object: "eNATL60:eNATL60_BLBT02"
 provenance:
   providers:
From d39ce9024630ad21766fa1ae49ad211c29e59216 Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 11:46:51 +0100
Subject: [PATCH 05/65] fix dependencies

---
 feedstock/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/requirements.txt b/feedstock/requirements.txt
index 2250fc2..e843367 100644
--- a/feedstock/requirements.txt
+++ b/feedstock/requirements.txt
@@ -2,4 +2,4 @@ pangeo-forge-recipes==0.10.4
 gcsfs
 apache-beam[gcp]
 leap-data-management-utils==0.0.12
-xarray=2024.05.0
\ No newline at end of file
+xarray==2024.05.0

From 66b4088a1296e3832ffd2f6e9a176d00dd33da4d Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 11:52:31 +0100
Subject: [PATCH 06/65] bump pgf-recipes

---
 feedstock/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/requirements.txt b/feedstock/requirements.txt
index e843367..aac4838 100644
--- a/feedstock/requirements.txt
+++ b/feedstock/requirements.txt
@@ -1,4 +1,4 @@
-pangeo-forge-recipes==0.10.4
+pangeo-forge-recipes==0.10.7
 gcsfs
 apache-beam[gcp]
 leap-data-management-utils==0.0.12

From 6816b1ad2c968bafa5a16974aa4c368ecd0ac29f Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 14:31:32 +0100
Subject: [PATCH 07/65] Update config_dataflow.py

---
 configs/config_dataflow.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 8817f77..4fca086 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -6,7 +6,8 @@
 c.Bake.prune = 1
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
-c.DataflowBakery.use_dataflow_prime = True
+c.DataflowBakery.use_dataflow_prime = False
+c.DataflowBakery.machine_type = "n2-highmem-8"
 c.DataflowBakery.max_workers = 50
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (

From 92102ab51e2fcad5737cd8f7548719c0acfa86c9 Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 15:03:40 +0100
Subject: [PATCH 08/65] move final output to persistent bucket

---
 feedstock/catalog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/catalog.yaml b/feedstock/catalog.yaml
index 74a6520..4370f86 100644
--- a/feedstock/catalog.yaml
+++ b/feedstock/catalog.yaml
@@ -6,4 +6,4 @@ tags:
 stores:
   - id: "enatl60-blbt02"
     name: "Needs a name"
-    url: "gs://leap-scratch/data-library/feedstocks/eNATL_feedstock/eNATL60-BLBT02.zarr"
+    url: "gs://leap-persistent/data-library/feedstocks/eNATL_feedstock/eNATL60-BLBT02.zarr"

From 363fb00b710d2caeb461ac1e74312a907e49de48 Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 15:06:29 +0100
Subject: [PATCH 09/65] Re add logic for copy stage

---
 feedstock/eNATL60.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 19f7434..fa86ead 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -1,5 +1,11 @@
 import xarray as xr
 import apache_beam as beam
+from leap_data_management_utils.data_management_transforms import (
+    Copy,
+    InjectAttrs,
+    get_catalog_store_urls,
+)
+
 from pangeo_forge_recipes.patterns import pattern_from_file_sequence
 from pangeo_forge_recipes.transforms import (
     ConsolidateMetadata,
@@ -9,6 +15,19 @@
     StoreToZarr,
 )
 
+# parse the catalog store locations (this is where the data is copied to after successful write (and maybe testing)
+catalog_store_urls = get_catalog_store_urls("feedstock/catalog.yaml")
+
+# if not run in a github workflow, assume local testing and deactivate the copy stage by setting all urls to False (see https://github.com/leap-stc/leap-data-management-utils/blob/b5762a17cbfc9b5036e1cd78d62c4e6a50c9691a/leap_data_management_utils/data_management_transforms.py#L121-L145)
+if os.getenv("GITHUB_ACTIONS") == "true":
+    print("Running inside GitHub Actions.")
+else:
+    print("Running locally. Deactivating final copy stage.")
+    catalog_store_urls = {k: False for k in catalog_store_urls.keys()}
+
+print("Final output locations")
+print(f"{catalog_store_urls=}")
+
 # Common Parameters
 days = range(1, 32)
@@ -55,4 +74,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     )
     | ConsolidateDimensionCoordinates()
     | ConsolidateMetadata()
+    | Copy(
+        target=catalog_store_urls["enatl60-blbt02"]
+    )
 )
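A note on the gating that PATCH 09 introduces: get_catalog_store_urls reads feedstock/catalog.yaml and hands back a mapping from store id to target URL, and setting a target to False is what deactivates the downstream Copy stage. A minimal sketch of that behavior, with the URL taken from catalog.yaml above and the dict shape assumed from the linked leap-data-management-utils source, not verified against it:

import os

catalog_store_urls = {
    "enatl60-blbt02": "gs://leap-persistent/data-library/feedstocks/eNATL_feedstock/eNATL60-BLBT02.zarr"
}
if os.getenv("GITHUB_ACTIONS") != "true":
    # Local run: a False target makes the final Copy a no-op, so nothing
    # is promoted out of the scratch bucket during testing.
    catalog_store_urls = {k: False for k in catalog_store_urls}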
From 3c183f19fd1f7ee92be0d2a985a74113d5170b45 Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 15:12:43 +0100
Subject: [PATCH 10/65] Update eNATL60.py

---
 feedstock/eNATL60.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index fa86ead..533c3b1 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -1,3 +1,4 @@
+import os
 import xarray as xr
 import apache_beam as beam
 from leap_data_management_utils.data_management_transforms import (

From e9ae02d3d0635e49d6d86d16f6d2bb61a535070b Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 15:29:23 +0100
Subject: [PATCH 11/65] remove prune and reduce machine size

---
 configs/config_dataflow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 4fca086..8ffe0fa 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -4,10 +4,10 @@
 repo_path = os.environ['GITHUB_REPOSITORY']
 FEEDSTOCK_NAME = repo_path.split('/')[-1]
 
-c.Bake.prune = 1
+c.Bake.prune = 0
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
-c.DataflowBakery.machine_type = "n2-highmem-8"
+c.DataflowBakery.machine_type = "n2-highmem-4"
 c.DataflowBakery.max_workers = 50
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (

From b19e7a3d7fe1a97053edc18864c730aeab80ab39 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Jul 2024 14:29:41 +0000
Subject: [PATCH 12/65] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 feedstock/eNATL60.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 533c3b1..310bf77 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -3,7 +3,6 @@ import apache_beam as beam
 from leap_data_management_utils.data_management_transforms import (
     Copy,
-    InjectAttrs,
     get_catalog_store_urls,
 )
@@ -75,7 +74,5 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     )
     | ConsolidateDimensionCoordinates()
     | ConsolidateMetadata()
-    | Copy(
-        target=catalog_store_urls["enatl60-blbt02"]
-    )
+    | Copy(target=catalog_store_urls["enatl60-blbt02"])
 )
From 6152eb638cbcb152061d77832d6fdc11dfe373df Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 15:39:27 +0100
Subject: [PATCH 13/65] Update eNATL60.py

---
 feedstock/eNATL60.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 310bf77..2436381 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -60,7 +60,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
 
 eNATL60_BLBT02 = (
     beam.Create(pattern.items())
-    | OpenURLWithFSSpec()
+    | OpenURLWithFSSpec(max_concurrency=4)
     | OpenWithXarray(
         xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
         load=True,

From 57d68ed3fcb6189dd7f9d5cc863a25d384231926 Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 15:49:05 +0100
Subject: [PATCH 14/65] Update eNATL60.py

---
 feedstock/eNATL60.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 2436381..a983150 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -60,7 +60,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
 
 eNATL60_BLBT02 = (
     beam.Create(pattern.items())
-    | OpenURLWithFSSpec(max_concurrency=4)
+    | OpenURLWithFSSpec(max_concurrency=2)
     | OpenWithXarray(
         xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
         load=True,

From 9e27af0435f5a426dda78c697ba0aca821748d3c Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Fri, 12 Jul 2024 16:07:16 +0100
Subject: [PATCH 15/65] Update eNATL60.py

---
 feedstock/eNATL60.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index a983150..601cca1 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -60,7 +60,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
 
 eNATL60_BLBT02 = (
     beam.Create(pattern.items())
-    | OpenURLWithFSSpec(max_concurrency=2)
+    | OpenURLWithFSSpec(max_concurrency=1)
     | OpenWithXarray(
         xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
         load=True,

From 4f89453773ed3c9be256a22ad40a957ef4633fec Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Mon, 22 Jul 2024 13:01:53 +0200
Subject: [PATCH 16/65] Update config_dataflow.py

---
 configs/config_dataflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 8ffe0fa..bd34767 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -7,7 +7,7 @@
 c.Bake.prune = 0
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
-c.DataflowBakery.machine_type = "n2-highmem-4"
+c.DataflowBakery.machine_type = "n2-highmem-8"
 c.DataflowBakery.max_workers = 50
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (
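PATCHES 13-15 step max_concurrency down from 4 to 2 to 1. The keyword throttles how many source files OpenURLWithFSSpec fetches in parallel; walking it down to 1 is the bluntest way to stay under the limits a file host like Zenodo places on concurrent downloads (that motivation is inferred from the sequence of changes, not stated in the commits):

from pangeo_forge_recipes.transforms import OpenURLWithFSSpec

# One in-flight download at a time: slower, but fewer throttled requests.
open_stage = OpenURLWithFSSpec(max_concurrency=1)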
"tmask"]) - + logger.info(f"After preprocessing {ds=}") return ds def expand(self, pcoll: beam.PCollection) -> beam.PCollection: From de123d0928cf2602ee6aa44da334bb69d4b9e0c4 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 23 Jul 2024 12:12:20 +0200 Subject: [PATCH 18/65] Update eNATL60.py --- feedstock/eNATL60.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py index 79b57aa..5726794 100644 --- a/feedstock/eNATL60.py +++ b/feedstock/eNATL60.py @@ -33,7 +33,9 @@ # Common Parameters -days = range(1, 32) +# days = range(1, 32) +# reduce for faster testing +days = range(1,3) dataset_url = "https://zenodo.org/records/10513552/files" ## Monthly version From 558b7c0636bf6a71c674e5b8fefb906e9a4d4546 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 10:12:25 +0000 Subject: [PATCH 19/65] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/eNATL60.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py index 5726794..ccdfc34 100644 --- a/feedstock/eNATL60.py +++ b/feedstock/eNATL60.py @@ -35,7 +35,7 @@ # Common Parameters # days = range(1, 32) # reduce for faster testing -days = range(1,3) +days = range(1, 3) dataset_url = "https://zenodo.org/records/10513552/files" ## Monthly version From ec2da10d84c73ec184a3a3a83599dbcb0ede84e0 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 23 Jul 2024 12:29:09 +0200 Subject: [PATCH 20/65] Update eNATL60.py --- feedstock/eNATL60.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py index ccdfc34..f2559c8 100644 --- a/feedstock/eNATL60.py +++ b/feedstock/eNATL60.py @@ -78,7 +78,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection: combine_dims=pattern.combine_dim_keys, target_chunks={"x": 2000, "y": 2000, "time": 2}, ) - | ConsolidateDimensionCoordinates() - | ConsolidateMetadata() - | Copy(target=catalog_store_urls["enatl60-blbt02"]) + # | ConsolidateDimensionCoordinates() + # | ConsolidateMetadata() + # | Copy(target=catalog_store_urls["enatl60-blbt02"]) ) From 49728c3c496962f8ebe9e9e848dff5c9251a85a6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jul 2024 10:29:26 +0000 Subject: [PATCH 21/65] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/eNATL60.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py index f2559c8..97dccb8 100644 --- a/feedstock/eNATL60.py +++ b/feedstock/eNATL60.py @@ -3,14 +3,11 @@ import xarray as xr import apache_beam as beam from leap_data_management_utils.data_management_transforms import ( - Copy, get_catalog_store_urls, ) from pangeo_forge_recipes.patterns import pattern_from_file_sequence from pangeo_forge_recipes.transforms import ( - ConsolidateMetadata, - ConsolidateDimensionCoordinates, OpenURLWithFSSpec, OpenWithXarray, StoreToZarr, From 4f6407559dbe6e317017e50515e9e582e864ef48 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 23 Jul 2024 15:17:38 +0200 Subject: [PATCH 22/65] Update eNATL60.py --- feedstock/eNATL60.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py index 97dccb8..2faf528 100644 --- a/feedstock/eNATL60.py +++ 
From 4f6407559be256a22ad40a957ef4633fec Mon Sep 17 00:00:00 2001
From: Julius Busecke
Date: Tue, 23 Jul 2024 15:17:38 +0200
Subject: [PATCH 22/65] Update eNATL60.py

---
 feedstock/eNATL60.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 97dccb8..2faf528 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -69,7 +69,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
         load=True,
         copy_to_local=True,
     )
-    | Preprocess()
+    # | Preprocess()
     | StoreToZarr(
         store_name="eNATL60_BLBT02.zarr",
         combine_dims=pattern.combine_dim_keys,

From dc648fd4151b1c8780aab511c48b69e1a4fe92bb Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 8 Aug 2024 16:11:03 -0600
Subject: [PATCH 23/65] converts cftime.DatetimeGregorian datetime64[ns]

---
 feedstock/eNATL60.py | 41 ++++++++++-------------------------
 1 file changed, 10 insertions(+), 31 deletions(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 2faf528..7a0655d 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -1,38 +1,17 @@
-import os
-import logging
 import xarray as xr
 import apache_beam as beam
-from leap_data_management_utils.data_management_transforms import (
-    get_catalog_store_urls,
-)
-
 from pangeo_forge_recipes.patterns import pattern_from_file_sequence
 from pangeo_forge_recipes.transforms import (
+    ConsolidateMetadata,
+    ConsolidateDimensionCoordinates,
     OpenURLWithFSSpec,
     OpenWithXarray,
     StoreToZarr,
 )
 
-logger = logging.getLogger(__name__)
-
-# parse the catalog store locations (this is where the data is copied to after successful write (and maybe testing)
-catalog_store_urls = get_catalog_store_urls("feedstock/catalog.yaml")
-
-# if not run in a github workflow, assume local testing and deactivate the copy stage by setting all urls to False (see https://github.com/leap-stc/leap-data-management-utils/blob/b5762a17cbfc9b5036e1cd78d62c4e6a50c9691a/leap_data_management_utils/data_management_transforms.py#L121-L145)
-if os.getenv("GITHUB_ACTIONS") == "true":
-    print("Running inside GitHub Actions.")
-else:
-    print("Running locally. Deactivating final copy stage.")
-    catalog_store_urls = {k: False for k in catalog_store_urls.keys()}
-
-print("Final output locations")
-print(f"{catalog_store_urls=}")
-
 # Common Parameters
-# days = range(1, 32)
-# reduce for faster testing
-days = range(1, 3)
+days = range(1, 32)
 dataset_url = "https://zenodo.org/records/10513552/files"
 
 ## Monthly version
@@ -26,12 +26,13 @@ class Preprocess(beam.PTransform):
 
     @staticmethod
     def _set_coords(ds: xr.Dataset) -> xr.Dataset:
-        logger.info(f"Before preprocessing {ds=}")
         t_new = xr.DataArray(ds.time_counter.data, dims=["time"])
         ds = ds.assign_coords(time=t_new)
         ds = ds.drop(["time_counter"])
         ds = ds.set_coords(["deptht", "depthw", "nav_lon", "nav_lat", "tmask"])
-        logger.info(f"After preprocessing {ds=}")
+        # convert cftime.DatetimeGregorian to datetime64[ns]
+        ds["time"] = ds.indexes["time"].to_datetimeindex()
+
         return ds
 
     def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
@@ -43,18 +43,18 @@
 eNATL60_BLBT02 = (
     beam.Create(pattern.items())
-    | OpenURLWithFSSpec(max_concurrency=1)
+    | OpenURLWithFSSpec()
     | OpenWithXarray(
         xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
         load=True,
         copy_to_local=True,
     )
-    # | Preprocess()
+    | Preprocess()
     | StoreToZarr(
         store_name="eNATL60_BLBT02.zarr",
         combine_dims=pattern.combine_dim_keys,
         target_chunks={"x": 2000, "y": 2000, "time": 2},
     )
-    # | ConsolidateDimensionCoordinates()
-    # | ConsolidateMetadata()
+    | ConsolidateDimensionCoordinates()
+    | ConsolidateMetadata()
 )
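The fix in PATCH 23 rests on one xarray idiom: after opening with use_cftime=True the time index is a CFTimeIndex holding cftime.DatetimeGregorian values, and CFTimeIndex.to_datetimeindex() converts it to a plain pandas DatetimeIndex backed by datetime64[ns], provided every date fits in the roughly 1678-2262 range that nanosecond precision allows. A standalone sketch, assuming the time coordinate is already named "time" (as it is after the Preprocess renaming above; raw eNATL60 files call it "time_counter") and a locally available file:

import xarray as xr

ds = xr.open_dataset(
    "eNATL60-BLBT02_y2009m07d01.1d_TSWm_60m.nc",  # illustrative local copy
    use_cftime=True,
    engine="netcdf4",
)
# cftime -> datetime64[ns]; raises for dates outside the representable range.
ds["time"] = ds.indexes["time"].to_datetimeindex()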
From d3c84647b9fcc6a82549c1e01aec44d9d452f321 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 8 Aug 2024 16:33:07 -0600
Subject: [PATCH 24/65] pruned + job name

---
 configs/config_dataflow.py | 4 ++--
 feedstock/eNATL60.py       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index bd34767..ddf13a8 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -4,7 +4,7 @@
 repo_path = os.environ['GITHUB_REPOSITORY']
 FEEDSTOCK_NAME = repo_path.split('/')[-1]
 
-c.Bake.prune = 0
+c.Bake.prune = 1
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
 c.DataflowBakery.machine_type = "n2-highmem-8"
@@ -17,5 +17,5 @@
 c.DataflowBakery.temp_gcs_location = f"gs://leap-scratch/data-library/feedstocks/temp/{FEEDSTOCK_NAME}"
 c.TargetStorage.fsspec_class = "gcsfs.GCSFileSystem"
 c.InputCacheStorage.fsspec_class = "gcsfs.GCSFileSystem"
-c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/{FEEDSTOCK_NAME}/{{job_name}}"
+c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/{FEEDSTOCK_NAME}"
 c.InputCacheStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/cache"

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 7a0655d..8e31b51 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -51,7 +51,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     )
     | Preprocess()
     | StoreToZarr(
-        store_name="eNATL60_BLBT02.zarr",
+        store_name="eNATL60_BLBT02_timefix.zarr",
         combine_dims=pattern.combine_dim_keys,
         target_chunks={"x": 2000, "y": 2000, "time": 2},
     )
From 09ae8708c805c683e007f432e97ec3beba5f142d Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 8 Aug 2024 16:49:31 -0600
Subject: [PATCH 25/65] adds copy stage & echo

---
 .github/workflows/deploy_recipe.yaml | 1 +
 configs/config_dataflow.py           | 4 ++--
 feedstock/eNATL60.py                 | 9 ++++++++-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/deploy_recipe.yaml b/.github/workflows/deploy_recipe.yaml
index 02c92b8..d3ec80f 100644
--- a/.github/workflows/deploy_recipe.yaml
+++ b/.github/workflows/deploy_recipe.yaml
@@ -42,6 +42,7 @@ jobs:
       # AT that point, screw it, not worth it.
         run: |
           jobname="${{ env.JOB_NAME }}"
+          echo "$JOB_NAME"
           while true; do
             count=$(gcloud dataflow jobs list --status=active --filter="name:${jobname}" --format="value(id)" | wc -l)
             echo "Active Dataflow jobs: $count"

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index ddf13a8..bd34767 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -4,7 +4,7 @@
 repo_path = os.environ['GITHUB_REPOSITORY']
 FEEDSTOCK_NAME = repo_path.split('/')[-1]
 
-c.Bake.prune = 1
+c.Bake.prune = 0
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
 c.DataflowBakery.machine_type = "n2-highmem-8"
@@ -17,5 +17,5 @@
 c.DataflowBakery.temp_gcs_location = f"gs://leap-scratch/data-library/feedstocks/temp/{FEEDSTOCK_NAME}"
 c.TargetStorage.fsspec_class = "gcsfs.GCSFileSystem"
 c.InputCacheStorage.fsspec_class = "gcsfs.GCSFileSystem"
-c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/{FEEDSTOCK_NAME}"
+c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/{FEEDSTOCK_NAME}/{{job_name}}"
 c.InputCacheStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/cache"

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 8e31b51..194f6fd 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -9,6 +9,12 @@
     StoreToZarr,
 )
 
+from leap_data_management_utils.data_management_transforms import (
+    Copy,
+    get_catalog_store_urls,
+)
+
+catalog_store_urls = get_catalog_store_urls("feedstock/catalog.yaml")
 
 # Common Parameters
 days = range(1, 32)
@@ -51,10 +57,11 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     )
     | Preprocess()
     | StoreToZarr(
-        store_name="eNATL60_BLBT02_timefix.zarr",
+        store_name="eNATL60_BLBT02.zarr",
         combine_dims=pattern.combine_dim_keys,
         target_chunks={"x": 2000, "y": 2000, "time": 2},
     )
     | ConsolidateDimensionCoordinates()
     | ConsolidateMetadata()
+    | Copy(target=catalog_store_urls["enatl60-blbt02"])
 )

From f60775c9d4d497f0546037a132de9fddf277aaa5 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 8 Aug 2024 17:05:00 -0600
Subject: [PATCH 26/65] noprune

---
 configs/config_dataflow.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index bd34767..225a477 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -19,3 +19,5 @@
 c.InputCacheStorage.fsspec_class = "gcsfs.GCSFileSystem"
 c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/{FEEDSTOCK_NAME}/{{job_name}}"
 c.InputCacheStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/cache"
+
+

From da43ea388a26327ba32945423d4cabec66ea33fc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 8 Aug 2024 23:05:07 +0000
Subject: [PATCH 27/65] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 configs/config_dataflow.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 225a477..bd34767 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -19,5 +19,3 @@
 c.InputCacheStorage.fsspec_class = "gcsfs.GCSFileSystem"
 c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/{FEEDSTOCK_NAME}/{{job_name}}"
 c.InputCacheStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/cache"
-
-
From d9cf73008d7ffca9b1c611daf976342dfa9194ce Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Fri, 9 Aug 2024 10:38:28 -0600
Subject: [PATCH 28/65] hardcode dataflow name + prune

---
 configs/config_dataflow.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 225a477..609fe8b 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -4,7 +4,7 @@
 repo_path = os.environ['GITHUB_REPOSITORY']
 FEEDSTOCK_NAME = repo_path.split('/')[-1]
 
-c.Bake.prune = 0
+c.Bake.prune = 1
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
 c.DataflowBakery.machine_type = "n2-highmem-8"
@@ -17,7 +17,5 @@
 c.DataflowBakery.temp_gcs_location = f"gs://leap-scratch/data-library/feedstocks/temp/{FEEDSTOCK_NAME}"
 c.TargetStorage.fsspec_class = "gcsfs.GCSFileSystem"
 c.InputCacheStorage.fsspec_class = "gcsfs.GCSFileSystem"
-c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/{FEEDSTOCK_NAME}/{{job_name}}"
+c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/enatl-dataflow-prune"
 c.InputCacheStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/cache"
-
-

From 6c592b7a11c6d097b82f77b4f19721310942023b Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Fri, 9 Aug 2024 10:53:05 -0600
Subject: [PATCH 29/65] 1 month + prod copy

---
 configs/config_dataflow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 609fe8b..bd34767 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -4,7 +4,7 @@
 repo_path = os.environ['GITHUB_REPOSITORY']
 FEEDSTOCK_NAME = repo_path.split('/')[-1]
 
-c.Bake.prune = 1
+c.Bake.prune = 0
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
 c.DataflowBakery.machine_type = "n2-highmem-8"
@@ -17,5 +17,5 @@
 c.DataflowBakery.temp_gcs_location = f"gs://leap-scratch/data-library/feedstocks/temp/{FEEDSTOCK_NAME}"
 c.TargetStorage.fsspec_class = "gcsfs.GCSFileSystem"
 c.InputCacheStorage.fsspec_class = "gcsfs.GCSFileSystem"
-c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/enatl-dataflow-prune"
+c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/{FEEDSTOCK_NAME}/{{job_name}}"
 c.InputCacheStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/cache"
From 15b4ad9f17c93cf13ed3653774e949adb1f85e22 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Fri, 9 Aug 2024 11:11:16 -0600
Subject: [PATCH 30/65] hardcode naming

---
 configs/config_dataflow.py | 2 +-
 feedstock/eNATL60.py       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index bd34767..6a0384f 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -17,5 +17,5 @@
 c.DataflowBakery.temp_gcs_location = f"gs://leap-scratch/data-library/feedstocks/temp/{FEEDSTOCK_NAME}"
 c.TargetStorage.fsspec_class = "gcsfs.GCSFileSystem"
 c.InputCacheStorage.fsspec_class = "gcsfs.GCSFileSystem"
-c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/{FEEDSTOCK_NAME}/{{job_name}}"
+c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/eNATL_feedstock"
 c.InputCacheStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/cache"

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 194f6fd..89525b7 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -57,7 +57,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     )
     | Preprocess()
     | StoreToZarr(
-        store_name="eNATL60_BLBT02.zarr",
+        store_name="eNATL60-BLBT02.zarr",
         combine_dims=pattern.combine_dim_keys,
         target_chunks={"x": 2000, "y": 2000, "time": 2},
     )

From f9d03e80a316f93c971227a27c871f97a14ab900 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Fri, 9 Aug 2024 11:11:53 -0600
Subject: [PATCH 31/65] naming nit

---
 feedstock/eNATL60.py | 2 +-
 feedstock/meta.yaml  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 89525b7..a174d23 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -47,7 +47,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
 )
 
-eNATL60_BLBT02 = (
+eNATL60BLBT02 = (
     beam.Create(pattern.items())
     | OpenURLWithFSSpec()
     | OpenWithXarray(

diff --git a/feedstock/meta.yaml b/feedstock/meta.yaml
index ec449bb..7d65db9 100644
--- a/feedstock/meta.yaml
+++ b/feedstock/meta.yaml
@@ -3,7 +3,7 @@ description: >
   eNATL60-TSW-60m is an extraction of a very high resolution oceanic simulation of the North Atlantic performed at MEOM, IGE (FRANCE)
 recipes:
   - id: enatl60-blbt02
-    object: "eNATL60:eNATL60_BLBT02"
+    object: "eNATL60:eNATL60BLBT02"
 provenance:
   providers:
     - name: "Zenodo"

From 72861b2711fb5b94a4d29fbf5cc33fe456843751 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 13 Aug 2024 13:16:45 -0600
Subject: [PATCH 32/65] max_concurrency=1 & prune off

---
 configs/config_dataflow.py | 5 ++---
 feedstock/eNATL60.py       | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 6a0384f..49e230e 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -6,8 +6,7 @@
 c.Bake.prune = 0
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
-c.DataflowBakery.use_dataflow_prime = False
-c.DataflowBakery.machine_type = "n2-highmem-8"
+c.DataflowBakery.use_dataflow_prime = True
 c.DataflowBakery.max_workers = 50
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (
@@ -17,5 +16,5 @@
 c.DataflowBakery.temp_gcs_location = f"gs://leap-scratch/data-library/feedstocks/temp/{FEEDSTOCK_NAME}"
 c.TargetStorage.fsspec_class = "gcsfs.GCSFileSystem"
 c.InputCacheStorage.fsspec_class = "gcsfs.GCSFileSystem"
-c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/eNATL_feedstock"
+c.TargetStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/output/{FEEDSTOCK_NAME}/{{job_name}}"
 c.InputCacheStorage.root_path = f"gs://leap-scratch/data-library/feedstocks/cache"

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index a174d23..97b0e14 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -49,7 +49,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
 
 eNATL60BLBT02 = (
     beam.Create(pattern.items())
-    | OpenURLWithFSSpec()
+    | OpenURLWithFSSpec(max_concurrency=1)
     | OpenWithXarray(
         xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
         load=True,
From 023d13a53a74cec1252c151701e58c6eded2241e Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 13 Aug 2024 13:21:37 -0600
Subject: [PATCH 33/65] prune bool - does this work

---
 configs/config_dataflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 49e230e..fa4acd2 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -4,7 +4,7 @@
 repo_path = os.environ['GITHUB_REPOSITORY']
 FEEDSTOCK_NAME = repo_path.split('/')[-1]
 
-c.Bake.prune = 0
+c.Bake.prune = False
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = True
 c.DataflowBakery.max_workers = 50

From 47ecf4ca6035ab43606452998d6a41864fc792b0 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 13 Aug 2024 15:31:52 -0600
Subject: [PATCH 34/65] disable prime, add e2-highmem-4

---
 configs/config_dataflow.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index fa4acd2..224d395 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -6,7 +6,8 @@
 c.Bake.prune = False
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
-c.DataflowBakery.use_dataflow_prime = True
+c.DataflowBakery.use_dataflow_prime = False
+c.DataflowBakery.machine_type = "e2-highmem-4"
 c.DataflowBakery.max_workers = 50
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (

From 8ef929f934b125b75df85f2b267c906d2fac97bf Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 13 Aug 2024 16:01:29 -0600
Subject: [PATCH 35/65] bump machine size

---
 configs/config_dataflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 224d395..42796fd 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -7,7 +7,7 @@
 c.Bake.prune = False
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
-c.DataflowBakery.machine_type = "e2-highmem-4"
+c.DataflowBakery.machine_type = "e2-highmem-16"
 c.DataflowBakery.max_workers = 50
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (
From 4fb61ff91e2f96c47736d4e139dbf44c64412669 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 13 Aug 2024 17:34:28 -0600
Subject: [PATCH 36/65] new approach

---
 feedstock/eNATL60.py | 17 ++++++++++------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 97b0e14..f397ff5 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -22,7 +22,7 @@
 ## Monthly version
 input_urls = [
-    f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc" for d in days
+    f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc#mode=bytes" for d in days
 ]
 pattern = pattern_from_file_sequence(input_urls, concat_dim="time")
@@ -36,8 +36,8 @@ def _set_coords(ds: xr.Dataset) -> xr.Dataset:
         ds = ds.assign_coords(time=t_new)
         ds = ds.drop(["time_counter"])
         ds = ds.set_coords(["deptht", "depthw", "nav_lon", "nav_lat", "tmask"])
-        # convert cftime.DatetimeGregorian to datetime64[ns]
-        ds["time"] = ds.indexes["time"].to_datetimeindex()
+        # # convert cftime.DatetimeGregorian to datetime64[ns]
+        # ds["time"] = ds.indexes["time"].to_datetimeindex()
 
         return ds
@@ -49,11 +49,13 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
 eNATL60BLBT02 = (
     beam.Create(pattern.items())
-    | OpenURLWithFSSpec(max_concurrency=1)
+    # | OpenURLWithFSSpec(max_concurrency=1)
     | OpenWithXarray(
-        xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
-        load=True,
-        copy_to_local=True,
+        xarray_open_kwargs={"engine": 'netcdf4'}
+
+        # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
+        # load=True,
+        # copy_to_local=True,
     )
     | Preprocess()
     | StoreToZarr(
@@ -65,3 +67,4 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     | ConsolidateMetadata()
     | Copy(target=catalog_store_urls["enatl60-blbt02"])
 )
+

From aff575896f2de5e26acb3a27832490257f8f8c71 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 13 Aug 2024 23:34:37 +0000
Subject: [PATCH 37/65] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 feedstock/eNATL60.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index f397ff5..f9a8c87 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -4,7 +4,6 @@
 from pangeo_forge_recipes.transforms import (
     ConsolidateMetadata,
     ConsolidateDimensionCoordinates,
-    OpenURLWithFSSpec,
     OpenWithXarray,
     StoreToZarr,
 )
@@ -22,7 +21,8 @@
 input_urls = [
-    f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc#mode=bytes" for d in days
+    f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc#mode=bytes"
+    for d in days
 ]
 pattern = pattern_from_file_sequence(input_urls, concat_dim="time")
@@ -51,8 +51,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     beam.Create(pattern.items())
     # | OpenURLWithFSSpec(max_concurrency=1)
     | OpenWithXarray(
-        xarray_open_kwargs={"engine": 'netcdf4'}
-
+        xarray_open_kwargs={"engine": "netcdf4"}
         # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
         # load=True,
         # copy_to_local=True,
     )
@@ -67,4 +66,3 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     | ConsolidateMetadata()
     | Copy(target=catalog_store_urls["enatl60-blbt02"])
 )
-

From 84caede8c772483cd5fef9f0fd618f0fe3cc5866 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 13 Aug 2024 17:35:05 -0600
Subject: [PATCH 38/65] lint

---
 feedstock/eNATL60.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index f397ff5..f9a8c87 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -4,7 +4,6 @@
 from pangeo_forge_recipes.transforms import (
     ConsolidateMetadata,
     ConsolidateDimensionCoordinates,
-    OpenURLWithFSSpec,
     OpenWithXarray,
     StoreToZarr,
 )
@@ -22,7 +21,8 @@
 input_urls = [
-    f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc#mode=bytes" for d in days
+    f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc#mode=bytes"
+    for d in days
 ]
 pattern = pattern_from_file_sequence(input_urls, concat_dim="time")
@@ -51,8 +51,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     beam.Create(pattern.items())
     # | OpenURLWithFSSpec(max_concurrency=1)
     | OpenWithXarray(
-        xarray_open_kwargs={"engine": 'netcdf4'}
-
+        xarray_open_kwargs={"engine": "netcdf4"}
         # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
         # load=True,
         # copy_to_local=True,
     )
@@ -67,4 +66,3 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     | ConsolidateMetadata()
     | Copy(target=catalog_store_urls["enatl60-blbt02"])
 )
-
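The #mode=bytes suffix that PATCH 36 appends to each URL is a netCDF-C convention rather than anything Zenodo-specific: when an https URL carries that fragment, the netcdf4 engine reads the remote file through HTTP byte-range requests instead of expecting an OPeNDAP endpoint, which is why the separate OpenURLWithFSSpec download stage can be commented out. A minimal sketch using the first file of the monthly series:

import xarray as xr

url = (
    "https://zenodo.org/records/10513552/files/"
    "eNATL60-BLBT02_y2009m07d01.1d_TSWm_60m.nc#mode=bytes"
)
# netCDF-C fetches byte ranges on demand rather than the whole file up front.
ds = xr.open_dataset(url, engine="netcdf4")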
From 39bfabec64334c2e71f6124d4d0050ba30e73eb9 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 13 Aug 2024 17:49:04 -0600
Subject: [PATCH 39/65] pooch

---
 feedstock/eNATL60.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index f9a8c87..e875f93 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -1,5 +1,6 @@
 import xarray as xr
 import apache_beam as beam
+import pooch
 from pangeo_forge_recipes.patterns import pattern_from_file_sequence
 from pangeo_forge_recipes.transforms import (
     ConsolidateMetadata,
@@ -21,12 +22,21 @@
 ## Monthly version
 input_urls = [
-    f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc#mode=bytes"
-    for d in days
+    f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc" for d in days
 ]
 pattern = pattern_from_file_sequence(input_urls, concat_dim="time")
 
+class OpenWithPooch(beam.PTransform):
+    @staticmethod
+    def _open_pooch(url: str) -> str:
+        # import pdb; pdb.set_trace()
+        return pooch.retrieve(url=url, known_hash=None)
+
+    def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
+        return pcoll | "open" >> beam.MapTuple(lambda k, v: (k, self._open_pooch(v)))
+
+
 class Preprocess(beam.PTransform):
     """Custom transform to fix invalid time dimension"""
@@ -50,12 +60,11 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
 eNATL60BLBT02 = (
     beam.Create(pattern.items())
     # | OpenURLWithFSSpec(max_concurrency=1)
-    | OpenWithXarray(
-        xarray_open_kwargs={"engine": "netcdf4"}
-        # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
-        # load=True,
-        # copy_to_local=True,
-    )
+    | OpenWithPooch()
+    | OpenWithXarray()
+    # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
+    # load=True,
+    # copy_to_local=True,)
     | Preprocess()
     | StoreToZarr(
         store_name="eNATL60-BLBT02.zarr",

From 8ce62e12a87d25f86e530b7883988ace76604925 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 13 Aug 2024 17:49:30 -0600
Subject: [PATCH 40/65] pooch 5 timesteps

---
 feedstock/eNATL60.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index e875f93..20319f1 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -17,7 +17,7 @@
 catalog_store_urls = get_catalog_store_urls("feedstock/catalog.yaml")
 
 # Common Parameters
-days = range(1, 32)
+days = range(1, 5)
 dataset_url = "https://zenodo.org/records/10513552/files"
 
 ## Monthly version

From 4cb16c93e62f743fe2bc23b199d06db056de5c2c Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 13 Aug 2024 17:50:05 -0600
Subject: [PATCH 41/65] update reqs

---
 feedstock/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/feedstock/requirements.txt b/feedstock/requirements.txt
index aac4838..81609bf 100644
--- a/feedstock/requirements.txt
+++ b/feedstock/requirements.txt
@@ -1,5 +1,6 @@
 pangeo-forge-recipes==0.10.7
 gcsfs
-apache-beam[gcp]
+apache-beam[gcp] >= 2.58.0
 leap-data-management-utils==0.0.12
 xarray==2024.05.0
+pooch

From 4b18288f510692865509aa634c58640639aab967 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Tue, 13 Aug 2024 17:58:02 -0600
Subject: [PATCH 42/65] 5 to 32 days

---
 feedstock/eNATL60.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 20319f1..e875f93 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -17,7 +17,7 @@
 catalog_store_urls = get_catalog_store_urls("feedstock/catalog.yaml")
 
 # Common Parameters
-days = range(1, 5)
+days = range(1, 32)
 dataset_url = "https://zenodo.org/records/10513552/files"
 
 ## Monthly version
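What PATCH 39 actually changes: pooch.retrieve downloads a URL once to a local cache directory and returns the local file path, so each Beam worker hands OpenWithXarray an ordinary on-disk netCDF file instead of a remote handle. With known_hash=None the file is accepted without checksum verification. Roughly:

import pooch
import xarray as xr

url = "https://zenodo.org/records/10513552/files/eNATL60-BLBT02_y2009m07d01.1d_TSWm_60m.nc"
local_path = pooch.retrieve(url=url, known_hash=None)  # cached; re-runs reuse the file
ds = xr.open_dataset(local_path)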
From ee1dbaafe934672adfdecd382c9ffcd6edf3b8a1 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Wed, 14 Aug 2024 09:47:13 -0600
Subject: [PATCH 43/65] smaller worker

---
 configs/config_dataflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 42796fd..224d395 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -7,7 +7,7 @@
 c.Bake.prune = False
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
-c.DataflowBakery.machine_type = "e2-highmem-16"
+c.DataflowBakery.machine_type = "e2-highmem-4"
 c.DataflowBakery.max_workers = 50
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (

From 7159713b3c8b80d4c0535a81d5e7b0467f5a18c7 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Wed, 14 Aug 2024 09:59:40 -0600
Subject: [PATCH 44/65] trying out ARM machine

---
 configs/config_dataflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 224d395..97122a2 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -7,7 +7,7 @@
 c.Bake.prune = False
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
-c.DataflowBakery.machine_type = "e2-highmem-4"
+c.DataflowBakery.machine_type = "t2a-standard-8"
 c.DataflowBakery.max_workers = 50
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (

From b7d28e311e636d70899e84b4d02030e44b3d6dbc Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Wed, 14 Aug 2024 10:24:11 -0600
Subject: [PATCH 45/65] small standard worker

---
 configs/config_dataflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 97122a2..224d395 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -7,7 +7,7 @@
 c.Bake.prune = False
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
-c.DataflowBakery.machine_type = "t2a-standard-8"
+c.DataflowBakery.machine_type = "e2-highmem-4"
 c.DataflowBakery.max_workers = 50
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (
"2010-06-30", freq="D") -# Common Parameters -days = range(1, 32) -dataset_url = "https://zenodo.org/records/10513552/files" +records = { + 1: "10261988", + 2: "10260907", + 3: "10260980", + 4: "10261078", + 5: "10261126", + 6: "10261192", + 7: "10261274", + 8: "10261349", + 9: "10261461", + 10: "10261540", + 11: "10262356", + 12: "10261643", +} -## Monthly version -input_urls = [ - f"{dataset_url}/eNATL60-BLBT02_y2009m07d{d:02d}.1d_TSWm_60m.nc" for d in days -] -pattern = pattern_from_file_sequence(input_urls, concat_dim="time") + +def make_full_path(time): + record = str(records[time.month]) + date = ( + "y" + + str(time.year) + + "m" + + str("{:02d}".format(time.month)) + + "d" + + str("{:02d}".format(time.day)) + ) + return ( + f"https://zenodo.org/records/{record}/files/eNATL60-BLBT02_{date}.1d_TSW_60m.nc" + ) + + +time_concat_dim = ConcatDim("time", dates, nitems_per_file=1) +pattern = FilePattern(make_full_path, time_concat_dim) class OpenWithPooch(beam.PTransform): @staticmethod def _open_pooch(url: str) -> str: - # import pdb; pdb.set_trace() return pooch.retrieve(url=url, known_hash=None) def expand(self, pcoll: beam.PCollection) -> beam.PCollection: @@ -69,7 +94,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection: | StoreToZarr( store_name="eNATL60-BLBT02.zarr", combine_dims=pattern.combine_dim_keys, - target_chunks={"x": 2000, "y": 2000, "time": 2}, + target_chunks={"time": 30, "y": 900, "x": 900}, ) | ConsolidateDimensionCoordinates() | ConsolidateMetadata() From 4ddfd926c9dd582b64178508babc155de8fb90a5 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 10:56:08 -0600 Subject: [PATCH 47/65] task --- feedstock/eNATL60.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py index 7a7cc0f..e095340 100644 --- a/feedstock/eNATL60.py +++ b/feedstock/eNATL60.py @@ -70,7 +70,8 @@ def _set_coords(ds: xr.Dataset) -> xr.Dataset: t_new = xr.DataArray(ds.time_counter.data, dims=["time"]) ds = ds.assign_coords(time=t_new) ds = ds.drop(["time_counter"]) - ds = ds.set_coords(["deptht", "depthw", "nav_lon", "nav_lat", "tmask"]) + ds = ds.set_coords(["deptht", "depthw", "nav_lon", "nav_lat"]) + # # convert cftime.DatetimeGregorian to datetime64[ns] # ds["time"] = ds.indexes["time"].to_datetimeindex() From c529398b898ce6eb149644e096304b3a081e6e5f Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 11:10:05 -0600 Subject: [PATCH 48/65] disk size --- configs/config_dataflow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py index f2ad751..30bfca8 100644 --- a/configs/config_dataflow.py +++ b/configs/config_dataflow.py @@ -8,6 +8,7 @@ c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery" c.DataflowBakery.use_dataflow_prime = False c.DataflowBakery.machine_type = "e2-highmem-16" +c.DataflowBakery.disk_size_gb = 100 c.DataflowBakery.max_workers = 1 c.DataflowBakery.use_public_ips = True c.DataflowBakery.service_account_email = ( From 5c41c755ff23a0212ef9ea425291789b8141daba Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 11:11:59 -0600 Subject: [PATCH 49/65] 300GB storage --- configs/config_dataflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py index 30bfca8..8e7bdfb 100644 --- a/configs/config_dataflow.py +++ b/configs/config_dataflow.py @@ -8,7 +8,7 @@ c.Bake.bakery_class = 
"pangeo_forge_runner.bakery.dataflow.DataflowBakery" c.DataflowBakery.use_dataflow_prime = False c.DataflowBakery.machine_type = "e2-highmem-16" -c.DataflowBakery.disk_size_gb = 100 +c.DataflowBakery.disk_size_gb = 300 c.DataflowBakery.max_workers = 1 c.DataflowBakery.use_public_ips = True c.DataflowBakery.service_account_email = ( From 95211e4cef1df27c2620a0bb0568f5e7ac8ee2f3 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 11:33:18 -0600 Subject: [PATCH 50/65] 120 days --- configs/config_dataflow.py | 2 +- feedstock/eNATL60.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py index 8e7bdfb..d22c041 100644 --- a/configs/config_dataflow.py +++ b/configs/config_dataflow.py @@ -8,7 +8,7 @@ c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery" c.DataflowBakery.use_dataflow_prime = False c.DataflowBakery.machine_type = "e2-highmem-16" -c.DataflowBakery.disk_size_gb = 300 +c.DataflowBakery.disk_size_gb = 400 c.DataflowBakery.max_workers = 1 c.DataflowBakery.use_public_ips = True c.DataflowBakery.service_account_email = ( diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py index e095340..08a356c 100644 --- a/feedstock/eNATL60.py +++ b/feedstock/eNATL60.py @@ -51,6 +51,7 @@ def make_full_path(time): time_concat_dim = ConcatDim("time", dates, nitems_per_file=1) pattern = FilePattern(make_full_path, time_concat_dim) +pattern = pattern.prune(120) class OpenWithPooch(beam.PTransform): From 39b7d1f185d813a28154c9d57206fc16fe7580b4 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 11:43:18 -0600 Subject: [PATCH 51/65] weird z3 machine --- configs/config_dataflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py index d22c041..563ffef 100644 --- a/configs/config_dataflow.py +++ b/configs/config_dataflow.py @@ -7,7 +7,7 @@ c.Bake.prune = False c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery" c.DataflowBakery.use_dataflow_prime = False -c.DataflowBakery.machine_type = "e2-highmem-16" +c.DataflowBakery.machine_type = "z3-highmem-88" c.DataflowBakery.disk_size_gb = 400 c.DataflowBakery.max_workers = 1 c.DataflowBakery.use_public_ips = True From 642af624c756bef42ee8c3d57b9e32fa3d0c42b4 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 11:47:55 -0600 Subject: [PATCH 52/65] region for z3 --- configs/config_dataflow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py index 563ffef..b82fcd9 100644 --- a/configs/config_dataflow.py +++ b/configs/config_dataflow.py @@ -7,6 +7,7 @@ c.Bake.prune = False c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery" c.DataflowBakery.use_dataflow_prime = False +c.DataflowBakery.region = 'us-central1-a' c.DataflowBakery.machine_type = "z3-highmem-88" c.DataflowBakery.disk_size_gb = 400 c.DataflowBakery.max_workers = 1 From 1f91f7ad36cca88b87e7aa4a5f239f7cd570e5b3 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 12:02:44 -0600 Subject: [PATCH 53/65] use_shuffle = False + fork of runner with new opts --- configs/config_dataflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py index b82fcd9..cc90441 100644 --- a/configs/config_dataflow.py +++ b/configs/config_dataflow.py @@ -7,9 +7,9 @@ c.Bake.prune = False c.Bake.bakery_class = 
"pangeo_forge_runner.bakery.dataflow.DataflowBakery" c.DataflowBakery.use_dataflow_prime = False -c.DataflowBakery.region = 'us-central1-a' -c.DataflowBakery.machine_type = "z3-highmem-88" +c.DataflowBakery.machine_type = "e2-highmem-16" c.DataflowBakery.disk_size_gb = 400 +c.DataflowBakery.use_shuffle = False c.DataflowBakery.max_workers = 1 c.DataflowBakery.use_public_ips = True c.DataflowBakery.service_account_email = ( From c2c8b80939343ffab745645a9b9558a61f720ec5 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 12:10:34 -0600 Subject: [PATCH 54/65] nit --- .github/workflows/deploy_recipe.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy_recipe.yaml b/.github/workflows/deploy_recipe.yaml index d3ec80f..4500ab1 100644 --- a/.github/workflows/deploy_recipe.yaml +++ b/.github/workflows/deploy_recipe.yaml @@ -25,7 +25,7 @@ jobs: - name: "Install dependencies" run: | python -m pip install --upgrade pip - pip install pangeo-forge-runner + pip install git+https://github.com/leap-stc/pangeo-forge-runner - name: "Deploy recipes" run: | pangeo-forge-runner bake \ From 26baa53fe26d4fddf7857fc96b2bdb3f27bf4f28 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 13:02:46 -0600 Subject: [PATCH 55/65] max_num_workers=1 --- configs/config_dataflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py index cc90441..8f2f236 100644 --- a/configs/config_dataflow.py +++ b/configs/config_dataflow.py @@ -10,7 +10,7 @@ c.DataflowBakery.machine_type = "e2-highmem-16" c.DataflowBakery.disk_size_gb = 400 c.DataflowBakery.use_shuffle = False -c.DataflowBakery.max_workers = 1 +c.DataflowBakery.max_num_workers = 1 c.DataflowBakery.use_public_ips = True c.DataflowBakery.service_account_email = ( "leap-community-bakery@leap-pangeo.iam.gserviceaccount.com" From dcf3066a31516b0a8f1bb332111db753ad43bc0c Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 13:40:18 -0600 Subject: [PATCH 56/65] tmask --- feedstock/eNATL60.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py index 08a356c..7c1c93c 100644 --- a/feedstock/eNATL60.py +++ b/feedstock/eNATL60.py @@ -70,7 +70,7 @@ class Preprocess(beam.PTransform): def _set_coords(ds: xr.Dataset) -> xr.Dataset: t_new = xr.DataArray(ds.time_counter.data, dims=["time"]) ds = ds.assign_coords(time=t_new) - ds = ds.drop(["time_counter"]) + ds = ds.drop(["time_counter", "tmask"]) ds = ds.set_coords(["deptht", "depthw", "nav_lon", "nav_lat"]) # # convert cftime.DatetimeGregorian to datetime64[ns] From aa09a95c1ba0a13663feaa72e5d71ce5e7f53d54 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 13:46:43 -0600 Subject: [PATCH 57/65] shuffle off retry + 34 days --- feedstock/eNATL60.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py index 7c1c93c..ba06dec 100644 --- a/feedstock/eNATL60.py +++ b/feedstock/eNATL60.py @@ -51,7 +51,7 @@ def make_full_path(time): time_concat_dim = ConcatDim("time", dates, nitems_per_file=1) pattern = FilePattern(make_full_path, time_concat_dim) -pattern = pattern.prune(120) +pattern = pattern.prune(33) class OpenWithPooch(beam.PTransform): From ff79c1150234c74d6a7f548b167257f18773f7f7 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 14 Aug 2024 15:18:58 -0600 Subject: [PATCH 58/65] nit --- feedstock/eNATL60.py | 6 ++++-- 1 file changed, 4 insertions(+), 
From ff79c1150234c74d6a7f548b167257f18773f7f7 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Wed, 14 Aug 2024 15:18:58 -0600
Subject: [PATCH 58/65] nit

---
 feedstock/eNATL60.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index ba06dec..8480927 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -51,7 +51,6 @@ def make_full_path(time):
 time_concat_dim = ConcatDim("time", dates, nitems_per_file=1)
 pattern = FilePattern(make_full_path, time_concat_dim)
-pattern = pattern.prune(33)
 
 
 class OpenWithPooch(beam.PTransform):
@@ -70,7 +69,10 @@ class Preprocess(beam.PTransform):
     def _set_coords(ds: xr.Dataset) -> xr.Dataset:
         t_new = xr.DataArray(ds.time_counter.data, dims=["time"])
         ds = ds.assign_coords(time=t_new)
-        ds = ds.drop(["time_counter", "tmask"])
+        to_drop = ["time_counter"]
+        if "tmask" in ds:
+            to_drop += ["tmask"]
+        ds = ds.drop_dims(to_drop)
         ds = ds.set_coords(["deptht", "depthw", "nav_lon", "nav_lat"])
 
         # # convert cftime.DatetimeGregorian to datetime64[ns]

From b9d9ea74c3a1a7ec79912f55fdd374d3eb6c0d2b Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Wed, 14 Aug 2024 15:39:11 -0600
Subject: [PATCH 59/65] pruned version, no copy w/ shuffle and ssd

---
 feedstock/eNATL60.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 8480927..1946c2f 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -11,7 +11,6 @@
 )
 
 from leap_data_management_utils.data_management_transforms import (
-    Copy,
     get_catalog_store_urls,
 )
 
@@ -51,6 +50,7 @@ def make_full_path(time):
 time_concat_dim = ConcatDim("time", dates, nitems_per_file=1)
 pattern = FilePattern(make_full_path, time_concat_dim)
+pattern = pattern.prune(35)
 
 
 class OpenWithPooch(beam.PTransform):
@@ -96,11 +96,11 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     # copy_to_local=True,)
     | Preprocess()
    | StoreToZarr(
-        store_name="eNATL60-BLBT02.zarr",
+        store_name="eNATL60-BLBT02_test.zarr",
         combine_dims=pattern.combine_dim_keys,
         target_chunks={"time": 30, "y": 900, "x": 900},
     )
     | ConsolidateDimensionCoordinates()
     | ConsolidateMetadata()
-    | Copy(target=catalog_store_urls["enatl60-blbt02"])
+    # | Copy(target=catalog_store_urls["enatl60-blbt02"])
 )

From 99af8c678782ce52ddbf7d7e40a1bf7feada4b08 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Wed, 14 Aug 2024 16:44:04 -0600
Subject: [PATCH 60/65] eNATL full recipe + shuffle no copy

---
 feedstock/eNATL60.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 1946c2f..9b6a639 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -50,7 +50,6 @@ def make_full_path(time):
 time_concat_dim = ConcatDim("time", dates, nitems_per_file=1)
 pattern = FilePattern(make_full_path, time_concat_dim)
-pattern = pattern.prune(35)
 
 
 class OpenWithPooch(beam.PTransform):
@@ -96,7 +95,7 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     | Preprocess()
     | StoreToZarr(
-        store_name="eNATL60-BLBT02_test.zarr",
+        store_name="eNATL60-BLBT02-shuffle.zarr",
         combine_dims=pattern.combine_dim_keys,
         target_chunks={"time": 30, "y": 900, "x": 900},
     )
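One caveat on the guarded drop introduced in patch 58: xarray's drop_dims removes the named dimensions together with every variable defined on them, while drop_vars removes only the named variables. A sketch of the variable-level variant, assuming only the listed names should be discarded:

import xarray as xr

def drop_vars_if_present(ds: xr.Dataset, names: list[str]) -> xr.Dataset:
    # Drop only the variables that actually exist in this file, so inputs
    # without tmask do not raise an error.
    present = [name for name in names if name in ds.variables]
    return ds.drop_vars(present)

# usage in a preprocessor: ds = drop_vars_if_present(ds, ["time_counter", "tmask"])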
From 53c6683ce960f9987b89675bf733148bf61539da Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Wed, 14 Aug 2024 17:13:33 -0600
Subject: [PATCH 61/65] pooch concurrent test

---
 configs/config_dataflow.py |  4 ++--
 feedstock/eNATL60.py       | 30 ++++++++++++------------------
 2 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 8f2f236..8625ec2 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -7,10 +7,10 @@
 c.Bake.prune = False
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
-c.DataflowBakery.machine_type = "e2-highmem-16"
+c.DataflowBakery.machine_type = "e2-highmem-4"
 c.DataflowBakery.disk_size_gb = 400
 c.DataflowBakery.use_shuffle = False
-c.DataflowBakery.max_num_workers = 1
+# c.DataflowBakery.max_num_workers = 120
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (
     "leap-community-bakery@leap-pangeo.iam.gserviceaccount.com"
diff --git a/feedstock/eNATL60.py b/feedstock/eNATL60.py
index 9b6a639..22c9f02 100644
--- a/feedstock/eNATL60.py
+++ b/feedstock/eNATL60.py
@@ -3,12 +3,6 @@
 import apache_beam as beam
 import pooch
 from pangeo_forge_recipes.patterns import ConcatDim, FilePattern
-from pangeo_forge_recipes.transforms import (
-    ConsolidateMetadata,
-    ConsolidateDimensionCoordinates,
-    OpenWithXarray,
-    StoreToZarr,
-)
 
 from leap_data_management_utils.data_management_transforms import (
     get_catalog_store_urls,
@@ -89,17 +83,17 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
     beam.Create(pattern.items())
     # | OpenURLWithFSSpec(max_concurrency=1)
     | OpenWithPooch()
-    | OpenWithXarray()
-    # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
-    # load=True,
-    # copy_to_local=True,)
-    | Preprocess()
-    | StoreToZarr(
-        store_name="eNATL60-BLBT02-shuffle.zarr",
-        combine_dims=pattern.combine_dim_keys,
-        target_chunks={"time": 30, "y": 900, "x": 900},
-    )
-    | ConsolidateDimensionCoordinates()
-    | ConsolidateMetadata()
+    # | OpenWithXarray()
+    # # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"},
+    # # load=True,
+    # # copy_to_local=True,)
+    # | Preprocess()
+    # | StoreToZarr(
+    #     store_name="eNATL60-BLBT02-shuffle.zarr",
+    #     combine_dims=pattern.combine_dim_keys,
+    #     target_chunks={"time": 30, "y": 900, "x": 900},
+    # )
+    # | ConsolidateDimensionCoordinates()
+    # | ConsolidateMetadata()
     # | Copy(target=catalog_store_urls["enatl60-blbt02"])
 )
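Patch 61 strips the pipeline back to the download stage so the pooch-based transfer can be exercised in isolation. For reference, the call at the core of OpenWithPooch amounts to roughly the following (the URL here is illustrative):

import pooch

# Fetch the file into pooch's local cache and return its path. known_hash=None
# skips checksum verification; a repeat call for the same URL reuses the
# cached copy instead of downloading again.
local_path = pooch.retrieve(
    url="https://example.com/eNATL60/eNATL60-BLBT02_20090701.nc",
    known_hash=None,
)
print(local_path)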
"depthw", "nav_lon", "nav_lat"]) - - # # convert cftime.DatetimeGregorian to datetime64[ns] - # ds["time"] = ds.indexes["time"].to_datetimeindex() - + ds = ds.rename({"time_counter": "time"}) + ds = ds.set_coords(("nav_lat", "nav_lon")) + ds.attrs["deptht"] = ds.deptht.values[0] + ds = ds.drop("deptht") + ds = ds[["vosaline", "votemper", "vovecrtz"]] return ds def expand(self, pcoll: beam.PCollection) -> beam.PCollection: @@ -83,17 +87,17 @@ def expand(self, pcoll: beam.PCollection) -> beam.PCollection: beam.Create(pattern.items()) # | OpenURLWithFSSpec(max_concurrency=1) | OpenWithPooch() - # | OpenWithXarray() - # # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"}, - # # load=True, - # # copy_to_local=True,) - # | Preprocess() - # | StoreToZarr( - # store_name="eNATL60-BLBT02-shuffle.zarr", - # combine_dims=pattern.combine_dim_keys, - # target_chunks={"time": 30, "y": 900, "x": 900}, - # ) - # | ConsolidateDimensionCoordinates() - # | ConsolidateMetadata() - # | Copy(target=catalog_store_urls["enatl60-blbt02"]) + | OpenWithXarray() + # xarray_open_kwargs={"use_cftime": True, "engine": "netcdf4"}, + # load=True, + # copy_to_local=True,) + | Preprocess() + | StoreToZarr( + store_name="eNATL60-BLBT02.zarr", + combine_dims=pattern.combine_dim_keys, + target_chunks={"time": 30, "y": 900, "x": 900}, + ) + | ConsolidateDimensionCoordinates() + | ConsolidateMetadata() + | Copy(target=catalog_store_urls["enatl60-blbt02"]) ) From 8f3b2641df8789b2cb263b67773fd33b5c877245 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 15 Aug 2024 11:37:23 -0600 Subject: [PATCH 63/65] 1 worker --- configs/config_dataflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py index 8625ec2..ff9b8d8 100644 --- a/configs/config_dataflow.py +++ b/configs/config_dataflow.py @@ -10,7 +10,7 @@ c.DataflowBakery.machine_type = "e2-highmem-4" c.DataflowBakery.disk_size_gb = 400 c.DataflowBakery.use_shuffle = False -# c.DataflowBakery.max_num_workers = 120 +c.DataflowBakery.max_num_workers = 1 c.DataflowBakery.use_public_ips = True c.DataflowBakery.service_account_email = ( "leap-community-bakery@leap-pangeo.iam.gserviceaccount.com" From 5d32583a02c5d4c3a9fbcae8be1c4331153d7284 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 15 Aug 2024 12:10:40 -0600 Subject: [PATCH 64/65] config update --- configs/config_dataflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py index ff9b8d8..8f2f236 100644 --- a/configs/config_dataflow.py +++ b/configs/config_dataflow.py @@ -7,7 +7,7 @@ c.Bake.prune = False c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery" c.DataflowBakery.use_dataflow_prime = False -c.DataflowBakery.machine_type = "e2-highmem-4" +c.DataflowBakery.machine_type = "e2-highmem-16" c.DataflowBakery.disk_size_gb = 400 c.DataflowBakery.use_shuffle = False c.DataflowBakery.max_num_workers = 1 From 1c335074e8347ec5619a44baa00d1f32caddf377 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Thu, 22 Aug 2024 13:42:51 -0600 Subject: [PATCH 65/65] incorporate review + smaller worker + prune --- .github/workflows/deploy_recipe.yaml | 2 +- configs/config_dataflow.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/deploy_recipe.yaml b/.github/workflows/deploy_recipe.yaml index 4500ab1..d3ec80f 100644 --- a/.github/workflows/deploy_recipe.yaml +++ b/.github/workflows/deploy_recipe.yaml @@ -25,7 +25,7 @@ 
From 1c335074e8347ec5619a44baa00d1f32caddf377 Mon Sep 17 00:00:00 2001
From: Raphael Hagen
Date: Thu, 22 Aug 2024 13:42:51 -0600
Subject: [PATCH 65/65] incorporate review + smaller worker + prune

---
 .github/workflows/deploy_recipe.yaml | 2 +-
 configs/config_dataflow.py           | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/deploy_recipe.yaml b/.github/workflows/deploy_recipe.yaml
index 4500ab1..d3ec80f 100644
--- a/.github/workflows/deploy_recipe.yaml
+++ b/.github/workflows/deploy_recipe.yaml
@@ -25,7 +25,7 @@ jobs:
      - name: "Install dependencies"
        run: |
          python -m pip install --upgrade pip
-         pip install git+https://github.com/leap-stc/pangeo-forge-runner
+         pip install pangeo-forge-runner
      - name: "Deploy recipes"
        run: |
          pangeo-forge-runner bake \
diff --git a/configs/config_dataflow.py b/configs/config_dataflow.py
index 8f2f236..ffd7b02 100644
--- a/configs/config_dataflow.py
+++ b/configs/config_dataflow.py
@@ -4,12 +4,11 @@
 repo_path = os.environ['GITHUB_REPOSITORY']
 FEEDSTOCK_NAME = repo_path.split('/')[-1]
 
-c.Bake.prune = False
+c.Bake.prune = True
 c.Bake.bakery_class = "pangeo_forge_runner.bakery.dataflow.DataflowBakery"
 c.DataflowBakery.use_dataflow_prime = False
-c.DataflowBakery.machine_type = "e2-highmem-16"
+c.DataflowBakery.machine_type = "e2-highmem-8"  # 1 year had max 50GB of ram on single worker. This is 64GB
 c.DataflowBakery.disk_size_gb = 400
-c.DataflowBakery.use_shuffle = False
 c.DataflowBakery.max_num_workers = 1
 c.DataflowBakery.use_public_ips = True
 c.DataflowBakery.service_account_email = (