From d527845b19816445b4b2cbdb074e594a3b6d2a37 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 20 Nov 2024 13:21:37 -0500 Subject: [PATCH 01/15] Initial metadata implementation --- tests/test_cf_router.py | 48 ++++++++++--- xpublish_edr/plugin.py | 154 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 188 insertions(+), 14 deletions(-) diff --git a/tests/test_cf_router.py b/tests/test_cf_router.py index f1b5dc7..e8b0dd6 100644 --- a/tests/test_cf_router.py +++ b/tests/test_cf_router.py @@ -62,6 +62,32 @@ def test_cf_area_formats(cf_client): assert "csv" in data, "csv is not a valid format" +def test_cf_metadata_query(cf_client): + response = cf_client.get("/datasets/air/edr/") + assert response.status_code == 200, "Response did not return successfully" + data = response.json() + + assert data["id"] == "air", "The id should be air" + assert data["title"] == "4x daily NMC reanalysis (1948)", "The title is incorrect" + assert ( + data["description"] + == "Data is from NMC initialized reanalysis\n(4x/day). These are the 0.9950 sigma level values." + ), "The description is incorrect" + assert data["crs"] == ["EPSG:4326"], "The crs is incorrect" + assert set(data["output_formats"]) == { + "cf_covjson", + "nc", + "netcdf4", + "nc4", + "netcdf", + "csv", + "geojson", + }, "The output formats are incorrect" + assert ( + "position" in data["data_queries"] and "area" in data["data_queries"] + ), "The data queries are incorrect" + + def test_cf_position_query(cf_client, cf_air_dataset, cf_temp_dataset): x = 204 y = 44 @@ -122,14 +148,20 @@ def test_cf_position_query(cf_client, cf_air_dataset, cf_temp_dataset): axes = data["domain"]["axes"] - npt.assert_array_almost_equal( - axes["x"]["values"], - [[x]], - ), "Did not select nearby x coordinate" - npt.assert_array_almost_equal( - axes["y"]["values"], - [[y]], - ), "Did not select a nearby y coordinate" + ( + npt.assert_array_almost_equal( + axes["x"]["values"], + [[x]], + ), + "Did not select nearby x coordinate", + ) + ( + npt.assert_array_almost_equal( + axes["y"]["values"], + [[y]], + ), + "Did not select a nearby y coordinate", + ) temp_range = data["ranges"]["temp"] assert temp_range["type"] == "NdArray", "Response range should be a NdArray" diff --git a/xpublish_edr/plugin.py b/xpublish_edr/plugin.py index fc8fd71..ff82299 100644 --- a/xpublish_edr/plugin.py +++ b/xpublish_edr/plugin.py @@ -5,19 +5,20 @@ import importlib from typing import List +import cf_xarray.units # noqa import xarray as xr from fastapi import APIRouter, Depends, HTTPException, Request from xpublish import Dependencies, Plugin, hookimpl from xpublish_edr.formats.to_covjson import to_cf_covjson from xpublish_edr.geometry.area import select_by_area -from xpublish_edr.geometry.common import project_dataset +from xpublish_edr.geometry.common import dataset_crs, project_dataset from xpublish_edr.geometry.position import select_by_position from xpublish_edr.logger import logger from xpublish_edr.query import EDRQuery, edr_query -def position_formats(): +def output_formats(): """ Return response format functions from registered `xpublish_edr_position_formats` entry_points @@ -31,6 +32,29 @@ def position_formats(): return formats +def variable_description(variable: xr.DataArray): + """ + Return CF version of EDR Parameter metadata for a given xarray variable + """ + name = variable.attrs.get("name", None) + standard_name = variable.attrs.get("standard_name", name if name else "") + label = standard_name if not name else name + long_name = variable.attrs.get("long_name", "") + units = variable.attrs.get("units", "") + return { + "type": "Parameter", + "description": long_name, + "unit": { + "label": units, + }, + "observedProperty": { + "label": label, + "standard_name": standard_name, + "long_name": long_name, + }, + } + + class CfEdrPlugin(Plugin): """ OGC EDR compatible endpoints for Xpublish datasets @@ -57,7 +81,7 @@ def get_position_formats(): """ Returns the various supported formats for position queries """ - formats = {key: value.__doc__ for key, value in position_formats().items()} + formats = {key: value.__doc__ for key, value in output_formats().items()} return formats @@ -69,7 +93,7 @@ def get_area_formats(): """ Returns the various supported formats for area queries """ - formats = {key: value.__doc__ for key, value in position_formats().items()} + formats = {key: value.__doc__ for key, value in output_formats().items()} return formats @@ -80,6 +104,124 @@ def dataset_router(self, deps: Dependencies): """Register dataset level router for EDR endpoints""" router = APIRouter(prefix=self.app_router_prefix, tags=self.dataset_router_tags) + @router.get("/", summary="Collection metadata") + def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): + """ + Returns the collection metadata for the dataset + + There is no nested hierarchy in our router right now, so instead we return the metadata + for the current dataset as the a single collection. See the spec for more information: + https://docs.ogc.org/is/19-086r6/19-086r6.html#_162817c2-ccd7-43c9-b1ea-ad3aea1b4d6b + """ + id = dataset.attrs.get("_xpublish_id", "unknown") + title = dataset.attrs.get("title", "unknown") + description = dataset.attrs.get("description", "no description") + + crs = dataset_crs(dataset) + + available_output_formats = list(output_formats().keys()) + + ds_cf = dataset.cf + min_lon = ds_cf["X"].min().item() + max_lon = ds_cf["X"].max().item() + min_lat = ds_cf["Y"].min().item() + max_lat = ds_cf["Y"].max().item() + + spatial_extent = { + "bbox": [ + [ + min_lon, + min_lat, + max_lon, + max_lat, + ], + ], + } + + time_min = ds_cf["T"].min().dt.strftime("%Y-%m-%dT%H:%M:%SZ").values + time_max = ds_cf["T"].max().dt.strftime("%Y-%m-%dT%H:%M:%SZ").values + + temporal_extent = { + "interval": [ + str(time_min), + str(time_max), + ], + "values": [ + f"{time_min}/{time_max}", + ], + } + + parameters = { + k: variable_description(v) + for k, v in dataset.variables.items() + if "axis" not in v.attrs + } + + return { + "id": id, + "title": title, + "description": description, + "keywords": [], + "links": [], + "extent": { + "spatial": spatial_extent, + "temporal": temporal_extent, + }, + "data_queries": { + "position": { + "href": "/edr/position", + "hreflang": "en", + "rel": "data", + "templated": True, + "variables": { + "title": "Position query", + "description": "Returns position data based on WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa + "query_type": "position", + "coords": { + "type": "string", + "description": "WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa + "required": True, + }, + "output_format": available_output_formats, + "default_output_format": "cf_covjson", + "crs_details": [ + { + "crs": crs.to_string(), + "wkt": crs.to_wkt(), + }, + ], + }, + }, + "area": { + "href": "/edr/area", + "hreflang": "en", + "rel": "data", + "templated": True, + "variables": { + "title": "Area query", + "description": "Returns data in a polygon based on WKT `POLYGON(lon lat, ...)` coordinates", # noqa + "query_type": "position", + "coords": { + "type": "string", + "description": "WKT `POLYGON(lon lat, ...)` coordinates", + "required": True, + }, + "output_format": available_output_formats, + "default_output_format": "cf_covjson", + "crs_details": [ + { + "crs": crs.coordinate_system.name, + "wkt": crs.to_wkt(), + }, + ], + }, + }, + }, + "crs": [crs.to_string()], + "output_formats": available_output_formats, + "parameter_names": parameters, + } + @router.get("/position", summary="Position query") def get_position( request: Request, @@ -126,7 +268,7 @@ def get_position( if query.format: try: - format_fn = position_formats()[query.format] + format_fn = output_formats()[query.format] except KeyError: raise HTTPException( 404, @@ -182,7 +324,7 @@ def get_area( if query.format: try: - format_fn = position_formats()[query.format] + format_fn = output_formats()[query.format] except KeyError: raise HTTPException( 404, From 6c15c2794e6119385234d107ebb7b9c26257f594 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 20 Nov 2024 13:45:53 -0500 Subject: [PATCH 02/15] Initial extents with dask arrays --- xpublish_edr/plugin.py | 65 ++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/xpublish_edr/plugin.py b/xpublish_edr/plugin.py index ff82299..74b48f2 100644 --- a/xpublish_edr/plugin.py +++ b/xpublish_edr/plugin.py @@ -5,7 +5,6 @@ import importlib from typing import List -import cf_xarray.units # noqa import xarray as xr from fastapi import APIRouter, Depends, HTTPException, Request from xpublish import Dependencies, Plugin, hookimpl @@ -121,13 +120,21 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): available_output_formats = list(output_formats().keys()) - ds_cf = dataset.cf - min_lon = ds_cf["X"].min().item() - max_lon = ds_cf["X"].max().item() - min_lat = ds_cf["Y"].min().item() - max_lat = ds_cf["Y"].max().item() + extents = {} - spatial_extent = { + ds_cf = dataset.cf + if "longitude" and "latitude" in ds_cf: + min_lon = float(ds_cf["longitude"].min().values) + max_lon = float(ds_cf["longitude"].max().values) + min_lat = float(ds_cf["latitude"].min().values) + max_lat = float(ds_cf["latitude"].max().values) + else: + min_lon = float(ds_cf["X"].min().values) + max_lon = float(ds_cf["X"].max().values) + min_lat = float(ds_cf["Y"].min().values) + max_lat = float(ds_cf["Y"].max().values) + + extents["spatial"] = { "bbox": [ [ min_lon, @@ -138,18 +145,35 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): ], } - time_min = ds_cf["T"].min().dt.strftime("%Y-%m-%dT%H:%M:%SZ").values - time_max = ds_cf["T"].max().dt.strftime("%Y-%m-%dT%H:%M:%SZ").values + if "T" in ds_cf: + time_min = ds_cf["T"].min().dt.strftime("%Y-%m-%dT%H:%M:%S").values + time_max = ds_cf["T"].max().dt.strftime("%Y-%m-%dT%H:%M:%S").values - temporal_extent = { - "interval": [ - str(time_min), - str(time_max), - ], - "values": [ - f"{time_min}/{time_max}", - ], - } + extents["temporal"] = { + "interval": [ + str(time_min), + str(time_max), + ], + "values": [ + f"{time_min}/{time_max}", + ], + } + + if "Z" in ds_cf: + min_z = ds_cf["Z"].min() + max_z = ds_cf["Z"].max() + units = ds_cf["Z"].attrs.get("units", "unknown") + + extents["vertical"] = { + "interval": [ + min_z, + max_z, + ], + "values": [ + f"{min_z}/{max_z}", + ], + "units": units, + } parameters = { k: variable_description(v) @@ -163,10 +187,7 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): "description": description, "keywords": [], "links": [], - "extent": { - "spatial": spatial_extent, - "temporal": temporal_extent, - }, + "extent": extents, "data_queries": { "position": { "href": "/edr/position", From e3dc4a17f802a8f1562140249e8589a3d8ee77aa Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 20 Nov 2024 15:33:21 -0500 Subject: [PATCH 03/15] More accurate to the spec version of the metadata --- tests/test_cf_router.py | 21 ++++++++ xpublish_edr/geometry/common.py | 3 ++ xpublish_edr/plugin.py | 95 +++++++++++++++++++++------------ 3 files changed, 84 insertions(+), 35 deletions(-) diff --git a/tests/test_cf_router.py b/tests/test_cf_router.py index e8b0dd6..67d2178 100644 --- a/tests/test_cf_router.py +++ b/tests/test_cf_router.py @@ -87,6 +87,27 @@ def test_cf_metadata_query(cf_client): "position" in data["data_queries"] and "area" in data["data_queries"] ), "The data queries are incorrect" + assert ( + "temporal" and "spatial" in data["extent"] + ), "Temporal and spatial extents should be present in extent" + assert ( + "vertical" not in data["extent"] + ), "Vertical extent should not be present in extent" + + assert data["extent"]["temporal"]["interval"] == [ + "2013-01-01T00:00:00", + "2013-01-01T18:00:00", + ], "Temporal interval is incorrect" + assert ( + data["extent"]["temporal"]["values"][0] + == "2013-01-01T00:00:00/2013-01-01T18:00:00" + ), "Temporal values are incorrect" + + assert data["extent"]["spatial"]["bbox"] == [ + [200.0, 15.0, 322.5, 75.0], + ], "Spatial bbox is incorrect" + assert data["extent"]["spatial"]["crs"] == "EPSG:4326", "Spatial CRS is incorrect" + def test_cf_position_query(cf_client, cf_air_dataset, cf_temp_dataset): x = 204 diff --git a/xpublish_edr/geometry/common.py b/xpublish_edr/geometry/common.py index bd2e00c..6c944e6 100644 --- a/xpublish_edr/geometry/common.py +++ b/xpublish_edr/geometry/common.py @@ -16,6 +16,9 @@ transformer_from_crs = lru_cache(pyproj.Transformer.from_crs) +DEFAULT_CRS = pyproj.CRS.from_epsg(4326) + + def coord_is_regular(da: xr.DataArray) -> bool: """ Check if the DataArray has a regular grid diff --git a/xpublish_edr/plugin.py b/xpublish_edr/plugin.py index 74b48f2..8563593 100644 --- a/xpublish_edr/plugin.py +++ b/xpublish_edr/plugin.py @@ -11,7 +11,7 @@ from xpublish_edr.formats.to_covjson import to_cf_covjson from xpublish_edr.geometry.area import select_by_area -from xpublish_edr.geometry.common import dataset_crs, project_dataset +from xpublish_edr.geometry.common import DEFAULT_CRS, dataset_crs, project_dataset from xpublish_edr.geometry.position import select_by_position from xpublish_edr.logger import logger from xpublish_edr.query import EDRQuery, edr_query @@ -120,29 +120,45 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): available_output_formats = list(output_formats().keys()) - extents = {} - ds_cf = dataset.cf - if "longitude" and "latitude" in ds_cf: - min_lon = float(ds_cf["longitude"].min().values) - max_lon = float(ds_cf["longitude"].max().values) - min_lat = float(ds_cf["latitude"].min().values) - max_lat = float(ds_cf["latitude"].max().values) + axes = ds_cf.axes + + # We will use the dataset's CRS as the default CRS for the extents, + # but override when it makes sense. + extent_crs = crs + + if len(axes["X"]) > 1: + if "latitude" and "longitude" in ds_cf: + min_lon = float(ds_cf["longitude"].min().values) + max_lon = float(ds_cf["longitude"].max().values) + min_lat = float(ds_cf["latitude"].min().values) + max_lat = float(ds_cf["latitude"].max().values) + + # When we are explicitly using latitude and longitude, we should use WGS84 + extent_crs = DEFAULT_CRS + else: + raise HTTPException( + status_code=404, + detail="Dataset does not have EDR compliant metadata: Multiple X axes found", + ) else: min_lon = float(ds_cf["X"].min().values) max_lon = float(ds_cf["X"].max().values) min_lat = float(ds_cf["Y"].min().values) max_lat = float(ds_cf["Y"].max().values) - extents["spatial"] = { - "bbox": [ - [ - min_lon, - min_lat, - max_lon, - max_lat, + extents: dict = { + "spatial": { + "bbox": [ + [ + min_lon, + min_lat, + max_lon, + max_lat, + ], ], - ], + "crs": extent_crs.to_string(), + }, } if "T" in ds_cf: @@ -157,21 +173,25 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): "values": [ f"{time_min}/{time_max}", ], + "trs": 'TIMECRS["DateTime",TDATUM["Gregorian Calendar"],CS[TemporalDateTime,1],AXIS["Time (T)",unspecified]]', # noqa } if "Z" in ds_cf: - min_z = ds_cf["Z"].min() - max_z = ds_cf["Z"].max() units = ds_cf["Z"].attrs.get("units", "unknown") + positive = ds_cf["Z"].attrs.get("positive", "up") + elevations = ds_cf["Z"].values + min_z = elevations.min() + max_z = elevations.max() + elevation_values = ",".join([str(e) for e in elevations]) extents["vertical"] = { "interval": [ min_z, max_z, ], - "values": [ - f"{min_z}/{max_z}", - ], + "values": elevation_values, + "vrs": f"VERTCRS[VERT_CS['unknown'],AXIS['Z',{positive}],UNIT[{units},1]]", # noqa + "positive": positive, "units": units, } @@ -181,16 +201,31 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): if "axis" not in v.attrs } + crs_details = [ + { + "crs": crs.to_string(), + "wkt": crs.to_wkt(), + }, + ] + + # 4326 is always available + if crs != DEFAULT_CRS: + crs_details.append( + { + "crs": DEFAULT_CRS.to_string(), + "wkt": DEFAULT_CRS.to_wkt(), + }, + ) + return { "id": id, "title": title, "description": description, - "keywords": [], "links": [], "extent": extents, "data_queries": { "position": { - "href": "/edr/position", + "href": "/edr/position?coords={coords}", "hreflang": "en", "rel": "data", "templated": True, @@ -205,12 +240,7 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): }, "output_format": available_output_formats, "default_output_format": "cf_covjson", - "crs_details": [ - { - "crs": crs.to_string(), - "wkt": crs.to_wkt(), - }, - ], + "crs_details": crs_details, }, }, "area": { @@ -229,12 +259,7 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): }, "output_format": available_output_formats, "default_output_format": "cf_covjson", - "crs_details": [ - { - "crs": crs.coordinate_system.name, - "wkt": crs.to_wkt(), - }, - ], + "crs_details": crs_details, }, }, }, From 8f5542245e599b88fbba407d27a7f944e68d1bee Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 20 Nov 2024 16:22:29 -0500 Subject: [PATCH 04/15] Update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d5d09fa..4854854 100644 --- a/README.md +++ b/README.md @@ -55,9 +55,9 @@ This package attempts to follow [the spec](https://docs.ogc.org/is/19-086r6/19-0 ### [collections](https://docs.ogc.org/is/19-086r6/19-086r6.html#_e55ba0f5-8f24-4f1b-a7e3-45775e39ef2e) and Resource Paths Support -`xpublish-edr` does not currently support the `/collections/{collectionId}/query` path template described in the spec. Instead the path resource appears as `/{dataset_id}/query`. This is because of the path structure of xpublish. +`xpublish-edr` does not currently support the `/collections/{collectionId}/query` path template described in the spec. Instead the path resource appears as `/{dataset_id}/edr/{query}`. This is because of the path structure of xpublish. In the future, if `xpublish` supports [`DataTree`](https://docs.xarray.dev/en/stable/generated/xarray.DataTree.html) it could provide a path to supporting the spec compliant `collections` resource path. -In the future, when `xpublish` supports [`DataTree`](https://docs.xarray.dev/en/stable/generated/xarray.DataTree.html) it will provide a path to supporting the spec compliant `collections` resource path. + However, despite the collections resource not existing, this implementation supports [collection metadata](https://docs.ogc.org/is/19-086r6/19-086r6.html#_5d07dde9-231a-4652-a1f3-dd036c337bdc) at the dataset level through the `/{dataset_id}/edr/` resource. ### Supported Queries From f6f671c61c9c7f5966b7e6a5071479815dbe5db6 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 20 Nov 2024 16:26:01 -0500 Subject: [PATCH 05/15] More tests --- tests/test_cf_router.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_cf_router.py b/tests/test_cf_router.py index 67d2178..2a38c29 100644 --- a/tests/test_cf_router.py +++ b/tests/test_cf_router.py @@ -109,6 +109,16 @@ def test_cf_metadata_query(cf_client): assert data["extent"]["spatial"]["crs"] == "EPSG:4326", "Spatial CRS is incorrect" +def test_cf_metadata_query_temp_smoke_test(cf_client): + response = cf_client.get("/datasets/temp/edr/") + assert response.status_code == 200, "Response did not return successfully" + data = response.json() + + assert data["id"] == "temp", "The id should be temp" + for key in ("title", "description", "crs", "extent", "output_formats", "data_queries"): + assert key in data, f"Key {key} is not a top level key in the metadata response" + + def test_cf_position_query(cf_client, cf_air_dataset, cf_temp_dataset): x = 204 y = 44 From 9354fda75bb6cd4244822c193073e2ebb25daaf2 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 20 Nov 2024 16:28:24 -0500 Subject: [PATCH 06/15] Suggestions, formatting --- tests/test_cf_router.py | 9 ++++++++- xpublish_edr/plugin.py | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/test_cf_router.py b/tests/test_cf_router.py index 2a38c29..5d729fb 100644 --- a/tests/test_cf_router.py +++ b/tests/test_cf_router.py @@ -115,7 +115,14 @@ def test_cf_metadata_query_temp_smoke_test(cf_client): data = response.json() assert data["id"] == "temp", "The id should be temp" - for key in ("title", "description", "crs", "extent", "output_formats", "data_queries"): + for key in ( + "title", + "description", + "crs", + "extent", + "output_formats", + "data_queries", + ): assert key in data, f"Key {key} is not a top level key in the metadata response" diff --git a/xpublish_edr/plugin.py b/xpublish_edr/plugin.py index 8563593..9024414 100644 --- a/xpublish_edr/plugin.py +++ b/xpublish_edr/plugin.py @@ -128,7 +128,7 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): extent_crs = crs if len(axes["X"]) > 1: - if "latitude" and "longitude" in ds_cf: + if "latitude" in ds_cf and "longitude" in ds_cf: min_lon = float(ds_cf["longitude"].min().values) max_lon = float(ds_cf["longitude"].max().values) min_lat = float(ds_cf["latitude"].min().values) @@ -173,6 +173,7 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): "values": [ f"{time_min}/{time_max}", ], + # TODO: parse `ds.cf["time"].dt.calendar` "trs": 'TIMECRS["DateTime",TDATUM["Gregorian Calendar"],CS[TemporalDateTime,1],AXIS["Time (T)",unspecified]]', # noqa } From da3643f0dbe42c2de7d0bab1a343579c4ebce662 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 21 Nov 2024 09:38:58 -0500 Subject: [PATCH 07/15] Use projected dataset for extents --- xpublish_edr/geometry/common.py | 21 +++++++++++++-- xpublish_edr/plugin.py | 47 +++++++++------------------------ 2 files changed, 32 insertions(+), 36 deletions(-) diff --git a/xpublish_edr/geometry/common.py b/xpublish_edr/geometry/common.py index 6c944e6..7eac8c2 100644 --- a/xpublish_edr/geometry/common.py +++ b/xpublish_edr/geometry/common.py @@ -33,6 +33,20 @@ def is_regular_xy_coords(ds: xr.Dataset) -> bool: return coord_is_regular(ds.cf["X"]) and coord_is_regular(ds.cf["Y"]) +def spatial_bounds(ds: xr.Dataset) -> tuple[float, float, float, float]: + """ + Get the spatial bounds of the dataset, naively, in whatever CRS it is in + """ + x = ds.cf["X"] + min_x = float(x.min().values) + max_x = float(x.max().values) + + y = ds.cf["Y"] + min_y = float(y.min().values) + max_y = float(y.max().values) + return min_x, min_y, max_x, max_y + + def dataset_crs(ds: xr.Dataset) -> pyproj.CRS: grid_mapping_names = ds.cf.grid_mapping_names if len(grid_mapping_names) == 0: @@ -64,12 +78,15 @@ def project_geometry(ds: xr.Dataset, geometry_crs: str, geometry: Geometry) -> G return transform(transformer.transform, geometry) -def project_dataset(ds: xr.Dataset, query_crs: str) -> xr.Dataset: +def project_dataset(ds: xr.Dataset, query_crs: str | pyproj.CRS) -> xr.Dataset: """ Project the dataset to the given CRS """ data_crs = dataset_crs(ds) - target_crs = pyproj.CRS.from_string(query_crs) + if isinstance(query_crs, pyproj.CRS): + target_crs = query_crs + else: + target_crs = pyproj.CRS.from_string(query_crs) if data_crs == target_crs: return ds diff --git a/xpublish_edr/plugin.py b/xpublish_edr/plugin.py index 9024414..2275c5c 100644 --- a/xpublish_edr/plugin.py +++ b/xpublish_edr/plugin.py @@ -11,7 +11,12 @@ from xpublish_edr.formats.to_covjson import to_cf_covjson from xpublish_edr.geometry.area import select_by_area -from xpublish_edr.geometry.common import DEFAULT_CRS, dataset_crs, project_dataset +from xpublish_edr.geometry.common import ( + DEFAULT_CRS, + dataset_crs, + project_dataset, + spatial_bounds, +) from xpublish_edr.geometry.position import select_by_position from xpublish_edr.logger import logger from xpublish_edr.query import EDRQuery, edr_query @@ -121,42 +126,16 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): available_output_formats = list(output_formats().keys()) ds_cf = dataset.cf - axes = ds_cf.axes - - # We will use the dataset's CRS as the default CRS for the extents, - # but override when it makes sense. - extent_crs = crs - - if len(axes["X"]) > 1: - if "latitude" in ds_cf and "longitude" in ds_cf: - min_lon = float(ds_cf["longitude"].min().values) - max_lon = float(ds_cf["longitude"].max().values) - min_lat = float(ds_cf["latitude"].min().values) - max_lat = float(ds_cf["latitude"].max().values) - - # When we are explicitly using latitude and longitude, we should use WGS84 - extent_crs = DEFAULT_CRS - else: - raise HTTPException( - status_code=404, - detail="Dataset does not have EDR compliant metadata: Multiple X axes found", - ) - else: - min_lon = float(ds_cf["X"].min().values) - max_lon = float(ds_cf["X"].max().values) - min_lat = float(ds_cf["Y"].min().values) - max_lat = float(ds_cf["Y"].max().values) + + # We will use the dataset's CRS as the default CRS, but use 4326 for the extents + # since it is always available + extent_crs = DEFAULT_CRS + projected_ds = project_dataset(dataset, DEFAULT_CRS) + bounds = spatial_bounds(projected_ds) extents: dict = { "spatial": { - "bbox": [ - [ - min_lon, - min_lat, - max_lon, - max_lat, - ], - ], + "bbox": [bounds], "crs": extent_crs.to_string(), }, } From 810222dba2cb518b894e8a98c52e4c99c4551ebc Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 21 Nov 2024 09:59:29 -0500 Subject: [PATCH 08/15] Better coordinate handling in projections --- xpublish_edr/geometry/common.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/xpublish_edr/geometry/common.py b/xpublish_edr/geometry/common.py index 7eac8c2..ee5844a 100644 --- a/xpublish_edr/geometry/common.py +++ b/xpublish_edr/geometry/common.py @@ -96,15 +96,23 @@ def project_dataset(ds: xr.Dataset, query_crs: str | pyproj.CRS) -> xr.Dataset: always_xy=True, ) - # TODO: Handle rotated pole - cf_coords = target_crs.coordinate_system.to_cf() - - # Get the new X and Y coordinates - target_y_coord = next(coord for coord in cf_coords if coord["axis"] == "Y") - target_x_coord = next(coord for coord in cf_coords if coord["axis"] == "X") - - X = ds.cf["X"] - Y = ds.cf["Y"] + # Unpack the coordinates + try: + X = ds.cf["X"] + Y = ds.cf["Y"] + except KeyError: + # If the dataset has multiple X axes, we can try to find the right one + source_cf_coords = data_crs.coordinate_system.to_cf() + + source_x_coord = next( + coord["standard_name"] for coord in source_cf_coords if coord["axis"] == "X" + ) + source_y_coord = next( + coord["standard_name"] for coord in source_cf_coords if coord["axis"] == "Y" + ) + + X = ds.cf[source_x_coord] + Y = ds.cf[source_y_coord] # Transform the coordinates # If the data is vectorized, we just transform the points in full @@ -124,6 +132,13 @@ def project_dataset(ds: xr.Dataset, query_crs: str | pyproj.CRS) -> xr.Dataset: c for c in ds.coords if x_dim in ds[c].dims or y_dim in ds[c].dims ] + # TODO: Handle rotated pole + target_cf_coords = target_crs.coordinate_system.to_cf() + + # Get the new X and Y coordinates + target_x_coord = next(coord for coord in target_cf_coords if coord["axis"] == "X") + target_y_coord = next(coord for coord in target_cf_coords if coord["axis"] == "Y") + target_x_coord_name = target_x_coord["standard_name"] target_y_coord_name = target_y_coord["standard_name"] From b8970f7ec19367ded21d43d31176d3b70313e191 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 21 Nov 2024 10:05:06 -0500 Subject: [PATCH 09/15] Adjust typing --- xpublish_edr/geometry/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xpublish_edr/geometry/common.py b/xpublish_edr/geometry/common.py index ee5844a..a96a446 100644 --- a/xpublish_edr/geometry/common.py +++ b/xpublish_edr/geometry/common.py @@ -4,6 +4,7 @@ import itertools from functools import lru_cache +from typing import Union import pyproj import xarray as xr @@ -78,7 +79,7 @@ def project_geometry(ds: xr.Dataset, geometry_crs: str, geometry: Geometry) -> G return transform(transformer.transform, geometry) -def project_dataset(ds: xr.Dataset, query_crs: str | pyproj.CRS) -> xr.Dataset: +def project_dataset(ds: xr.Dataset, query_crs: Union[str, pyproj.CRS]) -> xr.Dataset: """ Project the dataset to the given CRS """ From 3a625ea00e653d688d13b2d4d42ea0c7136d5d18 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 21 Nov 2024 10:30:54 -0500 Subject: [PATCH 10/15] Better parameter filtering --- tests/test_cf_router.py | 4 ++++ xpublish_edr/plugin.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_cf_router.py b/tests/test_cf_router.py index 5d729fb..76cf29e 100644 --- a/tests/test_cf_router.py +++ b/tests/test_cf_router.py @@ -108,6 +108,10 @@ def test_cf_metadata_query(cf_client): ], "Spatial bbox is incorrect" assert data["extent"]["spatial"]["crs"] == "EPSG:4326", "Spatial CRS is incorrect" + assert "air" in data["parameter_names"], "Air parameter should be present" + assert "lat" not in data["parameter_names"], "lat should not be present" + assert "lon" not in data["parameter_names"], "lon should not be present" + def test_cf_metadata_query_temp_smoke_test(cf_client): response = cf_client.get("/datasets/temp/edr/") diff --git a/xpublish_edr/plugin.py b/xpublish_edr/plugin.py index 2275c5c..a83123e 100644 --- a/xpublish_edr/plugin.py +++ b/xpublish_edr/plugin.py @@ -177,7 +177,7 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): parameters = { k: variable_description(v) - for k, v in dataset.variables.items() + for k, v in projected_ds.variables.items() if "axis" not in v.attrs } From 51a27449ea03c9e0ca4b343fe1f72d3a8a1e18dd Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 21 Nov 2024 14:49:09 -0500 Subject: [PATCH 11/15] Extract metadata logic placeholder --- xpublish_edr/plugin.py | 161 +---------------------------------------- 1 file changed, 3 insertions(+), 158 deletions(-) diff --git a/xpublish_edr/plugin.py b/xpublish_edr/plugin.py index a83123e..3eabf9f 100644 --- a/xpublish_edr/plugin.py +++ b/xpublish_edr/plugin.py @@ -11,14 +11,10 @@ from xpublish_edr.formats.to_covjson import to_cf_covjson from xpublish_edr.geometry.area import select_by_area -from xpublish_edr.geometry.common import ( - DEFAULT_CRS, - dataset_crs, - project_dataset, - spatial_bounds, -) +from xpublish_edr.geometry.common import project_dataset from xpublish_edr.geometry.position import select_by_position from xpublish_edr.logger import logger +from xpublish_edr.metadata import collection_metadata from xpublish_edr.query import EDRQuery, edr_query @@ -36,29 +32,6 @@ def output_formats(): return formats -def variable_description(variable: xr.DataArray): - """ - Return CF version of EDR Parameter metadata for a given xarray variable - """ - name = variable.attrs.get("name", None) - standard_name = variable.attrs.get("standard_name", name if name else "") - label = standard_name if not name else name - long_name = variable.attrs.get("long_name", "") - units = variable.attrs.get("units", "") - return { - "type": "Parameter", - "description": long_name, - "unit": { - "label": units, - }, - "observedProperty": { - "label": label, - "standard_name": standard_name, - "long_name": long_name, - }, - } - - class CfEdrPlugin(Plugin): """ OGC EDR compatible endpoints for Xpublish datasets @@ -117,136 +90,8 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): for the current dataset as the a single collection. See the spec for more information: https://docs.ogc.org/is/19-086r6/19-086r6.html#_162817c2-ccd7-43c9-b1ea-ad3aea1b4d6b """ - id = dataset.attrs.get("_xpublish_id", "unknown") - title = dataset.attrs.get("title", "unknown") - description = dataset.attrs.get("description", "no description") - - crs = dataset_crs(dataset) - available_output_formats = list(output_formats().keys()) - - ds_cf = dataset.cf - - # We will use the dataset's CRS as the default CRS, but use 4326 for the extents - # since it is always available - extent_crs = DEFAULT_CRS - projected_ds = project_dataset(dataset, DEFAULT_CRS) - bounds = spatial_bounds(projected_ds) - - extents: dict = { - "spatial": { - "bbox": [bounds], - "crs": extent_crs.to_string(), - }, - } - - if "T" in ds_cf: - time_min = ds_cf["T"].min().dt.strftime("%Y-%m-%dT%H:%M:%S").values - time_max = ds_cf["T"].max().dt.strftime("%Y-%m-%dT%H:%M:%S").values - - extents["temporal"] = { - "interval": [ - str(time_min), - str(time_max), - ], - "values": [ - f"{time_min}/{time_max}", - ], - # TODO: parse `ds.cf["time"].dt.calendar` - "trs": 'TIMECRS["DateTime",TDATUM["Gregorian Calendar"],CS[TemporalDateTime,1],AXIS["Time (T)",unspecified]]', # noqa - } - - if "Z" in ds_cf: - units = ds_cf["Z"].attrs.get("units", "unknown") - positive = ds_cf["Z"].attrs.get("positive", "up") - elevations = ds_cf["Z"].values - min_z = elevations.min() - max_z = elevations.max() - elevation_values = ",".join([str(e) for e in elevations]) - - extents["vertical"] = { - "interval": [ - min_z, - max_z, - ], - "values": elevation_values, - "vrs": f"VERTCRS[VERT_CS['unknown'],AXIS['Z',{positive}],UNIT[{units},1]]", # noqa - "positive": positive, - "units": units, - } - - parameters = { - k: variable_description(v) - for k, v in projected_ds.variables.items() - if "axis" not in v.attrs - } - - crs_details = [ - { - "crs": crs.to_string(), - "wkt": crs.to_wkt(), - }, - ] - - # 4326 is always available - if crs != DEFAULT_CRS: - crs_details.append( - { - "crs": DEFAULT_CRS.to_string(), - "wkt": DEFAULT_CRS.to_wkt(), - }, - ) - - return { - "id": id, - "title": title, - "description": description, - "links": [], - "extent": extents, - "data_queries": { - "position": { - "href": "/edr/position?coords={coords}", - "hreflang": "en", - "rel": "data", - "templated": True, - "variables": { - "title": "Position query", - "description": "Returns position data based on WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa - "query_type": "position", - "coords": { - "type": "string", - "description": "WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa - "required": True, - }, - "output_format": available_output_formats, - "default_output_format": "cf_covjson", - "crs_details": crs_details, - }, - }, - "area": { - "href": "/edr/area", - "hreflang": "en", - "rel": "data", - "templated": True, - "variables": { - "title": "Area query", - "description": "Returns data in a polygon based on WKT `POLYGON(lon lat, ...)` coordinates", # noqa - "query_type": "position", - "coords": { - "type": "string", - "description": "WKT `POLYGON(lon lat, ...)` coordinates", - "required": True, - }, - "output_format": available_output_formats, - "default_output_format": "cf_covjson", - "crs_details": crs_details, - }, - }, - }, - "crs": [crs.to_string()], - "output_formats": available_output_formats, - "parameter_names": parameters, - } + return collection_metadata(dataset, available_output_formats) @router.get("/position", summary="Position query") def get_position( From b917693debf8f6ad18d0f93e8677d4251f3713b3 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 21 Nov 2024 14:54:33 -0500 Subject: [PATCH 12/15] forgot metadata file --- xpublish_edr/metadata.py | 218 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 xpublish_edr/metadata.py diff --git a/xpublish_edr/metadata.py b/xpublish_edr/metadata.py new file mode 100644 index 0000000..1b7a2b4 --- /dev/null +++ b/xpublish_edr/metadata.py @@ -0,0 +1,218 @@ +import pyproj +import xarray as xr + +from xpublish_edr.geometry.common import ( + DEFAULT_CRS, + dataset_crs, + project_dataset, + spatial_bounds, +) + + +def crs_description(crs: pyproj.CRS) -> dict: + """ + Return CF version of EDR CRS metadata + """ + return { + "crs": crs.to_string(), + "wkt": crs.to_wkt(), + } + + +def variable_description(da: xr.DataArray) -> dict: + """ + Return CF version of EDR Parameter metadata for a given xarray variable + """ + name = da.attrs.get("name", None) + standard_name = da.attrs.get("standard_name", name if name else "") + label = standard_name if not name else name + long_name = da.attrs.get("long_name", "") + units = da.attrs.get("units", "") + return { + "type": "Parameter", + "description": long_name, + "unit": { + "label": units, + }, + "observedProperty": { + "label": label, + "standard_name": standard_name, + "long_name": long_name, + }, + } + + +def extract_parameters(ds: xr.Dataset) -> dict: + """ + Extract the parameters from the dataset into collection metadata specific format + """ + return { + k: variable_description(v) + for k, v in ds.variables.items() + if "axis" not in v.attrs + } + + +def spatial_extent_description(ds: xr.Dataset, crs: pyproj.CRS) -> dict: + """ + Extract the spatial extent from the dataset into collection metadata specific format + """ + # We will use the dataset's CRS as the default CRS, but use 4326 for the extents + # since it is always available + bounds = spatial_bounds(ds) + + return { + "bbox": [bounds], + "crs": crs.to_string(), + } + + +def temporal_extent_description(ds: xr.Dataset) -> dict: + """ + Extract the temporal extent from the dataset into collection metadata specific format + """ + time_min = ds["T"].min().dt.strftime("%Y-%m-%dT%H:%M:%S").values + time_max = ds["T"].max().dt.strftime("%Y-%m-%dT%H:%M:%S").values + return { + "interval": [ + str(time_min), + str(time_max), + ], + "values": [ + f"{time_min}/{time_max}", + ], + # TODO: parse `ds.cf["time"].dt.calendar` + "trs": 'TIMECRS["DateTime",TDATUM["Gregorian Calendar"],CS[TemporalDateTime,1],AXIS["Time (T)",unspecified]]', # noqa + } + + +def vertical_extent_description(ds: xr.Dataset) -> dict: + """ + Extract the vertical extent from the dataset into collection metadata specific format + """ + elevations = ds.cf["Z"].values + units = ds.cf["Z"].attrs.get("units", "unknown") + positive = ds.cf["Z"].attrs.get("positive", "up") + min_z = elevations.min() + max_z = elevations.max() + elevation_values = ",".join([str(e) for e in elevations]) + + return { + "interval": [ + min_z, + max_z, + ], + "values": elevation_values, + "vrs": f"VERTCRS[VERT_CS['unknown'],AXIS['Z',{positive}],UNIT[{units},1]]", # noqa + "positive": positive, + "units": units, + } + + +def position_query_description( + output_formats: list[str], crs_details: list[dict] +) -> dict: + """ + Return CF version of EDR Position Query metadata + """ + return { + "href": "/edr/position", + "hreflang": "en", + "rel": "data", + "templated": True, + "variables": { + "title": "Position query", + "description": "Returns position data based on WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa + "query_type": "position", + "coords": { + "type": "string", + "description": "WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa + "required": True, + }, + "output_format": output_formats, + "default_output_format": "cf_covjson", + "crs_details": crs_details, + }, + } + + +def area_query_description(output_formats: list[str], crs_details: list[dict]) -> dict: + """ + Return CF version of EDR Area Query metadata + """ + return { + "href": "/edr/area?coords={coords}", + "hreflang": "en", + "rel": "data", + "templated": True, + "variables": { + "title": "Area query", + "description": "Returns data in a polygon based on WKT `POLYGON(lon lat, ...)` coordinates", # noqa + "query_type": "position", + "coords": { + "type": "string", + "description": "WKT `POLYGON(lon lat, ...)` coordinates", + "required": True, + }, + "output_format": output_formats, + "default_output_format": "cf_covjson", + "crs_details": crs_details, + }, + } + + +def collection_metadata(ds: xr.Dataset, output_formats: list[str]) -> dict: + """ + Returns the collection metadata for the dataset + There is no nested hierarchy in our router right now, so instead we return the metadata + for the current dataset as the a single collection. See the spec for more information: + https://docs.ogc.org/is/19-086r6/19-086r6.html#_162817c2-ccd7-43c9-b1ea-ad3aea1b4d6b + """ + id = ds.attrs.get("_xpublish_id", "unknown") + title = ds.attrs.get("title", "unknown") + description = ds.attrs.get("description", "no description") + + crs = dataset_crs(ds) + + ds_cf = ds.cf + + # We will use the dataset's CRS as the default CRS, but use 4326 for the extents + # since it is always available + projected_ds = project_dataset(ds, DEFAULT_CRS) + + extents: dict = { + "spatial": spatial_extent_description(projected_ds, DEFAULT_CRS), + } + + if "T" in ds_cf: + extents["temporal"] = temporal_extent_description(ds_cf) + + if "Z" in ds_cf: + extents["vertical"] = vertical_extent_description(ds_cf) + + parameters = extract_parameters(ds) + + crs_details = [ + crs_description(crs), + ] + + # 4326 is always available + if crs != DEFAULT_CRS: + crs_details.append( + crs_description(DEFAULT_CRS), + ) + + return { + "id": id, + "title": title, + "description": description, + "links": [], + "extent": extents, + "data_queries": { + "position": position_query_description(output_formats, crs_details), + "area": area_query_description(output_formats, crs_details), + }, + "crs": [crs.to_string()], + "output_formats": output_formats, + "parameter_names": parameters, + } From 522c3729b383eb88e270c13c7f89bbe6b693dc16 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 21 Nov 2024 19:55:36 +0000 Subject: [PATCH 13/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xpublish_edr/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xpublish_edr/metadata.py b/xpublish_edr/metadata.py index 1b7a2b4..e7fcdaf 100644 --- a/xpublish_edr/metadata.py +++ b/xpublish_edr/metadata.py @@ -110,7 +110,7 @@ def vertical_extent_description(ds: xr.Dataset) -> dict: def position_query_description( - output_formats: list[str], crs_details: list[dict] + output_formats: list[str], crs_details: list[dict], ) -> dict: """ Return CF version of EDR Position Query metadata From 4f5cbb9ec5142b0e37d8b43780ba3ef6a9a8bcda Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 21 Nov 2024 15:19:52 -0500 Subject: [PATCH 14/15] Add a whole bunch of pydantic --- xpublish_edr/metadata.py | 199 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 198 insertions(+), 1 deletion(-) diff --git a/xpublish_edr/metadata.py b/xpublish_edr/metadata.py index e7fcdaf..ca998f0 100644 --- a/xpublish_edr/metadata.py +++ b/xpublish_edr/metadata.py @@ -1,5 +1,8 @@ +from typing import Optional + import pyproj import xarray as xr +from pydantic import BaseModel, Field from xpublish_edr.geometry.common import ( DEFAULT_CRS, @@ -9,6 +12,199 @@ ) +class CRSDetails(BaseModel): + """OGC EDR CRS metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_7124ec17-6401-4eb7-ba1d-8ec329b7e677 + """ + + crs: str + wkt: str + + +class VariablesMetadata(BaseModel): + """OGC EDR Variables metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_1b54f97a-e1dc-4920-b8b4-e4981554138d + """ + + title: Optional[str] + description: Optional[str] + query_type: Optional[str] + coords: Optional[dict] + output_formats: Optional[list[str]] + default_output_format: Optional[str] + crs_details: Optional[list[CRSDetails]] + + +class Link(BaseModel): + """OGC EDR Link metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_ea77762b-89c0-4704-b845-748efc66e597 + """ + + href: str + rel: str + type_: Optional[str] = Field(None, serialization_alias="type") + hreflang: Optional[str] + title: Optional[str] + length: Optional[int] + templated: Optional[bool] + variables: Optional[VariablesMetadata] + + +class SpatialExtent(BaseModel): + """OGC EDR Spatial Extent metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_0afff399-4d8e-4a9b-961b-cab841d23cc1 + """ + + bbox: list[list[float]] + crs: str + + +class TemporalExtent(BaseModel): + """OGC EDR Temporal Extent metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_8f4c9f38-bc6a-4b98-8fd9-772e42d60ab2 + """ + + interval: list[str] + values: list[str] + trs: str + + +class VerticalExtent(BaseModel): + """OGC EDR Vertical Extent metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_52bf970b-315a-4a09-8b92-51757b584a62 + """ + + interval: list[float] + values: list[float] + vrs: str + + +class Extent(BaseModel): + """OGC EDR Extent metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_2a2d533f-6efe-48df-8056-2eca9deb848f + """ + + spatial: SpatialExtent + temporal: Optional[TemporalExtent] + vertical: Optional[VerticalExtent] + + +class EDRQueryMetadata(BaseModel): + """OGC EDR Query metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_9a6620ce-6093-4b1b-8f68-2e2c04a13746 + """ + + link: Link + + +class DataQueries(BaseModel): + """OGC EDR Data Queries metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_df2c080b-949c-40c3-ad14-d20228270c2d + """ + + position: Optional[EDRQueryMetadata] + radius: Optional[EDRQueryMetadata] + area: Optional[EDRQueryMetadata] + cube: Optional[EDRQueryMetadata] + trajectory: Optional[EDRQueryMetadata] + corridor: Optional[EDRQueryMetadata] + item: Optional[EDRQueryMetadata] + location: Optional[EDRQueryMetadata] + + +class SymbolMetadata(BaseModel): + """OGC EDR Symbol metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_3e50c10c-85bd-46d9-8e09-1c5fffffb055 + """ + + title: Optional[str] + description: Optional[str] + value: Optional[str] + type: Optional[str] + + +class UnitMetadata(BaseModel): + """OGC EDR Unit metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_5378d779-6a38-4607-9051-6f12c3d3107b + """ + + label: str + symbol: SymbolMetadata + + +class MeasurementType(BaseModel): + """OGC EDR Measurement Type metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_c81181d6-fd09-454e-9c00-a3bb3b21d592 + """ + + method: str + duration: str + + +class ObservedProperty(BaseModel): + """OGC EDR Observed Property metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_7e053ab4-5cde-4a5c-a8be-acc6495f9eb5 + """ + + id: Optional[str] + label: str + description: Optional[str] + + +class Parameter(BaseModel): + """OGC EDR Parameter metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_da400aef-f6ee-4d08-b36c-2f535d581d53 + """ + + id: Optional[str] + type_: str = Field(..., serialization_alias="type") + label: Optional[str] + description: Optional[str] + data_type: Optional[str] = Field(None, serialization_alias="data-type") + unit: Optional[UnitMetadata] + observed_property: ObservedProperty = Field( + ..., + serialization_alias="observedProperty", + ) + extent: Optional[Extent] + measurement_type: Optional[MeasurementType] = Field( + None, + serialization_alias="measurementType", + ) + + +class Collection(BaseModel): + """OGC EDR Collection metadata + + https://docs.ogc.org/is/19-086r6/19-086r6.html#_b6db449c-4ca7-4117-9bf4-241984cef569 + """ + + links: list[Link] + id: str + title: str + description: str + keywords: list[str] + extent: dict + data_queries: dict + crs: list[str] + output_formats: list[str] + parameter_names: dict + + def crs_description(crs: pyproj.CRS) -> dict: """ Return CF version of EDR CRS metadata @@ -110,7 +306,8 @@ def vertical_extent_description(ds: xr.Dataset) -> dict: def position_query_description( - output_formats: list[str], crs_details: list[dict], + output_formats: list[str], + crs_details: list[dict], ) -> dict: """ Return CF version of EDR Position Query metadata From 9c8c8087a8da0b2b4557acd7292e609267ed5147 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 21 Nov 2024 16:16:05 -0500 Subject: [PATCH 15/15] Full pydantic typesafe collection metadata --- xpublish_edr/metadata.py | 387 ++++++++++++++++++++------------------- xpublish_edr/plugin.py | 4 +- 2 files changed, 199 insertions(+), 192 deletions(-) diff --git a/xpublish_edr/metadata.py b/xpublish_edr/metadata.py index ca998f0..fccb4a7 100644 --- a/xpublish_edr/metadata.py +++ b/xpublish_edr/metadata.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Literal, Optional import pyproj import xarray as xr @@ -28,13 +28,13 @@ class VariablesMetadata(BaseModel): https://docs.ogc.org/is/19-086r6/19-086r6.html#_1b54f97a-e1dc-4920-b8b4-e4981554138d """ - title: Optional[str] - description: Optional[str] - query_type: Optional[str] - coords: Optional[dict] - output_formats: Optional[list[str]] - default_output_format: Optional[str] - crs_details: Optional[list[CRSDetails]] + title: Optional[str] = None + description: Optional[str] = None + query_type: Optional[str] = None + coords: Optional[dict] = None + output_formats: Optional[list[str]] = None + default_output_format: Optional[str] = None + crs_details: Optional[list[CRSDetails]] = None class Link(BaseModel): @@ -46,11 +46,11 @@ class Link(BaseModel): href: str rel: str type_: Optional[str] = Field(None, serialization_alias="type") - hreflang: Optional[str] - title: Optional[str] - length: Optional[int] - templated: Optional[bool] - variables: Optional[VariablesMetadata] + hreflang: Optional[str] = None + title: Optional[str] = None + length: Optional[int] = None + templated: Optional[bool] = None + variables: Optional[VariablesMetadata] = None class SpatialExtent(BaseModel): @@ -92,8 +92,8 @@ class Extent(BaseModel): """ spatial: SpatialExtent - temporal: Optional[TemporalExtent] - vertical: Optional[VerticalExtent] + temporal: Optional[TemporalExtent] = None + vertical: Optional[VerticalExtent] = None class EDRQueryMetadata(BaseModel): @@ -111,14 +111,14 @@ class DataQueries(BaseModel): https://docs.ogc.org/is/19-086r6/19-086r6.html#_df2c080b-949c-40c3-ad14-d20228270c2d """ - position: Optional[EDRQueryMetadata] - radius: Optional[EDRQueryMetadata] - area: Optional[EDRQueryMetadata] - cube: Optional[EDRQueryMetadata] - trajectory: Optional[EDRQueryMetadata] - corridor: Optional[EDRQueryMetadata] - item: Optional[EDRQueryMetadata] - location: Optional[EDRQueryMetadata] + position: Optional[EDRQueryMetadata] = None + radius: Optional[EDRQueryMetadata] = None + area: Optional[EDRQueryMetadata] = None + cube: Optional[EDRQueryMetadata] = None + trajectory: Optional[EDRQueryMetadata] = None + corridor: Optional[EDRQueryMetadata] = None + item: Optional[EDRQueryMetadata] = None + location: Optional[EDRQueryMetadata] = None class SymbolMetadata(BaseModel): @@ -127,10 +127,10 @@ class SymbolMetadata(BaseModel): https://docs.ogc.org/is/19-086r6/19-086r6.html#_3e50c10c-85bd-46d9-8e09-1c5fffffb055 """ - title: Optional[str] - description: Optional[str] - value: Optional[str] - type: Optional[str] + title: Optional[str] = None + description: Optional[str] = None + value: Optional[str] = None + type_: Optional[str] = Field(None, serialization_alias="type") class UnitMetadata(BaseModel): @@ -159,9 +159,9 @@ class ObservedProperty(BaseModel): https://docs.ogc.org/is/19-086r6/19-086r6.html#_7e053ab4-5cde-4a5c-a8be-acc6495f9eb5 """ - id: Optional[str] + id: Optional[str] = None label: str - description: Optional[str] + description: Optional[str] = None class Parameter(BaseModel): @@ -170,17 +170,17 @@ class Parameter(BaseModel): https://docs.ogc.org/is/19-086r6/19-086r6.html#_da400aef-f6ee-4d08-b36c-2f535d581d53 """ - id: Optional[str] - type_: str = Field(..., serialization_alias="type") - label: Optional[str] - description: Optional[str] + id: Optional[str] = None + type_: Literal["Parameter"] = Field("Parameter", serialization_alias="type") + label: Optional[str] = None + description: Optional[str] = None data_type: Optional[str] = Field(None, serialization_alias="data-type") - unit: Optional[UnitMetadata] + unit: Optional[UnitMetadata] = None observed_property: ObservedProperty = Field( ..., serialization_alias="observedProperty", ) - extent: Optional[Extent] + extent: Optional[Extent] = None measurement_type: Optional[MeasurementType] = Field( None, serialization_alias="measurementType", @@ -198,167 +198,181 @@ class Collection(BaseModel): title: str description: str keywords: list[str] - extent: dict - data_queries: dict + extent: Extent + data_queries: DataQueries crs: list[str] output_formats: list[str] - parameter_names: dict + parameter_names: dict[str, Parameter] -def crs_description(crs: pyproj.CRS) -> dict: +def crs_details(crs: pyproj.CRS) -> CRSDetails: """ Return CF version of EDR CRS metadata """ - return { - "crs": crs.to_string(), - "wkt": crs.to_wkt(), - } + return CRSDetails(crs=crs.to_string(), wkt=crs.to_wkt()) + + +def unit(unit: str) -> UnitMetadata: + """ + Return CF version of EDR Unit metadata + """ + return UnitMetadata( + label=unit, + symbol=SymbolMetadata( + value=unit, + type="unit", + ), + ) -def variable_description(da: xr.DataArray) -> dict: +def parameter(da: xr.DataArray) -> Parameter: """ Return CF version of EDR Parameter metadata for a given xarray variable """ name = da.attrs.get("name", None) standard_name = da.attrs.get("standard_name", name if name else "") - label = standard_name if not name else name - long_name = da.attrs.get("long_name", "") - units = da.attrs.get("units", "") - return { - "type": "Parameter", - "description": long_name, - "unit": { - "label": units, - }, - "observedProperty": { - "label": label, - "standard_name": standard_name, - "long_name": long_name, - }, - } - - -def extract_parameters(ds: xr.Dataset) -> dict: - """ - Extract the parameters from the dataset into collection metadata specific format - """ - return { - k: variable_description(v) - for k, v in ds.variables.items() - if "axis" not in v.attrs - } + observed_property = ObservedProperty( + label=standard_name, + description=da.attrs.get("long_name", ""), + ) + return Parameter( + label=standard_name, + type_="Parameter", + description=da.attrs.get("long_name", ""), + data_type=da.dtype.name, + unit=unit(da.attrs.get("units", "")), + observed_property=observed_property, + ) -def spatial_extent_description(ds: xr.Dataset, crs: pyproj.CRS) -> dict: - """ - Extract the spatial extent from the dataset into collection metadata specific format - """ - # We will use the dataset's CRS as the default CRS, but use 4326 for the extents - # since it is always available +def spatial_extent(ds: xr.Dataset, crs: pyproj.CRS) -> SpatialExtent: + """Extract the spatial extent from the dataset into collection metadata specific format""" bounds = spatial_bounds(ds) - return { - "bbox": [bounds], - "crs": crs.to_string(), - } + return SpatialExtent( + bbox=[bounds], + crs=crs.to_string(), + ) -def temporal_extent_description(ds: xr.Dataset) -> dict: - """ - Extract the temporal extent from the dataset into collection metadata specific format - """ - time_min = ds["T"].min().dt.strftime("%Y-%m-%dT%H:%M:%S").values - time_max = ds["T"].max().dt.strftime("%Y-%m-%dT%H:%M:%S").values - return { - "interval": [ - str(time_min), - str(time_max), - ], - "values": [ - f"{time_min}/{time_max}", - ], - # TODO: parse `ds.cf["time"].dt.calendar` - "trs": 'TIMECRS["DateTime",TDATUM["Gregorian Calendar"],CS[TemporalDateTime,1],AXIS["Time (T)",unspecified]]', # noqa - } - - -def vertical_extent_description(ds: xr.Dataset) -> dict: - """ - Extract the vertical extent from the dataset into collection metadata specific format - """ - elevations = ds.cf["Z"].values - units = ds.cf["Z"].attrs.get("units", "unknown") - positive = ds.cf["Z"].attrs.get("positive", "up") +def temporal_extent(ds: xr.Dataset) -> Optional[TemporalExtent]: + """Extract the temporal extent from the dataset into collection metadata specific format""" + if "T" not in ds.cf: + return None + + t = ds.cf["T"] + time_min = t.min().dt.strftime("%Y-%m-%dT%H:%M:%S").values + time_max = t.max().dt.strftime("%Y-%m-%dT%H:%M:%S").values + return TemporalExtent( + interval=[str(time_min), str(time_max)], + values=[f"{time_min}/{time_max}"], + trs='TIMECRS["DateTime",TDATUM["Gregorian Calendar"],CS[TemporalDateTime,1],AXIS["Time (T)",unspecified]]', # noqa + ) + + +def vertical_extent(ds: xr.Dataset) -> Optional[VerticalExtent]: + """Extract the vertical extent from the dataset into collection metadata specific format""" + if "Z" not in ds.cf: + return None + + z = ds.cf["Z"] + elevations = z.values + units = z.attrs.get("units", "unknown") + positive = z.attrs.get("positive", "up") min_z = elevations.min() max_z = elevations.max() elevation_values = ",".join([str(e) for e in elevations]) - return { - "interval": [ - min_z, - max_z, - ], - "values": elevation_values, - "vrs": f"VERTCRS[VERT_CS['unknown'],AXIS['Z',{positive}],UNIT[{units},1]]", # noqa - "positive": positive, - "units": units, - } + return VerticalExtent( + interval=[min_z, max_z], + values=elevation_values, + vrs=f"VERTCRS[VERT_CS['unknown'],AXIS['Z',{positive}],UNIT[{units},1]]", # noqa + ) + + +def extent(ds: xr.Dataset, crs: pyproj.CRS) -> Extent: + """ + Extract the extent from the dataset into collection metadata specific format + """ + spatial = spatial_extent(ds, crs) + temporal = temporal_extent(ds) + vertical = vertical_extent(ds) + + return Extent( + spatial=spatial, + temporal=temporal, + vertical=vertical, + ) + + +def extract_parameters(ds: xr.Dataset) -> dict[str, Parameter]: + """ + Extract the parameters from the dataset into collection metadata specific format + """ + return {k: parameter(v) for k, v in ds.variables.items() if "axis" not in v.attrs} def position_query_description( output_formats: list[str], - crs_details: list[dict], -) -> dict: + crs_details: list[CRSDetails], +) -> EDRQueryMetadata: """ Return CF version of EDR Position Query metadata """ - return { - "href": "/edr/position", - "hreflang": "en", - "rel": "data", - "templated": True, - "variables": { - "title": "Position query", - "description": "Returns position data based on WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa - "query_type": "position", - "coords": { - "type": "string", - "description": "WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa - "required": True, - }, - "output_format": output_formats, - "default_output_format": "cf_covjson", - "crs_details": crs_details, - }, - } - - -def area_query_description(output_formats: list[str], crs_details: list[dict]) -> dict: + return EDRQueryMetadata( + link=Link( + href="/edr/position?coords={coords}", + hreflang="en", + rel="data", + templated=True, + variables=VariablesMetadata( + title="Position query", + description="Returns position data based on WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa + query_type="position", + coords={ + "type": "string", + "description": "WKT `POINT(lon lat)` or `MULTIPOINT(lon lat, ...)` coordinates", # noqa + "required": True, + }, + output_formats=output_formats, + default_output_format="cf_covjson", + crs_details=crs_details, + ), + ), + ) + + +def area_query_description( + output_formats: list[str], + crs_details: list[CRSDetails], +) -> EDRQueryMetadata: """ Return CF version of EDR Area Query metadata """ - return { - "href": "/edr/area?coords={coords}", - "hreflang": "en", - "rel": "data", - "templated": True, - "variables": { - "title": "Area query", - "description": "Returns data in a polygon based on WKT `POLYGON(lon lat, ...)` coordinates", # noqa - "query_type": "position", - "coords": { - "type": "string", - "description": "WKT `POLYGON(lon lat, ...)` coordinates", - "required": True, - }, - "output_format": output_formats, - "default_output_format": "cf_covjson", - "crs_details": crs_details, - }, - } - - -def collection_metadata(ds: xr.Dataset, output_formats: list[str]) -> dict: + return EDRQueryMetadata( + link=Link( + href="/edr/area?coords={coords}", + hreflang="en", + rel="data", + templated=True, + variables=VariablesMetadata( + title="Area query", + description="Returns data in a polygon based on WKT `POLYGON(lon lat, ...)` coordinates", # noqa + query_type="position", + coords={ + "type": "string", + "description": "WKT `POLYGON(lon lat, ...)` coordinates", + "required": True, + }, + output_formats=output_formats, + default_output_format="cf_covjson", + crs_details=crs_details, + ), + ), + ) + + +def collection_metadata(ds: xr.Dataset, output_formats: list[str]) -> Collection: """ Returns the collection metadata for the dataset There is no nested hierarchy in our router right now, so instead we return the metadata @@ -371,45 +385,36 @@ def collection_metadata(ds: xr.Dataset, output_formats: list[str]) -> dict: crs = dataset_crs(ds) - ds_cf = ds.cf - # We will use the dataset's CRS as the default CRS, but use 4326 for the extents # since it is always available projected_ds = project_dataset(ds, DEFAULT_CRS) - extents: dict = { - "spatial": spatial_extent_description(projected_ds, DEFAULT_CRS), - } - - if "T" in ds_cf: - extents["temporal"] = temporal_extent_description(ds_cf) - - if "Z" in ds_cf: - extents["vertical"] = vertical_extent_description(ds_cf) + extents = extent(projected_ds, crs) parameters = extract_parameters(ds) - crs_details = [ - crs_description(crs), + supported_crs = [ + crs_details(crs), ] # 4326 is always available if crs != DEFAULT_CRS: - crs_details.append( - crs_description(DEFAULT_CRS), + supported_crs.append( + crs_details(DEFAULT_CRS), ) - return { - "id": id, - "title": title, - "description": description, - "links": [], - "extent": extents, - "data_queries": { - "position": position_query_description(output_formats, crs_details), - "area": area_query_description(output_formats, crs_details), - }, - "crs": [crs.to_string()], - "output_formats": output_formats, - "parameter_names": parameters, - } + return Collection( + links=[], + id=id, + title=title, + description=description, + keywords=[], + extent=extents, + data_queries=DataQueries( + position=position_query_description(output_formats, supported_crs), + area=area_query_description(output_formats, supported_crs), + ), + crs=[crs.to_string()], + output_formats=output_formats, + parameter_names=parameters, + ) diff --git a/xpublish_edr/plugin.py b/xpublish_edr/plugin.py index 3eabf9f..ba20d79 100644 --- a/xpublish_edr/plugin.py +++ b/xpublish_edr/plugin.py @@ -91,7 +91,9 @@ def get_collection_metadata(dataset: xr.Dataset = Depends(deps.dataset)): https://docs.ogc.org/is/19-086r6/19-086r6.html#_162817c2-ccd7-43c9-b1ea-ad3aea1b4d6b """ available_output_formats = list(output_formats().keys()) - return collection_metadata(dataset, available_output_formats) + return collection_metadata(dataset, available_output_formats).dict( + exclude_none=True, + ) @router.get("/position", summary="Position query") def get_position(