diff --git a/datasets/io-land-cover/dataset.yaml b/datasets/io-land-cover/dataset.yaml
index b7f59ca3..1bdde809 100644
--- a/datasets/io-land-cover/dataset.yaml
+++ b/datasets/io-land-cover/dataset.yaml
@@ -2,7 +2,7 @@ id: io_lulc
image: ${{ args.registry }}/pctasks-task-base:latest
args:
-- registry
+ - registry
code:
src: ${{ local.path(./io_lulc.py) }}
diff --git a/datasets/ms-buildings/Dockerfile b/datasets/ms-buildings/Dockerfile
new file mode 100644
index 00000000..1cb83935
--- /dev/null
+++ b/datasets/ms-buildings/Dockerfile
@@ -0,0 +1,74 @@
+FROM ubuntu:20.04
+
+# Setup timezone info
+ENV TZ=UTC
+
+ENV LC_ALL=C.UTF-8
+ENV LANG=C.UTF-8
+
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+
+RUN apt-get update && apt-get install -y software-properties-common
+
+RUN add-apt-repository ppa:ubuntugis/ppa && \
+    apt-get update && \
+    apt-get install -y build-essential python3-dev python3-pip \
+    jq unzip ca-certificates wget curl git && \
+    apt-get autoremove -y && apt-get autoclean && apt-get clean
+
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
+
+# See https://github.com/mapbox/rasterio/issues/1289
+ENV CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
+
+# Install Mambaforge (conda/mamba) into /opt/conda; Python 3.11 is installed with mamba below
+RUN curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" \
+    && bash "Mambaforge-$(uname)-$(uname -m).sh" -b -p /opt/conda \
+    && rm -rf "Mambaforge-$(uname)-$(uname -m).sh"
+
+ENV PATH=/opt/conda/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/conda/lib/:$LD_LIBRARY_PATH
+
+RUN mamba install -y -c conda-forge python=3.11 gdal pip setuptools cython numpy
+
+RUN python -m pip install --upgrade pip
+
+# Install common packages
+COPY requirements-task-base.txt /tmp/requirements.txt
+RUN python -m pip install --no-build-isolation -r /tmp/requirements.txt
+
+# Copy and install the pctasks packages, one package per layer. The COPY
+# sources are relative to the build context, so the context has to contain
+# the pctasks/ and datasets/ directories.
+
+COPY pctasks/core /opt/src/pctasks/core
+RUN cd /opt/src/pctasks/core && \
+    pip install .
+
+COPY pctasks/cli /opt/src/pctasks/cli
+RUN cd /opt/src/pctasks/cli && \
+    pip install .
+
+COPY pctasks/task /opt/src/pctasks/task
+RUN cd /opt/src/pctasks/task && \
+    pip install .
+
+COPY pctasks/client /opt/src/pctasks/client
+RUN cd /opt/src/pctasks/client && \
+    pip install .
+
+COPY pctasks/ingest /opt/src/pctasks/ingest
+RUN cd /opt/src/pctasks/ingest && \
+    pip install .
+
+COPY pctasks/dataset /opt/src/pctasks/dataset
+RUN cd /opt/src/pctasks/dataset && \
+    pip install .
+
+COPY ./datasets/ms-buildings/requirements.txt /opt/src/datasets/ms-buildings/requirements.txt
+RUN python3 -m pip install -r /opt/src/datasets/ms-buildings/requirements.txt
+
+# Setup Python Path to allow import of test modules
+ENV PYTHONPATH=/opt/src:$PYTHONPATH
+
+WORKDIR /opt/src
diff --git a/datasets/sentinel-5p/Dockerfile b/datasets/sentinel-5p/Dockerfile
new file mode 100644
index 00000000..828da64b
--- /dev/null
+++ b/datasets/sentinel-5p/Dockerfile
@@ -0,0 +1,74 @@
+FROM ubuntu:20.04
+
+# Setup timezone info
+ENV TZ=UTC
+
+ENV LC_ALL=C.UTF-8
+ENV LANG=C.UTF-8
+
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+
+RUN apt-get update && apt-get install -y software-properties-common
+
+RUN add-apt-repository ppa:ubuntugis/ppa && \
+    apt-get update && \
+    apt-get install -y build-essential python3-dev python3-pip \
+    jq unzip ca-certificates wget curl git && \
+    apt-get autoremove -y && apt-get autoclean && apt-get clean
+
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
+
+# See https://github.com/mapbox/rasterio/issues/1289
+ENV CURL_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
+
+# Install Mambaforge (conda/mamba) into /opt/conda; Python 3.8 is installed with mamba below
+RUN curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" \
+    && bash "Mambaforge-$(uname)-$(uname -m).sh" -b -p /opt/conda \
+    && rm -rf "Mambaforge-$(uname)-$(uname -m).sh"
+
+ENV PATH=/opt/conda/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/conda/lib/:$LD_LIBRARY_PATH
+
+RUN mamba install -y -c conda-forge python=3.8 gdal=3.3.3 pip setuptools cython numpy==1.21.5
+
+RUN python -m pip install --upgrade pip
+
+# Install common packages
+COPY requirements-task-base.txt /tmp/requirements.txt
+RUN python -m pip install --no-build-isolation -r /tmp/requirements.txt
+
+# Copy and install the pctasks packages, one package per layer. The COPY
+# sources are relative to the build context, so the context has to contain
+# the pctasks/ and datasets/ directories.
+
+COPY pctasks/core /opt/src/pctasks/core
+RUN cd /opt/src/pctasks/core && \
+    pip install .
+
+COPY pctasks/cli /opt/src/pctasks/cli
+RUN cd /opt/src/pctasks/cli && \
+    pip install .
+
+COPY pctasks/task /opt/src/pctasks/task
+RUN cd /opt/src/pctasks/task && \
+    pip install .
+
+COPY pctasks/client /opt/src/pctasks/client
+RUN cd /opt/src/pctasks/client && \
+    pip install .
+
+COPY pctasks/ingest /opt/src/pctasks/ingest
+RUN cd /opt/src/pctasks/ingest && \
+    pip install .
+
+COPY pctasks/dataset /opt/src/pctasks/dataset
+RUN cd /opt/src/pctasks/dataset && \
+    pip install .
+
+COPY ./datasets/sentinel-5p/requirements.txt /opt/src/datasets/sentinel-5p/requirements.txt
+RUN python3 -m pip install -r /opt/src/datasets/sentinel-5p/requirements.txt
+
+# Setup Python Path to allow import of test modules
+ENV PYTHONPATH=/opt/src:$PYTHONPATH
+
+WORKDIR /opt/src
diff --git a/datasets/sentinel-5p/README.md b/datasets/sentinel-5p/README.md
index 1bfda52a..1f2b162a 100644
--- a/datasets/sentinel-5p/README.md
+++ b/datasets/sentinel-5p/README.md
@@ -1 +1,11 @@
-# Work in progress - in a broken state
\ No newline at end of file
+# planetary-computer-tasks dataset: sentinel-5p
+
+Sentinel 5 Precursor
+
+## Building the Docker image
+
+To build and push a custom docker image to our container registry:
+
+```shell
+az acr build -r {the registry} --subscription {the subscription} -t pctasks-sentinel-5p:latest -f datasets/sentinel-5p/Dockerfile .
+```
diff --git a/datasets/sentinel-5p/collection/description.md b/datasets/sentinel-5p/collection/description.md
new file mode 100644
index 00000000..fa391880
--- /dev/null
+++ b/datasets/sentinel-5p/collection/description.md
@@ -0,0 +1,19 @@
+The Copernicus [Sentinel-5 Precursor](https://sentinels.copernicus.eu/web/sentinel/missions/sentinel-5p) mission provides high spatio-temporal resolution measurements of the Earth's atmosphere. The mission consists of one satellite carrying the [TROPOspheric Monitoring Instrument](http://www.tropomi.eu/) (TROPOMI). The satellite flies in loose formation with NASA's [Suomi NPP](https://www.nasa.gov/mission_pages/NPP/main/index.html) spacecraft, allowing utilization of co-located cloud mask data provided by the [Visible Infrared Imaging Radiometer Suite](https://www.nesdis.noaa.gov/current-satellite-missions/currently-flying/joint-polar-satellite-system/visible-infrared-imaging) (VIIRS) instrument onboard Suomi NPP during processing of the TROPOMI methane product.
+
+The Sentinel-5 Precursor mission aims to reduce the global atmospheric data gap between the retired [ENVISAT](https://earth.esa.int/eogateway/missions/envisat) and [AURA](https://www.nasa.gov/mission_pages/aura/main/index.html) missions and the future [Sentinel-5](https://sentinels.copernicus.eu/web/sentinel/missions/sentinel-5) mission. Sentinel-5 Precursor [Level 2 data](http://www.tropomi.eu/data-products/level-2-products) provide total columns of ozone, sulfur dioxide, nitrogen dioxide, carbon monoxide and formaldehyde, tropospheric columns of ozone, vertical profiles of ozone and cloud & aerosol information. These measurements are used for improving air quality forecasts and monitoring the concentrations of atmospheric constituents.
+
+This STAC Collection provides Sentinel-5 Precursor Level 2 data, in NetCDF format, since April 2018 for the following products:
+
+* [`L2__AER_AI`](http://www.tropomi.eu/data-products/uv-aerosol-index): Ultraviolet aerosol index
+* [`L2__AER_LH`](http://www.tropomi.eu/data-products/aerosol-layer-height): Aerosol layer height
+* [`L2__CH4___`](http://www.tropomi.eu/data-products/methane): Methane (CH4) total column
+* [`L2__CLOUD_`](http://www.tropomi.eu/data-products/cloud): Cloud fraction, albedo, and top pressure
+* [`L2__CO____`](http://www.tropomi.eu/data-products/carbon-monoxide): Carbon monoxide (CO) total column
+* [`L2__HCHO__`](http://www.tropomi.eu/data-products/formaldehyde): Formaldehyde (HCHO) total column
+* [`L2__NO2___`](http://www.tropomi.eu/data-products/nitrogen-dioxide): Nitrogen dioxide (NO2) total column
+* [`L2__O3____`](http://www.tropomi.eu/data-products/total-ozone-column): Ozone (O3) total column
+* [`L2__O3_TCL`](http://www.tropomi.eu/data-products/tropospheric-ozone-column): Ozone (O3) tropospheric column
+* [`L2__SO2___`](http://www.tropomi.eu/data-products/sulphur-dioxide): Sulfur dioxide (SO2) total column
+* [`L2__NP_BD3`](http://www.tropomi.eu/data-products/auxiliary): Cloud from the Suomi NPP mission, band 3
+* [`L2__NP_BD6`](http://www.tropomi.eu/data-products/auxiliary): Cloud from the Suomi NPP mission, band 6
+* [`L2__NP_BD7`](http://www.tropomi.eu/data-products/auxiliary): Cloud from the Suomi NPP mission, band 7
diff --git a/datasets/sentinel-5p/collection/template.json b/datasets/sentinel-5p/collection/template.json
new file mode 100755
index 00000000..db13cd24
--- /dev/null
+++ b/datasets/sentinel-5p/collection/template.json
@@ -0,0 +1,247 @@
+{
+ "stac_version": "1.0.0",
+ "type": "Collection",
+ "id": "sentinel-5p-l2-netcdf",
+ "title": "Sentinel-5P Level-2",
+ "description": "{{ collection.description }}",
+ "license": "proprietary",
+ "links": [
+ {
+ "rel": "license",
+ "href": "https://sentinel.esa.int/documents/247904/690755/Sentinel_Data_Legal_Notice",
+ "type": "application/pdf",
+ "title": "Sentinel Data License"
+ },
+ {
+ "rel": "about",
+ "href": "https://sentinel.esa.int/web/sentinel/missions/sentinel-5p",
+ "type": "text/html",
+ "title": "Sentinel-5 Precursor Mission"
+ }
+ ],
+ "stac_extensions": [
+ "https://stac-extensions.github.io/sat/v1.0.0/schema.json",
+ "https://stac-extensions.github.io/table/v1.2.0/schema.json",
+ "https://stac-extensions.github.io/item-assets/v1.0.0/schema.json"
+ ],
+ "keywords": [
+ "ESA",
+ "Copernicus",
+ "Sentinel",
+ "Air Quality",
+ "Climate Change",
+ "Forecasting"
+ ],
+ "msft:short_description": "Sentinel-5P Level 2 atmospheric monitoring products in NetCDF format",
+ "msft:storage_account": "sentinel5euwest",
+ "msft:container": "sentinel-5p",
+ "msft:region": "westeurope",
+ "providers": [
+ {
+ "name": "ESA",
+ "roles": [
+ "producer",
+ "processor",
+ "licensor"
+ ],
+ "url": "https://earth.esa.int/web/guest/home"
+ },
+ {
+ "name": "Microsoft",
+ "roles": [
+ "host"
+ ],
+ "url": "https://planetarycomputer.microsoft.com"
+ }
+ ],
+ "assets": {
+ "thumbnail": {
+ "title": "Sentinel-5P Level-2 NetCDF Thumbnail",
+ "href": "https://ai4edatasetspublicassets.blob.core.windows.net/assets/pc_thumbnails/sentinel-5p-l2-netcdf-thumb.png",
+ "media_type": "image/png"
+ },
+ "geoparquet-items": {
+ "href": "abfs://items/sentinel-5p-l2-netcdf.parquet",
+ "title": "GeoParquet STAC Items",
+ "description": "Snapshot of the collection's STAC items exported to GeoParquet format.",
+ "type": "application/x-parquet",
+ "roles": [
+ "stac-items"
+ ],
+ "table:storage_options": {
+ "account_name": "pcstacitems"
+ },
+ "msft:partition_info": {
+ "is_partitioned": true,
+ "partition_frequency": "MS"
+ }
+ }
+ },
+ "summaries": {
+ "constellation": [
+ "Sentinel-5P"
+ ],
+ "platform": [
+ "Sentinel 5 Precursor"
+ ],
+ "instruments": [
+ "TROPOMI"
+ ],
+ "sat:platform_international_designator": [
+ "2017-064A"
+ ],
+ "s5p:collection_identifier": [
+ "01",
+ "02",
+ "03"
+ ],
+ "s5p:processing_mode": [
+ "NRTI",
+ "OFFL",
+ "RPRO"
+ ],
+ "s5p:product_type": [
+ "L2__AER_AI",
+ "L2__AER_LH",
+ "L2__CH4___",
+ "L2__CLOUD_",
+ "L2__CO____",
+ "L2__HCHO__",
+ "L2__NO2___",
+ "L2__NP_BD3",
+ "L2__NP_BD6",
+ "L2__NP_BD7",
+ "L2__O3_TCL",
+ "L2__O3____",
+ "L2__SO2___"
+ ],
+ "s5p:product_name": [
+ "aer-ai",
+ "aer-lh",
+ "ch4",
+ "cloud",
+ "co",
+ "hcho",
+ "no2",
+ "np-bd3",
+ "np-bd6",
+ "np-bd7",
+ "o3-tcl",
+ "o3",
+ "so2"
+ ]
+ },
+ "extent": {
+ "spatial": {
+ "bbox": [
+ [
+ -180,
+ -90,
+ 180,
+ 90
+ ]
+ ]
+ },
+ "temporal": {
+ "interval": [
+ [
+ "2018-04-30T00:18:50Z",
+ null
+ ]
+ ]
+ }
+ },
+ "item_assets": {
+ "aer-ai": {
+ "title": "Ultraviolet Aerosol Index",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "aer-lh": {
+ "title": "Aerosol Layer Height",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "ch4": {
+ "title": "Methane Total Column",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "cloud": {
+ "title": "Cloud Fraction, Albedo, and Top Pressure",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "co": {
+ "title": "Carbon Monoxide Total Column",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "hcho": {
+ "title": "Formaldehyde Total Column",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "no2": {
+ "title": "Nitrogen Dioxide Total Column",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "o3": {
+ "title": "Ozone Total Column",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "o3-tcl": {
+ "title": "Ozone Tropospheric Column",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "so2": {
+ "title": "Sulphur Dioxide Total Column",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "np-bd3": {
+ "title": "VIIRS/NPP Band 3 Cloud Mask",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "np-bd6": {
+ "title": "VIIRS/NPP Band 6 Cloud Mask",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ },
+ "np-bd7": {
+ "title": "VIIRS/NPP Band 7 Cloud Mask",
+ "type": "application/x-netcdf",
+ "roles": [
+ "data"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/datasets/sentinel-5p/dataset.yaml b/datasets/sentinel-5p/dataset.yaml
index 8e9e9cd6..a68c7bb6 100644
--- a/datasets/sentinel-5p/dataset.yaml
+++ b/datasets/sentinel-5p/dataset.yaml
@@ -1,12 +1,42 @@
-id: sentinel-1-grd
+id: sentinel_5p
+image: ${{ args.registry }}/pctasks-sentinel-5p:latest
+
+args:
+ - registry
+
+code:
+ src: ${{ local.path(./sentinel_5p.py) }}
+
+environment:
+ AZURE_TENANT_ID: ${{ secrets.task-tenant-id }}
+ AZURE_CLIENT_ID: ${{ secrets.task-client-id }}
+ AZURE_CLIENT_SECRET: ${{ secrets.task-client-secret }}
+
collections:
- id: sentinel-1-grd
- containers:
- assets: sentinel1euwest/s1-grd
- stac: sentinel1euwest/s1-grd-stac
- etl-data: sentinel1euwest/s1-grd-etl-info
-triggers:
- storage:
- account: sentinel1euwest
- container: s1-grd-stac
- filter: "*.json"
+ - id: sentinel-5p-l2-netcdf
+ template: ${{ local.path(./collection) }}
+ class: sentinel_5p:Sentinel5pNetCDFCollection
+ asset_storage:
+ # The blob storage pattern is
+ #
+      # | sentinel-5p-stac/
+ # | TROPOMI/
+ # | L2_AER_AI/
+ # | 2018/
+ # | 06/
+ # | 28/
+ # | ...
+ # | L2_AER_LH
+ # | ...
+ # |
+ # We want to split by product (L2_AER_AI)
+ - uri: blob://sentinel5euwest/sentinel-5p-stac/TROPOMI/
+ token: ${{ pc.get_token(sentinel5euwest, sentinel-5p-stac) }}
+ chunks:
+ options:
+ extensions: [.json]
+ chunk_length: 5000
+ splits:
+ - depth: 1
+ chunk_storage:
+ uri: blob://sentinel5euwest/sentinel-5p-etl-data/pctasks-chunks/
diff --git a/datasets/sentinel-5p/requirements.txt b/datasets/sentinel-5p/requirements.txt
new file mode 100644
index 00000000..f546863c
--- /dev/null
+++ b/datasets/sentinel-5p/requirements.txt
@@ -0,0 +1 @@
+antimeridian==0.2.2
\ No newline at end of file
diff --git a/datasets/sentinel-5p/s5/__init__.py b/datasets/sentinel-5p/s5/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/datasets/sentinel-5p/sentinel-5p-l2.json b/datasets/sentinel-5p/sentinel-5p-l2.json
deleted file mode 100755
index 98dd3c13..00000000
--- a/datasets/sentinel-5p/sentinel-5p-l2.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
- "type": "Collection",
- "id": "sentinel-5p-l2",
- "stac_version": "1.0.0",
- "description": "The Copernicus [Sentinel-5 Precursor mission](https://sentinels.copernicus.eu/web/sentinel/missions/sentinel-5p) is dedicated to monitoring our atmosphere and consists of one satellite carrying the TROPOspheric Monitoring Instrument (TROPOMI). Sentinel-5 Precursor mission aims to fill in the global atmospheric data gap between the retired ENVISAT and AURA missions and the future Sentinel-5 mission. The main objective of TROPOMI is to provide daily global observations of key atmospheric constituents related to air quality, ozone layer and climate change monitoring and forecasting. Level 2 data provide total columns of ozone, sulfur dioxide, nitrogen dioxide, carbon monoxide and formaldehyde, tropospheric columns of ozone, vertical profiles of ozone and cloud & aerosol information. Level 2 data are available since April 2018. These measurements are used for improving air quality forecasts as well as for monitoring the concentrations of atmospheric constituents.",
- "links": [
- {
- "rel": "license",
- "href": "https://sentinel.esa.int/documents/247904/690755/Sentinel_Data_Legal_Notice"
- },
- {
- "rel": "root",
- "href": "./sentinel-5p-l2.json",
- "type": "application/json"
- },
- {
- "rel": "self",
- "href": "https://sentinel5euwest.blob.core.windows.net/sentinel-5p-stac/sentinel-5p-l2.json",
- "type": "application/json"
- }
- ],
- "stac_extensions": [
- "https://stac-extensions.github.io/sat/v1.0.0/schema.json"
- ],
- "extent": {
- "spatial": {
- "bbox": [
- [
- -180,
- -85,
- 180,
- 85
- ]
- ]
- },
- "temporal": {
- "interval": [
- [
- "2018-04-30T00:18:50Z",
- null
- ]
- ]
- }
- },
- "license": "proprietary",
- "keywords": [
- "eu",
- "esa",
- "copernicus",
- "sentinel",
- "systematic",
- "aerosols",
- "air quality",
- "climate change",
- "ozone",
- "forecasting",
- "N02"
- ],
- "providers": [
- {
- "name": "ESA",
- "roles": [
- "producer",
- "processor",
- "licensor"
- ],
- "url": "https://earth.esa.int/web/guest/home"
- },
- {
- "name": "Microsoft",
- "roles": [
- "host",
- "processor"
- ],
- "url": "https://planetarycomputer.microsoft.com"
- }
- ],
- "msft:container": "sentinel-5p",
- "msft:region": "westeurope",
- "msft:storage_account": "sentinel5euwest",
- "msft:short_description": "The Copernicus [Sentinel-5 Precursor mission](https://sentinels.copernicus.eu/web/sentinel/missions/sentinel-5p) is dedicated to monitoring our atmosphere and consists of one satellite carrying the TROPOspheric Monitoring Instrument (TROPOMI). Level 2 data provide total columns of ozone, sulfur dioxide, nitrogen dioxide, carbon monoxide and formaldehyde, tropospheric columns of ozone, vertical profiles of ozone and cloud & aerosol information.",
- "summaries": {
- "constellation": [
- "Sentinel-5P"
- ],
- "platform" : [
- "Sentinel 5 Precursor"
- ],
- "instruments" : [
- "TROPOMI"
- ],
- "sat:platform_international_designator": [
- "2017-064A"
- ],
- "sat:absolute_orbit" : {
- "minimum": 1,
- "maximum": 2147483647
- },
- "s5p:processing_mode": [
- "NRTI",
- "OFFL",
- "RPRO"
- ],
- "s5p:product_type": [
- "L2__AER_AI",
- "L2__AER_LH",
- "L2__CH4___",
- "L2__CLOUD_",
- "L2__CO____",
- "L2__HCHO__",
- "L2__NO2___",
- "L2__NP_BD3",
- "L2__NP_BD6",
- "L2__NP_BD7",
- "L2__O3____",
- "L2__O3_TCL",
- "L2__SO2___"
- ],
- "s5p:spatial_resolution": [
- "5.5x3.5 km2",
- "5.5x7 km2",
- "7x3.5 km2",
- "7x7 km2"
- ],
- "s5p:shape": {
- "dimensions": "scanline x ground_pixel",
- "scanline": "The dimension that indicates the flight direction",
- "ground_pixel": "The dimension perpendicular to the flight direction"
- }
- },
- "title": "Sentinel-5P Level-2"
-}
diff --git a/datasets/sentinel-5p/sentinel_5p.py b/datasets/sentinel-5p/sentinel_5p.py
new file mode 100644
index 00000000..5355f489
--- /dev/null
+++ b/datasets/sentinel-5p/sentinel_5p.py
@@ -0,0 +1,198 @@
+import logging
+import re
+from pathlib import Path
+from typing import Any, List, Union
+
+import antimeridian
+import pystac
+import shapely.geometry
+from shapely.geometry import Polygon
+
+from pctasks.core.models.task import WaitTaskResult
+from pctasks.core.storage import StorageFactory
+from pctasks.dataset.collection import Collection
+
+FILENAME_EXPR = re.compile(
+ r"S5P_(?P[A-Z]{4})_L(?P[0-9]{1})_(?P.{7})_"
+ r"(?P[0-9,A-Z]{15})_(?P[0-9,A-Z]{15})_"
+ r"(?P[0-9]{5})_(?P[0-9]{2})_(?P[0-9]{6})_"
+ r"(?P[0-9,A-Z]{15})"
+)
+
+ABOUT_LINKS = {
+ "L2__AER_AI": "http://www.tropomi.eu/data-products/uv-aerosol-index",
+ "L2__AER_LH": "http://www.tropomi.eu/data-products/aerosol-layer-height",
+ "L2__CH4___": "http://www.tropomi.eu/data-products/methane",
+ "L2__CLOUD_": "http://www.tropomi.eu/data-products/cloud",
+ "L2__CO____": "http://www.tropomi.eu/data-products/carbon-monoxide",
+ "L2__HCHO__": "http://www.tropomi.eu/data-products/formaldehyde",
+ "L2__NO2___": "http://www.tropomi.eu/data-products/nitrogen-dioxide",
+ "L2__O3____": "http://www.tropomi.eu/data-products/total-ozone-column",
+ "L2__O3_TCL": "http://www.tropomi.eu/data-products/tropospheric-ozone-column",
+ "L2__SO2___": "http://www.tropomi.eu/data-products/sulphur-dioxide",
+ "L2__NP_BD3": "https://sentinel.esa.int/web/sentinel/technical-guides/sentinel-5p/products-algorithms", # noqa
+ "L2__NP_BD6": "https://sentinel.esa.int/web/sentinel/technical-guides/sentinel-5p/products-algorithms", # noqa
+ "L2__NP_BD7": "https://sentinel.esa.int/web/sentinel/technical-guides/sentinel-5p/products-algorithms", # noqa
+}
+
+ASSET_TITLES = {
+ "L2__AER_AI": "Ultraviolet Aerosol Index",
+ "L2__AER_LH": "Aerosol Layer Height",
+ "L2__CH4___": "Methane Total Column",
+ "L2__CLOUD_": "Cloud Fraction, Albedo, and Top Pressure",
+ "L2__CO____": "Carbon Monoxide Total Column",
+ "L2__HCHO__": "Formaldehyde Total Column",
+ "L2__NO2___": "Nitrogen Dioxide Total Column",
+ "L2__O3____": "Ozone Total Column",
+ "L2__O3_TCL": "Ozone Tropospheric Column",
+ "L2__SO2___": "Sulphur Dioxide Total Column",
+ "L2__NP_BD3": "VIIRS/NPP Band 3 Cloud Mask",
+ "L2__NP_BD6": "VIIRS/NPP Band 6 Cloud Mask",
+ "L2__NP_BD7": "VIIRS/NPP Band 7 Cloud Mask",
+}
+
+O3_TCL_GEOMETRY = shapely.geometry.mapping(
+ Polygon([(-180, -19.75), (180, -19.75), (180, 19.75), (-180, 19.75)])
+)
+O3_TCL_BBOX = [-180, -19.75, 180, 19.75]
+
+handler = logging.StreamHandler()
+handler.setFormatter(logging.Formatter("[%(levelname)s]:%(asctime)s: %(message)s"))
+handler.setLevel(logging.INFO)
+logger = logging.getLogger(__name__)
+logger.addHandler(handler)
+logger.setLevel(logging.INFO)
+
+
+def recursive_round(coordinates: List[Any], precision: int) -> List[Any]:
+ """Rounds a list of numbers. The list can contain additional nested lists
+ or tuples of numbers.
+
+ Any tuples encountered will be converted to lists.
+
+ Args:
+ coordinates (List[Any]): A list of numbers, possibly containing nested
+ lists or tuples of numbers.
+ precision (int): Number of decimal places to use for rounding.
+
+ Returns:
+ List[Any]: The list of numbers rounded to the given precision.
+ """
+ rounded: List[Any] = []
+ for value in coordinates:
+ if isinstance(value, (int, float)):
+ rounded.append(round(value, precision))
+ else:
+ rounded.append(recursive_round(list(value), precision))
+ return rounded
+
+
+class Sentinel5pNetCDFCollection(Collection): # type: ignore
+ @classmethod
+ def create_item(
+ cls, asset_uri: str, storage_factory: StorageFactory
+ ) -> Union[List[pystac.Item], WaitTaskResult]:
+
+ storage, json_path = storage_factory.get_storage_for_file(asset_uri)
+ item_dict = storage.read_json(json_path)
+
+ netcdf_filename = item_dict["assets"]["data"]["href"]
+ match = FILENAME_EXPR.match(Path(netcdf_filename).stem)
+ if not match:
+ raise ValueError(f"Could not parse filename {Path(json_path).stem}")
+
+ prefix = match.groupdict()["product"].strip("_").lower()
+ if prefix.startswith("np"):
+ prefix = prefix.replace("_", "")
+
+ collection_identifier = match.groupdict()["collection"]
+ product_type = item_dict["properties"]["s5p:product_type"]
+ product_name = product_type.lstrip("L2_").rstrip("_").lower().replace("_", "-")
+
+ # ---- PROPERTIES ----
+ properties = item_dict.pop("properties")
+
+ # providers should be supplied in the collection, not the item
+ properties.pop("providers", None)
+
+ # combine the product custom properties to a single object
+ product_custom_fields = {}
+ keys = [k for k in properties.keys() if str(k).startswith(prefix)]
+ if keys:
+ for key in keys:
+ product_custom_fields[
+ str(key).replace(f"{prefix}:", "")
+ ] = properties.pop(key)
+ properties[f"s5p:{prefix}"] = product_custom_fields
+
+ # add sentinel-5p collection_identifier
+ properties["s5p:collection_identifier"] = collection_identifier
+
+ # convert spatial resolution to meters, store in list to match sentinel-3
+ # order is [height, width], aka [along track, across track]
+ resolution = properties["s5p:spatial_resolution"]
+ resolution = resolution.strip().strip("km2").strip("km").strip()
+ parts = resolution.split("x")
+ assert len(parts) == 2
+ properties["s5p:spatial_resolution"] = [int(float(p)) * 1000 for p in parts]
+
+ # correct bad datetimes
+ for k, v in properties.items():
+ if k.endswith("datetime") and v.endswith("ZZ"):
+ properties[k] = v[:-2] + "Z"
+ if f"s5p:{prefix}" in properties:
+ for k, v in properties[f"s5p:{prefix}"].items():
+ if k.endswith("datetime") and v.endswith("ZZ"):
+ properties[f"s5p:{prefix}"][k] = v[:-2] + "Z"
+
+ # add a user-friendly product name
+ properties["s5p:product_name"] = product_name
+
+ item_dict["properties"] = properties
+
+ # ---- ASSETS ----
+ asset = item_dict["assets"].pop("data")
+
+ # the supplied asset description is too brief to be a description and
+ # too inconsistent to use as a title; use a custom title instead
+ asset.pop("description")
+ asset["title"] = ASSET_TITLES[product_type]
+
+ item_dict["assets"][product_name] = asset
+
+ # ---- LINKS ----
+ links = item_dict.pop("links")
+
+ # license is the same for all items; include on the collection, not the item
+ for link in links:
+ if link["rel"] == "license":
+ links.remove(link)
+
+ # add a unique link to the product landing page
+ links.append(
+ {
+ "rel": "about",
+ "href": ABOUT_LINKS.get(product_type),
+ "type": "text/html",
+ }
+ )
+
+ item_dict["links"] = links
+
+ # ---- GEOMETRY ----
+ # fix antimeridian, except for o3_tcl, where we do some hardcode hacks instead
+ item = pystac.Item.from_dict(item_dict)
+ if product_name == "o3-tcl":
+ item.geometry = O3_TCL_GEOMETRY
+ item.bbox = O3_TCL_BBOX
+ else:
+ polygon = shapely.geometry.shape(item.geometry)
+ geometry = antimeridian.fix_polygon(polygon)
+ item.bbox = list(geometry.bounds)
+ item.geometry = shapely.geometry.mapping(geometry)
+ item.bbox = recursive_round(item.bbox, precision=6)
+ item.geometry["coordinates"] = recursive_round( # type: ignore
+ list(item.geometry["coordinates"]), precision=6 # type: ignore
+ )
+
+ return [item]
diff --git a/datasets/sentinel-5p/summarize-wf.yaml b/datasets/sentinel-5p/summarize-wf.yaml
deleted file mode 100644
index 5d28333d..00000000
--- a/datasets/sentinel-5p/summarize-wf.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-name: Summarize Sentinel 5P L2 Items
-
-dataset:
- owner: microsoft
- name: sentinel-5p-l2
-
-args:
-- registry
-
-jobs:
- create-splits:
- tasks:
- - id: list-prefixes
- image: ${{ args.registry }}/pctasks-task-base:latest
- task: pctasks.task.common.list_prefixes:task
- args:
- src_uri: blob://sentinel5euwest/sentinel-5p-stac
- depth: 4
- environment:
- AZURE_TENANT_ID: ${{ secrets.task-tenant-id }}
- AZURE_CLIENT_ID: ${{ secrets.task-client-id }}
- AZURE_CLIENT_SECRET: ${{ secrets.task-client-secret }}
- schema_version: 1.0.0
- summarize-map:
- foreach:
- items: ${{ jobs.create-splits.tasks.list-prefixes.output.uris }}
- tasks:
- - id: list-files
- image: ${{ args.registry }}/pctasks-task-base:latest
- task: pctasks.task.common.list_files:task
- args:
- src_uri: ${{ item }}
- extensions:
- - .json
- environment:
- AZURE_TENANT_ID: ${{ secrets.task-tenant-id }}
- AZURE_CLIENT_ID: ${{ secrets.task-client-id }}
- AZURE_CLIENT_SECRET: ${{ secrets.task-client-secret }}
- schema_version: 1.0.0
- - id: summarize-map
- image: ${{ args.registry }}/pctasks-task-base:latest
- task: pctasks.task.common.summarize:map_task
- args:
- uris: ${{ tasks.list-files.output.uris }}
- include_keys:
- - assets
- - properties
- environment:
- AZURE_TENANT_ID: ${{ secrets.task-tenant-id }}
- AZURE_CLIENT_ID: ${{ secrets.task-client-id }}
- AZURE_CLIENT_SECRET: ${{ secrets.task-client-secret }}
- schema_version: 1.0.0
- summarize-reduce:
- tasks:
- - id: summarize-reduce
- image: ${{ args.registry }}/pctasks-task-base:latest
- task: pctasks.task.common.summarize:reduce_task
- args:
- summaries: ${{ jobs.summarize-map.tasks.summarize-map.output.summary }}
- - id: write-output
- image: ${{ args.registry }}/pctasks-task-base:latest
- task: pctasks.task.common.write:task
- args:
- uri: blob://sentinel5euwest/sentinel-5p-etl-data/summaries/2022-09-08.json
- content: ${{ tasks.summarize-reduce.output.summary }}
- environment:
- AZURE_TENANT_ID: ${{ secrets.task-tenant-id }}
- AZURE_CLIENT_ID: ${{ secrets.task-client-id }}
- AZURE_CLIENT_SECRET: ${{ secrets.task-client-secret }}
- schema_version: 1.0.0
-