Merge pull request #140 from developmentseed/feat/add-cmip6-demo

Feat/add cmip6 demo
developmentseed · Sep 29, 2023 · 92c7e30 · 92c7e30
2 parents 1ad2481 + 8d57410
commit 92c7e30
Show file tree

Hide file tree

Showing 4 changed files with 364 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -106,3 +106,6 @@ cdk.out/
 node_modules/
 
 .pgdata
+
+# Demo files to ignore
+demo/cmip6/CMIP6_daily_*stac_items.ndjson
diff --git a/demo/cmip6/CMIP6_daily_GISS-E2-1-G_tas_collection.json b/demo/cmip6/CMIP6_daily_GISS-E2-1-G_tas_collection.json
@@ -0,0 +1,29 @@
+{
+    "id": "CMIP6_daily_GISS-E2-1-G_tas",
+    "stac_version": "1.0.0",
+    "license": "not-provided",
+    "title": "CMIP6 Daily Near-Surface Air Temperature from GISS-E2-1-G Model",
+    "type": "Collection",
+    "description": "CMIP6 Daily Near-Surface Air Temperature from GISS-E2-1-G Model",
+    "links":[],
+    "extent": {
+        "spatial": {
+            "bbox": [
+                [
+                    -180,
+                    -90,
+                    180,
+                    90
+                ]
+            ]
+        },
+        "temporal": {
+            "interval": [
+                [
+                    "1950-01-01T00:00:00.000Z",
+                    "1951-12-31T00:00:00.000Z"
+                ]
+            ]
+        }
+    }
+}
diff --git a/demo/cmip6/generate_cmip6_items.ipynb b/demo/cmip6/generate_cmip6_items.ipynb
@@ -0,0 +1,311 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "9de99346-8c2a-48bb-b79b-e28a4db099d8",
+   "metadata": {},
+   "source": [
+    "# Generate CMIP STAC Items and Load them into a pgSTAC database\n",
+    "\n",
+    "This notebook walks through generating STAC items from [NEX GDDP CMIP6 COGs on AWS](https://aws.amazon.com/marketplace/pp/prodview-k6adk576fiwmm#resources).\n",
+    "\n",
+    "As-is it uses daily data from the `GISS-E2-1-G` model, the `tas` variable and loads data from 1950 and 1951. The bucket has other data available. It includes monthly aggregates, other models, other variables and more years. The scripts below can easily be modified to STAC-ify other data in the nex-gddp-cmip6-cog bucket."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6f788363",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import boto3\n",
+    "import fsspec\n",
+    "import json\n",
+    "from pystac import Catalog, Collection, Item, Asset, MediaType\n",
+    "from datetime import datetime\n",
+    "import rio_stac\n",
+    "from pprint import pprint\n",
+    "import concurrent.futures\n",
+    "import threading"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "8e4cfbb8",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Specify the CMIP model and variable to use\n",
+    "model = \"GISS-E2-1-G\"\n",
+    "variable = \"tas\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "762e1e50-46e6-4dab-8462-38d31060e202",
+   "metadata": {},
+   "source": [
+    "## Discover the COG files on S3 using fsspec and `.glob`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "cdaccd78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "anon = True\n",
+    "s3_path = f\"s3://nex-gddp-cmip6-cog/daily/{model}/historical/r1i1p1f2/{variable}/\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a7caab29",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fs_read = fsspec.filesystem(\"s3\", anon=anon)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4936f757",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "23725 discovered from s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/\n"
+     ]
+    }
+   ],
+   "source": [
+    "file_paths = fs_read.glob(f\"{s3_path}*\")\n",
+    "print(f\"{len(file_paths)} discovered from {s3_path}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "78f991f3-8cde-44bd-a955-b3e5b9694320",
+   "metadata": {},
+   "source": [
+    "## Subset the data so we don't process all historical data\n",
+    "\n",
+    "But you can if you want!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "999b0670",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Here we prepend the prefix 's3://', which points to AWS.\n",
+    "subset_files = sorted([\"s3://\" + f for f in file_paths if \"_1950_\" in f or \"_1951_\" in f])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3bae56f1-1ea6-4755-84b3-149666e84d3d",
+   "metadata": {},
+   "source": [
+    "## Double check we discovered some files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "6af269dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Subseted data to files for 1950 and 1951. 730 files to process.\n"
+     ]
+    }
+   ],
+   "source": [
+    "if len(subset_files) == 0:\n",
+    "    raise Exception(f\"No files to process. Do COGs for the {model} model exist?\")\n",
+    "else:\n",
+    "    print(f\"Subseted data to files for 1950 and 1951. {len(subset_files)} files to process.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea59aceb-b80a-4166-a684-74de4230ac4a",
+   "metadata": {},
+   "source": [
+    "## Setup the collection and items\n",
+    "\n",
+    "The collection is statically defined in a json file, but can be modified as desired. Then, iterate throug all the files in S3 and create STAC Item JSON using `rio_stac`. Write all the JSON to an `ndjson` file for inserting."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "57dc1b5f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file_prefix = f\"CMIP6_daily_{model}_{variable}\"\n",
+    "stac_items_file = f\"{file_prefix}_stac_items.ndjson\"\n",
+    "collection_json = json.loads(open(f'{file_prefix}_collection.json').read())\n",
+    "collection = Collection.from_dict(collection_json)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "45771e88",
+   "metadata": {
+    "lines_to_next_cell": 1
+   },
+   "outputs": [],
+   "source": [
+    "# clear the ndjson items file\n",
+    "with open(stac_items_file, 'w') as file:\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ececf9d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def process_item(s3_file, file, lock):\n",
+    "    print(f\"Processing {s3_file}\")\n",
+    "    filename = s3_file.split('/')[-1]\n",
+    "    year, month, day = filename.split('_')[-3:]\n",
+    "    day = day.replace('.tif', '')\n",
+    "    datetime_ = datetime.strptime(f'{year}{month}{day}', '%Y%m%d')    \n",
+    "    # Create a new Item\n",
+    "    item = rio_stac.create_stac_item(\n",
+    "            id=filename,\n",
+    "            source=s3_file,\n",
+    "            collection=collection.id,\n",
+    "            input_datetime=datetime_,\n",
+    "            with_proj=True,\n",
+    "            with_raster=True,\n",
+    "            asset_name=\"data\",\n",
+    "            asset_roles=[\"data\"],\n",
+    "            asset_media_type=\"image/tiff; application=geotiff; profile=cloud-optimized\"\n",
+    "        )\n",
+    "    tiling_asset = Asset(\n",
+    "        href=s3_file,\n",
+    "        roles=['virtual', 'tiling'],\n",
+    "        title='tiling',\n",
+    "        description='Virtual asset for tiling',\n",
+    "        extra_fields={\n",
+    "            'compose:rescale': [210, 330],\n",
+    "            'compose:colormap_name': 'hot'\n",
+    "        }\n",
+    "    )\n",
+    "    item.assets['tiling'] = tiling_asset\n",
+    "    with lock:\n",
+    "        file.write(json.dumps(item.to_dict()) + '\\n')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f38a5953-a195-4106-b172-26ba2bce9533",
+   "metadata": {},
+   "source": [
+    "NOTE: This can take awhile if processing all 730 file which is why it is subset to only 2 files below, for demonstration purposes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "645d3ccb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_01.tif\n",
+      "Processing s3://nex-gddp-cmip6-cog/daily/GISS-E2-1-G/historical/r1i1p1f2/tas/tas_day_GISS-E2-1-G_historical_r1i1p1f2_gn_1950_01_02.tif\n"
+     ]
+    }
+   ],
+   "source": [
+    "lock = threading.Lock()\n",
+    "file = open(stac_items_file, 'a')\n",
+    "with concurrent.futures.ThreadPoolExecutor() as executor:\n",
+    "    futures = [executor.submit(process_item, obj, file, lock) for obj in subset_files[0:2]]\n",
+    "    [future.result() for future in concurrent.futures.as_completed(futures)]\n",
+    "file.close()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e062949-16c9-4a79-b2ee-1579f244d74f",
+   "metadata": {},
+   "source": [
+    "# Final step - seed the database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "6965e650-f89a-4c7d-9f41-11774a905b81",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "postgresql://postgres:password@localhost:5432/postgres\n",
+      "Inserting collection from CMIP6_daily_GISS-E2-1-G_tas_collection.json\n",
+      "Inserting items from CMIP6_daily_GISS-E2-1-G_tas_stac_items.ndjson\n"
+     ]
+    }
+   ],
+   "source": [
+    "!./seed-db.sh {model} {variable}"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "cell_metadata_filter": "-all",
+   "formats": "ipynb,py",
+   "main_language": "python"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demo/cmip6/seed-db.sh b/demo/cmip6/seed-db.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+model="$1"
+variable="$2"
+
+collection_json_file="CMIP6_daily_${model}_${variable}_collection.json"
+items_json_file="CMIP6_daily_${model}_${variable}_stac_items.ndjson"
+
+if [ -z "$DATABASE_URL" ]; then
+    username=postgres
+    password=password
+    host=localhost
+    dbname=postgres
+    port=5432
+    DATABASE_URL="postgresql://$username:$password@$host:$port/$dbname"
+fi
+
+echo $DATABASE_URL
+echo "Inserting collection from $collection_json_file"
+pypgstac load collections $collection_json_file --dsn $DATABASE_URL --method insert_ignore
+echo "Inserting items from $items_json_file"
+pypgstac load items $items_json_file --dsn $DATABASE_URL --method insert_ignore