From cfcc69bf3183a11638566cb1cb543104ac8f4584 Mon Sep 17 00:00:00 2001 From: amsnyder <asnyder@usgs.gov> Date: Tue, 17 Jan 2023 21:37:24 -0600 Subject: [PATCH] add initial catalog and workflows --- .gitignore | 1 + catalog/catalog.json | 24 + catalog/conus404-daily/collection.json | 1307 +++++++++++++++++ .../add_zarr_asset_to_collection_zarr.ipynb | 42 + workflows/create_catalog.ipynb | 86 ++ workflows/create_collection_opendap.ipynb | 42 + ..._exploratory_workflow_conus404-daily.ipynb | 421 ++++++ 7 files changed, 1923 insertions(+) create mode 100644 .gitignore create mode 100644 catalog/catalog.json create mode 100644 catalog/conus404-daily/collection.json create mode 100644 workflows/add_zarr_asset_to_collection_zarr.ipynb create mode 100644 workflows/create_catalog.ipynb create mode 100644 workflows/create_collection_opendap.ipynb create mode 100644 workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..763513e9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.ipynb_checkpoints diff --git a/catalog/catalog.json b/catalog/catalog.json new file mode 100644 index 00000000..ede6559c --- /dev/null +++ b/catalog/catalog.json @@ -0,0 +1,24 @@ +{ + "type": "Catalog", + "id": "nhgf-stac-catalog", + "stac_version": "1.0.0", + "description": "Sample NHGF STAC catalog.", + "links": [ + { + "rel": "root", + "href": "./catalog.json", + "type": "application/json" + }, + { + "rel": "child", + "href": "./conus404-daily/collection.json", + "type": "application/json" + }, + { + "rel": "child", + "href": "./conus404-daily/collection.json", + "type": "application/json" + } + ], + "stac_extensions": [] +} \ No newline at end of file diff --git a/catalog/conus404-daily/collection.json b/catalog/conus404-daily/collection.json new file mode 100644 index 00000000..bb201fbe --- /dev/null +++ b/catalog/conus404-daily/collection.json @@ -0,0 +1,1307 @@ +{ + "type": "Collection", + "id": 
"conus404-daily", + "stac_version": "1.0.0", + "description": "CONUS404 40 years of daily values for subset of model output variables derived from hourly values on cloud storage", + "links": [ + { + "rel": "root", + "href": "../catalog.json", + "type": "application/json" + }, + { + "rel": "parent", + "href": "../catalog.json", + "type": "application/json" + } + ], + "stac_extensions": [ + "https://stac-extensions.github.io/datacube/v2.0.0/schema.json" + ], + "cube:dimensions": { + "time": { + "type": "temporal", + "extent": [ + "1979-10-01T00:00:00Z", + "2021-09-25T00:00:00Z" + ] + }, + "lon": { + "type": "spatial", + "axis": "x", + "extent": [ + "-138.73135", + "-57.068634" + ] + }, + "lat": { + "type": "spatial", + "axis": "y", + "extent": [ + "17.647308", + "57.34342" + ] + }, + "bottom_top_stag": { + "type": "spatial", + "axis": "z", + "description": "" + }, + "bottom_top": { + "type": "spatial", + "axis": "z", + "description": "" + }, + "soil_layers_stag": { + "type": "spatial", + "axis": "z", + "description": "" + }, + "x_stag": { + "type": "spatial", + "axis": "z", + "description": "" + }, + "y_stag": { + "type": "spatial", + "axis": "z", + "description": "" + }, + "snow_layers_stag": { + "type": "spatial", + "axis": "z", + "description": "" + }, + "snso_layers_stag": { + "type": "spatial", + "axis": "z", + "description": "" + } + }, + "cube:variables": { + "ACDEWC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACDRIPR": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACDRIPS": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACECAN": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACEDIR": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACETLSM": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACETRAN": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACEVAC": { + "dimensions": [ + 
"time", + "y", + "x" + ], + "type": "data" + }, + "ACEVB": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACEVC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACEVG": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACFROC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACFRZC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACGHB": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACGHFLSM": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACGHV": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACINTR": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACINTS": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACLHFLSM": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACLWDNB": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACLWUPB": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACMELTC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACPONDING": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACQLAT": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACQRF": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACRAINLSM": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACRAINSNOW": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACRUNSB": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACRUNSF": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSHFLSM": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSNBOT": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSNFRO": { + "dimensions": [ + "time", + "y", 
+ "x" + ], + "type": "data" + }, + "ACSNOM": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSNOWLSM": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSNSUB": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSUBC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSWDNB": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSWDNLSM": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSWDNT": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSWUPB": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACSWUPLSM": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACTHROR": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACTHROS": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ACTR": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ALBEDO": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "BF": { + "dimensions": [ + "bottom_top_stag" + ], + "type": "data" + }, + "BH": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "C1F": { + "dimensions": [ + "bottom_top_stag" + ], + "type": "data" + }, + "C1H": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "C2F": { + "dimensions": [ + "bottom_top_stag" + ], + "type": "data" + }, + "C2H": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "C3F": { + "dimensions": [ + "bottom_top_stag" + ], + "type": "data" + }, + "C3H": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "C4F": { + "dimensions": [ + "bottom_top_stag" + ], + "type": "data" + }, + "C4H": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "CANWAT": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "CF1": { + "dimensions": [], + "type": "data" + }, + "CF2": { + 
"dimensions": [], + "type": "data" + }, + "CF3": { + "dimensions": [], + "type": "data" + }, + "CFN": { + "dimensions": [], + "type": "data" + }, + "CFN1": { + "dimensions": [], + "type": "data" + }, + "CLAT": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "COSALPHA": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "DN": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "DNW": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "DZS": { + "dimensions": [ + "soil_layers_stag" + ], + "type": "data" + }, + "E": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "E2": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ES2": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "F": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "FNM": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "FNP": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "GRAUPEL_ACC_NC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "HGT": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "ISLTYP": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "IVGTYP": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "LAKEMASK": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "LANDMASK": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "LU_INDEX": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "MAPFAC_M": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "MAPFAC_MX": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "MAPFAC_MY": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "MAPFAC_U": { + "dimensions": [ + "y", + "x_stag" + ], + "type": "data" + }, + "MAPFAC_UX": { + "dimensions": [ + "y", + "x_stag" + ], + "type": "data" + }, + "MAPFAC_UY": { + "dimensions": [ + "y", + "x_stag" + ], + "type": "data" + }, + "MAPFAC_V": { 
+ "dimensions": [ + "y_stag", + "x" + ], + "type": "data" + }, + "MAPFAC_VX": { + "dimensions": [ + "y_stag", + "x" + ], + "type": "data" + }, + "MAPFAC_VY": { + "dimensions": [ + "y_stag", + "x" + ], + "type": "data" + }, + "MAX_MSTFX": { + "dimensions": [], + "type": "data" + }, + "MAX_MSTFY": { + "dimensions": [], + "type": "data" + }, + "MF_VX_INV": { + "dimensions": [ + "y_stag", + "x" + ], + "type": "data" + }, + "MLCAPE": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "MUB": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "P00": { + "dimensions": [], + "type": "data" + }, + "PB": { + "dimensions": [ + "bottom_top", + "y", + "x" + ], + "type": "data" + }, + "PBLH": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "PHB": { + "dimensions": [ + "bottom_top_stag", + "y", + "x" + ], + "type": "data" + }, + "PREC_ACC_NC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "PSFC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "PWAT": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "P_STRAT": { + "dimensions": [], + "type": "data" + }, + "P_TOP": { + "dimensions": [], + "type": "data" + }, + "Q2": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "QRFS": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "QSLAT": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "QSPRINGS": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "QVAPOR": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "RDN": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "RDNW": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "RDX": { + "dimensions": [], + "type": "data" + }, + "RDY": { + "dimensions": [], + "type": "data" + }, + "RECH": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "RH2": { + "dimensions": 
[ + "time", + "y", + "x" + ], + "type": "data" + }, + "SH2": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "SH2O": { + "dimensions": [ + "time", + "soil_layers_stag", + "y", + "x" + ], + "type": "data" + }, + "SHDMAX": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "SHDMIN": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "SINALPHA": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "SMCWTD": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "SMOIS": { + "dimensions": [ + "time", + "soil_layers_stag", + "y", + "x" + ], + "type": "data" + }, + "SNICE": { + "dimensions": [ + "time", + "snow_layers_stag", + "y", + "x" + ], + "type": "data" + }, + "SNLIQ": { + "dimensions": [ + "time", + "snow_layers_stag", + "y", + "x" + ], + "type": "data" + }, + "SNOALB": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "SNOW": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "SNOWC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "SNOWENERGY": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "SNOWH": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "SNOW_ACC_NC": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "SOILENERGY": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "SR": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "T00": { + "dimensions": [], + "type": "data" + }, + "T2": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "TD2": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "TISO": { + "dimensions": [], + "type": "data" + }, + "TK": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "TLP": { + "dimensions": [], + "type": "data" + }, + "TLP_STRAT": { + "dimensions": [], + "type": "data" + }, + "TRAD": { + "dimensions": [ + "time", + "y", + "x" 
+ ], + "type": "data" + }, + "TSK": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "TSLB": { + "dimensions": [ + "time", + "soil_layers_stag", + "y", + "x" + ], + "type": "data" + }, + "TSNO": { + "dimensions": [ + "time", + "snow_layers_stag", + "y", + "x" + ], + "type": "data" + }, + "TV": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "U": { + "dimensions": [ + "time", + "y", + "x_stag" + ], + "type": "data" + }, + "U10": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "V": { + "dimensions": [ + "time", + "y_stag", + "x" + ], + "type": "data" + }, + "V10": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "VAR": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "VAR_SSO": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "XLAND": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "Z": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "ZETATOP": { + "dimensions": [], + "type": "data" + }, + "ZNU": { + "dimensions": [ + "bottom_top" + ], + "type": "data" + }, + "ZNW": { + "dimensions": [ + "bottom_top_stag" + ], + "type": "data" + }, + "ZS": { + "dimensions": [ + "soil_layers_stag" + ], + "type": "data" + }, + "ZSNSO": { + "dimensions": [ + "time", + "snso_layers_stag", + "y", + "x" + ], + "type": "data" + }, + "ZWT": { + "dimensions": [ + "time", + "y", + "x" + ], + "type": "data" + }, + "crs": { + "dimensions": [], + "type": "data" + }, + "lat": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "lat_u": { + "dimensions": [ + "y", + "x_stag" + ], + "type": "data" + }, + "lat_v": { + "dimensions": [ + "y_stag", + "x" + ], + "type": "data" + }, + "lon": { + "dimensions": [ + "y", + "x" + ], + "type": "data" + }, + "lon_u": { + "dimensions": [ + "y", + "x_stag" + ], + "type": "data" + }, + "lon_v": { + "dimensions": [ + "y_stag", + "x" + ], + "type": "data" + }, + "time": { + "dimensions": [ + "time" + 
], + "type": "data" + }, + "x": { + "dimensions": [ + "x" + ], + "type": "data" + }, + "y": { + "dimensions": [ + "y" + ], + "type": "data" + } + }, + "extent": { + "spatial": { + "bbox": [ + [ + "-138.73135", + "17.647308", + "-57.068634", + "57.34342" + ] + ] + }, + "temporal": { + "interval": [ + [ + "1979-10-01T00:00:00Z", + "2021-09-25T00:00:00Z" + ] + ] + } + }, + "license": "CC0-1.0", + "assets": { + "zarr-s3": { + "href": "s3://nhgf-development/conus404/conus404_daily_202210.zarr/", + "type": "application/vnd+zarr", + "description": "S3 access to collection zarr group", + "xarray:open_kwargs": { + "chunks": {}, + "engine": "zarr", + "consolidated": true + }, + "xarray:storage_options": { + "requester_pays": true + }, + "roles": [ + "data", + "zarr", + "s3" + ] + } + } +} \ No newline at end of file diff --git a/workflows/add_zarr_asset_to_collection_zarr.ipynb b/workflows/add_zarr_asset_to_collection_zarr.ipynb new file mode 100644 index 00000000..6289aa97 --- /dev/null +++ b/workflows/add_zarr_asset_to_collection_zarr.ipynb @@ -0,0 +1,42 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ca2c527c-0edc-4872-95dd-ecf31fe3396c", + "metadata": {}, + "source": [ + "# nhgf-development S3 Zarr -> Collection or Asset Exploratory Workflow\n", + "This is a placeholder for a workflow that will turn zarr endpoints from the nhgf-development s3 bucket into zarr collections (or assets added to existing OPeNDAP collections) for the NHGF STAC Catalog." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb75cca-045b-4439-999c-81ad565e8205", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/workflows/create_catalog.ipynb b/workflows/create_catalog.ipynb new file mode 100644 index 00000000..bc6480e5 --- /dev/null +++ b/workflows/create_catalog.ipynb @@ -0,0 +1,86 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e56d0e15-f009-402b-9349-0fa73d6b23ef", + "metadata": {}, + "source": [ + "# STAC Catalog Workflow\n", + "This workflow was used to create the NHGF [STAC Catalog](https://github.com/radiantearth/stac-spec/blob/master/catalog-spec/catalog-spec.md)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dab9d3d8-c9fb-43f7-8395-d9c719a26377", + "metadata": {}, + "outputs": [], + "source": [ + "import pystac\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a3f5e4df-5f19-44c2-abab-fe80e7f9445a", + "metadata": {}, + "outputs": [], + "source": [ + "# define folder location where you want to store STAC json files\n", + "catalog_path = os.path.join('..', 'catalog')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "51e81a83-11ed-4e68-a18b-937b9cd2862f", + "metadata": {}, + "outputs": [], + "source": [ + "catalog = pystac.Catalog(id='nhgf-stac-catalog', description='Sample NHGF STAC catalog.')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5c7af496-0688-45b5-a04c-02cb5d065a1e", + "metadata": {}, + "outputs": [], + "source": [ + "catalog.normalize_hrefs(catalog_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "091849f9-7bec-481a-8d36-3ce4112b63e1", + "metadata": {}, + "outputs": [], + "source": [ + "catalog.save(catalog_type=pystac.CatalogType.SELF_CONTAINED)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "users-users-pangeo", + "language": "python", + "name": "conda-env-users-users-pangeo-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/workflows/create_collection_opendap.ipynb b/workflows/create_collection_opendap.ipynb new file mode 100644 index 00000000..14d54082 --- /dev/null +++ b/workflows/create_collection_opendap.ipynb @@ -0,0 +1,42 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "397887c8-618c-460c-9f32-768ae44885e9", + "metadata": {}, + "source": [ + "# OPeNDAP Endpoint -> Collection 
Exploratory Workflow\n", + "This is a placeholder for a workflow that will turn OPeNDAP data endpoints into zarr collections for the NHGF STAC Catalog." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d03f3a8-9bd0-49c6-8feb-1cf937cf199f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb b/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb new file mode 100644 index 00000000..6d904087 --- /dev/null +++ b/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb @@ -0,0 +1,421 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6c10e07b-1e60-4926-af1d-fa75dc78e5d4", + "metadata": { + "tags": [] + }, + "source": [ + "# CONUS404 Daily Zarr -> Collection Exploratory Workflow\n", + "This is a workflow for transforming the CONUS404 daily zarr dataset into a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md). We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.\n", + "\n", + "To simplify this workflow so that it can scale to many datasets, a few simplifying suggestions and assumptions are made:\n", + "1. For USGS data, we can use the CC0-1.0 license. For all other data we can use Unlicense. Ref: https://spdx.org/licenses/\n", + "2. 
I am assuming all coordinates are from the WGS84 datum if not specified.\n", + "\n", + "Note that some work needs to be done for this particular dataset's dimension metadata. I need to collect more information on the z dataset dimensions. Also, I currently use lon/lat as dimensions, but x/y are also available (and are actually dataset dimensions). I am still deciding if I will use x/y or lon/lat. I will update this workflow when those details are finalized." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "201e0945-de55-45ff-b095-c2af009a4e62", + "metadata": {}, + "outputs": [], + "source": [ + "import pystac\n", + "from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension\n", + "import xarray as xr\n", + "import os\n", + "import fsspec\n", + "import cf_xarray\n", + "import hvplot.xarray\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "20b00e88-5a13-46b3-9787-d9ac2d4e7bd6", + "metadata": {}, + "source": [ + "## Open up NHGF STAC Catalog" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adf6c59d-58cd-48b1-a5fd-3bb205a3ef56", + "metadata": {}, + "outputs": [], + "source": [ + "# define folder location where your STAC catalog json file is\n", + "catalog_path = os.path.join('..', 'catalog')\n", + "# open catalog\n", + "catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))" + ] + }, + { + "cell_type": "markdown", + "id": "996e60ba-13e4-453a-8534-e62ce747f0fa", + "metadata": {}, + "source": [ + "## Collection Metadata Input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "482d204d-b5b6-40e5-ac42-55b459be1097", + "metadata": {}, + "outputs": [], + "source": [ + "# url to zarr store that you want to create a collection for\n", + "zarr_url = 's3://nhgf-development/conus404/conus404_daily_202210.zarr/'\n", + "\n", + "# define keyword arguments needed for opening the dataset with 
xarray\n", + "# ref: https://github.com/stac-extensions/xarray-assets\n", + "xarray_opendataset_kwargs = {\"xarray:open_kwargs\":{\"chunks\":{},\"engine\":\"zarr\",\"consolidated\":True},\n", + " \"xarray:storage_options\":{\"requester_pays\":True}}\n", + "# description for zarr url asset attached to collection (zarr_url)\n", + "asset_description = \"S3 access to collection zarr group\"\n", + "# roles to tag zarr url asset with\n", + "asset_roles = [\"data\",\"zarr\",\"s3\"]\n", + "\n", + "# name for STAC collection\n", + "collection_id = 'conus404-daily'\n", + "# description of STAC collection\n", + "collection_description = 'CONUS404 40 years of daily values for subset of model output variables derived from hourly values on cloud storage'\n", + "# license for dataset\n", + "collection_license = 'CC0-1.0'" + ] + }, + { + "cell_type": "markdown", + "id": "b213b74f-ad17-4774-93b6-3b62be616b45", + "metadata": {}, + "source": [ + "## Data Exploration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "708f2cf5-79ab-49af-8067-de31d0d13ee6", + "metadata": {}, + "outputs": [], + "source": [ + "# open and view zarr dataset\n", + "fs2 = fsspec.filesystem('s3', requester_pays=True)\n", + "ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr', \n", + " backend_kwargs={'consolidated':True}, chunks={})\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f49f6d7e-af4f-471f-9a32-5db7d3b2674b", + "metadata": {}, + "outputs": [], + "source": [ + "# plot a map of a single variable\n", + "var_to_plot = 'SNOW'\n", + "da = ds[var_to_plot].sel(time='2014-03-01 00:00').load()\n", + "da.hvplot.quadmesh(x='lon', y='lat', rasterize=True, \n", + " geo=True, tiles='OSM', alpha=0.7, cmap='turbo')" + ] + }, + { + "cell_type": "markdown", + "id": "a8c3ed37-8564-400b-a7fb-25bd5e43d21c", + "metadata": {}, + "source": [ + "## Create Collection Extent" + ] + }, + { + "cell_type": "markdown", + "id": "69f0d837-68a5-4fed-9a14-5d75cfbb0da4", + 
"metadata": {}, + "source": [ + "### Spatial Extent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d46805e0-8e94-4ebe-aa01-d9a2d7051459", + "metadata": {}, + "outputs": [], + "source": [ + "# pull out lat/lon bbox for data\n", + "# coordiantes must be from WGS 84 datum\n", + "# left, bottom, right, top\n", + "coord_bounds = [str(ds.lon.data.min()), str(ds.lat.data.min()), str(ds.lon.data.max()), str(ds.lat.data.max())]\n", + "print(coord_bounds)\n", + "# create a spatial extent object \n", + "spatial_extent = pystac.SpatialExtent(bboxes=[coord_bounds])" + ] + }, + { + "cell_type": "markdown", + "id": "a04c8fca-1d33-43ac-9e2b-62d7be2887f7", + "metadata": {}, + "source": [ + "### Temporal Extent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41a84995-867c-4152-8c57-85e3758bbb77", + "metadata": {}, + "outputs": [], + "source": [ + "# pull out first and last timestamps\n", + "temporal_extent_lower = pd.Timestamp(ds.time.data.min())\n", + "temporal_extent_upper = pd.Timestamp(ds.time.data.max())\n", + "print(f'min: {temporal_extent_lower} \\nmax: {temporal_extent_upper}')\n", + "# create a temporal extent object\n", + "temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b1e37c4-5348-46ad-abc9-e005b5d6c02b", + "metadata": {}, + "outputs": [], + "source": [ + "collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)" + ] + }, + { + "cell_type": "markdown", + "id": "cfb71202-03df-45b5-ac2f-0dc2ee1ab780", + "metadata": {}, + "source": [ + "## Create pystac collection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e96811b-95ae-406a-9728-55fc429d4e1f", + "metadata": {}, + "outputs": [], + "source": [ + "collection = pystac.Collection(id=collection_id,\n", + " description=collection_description,\n", + " extent=collection_extent,\n", + " 
license=collection_license)" + ] + }, + { + "cell_type": "markdown", + "id": "a21c76e8-cd57-4eb5-a33f-7c668a3b3205", + "metadata": {}, + "source": [ + "## Add zarr url asset to collection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "094832af-d22b-4359-b0f6-cf687acce5cc", + "metadata": {}, + "outputs": [], + "source": [ + "# create an asset for the zarr url attached to the collection\n", + "asset = pystac.Asset(href=zarr_url,\n", + " description=asset_description,\n", + " media_type=\"application/vnd+zarr\",\n", + " roles=asset_roles,\n", + " extra_fields = xarray_opendataset_kwargs)\n", + "collection.add_asset(\"zarr-s3\", asset)" + ] + }, + { + "cell_type": "markdown", + "id": "f67cd5c9-db33-45c2-bc21-480cd67354f4", + "metadata": {}, + "source": [ + "## Add datacube extension to collection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc00946d-2880-491d-9b3b-3aeeb4414d6c", + "metadata": {}, + "outputs": [], + "source": [ + "# instantiate extension on collection\n", + "dc = DatacubeExtension.ext(collection, add_if_missing=True)" + ] + }, + { + "cell_type": "markdown", + "id": "8bdd77a2-7587-485e-afb7-42af3a822241", + "metadata": {}, + "source": [ + "### Add cube dimensions (required field for extension)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "120a4914-3302-44a5-a282-0308ac84f040", + "metadata": {}, + "outputs": [], + "source": [ + "# list out dataset coordinates\n", + "dims = list(ds.dims)\n", + "print(dims)" + ] + }, + { + "cell_type": "markdown", + "id": "00a5e041-081d-428d-ac2e-75d16de205e6", + "metadata": {}, + "source": [ + "user input needed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a443497-67a9-4dce-a8e9-b08d31a88223", + "metadata": {}, + "outputs": [], + "source": [ + "# create a dictionary of datacube dimensions you would like to assign to this dataset\n", + "# dimension name should come from the coordinates printed above\n", + "# we do 
not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", + "# note that the extent of each dimension should be pulled from the dataset\n", + "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')]}),\n", + " 'lon': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'extent': [coord_bounds[0], coord_bounds[2]]}),\n", + " 'lat': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'extent': [coord_bounds[1], coord_bounds[3]]}),\n", + " 'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n", + " 'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n", + " 'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n", + " 'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n", + " 'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n", + " 'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n", + " 'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n", + " }\n", + "# unresolved questions:\n", + "# do we want other fields? 
https://github.com/stac-extensions/datacube\n", + "# are these all z spatial dimensions?\n", + "# can i have multiple z dimensions on one collection?\n", + "# could i also have multiple x,y dimensions\n", + "# add descriptions\n", + "\n", + "# var dims x, y - I used lat, lon here because I thought it would be more intuitive \n", + "# but not really dims\n", + "# if we use x, y need to use reference_system - how to harvest from crs?\n", + "# options: numerical EPSG code, WKT2 (ISO 19162) string or PROJJSON object" + ] + }, + { + "cell_type": "markdown", + "id": "0f277883-a3fd-425f-966a-ca2140d0ef2f", + "metadata": {}, + "source": [ + "### Add cube variables (optional field for extension)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9272931-fc0b-4f2a-9546-283033e9cde8", + "metadata": {}, + "outputs": [], + "source": [ + "# pull list of vars from dataset\n", + "vars = list(ds.variables)\n", + "# create dictionary of dataset variables and associated dimensions\n", + "vars_dict={}\n", + "for var in vars:\n", + " vars_dict[var] = pystac.extensions.datacube.Variable({'dimensions':list(ds[var].dims), 'type': 'data'})" + ] + }, + { + "cell_type": "markdown", + "id": "11ad5352-884c-4472-8864-4570a96f66e5", + "metadata": {}, + "source": [ + "### Finalize extension" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10141fd4-91d6-491d-878b-02653720891d", + "metadata": {}, + "outputs": [], + "source": [ + "# add dimensions and variables to collection extension\n", + "dc.apply(dimensions=dims_dict, variables=vars_dict)" + ] + }, + { + "cell_type": "markdown", + "id": "615ca168-75fb-4135-9941-0ef5fe4fd1cb", + "metadata": {}, + "source": [ + "## Add STAC Collection to Catalog and Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b75791b-6b2d-40be-b7c6-330a60888fb5", + "metadata": {}, + "outputs": [], + "source": [ + "catalog.add_child(collection)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "65ab4190-dd2d-45bf-96bd-b6d32e5cf8fc", + "metadata": {}, + "outputs": [], + "source": [ + "catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f94dbc67-7fff-4cd1-98f8-bc270fc3b110", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "users-users-pangeo", + "language": "python", + "name": "conda-env-users-users-pangeo-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab