add aiem workflows

c5dabb2c · Snyder, Amelia Marie · 2c311a70 · c5dabb2c · c5dabb2c · c5dabb2c
Commit c5dabb2c authored 10 months ago by Snyder, Amelia Marie
--- a/workflows/archive/AIEM_permafrost/5m_rcp45_G0.6SW_S0.15m_wMeta_create_collection_from_zarr.ipynb
+++ b/workflows/archive/AIEM_permafrost/5m_rcp45_G0.6SW_S0.15m_wMeta_create_collection_from_zarr.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6c10e07b-1e60-4926-af1d-fa75dc78e5d4",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# 5m_rcp45_G0.6SW_S0.15m_wMeta Zarr -> Collection Workflow\n",
+    "This is a workflow to build a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md) from the zarr asset for the dataset named above. We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.\n",
+    "\n",
+    "To simplify this workflow so that it can scale to many datasets, a few simplifying suggestions and assumptions are made:\n",
+    "1. For USGS data, we can use the CC0-1.0 license. For all other data we can use Unlicense. Ref: https://spdx.org/licenses/\n",
+    "2. I am assuming all coordinates are from the WGS84 datum if not specified."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "201e0945-de55-45ff-b095-c2af009a4e62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pystac\n",
+    "from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension\n",
+    "import xarray as xr\n",
+    "import cf_xarray\n",
+    "import os\n",
+    "import fsspec\n",
+    "import cf_xarray\n",
+    "import hvplot.xarray\n",
+    "import pandas as pd\n",
+    "import json\n",
+    "import numpy as np\n",
+    "import pyproj\n",
+    "from pyproj import Transformer\n",
+    "import cartopy.crs as ccrs\n",
+    "import cfunits\n",
+    "import json\n",
+    "import sys\n",
+    "sys.path.insert(1, '../..')\n",
+    "import stac_helpers"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a71f9d19-8fb3-4f47-b4c4-447bb80d8dd5",
+   "metadata": {},
+   "source": [
+    "## Collection ID"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15ee060d-3127-4024-a1ad-6aa0648667e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# name for STAC collection - should match name of zarr dataset\n",
+    "collection_id = '5m_rcp45_G0.6SW_S0.15m_wMeta'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "116b5837-8e85-4ae7-964a-803533ded714",
+   "metadata": {},
+   "source": [
+    "## Asset Metadata Input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dd6fa323-132a-4794-8c80-576933f547a0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# url to zarr store that you want to create a collection for\n",
+    "zarr_url = f's3://mdmf/gdp/{collection_id}.zarr/'\n",
+    "\n",
+    "# define keyword arguments needed for opening the dataset with xarray\n",
+    "# ref: https://github.com/stac-extensions/xarray-assets\n",
+    "xarray_opendataset_kwargs = {\"xarray:open_kwargs\":{\"chunks\":{},\"engine\":\"zarr\",\"consolidated\":True},\n",
+    "                          \"xarray:storage_options\": {\"anon\": True, \"client_kwargs\": {\"endpoint_url\":\"https://usgs.osn.mghpcc.org/\"}}}\n",
+    "# title - The displayed title for clients and users.\n",
+    "asset_title = 'Requester pays access to zarr via S3 API'\n",
+    "# description for zarr url asset attached to collection (zarr_url)\n",
+    "asset_description = \"Free, public access to zarr data store via the S3 API. This data is stored on an Open Storage Network Pod.\"\n",
+    "# roles to tag zarr url asset with\n",
+    "asset_roles = [\"data\",\"zarr\",\"s3\",\"osn\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1441cd4-e94c-4902-af46-8f1af470eb6b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# url to zarr store that you want to create a collection for\n",
+    "zarr_url2 = f's3://nhgf-development/workspace/DataConversion/{collection_id}.zarr/'\n",
+    "\n",
+    "# define keyword arguments needed for opening the dataset with xarray\n",
+    "# ref: https://github.com/stac-extensions/xarray-assets\n",
+    "xarray_opendataset_kwargs2 = {\"xarray:open_kwargs\":{\"chunks\":{},\"engine\":\"zarr\",\"consolidated\":True},\n",
+    "                          \"xarray:storage_options\":{\"requester_pays\":True}}\n",
+    "# title - The displayed title for clients and users.\n",
+    "asset_title2 = 'Free access to zarr via S3 API'\n",
+    "# description for zarr url asset attached to collection (zarr_url)\n",
+    "asset_description2 = \"Requester pays, public access to zarr data store via the S3 API. This data is stored in an AWS S3 bucket.\"\n",
+    "# roles to tag zarr url asset with\n",
+    "asset_roles2 = [\"data\",\"zarr\",\"s3\",\"aws\",\"requester-pays\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b213b74f-ad17-4774-93b6-3b62be616b45",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Data Exploration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "708f2cf5-79ab-49af-8067-de31d0d13ee6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# open and view zarr dataset\n",
+    "fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')\n",
+    "ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr', \n",
+    "                             backend_kwargs={'consolidated':True}, chunks={})\n",
+    "ds"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "996e60ba-13e4-453a-8534-e62ce747f0fa",
+   "metadata": {},
+   "source": [
+    "## Collection Metadata Input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "482d204d-b5b6-40e5-ac42-55b459be1097",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# description of STAC collection\n",
+    "collection_description = ds.attrs['title']\n",
+    "print(f'collection description: {collection_description}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "91129d65-a614-4fe4-86b6-105b1f121f55",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# license for dataset\n",
+    "collection_license = stac_helpers.license_picker(ds.attrs['license'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc",
+   "metadata": {},
+   "source": [
+    "## Identify x, y, t dimensions of dataset\n",
+    "May require user input if dimensions cannot be auto-detected."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab91268f-7200-4cb1-979a-c7d75531d2c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# dims_auto_extract = ['X', 'Y', 'T']\n",
+    "# dim_names_dict = {}\n",
+    "# for d in dims_auto_extract:\n",
+    "#     dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n",
+    "dim_names_dict = {'X': 'x', 'Y': 'y', 'T': 'time'}\n",
+    "print(f\"Dimension dictionary: {dim_names_dict}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "810d7480-165d-41c0-bd09-163656a14003",
+   "metadata": {},
+   "source": [
+    "## Get crs info\n",
+    "If there is no crs info that can be automatically extracted from the dataset with pyproj, you will need to manually identify the crs and create a crs object. This reference list of cartopy projections may be a helpful resource: https://scitools.org.uk/cartopy/docs/latest/reference/projections.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "239d3b00-77f9-4178-954b-ba81a2b34512",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "crs_var = 'crs'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b03d52f3-1367-4255-a561-52ee4fc9e92d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# use pyproj to automatically extract crs info\n",
+    "crs = pyproj.CRS.from_cf(ds[crs_var].attrs)\n",
+    "\n",
+    "# alternatively, create the appropriate cartopy projection\n",
+    "# crs = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian, \n",
+    "#                             central_latitude=crs_info.latitude_of_projection_origin,\n",
+    "#                             standard_parallels=crs_info.standard_parallel)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "282c689e-07f0-48ee-8e3d-35876e8c5094",
+   "metadata": {},
+   "source": [
+    "### Compare dataset crs var to generated proj4 string to make sure it looks ok"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cee13ba-487d-483e-a013-b65685137502",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds[crs_var]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f7bc73db-7717-450e-9679-525f7be0c910",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "crs.to_proj4()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a8c3ed37-8564-400b-a7fb-25bd5e43d21c",
+   "metadata": {},
+   "source": [
+    "## Create Collection Extent"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "69f0d837-68a5-4fed-9a14-5d75cfbb0da4",
+   "metadata": {},
+   "source": [
+    "### Spatial Extent\n",
+    "##### WARNING - make sure data type is **float** NOT **numpy.float64**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d46805e0-8e94-4ebe-aa01-d9a2d7051459",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pull out lat/lon bbox for data\n",
+    "# coordinates must be from WGS 84 datum\n",
+    "# left, bottom, right, top\n",
+    "\n",
+    "# Note: try changing around the commented out lines below to get type float rather than a numpy float\n",
+    "#spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float), ds[dim_names_dict['Y']].data.min().compute().astype(float), ds[dim_names_dict['X']].data.max().compute().astype(float), ds[dim_names_dict['Y']].data.max().compute().astype(float)]\n",
+    "#spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['X']].data.max().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.max().compute().astype(float).tolist()]\n",
+    "spatial_bounds = [ds[dim_names_dict['X']].data.min().astype(float).item(), ds[dim_names_dict['Y']].data.min().astype(float).item(), ds[dim_names_dict['X']].data.max().astype(float).item(), ds[dim_names_dict['Y']].data.max().astype(float).item()]\n",
+    "print(spatial_bounds)\n",
+    "print(f'\\nspatial_bounds data type: {type(spatial_bounds[0])}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f16fdb9e-7ed8-40fb-a4f1-9ecabdebc0a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "XX, YY = np.meshgrid(ds[dim_names_dict['X']].data, ds[dim_names_dict['Y']].data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "074fc23c-f4d9-4427-80d3-fbf691e6d411",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transformer = Transformer.from_crs(crs, \"EPSG:4326\", always_xy=True)\n",
+    "lon, lat = transformer.transform(XX.ravel(), YY.ravel())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5345c975-9fe3-48e1-a663-0275cdf275dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f'lower left coordinates (WGS84): {min(lon)}, {min(lat)}')\n",
+    "print(f'upper right coordinates (WGS84): {max(lon)}, {max(lat)}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e0a5a222-743d-403a-9411-2406374803cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a spatial extent object \n",
+    "spatial_extent = pystac.SpatialExtent(bboxes=[[min(lon).item(), min(lat).item(), max(lon).item(), max(lat).item()]])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a04c8fca-1d33-43ac-9e2b-62d7be2887f7",
+   "metadata": {},
+   "source": [
+    "### Temporal Extent"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41a84995-867c-4152-8c57-85e3758bbb77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# pull out first and last timestamps\n",
+    "# temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())\n",
+    "# temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())\n",
+    "# if you get an error:\n",
+    "# Cannot convert input [] of type <class 'cftime._cftime.DatetimeNoLeap'> to Timestamp\n",
+    "# use the following instead:\n",
+    "temporal_extent_lower = pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min())\n",
+    "temporal_extent_upper = pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max())\n",
+    "\n",
+    "print(f'min: {temporal_extent_lower} \\nmax: {temporal_extent_upper}')\n",
+    "# create a temporal extent object\n",
+    "temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b1e37c4-5348-46ad-abc9-e005b5d6c02b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20b00e88-5a13-46b3-9787-d9ac2d4e7bd6",
+   "metadata": {},
+   "source": [
+    "## Open up STAC Catalog and create a collection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "adf6c59d-58cd-48b1-a5fd-3bb205a3ef56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# define folder location where your STAC catalog json file is\n",
+    "catalog_path = os.path.join('..', '..', '..', 'catalog')\n",
+    "# open catalog\n",
+    "catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9fbb3380-88c0-435f-af77-ccdd45a63ced",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# open parent collection for this dataset\n",
+    "parent_collection_id = 'AIEM_permafrost'\n",
+    "parent_collection = catalog.get_child(parent_collection_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e96811b-95ae-406a-9728-55fc429d4e1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if parent_collection.get_child(collection_id):\n",
+    "    collection = parent_collection.get_child(collection_id)\n",
+    "    print(\"existing collection opened\")\n",
+    "    collection.extent=collection_extent\n",
+    "    collection.description=collection_description\n",
+    "    collection.license=collection_license\n",
+    "else:\n",
+    "    collection = pystac.Collection(id=collection_id,\n",
+    "                                   description=collection_description,\n",
+    "                                   extent=collection_extent,\n",
+    "                                   license=collection_license)\n",
+    "    print(\"new collection created\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a21c76e8-cd57-4eb5-a33f-7c668a3b3205",
+   "metadata": {},
+   "source": [
+    "## Add zarr url asset to collection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c65c10b2-46b3-4e4e-a493-dd71579534f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# first clear out any existing assets - that way if we change the name, we won't have the old asset\n",
+    "# preserved\n",
+    "collection.assets.clear()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "094832af-d22b-4359-b0f6-cf687acce5cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "asset_id = \"zarr-s3-osn\"\n",
+    "asset = pystac.Asset(href=zarr_url,\n",
+    "                     title=asset_title,\n",
+    "                     description=asset_description,\n",
+    "                     media_type=\"application/vnd+zarr\",\n",
+    "                     roles=asset_roles,\n",
+    "                     extra_fields = xarray_opendataset_kwargs)\n",
+    "collection.add_asset(asset_id, asset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c298d07-f234-4a08-986d-87f4a39e9ae6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "asset_id2 = \"zarr-s3-aws\"\n",
+    "asset2 = pystac.Asset(href=zarr_url2,\n",
+    "                     title=asset_title2,\n",
+    "                     description=asset_description2,\n",
+    "                     media_type=\"application/vnd+zarr\",\n",
+    "                     roles=asset_roles2,\n",
+    "                     extra_fields = xarray_opendataset_kwargs2)\n",
+    "collection.add_asset(asset_id2, asset2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5305b35-c5ab-48e2-af1e-8f313c49b74c",
+   "metadata": {},
+   "source": [
+    "## Add Collection Link to Archival Assets on S3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9182b8ef-675e-44fc-a24c-4759c295f420",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "archival_id = \"legacy-s3-aws\"\n",
+    "archival_asset = pystac.Asset(href='s3://nhgf-development/thredds/AIEM_permafrost/',\n",
+    "                             title='Requester pays access to archival legacy files via S3 API', \n",
+    "                             description='Requester pays, public access (via the S3 API) to archival legacy files from WMA THREDDS server that were used to create this zarr store. This data is stored in an AWS S3 bucket.',\n",
+    "                             roles=[\"data\",\"directory\",\"s3\",\"aws\",\"requester-pays\"])\n",
+    "collection.add_asset(archival_id, archival_asset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f67cd5c9-db33-45c2-bc21-480cd67354f4",
+   "metadata": {},
+   "source": [
+    "## Add datacube extension to collection"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fc00946d-2880-491d-9b3b-3aeeb4414d6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# instantiate extention on collection\n",
+    "dc = DatacubeExtension.ext(collection, add_if_missing=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8bdd77a2-7587-485e-afb7-42af3a822241",
+   "metadata": {},
+   "source": [
+    "### Add cube dimensions (required field for extension)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e7dc357c-91ec-49ae-83e5-400f791f9792",
+   "metadata": {},
+   "source": [
+    "#### user review needed\n",
+    "#### compare crs information to the projjson to make sure it looks correct"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea452f62-5644-49b6-8a4e-7dc4f649fd1a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# print out crs information in dataset\n",
+    "crs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b1d05ff-8e43-44a7-8343-178b112c4ad6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # the datacube extension can accept reference_system information as a numerical EPSG code, \n",
+    "# # WKT2 (ISO 19162) string or PROJJSON object.\n",
+    "# # we will use a projjson, as was done by Microsoft Planetary Computer here:\n",
+    "# # https://planetarycomputer.microsoft.com/dataset/daymet-annual-na\n",
+    "# # https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na\n",
+    "# projjson = json.loads(lcc.to_json())\n",
+    "\n",
+    "# alternatively, I think we could do this:\n",
+    "projjson = crs.to_json()\n",
+    "print(crs.to_json(pretty=True))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b6b88ee9-60c2-4d91-af74-c1c56b094826",
+   "metadata": {},
+   "source": [
+    "#### user review needed\n",
+    "#### look at the spatial and temporal steps, make sure they are all successfully pulled and they look correct"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9e2bbcc5-e45a-4b8c-9d60-601f345e8134",
+   "metadata": {},
+   "source": [
+    "**Time**\n",
+    "\n",
+    "If you need to manually construct this field, here is a helpful reference: https://en.wikipedia.org/wiki/ISO_8601#Durations\n",
+    "\n",
+    "Manually constructing 1 year time step instead of the auto-extracted 365 day time step."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82f1e9bd-52ee-46f5-9e95-c2359d95fcf3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n",
+    "# if time is yearly or monthly, you will need to manually construct it:\n",
+    "time_step = \"P1Y0M0DT0H0M0S\"\n",
+    "print(f'time step: {time_step}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64be65b2-de20-447a-a9c2-bd8eca3e440e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # optional debugging for time steps:\n",
+    "# # check all step sizes (step_list), get number of occurences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed\n",
+    "# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred\n",
+    "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bc8dff39-2a2e-44a0-9b30-987107c2d1e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # debugging for time steps, cont:\n",
+    "# # please choose one of the index locations printed above\n",
+    "# # this will print the time steps adjacent to it\n",
+    "# ix = 11\n",
+    "# ds.isel(time=slice(ix-1,ix+3)).time"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9aa6c8ff-8d9b-40a7-a281-39b502bd5a3d",
+   "metadata": {},
+   "source": [
+    "**X/lon**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8ba7695-ca45-4db2-bd46-c465f4e37eff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n",
+    "# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation\n",
+    "# use the debugging cells below to identify if this is the issue, if so, use the round_dec argument to round to a higher decimal place:\n",
+    "#x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=13)\n",
+    "print(f'x step: {x_step}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fac4c9f2-a952-4c7f-aa32-862957372d6f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # optional debugging for spatial steps:\n",
+    "# # check all step sizes (step_list), get number of occurences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed\n",
+    "# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred\n",
+    "# x_dim=dim_names_dict['X']\n",
+    "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=0)\n",
+    "# print(f'\\nx dim name (for next cell): {x_dim}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d0b5a2d-dc58-4ad6-b890-859ce6bb08de",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # debugging for spatial steps, cont:\n",
+    "# # please choose one of the index locations printed above\n",
+    "# # this will print the time steps adjacent to it\n",
+    "# ix = 5\n",
+    "# ds.isel(x=slice(ix-1,ix+3)).x"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "21b5cca4-8bb4-498d-ae6b-6b8545fffe56",
+   "metadata": {},
+   "source": [
+    "**Y/lat**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7405583b-ecb9-44b0-8815-048e42e55a42",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n",
+    "# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation\n",
+    "# use the debugging cells below to identify if this is the issue, if so, use the round_dec argument to round to a higher decimal place:\n",
+    "#y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=12)\n",
+    "print(f'y step: {y_step}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ece0fe37-b54c-4721-aa9b-33d2998d191b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # optional debugging for spatial steps:\n",
+    "# # check all step sizes (step_list), get number of occurences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed\n",
+    "# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred\n",
+    "# y_dim=dim_names_dict['Y']\n",
+    "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=0)\n",
+    "# print(f'\\nx dim name (for next cell): {x_dim}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "abdafb8f-5217-4b82-91b6-eec8183c9128",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # debugging for spatial steps, cont:\n",
+    "# # please choose one of the index locations printed above\n",
+    "# # this will print the time steps adjacent to it\n",
+    "# ix = 5\n",
+    "# ds.isel(y=slice(ix-1,ix+3)).y"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "00a5e041-081d-428d-ac2e-75d16de205e6",
+   "metadata": {},
+   "source": [
+    "#### user input needed\n",
+    "#### you will need to copy all of the dimensions printed below into the dict and fill in the appropriate attributes (type, axis, extent, etc.):\n",
+    "\n",
+    "Please see [datacube spec](https://github.com/stac-extensions/datacube?tab=readme-ov-file#dimension-object) for details on required fields.\n",
+    "\n",
+    "If you have a dimension like \"bnds\" or \"nv\" that is used on variables like time_bnds, lon_bnds, lat_bnds to choose either the lower or upper bound, you can use and [additional dimension object](https://github.com/stac-extensions/datacube?tab=readme-ov-file#additional-dimension-object). We recommend making the type \"count\" as Microsoft Planetary Computer did [here](https://github.com/stac-extensions/datacube/blob/9e74fa706c9bdd971e01739cf18dcc53bdd3dd4f/examples/daymet-hi-annual.json#L76).\n",
+    "\n",
+    "Here is an example:\n",
+    "\n",
+    "```\n",
+    "dims_dict = {\n",
+    "            'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]})\n",
+    "            }\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "acd45d3c-7845-47e6-9b7d-e35627a7ca9a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dims = list(ds.dims)\n",
+    "print(dims)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a443497-67a9-4dce-a8e9-b08d31a88223",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a dictionary of datacube dimensions you would like to assign to this dataset\n",
+    "# dimension name should come from the dims printed in above cell\n",
+    "\n",
+    "# x, y, t dimension info is pulled out automatically using the dim dict we created above\n",
+    "# all other dims listed in above cell need to be manually written in\n",
+    "\n",
+    "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n",
+    "# note that the extent of each dimension should be pulled from the dataset\n",
+    "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n",
+    "             dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [spatial_bounds[0], spatial_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n",
+    "             dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [spatial_bounds[1], spatial_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n",
+    "             'z': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'z'), 'extent': [ds.z.min().item(), ds.z.max().item()]}),\n",
+    "            }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8ab85b09-eb38-404c-910c-13349d5e2234",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make sure you added all the right dims\n",
+    "assert sorted(list(dims_dict.keys())) == sorted(dims)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f277883-a3fd-425f-966a-ca2140d0ef2f",
+   "metadata": {},
+   "source": [
+    "### Add cube variables (optional field for extension)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9272931-fc0b-4f2a-9546-283033e9cde8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# drop metpy_crs coordinate we have added\n",
+    "if 'metpy_crs' in ds.coords:\n",
+    "    ds = ds.drop_vars('metpy_crs')\n",
+    "\n",
+    "# pull list of vars from dataset\n",
+    "vars = list(ds.variables)\n",
+    "\n",
+    "# spec says that the keys of cube:dimensions and cube:variables should be unique together; a key like lat should not be both a dimension and a variable.\n",
+    "# we will drop all values in dims from vars\n",
+    "vars = [v for v in vars if v not in dims]\n",
+    "\n",
+    "# Microsoft Planetary Computer includes coordinates and crs as variables here:\n",
+    "# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na\n",
+    "# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na\n",
+    "# we will keep those in the var list\n",
+    "\n",
+    "# create dictionary of dataset variables and associated dimensions\n",
+    "vars_dict={}\n",
+    "for v in vars:\n",
+    "    unit = stac_helpers.get_unit(ds, v)\n",
+    "    var_type = stac_helpers.get_var_type(ds, v, crs_var)\n",
+    "    long_name = stac_helpers.get_long_name(ds, v)\n",
+    "    vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "11ad5352-884c-4472-8864-4570a96f66e5",
+   "metadata": {},
+   "source": [
+    "### Finalize extension"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10141fd4-91d6-491d-878b-02653720891d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add dimesions and variables to collection extension\n",
+    "dc.apply(dimensions=dims_dict, variables=vars_dict)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "615ca168-75fb-4135-9941-0ef5fe4fd1cb",
+   "metadata": {},
+   "source": [
+    "## Add STAC Collection to Catalog and Save"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2120a55-3d04-4122-a93f-29afcdb8cb1b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# # helper to find items of wrong type\n",
+    "# d = collection.to_dict()\n",
+    "# print(*stac_helpers.find_paths(d))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b75791b-6b2d-40be-b7c6-330a60888fb5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if parent_collection.get_child(collection_id):\n",
+    "    collection.normalize_and_save(root_href=os.path.join(catalog_path, parent_collection_id, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)\n",
+    "else:\n",
+    "    parent_collection.add_child(collection)\n",
+    "    parent_collection.normalize_and_save(root_href=catalog_path+'/'+parent_collection_id, catalog_type=pystac.CatalogType.SELF_CONTAINED)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6f676b5-e892-4bfb-8d73-2828addd838c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "global-global-pangeo",
+   "language": "python",
+   "name": "conda-env-global-global-pangeo-py"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
+%% Cell type:markdown id:6c10e07b-1e60-4926-af1d-fa75dc78e5d4 tags:
+
+# 5m_rcp45_G0.6SW_S0.15m_wMeta Zarr -> Collection Workflow
+This is a workflow to build a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md) from the zarr asset for the dataset named above. We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.
+
+To simplify this workflow so that it can scale to many datasets, a few simplifying suggestions and assumptions are made:
+1. For USGS data, we can use the CC0-1.0 license. For all other data we can use Unlicense. Ref: https://spdx.org/licenses/
+2. I am assuming all coordinates are from the WGS84 datum if not specified.
+
+%% Cell type:code id:201e0945-de55-45ff-b095-c2af009a4e62 tags:
+
+``` python
+import pystac
+from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension
+import xarray as xr
+import cf_xarray
+import os
+import fsspec
+import cf_xarray
+import hvplot.xarray
+import pandas as pd
+import json
+import numpy as np
+import pyproj
+from pyproj import Transformer
+import cartopy.crs as ccrs
+import cfunits
+import json
+import sys
+sys.path.insert(1, '../..')
+import stac_helpers
+```
+
+%% Cell type:markdown id:a71f9d19-8fb3-4f47-b4c4-447bb80d8dd5 tags:
+
+## Collection ID
+
+%% Cell type:code id:15ee060d-3127-4024-a1ad-6aa0648667e1 tags:
+
+``` python
+# name for STAC collection - should match name of zarr dataset
+collection_id = '5m_rcp45_G0.6SW_S0.15m_wMeta'
+```
+
+%% Cell type:markdown id:116b5837-8e85-4ae7-964a-803533ded714 tags:
+
+## Asset Metadata Input
+
+%% Cell type:code id:dd6fa323-132a-4794-8c80-576933f547a0 tags:
+
+``` python
+# url to zarr store that you want to create a collection for
+zarr_url = f's3://mdmf/gdp/{collection_id}.zarr/'
+
+# define keyword arguments needed for opening the dataset with xarray
+# ref: https://github.com/stac-extensions/xarray-assets
+xarray_opendataset_kwargs = {"xarray:open_kwargs":{"chunks":{},"engine":"zarr","consolidated":True},
+                          "xarray:storage_options": {"anon": True, "client_kwargs": {"endpoint_url":"https://usgs.osn.mghpcc.org/"}}}
+# title - The displayed title for clients and users.
+asset_title = 'Requester pays access to zarr via S3 API'
+# description for zarr url asset attached to collection (zarr_url)
+asset_description = "Free, public access to zarr data store via the S3 API. This data is stored on an Open Storage Network Pod."
+# roles to tag zarr url asset with
+asset_roles = ["data","zarr","s3","osn"]
+```
+
+%% Cell type:code id:e1441cd4-e94c-4902-af46-8f1af470eb6b tags:
+
+``` python
+# url to zarr store that you want to create a collection for
+zarr_url2 = f's3://nhgf-development/workspace/DataConversion/{collection_id}.zarr/'
+
+# define keyword arguments needed for opening the dataset with xarray
+# ref: https://github.com/stac-extensions/xarray-assets
+xarray_opendataset_kwargs2 = {"xarray:open_kwargs":{"chunks":{},"engine":"zarr","consolidated":True},
+                          "xarray:storage_options":{"requester_pays":True}}
+# title - The displayed title for clients and users.
+asset_title2 = 'Free access to zarr via S3 API'
+# description for zarr url asset attached to collection (zarr_url)
+asset_description2 = "Requester pays, public access to zarr data store via the S3 API. This data is stored in an AWS S3 bucket."
+# roles to tag zarr url asset with
+asset_roles2 = ["data","zarr","s3","aws","requester-pays"]
+```
+
+%% Cell type:markdown id:b213b74f-ad17-4774-93b6-3b62be616b45 tags:
+
+## Data Exploration
+
+%% Cell type:code id:708f2cf5-79ab-49af-8067-de31d0d13ee6 tags:
+
+``` python
+# open and view zarr dataset
+fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
+ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
+                             backend_kwargs={'consolidated':True}, chunks={})
+ds
+```
+
+%% Cell type:markdown id:996e60ba-13e4-453a-8534-e62ce747f0fa tags:
+
+## Collection Metadata Input
+
+%% Cell type:code id:482d204d-b5b6-40e5-ac42-55b459be1097 tags:
+
+``` python
+# description of STAC collection
+collection_description = ds.attrs['title']
+print(f'collection description: {collection_description}')
+```
+
+%% Cell type:code id:91129d65-a614-4fe4-86b6-105b1f121f55 tags:
+
+``` python
+# license for dataset
+collection_license = stac_helpers.license_picker(ds.attrs['license'])
+```
+
+%% Cell type:markdown id:0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc tags:
+
+## Identify x, y, t dimensions of dataset
+May require user input if dimensions cannot be auto-detected.
+
+%% Cell type:code id:ab91268f-7200-4cb1-979a-c7d75531d2c0 tags:
+
+``` python
+# dims_auto_extract = ['X', 'Y', 'T']
+# dim_names_dict = {}
+# for d in dims_auto_extract:
+#     dim_names_dict[d] = stac_helpers.extract_dim(ds, d)
+dim_names_dict = {'X': 'x', 'Y': 'y', 'T': 'time'}
+print(f"Dimension dictionary: {dim_names_dict}")
+```
+
+%% Cell type:markdown id:810d7480-165d-41c0-bd09-163656a14003 tags:
+
+## Get crs info
+If there is no crs info that can be automatically extracted from the dataset with pyproj, you will need to manually identify the crs and create a crs object. This reference list of cartopy projections may be a helpful resource: https://scitools.org.uk/cartopy/docs/latest/reference/projections.html
+
+%% Cell type:code id:239d3b00-77f9-4178-954b-ba81a2b34512 tags:
+
+``` python
+crs_var = 'crs'
+```
+
+%% Cell type:code id:b03d52f3-1367-4255-a561-52ee4fc9e92d tags:
+
+``` python
+# use pyproj to automatically extract crs info
+crs = pyproj.CRS.from_cf(ds[crs_var].attrs)
+
+# alternatively, create the appropriate cartopy projection
+# crs = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian,
+#                             central_latitude=crs_info.latitude_of_projection_origin,
+#                             standard_parallels=crs_info.standard_parallel)
+```
+
+%% Cell type:markdown id:282c689e-07f0-48ee-8e3d-35876e8c5094 tags:
+
+### Compare dataset crs var to generated proj4 string to make sure it looks ok
+
+%% Cell type:code id:4cee13ba-487d-483e-a013-b65685137502 tags:
+
+``` python
+ds[crs_var]
+```
+
+%% Cell type:code id:f7bc73db-7717-450e-9679-525f7be0c910 tags:
+
+``` python
+crs.to_proj4()
+```
+
+%% Cell type:markdown id:a8c3ed37-8564-400b-a7fb-25bd5e43d21c tags:
+
+## Create Collection Extent
+
+%% Cell type:markdown id:69f0d837-68a5-4fed-9a14-5d75cfbb0da4 tags:
+
+### Spatial Extent
+##### WARNING - make sure data type is **float** NOT **numpy.float64**
+
+%% Cell type:code id:d46805e0-8e94-4ebe-aa01-d9a2d7051459 tags:
+
+``` python
+# pull out lat/lon bbox for data
+# coordinates must be from WGS 84 datum
+# left, bottom, right, top
+
+# Note: try changing around the commented out lines below to get type float rather than a numpy float
+#spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float), ds[dim_names_dict['Y']].data.min().compute().astype(float), ds[dim_names_dict['X']].data.max().compute().astype(float), ds[dim_names_dict['Y']].data.max().compute().astype(float)]
+#spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['X']].data.max().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.max().compute().astype(float).tolist()]
+spatial_bounds = [ds[dim_names_dict['X']].data.min().astype(float).item(), ds[dim_names_dict['Y']].data.min().astype(float).item(), ds[dim_names_dict['X']].data.max().astype(float).item(), ds[dim_names_dict['Y']].data.max().astype(float).item()]
+print(spatial_bounds)
+print(f'\nspatial_bounds data type: {type(spatial_bounds[0])}')
+```
+
+%% Cell type:code id:f16fdb9e-7ed8-40fb-a4f1-9ecabdebc0a1 tags:
+
+``` python
+XX, YY = np.meshgrid(ds[dim_names_dict['X']].data, ds[dim_names_dict['Y']].data)
+```
+
+%% Cell type:code id:074fc23c-f4d9-4427-80d3-fbf691e6d411 tags:
+
+``` python
+transformer = Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
+lon, lat = transformer.transform(XX.ravel(), YY.ravel())
+```
+
+%% Cell type:code id:5345c975-9fe3-48e1-a663-0275cdf275dc tags:
+
+``` python
+print(f'lower left coordinates (WGS84): {min(lon)}, {min(lat)}')
+print(f'upper right coordinates (WGS84): {max(lon)}, {max(lat)}')
+```
+
+%% Cell type:code id:e0a5a222-743d-403a-9411-2406374803cf tags:
+
+``` python
+# create a spatial extent object
+spatial_extent = pystac.SpatialExtent(bboxes=[[min(lon).item(), min(lat).item(), max(lon).item(), max(lat).item()]])
+```
+
+%% Cell type:markdown id:a04c8fca-1d33-43ac-9e2b-62d7be2887f7 tags:
+
+### Temporal Extent
+
+%% Cell type:code id:41a84995-867c-4152-8c57-85e3758bbb77 tags:
+
+``` python
+# pull out first and last timestamps
+# temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())
+# temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())
+# if you get an error:
+# Cannot convert input [] of type <class 'cftime._cftime.DatetimeNoLeap'> to Timestamp
+# use the following instead:
+temporal_extent_lower = pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min())
+temporal_extent_upper = pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max())
+
+print(f'min: {temporal_extent_lower} \nmax: {temporal_extent_upper}')
+# create a temporal extent object
+temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])
+```
+
+%% Cell type:code id:1b1e37c4-5348-46ad-abc9-e005b5d6c02b tags:
+
+``` python
+collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)
+```
+
+%% Cell type:markdown id:20b00e88-5a13-46b3-9787-d9ac2d4e7bd6 tags:
+
+## Open up STAC Catalog and create a collection
+
+%% Cell type:code id:adf6c59d-58cd-48b1-a5fd-3bb205a3ef56 tags:
+
+``` python
+# define folder location where your STAC catalog json file is
+catalog_path = os.path.join('..', '..', '..', 'catalog')
+# open catalog
+catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))
+```
+
+%% Cell type:code id:9fbb3380-88c0-435f-af77-ccdd45a63ced tags:
+
+``` python
+# open parent collection for this dataset
+parent_collection_id = 'AIEM_permafrost'
+parent_collection = catalog.get_child(parent_collection_id)
+```
+
+%% Cell type:code id:7e96811b-95ae-406a-9728-55fc429d4e1f tags:
+
+``` python
+if parent_collection.get_child(collection_id):
+    collection = parent_collection.get_child(collection_id)
+    print("existing collection opened")
+    collection.extent=collection_extent
+    collection.description=collection_description
+    collection.license=collection_license
+else:
+    collection = pystac.Collection(id=collection_id,
+                                   description=collection_description,
+                                   extent=collection_extent,
+                                   license=collection_license)
+    print("new collection created")
+```
+
+%% Cell type:markdown id:a21c76e8-cd57-4eb5-a33f-7c668a3b3205 tags:
+
+## Add zarr url asset to collection
+
+%% Cell type:code id:c65c10b2-46b3-4e4e-a493-dd71579534f3 tags:
+
+``` python
+# first clear out any existing assets - that way if we change the name, we won't have the old asset
+# preserved
+collection.assets.clear()
+```
+
+%% Cell type:code id:094832af-d22b-4359-b0f6-cf687acce5cc tags:
+
+``` python
+asset_id = "zarr-s3-osn"
+asset = pystac.Asset(href=zarr_url,
+                     title=asset_title,
+                     description=asset_description,
+                     media_type="application/vnd+zarr",
+                     roles=asset_roles,
+                     extra_fields = xarray_opendataset_kwargs)
+collection.add_asset(asset_id, asset)
+```
+
+%% Cell type:code id:0c298d07-f234-4a08-986d-87f4a39e9ae6 tags:
+
+``` python
+asset_id2 = "zarr-s3-aws"
+asset2 = pystac.Asset(href=zarr_url2,
+                     title=asset_title2,
+                     description=asset_description2,
+                     media_type="application/vnd+zarr",
+                     roles=asset_roles2,
+                     extra_fields = xarray_opendataset_kwargs2)
+collection.add_asset(asset_id2, asset2)
+```
+
+%% Cell type:markdown id:c5305b35-c5ab-48e2-af1e-8f313c49b74c tags:
+
+## Add Collection Link to Archival Assets on S3
+
+%% Cell type:code id:9182b8ef-675e-44fc-a24c-4759c295f420 tags:
+
+``` python
+archival_id = "legacy-s3-aws"
+archival_asset = pystac.Asset(href='s3://nhgf-development/thredds/AIEM_permafrost/',
+                             title='Requester pays access to archival legacy files via S3 API',
+                             description='Requester pays, public access (via the S3 API) to archival legacy files from WMA THREDDS server that were used to create this zarr store. This data is stored in an AWS S3 bucket.',
+                             roles=["data","directory","s3","aws","requester-pays"])
+collection.add_asset(archival_id, archival_asset)
+```
+
+%% Cell type:markdown id:f67cd5c9-db33-45c2-bc21-480cd67354f4 tags:
+
+## Add datacube extension to collection
+
+%% Cell type:code id:fc00946d-2880-491d-9b3b-3aeeb4414d6c tags:
+
+``` python
+# instantiate extention on collection
+dc = DatacubeExtension.ext(collection, add_if_missing=True)
+```
+
+%% Cell type:markdown id:8bdd77a2-7587-485e-afb7-42af3a822241 tags:
+
+### Add cube dimensions (required field for extension)
+
+%% Cell type:markdown id:e7dc357c-91ec-49ae-83e5-400f791f9792 tags:
+
+#### user review needed
+#### compare crs information to the projjson to make sure it looks correct
+
+%% Cell type:code id:ea452f62-5644-49b6-8a4e-7dc4f649fd1a tags:
+
+``` python
+# print out crs information in dataset
+crs
+```
+
+%% Cell type:code id:1b1d05ff-8e43-44a7-8343-178b112c4ad6 tags:
+
+``` python
+# # the datacube extension can accept reference_system information as a numerical EPSG code,
+# # WKT2 (ISO 19162) string or PROJJSON object.
+# # we will use a projjson, as was done by Microsoft Planetary Computer here:
+# # https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
+# # https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
+# projjson = json.loads(lcc.to_json())
+
+# alternatively, I think we could do this:
+projjson = crs.to_json()
+print(crs.to_json(pretty=True))
+```
+
+%% Cell type:markdown id:b6b88ee9-60c2-4d91-af74-c1c56b094826 tags:
+
+#### user review needed
+#### look at the spatial and temporal steps, make sure they are all successfully pulled and they look correct
+
+%% Cell type:markdown id:9e2bbcc5-e45a-4b8c-9d60-601f345e8134 tags:
+
+**Time**
+
+If you need to manually construct this field, here is a helpful reference: https://en.wikipedia.org/wiki/ISO_8601#Durations
+
+Manually constructing 1 year time step instead of the auto-extracted 365 day time step.
+
+%% Cell type:code id:82f1e9bd-52ee-46f5-9e95-c2359d95fcf3 tags:
+
+``` python
+#time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()
+# if time is yearly or monthly, you will need to manually construct it:
+time_step = "P1Y0M0DT0H0M0S"
+print(f'time step: {time_step}')
+```
+
+%% Cell type:code id:64be65b2-de20-447a-a9c2-bd8eca3e440e tags:
+
+``` python
+# # optional debugging for time steps:
+# # check all step sizes (step_list), get number of occurences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
+# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
+# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=4)
+```
+
+%% Cell type:code id:bc8dff39-2a2e-44a0-9b30-987107c2d1e2 tags:
+
+``` python
+# # debugging for time steps, cont:
+# # please choose one of the index locations printed above
+# # this will print the time steps adjacent to it
+# ix = 11
+# ds.isel(time=slice(ix-1,ix+3)).time
+```
+
+%% Cell type:markdown id:9aa6c8ff-8d9b-40a7-a281-39b502bd5a3d tags:
+
+**X/lon**
+
+%% Cell type:code id:a8ba7695-ca45-4db2-bd46-c465f4e37eff tags:
+
+``` python
+x_step = stac_helpers.get_step(ds, dim_names_dict['X'])
+# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation
+# use the debugging cells below to identify if this is the issue, if so, use the round_dec argument to round to a higher decimal place:
+#x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=13)
+print(f'x step: {x_step}')
+```
+
+%% Cell type:code id:fac4c9f2-a952-4c7f-aa32-862957372d6f tags:
+
+``` python
+# # optional debugging for spatial steps:
+# # check all step sizes (step_list), get number of occurences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
+# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
+# x_dim=dim_names_dict['X']
+# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=0)
+# print(f'\nx dim name (for next cell): {x_dim}')
+```
+
+%% Cell type:code id:8d0b5a2d-dc58-4ad6-b890-859ce6bb08de tags:
+
+``` python
+# # debugging for spatial steps, cont:
+# # please choose one of the index locations printed above
+# # this will print the time steps adjacent to it
+# ix = 5
+# ds.isel(x=slice(ix-1,ix+3)).x
+```
+
+%% Cell type:markdown id:21b5cca4-8bb4-498d-ae6b-6b8545fffe56 tags:
+
+**Y/lat**
+
+%% Cell type:code id:7405583b-ecb9-44b0-8815-048e42e55a42 tags:
+
+``` python
+y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])
+# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation
+# use the debugging cells below to identify if this is the issue, if so, use the round_dec argument to round to a higher decimal place:
+#y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=12)
+print(f'y step: {y_step}')
+```
+
+%% Cell type:code id:ece0fe37-b54c-4721-aa9b-33d2998d191b tags:
+
+``` python
+# # optional debugging for spatial steps:
+# # check all step sizes (step_list), get number of occurences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
+# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
+# y_dim=dim_names_dict['Y']
+# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=0)
+# print(f'\nx dim name (for next cell): {x_dim}')
+```
+
+%% Cell type:code id:abdafb8f-5217-4b82-91b6-eec8183c9128 tags:
+
+``` python
+# # debugging for spatial steps, cont:
+# # please choose one of the index locations printed above
+# # this will print the time steps adjacent to it
+# ix = 5
+# ds.isel(y=slice(ix-1,ix+3)).y
+```
+
+%% Cell type:markdown id:00a5e041-081d-428d-ac2e-75d16de205e6 tags:
+
+#### user input needed
+#### you will need to copy all of the dimensions printed below into the dict and fill in the appropriate attributes (type, axis, extent, etc.):
+
+Please see [datacube spec](https://github.com/stac-extensions/datacube?tab=readme-ov-file#dimension-object) for details on required fields.
+
+If you have a dimension like "bnds" or "nv" that is used on variables like time_bnds, lon_bnds, lat_bnds to choose either the lower or upper bound, you can use and [additional dimension object](https://github.com/stac-extensions/datacube?tab=readme-ov-file#additional-dimension-object). We recommend making the type "count" as Microsoft Planetary Computer did [here](https://github.com/stac-extensions/datacube/blob/9e74fa706c9bdd971e01739cf18dcc53bdd3dd4f/examples/daymet-hi-annual.json#L76).
+
+Here is an example:
+
+```
+dims_dict = {
+            'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]})
+            }
+```
+
+%% Cell type:code id:acd45d3c-7845-47e6-9b7d-e35627a7ca9a tags:
+
+``` python
+dims = list(ds.dims)
+print(dims)
+```
+
+%% Cell type:code id:5a443497-67a9-4dce-a8e9-b08d31a88223 tags:
+
+``` python
+# create a dictionary of datacube dimensions you would like to assign to this dataset
+# dimension name should come from the dims printed in above cell
+
+# x, y, t dimension info is pulled out automatically using the dim dict we created above
+# all other dims listed in above cell need to be manually written in
+
+# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)
+# note that the extent of each dimension should be pulled from the dataset
+dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),
+             dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [spatial_bounds[0], spatial_bounds[2]], 'step': x_step, 'reference_system': projjson}),
+             dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [spatial_bounds[1], spatial_bounds[3]], 'step': y_step, 'reference_system': projjson}),
+             'z': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'z'), 'extent': [ds.z.min().item(), ds.z.max().item()]}),
+            }
+```
+
+%% Cell type:code id:8ab85b09-eb38-404c-910c-13349d5e2234 tags:
+
+``` python
+# make sure you added all the right dims
+assert sorted(list(dims_dict.keys())) == sorted(dims)
+```
+
+%% Cell type:markdown id:0f277883-a3fd-425f-966a-ca2140d0ef2f tags:
+
+### Add cube variables (optional field for extension)
+
+%% Cell type:code id:e9272931-fc0b-4f2a-9546-283033e9cde8 tags:
+
+``` python
+# drop metpy_crs coordinate we have added
+if 'metpy_crs' in ds.coords:
+    ds = ds.drop_vars('metpy_crs')
+
+# pull list of vars from dataset
+vars = list(ds.variables)
+
+# spec says that the keys of cube:dimensions and cube:variables should be unique together; a key like lat should not be both a dimension and a variable.
+# we will drop all values in dims from vars
+vars = [v for v in vars if v not in dims]
+
+# Microsoft Planetary Computer includes coordinates and crs as variables here:
+# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
+# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
+# we will keep those in the var list
+
+# create dictionary of dataset variables and associated dimensions
+vars_dict={}
+for v in vars:
+    unit = stac_helpers.get_unit(ds, v)
+    var_type = stac_helpers.get_var_type(ds, v, crs_var)
+    long_name = stac_helpers.get_long_name(ds, v)
+    vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})
+```
+
+%% Cell type:markdown id:11ad5352-884c-4472-8864-4570a96f66e5 tags:
+
+### Finalize extension
+
+%% Cell type:code id:10141fd4-91d6-491d-878b-02653720891d tags:
+
+``` python
+# add dimesions and variables to collection extension
+dc.apply(dimensions=dims_dict, variables=vars_dict)
+```
+
+%% Cell type:markdown id:615ca168-75fb-4135-9941-0ef5fe4fd1cb tags:
+
+## Add STAC Collection to Catalog and Save
+
+%% Cell type:code id:e2120a55-3d04-4122-a93f-29afcdb8cb1b tags:
+
+``` python
+# # helper to find items of wrong type
+# d = collection.to_dict()
+# print(*stac_helpers.find_paths(d))
+```
+
+%% Cell type:code id:4b75791b-6b2d-40be-b7c6-330a60888fb5 tags:
+
+``` python
+if parent_collection.get_child(collection_id):
+    collection.normalize_and_save(root_href=os.path.join(catalog_path, parent_collection_id, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)
+else:
+    parent_collection.add_child(collection)
+    parent_collection.normalize_and_save(root_href=catalog_path+'/'+parent_collection_id, catalog_type=pystac.CatalogType.SELF_CONTAINED)
+```
+
+%% Cell type:code id:d6f676b5-e892-4bfb-8d73-2828addd838c tags:
+
+``` python
+```
--- a/workflows/archive/AIEM_permafrost/5m_rcp45_G0.6SW_wMeta_create_collection_from_zarr.ipynb
+++ b/workflows/archive/AIEM_permafrost/5m_rcp45_G0.6SW_wMeta_create_collection_from_zarr.ipynb
--- a/workflows/archive/AIEM_permafrost/5m_rcp45_G1.2SW_S0.15m_wMeta_create_collection_from_zarr.ipynb
+++ b/workflows/archive/AIEM_permafrost/5m_rcp45_G1.2SW_S0.15m_wMeta_create_collection_from_zarr.ipynb
--- a/workflows/archive/AIEM_permafrost/5m_rcp45_G1.2SW_wMeta_create_collection_from_zarr.ipynb
+++ b/workflows/archive/AIEM_permafrost/5m_rcp45_G1.2SW_wMeta_create_collection_from_zarr.ipynb
--- a/workflows/archive/AIEM_permafrost/5m_rcp45_G1.8SW_S0.15m_wMeta_create_collection_from_zarr.ipynb
+++ b/workflows/archive/AIEM_permafrost/5m_rcp45_G1.8SW_S0.15m_wMeta_create_collection_from_zarr.ipynb
--- a/workflows/archive/AIEM_permafrost/5m_rcp45_G1.8SW_wMeta_create_collection_from_zarr.ipynb
+++ b/workflows/archive/AIEM_permafrost/5m_rcp45_G1.8SW_wMeta_create_collection_from_zarr.ipynb
--- a/workflows/archive/AIEM_permafrost/northslope_rcp85_metadata_create_collection_from_zarr.ipynb
+++ b/workflows/archive/AIEM_permafrost/northslope_rcp85_metadata_create_collection_from_zarr.ipynb
--- a/workflows/archive/AIEM_permafrost/selawik_rcp45_metadata_create_collection_from_zarr.ipynb
+++ b/workflows/archive/AIEM_permafrost/selawik_rcp45_metadata_create_collection_from_zarr.ipynb
--- a/workflows/archive/AIEM_permafrost/selawik_rcp85_metadata_create_collection_from_zarr.ipynb
+++ b/workflows/archive/AIEM_permafrost/selawik_rcp85_metadata_create_collection_from_zarr.ipynb