updates to CooperMcKenzie workflow

e688cdb7 · Snyder, Amelia Marie · 18cf6b39 · e688cdb7 · e688cdb7
Commit e688cdb7 authored 1 year ago by Snyder, Amelia Marie
--- a/catalog/CooperMcKenzie/collection.json
+++ b/catalog/CooperMcKenzie/collection.json
@@ -36,7 +36,7 @@
        489701.14541243,
        601701.1454124299
      ],
-      "step": null,
+      "step": 100.0,
      "reference_system": "{\"$schema\":\"https://proj.org/schemas/v0.5/projjson.schema.json\",\"type\":\"ProjectedCRS\",\"name\":\"NAD83 / UTM zone 10N\",\"base_crs\":{\"name\":\"NAD83\",\"datum\":{\"type\":\"GeodeticReferenceFrame\",\"name\":\"North American Datum 1983\",\"ellipsoid\":{\"name\":\"GRS 1980\",\"semi_major_axis\":6378137,\"inverse_flattening\":298.257222101}},\"coordinate_system\":{\"subtype\":\"ellipsoidal\",\"axis\":[{\"name\":\"Geodetic latitude\",\"abbreviation\":\"Lat\",\"direction\":\"north\",\"unit\":\"degree\"},{\"name\":\"Geodetic longitude\",\"abbreviation\":\"Lon\",\"direction\":\"east\",\"unit\":\"degree\"}]},\"id\":{\"authority\":\"EPSG\",\"code\":4269}},\"conversion\":{\"name\":\"UTM zone 10N\",\"method\":{\"name\":\"Transverse Mercator\",\"id\":{\"authority\":\"EPSG\",\"code\":9807}},\"parameters\":[{\"name\":\"Latitude of natural origin\",\"value\":0,\"unit\":\"degree\",\"id\":{\"authority\":\"EPSG\",\"code\":8801}},{\"name\":\"Longitude of natural origin\",\"value\":-123,\"unit\":\"degree\",\"id\":{\"authority\":\"EPSG\",\"code\":8802}},{\"name\":\"Scale factor at natural origin\",\"value\":0.9996,\"unit\":\"unity\",\"id\":{\"authority\":\"EPSG\",\"code\":8805}},{\"name\":\"False easting\",\"value\":500000,\"unit\":\"metre\",\"id\":{\"authority\":\"EPSG\",\"code\":8806}},{\"name\":\"False northing\",\"value\":0,\"unit\":\"metre\",\"id\":{\"authority\":\"EPSG\",\"code\":8807}}]},\"coordinate_system\":{\"subtype\":\"Cartesian\",\"axis\":[{\"name\":\"Easting\",\"abbreviation\":\"E\",\"direction\":\"east\",\"unit\":\"metre\"},{\"name\":\"Northing\",\"abbreviation\":\"N\",\"direction\":\"north\",\"unit\":\"metre\"}]},\"scope\":\"Engineering survey, topographic mapping.\",\"area\":\"North America - between 126°W and 120°W - onshore and offshore. Canada - British Columbia; Northwest Territories; Yukon. 
United States (USA) - California; Oregon; Washington.\",\"bbox\":{\"south_latitude\":30.54,\"west_longitude\":-126,\"north_latitude\":81.8,\"east_longitude\":-119.99},\"id\":{\"authority\":\"EPSG\",\"code\":26910}}"
    },
    "UTM_Meters_North": {

--- a/workflows/archive/CooperMcKenzie_create_collection_from_zarr.ipynb
+++ b/workflows/archive/CooperMcKenzie_create_collection_from_zarr.ipynb
@@ -560,8 +560,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# # debugging for time steps: get all step values and locations\n",
-    "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)"
+    "# # optional debugging for time steps:\n",
+    "# # check all step sizes (step_list), get number of occurences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed\n",
+    "# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred\n",
+    "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=0)"
   ]
  },
  {
@@ -583,7 +585,9 @@
   "id": "57f9d11a-530f-4069-a21b-e7512c31b7c1",
   "metadata": {},
   "source": [
-    "**X/lon**"
+    "**X/lon**\n",
+    "\n",
+    "**rounding error in spatial steps**: need to round to 9th decimal to take care of rounding error that comes up in calculating spatial steps"
   ]
  },
  {
@@ -593,7 +597,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n",
+    "#x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n",
+    "# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation\n",
+    "# use the debugging cells below to identify if this is the issue, if so, use the round_dec argument to round to a higher decimal place:\n",
+    "x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=9)\n",
    "print(f'x step: {x_step}')"
   ]
  },
@@ -604,9 +611,11 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# # debugging for spatial steps: get all step values and locations\n",
+    "# # optional debugging for spatial steps:\n",
+    "# # check all step sizes (step_list), get number of occurences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed\n",
+    "# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred\n",
    "# x_dim=dim_names_dict['X']\n",
-    "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n",
+    "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=0)\n",
    "# print(f'\\nx dim name (for next cell): {x_dim}')"
   ]
  },
@@ -640,6 +649,9 @@
   "outputs": [],
   "source": [
    "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n",
+    "# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation\n",
+    "# use the debugging cells below to identify if this is the issue, if so, use the round_dec argument to round to a higher decimal place:\n",
+    "#y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=13)\n",
    "print(f'y step: {y_step}')"
   ]
  },
@@ -650,9 +662,11 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# # debugging for spatial steps: get all step values and locations\n",
+    "# # optional debugging for spatial steps:\n",
+    "# # check all step sizes (step_list), get number of occurences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed\n",
+    "# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred\n",
    "# y_dim=dim_names_dict['Y']\n",
-    "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n",
+    "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=0)\n",
    "# print(f'\\nx dim name (for next cell): {x_dim}')"
   ]
  },

 %% Cell type:markdown id:6c10e07b-1e60-4926-af1d-fa75dc78e5d4 tags:

 # CooperMcKenzie Zarr -> Collection Workflow
 This is a workflow to build [STAC collections](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md) from the zarr assets for the dataset named above. We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.

 To simplify this workflow so that it can scale to many datasets, a few simplifying suggestions and assumptions are made:
 1. For USGS data, we can use the CC0-1.0 license. For all other data we can use Unlicense. Ref: https://spdx.org/licenses/
 2. I am assuming all coordinates are from the WGS84 datum if not specified.

 %% Cell type:code id:201e0945-de55-45ff-b095-c2af009a4e62 tags:

 ``` python
 import pystac
 from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension
 import xarray as xr
 import cf_xarray
 import os
 import fsspec
 import cf_xarray
 import hvplot.xarray
 import pandas as pd
 import json
 import numpy as np
 import pyproj
 from pyproj import Transformer
 import cartopy.crs as ccrs
 import cfunits
 import json
 import sys
 sys.path.insert(1, '..')
 import stac_helpers
 ```

 %% Cell type:markdown id:8cfb2138-2a0f-4e8f-a58f-d3a79eb4bdf1 tags:

 ## Collection ID

 %% Cell type:code id:91bd4f53-e74e-4f4b-99b4-130d5d7472fd tags:

 ``` python
 # name for STAC collection
 # this id is also interpolated into the zarr store URLs and used to look up / save the collection in the catalog
 collection_id = 'CooperMcKenzie'
 ```

 %% Cell type:markdown id:116b5837-8e85-4ae7-964a-803533ded714 tags:

 ## Asset Metadata Input

 %% Cell type:code id:dd6fa323-132a-4794-8c80-576933f547a0 tags:

 ``` python
 # S3 location of the zarr store this collection is built for
 zarr_url = f's3://mdmf/gdp/{collection_id}.zarr/'

 # extra asset fields telling users how to open the store with xarray
 # ref: https://github.com/stac-extensions/xarray-assets
 xarray_opendataset_kwargs = {
     "xarray:open_kwargs": {
         "chunks": {},
         "engine": "zarr",
         "consolidated": True,
     },
     "xarray:storage_options": {
         "anon": True,
         "client_kwargs": {"endpoint_url": "https://usgs.osn.mghpcc.org/"},
     },
 }
 # human-readable description attached to the zarr_url asset
 asset_description = "Open Storage Network Pod S3 API access to collection zarr group"
 # roles the zarr_url asset is tagged with
 asset_roles = ["data", "zarr", "s3"]
 ```

 %% Cell type:code id:e1441cd4-e94c-4902-af46-8f1af470eb6b tags:

 ``` python
 # S3 location of the second copy of the zarr store for this collection
 zarr_url2 = f's3://nhgf-development/workspace/DataConversion/{collection_id}.zarr/'

 # extra asset fields telling users how to open this store with xarray
 # ref: https://github.com/stac-extensions/xarray-assets
 xarray_opendataset_kwargs2 = {
     "xarray:open_kwargs": {
         "chunks": {},
         "engine": "zarr",
         "consolidated": True,
     },
     "xarray:storage_options": {"requester_pays": True},
 }
 # human-readable description attached to the zarr_url2 asset
 asset_description2 = "S3 access to collection zarr group"
 # roles the zarr_url2 asset is tagged with
 asset_roles2 = ["data", "zarr", "s3"]
 ```

 %% Cell type:markdown id:b213b74f-ad17-4774-93b6-3b62be616b45 tags:

 ## Data Exploration

 %% Cell type:code id:708f2cf5-79ab-49af-8067-de31d0d13ee6 tags:

 ``` python
 # open the zarr store anonymously through the OSN endpoint, then display the dataset
 fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
 zarr_store = fs2.get_mapper(zarr_url)
 ds = xr.open_dataset(
     zarr_store,
     engine='zarr',
     backend_kwargs={'consolidated': True},
     chunks={},
 )
 ds
 ```

 %% Cell type:markdown id:996e60ba-13e4-453a-8534-e62ce747f0fa tags:

 ## Collection Metadata Input

 %% Cell type:code id:482d204d-b5b6-40e5-ac42-55b459be1097 tags:

 ``` python
 # description of STAC collection
 # taken from the dataset's global 'title' attribute (raises KeyError if the dataset has no such attribute)
 collection_description = ds.attrs['title']
 print(f'collection description: {collection_description}')
 ```

 %% Cell type:code id:b2bbf33b-0b20-4375-9cd9-dc4b66549707 tags:

 ``` python
 # license for dataset
 # the dataset's global 'license' attribute is passed through stac_helpers.license_picker
 # NOTE(review): presumably this maps the free-text license to an SPDX identifier - confirm in stac_helpers
 collection_license = stac_helpers.license_picker(ds.attrs['license'])
 ```

 %% Cell type:markdown id:0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc tags:

 ## Identify x, y, t dimensions of dataset
 May require user input if dimensions cannot be auto-detected.

 %% Cell type:code id:ab91268f-7200-4cb1-979a-c7d75531d2c0 tags:

 ``` python
 # look up the dataset's dimension name for each of the X, Y and T axes
 dims_auto_extract = ['X', 'Y', 'T']
 dim_names_dict = {axis: stac_helpers.extract_dim(ds, axis) for axis in dims_auto_extract}
 print(f"Dimension dictionary: {dim_names_dict}")
 ```

 %% Cell type:markdown id:810d7480-165d-41c0-bd09-163656a14003 tags:

 ## Get crs info

 %% Cell type:code id:b03d52f3-1367-4255-a561-52ee4fc9e92d tags:

 ``` python
 # build a pyproj CRS from the attributes stored on the dataset's `crs` variable
 crs = pyproj.CRS.from_cf(ds.crs.attrs)
 ```

 %% Cell type:markdown id:d16521ad-78a7-4df1-8ebb-6995846a2ad5 tags:

 ### Compare dataset crs var to generated proj4 string to make sure it looks ok

 %% Cell type:code id:38492a33-861e-46a3-8f0f-ceb7b2b5f42a tags:

 ``` python
 # display the dataset's crs variable and its attributes
 ds.crs
 ```

 %% Cell type:code id:4ba255b8-b76b-4f8d-8907-cc5589221e66 tags:

 ``` python
 # display the proj4 string generated from the pyproj CRS for comparison with the crs variable shown above
 crs.to_proj4()
 ```

 %% Cell type:markdown id:a8c3ed37-8564-400b-a7fb-25bd5e43d21c tags:

 ## Create Collection Extent

 %% Cell type:markdown id:69f0d837-68a5-4fed-9a14-5d75cfbb0da4 tags:

 ### Spatial Extent

 %% Cell type:code id:d46805e0-8e94-4ebe-aa01-d9a2d7051459 tags:

 ``` python
 # pull out the bounding box of the data from the dataset's x/y coordinates
 # order: left, bottom, right, top
 # NOTE(review): the original note says coordinates must be from the WGS 84 datum - these values are read
 # straight from the dataset coordinates, so confirm that requirement is met

 # Note: if you get a numpy float rather than type float, try one of the commented out alternatives below
 #spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float), ds[dim_names_dict['Y']].data.min().compute().astype(float), ds[dim_names_dict['X']].data.max().compute().astype(float), ds[dim_names_dict['Y']].data.max().compute().astype(float)]
 #spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['X']].data.max().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.max().compute().astype(float).tolist()]
 x_values = ds[dim_names_dict['X']].data
 y_values = ds[dim_names_dict['Y']].data
 spatial_bounds = [
     x_values.min().astype(float).item(),
     y_values.min().astype(float).item(),
     x_values.max().astype(float).item(),
     y_values.max().astype(float).item(),
 ]
 print(spatial_bounds)
 print(f'\nspatial_bounds data type: {type(spatial_bounds[0])}')
 ```

 %% Cell type:code id:c80c650f-6dc6-4e2d-869c-0e674f528520 tags:

 ``` python
 # build 2D grids holding the x and y coordinate of every grid point
 XX, YY = np.meshgrid(ds[dim_names_dict['X']].data, ds[dim_names_dict['Y']].data)
 ```

 %% Cell type:code id:97fb5d51-ee5c-46d9-b256-6a5a51a42495 tags:

 ``` python
 # transformer from the dataset's crs to EPSG:4326; always_xy=True requests x/y (lon/lat) axis order
 transformer = Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
 # reproject every grid point (flattened to 1D) to lon/lat
 lon, lat = transformer.transform(XX.ravel(), YY.ravel())
 ```

 %% Cell type:code id:4ae8c4e5-9069-4972-bcf6-0bb824ea83a3 tags:

 ``` python
 # report the WGS84 bounding box corners of the reprojected grid
 # use the vectorized ndarray min/max: the builtin min()/max() walk the flattened grid one element at a time in Python
 # (assumes lon/lat are the numpy arrays returned by transformer.transform for array input)
 print(f'lower left coordinates (WGS84): {lon.min()}, {lat.min()}')
 print(f'upper right coordinates (WGS84): {lon.max()}, {lat.max()}')
 ```

 %% Cell type:code id:6a054422-6b84-44bd-b102-ac293745cda8 tags:

 ``` python
 # create a spatial extent object from the WGS84 bounding box [min lon, min lat, max lon, max lat]
 # use the vectorized ndarray min/max: the builtin min()/max() walk the flattened grid one element at a time in Python
 # (assumes lon/lat are the numpy arrays returned by transformer.transform for array input)
 # .item() converts the numpy scalars to plain python floats
 spatial_extent = pystac.SpatialExtent(bboxes=[[lon.min().item(), lat.min().item(), lon.max().item(), lat.max().item()]])
 ```

 %% Cell type:markdown id:a04c8fca-1d33-43ac-9e2b-62d7be2887f7 tags:

 ### Temporal Extent

 %% Cell type:code id:41a84995-867c-4152-8c57-85e3758bbb77 tags:

 ``` python
 # pull out first and last timestamps
 # the time index is converted with to_datetimeindex() because the direct form below can fail with:
 # Cannot convert input [] of type <class 'cftime._cftime.DatetimeNoLeap'> to Timestamp
 # direct form, kept for reference:
 #temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())
 #temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())
 time_index = ds.indexes[dim_names_dict['T']].to_datetimeindex()
 temporal_extent_lower = pd.Timestamp(time_index.min())
 temporal_extent_upper = pd.Timestamp(time_index.max())

 print(f'min: {temporal_extent_lower} \nmax: {temporal_extent_upper}')
 # create a temporal extent object
 temporal_interval = [temporal_extent_lower, temporal_extent_upper]
 temporal_extent = pystac.TemporalExtent(intervals=[temporal_interval])
 ```

 %% Cell type:code id:1b1e37c4-5348-46ad-abc9-e005b5d6c02b tags:

 ``` python
 # combine the spatial and temporal extents into the extent object the collection requires
 collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)
 ```

 %% Cell type:markdown id:20b00e88-5a13-46b3-9787-d9ac2d4e7bd6 tags:

 ## Open up NHGF STAC Catalog and create a collection

 %% Cell type:code id:adf6c59d-58cd-48b1-a5fd-3bb205a3ef56 tags:

 ``` python
 # folder that holds the STAC catalog json file, relative to this notebook
 catalog_path = os.path.join('..', '..', 'catalog')
 # load the root catalog
 catalog_file = os.path.join(catalog_path, 'catalog.json')
 catalog = pystac.Catalog.from_file(catalog_file)
 ```

 %% Cell type:code id:7e96811b-95ae-406a-9728-55fc429d4e1f tags:

 ``` python
 # reuse the collection if the catalog already has a child with this id, refreshing its
 # extent, description and license; otherwise start a new collection
 existing_collection = catalog.get_child(collection_id)
 if existing_collection:
     collection = existing_collection
     print("existing collection opened")
     collection.extent = collection_extent
     collection.description = collection_description
     collection.license = collection_license
 else:
     collection = pystac.Collection(
         id=collection_id,
         description=collection_description,
         extent=collection_extent,
         license=collection_license,
     )
     print("new collection created")

 %% Cell type:markdown id:a21c76e8-cd57-4eb5-a33f-7c668a3b3205 tags:

 ## Add zarr url asset to collection

 %% Cell type:code id:094832af-d22b-4359-b0f6-cf687acce5cc tags:

 ``` python
 # attach the OSN-hosted zarr store to the collection as an asset
 asset_id = "zarr-s3-osn"
 asset = pystac.Asset(
     href=zarr_url,
     media_type="application/vnd+zarr",
     roles=asset_roles,
     description=asset_description,
     extra_fields=xarray_opendataset_kwargs,
 )
 collection.add_asset(asset_id, asset)
 ```

 %% Cell type:code id:0c298d07-f234-4a08-986d-87f4a39e9ae6 tags:

 ``` python
 # attach the second zarr store (zarr_url2) to the collection as an asset
 asset_id2 = "zarr-s3"
 asset2 = pystac.Asset(
     href=zarr_url2,
     media_type="application/vnd+zarr",
     roles=asset_roles2,
     description=asset_description2,
     extra_fields=xarray_opendataset_kwargs2,
 )
 collection.add_asset(asset_id2, asset2)
 ```

 %% Cell type:markdown id:f67cd5c9-db33-45c2-bc21-480cd67354f4 tags:

 ## Add datacube extension to collection

 %% Cell type:code id:fc00946d-2880-491d-9b3b-3aeeb4414d6c tags:

 ``` python
 # instantiate the datacube extension on the collection
 # add_if_missing=True is passed so the extension is added when the collection does not already declare it
 dc = DatacubeExtension.ext(collection, add_if_missing=True)
 ```

 %% Cell type:markdown id:8bdd77a2-7587-485e-afb7-42af3a822241 tags:

 ### Add cube dimensions (required field for extension)

 %% Cell type:code id:120a4914-3302-44a5-a282-0308ac84f040 tags:

 ``` python
 # list out dataset dimension names
 # NOTE(review): the sentence below appears to be quoted from the xarray Zarr encoding documentation, where
 # "this attribute" is presumably `_ARRAY_DIMENSIONS` - confirm:
 # When writing data to Zarr, Xarray sets this attribute on all variables based on the variable dimensions. When reading a Zarr group, Xarray looks for this attribute on all arrays,
 # raising an error if it can’t be found.
 dims = list(ds.dims)
 print(dims)
 ```

 %% Cell type:markdown id:e7dc357c-91ec-49ae-83e5-400f791f9792 tags:

 #### user review needed
 #### compare crs information to the projjson to make sure it looks correct

 %% Cell type:code id:ea452f62-5644-49b6-8a4e-7dc4f649fd1a tags:

 ``` python
 # display the pyproj CRS so it can be compared with the projjson printed in the next cell
 crs
 ```

 %% Cell type:code id:1b1d05ff-8e43-44a7-8343-178b112c4ad6 tags:

 ``` python
 # the datacube extension can accept reference_system information as a numerical EPSG code,
 # WKT2 (ISO 19162) string or PROJJSON object.
 # we will use a PROJJSON object, as was done by Microsoft Planetary Computer here:
 # https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
 # https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na

 # crs.to_json() returns a JSON-encoded *string*; storing that string writes reference_system
 # into the collection as escaped text instead of a PROJJSON object, so use the dict form here
 projjson = crs.to_json_dict()
 # pretty-print the same PROJJSON for user review
 print(crs.to_json(pretty=True))
 ```

 %% Cell type:markdown id:39ffb2bc-afe8-4c67-b385-cd98251d5d4b tags:

 #### user review needed - look at the steps pulled out and make sure they make sense

 %% Cell type:markdown id:3d8f421e-302c-4020-8fe3-4cea3ee53143 tags:

 **Time**

 %% Cell type:code id:13967ca9-1920-40f8-81cf-639d63439d71 tags:

 ``` python
 # detect the step of the time dimension and express it as an ISO 8601 duration string
 raw_time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)
 time_step = pd.Timedelta(raw_time_step).isoformat()
 print(f'time step: {time_step}')
 ```

 %% Cell type:code id:8640779d-5131-4973-aad9-455d4acffd7a tags:

 ``` python
-# # debugging for time steps: get all step values and locations
-# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)
+# # optional debugging for time steps:
+# # check all step sizes (step_list), get number of occurrences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
+# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
+# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=0)
 ```

 %% Cell type:code id:de0efee3-af6e-4451-87a1-264503e24948 tags:

 ``` python
 # # debugging for time steps, cont:
 # # please choose one of the index locations printed above
 # # this will print the time steps adjacent to it
 # ix = 3343
 # ds.isel(time=slice(ix-1,ix+3)).time
 ```

 %% Cell type:markdown id:57f9d11a-530f-4069-a21b-e7512c31b7c1 tags:

 **X/lon**

+**rounding error in spatial steps**: need to round to 9th decimal to take care of rounding error that comes up in calculating spatial steps
+
 %% Cell type:code id:73306fb1-9b51-42d0-86fa-8d5e72644ee1 tags:

 ``` python
-x_step = stac_helpers.get_step(ds, dim_names_dict['X'])
+#x_step = stac_helpers.get_step(ds, dim_names_dict['X'])
+# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation
+# use the debugging cells below to identify if this is the issue, if so, use the round_dec argument to round to a higher decimal place:
+x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=9)
 print(f'x step: {x_step}')
 ```

 %% Cell type:code id:2afe1dd0-ceb0-4a25-856e-38aefdbecd0c tags:

 ``` python
-# # debugging for spatial steps: get all step values and locations
+# # optional debugging for spatial steps:
+# # check all step sizes (step_list), get number of occurrences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
+# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
 # x_dim=dim_names_dict['X']
-# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)
+# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=0)
 # print(f'\nx dim name (for next cell): {x_dim}')
 ```

 %% Cell type:code id:ba997858-c048-4047-b03b-fc011a0d93b1 tags:

 ``` python
 # # debugging for spatial steps, cont:
 # # please choose one of the index locations printed above
 # # this will print the coordinate values adjacent to it
 # ix = 5
 # ds.isel(x=slice(ix-1,ix+3)).x
 ```

 %% Cell type:markdown id:27cd6e0f-5289-4672-8f92-092dcd133817 tags:

 **Y/lat**

 %% Cell type:code id:c046328f-4953-4df6-ac0d-2bab43f86888 tags:

 ``` python
 y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])
+# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation
+# use the debugging cells below to identify if this is the issue, if so, use the round_dec argument to round to a higher decimal place:
+#y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=13)
 print(f'y step: {y_step}')
 ```

 %% Cell type:code id:6bd290eb-fdb3-4e2e-b9e8-4ebcfa3f608b tags:

 ``` python
-# # debugging for spatial steps: get all step values and locations
+# # optional debugging for spatial steps:
+# # check all step sizes (step_list), get number of occurrences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
+# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
 # y_dim=dim_names_dict['Y']
-# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)
+# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=0)
 # print(f'\ny dim name (for next cell): {y_dim}')
 ```

 %% Cell type:code id:e27871aa-d33f-4af2-9f4e-96a2e54f03ca tags:

 ``` python
 # # debugging for spatial steps, cont:
 # # please choose one of the index locations printed above
 # # this will print the coordinate values adjacent to it
 # ix = 5
 # ds.isel(y=slice(ix-1,ix+3)).y
 ```

 %% Cell type:markdown id:ca0df0c7-27d0-468c-a615-b0f5a9a429a4 tags:

 #### extract x, y dimension lower and upper bounds

 %% Cell type:code id:6cfd212f-9309-44a8-845a-b2b6f536a598 tags:

 ``` python
 # get x, y bounds for extent of those dimensions (required)
 # order: x min, y min, x max, y max, each converted to a plain python float
 x_coord = ds[dim_names_dict['X']].data
 y_coord = ds[dim_names_dict['Y']].data
 xy_bounds = [
     x_coord.min().astype(float).item(),
     y_coord.min().astype(float).item(),
     x_coord.max().astype(float).item(),
     y_coord.max().astype(float).item(),
 ]
 print(xy_bounds)
 ```

 %% Cell type:markdown id:00a5e041-081d-428d-ac2e-75d16de205e6 tags:

 #### user input needed - you will need to copy all of the dimensions printed below into the dict and fill in the appropriate attributes (type, axis, extent, etc.):

 Please see [datacube spec](https://github.com/stac-extensions/datacube?tab=readme-ov-file#dimension-object) for details on required fields.

 If you have a dimension like "bnds" that is used on variables like time_bnds, lon_bnds, lat_bnds to choose either the lower or upper bound, you can use an [additional dimension object](https://github.com/stac-extensions/datacube?tab=readme-ov-file#additional-dimension-object). We recommend making the type "count" as Microsoft Planetary Computer did [here](https://github.com/stac-extensions/datacube/blob/9e74fa706c9bdd971e01739cf18dcc53bdd3dd4f/examples/daymet-hi-annual.json#L76).

 %% Cell type:code id:acd45d3c-7845-47e6-9b7d-e35627a7ca9a tags:

 ``` python
 # print the dataset dimension names again for reference while filling in the dict in the next cell
 print(dims)
 ```

 %% Cell type:code id:5a443497-67a9-4dce-a8e9-b08d31a88223 tags:

 ``` python
 # create a dictionary of datacube dimensions you would like to assign to this dataset
 # dimension name should come from the coordinates printed above
 # we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)
 # note that the extent of each dimension should be pulled from the dataset

 # timestamp format for the temporal extent
 # the time is spelled out as %H:%M:%S because the %X directive is locale-dependent and is not
 # guaranteed to produce HH:MM:SS outside the C locale
 # NOTE(review): the trailing Z labels the timestamps as UTC - confirm the dataset times are UTC
 time_format = '%Y-%m-%dT%H:%M:%SZ'
 dims_dict = {
     dim_names_dict['T']: pystac.extensions.datacube.Dimension({
         'type': 'temporal',
         'description': stac_helpers.get_long_name(ds, dim_names_dict['T']),
         'extent': [temporal_extent_lower.strftime(time_format), temporal_extent_upper.strftime(time_format)],
         'step': time_step,
     }),
     dim_names_dict['X']: pystac.extensions.datacube.Dimension({
         'type': 'spatial',
         'axis': 'x',
         'description': stac_helpers.get_long_name(ds, dim_names_dict['X']),
         'extent': [xy_bounds[0], xy_bounds[2]],
         'step': x_step,
         'reference_system': projjson,
     }),
     dim_names_dict['Y']: pystac.extensions.datacube.Dimension({
         'type': 'spatial',
         'axis': 'y',
         'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']),
         'extent': [xy_bounds[1], xy_bounds[3]],
         'step': y_step,
         'reference_system': projjson,
     }),
 }
 ```

 %% Cell type:markdown id:0f277883-a3fd-425f-966a-ca2140d0ef2f tags:

 ### Add cube variables (optional field for extension)

 %% Cell type:code id:e9272931-fc0b-4f2a-9546-283033e9cde8 tags:

 ``` python
 # drop metpy_crs coordinate we have added
 if 'metpy_crs' in ds.coords:
     ds = ds.drop_vars('metpy_crs')

 # pull the variable names from the dataset
 # spec says that the keys of cube:dimensions and cube:variables should be unique together; a key like lat should not be both a dimension and a variable.
 # so every name that appears in dims is left out
 # (the list is called var_names rather than vars so the builtin vars() is not shadowed for the rest of the notebook)
 var_names = [v for v in ds.variables if v not in dims]

 # Microsoft Planetary Computer includes coordinates and crs as variables here:
 # https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
 # https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
 # we will keep those in the var list

 # create dictionary of dataset variables and associated dimensions
 vars_dict = {}
 for v in var_names:
     unit = stac_helpers.get_unit(ds, v)
     var_type = stac_helpers.get_var_type(ds, v)
     long_name = stac_helpers.get_long_name(ds, v)
     vars_dict[v] = pystac.extensions.datacube.Variable({
         'dimensions': list(ds[v].dims),
         'type': var_type,
         'description': long_name,
         'unit': unit,
     })
 ```

 %% Cell type:markdown id:11ad5352-884c-4472-8864-4570a96f66e5 tags:

 ### Finalize extension

 %% Cell type:code id:10141fd4-91d6-491d-878b-02653720891d tags:

 ``` python
 # add dimensions and variables to the collection's datacube extension
 dc.apply(dimensions=dims_dict, variables=vars_dict)
 ```

 %% Cell type:markdown id:615ca168-75fb-4135-9941-0ef5fe4fd1cb tags:

 ## Add STAC Collection to Catalog and Save

 %% Cell type:code id:e2120a55-3d04-4122-a93f-29afcdb8cb1b tags:

 ``` python
 # # helper to find items of wrong type
 # d = collection.to_dict()
 # print(*stac_helpers.find_paths(d))
 ```

 %% Cell type:code id:4b75791b-6b2d-40be-b7c6-330a60888fb5 tags:

 ``` python
 # save: a collection that is not yet a child of the catalog is attached first and the whole catalog is saved;
 # a collection that is already a child is saved on its own into its folder under the catalog path
 if not catalog.get_child(collection_id):
     catalog.add_child(collection)
     catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)
 else:
     collection_root = os.path.join(catalog_path, collection_id)
     collection.normalize_and_save(root_href=collection_root, catalog_type=pystac.CatalogType.SELF_CONTAINED)
 ```

 %% Cell type:code id:d6f676b5-e892-4bfb-8d73-2828addd838c tags:

 ``` python
 ```