Commit 9108150c authored by Snyder, Amelia Marie

fix crs var detection

parent dc6a32b4
1 merge request: !62 fix crs var detection
......
@@ -62,7 +62,7 @@
   "cube:variables": {
     "crs": {
       "dimensions": [],
-      "type": "data",
+      "type": "auxiliary",
       "description": null,
       "unit": null
     },
......
%% Cell type:markdown id:6c10e07b-1e60-4926-af1d-fa75dc78e5d4 tags:
# C-PrEP Zarr -> Collection Workflow
## In progress - figuring out how to label bnds dim of dataset
This is a workflow to build [STAC collections](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md) from the zarr assets for the dataset named above. We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.
To keep this workflow simple enough to scale to many datasets, a few simplifying assumptions are made:
1. For USGS data, we can use the CC0-1.0 license. For all other data, we can use Unlicense. Ref: https://spdx.org/licenses/ (see the license helper sketch after this list)
2. We assume all coordinates are from the WGS84 datum if not specified.
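`stac_helpers.license_picker` is used later in this notebook to map a dataset's `license` attribute onto one of these SPDX identifiers. The real helper lives in `../stac_helpers.py` and is not shown here; a minimal sketch of the idea, assuming USGS data can be identified from its license string:

``` python
# hypothetical sketch of stac_helpers.license_picker - see ../stac_helpers.py for the real helper
def license_picker(license_attr):
    # USGS data is public domain, so tag it CC0-1.0; fall back to Unlicense otherwise
    # ref: https://spdx.org/licenses/
    if license_attr and 'usgs' in license_attr.lower():
        return 'CC0-1.0'
    return 'Unlicense'
```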
%% Cell type:code id:201e0945-de55-45ff-b095-c2af009a4e62 tags:
``` python
import pystac
from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension
import xarray as xr
import cf_xarray
import os
import fsspec
import hvplot.xarray
import pandas as pd
import json
import numpy as np
import pyproj
from pyproj import Transformer
import cartopy.crs as ccrs
import cfunits
import sys
sys.path.insert(1, '..')
import stac_helpers
```
%% Cell type:markdown id:9e9cff62-0e60-4e59-8994-aece90c2bbc6 tags:
## Explore list of files to incorporate
%% Cell type:code id:0bfe4a5b-72da-4f0a-93f8-5c899cc2602b tags:
``` python
# get list of zarr stores to create collections for - we will run through one
# as an example, then iterate through every item in the list
fs = fsspec.filesystem('s3', requester_pays=True)
s3_path_list = fs.ls('s3://nhgf-development/workspace/DataConversion/gdp/cprep')
zarr_name_list = [fp.split('/')[-1] for fp in s3_path_list]
print(f'NHGF S3 Endpoint\n')
print(f'zarr stores found: {len(zarr_name_list)}\n')
print(f'example file path: {s3_path_list[0]}\n')
print(f'example file name: {zarr_name_list[0]}')
```
%% Cell type:code id:eee2188c-1e17-4d94-ad37-cb791f701852 tags:
``` python
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
osn_path_list = fs2.ls('s3://mdmf/gdp/cprep')
zarr_list = [fp.split('/')[-1] for fp in fs2.ls('s3://mdmf/gdp/cprep')]
print(f'OSN Pod Endpoint\n')
print(f'zarr stores found: {len(zarr_list)}\n')
print(f'example file path: {osn_path_list[0]}\n')
print(f'example file name: {zarr_list[0]}')
```
%% Cell type:markdown id:d412ee91-7d5b-494f-9c57-61ee69f5261b tags:
## Test an example with one zarr from the list of 297
%% Cell type:markdown id:67ca098d-b64e-4379-b4db-c55cbf558dac tags:
## Collection ID
%% Cell type:code id:0ed579f2-9be9-4871-bac0-6e589d781c40 tags:
``` python
# name for STAC collection
collection_id = os.path.splitext(zarr_list[100])[0]
```
%% Cell type:markdown id:116b5837-8e85-4ae7-964a-803533ded714 tags:
## Asset Metadata Input
%% Cell type:code id:dd6fa323-132a-4794-8c80-576933f547a0 tags:
``` python
# url to zarr store that you want to create a collection for
zarr_url = f's3://mdmf/gdp/cprep/{collection_id}.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs = {"xarray:open_kwargs":{"chunks":{},"engine":"zarr","consolidated":True},
"xarray:storage_options": {"anon": True, "client_kwargs": {"endpoint_url":"https://usgs.osn.mghpcc.org/"}}}
# description for zarr url asset attached to collection (zarr_url)
asset_description = "Open Storage Network Pod S3 API access to collection zarr group"
# roles to tag zarr url asset with
asset_roles = ["data","zarr","s3"]
```
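As a sanity check, these fields round-trip straight into `fsspec`/`xarray` calls; a sketch of how a consumer of this asset might use them (the `fs_check`/`ds_check` names are just for illustration):

``` python
# hypothetical consumer-side usage of the xarray-assets fields defined above
fs_check = fsspec.filesystem('s3', **xarray_opendataset_kwargs['xarray:storage_options'])
ds_check = xr.open_dataset(fs_check.get_mapper(zarr_url), **xarray_opendataset_kwargs['xarray:open_kwargs'])
```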
%% Cell type:code id:e1441cd4-e94c-4902-af46-8f1af470eb6b tags:
``` python
# url to zarr store that you want to create a collection for
zarr_url2 = f's3://nhgf-development/workspace/DataConversion/gdp/cprep/{collection_id}.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs2 = {"xarray:open_kwargs":{"chunks":{},"engine":"zarr","consolidated":True},
"xarray:storage_options":{"requester_pays":True}}
# description for zarr url asset attached to collection (zarr_url2)
asset_description2 = "S3 access to collection zarr group"
# roles to tag zarr url asset with
asset_roles2 = ["data","zarr","s3"]
```
%% Cell type:markdown id:b213b74f-ad17-4774-93b6-3b62be616b45 tags:
## Data Exploration
%% Cell type:code id:b5ee1874-d280-4158-8d56-ecab750b6000 tags:
``` python
# open and view zarr dataset
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
backend_kwargs={'consolidated':True}, chunks={})
ds
```
%% Cell type:markdown id:996e60ba-13e4-453a-8534-e62ce747f0fa tags:
## Collection Metadata Input
%% Cell type:code id:482d204d-b5b6-40e5-ac42-55b459be1097 tags:
``` python
# description of STAC collection
collection_description = ds.attrs['title']
print(f'collection description: {collection_description}')
```
%% Cell type:code id:df4fdc77-72f5-4cb1-afeb-b7b879d5ede1 tags:
``` python
# license for dataset
collection_license = stac_helpers.license_picker(ds.attrs['license'])
```
%% Cell type:markdown id:0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc tags:
## Identify x, y, t dimensions of dataset
May require user input if dimensions cannot be auto-detected.
%% Cell type:code id:ab91268f-7200-4cb1-979a-c7d75531d2c0 tags:
``` python
dims_auto_extract = ['X', 'Y', 'T']
dim_names_dict = {}
for d in dims_auto_extract:
    dim_names_dict[d] = stac_helpers.extract_dim(ds, d)
print(f"Dimension dictionary: {dim_names_dict}")
```
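`stac_helpers.extract_dim` is a project helper not shown in this notebook; a minimal sketch of what it likely does, assuming it leans on the `cf_xarray` accessor imported above:

``` python
# hypothetical sketch of stac_helpers.extract_dim, assuming it uses the cf_xarray accessor
def extract_dim(ds, axis):
    # cf_xarray maps CF axis labels ('X', 'Y', 'T') to coordinate/dimension names
    names = ds.cf.axes.get(axis, [])
    if len(names) != 1:
        raise ValueError(f'could not auto-detect a unique {axis} dimension - please enter it manually')
    return names[0]
```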
%% Cell type:markdown id:667cb9e8-c4a5-4d26-8e30-be8c934adc37 tags:
## Get crs info
%% Cell type:code id:4a981eca-93fd-4a9d-bbdf-52d0a99d6bc0 tags:
``` python
crs_var = 'crs'
```
%% Cell type:code id:172d9e6b-de88-4e3a-9184-706110a99659 tags:
``` python
# use pyproj to automatically extract crs info
crs = pyproj.CRS.from_cf(ds[crs_var].attrs)
# alternatively, create the appropriate cartopy projection
# crs = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian,
# central_latitude=crs_info.latitude_of_projection_origin,
# standard_parallels=crs_info.standard_parallel)
```
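For reference, `pyproj.CRS.from_cf` builds a CRS from CF grid-mapping attributes like the ones stored on the `crs` variable; a standalone example with made-up Lambert conformal attributes:

``` python
# standalone CRS.from_cf example with hypothetical CF grid-mapping attributes
example_crs = pyproj.CRS.from_cf({
    'grid_mapping_name': 'lambert_conformal_conic',
    'standard_parallel': [30.0, 60.0],
    'latitude_of_projection_origin': 40.0,
    'longitude_of_central_meridian': -97.0,
})
print(example_crs.to_proj4())
```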
%% Cell type:code id:1a0f2a58-a126-4438-a4c2-5c35653f4a62 tags:
``` python
ds[crs_var]
```
%% Cell type:code id:a702f7ff-6201-4283-8e8c-c23f685d7165 tags:
``` python
crs.to_proj4()
```
%% Cell type:markdown id:a8c3ed37-8564-400b-a7fb-25bd5e43d21c tags:
## Create Collection Extent
%% Cell type:markdown id:69f0d837-68a5-4fed-9a14-5d75cfbb0da4 tags:
### Spatial Extent
%% Cell type:code id:d46805e0-8e94-4ebe-aa01-d9a2d7051459 tags:
``` python
# pull out lat/lon bbox for data
# coordinates must be from WGS 84 datum
# left, bottom, right, top
# Note: if the bounds come back as numpy floats rather than plain Python floats, try one of the commented-out alternatives below
#spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float), ds[dim_names_dict['Y']].data.min().compute().astype(float), ds[dim_names_dict['X']].data.max().compute().astype(float), ds[dim_names_dict['Y']].data.max().compute().astype(float)]
#spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['X']].data.max().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.max().compute().astype(float).tolist()]
spatial_bounds = [ds[dim_names_dict['X']].data.min().astype(float).item(), ds[dim_names_dict['Y']].data.min().astype(float).item(), ds[dim_names_dict['X']].data.max().astype(float).item(), ds[dim_names_dict['Y']].data.max().astype(float).item()]
print(spatial_bounds)
print(f'\nspatial_bounds data type: {type(spatial_bounds[0])}')
```
%% Cell type:code id:a9febf0f-d299-4758-a2a8-abed7328698d tags:
``` python
XX, YY = np.meshgrid(ds[dim_names_dict['X']].data, ds[dim_names_dict['Y']].data)
```
%% Cell type:code id:b0846798-c968-4db2-9505-3078327dd358 tags:
``` python
transformer = Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
lon, lat = transformer.transform(XX.ravel(), YY.ravel())
```
%% Cell type:code id:680aeb26-51cb-468d-8bda-b6e6cc7af611 tags:
``` python
print(f'lower left coordinates (WGS84): {min(lon)}, {min(lat)}')
print(f'upper right coordinates (WGS84): {max(lon)}, {max(lat)}')
```
%% Cell type:code id:d72e37ab-2b9d-4a15-a502-be79622dd7cb tags:
``` python
# create a spatial extent object
spatial_extent = pystac.SpatialExtent(bboxes=[[min(lon).item(), min(lat).item(), max(lon).item(), max(lat).item()]])
```
%% Cell type:markdown id:a04c8fca-1d33-43ac-9e2b-62d7be2887f7 tags:
### Temporal Extent
%% Cell type:code id:41a84995-867c-4152-8c57-85e3758bbb77 tags:
``` python
# pull out first and last timestamps
# temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())
# temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())
temporal_extent_lower = ds.indexes[dim_names_dict['T']].to_datetimeindex().min()
temporal_extent_upper = ds.indexes[dim_names_dict['T']].to_datetimeindex().max()
print(f'min: {temporal_extent_lower} \nmax: {temporal_extent_upper}')
# create a temporal extent object
temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])
```
%% Cell type:code id:1b1e37c4-5348-46ad-abc9-e005b5d6c02b tags:
``` python
collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)
```
%% Cell type:markdown id:20b00e88-5a13-46b3-9787-d9ac2d4e7bd6 tags:
## Open up NHGF STAC Catalog and create a collection
%% Cell type:code id:adf6c59d-58cd-48b1-a5fd-3bb205a3ef56 tags:
``` python
# define folder location where your STAC catalog json file is
catalog_path = os.path.join('..', '..', 'catalog')
# open catalog
catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))
```
%% Cell type:code id:7e96811b-95ae-406a-9728-55fc429d4e1f tags:
``` python
if catalog.get_child(collection_id):
    collection = catalog.get_child(collection_id)
    print("existing collection opened")
    collection.extent = collection_extent
    collection.description = collection_description
    collection.license = collection_license
else:
    collection = pystac.Collection(id=collection_id,
                                   description=collection_description,
                                   extent=collection_extent,
                                   license=collection_license)
    print("new collection created")
```
%% Cell type:markdown id:a21c76e8-cd57-4eb5-a33f-7c668a3b3205 tags:
## Add zarr url asset to collection
%% Cell type:code id:094832af-d22b-4359-b0f6-cf687acce5cc tags:
``` python
asset_id = "zarr-s3-osn"
asset = pystac.Asset(href=zarr_url,
description=asset_description,
media_type="application/vnd+zarr",
roles=asset_roles,
extra_fields = xarray_opendataset_kwargs)
collection.add_asset(asset_id, asset)
```
%% Cell type:code id:0c298d07-f234-4a08-986d-87f4a39e9ae6 tags:
``` python
asset_id2 = "zarr-s3"
asset2 = pystac.Asset(href=zarr_url2,
description=asset_description2,
media_type="application/vnd+zarr",
roles=asset_roles2,
extra_fields = xarray_opendataset_kwargs2)
collection.add_asset(asset_id2, asset2)
```
%% Cell type:markdown id:f67cd5c9-db33-45c2-bc21-480cd67354f4 tags:
## Add datacube extension to collection
%% Cell type:code id:fc00946d-2880-491d-9b3b-3aeeb4414d6c tags:
``` python
# instantiate extension on the collection
dc = DatacubeExtension.ext(collection, add_if_missing=True)
```
%% Cell type:markdown id:8bdd77a2-7587-485e-afb7-42af3a822241 tags:
### Add cube dimensions (required field for extension)
%% Cell type:code id:120a4914-3302-44a5-a282-0308ac84f040 tags:
``` python
# list out dataset dimensions
# (when writing to zarr, xarray records each variable's dimensions in the _ARRAY_DIMENSIONS
# attribute, and looks for that attribute on every array when reading a zarr group back in)
dims = list(ds.dims)
print(dims)
```
%% Cell type:markdown id:e7dc357c-91ec-49ae-83e5-400f791f9792 tags:
#### user input needed - you will need to look at the crs information and create a cartopy crs object after identifying the projection type:
reference list of cartopy projections: https://scitools.org.uk/cartopy/docs/latest/reference/projections.html
%% Cell type:code id:1b1d05ff-8e43-44a7-8343-178b112c4ad6 tags:
``` python
# the datacube extension can accept reference_system information as a numerical EPSG code,
# WKT2 (ISO 19162) string or PROJJSON object.
# we will use a projjson, as was done by Microsoft Planetary Computer here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
projjson = crs.to_json()
print(projjson)
```
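Note that `crs.to_json()` returns a PROJJSON *string*; if a PROJJSON *object* (a dict) is preferred, as in the Planetary Computer collections linked above, the string can be parsed first:

``` python
# variant: parse the PROJJSON string into a dict so reference_system is an object rather than a string
projjson = json.loads(crs.to_json())
```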
%% Cell type:markdown id:b98e8e4d-91f5-4b42-8bc4-adc4ebcad264 tags:
#### user review needed - look at the steps pulled out and make sure they make sense
%% Cell type:markdown id:0930906a-6dda-4ef6-b7c8-f3cd26f4bc56 tags:
**Time**
%% Cell type:code id:498fc9be-8e51-417b-accd-a5460792f760 tags:
``` python
time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()
print(f'time step: {time_step}')
```
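`stac_helpers.get_step` is another project helper not shown here; a minimal sketch of its core idea, assuming it returns the unique spacing between consecutive coordinate values (the `debug`/`step_ix` options used in the cells below are omitted):

``` python
# hypothetical sketch of the core of stac_helpers.get_step
def get_step(ds, dim_name, time_dim=False, round_dec=None):
    # difference consecutive coordinate values; for a time dim these are timedelta64 values
    diffs = np.diff(ds[dim_name].values)
    if round_dec is not None:
        # round away floating-point noise in spatial steps (see the rounding notes below)
        diffs = diffs.round(round_dec)
    steps = np.unique(diffs)
    # a regularly spaced dimension has exactly one unique step; otherwise report failure
    return steps[0] if len(steps) == 1 else None
```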
%% Cell type:code id:990fe69c-1073-44e7-b429-64dde28ab5a5 tags:
``` python
# # debugging for time steps: get all step values and locations
# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)
```
%% Cell type:code id:79f23b28-289d-46d2-aa77-f965e99db9cc tags:
``` python
# # debugging for time steps, cont:
# # please choose one of the index locations printed above
# # this will print the time steps adjacent to it
# ix = 3343
# ds.isel(time=slice(ix-1,ix+3)).time
```
%% Cell type:markdown id:66ef74eb-66d4-432f-be4c-56818579e04a tags:
**X/lon**
%% Cell type:markdown id:354bdf96-b5f6-4949-90b7-0c762c70fb78 tags:
**rounding error in spatial steps**: we round to the 13th decimal place to handle floating-point rounding error that comes up when calculating spatial steps
%% Cell type:code id:3f903b3a-c37d-45b1-ab8d-e89516fbc511 tags:
``` python
x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=13)
print(f'x step: {x_step}')
```
%% Cell type:code id:118f7c27-e07b-43a7-a0b5-9f4adcdddbbd tags:
``` python
# # debugging for spatial steps: get all step values and locations
# x_dim=dim_names_dict['X']
# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)
# print(f'\nx dim name (for next cell): {x_dim}')
```
%% Cell type:code id:ce23eee0-8d5f-4e8a-b6fa-13ab276eb660 tags:
``` python
# # debugging for spatial steps, cont:
# # please choose one of the index locations printed above
# # this will print the spatial steps adjacent to it
# ix = 5
# ds.isel(lon=slice(ix-1,ix+3)).lon
```
%% Cell type:markdown id:f28d25eb-c43d-4a98-8350-8faf517a90c5 tags:
**Y/lat**
%% Cell type:markdown id:f199759e-06c0-4cb0-b821-5f006a837dc7 tags:
**rounding error in spatial steps**: we round to the 13th decimal place to handle floating-point rounding error that comes up when calculating spatial steps
%% Cell type:code id:0c0b0938-a6ee-4023-b201-75712029f2b5 tags:
``` python
y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=13)
print(f'y step: {y_step}')
```
%% Cell type:code id:8a8ba920-0959-49bf-aed2-08d56c511b2c tags:
``` python
# # debugging for spatial steps: get all step values and locations
# y_dim=dim_names_dict['Y']
# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)
# print(f'\ny dim name (for next cell): {y_dim}')
```
%% Cell type:code id:16a23616-8309-4855-ba75-fcd04704d5e6 tags:
``` python
# # debugging for spatial steps, cont:
# # please choose one of the index locations printed above
# # this will print the spatial steps adjacent to it
# ix = 5
# ds.isel(lat=slice(ix-1,ix+3)).lat
```
%% Cell type:markdown id:00a5e041-081d-428d-ac2e-75d16de205e6 tags:
#### user input needed - you will need to copy all of the dimensions printed below into the dict and fill in the appropriate attributes (type, axis, extent, etc.):
Please see [datacube spec](https://github.com/stac-extensions/datacube?tab=readme-ov-file#dimension-object) for details on required fields.
If you have a dimension like "bnds" that is used on variables like time_bnds, lon_bnds, lat_bnds to choose either the lower or upper bound, you can use an [additional dimension object](https://github.com/stac-extensions/datacube?tab=readme-ov-file#additional-dimension-object). We recommend making the type "count" as Microsoft Planetary Computer did [here](https://github.com/stac-extensions/datacube/blob/9e74fa706c9bdd971e01739cf18dcc53bdd3dd4f/examples/daymet-hi-annual.json#L76).
%% Cell type:code id:4c862f5e-d89a-4cf7-9936-610a0df730ae tags:
``` python
print(dims)
```
%% Cell type:code id:5a443497-67a9-4dce-a8e9-b08d31a88223 tags:
``` python
# create a dictionary of datacube dimensions you would like to assign to this dataset
# dimension name should come from the coordinates printed above
# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)
# note that the extent of each dimension should be pulled from the dataset
dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),
dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [spatial_bounds[0], spatial_bounds[2]], 'step': x_step, 'reference_system': projjson}),
dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [spatial_bounds[1], spatial_bounds[3]], 'step': y_step, 'reference_system': projjson}),
'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),
}
```
%% Cell type:code id:27e62de2-1143-453f-8f56-cf0a80048158 tags:
``` python
# make sure you added all the right dims
assert sorted(list(dims_dict.keys())) == sorted(dims)
```
%% Cell type:markdown id:0f277883-a3fd-425f-966a-ca2140d0ef2f tags:
### Add cube variables (optional field for extension)
%% Cell type:code id:e9272931-fc0b-4f2a-9546-283033e9cde8 tags:
``` python
# drop metpy_crs coordinate we have added
if 'metpy_crs' in ds.coords:
    ds = ds.drop_vars('metpy_crs')
# pull list of vars from dataset
vars = list(ds.variables)
# spec says that the keys of cube:dimensions and cube:variables should be unique together; a key like lat should not be both a dimension and a variable.
# we will drop all values in dims from vars
vars = [v for v in vars if v not in dims]
# Microsoft Planetary Computer includes coordinates and crs as variables here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
# we will keep those in the var list
# create dictionary of dataset variables and associated dimensions
vars_dict={}
for v in vars:
    unit = stac_helpers.get_unit(ds, v)
    var_type = stac_helpers.get_var_type(ds, v, crs_var)
    long_name = stac_helpers.get_long_name(ds, v)
    vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions': list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})
```
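`stac_helpers.get_var_type` now takes the `crs_var` argument (the crs var detection fix in this commit); a plausible sketch, assuming the helper tags the crs variable and other coordinate-like variables as "auxiliary" per the datacube spec:

``` python
# hypothetical sketch of stac_helpers.get_var_type after the crs var detection fix
def get_var_type(ds, v, crs_var):
    # the crs variable and coordinate variables are 'auxiliary'; everything else is 'data'
    if v == crs_var or v in ds.coords:
        return 'auxiliary'
    return 'data'
```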
%% Cell type:markdown id:11ad5352-884c-4472-8864-4570a96f66e5 tags:
### Finalize extension
%% Cell type:code id:10141fd4-91d6-491d-878b-02653720891d tags:
``` python
# add dimensions and variables to collection extension
dc.apply(dimensions=dims_dict, variables=vars_dict)
```
%% Cell type:markdown id:615ca168-75fb-4135-9941-0ef5fe4fd1cb tags:
## Add STAC Collection to Catalog and Save
%% Cell type:code id:e2120a55-3d04-4122-a93f-29afcdb8cb1b tags:
``` python
# # helper to find items of wrong type
# d = collection.to_dict()
# print(*stac_helpers.find_paths(d))
```
%% Cell type:code id:4b75791b-6b2d-40be-b7c6-330a60888fb5 tags:
``` python
if catalog.get_child(collection_id):
    collection.normalize_and_save(root_href=os.path.join(catalog_path, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)
else:
    catalog.add_child(collection)
    catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)
```
%% Cell type:markdown id:c734db5d-a9b4-42b4-88fb-50d84493f694 tags:
## The big job
Now that we have tested the workflow, we will implement it in a big for loop for all 297 zarrs.
%% Cell type:code id:929a90d4-2ae1-480a-aec3-348d19a667d8 tags:
``` python
```
%% Cell type:markdown id:ac25d30a-0a27-4241-a631-522f6dcbe764 tags:
## Supplemental - Calendar Exploration based on zarr conversion workflow
A special handling of the calendars was set up in the [zarr conversion workflow](https://code.usgs.gov/wma/nhgf/geo-data-portal/gdp_data_processing/-/blob/main/workflows/opendap/CIDA/CPREP/cprep_conversion.ipynb?ref_type=heads) for this dataset, so this was further explored to see if it was necessary.
%% Cell type:markdown id:3877e708-f600-48c2-b3e4-e267dd547c57 tags:
### Types of calendars
There appear to be two types of calendars used in this dataset's time variable. Let's check that first.
%% Cell type:code id:e314605d-5676-4add-935e-1ca59e4852d4 tags:
``` python
ds_example = []
calendar_list = []
calendar_count = {}
for f in zarr_list:
    collection_id = os.path.splitext(f)[0]
    zarr_url = f's3://mdmf/gdp/cprep/{collection_id}.zarr/'
    ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
                         backend_kwargs={'consolidated':True}, chunks={},
                         decode_times=False)
    calendar = ds.time.calendar
    # count occurrences of each calendar type
    calendar_count[calendar] = calendar_count.get(calendar, 0) + 1
    if calendar not in calendar_list:
        ds_example.append(collection_id)
        calendar_list.append(calendar)
```
%% Cell type:code id:0ff931d7-f0dd-4721-97ad-0b65b796db53 tags:
``` python
calendar_count
```
%% Cell type:code id:030a1cfd-f568-4cfe-b7d0-5fae88c1e819 tags:
``` python
calendar_list
```
%% Cell type:code id:7493aa82-e707-4f10-bb22-125661c30c79 tags:
``` python
ds_example
```
%% Cell type:markdown id:c17f24ff-7763-4400-80b3-3d815c2285fd tags:
### Julian calendar
#### What do we get for min/max times if we decode the times upon reading in the data (the xarray default)?
%% Cell type:code id:03fdf554-c6ba-47a4-b8ce-8993d1cd00e9 tags:
``` python
# Julian
zarr_url = f's3://mdmf/gdp/cprep/cprep_pr_day_I35prp1-DeltaSD-A32D01K00_rcp26_r1i1p1_I35Land_20060101-20991231.zarr/'
```
%% Cell type:code id:708f2cf5-79ab-49af-8067-de31d0d13ee6 tags:
``` python
# open and view zarr dataset
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
backend_kwargs={'consolidated':True}, chunks={},
decode_times=True)
ds
```
%% Cell type:code id:c6a94627-edb3-43f2-8b79-e45d5fad89cc tags:
``` python
print('min time step:')
ds.indexes[dim_names_dict['T']].to_datetimeindex().min()
```
%% Cell type:code id:c87e35ad-6fe9-4057-83eb-c5e335a75446 tags:
``` python
print('max time step:')
ds.indexes[dim_names_dict['T']].to_datetimeindex().max()
```
%% Cell type:markdown id:459848fa-b464-4254-a648-7205c032db01 tags:
#### What do we get for min/max times if we do NOT decode the times (like was done in the zarr conversion workflow)?
%% Cell type:code id:601db820-e332-42c2-a90b-24f527bb197b tags:
``` python
# open and view zarr dataset
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
backend_kwargs={'consolidated':True}, chunks={},
decode_times=False)
ds
```
%% Cell type:code id:5ec7044a-c744-4980-813b-fd15e5e06d7e tags:
``` python
z = zarr_url
start_date = z.split('/')[5].split('_')[7].split('-')[0].split('.')[0]
end_date = z.split('/')[5].split('_')[7].split('-')[1].split('.')[0]
start_date = f'{start_date[0:4]}-{start_date[4:6]}-{start_date[6:8]}'
end_date = f'{end_date[0:4]}-{end_date[4:6]}-{end_date[6:8]}'
calendar = ds.time.calendar
if calendar == 'julian':
    cftime_index = xr.cftime_range(start_date, end_date, freq='D', calendar='julian')
    ds['time'] = cftime_index
elif calendar == 'noleap':
    cftime_index = xr.cftime_range(start_date, end_date, freq='D', calendar='noleap')
    ds['time'] = cftime_index
```
%% Cell type:code id:950168e8-b2c2-4177-add0-efab5794db44 tags:
``` python
ds
```
%% Cell type:code id:faddefa3-cd71-43f9-ba6f-1e886a5a442f tags:
``` python
print('min time step:')
pd.Timestamp((ds.indexes['time'].to_datetimeindex().min()))
```
%% Cell type:code id:68b10114-3b44-4ed6-9c3f-7882e33b04f3 tags:
``` python
print('max time step:')
pd.Timestamp((ds.indexes['time'].to_datetimeindex().max()))
```
%% Cell type:markdown id:61bf449c-bd99-427d-a7d1-1eed51d2da75 tags:
### noleap
#### What do we get for min/max times if we decode the times upon reading in the data (the xarray default)?
%% Cell type:code id:0a486983-7643-4f6e-b68f-3f6084074210 tags:
``` python
# noleap
zarr_url = f's3://mdmf/gdp/cprep/cprep_pr_day_I35prp1-DeltaSD-A12D01K00_rcp26_r6i1p1_I35Land_20060101-20991231.zarr/'
```
%% Cell type:code id:14f03b46-04fd-4a78-b5cf-bcc3331d7a30 tags:
``` python
# open and view zarr dataset
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
backend_kwargs={'consolidated':True}, chunks={},
decode_times=True)
ds
```
%% Cell type:code id:f225f473-adde-443e-b8c0-6d477639b40d tags:
``` python
print('min time step:')
ds.indexes[dim_names_dict['T']].to_datetimeindex().min()
```
%% Cell type:code id:e20aaa0e-218c-4dfe-8cae-a2ca3854233d tags:
``` python
print('max time step:')
ds.indexes[dim_names_dict['T']].to_datetimeindex().max()
```
%% Cell type:markdown id:50a860f5-98b4-48b0-b872-63bc5dc271ae tags:
#### What do we get for min/max times if we do NOT decode the times (like was done in the zarr conversion workflow)?
%% Cell type:code id:737458bc-cba5-4c93-83bf-963cbb635f24 tags:
``` python
# open and view zarr dataset
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
backend_kwargs={'consolidated':True}, chunks={},
decode_times=False)
ds
```
%% Cell type:code id:c1ae19f2-5713-4ca8-8299-a566c15b29ae tags:
``` python
z = zarr_url
start_date = z.split('/')[5].split('_')[7].split('-')[0].split('.')[0]
end_date = z.split('/')[5].split('_')[7].split('-')[1].split('.')[0]
start_date = f'{start_date[0:4]}-{start_date[4:6]}-{start_date[6:8]}'
end_date = f'{end_date[0:4]}-{end_date[4:6]}-{end_date[6:8]}'
calendar = ds.time.calendar
if calendar == 'julian':
    cftime_index = xr.cftime_range(start_date, end_date, freq='D', calendar='julian')
    ds['time'] = cftime_index
elif calendar == 'noleap':
    cftime_index = xr.cftime_range(start_date, end_date, freq='D', calendar='noleap')
    ds['time'] = cftime_index
```
%% Cell type:code id:b7cc4b04-5b08-4bd0-bd49-6c9ba37f4df9 tags:
``` python
ds
```
%% Cell type:code id:df3038fa-a03b-41dc-a029-773a8fbf496d tags:
``` python
print('min time step:')
ds.indexes[dim_names_dict['T']].to_datetimeindex().min()
```
%% Cell type:code id:e3bada84-0d24-42db-b345-6d657c40ba06 tags:
``` python
print('max time step:')
ds.indexes[dim_names_dict['T']].to_datetimeindex().max()
```
%% Cell type:markdown id:fb491fae-4681-4b1a-aef6-95e4bbb764f1 tags:
### Conclusion:
The conversion workflow was overwriting the time values so that, instead of falling in the 12:00 hour, they are marked with the 00:00 hour. There doesn't seem to be different handling for Julian vs. noleap calendars. We will use the data's true time values and let xarray decode the times for us, rather than using the shifted timestamps.
......
%% Cell type:markdown id:6c10e07b-1e60-4926-af1d-fa75dc78e5d4 tags:
# CONUS404 Daily Zarr -> Collection Workflow
This is a workflow to build a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md) from the zarr asset for the dataset named above. We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.
To keep this workflow simple enough to scale to many datasets, a few simplifying assumptions are made:
1. For USGS data, we can use the CC0-1.0 license. For all other data, we can use Unlicense. Ref: https://spdx.org/licenses/
2. We assume all coordinates are from the WGS84 datum if not specified.
%% Cell type:code id:201e0945-de55-45ff-b095-c2af009a4e62 tags:
``` python
import pystac
from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension
import xarray as xr
import cf_xarray
import os
import fsspec
import hvplot.xarray
import pandas as pd
import json
import numpy as np
import pyproj
from pyproj import Transformer
import cartopy.crs as ccrs
import cfunits
import sys
sys.path.insert(1, '..')
import stac_helpers
```
%% Cell type:markdown id:a71f9d19-8fb3-4f47-b4c4-447bb80d8dd5 tags:
## Collection ID
%% Cell type:code id:15ee060d-3127-4024-a1ad-6aa0648667e1 tags:
``` python
# name for STAC collection - should match name of zarr dataset
collection_id = 'conus404-daily'
```
%% Cell type:markdown id:116b5837-8e85-4ae7-964a-803533ded714 tags:
## Asset Metadata Input
%% Cell type:code id:dd6fa323-132a-4794-8c80-576933f547a0 tags:
``` python
# url to zarr store that you want to create a collection for
zarr_url = 's3://hytest/conus404/conus404_daily.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs = {"xarray:open_kwargs":{"chunks":{},"engine":"zarr","consolidated":True},
"xarray:storage_options": {"anon": True, "client_kwargs": {"endpoint_url":"https://usgs.osn.mghpcc.org/"}}}
# description for zarr url asset attached to collection (zarr_url)
asset_description = "Open Storage Network Pod S3 API access to collection zarr group"
# roles to tag zarr url asset with
asset_roles = ["data","zarr","s3"]
```
%% Cell type:code id:e1441cd4-e94c-4902-af46-8f1af470eb6b tags:
``` python
# url to zarr store that you want to create a collection for
zarr_url2 = 's3://nhgf-development/conus404/conus404_daily_202210.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs2 = {"xarray:open_kwargs":{"chunks":{},"engine":"zarr","consolidated":True},
"xarray:storage_options":{"requester_pays":True}}
# description for zarr url asset attached to collection (zarr_url2)
asset_description2 = "S3 access to collection zarr group"
# roles to tag zarr url asset with
asset_roles2 = ["data","zarr","s3"]
```
%% Cell type:markdown id:b213b74f-ad17-4774-93b6-3b62be616b45 tags:
## Data Exploration
%% Cell type:code id:708f2cf5-79ab-49af-8067-de31d0d13ee6 tags:
``` python
# open and view zarr dataset
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
backend_kwargs={'consolidated':True}, chunks={})
ds
```
%% Cell type:markdown id:996e60ba-13e4-453a-8534-e62ce747f0fa tags:
## Collection Metadata Input
%% Cell type:code id:482d204d-b5b6-40e5-ac42-55b459be1097 tags:
``` python
# description of STAC collection
collection_description = 'CONUS404 40 years of daily values for subset of model output variables derived from hourly values on cloud storage'
# you can consider pulling this from dataset attributes instead of manually typing it:
# collection_description = ds.attrs['title']
# license for dataset
collection_license = stac_helpers.license_picker(ds.attrs['license'])
```
%% Cell type:markdown id:0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc tags:
## Identify x, y, t dimensions of dataset
May require user input if dimensions cannot be auto-detected.
%% Cell type:code id:ab91268f-7200-4cb1-979a-c7d75531d2c0 tags:
``` python
dims_auto_extract = ['X', 'Y', 'T']
dim_names_dict = {}
for d in dims_auto_extract:
    dim_names_dict[d] = stac_helpers.extract_dim(ds, d)
print(f"Dimension dictionary: {dim_names_dict}")
```
%% Cell type:markdown id:810d7480-165d-41c0-bd09-163656a14003 tags:
## Get crs info
If there is no crs info that can be automatically extracted from the dataset with pyproj, you will need to manually identify the crs and create a crs object. This reference list of cartopy projections may be a helpful resource: https://scitools.org.uk/cartopy/docs/latest/reference/projections.html
%% Cell type:code id:239d3b00-77f9-4178-954b-ba81a2b34512 tags:
``` python
crs_var = 'crs'
```
%% Cell type:code id:b03d52f3-1367-4255-a561-52ee4fc9e92d tags:
``` python
# use pyproj to automatically extract crs info
crs = pyproj.CRS.from_cf(ds[crs_var].attrs)
# alternatively, create the appropriate cartopy projection
# crs = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian,
# central_latitude=crs_info.latitude_of_projection_origin,
# standard_parallels=crs_info.standard_parallel)
```
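If `CRS.from_cf` cannot parse the dataset's attributes, the CRS can also be built by hand, e.g. from a PROJ string; the parameter values below are placeholders to swap out for values from the dataset metadata:

``` python
# fallback sketch: build the CRS manually from a PROJ string
# (placeholder parameter values - pull the real ones from the dataset metadata)
# crs = pyproj.CRS.from_proj4('+proj=lcc +lat_1=30 +lat_2=60 +lat_0=40 +lon_0=-97 +x_0=0 +y_0=0')
```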
%% Cell type:markdown id:282c689e-07f0-48ee-8e3d-35876e8c5094 tags:
### Compare dataset crs var to generated proj4 string to make sure it looks ok
%% Cell type:code id:4cee13ba-487d-483e-a013-b65685137502 tags:
``` python
ds[crs_var]
```
%% Cell type:code id:f7bc73db-7717-450e-9679-525f7be0c910 tags:
``` python
crs.to_proj4()
```
%% Cell type:markdown id:8fbfecfb-9886-4d06-a34c-6471cb0a6053 tags:
## Plot a map
%% Cell type:code id:4eb4d027-4266-4a0b-8f16-bacfbef06242 tags:
``` python
# plot a map of a single variable
var_to_plot = 'SNOW'
da = ds[var_to_plot].sel(time='2014-03-01 00:00').load()
da.hvplot.quadmesh(x='lon', y='lat', rasterize=True,
geo=True, tiles='OSM', alpha=0.7, cmap='turbo')
```
%% Cell type:markdown id:5e057a6c-06fb-4406-823b-e81c58e520e4 tags:
## Plot a time series at a specific point
This can help you verify a variable's values
%% Cell type:code id:c7de2681-88c2-4597-857c-8f169c596f8b tags:
``` python
# enter lat, lon of point you want to plot time series for
lat,lon = 39.978322,-105.2772194
time_start = '2013-01-01 00:00'
time_end = '2013-12-31 00:00'
transformer = Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
x, y = transformer.transform(lon, lat)
da = ds[var_to_plot].sel(x=x, y=y, method='nearest').sel(time=slice(time_start,time_end)).load()
da.hvplot(x=dim_names_dict['T'], grid=True)
```
%% Cell type:markdown id:a8c3ed37-8564-400b-a7fb-25bd5e43d21c tags:
## Create Collection Extent
%% Cell type:markdown id:69f0d837-68a5-4fed-9a14-5d75cfbb0da4 tags:
### Spatial Extent
##### WARNING - make sure data type is **float** NOT **numpy.float64**
%% Cell type:code id:d46805e0-8e94-4ebe-aa01-d9a2d7051459 tags:
``` python
# pull out lat/lon bbox for data
# coordinates must be from WGS 84 datum
# left, bottom, right, top
# Note: if the bounds come back as numpy floats rather than plain Python floats, try one of the commented-out alternatives below
#spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float), ds[dim_names_dict['Y']].data.min().compute().astype(float), ds[dim_names_dict['X']].data.max().compute().astype(float), ds[dim_names_dict['Y']].data.max().compute().astype(float)]
#spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['X']].data.max().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.max().compute().astype(float).tolist()]
spatial_bounds = [ds[dim_names_dict['X']].data.min().astype(float).item(), ds[dim_names_dict['Y']].data.min().astype(float).item(), ds[dim_names_dict['X']].data.max().astype(float).item(), ds[dim_names_dict['Y']].data.max().astype(float).item()]
print(spatial_bounds)
print(f'\nspatial_bounds data type: {type(spatial_bounds[0])}')
```
%% Cell type:code id:f16fdb9e-7ed8-40fb-a4f1-9ecabdebc0a1 tags:
``` python
XX, YY = np.meshgrid(ds[dim_names_dict['X']].data, ds[dim_names_dict['Y']].data)
```
%% Cell type:code id:074fc23c-f4d9-4427-80d3-fbf691e6d411 tags:
``` python
transformer = Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
lon, lat = transformer.transform(XX.ravel(), YY.ravel())
```
%% Cell type:code id:5345c975-9fe3-48e1-a663-0275cdf275dc tags:
``` python
print(f'lower left coordinates (WGS84): {min(lon)}, {min(lat)}')
print(f'upper right coordinates (WGS84): {max(lon)}, {max(lat)}')
```
%% Cell type:code id:e0a5a222-743d-403a-9411-2406374803cf tags:
``` python
# create a spatial extent object
spatial_extent = pystac.SpatialExtent(bboxes=[[min(lon).item(), min(lat).item(), max(lon).item(), max(lat).item()]])
```
%% Cell type:markdown id:a04c8fca-1d33-43ac-9e2b-62d7be2887f7 tags:
### Temporal Extent
%% Cell type:code id:41a84995-867c-4152-8c57-85e3758bbb77 tags:
``` python
# pull out first and last timestamps
temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())
temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())
# if you get an error:
# Cannot convert input [] of type <class 'cftime._cftime.DatetimeNoLeap'> to Timestamp
# use the following instead:
#temporal_extent_lower = pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min())
#temporal_extent_upper = pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max())
print(f'min: {temporal_extent_lower} \nmax: {temporal_extent_upper}')
# create a temporal extent object
temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])
```
%% Cell type:code id:1b1e37c4-5348-46ad-abc9-e005b5d6c02b tags:
``` python
collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)
```
%% Cell type:markdown id:20b00e88-5a13-46b3-9787-d9ac2d4e7bd6 tags:
## Open up STAC Catalog and create a collection
%% Cell type:code id:adf6c59d-58cd-48b1-a5fd-3bb205a3ef56 tags:
``` python
# define folder location where your STAC catalog json file is
catalog_path = os.path.join('..', '..', 'catalog')
# open catalog
catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))
```
%% Cell type:code id:7e96811b-95ae-406a-9728-55fc429d4e1f tags:
``` python
if catalog.get_child(collection_id):
    collection = catalog.get_child(collection_id)
    print("existing collection opened")
    collection.extent = collection_extent
    collection.description = collection_description
    collection.license = collection_license
else:
    collection = pystac.Collection(id=collection_id,
                                   description=collection_description,
                                   extent=collection_extent,
                                   license=collection_license)
    print("new collection created")
```
%% Cell type:markdown id:a21c76e8-cd57-4eb5-a33f-7c668a3b3205 tags:
## Add zarr url asset to collection
%% Cell type:code id:094832af-d22b-4359-b0f6-cf687acce5cc tags:
``` python
asset_id = "zarr-s3-osn"
asset = pystac.Asset(href=zarr_url,
description=asset_description,
media_type="application/vnd+zarr",
roles=asset_roles,
extra_fields = xarray_opendataset_kwargs)
collection.add_asset(asset_id, asset)
```
%% Cell type:code id:0c298d07-f234-4a08-986d-87f4a39e9ae6 tags:
``` python
asset_id2 = "zarr-s3"
asset2 = pystac.Asset(href=zarr_url2,
description=asset_description2,
media_type="application/vnd+zarr",
roles=asset_roles2,
extra_fields = xarray_opendataset_kwargs2)
collection.add_asset(asset_id2, asset2)
```
%% Cell type:markdown id:f67cd5c9-db33-45c2-bc21-480cd67354f4 tags:
## Add datacube extension to collection
%% Cell type:code id:fc00946d-2880-491d-9b3b-3aeeb4414d6c tags:
``` python
# instantiate extension on the collection
dc = DatacubeExtension.ext(collection, add_if_missing=True)
```
%% Cell type:markdown id:8bdd77a2-7587-485e-afb7-42af3a822241 tags:
### Add cube dimensions (required field for extension)
%% Cell type:markdown id:e7dc357c-91ec-49ae-83e5-400f791f9792 tags:
#### user review needed
#### compare crs information to the projjson to make sure it looks correct
%% Cell type:code id:ea452f62-5644-49b6-8a4e-7dc4f649fd1a tags:
``` python
# print out crs information in dataset
crs
```
%% Cell type:code id:1b1d05ff-8e43-44a7-8343-178b112c4ad6 tags:
``` python
# # the datacube extension can accept reference_system information as a numerical EPSG code,
# # WKT2 (ISO 19162) string or PROJJSON object.
# # we will use a projjson, as was done by Microsoft Planetary Computer here:
# # https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# # https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
# projjson = json.loads(lcc.to_json())
# alternatively, I think we could do this:
projjson = crs.to_json()
print(crs.to_json(pretty=True))
```
%% Cell type:markdown id:b6b88ee9-60c2-4d91-af74-c1c56b094826 tags:
#### user review needed
#### look at the spatial and temporal steps, make sure they are all successfully pulled and they look correct
%% Cell type:markdown id:9e2bbcc5-e45a-4b8c-9d60-601f345e8134 tags:
**Time**
If you need to manually construct this field, here is a helpful reference: https://en.wikipedia.org/wiki/ISO_8601#Durations
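For a quick sanity check, pandas can produce these ISO 8601 duration strings directly:

``` python
# example: the ISO 8601 duration string for a daily time step
print(pd.Timedelta(days=1).isoformat())  # P1DT0H0M0S
```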
%% Cell type:code id:82f1e9bd-52ee-46f5-9e95-c2359d95fcf3 tags:
``` python
time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()
# if time is yearly or monthly, you will need to manually construct it:
#time_step = "P1Y0M0DT0H0M0S"
print(f'time step: {time_step}')
```
%% Cell type:code id:64be65b2-de20-447a-a9c2-bd8eca3e440e tags:
``` python
# # optional debugging for time steps:
# # check all step sizes (step_list), get number of occurrences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=0)
```
%% Cell type:code id:bc8dff39-2a2e-44a0-9b30-987107c2d1e2 tags:
``` python
# # debugging for time steps, cont:
# # please choose one of the index locations printed above
# # this will print the time steps adjacent to it
# ix = 3343
# ds.isel(time=slice(ix-1,ix+3)).time
```
%% Cell type:markdown id:9aa6c8ff-8d9b-40a7-a281-39b502bd5a3d tags:
**X/lon**
%% Cell type:code id:a8ba7695-ca45-4db2-bd46-c465f4e37eff tags:
``` python
x_step = stac_helpers.get_step(ds, dim_names_dict['X'])
# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation
# use the debugging cells below to check whether this is the issue; if so, use the round_dec argument to round to a higher decimal place:
#x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=13)
print(f'x step: {x_step}')
```
%% Cell type:code id:fac4c9f2-a952-4c7f-aa32-862957372d6f tags:
``` python
# # optional debugging for spatial steps:
# # check all step sizes (step_list), get number of occurrences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
# x_dim=dim_names_dict['X']
# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=0)
# print(f'\nx dim name (for next cell): {x_dim}')
```
%% Cell type:code id:8d0b5a2d-dc58-4ad6-b890-859ce6bb08de tags:
``` python
# # debugging for spatial steps, cont:
# # please choose one of the index locations printed above
# # this will print the spatial steps adjacent to it
# ix = 5
# ds.isel(x=slice(ix-1,ix+3)).x
```
%% Cell type:markdown id:21b5cca4-8bb4-498d-ae6b-6b8545fffe56 tags:
**Y/lat**
%% Cell type:code id:7405583b-ecb9-44b0-8815-048e42e55a42 tags:
``` python
y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])
# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation
# use the debugging cells below to check whether this is the issue; if so, use the round_dec argument to round to a higher decimal place:
#y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=13)
print(f'y step: {y_step}')
```
%% Cell type:code id:ece0fe37-b54c-4721-aa9b-33d2998d191b tags:
``` python
# # optional debugging for spatial steps:
# # check all step sizes (step_list), get number of occurrences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
# y_dim=dim_names_dict['Y']
# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=0)
# print(f'\ny dim name (for next cell): {y_dim}')
```
%% Cell type:code id:abdafb8f-5217-4b82-91b6-eec8183c9128 tags:
``` python
# # debugging for spatial steps, cont:
# # please choose one of the index locations printed above
# # this will print the spatial steps adjacent to it
# ix = 5
# ds.isel(y=slice(ix-1,ix+3)).y
```
%% Cell type:markdown id:00a5e041-081d-428d-ac2e-75d16de205e6 tags:
#### user input needed
#### you will need to copy all of the dimensions printed below into the dict and fill in the appropriate attributes (type, axis, extent, etc.):
Please see [datacube spec](https://github.com/stac-extensions/datacube?tab=readme-ov-file#dimension-object) for details on required fields.
If you have a dimension like "bnds" or "nv" that is used on variables like time_bnds, lon_bnds, lat_bnds to choose either the lower or upper bound, you can use an [additional dimension object](https://github.com/stac-extensions/datacube?tab=readme-ov-file#additional-dimension-object). We recommend making the type "count" as Microsoft Planetary Computer did [here](https://github.com/stac-extensions/datacube/blob/9e74fa706c9bdd971e01739cf18dcc53bdd3dd4f/examples/daymet-hi-annual.json#L76).
Here is an example:
```
dims_dict = {
'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]})
}
```
%% Cell type:code id:acd45d3c-7845-47e6-9b7d-e35627a7ca9a tags:
``` python
dims = list(ds.dims)
print(dims)
```
%% Cell type:code id:5a443497-67a9-4dce-a8e9-b08d31a88223 tags:
``` python
# create a dictionary of datacube dimensions you would like to assign to this dataset
# dimension name should come from the dims printed in above cell
# x, y, t dimension info is pulled out automatically using the dim dict we created above
# all other dims listed in above cell need to be manually written in
# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)
# note that the extent of each dimension should be pulled from the dataset
dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),
dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [spatial_bounds[0], spatial_bounds[2]], 'step': x_step, 'reference_system': projjson}),
dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [spatial_bounds[1], spatial_bounds[3]], 'step': y_step, 'reference_system': projjson}),
'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'bottom_top_stag')}),
'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'bottom_top')}),
'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'soil_layers_stag')}),
'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, 'x_stag')}),
'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, 'y_stag')}),
'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'snow_layers_stag')}),
'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'snso_layers_stag')}),
}
```
%% Cell type:code id:8ab85b09-eb38-404c-910c-13349d5e2234 tags:
``` python
# make sure you added all the right dims
assert sorted(list(dims_dict.keys())) == sorted(dims)
```
%% Cell type:markdown id:0f277883-a3fd-425f-966a-ca2140d0ef2f tags:
### Add cube variables (optional field for extension)
%% Cell type:code id:e9272931-fc0b-4f2a-9546-283033e9cde8 tags:
``` python
# drop metpy_crs coordinate we have added
if 'metpy_crs' in ds.coords:
    ds = ds.drop_vars('metpy_crs')
# pull list of vars from dataset
vars = list(ds.variables)
# spec says that the keys of cube:dimensions and cube:variables should be unique together; a key like lat should not be both a dimension and a variable.
# we will drop all values in dims from vars
vars = [v for v in vars if v not in dims]
# Microsoft Planetary Computer includes coordinates and crs as variables here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
# we will keep those in the var list
# create dictionary of dataset variables and associated dimensions
vars_dict={}
for v in vars:
    unit = stac_helpers.get_unit(ds, v)
    var_type = stac_helpers.get_var_type(ds, v, crs_var)
    long_name = stac_helpers.get_long_name(ds, v)
    vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions': list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})
```
%% Cell type:markdown id:11ad5352-884c-4472-8864-4570a96f66e5 tags:
### Finalize extension
%% Cell type:code id:10141fd4-91d6-491d-878b-02653720891d tags:
``` python
# add dimensions and variables to collection extension
dc.apply(dimensions=dims_dict, variables=vars_dict)
```
%% Cell type:markdown id:615ca168-75fb-4135-9941-0ef5fe4fd1cb tags:
## Add STAC Collection to Catalog and Save
%% Cell type:code id:e2120a55-3d04-4122-a93f-29afcdb8cb1b tags:
``` python
# # helper to find items of wrong type
# d = collection.to_dict()
# print(*stac_helpers.find_paths(d))
```
%% Cell type:code id:4b75791b-6b2d-40be-b7c6-330a60888fb5 tags:
``` python
if catalog.get_child(collection_id):
    collection.normalize_and_save(root_href=os.path.join(catalog_path, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)
else:
    catalog.add_child(collection)
    catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)
```
%% Cell type:code id:d6f676b5-e892-4bfb-8d73-2828addd838c tags:
``` python
```
......
%% Cell type:markdown id:6c10e07b-1e60-4926-af1d-fa75dc78e5d4 tags:
# CONUS404 Daily Zarr -> Item Workflow
This is a workflow to build a [STAC item](https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md) from the zarr asset for the dataset named above. We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.
To keep this workflow simple enough to scale to many datasets, a few simplifying assumptions are made:
1. For USGS data, we can use the CC0-1.0 license. For all other data, we can use Unlicense. Ref: https://spdx.org/licenses/
2. We assume all coordinates are from the WGS84 datum if not specified.
%% Cell type:code id:201e0945-de55-45ff-b095-c2af009a4e62 tags:
``` python
import pystac
from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension
import xarray as xr
import cf_xarray
import os
import fsspec
import hvplot.xarray
import pandas as pd
import json
import numpy as np
import pyproj
from pyproj import Transformer
import cartopy.crs as ccrs
import cfunits
from shapely.geometry import Polygon, mapping
import orjson
import sys
sys.path.insert(1, '..')
import stac_helpers
```
%% Cell type:markdown id:f8c93dbc-174c-4387-be7a-00eccf004509 tags:
## Item ID
%% Cell type:code id:65b8979c-a462-4f68-8912-e82158d8811e tags:
``` python
# name for STAC item - should match name of zarr dataset
item_id = 'conus404-daily'
```
%% Cell type:markdown id:c5bb519a-a8b2-414c-82c7-12c0bc9c9612 tags:
## Asset Metadata Input
%% Cell type:code id:ba881d0a-8fe7-4f3a-b2b6-f5363c0eda2b tags:
``` python
# url to zarr store that you want to create an item for
zarr_url = f's3://mdmf/gdp/{item_id}.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs = {"xarray:open_kwargs":{"chunks":{},"engine":"zarr","consolidated":True},
"xarray:storage_options": {"anon": True, "client_kwargs": {"endpoint_url":"https://usgs.osn.mghpcc.org/"}}}
# description for zarr url asset attached to item (zarr_url)
asset_description = "Open Storage Network Pod S3 API access to collection zarr group"
# roles to tag zarr url asset with
asset_roles = ["data","zarr","s3"]
```
%% Cell type:code id:6dcd88c3-5c60-42f9-8264-94113be41c69 tags:
``` python
# url to zarr store that you want to create an item for
zarr_url2 = 's3://nhgf-development/conus404/conus404_daily_202210.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs2 = {"xarray:open_kwargs":{"chunks":{},"engine":"zarr","consolidated":True},
"xarray:storage_options":{"requester_pays":True}}
# description for zarr url asset attached to item (zarr_url2)
asset_description2 = "S3 access to collection zarr group"
# roles to tag zarr url asset with
asset_roles2 = ["data","zarr","s3"]
```
%% Cell type:markdown id:b213b74f-ad17-4774-93b6-3b62be616b45 tags:
## Data Exploration
%% Cell type:code id:708f2cf5-79ab-49af-8067-de31d0d13ee6 tags:
``` python
# open and view zarr dataset
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
backend_kwargs={'consolidated':True}, chunks={})
ds
```
%% Cell type:markdown id:0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc tags:
## Identify x, y, t dimensions of dataset
May require user input if dimensions cannot be auto-detected.
%% Cell type:code id:ab91268f-7200-4cb1-979a-c7d75531d2c0 tags:
``` python
dims_auto_extract = ['X', 'Y', 'T']
dim_names_dict = {}
for d in dims_auto_extract:
    dim_names_dict[d] = stac_helpers.extract_dim(ds, d)
print(f"Dimension dictionary: {dim_names_dict}")
```
%% Cell type:markdown id:7819e5c8-0cc0-4816-a2b1-e88a6ebe9f13 tags:
## Get crs info
If there is no crs info that can be automatically extracted from the dataset with pyproj, you will need to manually identify the crs and create a crs object. This reference list of cartopy projections may be a helpful resource: https://scitools.org.uk/cartopy/docs/latest/reference/projections.html
%% Cell type:code id:6bae3ab0-b0cc-4486-aaaf-209c4d30ec1f tags:
``` python
crs_var = 'crs'
```
%% Cell type:code id:a702a217-bfbc-4264-8ac6-ecc8fad4a8df tags:
``` python
# use pyproj to automatically extract crs info
crs = pyproj.CRS.from_cf(ds[crs_var].attrs)
# alternatively, create the appropriate cartopy projection
# crs = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian,
# central_latitude=crs_info.latitude_of_projection_origin,
# standard_parallels=crs_info.standard_parallel)
```
%% Cell type:markdown id:958d5dbd-6867-4101-a360-fd0643bf3e19 tags:
### Compare the dataset's crs variable to the generated proj4 string to make sure it looks correct
%% Cell type:code id:528c6f4c-aa01-4f3d-a96b-088a9ddab4e7 tags:
``` python
# display the crs variable's attributes (only the last expression in a cell is rendered)
ds[crs_var]
```
%% Cell type:code id:b2a02e46-0f50-4550-b193-ffef44f23539 tags:
``` python
crs.to_proj4()
```
%% Cell type:markdown id:8fbfecfb-9886-4d06-a34c-6471cb0a6053 tags:
## Plot a map
%% Cell type:code id:4eb4d027-4266-4a0b-8f16-bacfbef06242 tags:
``` python
# plot a map of a single variable
var_to_plot = 'SNOW'
da = ds[var_to_plot].sel(time='2014-03-01 00:00').load()
da.hvplot.quadmesh(x='lon', y='lat', rasterize=True,
geo=True, tiles='OSM', alpha=0.7, cmap='turbo')
```
%% Cell type:markdown id:5e057a6c-06fb-4406-823b-e81c58e520e4 tags:
## Plot a time series at a specific point
This can help you verify a variable's values.
%% Cell type:code id:c7de2681-88c2-4597-857c-8f169c596f8b tags:
``` python
# enter lat, lon of point you want to plot time series for
lat, lon = 39.978322, -105.2772194
time_start = '2013-01-01 00:00'
time_end = '2013-12-31 00:00'
# transform the point from WGS84 lon/lat into the dataset's coordinate system
transformer = Transformer.from_crs("EPSG:4326", crs, always_xy=True)
x, y = transformer.transform(lon, lat)
da = ds[var_to_plot].sel(x=x, y=y, method='nearest').sel(time=slice(time_start,time_end)).load()
da.hvplot(x=dim_names_dict['T'], grid=True)
```
%% Cell type:markdown id:a8c3ed37-8564-400b-a7fb-25bd5e43d21c tags:
## Create Item Geometry, BBox, Datetime
%% Cell type:markdown id:69f0d837-68a5-4fed-9a14-5d75cfbb0da4 tags:
### Spatial - Geometry and bbox
##### WARNING - make sure data type is **float** NOT **numpy.float64**
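A minimal sketch of the coercion used below: `.item()` converts a numpy scalar to a native Python float, which serializes cleanly to JSON:
```
val = np.float64(1.0).item()
assert type(val) is float
```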
%% Cell type:code id:d46805e0-8e94-4ebe-aa01-d9a2d7051459 tags:
``` python
# pull out lat/lon bbox for data
# coordinates must be from WGS 84 datum
# left, bottom, right, top
# Note: try changing around the commented out lines below to get type float rather than a numpy float
#spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float), ds[dim_names_dict['Y']].data.min().compute().astype(float), ds[dim_names_dict['X']].data.max().compute().astype(float), ds[dim_names_dict['Y']].data.max().compute().astype(float)]
#spatial_bounds = [ds[dim_names_dict['X']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.min().compute().astype(float).tolist(), ds[dim_names_dict['X']].data.max().compute().astype(float).tolist(), ds[dim_names_dict['Y']].data.max().compute().astype(float).tolist()]
spatial_bounds = [ds[dim_names_dict['X']].data.min().astype(float).item(), ds[dim_names_dict['Y']].data.min().astype(float).item(), ds[dim_names_dict['X']].data.max().astype(float).item(), ds[dim_names_dict['Y']].data.max().astype(float).item()]
print(spatial_bounds)
print(f'\nspatial_bounds data type: {type(spatial_bounds[0])}')
```
%% Cell type:code id:3b5cc182-54d4-44b9-98c0-d1be3fd6c6d0 tags:
``` python
XX, YY = np.meshgrid(ds[dim_names_dict['X']].data, ds[dim_names_dict['Y']].data)
```
%% Cell type:code id:85024093-da0c-425c-9126-6154cfaabd36 tags:
``` python
transformer = Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
lon, lat = transformer.transform(XX.ravel(), YY.ravel())
```
%% Cell type:code id:73d130f7-2dba-4d3c-9981-f18120073314 tags:
``` python
print(f'lower left coordinates (WGS84): {min(lon)}, {min(lat)}')
print(f'upper right coordinates (WGS84): {max(lon)}, {max(lat)}')
```
%% Cell type:code id:86206def-9e39-474f-9edb-20458a004803 tags:
``` python
# create a geometry object
# Polygon and mapping come from shapely; importing here keeps this cell self-contained
# (shapely closes the ring automatically)
from shapely.geometry import Polygon, mapping
footprint = mapping(Polygon([
[min(lon).item(), min(lat).item()],
[min(lon).item(), max(lat).item()],
[max(lon).item(), max(lat).item()],
[max(lon).item(), min(lat).item()]
]))
print(footprint)
```
%% Cell type:markdown id:a04c8fca-1d33-43ac-9e2b-62d7be2887f7 tags:
### Temporal - datetime
%% Cell type:code id:41a84995-867c-4152-8c57-85e3758bbb77 tags:
``` python
# pull out first and last timestamps
temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())
temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())
# if you get an error:
# Cannot convert input [] of type <class 'cftime._cftime.DatetimeNoLeap'> to Timestamp
# use the following instead:
#temporal_extent_lower = pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min())
#temporal_extent_upper = pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max())
print(f'min: {temporal_extent_lower} \nmax: {temporal_extent_upper}')
```
%% Cell type:markdown id:20b00e88-5a13-46b3-9787-d9ac2d4e7bd6 tags:
## Open up STAC Catalog and create an item
%% Cell type:code id:adf6c59d-58cd-48b1-a5fd-3bb205a3ef56 tags:
``` python
# define folder location where your STAC catalog json file is
catalog_path = os.path.join('..', '..', 'catalog_items')
# open catalog
catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))
```
%% Cell type:code id:7e96811b-95ae-406a-9728-55fc429d4e1f tags:
``` python
item = next(catalog.get_items(item_id), None)
if item is not None:
    print("existing item opened")
else:
    # bbox must be in WGS84 (left, bottom, right, top) to match the geometry above
    bbox = [min(lon).item(), min(lat).item(), max(lon).item(), max(lat).item()]
    item = pystac.Item(id=item_id,
                       geometry=footprint,
                       bbox=bbox,
                       datetime=temporal_extent_upper,
                       properties={})
    print("new item created")
```
%% Cell type:markdown id:a21c76e8-cd57-4eb5-a33f-7c668a3b3205 tags:
## Add zarr url asset to item
%% Cell type:code id:094832af-d22b-4359-b0f6-cf687acce5cc tags:
``` python
asset_id = "zarr-s3-osn"
asset = pystac.Asset(href=zarr_url,
description=asset_description,
media_type="application/vnd+zarr",
roles=asset_roles,
extra_fields = xarray_opendataset_kwargs)
item.add_asset(asset_id, asset)
```
%% Cell type:code id:d7361e6c-bebe-4ac9-8cdc-14b43fe64393 tags:
``` python
asset_id2 = "zarr-s3"
asset2 = pystac.Asset(href=zarr_url2,
description=asset_description2,
media_type="application/vnd+zarr",
roles=asset_roles2,
extra_fields = xarray_opendataset_kwargs2)
item.add_asset(asset_id2, asset2)
```
%% Cell type:markdown id:f67cd5c9-db33-45c2-bc21-480cd67354f4 tags:
## Add datacube extension to item
%% Cell type:code id:fc00946d-2880-491d-9b3b-3aeeb4414d6c tags:
``` python
# instantiate datacube extension on the item
dc = DatacubeExtension.ext(item, add_if_missing=True)
```
%% Cell type:markdown id:8bdd77a2-7587-485e-afb7-42af3a822241 tags:
### Add cube dimensions (required field for extension)
%% Cell type:markdown id:e7dc357c-91ec-49ae-83e5-400f791f9792 tags:
#### user review needed
#### compare crs information to the projjson to make sure it looks correct
%% Cell type:code id:ea452f62-5644-49b6-8a4e-7dc4f649fd1a tags:
``` python
# print out crs information in dataset
crs
```
%% Cell type:code id:1b1d05ff-8e43-44a7-8343-178b112c4ad6 tags:
``` python
# the datacube extension can accept reference_system information as a numerical EPSG code,
# WKT2 (ISO 19162) string, or PROJJSON object.
# we will use a PROJJSON object, as Microsoft Planetary Computer did here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
# note that crs.to_json() returns a JSON string, so we parse it into a dict to get a true PROJJSON object
import json
projjson = json.loads(crs.to_json())
print(crs.to_json(pretty=True))
```
%% Cell type:markdown id:6307b977-a151-4dce-bfca-5a83257501da tags:
#### user review needed
#### look at the spatial and temporal steps to make sure they were all successfully pulled and look correct
%% Cell type:markdown id:18b8950f-030f-4f78-b9ac-799dc9263cb6 tags:
**Time**
If you need to manually construct this field, here is a helpful reference: https://en.wikipedia.org/wiki/ISO_8601#Durations
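For example, "P1D" encodes a daily step, "P1M" a monthly step, and "P1Y" a yearly step.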
%% Cell type:code id:65e670a4-f41b-41b3-b84c-f48b1bdc3b95 tags:
``` python
time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()
# if time is yearly or monthly, you will need to manually construct it:
#time_step = "P1Y0M0DT0H0M0S"
print(f'time step: {time_step}')
```
%% Cell type:code id:72986296-83de-4029-bd54-3c585341c38e tags:
``` python
# # optional debugging for time steps:
# # check all step sizes (step_list), get number of occurrences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=0)
```
%% Cell type:code id:79d747a4-0f9c-4afd-aaea-84483a927904 tags:
``` python
# # debugging for time steps, cont:
# # please choose one of the index locations printed above
# # this will print the time steps adjacent to it
# ix = 3343
# ds.isel(time=slice(ix-1,ix+3)).time
```
%% Cell type:markdown id:c2898abe-ca26-4587-bc6d-531268547266 tags:
**X/lon**
%% Cell type:code id:9a8652ec-8edf-42b7-9bb1-acb789552caf tags:
``` python
x_step = stac_helpers.get_step(ds, dim_names_dict['X'])
# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation
# use the debugging cells below to identify if this is the issue, if so, use the round_dec argument to round to a higher decimal place:
#x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=13)
print(f'x step: {x_step}')
```
%% Cell type:code id:2f29a57d-2203-4f49-9dea-c04c1362d685 tags:
``` python
# # optional debugging for spatial steps:
# # check all step sizes (step_list), get number of occurrences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
# x_dim=dim_names_dict['X']
# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=0)
# print(f'\nx dim name (for next cell): {x_dim}')
```
%% Cell type:code id:448714dd-4f57-4e74-99bc-c3b3de400231 tags:
``` python
# # debugging for spatial steps, cont:
# # please choose one of the index locations printed above
# # this will print the x values adjacent to it
# ix = 5
# ds.isel(x=slice(ix-1,ix+3)).x
```
%% Cell type:markdown id:b40ef6ce-8830-4465-a960-f31357ac73ad tags:
**Y/lat**
%% Cell type:code id:a4c80267-edca-46e8-8862-b56db8391120 tags:
``` python
y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])
# a common issue that causes the spatial step not to be identified comes from rounding errors in the step calculation
# use the debugging cells below to identify if this is the issue, if so, use the round_dec argument to round to a higher decimal place:
#y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=13)
print(f'y step: {y_step}')
```
%% Cell type:code id:15975722-2f96-4a39-b904-c99cdf2921c3 tags:
``` python
# # optional debugging for spatial steps:
# # check all step sizes (step_list), get number of occurrences of each (step_count), and get index locations where each step size occurs in the dataset so you can manually inspect the values, if needed
# # please specify the index of the step in step_list with the step_ix field - this will return the indices in the dataset where this step size occurred
# y_dim=dim_names_dict['Y']
# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=0)
# print(f'\ny dim name (for next cell): {y_dim}')
```
%% Cell type:code id:142e62f0-412c-4e9b-a117-205c5a2c08af tags:
``` python
# # debugging for spatial steps, cont:
# # please choose one of the index locations printed above
# # this will print the y values adjacent to it
# ix = 5
# ds.isel(y=slice(ix-1,ix+3)).y
```
%% Cell type:markdown id:00a5e041-081d-428d-ac2e-75d16de205e6 tags:
#### user input needed
#### you will need to copy all of the dimensions printed below into the dict and fill in the appropriate attributes (type, axis, extent, etc.):
Please see [datacube spec](https://github.com/stac-extensions/datacube?tab=readme-ov-file#dimension-object) for details on required fields.
If you have a dimension like "bnds" or "nv" that is used on variables like time_bnds, lon_bnds, and lat_bnds to select either the lower or upper bound, you can use an [additional dimension object](https://github.com/stac-extensions/datacube?tab=readme-ov-file#additional-dimension-object). We recommend making the type "count", as Microsoft Planetary Computer did [here](https://github.com/stac-extensions/datacube/blob/9e74fa706c9bdd971e01739cf18dcc53bdd3dd4f/examples/daymet-hi-annual.json#L76).
Here is an example:
```
dims_dict = {
'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]})
}
```
%% Cell type:code id:e72dd20a-fc59-4c18-a9ba-7f40e0abb707 tags:
``` python
dims = list(ds.dims)
print(dims)
```
%% Cell type:code id:5a443497-67a9-4dce-a8e9-b08d31a88223 tags:
``` python
# create a dictionary of datacube dimensions you would like to assign to this dataset
# dimension name should come from the dims printed in above cell
# x, y, t dimension info is pulled out automatically using the dim dict we created above
# all other dims listed in the cell above need to be written in manually
# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)
# note that the extent of each dimension should be pulled from the dataset
dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': time_step}),
dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [spatial_bounds[0], spatial_bounds[2]], 'step': x_step, 'reference_system': projjson}),
dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [spatial_bounds[1], spatial_bounds[3]], 'step': y_step, 'reference_system': projjson}),
'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'bottom_top_stag')}),
'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'bottom_top')}),
'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'soil_layers_stag')}),
'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, 'x_stag')}),
'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, 'y_stag')}),
'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'snow_layers_stag')}),
'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'snso_layers_stag')}),
}
```
%% Cell type:code id:816ed76c-c287-4116-a507-6a5c2734e24b tags:
``` python
# make sure you added all the right dims
assert sorted(list(dims_dict.keys())) == sorted(dims)
```
%% Cell type:markdown id:0f277883-a3fd-425f-966a-ca2140d0ef2f tags:
### Add cube variables (optional field for extension)
%% Cell type:code id:e9272931-fc0b-4f2a-9546-283033e9cde8 tags:
``` python
# drop metpy_crs coordinate we have added
if 'metpy_crs' in ds.coords:
    ds = ds.drop_vars('metpy_crs')
# pull list of variable names from the dataset
# (avoid naming this "vars", which would shadow the Python builtin)
var_names = list(ds.variables)
# spec says that the keys of cube:dimensions and cube:variables should be unique together; a key like lat should not be both a dimension and a variable.
# we will drop all values in dims from the variable list
var_names = [v for v in var_names if v not in dims]
# Microsoft Planetary Computer includes coordinates and crs as variables here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
# we will keep those in the var list
# create dictionary of dataset variables and associated dimensions
vars_dict={}
for v in var_names:
    unit = stac_helpers.get_unit(ds, v)
    var_type = stac_helpers.get_var_type(ds, v, crs_var)
    long_name = stac_helpers.get_long_name(ds, v)
    vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions': list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})
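
# sanity check (a minimal sketch): the datacube spec requires cube:variables keys
# to be disjoint from cube:dimensions keys, which the filtering above should guarantee
assert not set(vars_dict) & set(dims)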
```
%% Cell type:markdown id:11ad5352-884c-4472-8864-4570a96f66e5 tags:
### Finalize extension
%% Cell type:code id:10141fd4-91d6-491d-878b-02653720891d tags:
``` python
# add dimensions and variables to the item's datacube extension
dc.apply(dimensions=dims_dict, variables=vars_dict)
```
%% Cell type:markdown id:615ca168-75fb-4135-9941-0ef5fe4fd1cb tags:
## Add STAC Item to Catalog and Save
%% Cell type:code id:3cd21f72-1966-4bb1-bbe6-9ce6eb4b3c29 tags:
``` python
# # helper to find items of wrong type
# d = item.to_dict()
# print(*stac_helpers.find_paths(d))
```
%% Cell type:code id:4b75791b-6b2d-40be-b7c6-330a60888fb5 tags:
``` python
# add the item to the catalog only if it is not already there, then save
if not next(catalog.get_items(item_id), None):
    catalog.add_item(item)
catalog.normalize_hrefs(catalog_path)
catalog.save(catalog_type=pystac.CatalogType.SELF_CONTAINED)
# alternatively: catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)
```
%% Cell type:code id:da040aae-f977-468c-bdca-4857bffa24b6 tags:
``` python
```