Skip to content
Snippets Groups Projects
Commit 56dc5aa1 authored by Snyder, Amelia Marie's avatar Snyder, Amelia Marie
Browse files

add x, y, t dimension steps

parent 08b62f9e
No related branches found
No related tags found
1 merge request!2add x, y, t dimension steps
......@@ -24,7 +24,8 @@
"extent": [
"1979-10-01T00:00:00Z",
"2021-09-25T00:00:00Z"
]
],
"step": "P1DT0H0M0S"
},
"x": {
"type": "spatial",
......@@ -32,7 +33,8 @@
"extent": [
-2732097.901153542,
2731902.098846458
]
],
"step": 4000.0
},
"y": {
"type": "spatial",
......@@ -40,7 +42,8 @@
"extent": [
-2027960.8996368449,
2028039.1003631551
]
],
"step": 4000.0
},
"bottom_top_stag": {
"type": "spatial",
......
%% Cell type:markdown id:6c10e07b-1e60-4926-af1d-fa75dc78e5d4 tags:
# CONUS404 Daily Zarr -> Collection Exploratory Workflow
This is a workflow for transforming the CONUS404 daily zarr dataset into a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md). We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.
To simplify this workflow so that it can scale to many datasets, a few simplifying suggestions and assumptions are made:
1. For USGS data, we can use the CC0-1.0 license. For all other data we can use Unlicense. Ref: https://spdx.org/licenses/
2. I am assuming all coordinates are from the WGS84 datum if not specified.
%% Cell type:code id:201e0945-de55-45ff-b095-c2af009a4e62 tags:
``` python
import pystac
from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension
import xarray as xr
import cf_xarray
import os
import fsspec
import hvplot.xarray
import pandas as pd
import json
import numpy as np
```
%% Cell type:markdown id:20b00e88-5a13-46b3-9787-d9ac2d4e7bd6 tags:
## Open up NHGF STAC Catalog
%% Cell type:code id:adf6c59d-58cd-48b1-a5fd-3bb205a3ef56 tags:
``` python
# Folder holding the STAC catalog json file.
catalog_path = os.path.join('..', 'catalog')
# Load the root catalog from catalog.json inside that folder.
catalog_file = os.path.join(catalog_path, 'catalog.json')
catalog = pystac.Catalog.from_file(catalog_file)
```
%% Cell type:markdown id:996e60ba-13e4-453a-8534-e62ce747f0fa tags:
## Collection Metadata Input
%% Cell type:code id:482d204d-b5b6-40e5-ac42-55b459be1097 tags:
``` python
# URL of the zarr store this collection will describe.
zarr_url = 's3://nhgf-development/conus404/conus404_daily_202210.zarr/'

# Keyword arguments xarray needs to open the store.
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs = {
    "xarray:open_kwargs": {"chunks": {}, "engine": "zarr", "consolidated": True},
    "xarray:storage_options": {"requester_pays": True},
}

# Metadata for the zarr asset attached to the collection (zarr_url).
asset_description = "S3 access to collection zarr group"
# roles to tag the zarr url asset with
asset_roles = ["data", "zarr", "s3"]

# STAC collection identification.
collection_id = 'conus404-daily'
collection_description = 'CONUS404 40 years of daily values for subset of model output variables derived from hourly values on cloud storage'
# license for dataset (see https://spdx.org/licenses/)
collection_license = 'CC0-1.0'
```
%% Cell type:markdown id:b213b74f-ad17-4774-93b6-3b62be616b45 tags:
## Data Exploration
%% Cell type:code id:708f2cf5-79ab-49af-8067-de31d0d13ee6 tags:
``` python
# Open the zarr dataset from requester-pays S3 and display it.
fs2 = fsspec.filesystem('s3', requester_pays=True)
store_mapper = fs2.get_mapper(zarr_url)
ds = xr.open_dataset(store_mapper, engine='zarr',
                     backend_kwargs={'consolidated': True}, chunks={})
ds
```
%% Cell type:markdown id:0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc tags:
## Identify x, y, t dimensions of dataset
May require user input if dimensions cannot be auto-detected.
%% Cell type:code id:ab91268f-7200-4cb1-979a-c7d75531d2c0 tags:
``` python
dims_auto_extract = ['X', 'Y', 'T']

def extract_dim(ds, d):
    """Return the dataset's dimension name for CF axis *d*, asking the user when
    cf_xarray cannot auto-detect it."""
    try:
        candidates = ds.cf.axes[d]
    except KeyError:
        # cf_xarray could not identify this axis - fall back to interactive input
        print(f"Could not auto-extract {d} dimension name.")
        print("Look at the xarray output above showing the dataset dimensions.")
        dim = str(input(f"What is the name of the {d} dimension of this dataset?"))
        assert dim in ds.dims, "That is not a valid dimension name for this dataset"
    else:
        assert len(candidates)==1, f'There are too many {d} dimensions in this dataset.'
        dim = candidates[0]
    print(f"name of {d} dimension: {dim}\n")
    return dim

# map each CF axis label to the dataset's actual dimension name
dims_dict = {d: extract_dim(ds, d) for d in dims_auto_extract}
print(f"Dimension dictionary: {dims_dict}")
```
%% Cell type:markdown id:8fbfecfb-9886-4d06-a34c-6471cb0a6053 tags:
## Plot a map
%% Cell type:code id:4eb4d027-4266-4a0b-8f16-bacfbef06242 tags:
``` python
# Map a single variable for one day as a visual sanity check.
var_to_plot = 'SNOW'
snapshot = ds[var_to_plot].sel(time='2014-03-01 00:00').load()
snapshot.hvplot.quadmesh(
    x='lon', y='lat', rasterize=True,
    geo=True, tiles='OSM', alpha=0.7, cmap='turbo')
```
%% Cell type:markdown id:a8c3ed37-8564-400b-a7fb-25bd5e43d21c tags:
## Create Collection Extent
%% Cell type:markdown id:69f0d837-68a5-4fed-9a14-5d75cfbb0da4 tags:
### Spatial Extent
%% Cell type:code id:d46805e0-8e94-4ebe-aa01-d9a2d7051459 tags:
``` python
# Pull the lat/lon bounding box for the data.
# Coordinates must be from the WGS 84 datum.
# Order: left, bottom, right, top.
# Note: depending on how the dataset was opened you may need .compute() to
# materialize the values - switch to the commented line below if you hit that.
#coord_bounds = [ds.lon.data.min().compute().astype(float), ds.lat.data.min().compute().astype(float), ds.lon.data.max().compute().astype(float), ds.lat.data.max().compute().astype(float)]
coord_bounds = [ds.lon.data.min().astype(float), ds.lat.data.min().astype(float), ds.lon.data.max().astype(float), ds.lat.data.max().astype(float)]
print(coord_bounds)
# create a spatial extent object
spatial_extent = pystac.SpatialExtent(bboxes=[coord_bounds])
```
%% Cell type:markdown id:a04c8fca-1d33-43ac-9e2b-62d7be2887f7 tags:
### Temporal Extent
%% Cell type:code id:41a84995-867c-4152-8c57-85e3758bbb77 tags:
``` python
# Pull out first and last timestamps along the auto-detected time dimension
# (dims_dict['T']) rather than hard-coding the name 'time'.
temporal_extent_lower = pd.Timestamp(ds[dims_dict['T']].data.min())
temporal_extent_upper = pd.Timestamp(ds[dims_dict['T']].data.max())
print(f'min: {temporal_extent_lower} \nmax: {temporal_extent_upper}')
# create a temporal extent object
temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])
```
%% Cell type:code id:1b1e37c4-5348-46ad-abc9-e005b5d6c02b tags:
``` python
# Combine the spatial and temporal extents into a single pystac Extent.
collection_extent = pystac.Extent(temporal=temporal_extent, spatial=spatial_extent)
```
%% Cell type:markdown id:cfb71202-03df-45b5-ac2f-0dc2ee1ab780 tags:
## Create pystac collection
%% Cell type:code id:7e96811b-95ae-406a-9728-55fc429d4e1f tags:
``` python
# Reuse the collection if it is already a child of the catalog (single
# get_child lookup instead of two); otherwise create a fresh one.
collection = catalog.get_child(collection_id)
if collection:
    print("existing collection opened")
    collection.extent = collection_extent
    collection.description = collection_description
    collection.license = collection_license
else:
    collection = pystac.Collection(id=collection_id,
                                   description=collection_description,
                                   extent=collection_extent,
                                   license=collection_license)
    print("new collection created")
```
%% Cell type:markdown id:a21c76e8-cd57-4eb5-a33f-7c668a3b3205 tags:
## Add zarr url asset to collection
%% Cell type:code id:094832af-d22b-4359-b0f6-cf687acce5cc tags:
``` python
# Attach the zarr store to the collection as an asset, carrying the
# xarray open kwargs as extra fields.
asset_id = "zarr-s3"
zarr_asset = pystac.Asset(
    href=zarr_url,
    description=asset_description,
    media_type="application/vnd+zarr",
    roles=asset_roles,
    extra_fields=xarray_opendataset_kwargs,
)
collection.add_asset(asset_id, zarr_asset)
```
%% Cell type:markdown id:f67cd5c9-db33-45c2-bc21-480cd67354f4 tags:
## Add datacube extension to collection
%% Cell type:code id:fc00946d-2880-491d-9b3b-3aeeb4414d6c tags:
``` python
# instantiate the datacube extension on the collection
# (add_if_missing=True registers the extension schema on the collection)
dc = DatacubeExtension.ext(collection, add_if_missing=True)
```
%% Cell type:markdown id:8bdd77a2-7587-485e-afb7-42af3a822241 tags:
### Add cube dimensions (required field for extension)
%% Cell type:code id:120a4914-3302-44a5-a282-0308ac84f040 tags:
``` python
# List the dataset's dimension names; these feed the datacube
# cube:dimensions field and are used below to de-duplicate variables.
dims = list(ds.dims)
print(dims)
```
%% Cell type:code id:00a18a29-fb9a-4b56-8009-493122997b16 tags:
``` python
# Get x/y bounds for the extent of those dimensions (required by the
# datacube extension), using the auto-detected dimension names from
# dims_dict rather than hard-coded 'x'/'y'.
xy_bounds = [ds[dims_dict['X']].data.min().astype(float), ds[dims_dict['Y']].data.min().astype(float), ds[dims_dict['X']].data.max().astype(float), ds[dims_dict['Y']].data.max().astype(float)]
print(xy_bounds)
```
%% Cell type:code id:0f49d6d7-9e30-4144-909b-fa1238e6c77a tags:
``` python
def get_step(dim_name, data=None):
    """Return the regular step size along dimension ``dim_name``.

    Parameters
    ----------
    dim_name : str
        Name of the dimension/coordinate to inspect.
    data : optional
        Any object supporting ``data[dim_name].values`` (e.g. an xarray
        Dataset or a pandas DataFrame). Defaults to the notebook-level ``ds``.

    Returns
    -------
    The single spacing between consecutive values when the dimension is
    regularly spaced, otherwise None - the datacube spec specifies null
    for irregularly spaced steps. A dimension with fewer than two values
    also yields None.
    """
    source = ds if data is None else data
    dim_vals = source[dim_name].values
    # np.diff + np.unique replaces the hand-rolled pairwise loop;
    # note this is exact equality, so float coordinates with rounding
    # noise will be reported as irregular (same as the original behavior)
    unique_steps = np.unique(np.diff(dim_vals))
    if len(unique_steps) == 1:
        return unique_steps[0]
    return None
```
%% Cell type:markdown id:00a5e041-081d-428d-ac2e-75d16de205e6 tags:
#### User input needed - copy all of the dimensions printed above into the dict below and fill in the appropriate attributes (type, axis, extent):
%% Cell type:code id:5a443497-67a9-4dce-a8e9-b08d31a88223 tags:
``` python
# Datacube dimensions to assign to this dataset.
# - dimension names should come from the coordinates printed above
# - we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)
# - the extent of each dimension should be pulled from the dataset
# NOTE: the previous revision left a second, unterminated copy of this dict
# above it (a merge artifact), which was a SyntaxError; only one dict remains.
dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': pd.Timedelta(get_step('time')).isoformat()}),
             'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': get_step('x')}),
             'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'extent': [xy_bounds[1], xy_bounds[3]], 'step': get_step('y')}),
             'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),
             'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),
             'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),
             'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x'}),
             'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y'}),
             'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),
             'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),
             }
```
%% Cell type:markdown id:0f277883-a3fd-425f-966a-ca2140d0ef2f tags:
### Add cube variables (optional field for extension)
%% Cell type:code id:e9272931-fc0b-4f2a-9546-283033e9cde8 tags:
``` python
# Build the cube:variables mapping from the dataset's variables.
# The datacube spec says the keys of cube:dimensions and cube:variables
# should be unique together (a key like lat should not be both a dimension
# and a variable), so everything in dims is dropped from the variable list.
# Microsoft Planetary Computer includes coordinates and crs as variables here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# - we keep those in the variable list too.
# Renamed the list from `vars` to avoid shadowing the `vars` builtin.
var_names = [v for v in ds.variables if v not in dims]
# map each variable to its dimensions for the datacube extension
vars_dict = {v: pystac.extensions.datacube.Variable({'dimensions': list(ds[v].dims), 'type': 'data'})
             for v in var_names}
```
%% Cell type:markdown id:11ad5352-884c-4472-8864-4570a96f66e5 tags:
### Finalize extension
%% Cell type:code id:10141fd4-91d6-491d-878b-02653720891d tags:
``` python
# add dimensions and variables to the collection's datacube extension
dc.apply(dimensions=dims_dict, variables=vars_dict)
```
%% Cell type:markdown id:615ca168-75fb-4135-9941-0ef5fe4fd1cb tags:
## Add STAC Collection to Catalog and Save
%% Cell type:code id:4b75791b-6b2d-40be-b7c6-330a60888fb5 tags:
``` python
# If the collection was already a child of the catalog, save just that
# collection subtree; otherwise attach it first. The full catalog is
# normalized and saved in either case.
existing_child = catalog.get_child(collection_id)
if not existing_child:
    catalog.add_child(collection)
else:
    collection.normalize_and_save(root_href=os.path.join(catalog_path, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)
catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment